# checks/ps.include — process (ps) check helpers shared by the ps checks (Check_MK)
1 #!/usr/bin/python
2 # -*- encoding: utf-8; py-indent-offset: 4 -*-
3 # +------------------------------------------------------------------+
4 # | ____ _ _ __ __ _ __ |
5 # | / ___| |__ ___ ___| | __ | \/ | |/ / |
6 # | | | | '_ \ / _ \/ __| |/ / | |\/| | ' / |
7 # | | |___| | | | __/ (__| < | | | | . \ |
8 # | \____|_| |_|\___|\___|_|\_\___|_| |_|_|\_\ |
9 # | |
10 # | Copyright Mathias Kettner 2014 mk@mathias-kettner.de |
11 # +------------------------------------------------------------------+
13 # This file is part of Check_MK.
14 # The official homepage is at http://mathias-kettner.de/check_mk.
16 # check_mk is free software; you can redistribute it and/or modify it
17 # under the terms of the GNU General Public License as published by
18 # the Free Software Foundation in version 2. check_mk is distributed
19 # in the hope that it will be useful, but WITHOUT ANY WARRANTY; with-
20 # out even the implied warranty of MERCHANTABILITY or FITNESS FOR A
21 # PARTICULAR PURPOSE. See the GNU General Public License for more de-
22 # tails. You should have received a copy of the GNU General Public
23 # License along with GNU Make; see the file COPYING. If not, write
24 # to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
25 # Boston, MA 02110-1301 USA.
# Sentinel: a discovery rule whose "user" field is False means "grab the
# user of the first matching process" instead of matching a fixed user.
GRAB_USER = False

# Default parameters: (warnmin, okmin, okmax, warnmax) process counts.
factory_settings["ps_default_levels"] = {
    "levels": (1, 1, 99999, 99999),
}

# One parsed "(...)" info field per process line.  All fields default to
# None because agents of different versions/platforms send different
# subsets of these columns.
ps_info = collections.namedtuple(
    "Process_Info", ('user', 'virtual', 'physical', 'cputime', 'process_id', 'pagefile',
                     'usermode_time', 'kernelmode_time', 'handles', 'threads', 'uptime'))

ps_info.__new__.__defaults__ = (None,) * len(ps_info._fields)
def ps_info_tuple(entry):
    """Parse a "(...)"-wrapped process info field into a ps_info tuple.

    Returns False when the entry is not wrapped in parentheses.
    """
    match = regex(r"^\((.*)\)$").match(entry)
    if not match:
        return False
    return ps_info(*match.group(1).split(","))
def ps_wato_configured_inventory_rules(invrules):
    """Normalize the WATO discovery rules matching this host into a list of
    (service description, match pattern, user spec, default params) tuples."""
    specs = []
    for rule in host_extra_conf(host_name(), invrules):
        # Newer rules keep the check defaults under 'default_params';
        # historic rules carry them directly in the rule dict itself.
        defaults = rule.get('default_params', rule)
        defaults.setdefault("cpu_rescale_max", None)

        specs.append((rule['descr'], rule.get('match'), rule.get('user'), defaults))

    return specs
def inventory_ps_common(invrules, parsed):
    """Discover ps services: match every parsed process line against every
    configured discovery rule and return a list of (servicedesc, params).

    The scraped original lost the closing brace of the inv_params dict;
    restored here.
    """
    inventory_specs = ps_wato_configured_inventory_rules(invrules)

    inventory = []
    for line in parsed:
        for servicedesc, pattern, userspec, default_params in inventory_specs:
            # First entry in line is the node name or None for non-clusters
            process_line = line[1:]
            matches = process_matches(process_line, pattern, userspec)
            if not matches:
                continue  # skip not matched lines

            # User capturing on rule: GRAB_USER means take the owner of
            # the matched process instead of a fixed user name.
            if userspec == GRAB_USER:
                i_userspec = process_line[0].user
            else:
                i_userspec = userspec

            i_servicedesc = servicedesc.replace("%u", i_userspec or "")

            # Process capture: a regex match object carries groups,
            # a plain boolean match does not.
            if hasattr(matches, 'groups'):
                match_groups = [g if g else "" for g in matches.groups()]
            else:
                match_groups = []

            i_servicedesc = replace_service_description(i_servicedesc, match_groups, pattern)

            # Problem here: We need to instantiate all subexpressions
            # with their actual values of the found process.
            inv_params = {
                "process": pattern,
                "match_groups": match_groups,
                "user": i_userspec,
            }

            # default_params is either a clean dict with optional
            # parameters to set as default or - from version 1.2.4 - the
            # dict from the rule itself. In the later case we need to remove
            # the keys that do not specify default parameters
            for key, value in default_params.items():
                if key not in ("descr", "match", "user", "perfdata"):
                    inv_params[key] = value

            inv = (i_servicedesc, inv_params)
            if inv not in inventory:
                inventory.append(inv)

    return inventory
def replace_service_description(service_description, match_groups, pattern):
    """Fill %1/%2/... and %s placeholders of a discovered service
    description with the regex match groups of the process pattern.

    Raises MKGeneralException when the description contains more %s
    placeholders than there are match groups.
    """
    # New in 1.2.2b4: %1, %2, ... select a specific match group, which
    # allows reordering.  Translate them into str.format fields first.
    description = re.sub(r'%(\d+)', r'{\1}', service_description)

    # Dummy zeroth argument keeps the format field numbers 1-based.
    description = description.format(None, *match_groups)

    placeholders = description.count('%s')
    if len(match_groups) < placeholders:
        raise MKGeneralException(
            "Invalid entry in inventory_processes_rules: "
            "service description '%s' contains "
            "%d replaceable elements, but "
            "regular expression '%s' contains only %d subexpression(s)." %
            (description, placeholders, pattern, len(match_groups)))

    # Allowed since 1.1.4: the pattern may contain more subexpressions
    # than the description uses; surplus groups are simply ignored.
    return description % tuple(match_groups[:placeholders])
def match_user(user, user_pattern):
    """Check a process owner against a user spec.

    Returns False on a mismatch and None otherwise; an empty/None pattern
    matches everybody.  A pattern starting with '~' is a regex.
    """
    if not user_pattern:
        return None

    if user_pattern.startswith('~'):
        if not regex(user_pattern[1:]).match(user):
            return False
    elif user_pattern != user:
        return False

    return None
def process_matches(process_line, process_pattern, user_pattern, match_groups=None):
    """Match one parsed process line against a process pattern and user spec.

    Returns False on mismatch; True (or a regex match object, when the
    pattern is a regex) on success.
    """
    info, command_line = process_line[0], process_line[1:]

    if match_user(info.user, user_pattern) is False:
        return False

    if not process_pattern:
        # Process name not relevant
        return True

    if process_pattern.startswith("~"):
        # Regex against the complete command line (skip the "~")
        found = regex(process_pattern[1:]).match(" ".join(command_line))
        if not found:
            return False
        if match_groups:
            # Re-discovery: only match when the captured groups are the
            # same as at discovery time.
            return found.groups() == tuple(match_groups)
        return found

    # Exact match on the name of the executable
    return command_line[0] == process_pattern
# produce text or html output intended for the long output field of a check
# from details about a process. the input is expected to be a list (one
# per process) of lists (one per data field) of key-value tuples where the
# value is again a 2-field tuple, first is the value, second is the unit.
# This function is actually fairly generic so it could be used for other
# data structured the same way
def format_process_list(processes, html_output):
    """Render the per-process details as plain text or as an HTML table.

    The scraped original lost the closing "]))" of the return expression;
    restored here.
    """
    def format_value(value):
        value, unit = value
        if isinstance(value, float):
            return "%.1f%s" % (value, unit)
        return "%s%s" % (value, unit)

    if html_output:
        table_bracket = "<table>%s</table>"
        line_bracket = "<tr>%s</tr>"
        # "%.0s" swallows the key: HTML cells carry only the value, the
        # key is shown in the header row instead.
        cell_bracket = "<td>%.0s%s</td>"
        cell_separator = ""

        # Collect all field names in order of first appearance.
        headers = []
        headers_found = set()
        for process in processes:
            for key, _value in process:
                if key not in headers_found:
                    headers.append(key)
                    headers_found.add(key)

        # make sure each process has all fields from the table
        processes_filled = []
        for process in processes:
            dictified = dict(process)
            processes_filled.append([(key, dictified.get(key, "")) for key in headers])
        processes = processes_filled
        header_line = "<tr><th>" + "</th><th>".join(headers) + "</th></tr>"
    else:
        table_bracket = "%s"
        line_bracket = "%s\r\n"
        cell_bracket = "%s %s"
        cell_separator = ", "
        header_line = ""

    return table_bracket % (header_line + "".join([
        line_bracket % cell_separator.join(
            [cell_bracket % (key, format_value(value))
             for key, value in process])
        for process in processes
    ]))
# Parse time as output by ps into seconds.
# Example 1: "12:17"
# Example 2: "55:12:17"
# Example 3: "7-12:34:59" (with 7 days)
# Example 4: "7123459" (only seconds, windows)
def parse_ps_time(text):
    """Convert a ps(1) time string (see examples above) into seconds."""
    days = 0
    if "-" in text:
        day_part, text = text.split("-")[:2]
        days = int(day_part or 0)

    # Right-most ":"-token is seconds, then minutes, then hours.
    seconds = 0
    for factor, token in zip([1, 60, 3600], reversed(text.split(":"))):
        seconds += factor * int(token or 0)

    return 86400 * days + seconds
# This function is repeated in cmk/gui/plugins/wato/check_parameters/ps.py
# Update that function too until we can import them
def ps_cleanup_params(params):
    """Normalize legacy ps check parameters into the current dict format.

    Handles three historic formats (the scrape lost two closing brackets,
    restored here):
      * tuple: (procname[, user], warnmin, okmin, okmax, warnmax)
      * dict with separate warnmin/okmin/okmax/warnmax keys
      * current dict carrying a "levels" 4-tuple
    """
    if isinstance(params, (list, tuple)):
        if len(params) == 5:
            procname, warnmin, okmin, okmax, warnmax = params
            user = None
        elif len(params) == 6:
            procname, user, warnmin, okmin, okmax, warnmax = params
        params = {
            "process": procname,
            "levels": (warnmin, okmin, okmax, warnmax),
            "user": user,
        }
    elif any(k in params for k in ['okmin', 'warnmin', 'okmax', 'warnmax']):
        # Fold the four separate boundary keys into one "levels" tuple.
        params["levels"] = (
            params.pop("warnmin", 1),
            params.pop("okmin", 1),
            params.pop("okmax", 99999),
            params.pop("warnmax", 99999),
        )

    if "cpu_rescale_max" not in params:
        params["cpu_rescale_max"] = None

    return params
def check_ps_common(item, params, parsed, cpu_cores=1, info_name="process", total_ram=None):
    """Run all ps subchecks: count, memory, CPU, handle count, uptime and
    (optionally) a detailed process listing in the long output."""
    params = ps_cleanup_params(params)
    procs = check_ps_process_capture(parsed, params, cpu_cores)

    # Process count with boundary levels.
    yield ps_count_check(procs, params, info_name)

    # Virtual/physical memory levels.
    for result in memory_check(procs, params):
        yield result

    # Resident memory as a percentage of total RAM, if configured.
    if procs.resident_size and "resident_levels_perc" in params:
        yield memory_perc_check(procs, params, total_ram)

    # Aggregated CPU utilization.
    if procs.count:
        yield cpu_check(procs.percent_cpu, item, params)

    # Per-process CPU levels, if configured.
    if "single_cpulevels" in params:
        for result in individual_process_check(procs, params):
            yield result

    # only check handle_count if provided by wmic counters
    if procs.handle_count:
        yield handle_count_check(procs, params)

    # Process age.
    if procs.min_elapsed is not None:
        yield uptime_check(procs, params)

    # Optional detailed process table in the long output.
    if params.get("process_info"):
        yield 0, "\n" + format_process_list(procs, params["process_info"] == "html")
def upperlevels(value, warn, crit, readable=str):
    """Compare a value against upper warn/crit levels.

    Returns (state, infotext); infotext names the levels when violated,
    rendered through `readable`.
    """
    if value >= crit:
        state = 2
    elif value >= warn:
        state = 1
    else:
        state = 0

    infotext = ''
    if state:
        infotext = ": (warn/crit at %s/%s)" % (readable(warn), readable(crit))

    return state, infotext
def boundary_levels(value, warnmin, okmin, okmax, warnmax):
    """Compare a value against a (warnmin, okmin, okmax, warnmax) corridor.

    Returns (state, infotext); infotext shows the OK range when violated.
    """
    if not warnmin <= value <= warnmax:
        state = 2
    elif not okmin <= value <= okmax:
        state = 1
    else:
        state = 0

    infotext = ''
    if state:
        infotext = ": (ok from %d to %d)" % (okmin, okmax)

    return state, infotext
def ps_count_check(processes, params, info_name):
    """Check the number of matched processes against the count corridor."""
    warnmin, okmin, okmax, warnmax = params["levels"]
    count = processes.count

    state, levelstext = boundary_levels(count, warnmin, okmin, okmax, warnmax)

    # Pluralize: "1 process" / "2 processes".
    suffix = '' if count == 1 else 'es'
    infotext = "%d %s%s%s" % (count, info_name, suffix, levelstext)

    if processes.running_on_nodes:
        nodes = ", ".join(sorted(processes.running_on_nodes))
        infotext += " [running on %s]" % nodes

    return state, infotext, [("count", count, okmax + 1, warnmax + 1, 0)]
def memory_check(processes, params):
    """Check levels for virtual and physical used memory"""
    for size, title, levels_key, metric in [
        (processes.virtual_size, "virtual", "virtual_levels", "vsz"),
        (processes.resident_size, "physical", "resident_levels", "rss"),
    ]:
        if size == 0:
            continue

        warn, crit = params.get(levels_key, (None, None))
        infotext = "%s %s" % (get_bytes_human_readable(size * 1024.0), title)

        if levels_key in params:
            # sizes are reported in kB, levels are configured in bytes
            state, levelstext = upperlevels(size * 1024, warn, crit,
                                            get_bytes_human_readable)
            infotext += levelstext
        else:
            state = 0

        yield state, infotext, [(metric, size, warn, crit)]
def memory_perc_check(processes, params, total_ram):
    """Check resident memory against levels given in percent of total RAM."""
    warn_perc, crit_perc = params["resident_levels_perc"]

    if not total_ram:
        # Without the RAM size no percentage can be computed -> UNKNOWN.
        return 3, "percentual RAM levels configured, but total RAM is unknown"

    # resident_size is in kB, total_ram in bytes.
    resident_perc = 100 * float(processes.resident_size * 1024) / total_ram
    state, levelstext = upperlevels(resident_perc, warn_perc, crit_perc,
                                    get_percent_human_readable)
    infotext = "%s of total RAM%s" % (get_percent_human_readable(resident_perc), levelstext)
    return state, infotext
def cpu_check(percent_cpu, item, params):
    """Check levels for cpu utilization from given process.

    The scraped original lost the tail of the conditional expression
    computing (state, levelstext); restored here as a plain if/else.
    """
    infotext = "%.1f%% CPU" % percent_cpu
    # "cpulevels" may be a 3-tuple historically; only warn/crit are used.
    warn_cpu, crit_cpu = params.get("cpulevels", (None, None, None))[:2]
    perf_data = [("pcpu", percent_cpu, warn_cpu, crit_cpu)]

    # CPU might come with previous
    if "cpu_average" in params:
        now = time.time()
        avg_cpu = get_average("ps.%s.cpu" % item, now, percent_cpu, params["cpu_average"], False)
        infotext += " (%d min average: %.1f%%)" % (params["cpu_average"], avg_cpu)
        perf_data.append(("pcpuavg", avg_cpu, warn_cpu, crit_cpu, 0, params["cpu_average"]))
        percent_cpu = avg_cpu  # use this for level comparison

    if "cpulevels" in params:
        state, levelstext = upperlevels(percent_cpu, warn_cpu, crit_cpu,
                                        get_percent_human_readable)
    else:
        state, levelstext = 0, ""

    return state, infotext + levelstext, perf_data
def individual_process_check(processes, params):
    """Check the CPU utilization of every single matched process."""
    warn, crit = params["single_cpulevels"]

    for process in processes:
        cpu_usage, name, pid = 0.0, None, None

        for key, (value, _unit) in process:
            if key == "name":
                name = value
            if key == "pid":
                pid = value
            elif key.startswith("cpu usage"):
                # Windows reports separate user/kernel space fields; sum them.
                cpu_usage += value

        state, levelstext = upperlevels(cpu_usage, warn, crit,
                                        get_percent_human_readable)
        # NOTE(review): without a pid the description is empty and the
        # name is dropped — behavior preserved from the original.
        description = name + " with PID %s" % pid if pid else ""
        yield state, "%.1f%% CPU for %s%s" % (cpu_usage, description, levelstext)
def uptime_check(times, params):
    """Check how long the process is running"""
    state = 0

    if times.min_elapsed == times.max_elapsed:
        infotext = "running for %s" % get_age_human_readable(times.min_elapsed)
    else:
        infotext = "youngest running for {}, oldest running for {}".format(
            get_age_human_readable(times.min_elapsed),
            get_age_human_readable(times.max_elapsed))

    if "max_age" in params:
        warn_age, crit_age = params["max_age"]
        # Levels apply to the oldest instance.
        state, levelstext = upperlevels(times.max_elapsed, warn_age, crit_age,
                                        get_age_human_readable)
        infotext += levelstext

    return state, infotext
def handle_count_check(processes, params):
    """Check the total number of process handles (wmic/Windows only)."""
    handles = processes.handle_count
    warn, crit = params.get("handle_count", (None, None))

    if "handle_count" in params:
        state, levelstext = upperlevels(handles, warn, crit)
    else:
        state, levelstext = 0, ""

    infotext = "%d process handles%s" % (handles, levelstext)
    return state, infotext, [("process_handles", handles, warn, crit)]
def cpu_rate(counter, now, lifetime):
    # Wrap get_rate: a counter wrap (e.g. the first sample seen for this
    # PID) yields a usage of 0 instead of raising.
    try:
        return get_rate(counter, now, lifetime, onwrap=RAISE)
    except MKCounterWrapped:
        return 0
class ProcessAggregator(object):
    """Collects information about all instances of monitored processes.

    The scraped original lost the closing "))" of the process.append call
    in lifetimes(); restored here.
    """

    def __init__(self, cpu_cores, params):
        self.cpu_cores = cpu_cores
        self.params = params
        self.virtual_size = 0     # sum of virtual sizes in kB
        self.resident_size = 0    # sum of resident sizes in kB
        self.handle_count = 0
        self.percent_cpu = 0.0
        self.max_elapsed = None
        self.min_elapsed = None
        self.processes = []
        self.running_on_nodes = set()

    def __getitem__(self, item):
        return self.processes[item]

    @property
    def count(self):
        return len(self.processes)

    def append(self, process):
        self.processes.append(process)

    def core_weight(self, is_win):
        """Factor applied to per-process CPU usage for multi-core hosts."""
        cpu_rescale_max = self.params.get('cpu_rescale_max')

        # Rule not set up, only windows scaled
        if cpu_rescale_max is None and not is_win:
            return 1.0

        # Current rule is set. Explicitly ask not to divide
        if cpu_rescale_max is False:
            return 1.0

        # Use default of division
        return 1.0 / self.cpu_cores

    def lifetimes(self, process_info, process):
        """Track min/max elapsed time and record the creation time."""
        # process_info.cputime contains the used CPU time and possibly,
        # separated by /, also the total elapsed time since the birth of the
        # process.
        if '/' in process_info.cputime:
            elapsed_text = process_info.cputime.split('/')[1]
        else:
            # uptime is a windows only value, introduced in Werk 4029. For
            # future consistency should be moved to the cputime entry and
            # separated by a /
            if process_info.uptime:
                elapsed_text = process_info.uptime
            else:
                elapsed_text = None

        if elapsed_text:
            elapsed = parse_ps_time(elapsed_text)
            self.min_elapsed = min(self.min_elapsed or elapsed, elapsed)
            # NOTE(review): max(None, elapsed) relies on Python 2's
            # None-ordering; preserved as in the original.
            self.max_elapsed = max(self.max_elapsed, elapsed)

            now = time.time()
            creation_time_unix = int(now - elapsed)
            if creation_time_unix != 0:
                process.append((
                    "creation time",
                    (get_timestamp_human_readable(creation_time_unix), ""),
                ))

    def cpu_usage(self, process_info, process):
        """Compute per-process CPU usage and update the aggregate."""
        now = time.time()

        pcpu_text = process_info.cputime.split('/')[0]

        if ":" in pcpu_text:  # In linux is a time
            total_seconds = parse_ps_time(pcpu_text)
            pid = process_info.process_id
            cputime = cpu_rate("ps_stat.pcpu.%s" % pid, now, total_seconds)

            pcpu = cputime * 100 * self.core_weight(is_win=False)
            process.append(("pid", (pid, "")))

        # windows cpu times
        elif process_info.usermode_time and process_info.kernelmode_time:
            pid = process_info.process_id

            user_per_sec = cpu_rate("ps_wmic.user.%s" % pid, now, int(process_info.usermode_time))
            kernel_per_sec = cpu_rate("ps_wmic.kernel.%s" % pid, now,
                                      int(process_info.kernelmode_time))

            # On the first sample both rates wrap to 0; keep them paired.
            if not all([user_per_sec, kernel_per_sec]):
                user_per_sec = 0
                kernel_per_sec = 0

            core_weight = self.core_weight(is_win=True)
            user_perc = user_per_sec / 100000.0 * core_weight
            kernel_perc = kernel_per_sec / 100000.0 * core_weight
            pcpu = user_perc + kernel_perc
            process.append(("cpu usage (user space)", (user_perc, "%")))
            process.append(("cpu usage (kernel space)", (kernel_perc, "%")))
            process.append(("pid", (pid, "")))

        else:  # Solaris, BSD, aix cpu times
            if pcpu_text == '-':  # Solaris defunct
                pcpu_text = 0.0
            pcpu = float(pcpu_text) * self.core_weight(is_win=False)

        self.percent_cpu += pcpu
        process.append(("cpu usage", (pcpu, "%")))

        if process_info.pagefile:
            process.append(("pagefile usage", (process_info.pagefile, "")))

        if process_info.handles:
            self.handle_count += int(process_info.handles)
            process.append(("handle count", (int(process_info.handles), "")))
def check_ps_process_capture(parsed, params, cpu_cores):
    """Aggregate all parsed process lines that match the check parameters."""
    aggregator = ProcessAggregator(cpu_cores, params)

    for line in parsed:
        # First entry is the node name (None on non-cluster hosts).
        node_name, process_line = line[0], line[1:]
        process_info, command_line = process_line[0], process_line[1:]

        if not process_matches(process_line, params.get("process"), params.get("user"),
                               params.get('match_groups')):
            continue

        process = []

        if node_name is not None:
            aggregator.running_on_nodes.add(node_name)

        if command_line:
            process.append(("name", (command_line[0], "")))

        # extended performance data: virtualsize, residentsize, %cpu
        if all(process_info[1:4]):
            process.append(("user", (process_info.user, "")))
            process.append(("virtual size", (int(process_info.virtual), "kB")))
            process.append(("resident size", (int(process_info.physical), "kB")))

            aggregator.virtual_size += int(process_info.virtual)  # kB
            aggregator.resident_size += int(process_info.physical)  # kB

            aggregator.lifetimes(process_info, process)
            aggregator.cpu_usage(process_info, process)

        aggregator.append(process)

    return aggregator