# checks/ps.include — process (ps) check helpers shared by the ps checks (Check_MK)
1 #!/usr/bin/python
2 # -*- encoding: utf-8; py-indent-offset: 4 -*-
3 # +------------------------------------------------------------------+
4 # | ____ _ _ __ __ _ __ |
5 # | / ___| |__ ___ ___| | __ | \/ | |/ / |
6 # | | | | '_ \ / _ \/ __| |/ / | |\/| | ' / |
7 # | | |___| | | | __/ (__| < | | | | . \ |
8 # | \____|_| |_|\___|\___|_|\_\___|_| |_|_|\_\ |
9 # | |
10 # | Copyright Mathias Kettner 2014 mk@mathias-kettner.de |
11 # +------------------------------------------------------------------+
13 # This file is part of Check_MK.
14 # The official homepage is at http://mathias-kettner.de/check_mk.
16 # check_mk is free software; you can redistribute it and/or modify it
17 # under the terms of the GNU General Public License as published by
18 # the Free Software Foundation in version 2. check_mk is distributed
19 # in the hope that it will be useful, but WITHOUT ANY WARRANTY; with-
20 # out even the implied warranty of MERCHANTABILITY or FITNESS FOR A
21 # PARTICULAR PURPOSE. See the GNU General Public License for more de-
22 # tails. You should have received a copy of the GNU General Public
23 # License along with GNU Make; see the file COPYING. If not, write
24 # to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
25 # Boston, MA 02110-1301 USA.
# Sentinel: a discovery rule whose "user" field is False means "grab the
# user of the first matching process" instead of matching a fixed user.
GRAB_USER = False

# Default parameters: (warnmin, okmin, okmax, warnmax) process counts.
factory_settings["ps_default_levels"] = {
    "levels": (1, 1, 99999, 99999),
}

# One parsed "(...)" info field per process line.  All fields default to
# None because agents of different versions/platforms send different
# subsets of these columns.
ps_info = collections.namedtuple(
    "Process_Info", ('user', 'virtual', 'physical', 'cputime', 'process_id', 'pagefile',
                     'usermode_time', 'kernelmode_time', 'handles', 'threads', 'uptime'))

ps_info.__new__.__defaults__ = (None,) * len(ps_info._fields)
def ps_info_tuple(entry):
    """Parse a "(...)"-wrapped process info field into a ps_info tuple.

    Returns False when the entry is not wrapped in parentheses.
    """
    match = regex(r"^\((.*)\)$").match(entry)
    if not match:
        return False
    return ps_info(*match.group(1).split(","))
def ps_wato_configured_inventory_rules(invrules):
    """Normalize the WATO discovery rules matching this host into a list of
    (service description, match pattern, user spec, default params) tuples."""
    specs = []
    for rule in host_extra_conf(host_name(), invrules):
        # Newer rules keep the check defaults under 'default_params';
        # historic rules carry them directly in the rule dict itself.
        defaults = rule.get('default_params', rule)
        defaults.setdefault("cpu_rescale_max", None)

        specs.append((rule['descr'], rule.get('match'), rule.get('user'), defaults))

    return specs
def inventory_ps_common(invrules, parsed):
    """Discover ps services: match every parsed process line against every
    configured discovery rule and return a list of (servicedesc, params).

    The scraped original lost the closing brace of the inv_params dict;
    restored here.
    """
    inventory_specs = ps_wato_configured_inventory_rules(invrules)

    inventory = []
    for line in parsed:
        for servicedesc, pattern, userspec, default_params in inventory_specs:
            # First entry in line is the node name or None for non-clusters
            process_line = line[1:]
            matches = process_matches(process_line, pattern, userspec)
            if not matches:
                continue  # skip not matched lines

            # User capturing on rule: GRAB_USER means take the owner of
            # the matched process instead of a fixed user name.
            if userspec == GRAB_USER:
                i_userspec = process_line[0].user
            else:
                i_userspec = userspec

            i_servicedesc = servicedesc.replace("%u", i_userspec or "")

            # Process capture: a regex match object carries groups,
            # a plain boolean match does not.
            if hasattr(matches, 'groups'):
                match_groups = [g if g else "" for g in matches.groups()]
            else:
                match_groups = []

            i_servicedesc = replace_service_description(i_servicedesc, match_groups, pattern)

            # Problem here: We need to instantiate all subexpressions
            # with their actual values of the found process.
            inv_params = {
                "process": pattern,
                "match_groups": match_groups,
                "user": i_userspec,
            }

            # default_params is either a clean dict with optional
            # parameters to set as default or - from version 1.2.4 - the
            # dict from the rule itself. In the later case we need to remove
            # the keys that do not specify default parameters
            for key, value in default_params.items():
                if key not in ("descr", "match", "user", "perfdata"):
                    inv_params[key] = value

            inv = (i_servicedesc, inv_params)
            if inv not in inventory:
                inventory.append(inv)

    return inventory
def replace_service_description(service_description, match_groups, pattern):
    """Fill %1/%2/... and %s placeholders of a discovered service
    description with the regex match groups of the process pattern.

    Raises MKGeneralException when the description contains more %s
    placeholders than there are match groups.
    """
    # New in 1.2.2b4: %1, %2, ... select a specific match group, which
    # allows reordering.  Translate them into str.format fields first.
    description = re.sub(r'%(\d+)', r'{\1}', service_description)

    # Dummy zeroth argument keeps the format field numbers 1-based.
    description = description.format(None, *match_groups)

    placeholders = description.count('%s')
    if len(match_groups) < placeholders:
        raise MKGeneralException(
            "Invalid entry in inventory_processes_rules: "
            "service description '%s' contains "
            "%d replaceable elements, but "
            "regular expression '%s' contains only %d subexpression(s)." %
            (description, placeholders, pattern, len(match_groups)))

    # Allowed since 1.1.4: the pattern may contain more subexpressions
    # than the description uses; surplus groups are simply ignored.
    return description % tuple(match_groups[:placeholders])
def match_user(user, user_pattern):
    """Check a process owner against a user spec.

    Returns False on a mismatch and None otherwise; an empty/None pattern
    matches everybody.  A pattern starting with '~' is a regex.
    """
    if not user_pattern:
        return None

    if user_pattern.startswith('~'):
        if not regex(user_pattern[1:]).match(user):
            return False
    elif user_pattern != user:
        return False

    return None
def process_matches(process_line, process_pattern, user_pattern, match_groups=None):
    """Match one parsed process line against a process pattern and user spec.

    Returns False on mismatch; True (or a regex match object, when the
    pattern is a regex) on success.
    """
    info, command_line = process_line[0], process_line[1:]

    if match_user(info.user, user_pattern) is False:
        return False

    if not process_pattern:
        # Process name not relevant
        return True

    if process_pattern.startswith("~"):
        # Regex against the complete command line (skip the "~")
        found = regex(process_pattern[1:]).match(" ".join(command_line))
        if not found:
            return False
        if match_groups:
            # Re-discovery: only match when the captured groups are the
            # same as at discovery time.
            return found.groups() == tuple(match_groups)
        return found

    # Exact match on the name of the executable
    return command_line[0] == process_pattern
# produce text or html output intended for the long output field of a check
# from details about a process. the input is expected to be a list (one
# per process) of lists (one per data field) of key-value tuples where the
# value is again a 2-field tuple, first is the value, second is the unit.
# This function is actually fairly generic so it could be used for other
# data structured the same way
def format_process_list(processes, html_output):
    """Render the per-process details as plain text or as an HTML table.

    The scraped original lost the closing "]))" of the return expression;
    restored here.
    """
    def format_value(value):
        value, unit = value
        if isinstance(value, float):
            return "%.1f%s" % (value, unit)
        return "%s%s" % (value, unit)

    if html_output:
        table_bracket = "<table>%s</table>"
        line_bracket = "<tr>%s</tr>"
        # "%.0s" swallows the key: HTML cells carry only the value, the
        # key is shown in the header row instead.
        cell_bracket = "<td>%.0s%s</td>"
        cell_separator = ""

        # Collect all field names in order of first appearance.
        headers = []
        headers_found = set()
        for process in processes:
            for key, _value in process:
                if key not in headers_found:
                    headers.append(key)
                    headers_found.add(key)

        # make sure each process has all fields from the table
        processes_filled = []
        for process in processes:
            dictified = dict(process)
            processes_filled.append([(key, dictified.get(key, "")) for key in headers])
        processes = processes_filled
        header_line = "<tr><th>" + "</th><th>".join(headers) + "</th></tr>"
    else:
        table_bracket = "%s"
        line_bracket = "%s\r\n"
        cell_bracket = "%s %s"
        cell_separator = ", "
        header_line = ""

    return table_bracket % (header_line + "".join([
        line_bracket % cell_separator.join(
            [cell_bracket % (key, format_value(value))
             for key, value in process])
        for process in processes
    ]))
# Parse time as output by ps into seconds.
# Example 1: "12:17"
# Example 2: "55:12:17"
# Example 3: "7-12:34:59" (with 7 days)
# Example 4: "7123459" (only seconds, windows)
def parse_ps_time(text):
    """Convert a ps(1) time string (see examples above) into seconds."""
    days = 0
    if "-" in text:
        day_part, text = text.split("-")[:2]
        days = int(day_part or 0)

    # Right-most ":"-token is seconds, then minutes, then hours.
    seconds = 0
    for factor, token in zip([1, 60, 3600], reversed(text.split(":"))):
        seconds += factor * int(token or 0)

    return 86400 * days + seconds
# This function is repeated in cmk/gui/plugins/wato/check_parameters/ps.py
# Update that function too until we can import them
def ps_cleanup_params(params):
    """Normalize legacy ps check parameters into the current dict format.

    Handles three historic formats (the scrape lost two closing brackets,
    restored here):
      * tuple: (procname[, user], warnmin, okmin, okmax, warnmax)
      * dict with separate warnmin/okmin/okmax/warnmax keys
      * current dict carrying a "levels" 4-tuple
    """
    if isinstance(params, (list, tuple)):
        if len(params) == 5:
            procname, warnmin, okmin, okmax, warnmax = params
            user = None
        elif len(params) == 6:
            procname, user, warnmin, okmin, okmax, warnmax = params
        params = {
            "process": procname,
            "levels": (warnmin, okmin, okmax, warnmax),
            "user": user,
        }
    elif any(k in params for k in ['okmin', 'warnmin', 'okmax', 'warnmax']):
        # Fold the four separate boundary keys into one "levels" tuple.
        params["levels"] = (
            params.pop("warnmin", 1),
            params.pop("okmin", 1),
            params.pop("okmax", 99999),
            params.pop("warnmax", 99999),
        )

    if "cpu_rescale_max" not in params:
        params["cpu_rescale_max"] = None

    return params
def check_ps_common(item, params, parsed, cpu_cores=1, info_name="process", total_ram=None):
    """Run all ps subchecks: count, memory, CPU, handle count, uptime and
    (optionally) a detailed process listing in the long output."""
    params = ps_cleanup_params(params)
    procs = check_ps_process_capture(parsed, params, cpu_cores)

    # Process count with boundary levels.
    yield ps_count_check(procs, params, info_name)

    # Virtual/physical memory levels.
    for result in memory_check(procs, params):
        yield result

    # Resident memory as a percentage of total RAM, if configured.
    if procs.resident_size and "resident_levels_perc" in params:
        yield memory_perc_check(procs, params, total_ram)

    # Aggregated CPU utilization.
    if procs.count:
        yield cpu_check(procs.percent_cpu, item, params)

    # Per-process CPU levels, if configured.
    if "single_cpulevels" in params:
        for result in individual_process_check(procs, params):
            yield result

    # only check handle_count if provided by wmic counters
    if procs.handle_count:
        yield handle_count_check(procs, params)

    # Process age.
    if procs.min_elapsed is not None:
        yield uptime_check(procs, params)

    # Optional detailed process table in the long output.
    if params.get("process_info"):
        yield 0, "\n" + format_process_list(procs, params["process_info"] == "html")
def upperlevels(value, warn, crit, readable=str):
    """Compare a value against upper warn/crit levels.

    Returns (state, infotext); infotext names the levels when violated,
    rendered through `readable`.
    """
    if value >= crit:
        state = 2
    elif value >= warn:
        state = 1
    else:
        state = 0

    infotext = ''
    if state:
        infotext = ": (warn/crit at %s/%s)" % (readable(warn), readable(crit))

    return state, infotext
def boundary_levels(value, warnmin, okmin, okmax, warnmax):
    """Compare a value against a (warnmin, okmin, okmax, warnmax) corridor.

    Returns (state, infotext); infotext shows the OK range when violated.
    """
    if not warnmin <= value <= warnmax:
        state = 2
    elif not okmin <= value <= okmax:
        state = 1
    else:
        state = 0

    infotext = ''
    if state:
        infotext = ": (ok from %d to %d)" % (okmin, okmax)

    return state, infotext
def ps_count_check(processes, params, info_name):
    """Check the number of matched processes against the count corridor."""
    warnmin, okmin, okmax, warnmax = params["levels"]
    count = processes.count

    state, levelstext = boundary_levels(count, warnmin, okmin, okmax, warnmax)

    # Pluralize: "1 process" / "2 processes".
    suffix = '' if count == 1 else 'es'
    infotext = "%d %s%s%s" % (count, info_name, suffix, levelstext)

    if processes.running_on_nodes:
        nodes = ", ".join(sorted(processes.running_on_nodes))
        infotext += " [running on %s]" % nodes

    return state, infotext, [("count", count, okmax + 1, warnmax + 1, 0)]
def memory_check(processes, params):
    """Check levels for virtual and physical used memory"""
    for size, title, levels_key, metric in [
        (processes.virtual_size, "virtual", "virtual_levels", "vsz"),
        (processes.resident_size, "physical", "resident_levels", "rss"),
    ]:
        if size == 0:
            continue

        warn, crit = params.get(levels_key, (None, None))
        infotext = "%s %s" % (get_bytes_human_readable(size * 1024.0), title)

        if levels_key in params:
            # sizes are reported in kB, levels are configured in bytes
            state, levelstext = upperlevels(size * 1024, warn, crit,
                                            get_bytes_human_readable)
            infotext += levelstext
        else:
            state = 0

        yield state, infotext, [(metric, size, warn, crit)]
def memory_perc_check(processes, params, total_ram):
    """Check resident memory against levels given in percent of total RAM."""
    warn_perc, crit_perc = params["resident_levels_perc"]

    if not total_ram:
        # Without the RAM size no percentage can be computed -> UNKNOWN.
        return 3, "percentual RAM levels configured, but total RAM is unknown"

    # resident_size is in kB, total_ram in bytes.
    resident_perc = 100 * float(processes.resident_size * 1024) / total_ram
    state, levelstext = upperlevels(resident_perc, warn_perc, crit_perc,
                                    get_percent_human_readable)
    infotext = "%s of total RAM%s" % (get_percent_human_readable(resident_perc), levelstext)
    return state, infotext
def cpu_check(percent_cpu, item, params):
    """Check levels for cpu utilization from given process.

    The scraped original lost the tail of the conditional expression
    computing (state, levelstext); restored here as a plain if/else.
    """
    infotext = "%.1f%% CPU" % percent_cpu
    # "cpulevels" may be a 3-tuple historically; only warn/crit are used.
    warn_cpu, crit_cpu = params.get("cpulevels", (None, None, None))[:2]
    perf_data = [("pcpu", percent_cpu, warn_cpu, crit_cpu)]

    # CPU might come with previous
    if "cpu_average" in params:
        now = time.time()
        avg_cpu = get_average("ps.%s.cpu" % item, now, percent_cpu, params["cpu_average"], False)
        infotext += " (%d min average: %.1f%%)" % (params["cpu_average"], avg_cpu)
        perf_data.append(("pcpuavg", avg_cpu, warn_cpu, crit_cpu, 0, params["cpu_average"]))
        percent_cpu = avg_cpu  # use this for level comparison

    if "cpulevels" in params:
        state, levelstext = upperlevels(percent_cpu, warn_cpu, crit_cpu,
                                        get_percent_human_readable)
    else:
        state, levelstext = 0, ""

    return state, infotext + levelstext, perf_data
def individual_process_check(processes, params):
    """Check the CPU utilization of every single matched process."""
    warn, crit = params["single_cpulevels"]

    for process in processes:
        cpu_usage, name, pid = 0.0, None, None

        for key, (value, _unit) in process:
            if key == "name":
                name = value
            if key == "pid":
                pid = value
            elif key.startswith("cpu usage"):
                # Windows reports separate user/kernel space fields; sum them.
                cpu_usage += value

        state, levelstext = upperlevels(cpu_usage, warn, crit,
                                        get_percent_human_readable)
        # NOTE(review): without a pid the description is empty and the
        # name is dropped — behavior preserved from the original.
        description = name + " with PID %s" % pid if pid else ""
        yield state, "%.1f%% CPU for %s%s" % (cpu_usage, description, levelstext)
def uptime_check(times, params):
    """Check how long the process is running"""
    state = 0

    if times.min_elapsed == times.max_elapsed:
        infotext = "running for %s" % get_age_human_readable(times.min_elapsed)
    else:
        infotext = "youngest running for {}, oldest running for {}".format(
            get_age_human_readable(times.min_elapsed),
            get_age_human_readable(times.max_elapsed))

    if "max_age" in params:
        warn_age, crit_age = params["max_age"]
        # Levels apply to the oldest instance.
        state, levelstext = upperlevels(times.max_elapsed, warn_age, crit_age,
                                        get_age_human_readable)
        infotext += levelstext

    return state, infotext
def handle_count_check(processes, params):
    """Check the total number of process handles (wmic/Windows only)."""
    handles = processes.handle_count
    warn, crit = params.get("handle_count", (None, None))

    if "handle_count" in params:
        state, levelstext = upperlevels(handles, warn, crit)
    else:
        state, levelstext = 0, ""

    infotext = "%d process handles%s" % (handles, levelstext)
    return state, infotext, [("process_handles", handles, warn, crit)]
def cpu_rate(counter, now, lifetime):
    # Wrap get_rate: a counter wrap (e.g. the first sample seen for this
    # PID) yields a usage of 0 instead of raising.
    try:
        return get_rate(counter, now, lifetime, onwrap=RAISE)
    except MKCounterWrapped:
        return 0
class ProcessAggregator(object):
    """Collects information about all instances of monitored processes.

    The scraped original lost the closing "))" of the process.append call
    in lifetimes(); restored here.
    """

    def __init__(self, cpu_cores, params):
        self.cpu_cores = cpu_cores
        self.params = params
        self.virtual_size = 0     # sum of virtual sizes in kB
        self.resident_size = 0    # sum of resident sizes in kB
        self.handle_count = 0
        self.percent_cpu = 0.0
        self.max_elapsed = None
        self.min_elapsed = None
        self.processes = []
        self.running_on_nodes = set()

    def __getitem__(self, item):
        return self.processes[item]

    @property
    def count(self):
        return len(self.processes)

    def append(self, process):
        self.processes.append(process)

    def core_weight(self, is_win):
        """Factor applied to per-process CPU usage for multi-core hosts."""
        cpu_rescale_max = self.params.get('cpu_rescale_max')

        # Rule not set up, only windows scaled
        if cpu_rescale_max is None and not is_win:
            return 1.0

        # Current rule is set. Explicitly ask not to divide
        if cpu_rescale_max is False:
            return 1.0

        # Use default of division
        return 1.0 / self.cpu_cores

    def lifetimes(self, process_info, process):
        """Track min/max elapsed time and record the creation time."""
        # process_info.cputime contains the used CPU time and possibly,
        # separated by /, also the total elapsed time since the birth of the
        # process.
        if '/' in process_info.cputime:
            elapsed_text = process_info.cputime.split('/')[1]
        else:
            # uptime is a windows only value, introduced in Werk 4029. For
            # future consistency should be moved to the cputime entry and
            # separated by a /
            if process_info.uptime:
                elapsed_text = process_info.uptime
            else:
                elapsed_text = None

        if elapsed_text:
            elapsed = parse_ps_time(elapsed_text)
            self.min_elapsed = min(self.min_elapsed or elapsed, elapsed)
            # NOTE(review): max(None, elapsed) relies on Python 2's
            # None-ordering; preserved as in the original.
            self.max_elapsed = max(self.max_elapsed, elapsed)

            now = time.time()
            creation_time_unix = int(now - elapsed)
            if creation_time_unix != 0:
                process.append((
                    "creation time",
                    (get_timestamp_human_readable(creation_time_unix), ""),
                ))

    def cpu_usage(self, process_info, process):
        """Compute per-process CPU usage and update the aggregate."""
        now = time.time()

        pcpu_text = process_info.cputime.split('/')[0]

        if ":" in pcpu_text:  # In linux is a time
            total_seconds = parse_ps_time(pcpu_text)
            pid = process_info.process_id
            cputime = cpu_rate("ps_stat.pcpu.%s" % pid, now, total_seconds)

            pcpu = cputime * 100 * self.core_weight(is_win=False)
            process.append(("pid", (pid, "")))

        # windows cpu times
        elif process_info.usermode_time and process_info.kernelmode_time:
            pid = process_info.process_id

            user_per_sec = cpu_rate("ps_wmic.user.%s" % pid, now, int(process_info.usermode_time))
            kernel_per_sec = cpu_rate("ps_wmic.kernel.%s" % pid, now,
                                      int(process_info.kernelmode_time))

            # On the first sample both rates wrap to 0; keep them paired.
            if not all([user_per_sec, kernel_per_sec]):
                user_per_sec = 0
                kernel_per_sec = 0

            core_weight = self.core_weight(is_win=True)
            user_perc = user_per_sec / 100000.0 * core_weight
            kernel_perc = kernel_per_sec / 100000.0 * core_weight
            pcpu = user_perc + kernel_perc
            process.append(("cpu usage (user space)", (user_perc, "%")))
            process.append(("cpu usage (kernel space)", (kernel_perc, "%")))
            process.append(("pid", (pid, "")))

        else:  # Solaris, BSD, aix cpu times
            if pcpu_text == '-':  # Solaris defunct
                pcpu_text = 0.0
            pcpu = float(pcpu_text) * self.core_weight(is_win=False)

        self.percent_cpu += pcpu
        process.append(("cpu usage", (pcpu, "%")))

        if process_info.pagefile:
            process.append(("pagefile usage", (process_info.pagefile, "")))

        if process_info.handles:
            self.handle_count += int(process_info.handles)
            process.append(("handle count", (int(process_info.handles), "")))
def check_ps_process_capture(parsed, params, cpu_cores):
    """Aggregate all parsed process lines that match the check parameters."""
    aggregator = ProcessAggregator(cpu_cores, params)

    for line in parsed:
        # First entry is the node name (None on non-cluster hosts).
        node_name, process_line = line[0], line[1:]
        process_info, command_line = process_line[0], process_line[1:]

        if not process_matches(process_line, params.get("process"), params.get("user"),
                               params.get('match_groups')):
            continue

        process = []

        if node_name is not None:
            aggregator.running_on_nodes.add(node_name)

        if command_line:
            process.append(("name", (command_line[0], "")))

        # extended performance data: virtualsize, residentsize, %cpu
        if all(process_info[1:4]):
            process.append(("user", (process_info.user, "")))
            process.append(("virtual size", (int(process_info.virtual), "kB")))
            process.append(("resident size", (int(process_info.physical), "kB")))

            aggregator.virtual_size += int(process_info.virtual)  # kB
            aggregator.resident_size += int(process_info.physical)  # kB

            aggregator.lifetimes(process_info, process)
            aggregator.cpu_usage(process_info, process)

        aggregator.append(process)

    return aggregator