# -*- encoding: utf-8; py-indent-offset: 4 -*-
# +------------------------------------------------------------------+
# |             ____ _               _        __  __ _  __           |
# |            / ___| |__   ___  ___| | __   |  \/  | |/ /           |
# |           | |   | '_ \ / _ \/ __| |/ /   | |\/| | ' /            |
# |           | |___| | | |  __/ (__|   <    | |  | | . \            |
# |            \____|_| |_|\___|\___|_|\_\___|_|  |_|_|\_\           |
# |                                                                  |
# | Copyright Mathias Kettner 2014             mk@mathias-kettner.de |
# +------------------------------------------------------------------+
#
# This file is part of Check_MK.
# The official homepage is at http://mathias-kettner.de/check_mk.
#
# check_mk is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by
# the Free Software Foundation in version 2. check_mk is distributed
# in the hope that it will be useful, but WITHOUT ANY WARRANTY; with-
# out even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE. See the GNU General Public License for more de-
# tails. You should have received a copy of the GNU General Public
# License along with GNU Make; see the file COPYING. If not, write
# to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
# Boston, MA 02110-1301 USA.

# Names such as factory_settings, regex, host_extra_conf, host_name, GRAB_USER,
# get_rate, get_average, MKGeneralException and the *_human_readable helpers
# are expected to be provided by the Check_MK check API at load time; only the
# standard library modules used below are imported explicitly.
import collections
import re
import time

factory_settings["ps_default_levels"] = {
    "levels": (1, 1, 99999, 99999),
}

ps_info = collections.namedtuple(
    "Process_Info", ('user', 'virtual', 'physical', 'cputime', 'process_id', 'pagefile',
                     'usermode_time', 'kernelmode_time', 'handles', 'threads', 'uptime'))

ps_info.__new__.__defaults__ = (None,) * len(ps_info._fields)


def ps_info_tuple(entry):
    ps_tuple_re = regex(r"^\((.*)\)$")
    matched_ps_info = ps_tuple_re.match(entry)
    if matched_ps_info:
        return ps_info(*matched_ps_info.group(1).split(","))
    return False
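

# A quick illustration (hypothetical agent data, not from this file): the agent
# wraps the ps status columns in parentheses, so parsing works roughly like
#
#   >>> info = ps_info_tuple("(root,4056,1512,00:00:05/27:50,1)")
#   >>> (info.user, info.virtual, info.physical, info.cputime, info.process_id)
#   ('root', '4056', '1512', '00:00:05/27:50', '1')
#
# The remaining fields keep their default of None and all parsed values stay
# strings until the consumers below convert them.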


def ps_wato_configured_inventory_rules(invrules):
    inventory_specs = []
    for value in host_extra_conf(host_name(), invrules):
        default_params = value.get('default_params', value)
        if "cpu_rescale_max" not in default_params:
            default_params["cpu_rescale_max"] = None

        inventory_specs.append((value['descr'], value.get('match'), value.get('user'),
                                default_params))
    return inventory_specs
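

# For orientation, a sketch of the rule-to-spec translation (the rule values
# are made up for illustration): a WATO rule like
#
#   {"descr": "NTP daemon", "match": "~.*ntpd", "user": None,
#    "default_params": {"levels": (1, 1, 2, 2)}}
#
# would be turned into the spec tuple
#
#   ("NTP daemon", "~.*ntpd", None, {"levels": (1, 1, 2, 2), "cpu_rescale_max": None})
#
# i.e. the rule's default_params are passed through, with cpu_rescale_max
# forced to an explicit value so later checks can distinguish "unset" from False.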


def inventory_ps_common(invrules, parsed):
    inventory_specs = ps_wato_configured_inventory_rules(invrules)

    inventory = []
    for line in parsed:
        for servicedesc, pattern, userspec, default_params in inventory_specs:
            # First entry in line is the node name or None for non-clusters
            process_line = line[1:]
            matches = process_matches(process_line, pattern, userspec)
            if not matches:
                continue  # skip not matched lines

            # User capturing on rule
            if userspec == GRAB_USER:
                i_userspec = process_line[0].user
            else:
                i_userspec = userspec

            i_servicedesc = servicedesc.replace("%u", i_userspec or "")

            match_groups = []
            if hasattr(matches, 'groups'):
                match_groups = [g if g else "" for g in matches.groups()]

            i_servicedesc = replace_service_description(i_servicedesc, match_groups, pattern)

            # Problem here: We need to instantiate all subexpressions
            # with their actual values of the found process.
            inv_params = {
                "process": pattern,
                "match_groups": match_groups,
                "user": i_userspec,
            }

            # default_params is either a clean dict with optional
            # parameters to set as default or - from version 1.2.4 - the
            # dict from the rule itself. In the latter case we need to remove
            # the keys that do not specify default parameters
            for key, value in default_params.items():
                if key not in ("descr", "match", "user", "perfdata"):
                    inv_params[key] = value

            inv = (i_servicedesc, inv_params)

            if inv not in inventory:
                inventory.append(inv)

    return inventory


def replace_service_description(service_description, match_groups, pattern):

    # New in 1.2.2b4: All %1, %2, etc. are replaced with the first, second, ...
    # match group. This allows a reordering of the matched groups.
    service_description = re.sub(r'%(\d+)', r'{\1}', service_description)

    # First argument is None, because format is zero indexed
    service_description = service_description.format(None, *match_groups)

    num_elements_replace = service_description.count('%s')
    if len(match_groups) < num_elements_replace:
        raise MKGeneralException(
            "Invalid entry in inventory_processes_rules: "
            "service description '%s' contains "
            "%d replaceable elements, but "
            "regular expression '%s' contains only %d subexpression(s)." %
            (service_description, num_elements_replace, pattern, len(match_groups)))

    # It is allowed (since 1.1.4) that the pattern contains more subexpressions
    # than the service description. In that case only the first
    # subexpressions are used as item.
    service_description = service_description % tuple(match_groups[:num_elements_replace])

    return service_description
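

# A worked example of the two placeholder styles (pattern and description are
# hypothetical, not from any shipped rule): with a rule pattern matching
# "/usr/lib/mail/worker 7" into the groups ("mail", "7"),
#
#   >>> replace_service_description("job %2 of %1", ["mail", "7"], pattern)
#   'job 7 of mail'
#   >>> replace_service_description("worker %s", ["mail", "7"], pattern)
#   'worker mail'
#
# %N placeholders may reorder groups freely, while %s placeholders consume the
# groups from the left; extra groups beyond the %s count are simply ignored.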


def match_user(user, user_pattern):
    if not user_pattern:
        return True
    if user_pattern.startswith('~'):
        if not regex(user_pattern[1:]).match(user):
            return False
    elif user_pattern != user:
        return False
    return True


def process_matches(process_line, process_pattern, user_pattern, match_groups=None):
    user, command_line = process_line[0].user, process_line[1:]

    if match_user(user, user_pattern) is False:
        return False

    if not process_pattern:
        # Process name not relevant
        return True

    elif process_pattern.startswith("~"):
        # Regex for complete process command line
        reg = regex(process_pattern[1:])  # skip "~"
        m = reg.match(" ".join(command_line))
        if not m:
            return False
        if match_groups:
            return m.groups() == tuple(match_groups)
        return m

    # Exact match on name of executable
    return command_line[0] == process_pattern
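

# Rough usage sketch (the command line is invented for illustration): with
# process_line = [ps_info_tuple("(ntp,3012,1160,00:00:01)"), "/usr/sbin/ntpd",
# "-p", "/var/run/ntpd.pid"], an exact pattern only compares the executable,
#
#   >>> process_matches(process_line, "/usr/sbin/ntpd", None)
#   True
#
# while a "~" pattern is matched against the joined command line and returns
# the match object, so capture groups can be reused for service descriptions:
#
#   >>> process_matches(process_line, "~/usr/sbin/ntpd -p (.*)", None).groups()
#   ('/var/run/ntpd.pid',)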


# Produce text or html output intended for the long output field of a check
# from details about a process. The input is expected to be a list (one
# per process) of lists (one per data field) of key-value tuples where the
# value is again a 2-field tuple, first is the value, second is the unit.
# This function is actually fairly generic so it could be used for other
# data structured the same way.
def format_process_list(processes, html_output):
    def format_value(value):
        value, unit = value
        if isinstance(value, float):
            return "%.1f%s" % (value, unit)
        return "%s%s" % (value, unit)

    if html_output:
        table_bracket = "<table>%s</table>"
        line_bracket = "<tr>%s</tr>"
        cell_bracket = "<td>%.0s%s</td>"  # %.0s swallows the key; the header row already names it
        cell_seperator = ""

        headers = []
        headers_found = set()

        for process in processes:
            for key, value in process:
                if key not in headers_found:
                    headers.append(key)
                    headers_found.add(key)

        # make sure each process has all fields from the table
        processes_filled = []
        for process in processes:
            dictified = dict(process)
            processes_filled.append([(key, dictified.get(key, ("", ""))) for key in headers])
        processes = processes_filled
        header_line = "<tr><th>" + "</th><th>".join(headers) + "</th></tr>"
    else:
        table_bracket = "%s"
        line_bracket = "%s\r\n"
        cell_bracket = "%s %s"
        cell_seperator = ", "
        header_line = ""

    return table_bracket % (header_line + "".join([
        line_bracket % cell_seperator.join(
            [cell_bracket % (key, format_value(value))
             for key, value in process])
        for process in processes
    ]))
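

# A small, hypothetical example of the expected input shape and the plain-text
# rendering (the HTML branch wraps the same data in a <table> instead):
#
#   >>> procs = [[("name", ("ntpd", "")), ("cpu usage", (1.5, "%"))]]
#   >>> format_process_list(procs, html_output=False)
#   'name ntpd, cpu usage 1.5%\r\n'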


# Parse time as output by ps into seconds.
# Examples: "55:12:17"
#           "7-12:34:59" (with 7 days)
#           "7123459" (only seconds, windows)
def parse_ps_time(text):
    if "-" in text:
        tokens = text.split("-")
        days = int(tokens[0] or 0)
        text = tokens[1]
    else:
        days = 0

    day_secs = sum(
        [factor * int(v or 0) for factor, v in zip([1, 60, 3600], reversed(text.split(":")))])

    return 86400 * days + day_secs
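

# To make the unit handling concrete (values are arbitrary): "55:12:17" is
# hours:minutes:seconds, so parse_ps_time("55:12:17") == 55 * 3600 + 12 * 60 + 17
# == 198737, and a day prefix is simply added on top, so
# parse_ps_time("7-12:34:59") == 7 * 86400 + 12 * 3600 + 34 * 60 + 59 == 650099.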


# This function is repeated in cmk/gui/plugins/wato/check_parameters/ps.py
# Update that function too until we can import them
def ps_cleanup_params(params):
    # New parameter format: dictionary. Example:
    # {
    #     "user"    : "foo",
    #     "process" : "/usr/bin/food",
    #     "warnmin" : 1,
    #     "okmin"   : 1,
    #     "okmax"   : 99999,
    #     "warnmax" : 99999,
    # }
    #
    # Even newer format:
    # {
    #     "user"   : "foo",
    #     "levels" : (1, 1, 99999, 99999),
    # }
    if isinstance(params, (list, tuple)):
        if len(params) == 5:
            procname, warnmin, okmin, okmax, warnmax = params
            user = None
        elif len(params) == 6:
            procname, user, warnmin, okmin, okmax, warnmax = params

        params = {
            "process": procname,
            "levels": (warnmin, okmin, okmax, warnmax),
            "user": user,
        }

    elif any(k in params for k in ['okmin', 'warnmin', 'okmax', 'warnmax']):
        params["levels"] = (
            params.pop("warnmin", 1),
            params.pop("okmin", 1),
            params.pop("okmax", 99999),
            params.pop("warnmax", 99999),
        )

    if "cpu_rescale_max" not in params:
        params["cpu_rescale_max"] = None

    return params


def check_ps_common(item, params, parsed, cpu_cores=1, info_name="process", total_ram=None):
    params = ps_cleanup_params(params)

    processes = check_ps_process_capture(parsed, params, cpu_cores)

    yield ps_count_check(processes, params, info_name)

    for memory_state in memory_check(processes, params):
        yield memory_state

    if processes.resident_size and "resident_levels_perc" in params:
        yield memory_perc_check(processes, params, total_ram)

    yield cpu_check(processes.percent_cpu, item, params)

    if "single_cpulevels" in params:
        for ps_state in individual_process_check(processes, params):
            yield ps_state

    # only check handle_count if provided by wmic counters
    if processes.handle_count:
        yield handle_count_check(processes, params)

    if processes.min_elapsed is not None:
        yield uptime_check(processes, params)

    if params.get("process_info", None):
        infotext = "\n" + format_process_list(processes, params["process_info"] == "html")
        yield 0, infotext


def upperlevels(value, warn, crit, readable=str):
    """Compare a value against upper warn/crit levels and return (state, infotext)"""
    if crit is not None and value >= crit:
        state = 2
    elif warn is not None and value >= warn:
        state = 1
    else:
        state = 0

    infotext = ""
    if state:
        infotext = ": (warn/crit at %s/%s)" % tuple(map(readable, (warn, crit)))

    return state, infotext
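

# For example (purely illustrative numbers), with warn=80.0 and crit=90.0 a
# value of 85.0 yields state 1 and the suffix ": (warn/crit at 80.0/90.0)",
# while a value below warn yields (0, "") so callers can append the suffix to
# their infotext unconditionally. The readable callback only controls how the
# thresholds are rendered, e.g. get_percent_human_readable or
# get_bytes_human_readable.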


def boundary_levels(value, warnmin, okmin, okmax, warnmax):
    """Compare a count against a (warnmin, okmin, okmax, warnmax) boundary tuple"""
    if value > warnmax or value < warnmin:
        state = 2
    elif value > okmax or value < okmin:
        state = 1
    else:
        state = 0

    infotext = ""
    if state:
        infotext = ": (ok from %d to %d)" % (okmin, okmax)

    return state, infotext
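

# With the default "levels": (1, 1, 99999, 99999) this means: zero matching
# processes is CRIT (0 < warnmin), one or more is OK. A stricter, hypothetical
# tuple like (1, 1, 2, 3) would report 1-2 processes as OK, 3 as WARN with
# ": (ok from 1 to 2)" appended, and 4 or more (or none) as CRIT.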


def ps_count_check(processes, params, info_name):
    warnmin, okmin, okmax, warnmax = params["levels"]
    count = processes.count

    perfdata = [("count", count, okmax + 1, warnmax + 1, 0)]
    infotext = "%d %s%s" % (count, info_name, '' if count == 1 else 'es')

    state, warntext = boundary_levels(count, warnmin, okmin, okmax, warnmax)
    infotext += warntext

    if processes.running_on_nodes:
        infotext += " [running on %s]" % ", ".join(sorted(processes.running_on_nodes))

    return state, infotext, perfdata


def memory_check(processes, params):
    """Check levels for virtual and physical used memory"""
    for size, title, levels, metric in [
        (processes.virtual_size, "virtual", "virtual_levels", "vsz"),
        (processes.resident_size, "physical", "resident_levels", "rss"),
    ]:
        if not size:
            continue

        infotext = "%s %s" % (get_bytes_human_readable(size * 1024.0), title)
        warn_levels, crit_levels = params.get(levels, (None, None))
        if levels in params:
            state, levelstext = upperlevels(size * 1024, warn_levels, crit_levels,
                                            get_bytes_human_readable)
        else:
            state, levelstext = 0, ""
        yield state, infotext + levelstext, [(metric, size, warn_levels, crit_levels)]


def memory_perc_check(processes, params, total_ram):
    """Check levels that are in percent of the total RAM of the host"""
    warn_perc, crit_perc = params["resident_levels_perc"]
    if not total_ram:
        state = 3
        infotext = "percentual RAM levels configured, but total RAM is unknown"
    else:
        resident_perc = 100 * float(processes.resident_size * 1024) / total_ram
        infotext = "%s of total RAM" % get_percent_human_readable(resident_perc)
        state, levelstext = upperlevels(resident_perc, warn_perc, crit_perc,
                                        get_percent_human_readable)
        infotext += levelstext

    return state, infotext


def cpu_check(percent_cpu, item, params):
    """Check levels for cpu utilization from given process"""

    infotext = "%.1f%% CPU" % percent_cpu
    warn_cpu, crit_cpu = params.get("cpulevels", (None, None, None))[:2]
    perf_data = [("pcpu", percent_cpu, warn_cpu, crit_cpu)]

    # CPU utilization may also be averaged over a configured period
    if "cpu_average" in params:
        now = time.time()
        avg_cpu = get_average("ps.%s.cpu" % item, now, percent_cpu, params["cpu_average"], False)
        infotext += " (%d min average: %.1f%%)" % (params["cpu_average"], avg_cpu)
        perf_data.append(("pcpuavg", avg_cpu, warn_cpu, crit_cpu, 0, params["cpu_average"]))
        percent_cpu = avg_cpu  # use this for level comparison

    if "cpulevels" in params:
        state, levelstext = upperlevels(percent_cpu, warn_cpu, crit_cpu,
                                        get_percent_human_readable)
    else:
        state, levelstext = 0, ""

    return state, infotext + levelstext, perf_data


def individual_process_check(processes, params):
    warn_cpu_single, crit_cpu_single = params["single_cpulevels"]
    for p in processes:
        cpu_usage, name, pid = 0.0, None, None

        for the_item, (value, _unit) in p:
            if the_item == "name":
                name = value
            if the_item == "pid":
                pid = value
            elif the_item.startswith("cpu usage"):
                cpu_usage += value

        state, levelstext = upperlevels(cpu_usage, warn_cpu_single, crit_cpu_single,
                                        get_percent_human_readable)
        process_description = name + (" with PID %s" % pid if pid else "")
        infotext = "%.1f%% CPU for %s%s" % (cpu_usage, process_description, levelstext)
        yield state, infotext


def uptime_check(times, params):
    """Check how long the process is running"""
    state = 0
    if times.min_elapsed == times.max_elapsed:
        infotext = "running for %s" % get_age_human_readable(times.min_elapsed)
    else:
        infotext = "youngest running for {}, oldest running for {}".format(
            *map(get_age_human_readable, [times.min_elapsed, times.max_elapsed]))

    if "max_age" in params:
        warn_age, crit_age = params["max_age"]
        state, levelstext = upperlevels(times.max_elapsed, warn_age, crit_age,
                                        get_age_human_readable)
        infotext += levelstext

    return state, infotext


def handle_count_check(processes, params):
    infotext = "%d process handles" % processes.handle_count
    warn_handle, crit_handle = params.get("handle_count", (None, None))
    if "handle_count" in params:
        state, levelstext = upperlevels(processes.handle_count, warn_handle, crit_handle)
    else:
        state, levelstext = 0, ""
    return state, infotext + levelstext, [("process_handles", processes.handle_count, warn_handle,
                                           crit_handle)]


def cpu_rate(counter, now, lifetime):
    try:
        return get_rate(counter, now, lifetime, onwrap=RAISE)
    except MKCounterWrapped:
        return 0


class ProcessAggregator(object):
    """Collects information about all instances of monitored processes"""

    def __init__(self, cpu_cores, params):
        self.cpu_cores = cpu_cores
        self.params = params
        self.virtual_size = 0
        self.resident_size = 0
        self.handle_count = 0
        self.percent_cpu = 0.0
        self.max_elapsed = None
        self.min_elapsed = None
        self.processes = []
        self.running_on_nodes = set()

    def __getitem__(self, item):
        return self.processes[item]

    @property
    def count(self):
        return len(self.processes)

    def append(self, process):
        self.processes.append(process)

    def core_weight(self, is_win):
        cpu_rescale_max = self.params.get('cpu_rescale_max')

        # Rule not set up, only windows is scaled by default
        if cpu_rescale_max is None and not is_win:
            return 1.0

        # Current rule is set. Explicitly ask not to divide
        if cpu_rescale_max is False:
            return 1.0

        # Use default of division
        return 1.0 / self.cpu_cores

    def lifetimes(self, process_info, process):
        # process_info.cputime contains the used CPU time and possibly,
        # separated by /, also the total elapsed time since the birth of the
        # process.
        if '/' in process_info.cputime:
            elapsed_text = process_info.cputime.split('/')[1]
        else:
            # uptime is a windows only value, introduced in Werk 4029. For
            # future consistency it should be moved to the cputime entry and
            # separated by a /
            if process_info.uptime:
                elapsed_text = process_info.uptime
            else:
                elapsed_text = None

        if elapsed_text:
            elapsed = parse_ps_time(elapsed_text)
            self.min_elapsed = min(self.min_elapsed or elapsed, elapsed)
            self.max_elapsed = max(self.max_elapsed, elapsed)

            now = time.time()
            creation_time_unix = int(now - elapsed)
            if creation_time_unix != 0:
                process.append(("creation time",
                                (get_timestamp_human_readable(creation_time_unix), "")))

    def cpu_usage(self, process_info, process):

        now = time.time()

        pcpu_text = process_info.cputime.split('/')[0]

        if ":" in pcpu_text:  # On Linux this is a CPU time
            total_seconds = parse_ps_time(pcpu_text)
            pid = process_info.process_id
            cputime = cpu_rate("ps_stat.pcpu.%s" % pid, now, total_seconds)

            pcpu = cputime * 100 * self.core_weight(is_win=False)
            process.append(("pid", (pid, "")))

        # windows cpu times
        elif process_info.usermode_time and process_info.kernelmode_time:
            pid = process_info.process_id

            user_per_sec = cpu_rate("ps_wmic.user.%s" % pid, now, int(process_info.usermode_time))
            kernel_per_sec = cpu_rate("ps_wmic.kernel.%s" % pid, now,
                                      int(process_info.kernelmode_time))

            if not all([user_per_sec, kernel_per_sec]):
                return

            core_weight = self.core_weight(is_win=True)
            user_perc = user_per_sec / 100000.0 * core_weight
            kernel_perc = kernel_per_sec / 100000.0 * core_weight
            pcpu = user_perc + kernel_perc
            process.append(("cpu usage (user space)", (user_perc, "%")))
            process.append(("cpu usage (kernel space)", (kernel_perc, "%")))
            process.append(("pid", (pid, "")))

        else:  # Solaris, BSD, AIX cpu times
            if pcpu_text == '-':  # Solaris defunct
                pcpu_text = 0.0
            pcpu = float(pcpu_text) * self.core_weight(is_win=False)

        self.percent_cpu += pcpu
        process.append(("cpu usage", (pcpu, "%")))

        if process_info.pagefile:
            process.append(("pagefile usage", (process_info.pagefile, "")))

        if process_info.handles:
            self.handle_count += int(process_info.handles)
            process.append(("handle count", (int(process_info.handles), "")))


def check_ps_process_capture(parsed, params, cpu_cores):

    ps_aggregator = ProcessAggregator(cpu_cores, params)

    for line in parsed:
        node_name, process_line = line[0], line[1:]
        process_info, command_line = process_line[0], process_line[1:]

        if process_matches(process_line, params.get("process"), params.get("user"),
                           params.get('match_groups')):
            process = []

            if node_name is not None:
                ps_aggregator.running_on_nodes.add(node_name)

            if command_line:
                process.append(("name", (command_line[0], "")))

            # extended performance data: virtualsize, residentsize, %cpu
            if all(process_info[1:4]):
                process.append(("user", (process_info.user, "")))
                process.append(("virtual size", (int(process_info.virtual), "kB")))
                process.append(("resident size", (int(process_info.physical), "kB")))

                ps_aggregator.virtual_size += int(process_info.virtual)  # kB
                ps_aggregator.resident_size += int(process_info.physical)  # kB

                ps_aggregator.lifetimes(process_info, process)
                ps_aggregator.cpu_usage(process_info, process)

            ps_aggregator.append(process)

    return ps_aggregator
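

# For reference, a minimal sketch of the parsed structure this function expects
# (all values are hypothetical): each line starts with the node name (or None
# outside clusters), followed by the Process_Info tuple and the command line,
# e.g.
#
#   parsed = [
#       [None, ps_info_tuple("(root,4056,1512,00:00:05/27:50,1)"),
#        "/usr/sbin/ntpd", "-p", "/var/run/ntpd.pid"],
#   ]
#
# Every matching line contributes one per-process detail list to the returned
# ProcessAggregator, whose totals (count, virtual_size, resident_size,
# percent_cpu, ...) are what the check functions above evaluate.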