Cleanup config.nodes_of
[check_mk.git] / checks / cpu_util.include
bloba7a6621b8daab384f282c418f47f0cfda565e1a3
1 #!/usr/bin/python
2 # -*- encoding: utf-8; py-indent-offset: 4 -*-
3 # +------------------------------------------------------------------+
4 # | ____ _ _ __ __ _ __ |
5 # | / ___| |__ ___ ___| | __ | \/ | |/ / |
6 # | | | | '_ \ / _ \/ __| |/ / | |\/| | ' / |
7 # | | |___| | | | __/ (__| < | | | | . \ |
8 # | \____|_| |_|\___|\___|_|\_\___|_| |_|_|\_\ |
9 # | |
10 # | Copyright Mathias Kettner 2014 mk@mathias-kettner.de |
11 # +------------------------------------------------------------------+
13 # This file is part of Check_MK.
14 # The official homepage is at http://mathias-kettner.de/check_mk.
16 # check_mk is free software; you can redistribute it and/or modify it
17 # under the terms of the GNU General Public License as published by
18 # the Free Software Foundation in version 2. check_mk is distributed
19 # in the hope that it will be useful, but WITHOUT ANY WARRANTY; with-
20 # out even the implied warranty of MERCHANTABILITY or FITNESS FOR A
21 # PARTICULAR PURPOSE. See the GNU General Public License for more de-
22 # tails. You should have received a copy of the GNU General Public
23 # License along with GNU Make; see the file COPYING. If not, write
24 # to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
25 # Boston, MA 02110-1301 USA.
27 # Common file for all (modern) checks that check CPU utilization (not load!)
29 # Example for check parameters:
30 # 1. Variant: Tuple (warn, crit). This is legacy style
31 # 2. Variant: dictionary:
33 # param = {
34 # "util" : .... --> compatible with check_levels(), optional
35 # "average" : 15 # -> compute average for 15 minutes, optional
36 # }
39 # This one can handle user, system and wait. values is a list of:
40 # - 0 - name: name of core
41 # - 1 - user: normal processes executing in user mode
42 # - 2 - nice: niced processes executing in user mode
43 # - 3 - system: processes executing in kernel mode
44 # - 4 - idle: twiddling thumbs
45 # - 5 - iowait: waiting for I/O to complete
46 # - 6 - irq: servicing interrupts
47 # - 7 - softirq: servicing softirqs
48 # - 8 - steal: involuntary wait
49 # - 9 - guest: time spent running a guest OS, also counted in 1 (user)
50 # -10 - guest_nice: time spent running a niced guest OS, also counted in 2 (nice)
class CpuInfo(
        collections.namedtuple(
            "CPU_utilization",
            "name user nice system idle iowait irq softirq steal guest guest_nice")):
    """One row of CPU counters: a name followed by ten numeric fields.

    The derived properties below only combine the stored fields; they do not
    care whether the fields hold absolute counters or differences.
    """
    __slots__ = ()

    @property
    def util_total(self):
        """Sum of every non-idle field (guest fields are not added on top)."""
        busy = self.user
        for part in (self.nice, self.system, self.iowait, self.irq, self.softirq, self.steal):
            busy = busy + part
        return busy

    @property
    def total_sum(self):
        """Busy fields plus idle, i.e. the denominator for percentages."""
        return self.util_total + self.idle

    @property
    def utils_perc(self):
        """Return [user, system, wait, steal, guest, total] as percent of total_sum.

        Raises ZeroDivisionError when total_sum is 0.
        """
        # https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/kernel/sched/cputime.c
        # see 'account_guest_time': guest time is accounted to the user
        # counters as well, so it is taken out of "user" here to avoid
        # reporting it twice.
        virtual = self.guest + self.guest_nice
        shares = (
            self.user + self.nice - virtual,  # user
            self.system + self.irq + self.softirq,  # system
            self.iowait,  # wait
            self.steal,  # steal
            virtual,  # guest
            self.util_total,  # total
        )
        denominator = float(self.total_sum)
        return [100.0 * float(share) / denominator for share in shares]
def cpu_info(elements, caster=int):
    """Build a CpuInfo from a raw row.

    elements: sequence whose first item is the name; the remaining items are
              counter values that are converted with `caster`.
    caster:   conversion applied to every counter value (default: int).

    Rows with fewer than eleven items are padded with 0 so that counters
    that are absent from the input simply read as zero.
    """
    # A list comprehension instead of `list + map(...)`: on Python 3 map()
    # returns an iterator and cannot be concatenated to a list.
    entries = [elements[0]] + [caster(element) for element in elements[1:]]
    entries.extend([0] * (11 - len(entries)))
    return CpuInfo(*entries)
def util_counter(stats, this_time):
    """Return a CpuInfo holding the change of every counter since the last call.

    For each numeric field of `stats` the previously stored value is read
    from the item state (0 if nothing is stored yet), the difference is
    recorded and the current value is stored together with `this_time`.
    """
    deltas = []
    position = 0
    for current in stats[1:]:
        position += 1
        state_key = "cpu.util.%d" % position
        # The state holds (timestamp, value); only the value is needed here
        previous = get_item_state(state_key, (0, 0))[1]
        deltas.append(current - previous)
        set_item_state(state_key, (this_time, current))

    return cpu_info([stats.name] + deltas)
# Normalize the name of a cpu core so that the perfdata-template
# recognizes it.
def cpu_util_core_name(orig, core_index):
    """Return the metric name "cpu_core_util_<n>" for a core.

    <n> is the run of digits at the very end of `orig` if there is one;
    otherwise the caller-supplied `core_index` is used as a fallback for
    cores with odd names.
    """
    trailing_digits = regex(r"\d+$").search(orig)
    suffix = core_index if trailing_digits is None else trailing_digits.group(0)
    return "cpu_core_util_%s" % suffix
def check_cpu_util(util, params, this_time=None, cores=None, perf_max=100):
    """Generator yielding the check results for a total CPU utilization value.

    util:      total utilization that is checked and written to perfdata
    params:    None, a legacy (warn, crit) tuple, or a dict; the keys read
               here are "util" (fallback "levels"), "average",
               "core_util_time_total", "core_util_graph", "core_util_time"
               and "levels_single"
    this_time: timestamp used for averaging and duration tracking;
               defaults to time.time()
    cores:     optional iterable of (core name, utilization) pairs
    perf_max:  maximum written into the perfdata entries
    """
    # Convert legacy param style to new dict style
    if isinstance(params, tuple):
        params = {"util": params}
    elif params is None:
        params = {}

    if this_time is None:
        this_time = time.time()

    levels = params.get("util")
    if levels is None:  # legacy rules before 1.6
        levels = params.get("levels")

    # warn/crit are only needed for perfdata; anything but a tuple has none
    if isinstance(levels, tuple):
        warn, crit = levels
    else:
        warn = crit = None
    perfdata = [("util", util, warn, crit, 0, perf_max)]

    # With averaging the averaged value is checked instead of the raw one
    if "average" in params:
        minutes = params["average"]
        checked_value = get_average("cpu_utilization.avg", this_time, util, minutes)
        perfdata.append(("util_average", checked_value, warn, crit, 0, perf_max))
        metric = "util_average"
        label = "%dmin average" % minutes
    else:
        checked_value = util
        metric = "util"
        label = "Total CPU"

    state, infotext, extraperf = check_levels(
        checked_value,
        metric,
        levels,
        human_readable_func=get_percent_human_readable,
        infoname=label)

    perfdata.extend(extraperf[1:])  # reference curve for predictive levels
    yield state, infotext, perfdata

    if "core_util_time_total" in params:
        threshold, warn_total, crit_total = params["core_util_time_total"]
        yield cpu_util_time(this_time, "total", util, threshold, warn_total, crit_total)

    per_core_keys = ("core_util_graph", "core_util_time", "levels_single")
    if cores and any(key in params for key in per_core_keys):
        for core_index, (core, total_perc) in enumerate(cores):
            for core_result in util_perfdata(core, total_perc, core_index, this_time, params):
                yield core_result
def check_cpu_util_unix(values, params, cores=None, values_counter=True):
    """Generator yielding user/system/wait (and steal/guest) results plus the total.

    values:         CpuInfo-like object for the whole system
    params:         dict (or None); "iowait" and "steal" are read here, the
                    rest is handed to check_cpu_util
    cores:          optional iterable of per-core CpuInfo-like objects
    values_counter: True if `values` holds counters that must be turned into
                    differences first; False if `values` already holds the
                    numbers to report

    Raises MKCounterWrapped in counter mode when no ticks have passed since
    the previous check.
    """
    if params is None:
        params = {}

    this_time = time.time()
    if values_counter:
        diff_values = util_counter(values, this_time)
        sum_jiffies = diff_values.total_sum
        if sum_jiffies == 0:
            raise MKCounterWrapped("Too short time difference since last check")
        (user_perc, system_perc, wait_perc, steal_perc, guest_perc,
         util_total_perc) = diff_values.utils_perc
    else:
        # No counter differences are available in this mode. Every value
        # that is used further down must still be bound, otherwise a
        # nonzero steal/guest value or a core list ends in UnboundLocalError.
        sum_jiffies = None
        user_perc = values.user
        system_perc = values.system
        wait_perc = values.iowait
        steal_perc = values.steal
        guest_perc = values.guest
        util_total_perc = values.util_total

    yield check_levels(
        user_perc, 'user', None, human_readable_func=get_percent_human_readable, infoname="User")
    yield check_levels(
        system_perc,
        'system',
        None,
        human_readable_func=get_percent_human_readable,
        infoname="System")
    yield check_levels(
        wait_perc,
        'wait',
        params.get('iowait'),
        human_readable_func=get_percent_human_readable,
        infoname="Wait")

    # Compute values used in virtualized environments (Xen, etc.)
    # Only do this for counters that have counted at least one tick
    # since the system boot. This avoids silly output in systems
    # where these counters are not being used
    if values.steal:
        yield check_levels(
            steal_perc,
            "steal",
            params.get('steal'),
            human_readable_func=get_percent_human_readable,
            infoname="Steal")

    if values.guest:
        yield check_levels(
            guest_perc,
            'guest',
            None,
            human_readable_func=get_percent_human_readable,
            infoname="Guest")

    summary_cores = []
    # Per-core percentages are derived from the tick difference of the whole
    # system, which only exists in counter mode.
    # NOTE(review): without counters the per-core summary is skipped rather
    # than guessed - confirm whether callers need per-core data in that mode.
    if cores and sum_jiffies:
        for core in cores:
            state_key = "cpu.util.%s.total" % core.name
            prev_total = get_item_state(state_key, 0)
            util_total = core.util_total
            total_diff = util_total - prev_total
            set_item_state(state_key, util_total)
            total_perc = (100.0 * total_diff / sum_jiffies) * len(cores)
            summary_cores.append((core.name, total_perc))

    for check_result in check_cpu_util(
            util_total_perc, params, this_time, summary_cores, perf_max=None):
        yield check_result
def util_perfdata(core, total_perc, core_index, this_time, params):
    """Generator yielding the per-core results selected by `params`.

    "core_util_graph" adds a perfdata-only result for the core,
    "core_util_time" adds the high-load duration result, and the levels in
    "levels_single" (if any) are applied to the core's utilization; that
    last result is only yielded when its state is not OK.
    """
    if "core_util_graph" in params:
        metric_name = cpu_util_core_name(core, core_index)
        yield 0, "", [(metric_name, total_perc)]

    if "core_util_time" in params:
        threshold, warn, crit = params["core_util_time"]
        yield cpu_util_time(this_time, core, total_perc, threshold, warn, crit)

    single_levels = params.get('levels_single')
    state, infotext, _unused_perf = check_levels(
        total_perc,
        "core_%s" % core,  # Not used in perfdata
        single_levels,
        human_readable_func=get_percent_human_readable,
        infoname="Core %s" % core)
    if state:
        yield (state, infotext, [])
def check_cpu_util_linux_container(_no_item, params, info):
    """Check the CPU utilization of a container relative to its node.

    info: rows of the form [name, value, ...]. The "cpu" row is summed up
          into the node's total ticks; every other row contributes
          int(row[1]) under its name. "user", "system" and "num_cpus" are
          read afterwards.

    Returns the result generator of check_cpu_util. Raises MKCounterWrapped
    when the node's tick rate is zero.
    """
    if not params:
        params = {}

    ticks = {}
    for line in info:
        if line[0] == "cpu":
            ticks["total"] = sum(map(int, line[1:]))
        else:
            ticks[line[0]] = int(line[1])

    this_time = time.time()
    node_delta = get_rate("node_total", this_time, ticks["total"])
    container_delta = get_rate("container", this_time, ticks["user"] + ticks["system"])

    # Only the node rate is a divisor. A container rate of zero simply means
    # the container was idle and must be reported as 0% instead of being
    # treated like a too short check interval.
    if not node_delta:
        raise MKCounterWrapped("Too short time difference since last check")

    cpu_usage = (container_delta / node_delta) * ticks["num_cpus"] * 100.0

    # Hand over the timestamp used for the rates so that averaging and
    # duration tracking work on the same point in time.
    return check_cpu_util(cpu_usage, params, this_time)
274 # .--helper--------------------------------------------------------------.
275 # | _ _ |
276 # | | |__ ___| |_ __ ___ _ __ |
277 # | | '_ \ / _ \ | '_ \ / _ \ '__| |
278 # | | | | | __/ | |_) | __/ | |
279 # | |_| |_|\___|_| .__/ \___|_| |
280 # | |_| |
281 # '----------------------------------------------------------------------'
def cpu_util_time(this_time, core, perc, threshold, warn_core, crit_core):
    """Return (state, infotext) for how long `core` has been above `threshold`.

    The time at which the high load was first seen is kept in the item
    state. While `perc` stays above `threshold` the elapsed time is checked
    against (warn_core, crit_core); as soon as it is not above the threshold
    the stored time is cleared. An OK result is always (0, "").
    """
    state_key = "cpu.util.core.high.%s" % core

    if not perc > threshold:
        clear_item_state(state_key)
        return 0, ""

    first_seen = get_item_state(state_key, 0)
    state, infotext, _unused_perf = check_levels(
        this_time - first_seen,
        "%s_is_under_high_load_for" % core,  # Not used
        (warn_core, crit_core),
        human_readable_func=get_age_human_readable,
        infoname="%s is under high load for" % core)

    if first_seen == 0:
        # High load seen for the first time: start the clock, nothing to report
        set_item_state(state_key, this_time)
        return 0, ""
    if state:
        return state, infotext
    return 0, ""