2 # -*- encoding: utf-8; py-indent-offset: 4 -*-
3 # +------------------------------------------------------------------+
4 # | ____ _ _ __ __ _ __ |
5 # | / ___| |__ ___ ___| | __ | \/ | |/ / |
6 # | | | | '_ \ / _ \/ __| |/ / | |\/| | ' / |
7 # | | |___| | | | __/ (__| < | | | | . \ |
8 # | \____|_| |_|\___|\___|_|\_\___|_| |_|_|\_\ |
10 # | Copyright Mathias Kettner 2014 mk@mathias-kettner.de |
11 # +------------------------------------------------------------------+
13 # This file is part of Check_MK.
14 # The official homepage is at http://mathias-kettner.de/check_mk.
16 # check_mk is free software; you can redistribute it and/or modify it
17 # under the terms of the GNU General Public License as published by
18 # the Free Software Foundation in version 2. check_mk is distributed
19 # in the hope that it will be useful, but WITHOUT ANY WARRANTY; with-
20 # out even the implied warranty of MERCHANTABILITY or FITNESS FOR A
21 # PARTICULAR PURPOSE. See the GNU General Public License for more de-
22 # tails. You should have received a copy of the GNU General Public
23 # License along with GNU Make; see the file COPYING. If not, write
24 # to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
25 # Boston, MA 02110-1301 USA.
# Common file for all (modern) checks that check CPU utilization (not load!)

# Example for check parameters:
# 1. Variant: tuple (warn, crit). This is the legacy style.
# 2. Variant: dictionary with the following optional keys:
#      "util"    : .... --> compatible with check_levels(), optional
#      "average" : 15   --> compute average for 15 minutes, optional
# This one can handle user, system and wait. values is a list of:
# - 0 - name: name of core
# - 1 - user: normal processes executing in user mode
# - 2 - nice: niced processes executing in user mode
# - 3 - system: processes executing in kernel mode
# - 4 - idle: twiddling thumbs
# - 5 - iowait: waiting for I/O to complete
# - 6 - irq: servicing interrupts
# - 7 - softirq: servicing softirqs
# - 8 - steal: involuntary wait
# - 9 - guest: time spent in guest OS, also counted in 1 (user)
# -10 - guest_nice: time spent in niced guest OS, also counted in 2 (nice)
52 collections
.namedtuple("CPU_utilization",
53 ('name', 'user', 'nice', 'system', 'idle', 'iowait', 'irq',
54 'softirq', 'steal', 'guest', 'guest_nice'))):
59 return self
.user
+ self
.nice
+ self
.system
+ self
.iowait
+ self
.irq
+ self
.softirq
+ self
.steal
63 return self
.util_total
+ self
.idle
67 # https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/kernel/sched/cputime.c
68 # see 'account_guest_time'
69 # if task_nice(p) <= 0:
70 # cpustat[CPUTIME_USER] += cputime;
71 # cpustat[CPUTIME_GUEST] += cputime;
72 guest
= self
.guest
+ self
.guest_nice
73 user
= self
.user
+ self
.nice
- guest
75 system
= self
.system
+ self
.irq
+ self
.softirq
78 total_sum
= self
.total_sum
81 100.0 * float(x
) / float(total_sum
)
82 for x
in [user
, system
, wait
, steal
, guest
, self
.util_total
]
def cpu_info(elements, caster=int):
    """Build a CpuInfo record from one row of CPU counters.

    elements: sequence whose first item is the core name and whose remaining
        items are the counter values, in CpuInfo field order.
    caster: callable applied to every counter value (default: int).

    Rows with fewer than 11 entries (name included) are padded with zeros.
    Longer rows are handed to CpuInfo unchanged, not truncated.
    """
    # A list comprehension instead of "list + map(...)": on Python 3 map()
    # returns an iterator, which cannot be concatenated to a list.
    entries = [elements[0]] + [caster(value) for value in elements[1:]]
    # [0] * n is empty for n <= 0, so full-length rows are left untouched.
    entries.extend([0] * (11 - len(entries)))
    return CpuInfo(*entries)
def util_counter(stats, this_time):
    """Return the counter increments since the previous call as a CpuInfo.

    stats: record with a ``name`` attribute; its items after the first are
        the current counter values.
    this_time: timestamp stored alongside every counter value.

    Each counter is remembered under the item state key "cpu.util.<n>" as a
    (this_time, value) pair. get_item_state() is called with (0, 0) as its
    second argument -- presumably the default returned when nothing has been
    stored yet, in which case the difference equals the current value.
    """
    # Compute jiffy differences of all relevant counters
    diff_values = []
    for position, current in enumerate(stats[1:], start=1):
        countername = "cpu.util.%d" % position
        previous = get_item_state(countername, (0, 0))[1]
        diff_values.append(current - previous)
        set_item_state(countername, (this_time, current))

    return cpu_info([stats.name] + diff_values)
def cpu_util_core_name(orig, core_index):
    """Normalize a CPU core name so that the perfdata template recognizes it.

    orig: name of the core as reported by the data source.
    core_index: number used when orig does not end in digits; the caller
        has to pass a different index for every core.

    Returns "cpu_core_util_<n>", where <n> is the run of digits at the end
    of orig or, failing that, core_index.
    """
    match = regex(r"\d+$").search(orig)
    if match is not None:
        num = match.group(0)
    else:
        # fallback: if the cores have odd names, use the consecutive
        # index supplied by the caller
        num = core_index
    return "cpu_core_util_%s" % num
121 def check_cpu_util(util
, params
, this_time
=None, cores
=None, perf_max
=100):
122 # Convert legacy param style to new dict style
125 elif isinstance(params
, tuple):
126 params
= {"util": params
}
128 if this_time
is None:
129 this_time
= time
.time()
131 levels
= params
.get("util")
132 if levels
is None: # legacy rules before 1.6
133 levels
= params
.get("levels")
135 warn
, crit
= levels
if isinstance(levels
, tuple) else (None, None) # only for perfdata
136 perfdata
= [("util", util
, warn
, crit
, 0, perf_max
)]
139 if "average" in params
:
140 util_avg
= get_average("cpu_utilization.avg", this_time
, util
, params
["average"])
141 perfdata
.append(("util_average", util_avg
, warn
, crit
, 0, perf_max
))
142 state
, infotext
, extraperf
= check_levels(
146 human_readable_func
=get_percent_human_readable
,
147 infoname
="%dmin average" % params
["average"])
149 state
, infotext
, extraperf
= check_levels(
153 human_readable_func
=get_percent_human_readable
,
154 infoname
="Total CPU")
156 perfdata
+= extraperf
[1:] # reference curve for predictive levels
157 yield state
, infotext
, perfdata
159 if "core_util_time_total" in params
:
160 threshold
, warn
, crit
= params
["core_util_time_total"]
161 yield cpu_util_time(this_time
, "total", util
, threshold
, warn
, crit
)
163 if cores
and any([x
in params
for x
in ["core_util_graph", "core_util_time", "levels_single"]]):
164 for core_index
, (core
, total_perc
) in enumerate(cores
):
165 for perfdata
in util_perfdata(core
, total_perc
, core_index
, this_time
, params
):
169 def check_cpu_util_unix(values
, params
, cores
=None, values_counter
=True):
170 this_time
= time
.time()
172 diff_values
= util_counter(values
, this_time
)
173 sum_jiffies
= diff_values
.total_sum
175 raise MKCounterWrapped("Too short time difference since last check")
176 user_perc
, system_perc
, wait_perc
, steal_perc
, guest_perc
, util_total_perc
= diff_values
.utils_perc
178 user_perc
= values
.user
179 system_perc
= values
.system
180 wait_perc
= values
.iowait
181 util_total_perc
= values
.util_total
184 user_perc
, 'user', None, human_readable_func
=get_percent_human_readable
, infoname
="User")
189 human_readable_func
=get_percent_human_readable
,
194 params
.get('iowait'),
195 human_readable_func
=get_percent_human_readable
,
198 # Compute values used in virtualized environments (Xen, etc.)
199 # Only do this for counters that have counted at least one tick
200 # since the system boot. This avoids silly output in systems
201 # where these counters are not being used
207 human_readable_func
=get_percent_human_readable
,
215 human_readable_func
=get_percent_human_readable
,
221 prev_total
= get_item_state("cpu.util.%s.total" % core
.name
, 0)
222 util_total
= core
.util_total
223 total_diff
= util_total
- prev_total
224 set_item_state("cpu.util.%s.total" % core
.name
, util_total
)
225 total_perc
= (100.0 * total_diff
/ sum_jiffies
) * len(cores
)
226 summary_cores
.append((core
.name
, total_perc
))
228 for check_result
in check_cpu_util(
229 util_total_perc
, params
, this_time
, summary_cores
, perf_max
=None):
233 def util_perfdata(core
, total_perc
, core_index
, this_time
, params
):
234 if "core_util_graph" in params
:
235 yield 0, "", [(cpu_util_core_name(core
, core_index
), total_perc
)]
237 if "core_util_time" in params
:
238 threshold
, warn
, crit
= params
["core_util_time"]
239 yield cpu_util_time(this_time
, core
, total_perc
, threshold
, warn
, crit
)
241 state
, infotext
, _
= check_levels(
243 "core_%s" % core
, # Not used in perfdata
244 params
.get('levels_single'),
245 human_readable_func
=get_percent_human_readable
,
246 infoname
="Core %s" % core
)
248 yield state
, infotext
, []
251 def check_cpu_util_linux_container(_no_item
, params
, info
):
258 ticks
["total"] = sum(map(int, line
[1:]))
260 ticks
[line
[0]] = int(line
[1])
262 this_time
= time
.time()
263 node_delta
= get_rate("node_total", this_time
, ticks
["total"])
264 container_delta
= get_rate("container", this_time
, ticks
["user"] + ticks
["system"])
266 if not node_delta
or not container_delta
:
267 raise MKCounterWrapped("Too short time difference since last check")
269 cpu_usage
= (container_delta
/ node_delta
) * ticks
["num_cpus"] * 100.0
271 return check_cpu_util(cpu_usage
, params
)
274 # .--helper--------------------------------------------------------------.
276 # | | |__ ___| |_ __ ___ _ __ |
277 # | | '_ \ / _ \ | '_ \ / _ \ '__| |
278 # | | | | | __/ | |_) | __/ | |
279 # | |_| |_|\___|_| .__/ \___|_| |
281 # '----------------------------------------------------------------------'
284 def cpu_util_time(this_time
, core
, perc
, threshold
, warn_core
, crit_core
):
285 core_state_name
= "cpu.util.core.high.%s" % core
287 timestamp
= get_item_state(core_state_name
, 0)
288 high_load_duration
= (this_time
- timestamp
)
289 state
, infotext
, _
= check_levels(
291 "%s_is_under_high_load_for" % core
, # Not used
292 (warn_core
, crit_core
),
293 human_readable_func
=get_age_human_readable
,
294 infoname
="%s is under high load for" % core
)
296 set_item_state(core_state_name
, this_time
)
298 return state
, infotext
301 clear_item_state(core_state_name
)