Cleanup config.nodes_of
[check_mk.git] / checks / drbd
blobf1611320fb52289d04d52998447bd4b81cee2685
1 #!/usr/bin/python
2 # -*- encoding: utf-8; py-indent-offset: 4 -*-
3 # +------------------------------------------------------------------+
4 # | ____ _ _ __ __ _ __ |
5 # | / ___| |__ ___ ___| | __ | \/ | |/ / |
6 # | | | | '_ \ / _ \/ __| |/ / | |\/| | ' / |
7 # | | |___| | | | __/ (__| < | | | | . \ |
8 # | \____|_| |_|\___|\___|_|\_\___|_| |_|_|\_\ |
9 # | |
10 # | Copyright Mathias Kettner 2014 mk@mathias-kettner.de |
11 # +------------------------------------------------------------------+
13 # This file is part of Check_MK.
14 # The official homepage is at http://mathias-kettner.de/check_mk.
16 # check_mk is free software; you can redistribute it and/or modify it
17 # under the terms of the GNU General Public License as published by
18 # the Free Software Foundation in version 2. check_mk is distributed
19 # in the hope that it will be useful, but WITHOUT ANY WARRANTY; with-
20 # out even the implied warranty of MERCHANTABILITY or FITNESS FOR A
21 # PARTICULAR PURPOSE. See the GNU General Public License for more de-
22 # tails. You should have received a copy of the GNU General Public
23 # License along with GNU Make; see the file COPYING. If not, write
24 # to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
25 # Boston, MA 02110-1301 USA.
27 # Author: Lars Michelsen <lm@mathias-kettner.de>
29 # Example outputs from agent:
31 # While syncing:
32 # <<<drbd>>>
33 # version: 8.3.8 (api:88/proto:86-94)
34 # GIT-hash: d78846e52224fd00562f7c225bcc25b2d422321d build by cssint@erzc20, 2010-06-17 14:47:26
35 # 0: cs:SyncSource ro:Primary/Secondary ds:UpToDate/Inconsistent C r----
36 # ns:12031428 nr:0 dw:12031364 dr:1175992347 al:2179 bm:71877 lo:37 pe:0 ua:37 ap:0 ep:1 wo:b oos:301729988
37 # [=======>............] sync'ed: 42.4% (294656/510908)M delay_probe: 145637
38 # finish: 1:23:28 speed: 60,172 (51,448) K/sec
40 # Sync stalled:
41 # <<<drbd>>>
42 # b01srv05:~ # cat /proc/drbd
43 # version: 8.3.8 (api:88/proto:86-94)
44 # GIT-hash: d78846e52224fd00562f7c225bcc25b2d422321d build by cssint@erzc20, 2010-06-17 14:47:26
45 # 0: cs:SyncSource ro:Primary/Secondary ds:UpToDate/Inconsistent C r----
46 # ns:11545876 nr:0 dw:11545900 dr:954551211 al:1955 bm:58360 lo:0 pe:0 ua:0 ap:0 ep:1 wo:b oos:523171100
47 # [>....................] sync'ed: 0.1% (510908/510908)M delay_probe: 135599
48 # stalled
50 # Synced:
51 # <<<drbd>>>
52 # version: 8.3.8 (api:88/proto:86-94)
53 # GIT-hash: d78846e52224fd00562f7c225bcc25b2d422321d build by cssint@erzc20, 2010-06-17 14:47:26
54 # 0: cs:Connected ro:Primary/Secondary ds:UpToDate/UpToDate C r----
55 # ns:12227928 nr:0 dw:12227864 dr:1477722351 al:2300 bm:90294 lo:0 pe:0 ua:0 ap:0 ep:1 wo:b oos:0
57 # Description of the /proc/drbd output:
58 # http://www.drbd.org/users-guide/ch-admin.html#s-proc-drbd
60 # The information from /proc/drbd is grouped as follows (extracted from the doc above)
62 # General:
63 # cs (connection state). Status of the network connection. See the section called
64 # “Connection states” for details about the various connection states.
65 # Available States:
66 # StandAlone. No network configuration available. The resource has not yet been connected,
67 # or has been administratively disconnected (using drbdadm disconnect),
68 # or has dropped its connection due to failed authentication or split brain.
69 # Disconnecting. Temporary state during disconnection. The next state is StandAlone.
70 # Unconnected. Temporary state, prior to a connection attempt.
71 # Possible next states: WFConnection and WFReportParams.
72 # Timeout. Temporary state following a timeout in the communication with the peer. Next state: Unconnected.
73 # BrokenPipe. Temporary state after the connection to the peer was lost. Next state: Unconnected.
74 # NetworkFailure. Temporary state after the connection to the partner was lost. Next state: Unconnected.
75 # ProtocolError. Temporary state after the connection to the partner was lost. Next state: Unconnected.
76 # TearDown. Temporary state. The peer is closing the connection. Next state: Unconnected.
77 # WFConnection. This node is waiting until the peer node becomes visible on the network.
78 # WFReportParams. TCP connection has been established, this node waits for the first network packet from the peer.
79 # Connected. A DRBD connection has been established, data mirroring is now active. This is the normal state.
80 # StartingSyncS. Full synchronization, initiated by the administrator, is just starting.
81 # The next possible states are: SyncSource or PausedSyncS.
82 # StartingSyncT. Full synchronization, initiated by the administrator, is just starting. Next state: WFSyncUUID.
83 # WFBitMapS. Partial synchronization is just starting. Next possible states: SyncSource or PausedSyncS.
84 # WFBitMapT. Partial synchronization is just starting. Next possible state: WFSyncUUID.
85 # WFSyncUUID. Synchronization is about to begin. Next possible states: SyncTarget or PausedSyncT.
86 # SyncSource. Synchronization is currently running, with the local node being the source of synchronization.
87 # SyncTarget. Synchronization is currently running, with the local node being the target of synchronization.
88 # PausedSyncS. The local node is the source of an ongoing synchronization, but synchronization is currently paused.
89 # This may be due to a dependency on the completion of another synchronization process,
90 # or due to synchronization having been manually interrupted by drbdadm pause-sync.
91 # PausedSyncT. The local node is the target of an ongoing synchronization, but synchronization
92 # is currently paused. This may be due to a dependency on the completion of another
93 # synchronization process, or due to synchronization having been manually interrupted by drbdadm pause-sync.
94 # VerifyS. On-line device verification is currently running, with the local node being the source of verification.
95 # VerifyT. On-line device verification is currently running, with the local node being the target of verification.
97 # ro (roles). Roles of the nodes. The role of the local node is displayed first, followed by the role of the partner
98 # node shown after the slash. See the section called “Resource roles” for details about the possible resource roles.
99 # Available Roles:
100 # Primary. The resource is currently in the primary role, and may be read from and written to.
101 # This role only occurs on one of the two nodes, unless dual-primary mode is enabled.
102 # Secondary. The resource is currently in the secondary role. It normally receives updates
103 # from its peer (unless running in disconnected mode), but may neither be read from
104 # nor written to. This role may occur on one node or both nodes.
105 # Unknown. The resource's role is currently unknown. The local resource role never has this status.
106 # It is only displayed for the peer's resource role, and only in disconnected mode.
108 # ds (disk states). State of the hard disks. Prior to the slash the state of the local node is displayed,
109 # after the slash the state of the hard disk of the partner node is shown.
110 # See the section called “Disk states” for details about the various disk states.
111 # Disk States:
112 # Diskless. No local block device has been assigned to the DRBD driver. This may mean that the resource
113 # has never attached to its backing device, that it has been manually detached using drbdadm detach
114 # or that it automatically detached after a lower-level I/O error.
115 # Attaching. Transient state while reading meta data.
116 # Failed. Transient state following an I/O failure report by the local block device. Next state: Diskless.
117 # Negotiating. Transient state when an Attach is carried out on an already-connected DRBD device.
118 # Inconsistent. The data is inconsistent. This status occurs immediately upon creation of a new resource,
119 # on both nodes (before the initial full sync). Also, this status is found in one node
120 # (the synchronization target) during synchronization.
121 # Outdated. Resource data is consistent, but outdated.
122 # DUnknown. This state is used for the peer disk if no network connection is available.
123 # Consistent. Consistent data of a node without connection. When the connection
124 # is established, it is decided whether the data are UpToDate or Outdated.
125 # UpToDate. Consistent, up-to-date state of the data. This is the normal state.
127 # Network:
128 # ns (network send). Volume of net data sent to the partner via the network connection; in Kibyte.
129 # nr (network receive). Volume of net data received by the partner via the network connection; in Kibyte.
130 # Disk:
131 # dw (disk write). Net data written on local hard disk; in Kibyte.
132 # dr (disk read). Net data read from local hard disk; in Kibyte.
133 # Stats:
134 # al (activity log). Number of updates of the activity log area of the meta data.
135 # bm (bit map). Number of updates of the bitmap area of the meta data.
136 # lo (local count). Number of open requests to the local I/O sub-system issued by DRBD.
137 # pe (pending). Number of requests sent to the partner, but that have not yet been answered by the latter.
138 # ua (unacknowledged). Number of requests received by the partner via the network connection, but that have not yet been answered.
139 # ap (application pending). Number of block I/O requests forwarded to DRBD, but not yet answered by DRBD.
140 # ep (epochs). Number of epoch objects. Usually 1. Might increase under I/O load
141 # when using either the barrier or the none write ordering method. Since 8.2.7.
142 # wo (write order). Currently used write ordering method: b (barrier), f (flush), d (drain) or n (none). Since 8.2.7.
143 # oos (out of sync). Amount of storage currently out of sync; in Kibibytes. Since 8.2.6.
# Default thresholds for drbd checks.
# These are placeholders only: the net/disk/stats check functions in this
# file do not evaluate their params (see the FIXME notes there).
drbd_net_default_levels = (None,) * 2
drbd_disk_default_levels = (None,) * 2
drbd_stats_default_levels = (None,) * 9

# A device block starts with a token like "0:" (device number plus colon)
_drbd_block_start_match = re.compile('^[0-9]+:')

# Field names each sub check extracts from a device block. The connection
# state ("cs") is needed by all of them.
drbd_general_map = [
    'cs',
    'ro',
    'ds',
]
drbd_net_map = [
    'cs',
    'ns',
    'nr',
]
drbd_disk_map = [
    'cs',
    'dw',
    'dr',
]
drbd_stats_map = [
    'cs',
    'al',
    'bm',
    'lo',
    'pe',
    'ua',
    'ap',
    'ep',
    'wo',
    'oos',
]
# Monitoring state (0 = OK, 1 = WARN, 2 = CRIT) assigned to each known
# connection state ("cs" field) of a device.
drbd_cs_map = dict(
    StandAlone=1,
    Disconnecting=1,
    Unconnected=2,
    Timeout=2,
    BrokenPipe=2,
    NetworkFailure=2,
    ProtocolError=2,
    TearDown=2,
    WFConnection=2,
    WFReportParams=1,
    Connected=0,
    StartingSyncS=1,
    StartingSyncT=1,
    WFBitMapS=1,
    WFBitMapT=1,
    WFSyncUUID=1,
    SyncSource=1,
    SyncTarget=1,
    PausedSyncS=1,
    PausedSyncT=1,
    VerifyS=0,
    VerifyT=0,
    Ahead=1,
    Behind=1,
)
# Default monitoring state per "<role>_<disk state>" combination. Both roles
# share the same default for every disk state, so the table is generated
# from one list of disk states.
drbd_ds_map = dict(
    ("%s_%s" % (role, disk_state), state)
    for disk_state, state in [
        ("Diskless", 2),
        ("Attaching", 2),
        ("Failed", 2),
        ("Negotiating", 2),
        ("Inconsistent", 1),
        ("Outdated", 2),
        ("DUnknown", 2),
        ("Consistent", 2),
        ("UpToDate", 0),
    ]
    for role in ("primary", "secondary"))
def inventory_drbd(info, checktype):
    """Discover one service per configured DRBD device in the agent output.

    info:      agent section, one list of whitespace separated tokens per line
    checktype: 'drbd', 'drbd.net', 'drbd.disk' or 'drbd.stats'

    Returns a list of (item, params) tuples where item is "drbd" followed by
    the device number. For the 'drbd' check the params are a dict holding the
    roles and disk states seen during discovery; for the sub checks they are
    the name of the matching default levels variable. Devices that are
    unconfigured or report no connection state are skipped, as are devices of
    the 'drbd' check without role or disk state information. An unknown
    checktype yields no services.
    """
    default_levels = {
        'drbd.net': "drbd_net_default_levels",
        'drbd.disk': "drbd_disk_default_levels",
        'drbd.stats': "drbd_stats_default_levels",
    }

    inventory = []
    # The first two lines only contain drbd version information
    for line in info[2:]:
        # Rely on the truth value of the search result (match object or None).
        # Ordering it against an integer only worked by accident on Python 2
        # and raises a TypeError on Python 3.
        if not _drbd_block_start_match.search(line[0]):
            continue

        # "0:" -> "drbd0"
        item = 'drbd%s' % line[0][:-1]
        parsed = drbd_parse_block(drbd_extract_block(item, info), checktype)

        # Skip unconfigured drbd devices and malformed blocks without a
        # connection state (the check functions rely on 'cs' being present)
        if 'cs' not in parsed or parsed['cs'] == 'Unconfigured':
            continue

        if checktype == 'drbd':
            if 'ro' not in parsed or 'ds' not in parsed:
                continue
            levels = {
                "roles_inventory": parsed['ro'],
                "diskstates_inventory": parsed['ds'],
            }
        elif checktype in default_levels:
            levels = default_levels[checktype]
        else:
            # Unknown check type: there is nothing sensible to discover.
            # Previously this left "levels" unbound or stale.
            continue

        inventory.append((item, levels))
    return inventory
def drbd_parse_block(block, to_parse):
    """Extract the "key:value" fields a given check needs from a device block.

    block:    lines of one device, each a list of tokens
    to_parse: check type ('drbd', 'drbd.net', 'drbd.disk' or 'drbd.stats')
              selecting which field names are kept

    Returns a dict mapping field name to its value string. For the 'drbd'
    check the 'ro' and 'ds' values are split at "/" into lists (local side
    first, peer second). Tokens without a colon are ignored; for any other
    check type the result is empty.
    """
    wanted_by_check = {
        'drbd': drbd_general_map,
        'drbd.net': drbd_net_map,
        'drbd.disk': drbd_disk_map,
        'drbd.stats': drbd_stats_map,
    }
    # Only parse the requested information depending on the check
    # to be executed now
    wanted = wanted_by_check.get(to_parse, ())
    # Only the general check needs the local/peer pairs taken apart
    paired = ('ro', 'ds') if to_parse == 'drbd' else ()

    result = {}
    for tokens in block:
        for token in tokens:
            pieces = token.split(':')
            if len(pieces) < 2:
                continue
            key, value = pieces[0], pieces[1]
            if key not in wanted:
                continue
            if key in paired:
                result[key] = value.split('/')
            else:
                result[key] = value
    return result
def drbd_extract_block(item, info):
    """Return the agent lines that belong to the DRBD device `item`.

    item: device name in the form "drbd" + device number, e.g. "drbd0"
    info: agent section, one list of whitespace separated tokens per line

    The block starts at the line whose first token is the device number
    followed by one more character (the colon) and ends right before the next
    line that starts another device. Returns an empty list if the device is
    not present.
    """
    block = []
    in_block = False
    # Ignore the first two lines since they contain drbd version information
    for line in info[2:]:
        if "drbd" + line[0][:-1] == item:
            in_block = True
        elif in_block and _drbd_block_start_match.search(line[0]):
            # Another block starts. So the requested block is finished.
            # The search result is used via its truth value: comparing a
            # match object with an integer only worked by accident on
            # Python 2 and raises a TypeError on Python 3.
            break

        # Lines before the wanted block are skipped
        if in_block:
            block.append(line)

    return block
def drbd_get_block(item, info, checktype):
    """Locate the block of device `item` and parse it for `checktype`.

    Returns the dict produced by drbd_parse_block, or None when the agent
    output contains no lines for the device.
    """
    lines = drbd_extract_block(item, info)
    if not lines:
        return None
    return drbd_parse_block(lines, checktype)
def check_drbd_general(item, params, info):
    """Check connection state, roles and disk states of one DRBD device.

    item:   device name as built during discovery ("drbd" + device number)
    params: dict of check parameters; the keys read here are "roles",
            "roles_inventory", "diskstates" and "diskstates_inventory".
            A legacy tuple (roles, diskstates) is converted first.
    info:   agent section, one list of tokens per line

    Returns a (state, infotext) tuple. The state starts with the value of
    drbd_cs_map for the connection state and is raised by the role and disk
    state evaluation. (3, "Undefined state") is returned when the device is
    not found in the agent output.
    """
    parsed = drbd_get_block(item, info, 'drbd')

    # Legacy tuple parameters: convert to the dict format. Falsy entries
    # become None. Note that the disk states are only taken over when the
    # roles entry (params[0]) is truthy as well.
    if isinstance(params, tuple):
        params_conv = {}
        params_conv.update({"roles_inventory": params[0] and params[0] or None})
        params_conv.update(
            {"diskstates_inventory": (params[0] and params[1]) and params[1] or None})
        params = params_conv

    if not parsed is None:
        if parsed['cs'] == 'Unconfigured':
            return (2, 'The device is "Unconfigured"')
        elif not parsed['cs'] in drbd_cs_map:
            return (3, 'Undefined "connection state" in drbd output')

        # Weight of connection state is calculated by the drbd_cs_map.
        # The roles and disk states are calculated using the expected values
        state = drbd_cs_map[parsed['cs']]
        output = 'Connection State: %s' % parsed['cs']

        # Roles
        output += ', Roles: %s/%s' % tuple(parsed['ro'])
        # Lowercased "<local>_<peer>" form, compared with the rule entries below
        current_roles = "_".join(str(a).lower() for a in parsed["ro"])

        # 1) Explicit rule: "roles" holds (roles_entry, state) pairs; the
        #    first entry equal to the current roles decides the state.
        # NOTE(review): state_markers is not defined in this file - presumably
        # provided by the check framework; confirm.
        found_role_match = False
        if "roles" in params:
            roles = params.get("roles")
            if roles:
                for roles_entry, roles_state in roles:
                    if roles_entry == current_roles:
                        found_role_match = True
                        state = max(state, roles_state)
                        output += ' %s' % state_markers[roles_state]
                        break
            else:  # Ignore roles if set to None
                found_role_match = True

        # 2) No rule matched: fall back to the roles remembered at discovery.
        #    A deviation is CRIT; missing discovery data results in UNKNOWN.
        if not found_role_match:
            if "roles_inventory" in params:
                roles_inventory = params.get("roles_inventory")
                if roles_inventory and parsed["ro"] != roles_inventory:
                    state = max(2, state)
                    output += ' (Expected: %s/%s)' % tuple(params.get("roles_inventory"))
            else:
                state = max(3, state)
                output += ' (Check requires a new service discovery)'

        output += ', Diskstates: %s/%s' % tuple(parsed['ds'])
        # Do not evaluate diskstates. Either set by rule or through the
        # legacy configuration option None in the check parameters tuple
        # ("and" binds tighter than "or": either key being present with the
        # value None is sufficient)
        if "diskstates" in params and params["diskstates"] is None or \
           "diskstates_inventory" in params and params["diskstates_inventory"] is None:
            return (state, output)

        # "diskstates" holds ("<role>_<disk state>", state) pairs
        params_diskstates_dict = dict(params.get("diskstates", []))
        # A set, so identical texts for the local and the peer side are only
        # shown once
        diskstates_info = set()
        # Evaluate the local side (index 0) and the peer side (index 1)
        for ro, ds in [(parsed["ro"][0], parsed["ds"][0]), (parsed["ro"][1], parsed["ds"][1])]:
            diskstate = "%s_%s" % (ro.lower(), ds)
            params_diskstate = params_diskstates_dict.get(diskstate)

            if params_diskstate is not None:
                # State configured by rule, always mentioned in the output
                state = max(state, params_diskstate)
                diskstates_info.add('%s/%s is %s' % (ro, ds, state_markers[params_diskstate]))
            else:
                # Built-in default; combinations missing in drbd_ds_map count
                # as UNKNOWN (3). Only non-OK states are mentioned.
                default_state = drbd_ds_map.get(diskstate, 3)
                if default_state > 0:
                    diskstates_info.add('%s/%s is %s' % (ro, ds, state_markers[default_state]))
                state = max(state, drbd_ds_map.get(diskstate, 3))
        if diskstates_info:
            output += " (%s)" % ", ".join(diskstates_info)

        return (state, output)

    return (3, "Undefined state")
# Register the general status check (connection state, roles, disk states).
# NOTE(review): has_perfdata is True although check_drbd_general only returns
# (state, text) tuples without performance data - confirm this is intended.
check_info["drbd"] = {
    'inventory_function': lambda info: inventory_drbd(info, "drbd"),
    'check_function': check_drbd_general,
    'group': 'drbd',
    'has_perfdata': True,
    'service_description': 'DRBD %s status',
}
def drbd_get_rates(list_):
    """Turn counter readings into per-second rates.

    list_: sequence of (type_, name, item, value, uom) tuples. type_, name
           and item form the counter identifier passed to get_rate, value is
           the current counter reading and uom the unit text appended in the
           output.

    Returns (output, perfdata): output is the concatenation of
    " <name>/sec: <rate><uom>" for every entry, perfdata a list of
    (name, rate) tuples in input order.
    """
    timestamp = time.time()
    fragments = []
    perfdata = []
    for type_, name, item, value, uom in list_:
        counter_name = "%s.%s.%s" % (type_, name, item)
        per_second = get_rate(counter_name, timestamp, value)
        perfdata.append((name, per_second))
        fragments.append(' %s/sec: %s%s' % (name, per_second, uom))
    return (''.join(fragments), perfdata)
def check_drbd_net(item, params, info):
    """Report the network receive/send rates of one DRBD device.

    Returns (2, ...) for an unconfigured device, (3, "Undefined state") when
    the device is missing in the agent output and otherwise
    (0, infotext, perfdata) with the rates of the "nr" (in) and "ns" (out)
    counters. params is not evaluated.
    """
    parsed = drbd_get_block(item, info, 'drbd.net')
    if parsed is None:
        return (3, "Undefined state")

    if parsed['cs'] == 'Unconfigured':
        return (2, 'The device is "Unconfigured"')

    counters = [
        ('drbd.net', 'in', item, int(parsed['nr']), 'kb'),
        ('drbd.net', 'out', item, int(parsed['ns']), 'kb'),
    ]
    infotext, perfdata = drbd_get_rates(counters)
    # FIXME: Maybe handle thresholds in the future
    return (0, infotext, perfdata)
# Register the network throughput sub check (rates of the "nr"/"ns" counters)
check_info["drbd.net"] = {
    'inventory_function': lambda info: inventory_drbd(info, "drbd.net"),
    'check_function': check_drbd_net,
    'group': 'drbd.net',
    'has_perfdata': True,
    'service_description': 'DRBD %s net',
}
def check_drbd_disk(item, params, info):
    """Report the local disk write/read rates of one DRBD device.

    Returns (2, ...) for an unconfigured device, (3, "Undefined state") when
    the device is missing in the agent output and otherwise
    (0, infotext, perfdata) with the rates of the "dw" (write) and "dr"
    (read) counters. params is not evaluated.
    """
    parsed = drbd_get_block(item, info, 'drbd.disk')
    if parsed is None:
        return (3, "Undefined state")

    if parsed['cs'] == 'Unconfigured':
        return (2, 'The device is "Unconfigured"')

    counters = [
        ('drbd.disk', 'write', item, int(parsed['dw']), 'kb'),
        ('drbd.disk', 'read', item, int(parsed['dr']), 'kb'),
    ]
    infotext, perfdata = drbd_get_rates(counters)
    # FIXME: Maybe handle thresholds in the future
    return (0, infotext, perfdata)
# Register the disk throughput sub check (rates of the "dw"/"dr" counters)
check_info["drbd.disk"] = {
    'inventory_function': lambda info: inventory_drbd(info, "drbd.disk"),
    'check_function': check_drbd_disk,
    'group': 'drbd.disk',
    'has_perfdata': True,
    'service_description': 'DRBD %s disk',
}
def check_drbd_stats(item, params, info):
    """Report the statistic fields of one DRBD device.

    Returns (2, ...) for an unconfigured device, (3, "Undefined state") when
    the device is missing in the agent output and otherwise
    (0, infotext, perfdata). The infotext lists every field present in the
    agent output; perfdata contains an entry for each field whose value
    consists of digits only, with missing fields counted as 0. params is not
    evaluated.
    """
    parsed = drbd_get_block(item, info, 'drbd.stats')
    if parsed is None:
        return (3, "Undefined state")

    if parsed['cs'] == 'Unconfigured':
        return (2, 'The device is "Unconfigured"')

    labelled_fields = [
        ('al', 'activity log updates'),
        ('bm', 'bit map updates'),
        ('lo', 'local count requests'),
        ('pe', 'pending requests'),
        ('ua', 'unacknowledged requests'),
        ('ap', 'application pending requests'),
        ('ep', 'epoch objects'),
        ('wo', 'write order'),
        ('oos', 'kb out of sync'),
    ]

    texts = []
    perfdata = []
    for key, label in labelled_fields:
        if key in parsed:
            value = parsed[key]
            texts.append('%s: %s, ' % (label, value))
        else:
            value = '0'  # perfdata must always have same number of entries
        # Non-numeric values (such as the write order method) are text only
        if value.isdigit():
            perfdata.append((label.replace(" ", "_"), int(value)))

    return (0, ''.join(texts).rstrip(', '), perfdata)
454 check_info["drbd.stats"] = {
455 'inventory_function': lambda info: inventory_drbd(info, "drbd.stats"),
456 'check_function': check_drbd_stats,
457 'group': 'drbd.stats',
458 'has_perfdata': True,
459 'service_description': 'DRBD %s stats',