2 # -*- encoding: utf-8; py-indent-offset: 4 -*-
3 # +------------------------------------------------------------------+
4 # | ____ _ _ __ __ _ __ |
5 # | / ___| |__ ___ ___| | __ | \/ | |/ / |
6 # | | | | '_ \ / _ \/ __| |/ / | |\/| | ' / |
7 # | | |___| | | | __/ (__| < | | | | . \ |
8 # | \____|_| |_|\___|\___|_|\_\___|_| |_|_|\_\ |
10 # | Copyright Mathias Kettner 2019 mk@mathias-kettner.de |
11 # +------------------------------------------------------------------+
13 # This file is part of Check_MK.
14 # The official homepage is at http://mathias-kettner.de/check_mk.
16 # check_mk is free software; you can redistribute it and/or modify it
17 # under the terms of the GNU General Public License as published by
18 # the Free Software Foundation in version 2. check_mk is distributed
19 # in the hope that it will be useful, but WITHOUT ANY WARRANTY; with-
20 # out even the implied warranty of MERCHANTABILITY or FITNESS FOR A
21 # PARTICULAR PURPOSE. See the GNU General Public License for more de-
22 # tails. You should have received a copy of the GNU General Public
23 # License along with GNU Make; see the file COPYING. If not, write
24 # to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
25 # Boston, MA 02110-1301 USA.
28 def parse_aws_rds(info
):
30 for metrics
in _extract_aws_metrics_by_labels(
35 "DatabaseConnections",
36 "FailedSQLServerAgentJobsCount",
38 "OldestReplicationSlotLag",
40 "ReplicationSlotDiskUsage",
41 "TransactionLogsDiskUsage",
42 "TransactionLogsGeneration",
43 "NetworkReceiveThroughput",
44 "NetworkTransmitThroughput",
56 #"MaximumUsedTransactionIDs",
60 extra_keys
=['DBInstanceIdentifier']).itervalues():
63 ('AllocatedStorage', 1.074e+9),
64 ('TransactionLogsDiskUsage', 1024**3),
65 ('ReplicationSlotDiskUsage', 1024**3),
66 ('OldestReplicationSlotLag', 1024**3),
69 metrics
[key
] *= factor
72 parsed
.setdefault(metrics
['DBInstanceIdentifier'], metrics
)
76 # .--CPU utilization-----------------------------------------------------.
77 # | ____ ____ _ _ _ _ _ _ _ _ |
78 # | / ___| _ \| | | | _ _| |_(_) (_)______ _| |_(_) ___ _ __ |
79 # | | | | |_) | | | | | | | | __| | | |_ / _` | __| |/ _ \| '_ \ |
80 # | | |___| __/| |_| | | |_| | |_| | | |/ / (_| | |_| | (_) | | | | |
81 # | \____|_| \___/ \__,_|\__|_|_|_/___\__,_|\__|_|\___/|_| |_| |
83 # '----------------------------------------------------------------------'
# Default parameters of the 'aws_rds' CPU utilization check; looked up by name
# through 'default_levels_variable' in check_info['aws_rds'].
# NOTE(review): presumably (warn, crit) thresholds in percent - confirm
# against check_cpu_util in cpu_util.include.
factory_settings['aws_rds_cpu_util'] = {
    'levels': (80.0, 90.0),
}
@get_parsed_item_data
def check_aws_rds(item, params, metrics):
    """Check the CPU utilization of one RDS instance.

    Hands the instance's 'CPUUtilization' value, the check parameters and the
    current time to ``check_cpu_util`` and returns its result unchanged.

    ``metrics`` is expected to be the metric dictionary of the instance named
    by ``item``. The parse function keys its result by DBInstanceIdentifier,
    so the check is wrapped in ``get_parsed_item_data`` like every other
    per-item check in this file.
    NOTE(review): presumably the decorator selects ``parsed[item]`` - confirm
    against its definition.

    Raises KeyError if the instance reports no 'CPUUtilization' metric.
    """
    return check_cpu_util(metrics['CPUUtilization'], params, time.time())
# Registration of the main check 'aws_rds' (CPU utilization). This is the only
# entry in this file that declares the parse function.
# NOTE(review): the inventory lambda forwards the parsed section and the
# required metric names to inventory_aws_generic; presumably only instances
# providing all listed metrics are discovered - confirm in aws.include.
check_info['aws_rds'] = {
    'parse_function': parse_aws_rds,
    'inventory_function': lambda p: inventory_aws_generic(p, ['CPUUtilization']),
    'check_function': check_aws_rds,
    'service_description': 'AWS/RDS %s CPU Utilization',
    'includes': ['cpu_util.include', 'aws.include'],
    'group': 'cpu_utilization_multiitem',
    'default_levels_variable': 'aws_rds_cpu_util',
    'has_perfdata': True,
}
108 # .--CPU credits---------------------------------------------------------.
109 # | ____ ____ _ _ _ _ _ |
110 # | / ___| _ \| | | | ___ _ __ ___ __| (_) |_ ___ |
111 # | | | | |_) | | | | / __| '__/ _ \/ _` | | __/ __| |
112 # | | |___| __/| |_| | | (__| | | __/ (_| | | |_\__ \ |
113 # | \____|_| \___/ \___|_| \___|\__,_|_|\__|___/ |
115 # '----------------------------------------------------------------------'
117 # CPU credit balance:
118 # For standard T2 instances with bursting, a burst can continue only as long as
119 # there are available CPU credits, so it’s important to monitor your instance’s
120 # balance. Credits are earned any time the instance is running below its baseline
121 # CPU performance level. The initial balance, accrual rate, and maximum possible
122 # balance are all dependent on the instance level.
125 # One CPU credit is equivalent to one minute of 100 percent CPU utilization (or
126 # two minutes at 50 percent, etc.). Whenever an instance requires CPU performance
127 # above that instance type’s baseline, it will burst, consuming CPU credits until
128 # the demand lessens or the credit balance runs out. Keeping an eye on your
129 # instances’ credit usage will help you identify whether you might need to switch
130 # to an instance type that is optimized for CPU-intensive workloads. Or, you can
131 # create an alert for when your credit balance drops below a threshold while CPU
132 # usage remains above baseline.
135 @get_parsed_item_data
136 def check_aws_rds_cpu_credits(item
, params
, metrics
):
137 yield 0, "Usage: %.2f" % metrics
['CPUCreditUsage']
138 warn
, crit
= params
.get("balance_levels_lower", (None, None))
140 metrics
['CPUCreditBalance'],
141 "aws_cpu_credit_balance", (None, None, warn
, crit
),
142 human_readable_func
=lambda x
: "%.2f" % x
,
145 burst_balance
= metrics
.get('BurstBalance')
146 if burst_balance
is not None:
147 warn
, crit
= params
.get("burst_balance_levels_lower", (None, None))
149 metrics
['BurstBalance'],
150 "aws_burst_balance", (None, None, warn
, crit
),
151 human_readable_func
=get_percent_human_readable
,
# Registration of the sub-check 'aws_rds.cpu_credits'.
# NOTE(review): the inventory lambda forwards the parsed section and the two
# credit metric names to inventory_aws_generic; presumably only instances
# providing both are discovered - confirm in aws.include.
check_info['aws_rds.cpu_credits'] = {
    'inventory_function': lambda p: inventory_aws_generic(
        p, ['CPUCreditUsage', 'CPUCreditBalance']),
    'check_function': check_aws_rds_cpu_credits,
    'service_description': 'AWS/RDS %s CPU Credits',
    'includes': ['aws.include'],
    'group': 'aws_rds_cpu_credits',
    'has_perfdata': True,
}
166 # .--network IO----------------------------------------------------------.
168 # | _ __ ___| |___ _____ _ __| | __ |_ _/ _ \ |
169 # | | '_ \ / _ \ __\ \ /\ / / _ \| '__| |/ / | | | | | |
170 # | | | | | __/ |_ \ V V / (_) | | | < | | |_| | |
171 # | |_| |_|\___|\__| \_/\_/ \___/|_| |_|\_\ |___\___/ |
173 # '----------------------------------------------------------------------'
176 @get_parsed_item_data
177 def check_aws_rds_network_io(item
, params
, metrics
):
184 metrics
['NetworkReceiveThroughput'],
190 metrics
['NetworkTransmitThroughput'],
197 metrics
.get('DBInstanceIdentifier', item
),
200 return check_if_common_single(item
, params
, interfaces
)
203 check_info
['aws_rds.network_io'] = {
204 'inventory_function': lambda p
:\
205 inventory_aws_generic(p
, ['NetworkReceiveThroughput', 'NetworkTransmitThroughput']),
206 'check_function': check_aws_rds_network_io
,
207 'service_description': 'AWS/RDS %s Network IO',
208 'includes': ['aws.include', "if.include"],
209 'default_levels_variable': "if_default_levels",
211 'has_perfdata': True,
215 # .--bin log usage-------------------------------------------------------.
217 # | | |__ (_)_ __ | | ___ __ _ _ _ ___ __ _ __ _ ___ |
218 # | | '_ \| | '_ \ | |/ _ \ / _` | | | | / __|/ _` |/ _` |/ _ \ |
219 # | | |_) | | | | | | | (_) | (_| | | |_| \__ \ (_| | (_| | __/ |
220 # | |_.__/|_|_| |_| |_|\___/ \__, | \__,_|___/\__,_|\__, |\___| |
222 # '----------------------------------------------------------------------'
225 @get_parsed_item_data
226 def check_aws_rds_bin_log_usage(item
, params
, metrics
):
227 bin_log_usage
= metrics
['BinLogDiskUsage']
228 yield 0, get_bytes_human_readable(bin_log_usage
)
231 usage
= 100.0 * bin_log_usage
/ metrics
['AllocatedStorage']
232 except (KeyError, ZeroDivisionError):
233 yield 1, 'Cannot calculate usage'
237 "aws_rds_bin_log_disk_usage",
238 params
.get('levels', (None, None)),
239 human_readable_func
=get_percent_human_readable
)
# Registration of the sub-check 'aws_rds.bin_log_usage'. It uses the same rule
# group ('aws_rds_disk_usage') as the transaction log and replication slot
# usage checks.
# NOTE(review): the inventory lambda forwards the parsed section and the
# required metric names to inventory_aws_generic; presumably only instances
# providing both are discovered - confirm in aws.include.
check_info['aws_rds.bin_log_usage'] = {
    'inventory_function': lambda p: inventory_aws_generic(
        p, ['BinLogDiskUsage', 'AllocatedStorage']),
    'check_function': check_aws_rds_bin_log_usage,
    'service_description': 'AWS/RDS %s Binary Log Usage',
    'includes': ['aws.include'],
    'has_perfdata': True,
    'group': 'aws_rds_disk_usage',
}
253 # .--transaction logs usage----------------------------------------------.
255 # | | |_ _ __ __ _ _ __ ___ __ _ ___| |_(_) ___ _ __ |
256 # | | __| '__/ _` | '_ \/ __|/ _` |/ __| __| |/ _ \| '_ \ |
257 # | | |_| | | (_| | | | \__ \ (_| | (__| |_| | (_) | | | | |
258 # | \__|_| \__,_|_| |_|___/\__,_|\___|\__|_|\___/|_| |_| |
261 # | | | ___ __ _ ___ _ _ ___ __ _ __ _ ___ |
262 # | | |/ _ \ / _` / __| | | | / __|/ _` |/ _` |/ _ \ |
263 # | | | (_) | (_| \__ \ | |_| \__ \ (_| | (_| | __/ |
264 # | |_|\___/ \__, |___/ \__,_|___/\__,_|\__, |\___| |
266 # '----------------------------------------------------------------------'
269 @get_parsed_item_data
270 def check_aws_rds_transaction_logs_usage(item
, params
, metrics
):
271 transaction_logs_space
= metrics
['TransactionLogsDiskUsage']
272 yield 0, get_bytes_human_readable(transaction_logs_space
)
275 usage
= 100.0 * transaction_logs_space
/ metrics
['AllocatedStorage']
276 except (KeyError, ZeroDivisionError):
277 yield 1, 'Cannot calculate usage'
281 "aws_rds_transaction_logs_disk_usage",
282 params
.get('levels', (None, None)),
283 human_readable_func
=get_percent_human_readable
)
285 generation
= metrics
.get('TransactionLogsGeneration')
287 yield 0, 'Size of transaction logs: %s/s' % generation
# Registration of the sub-check 'aws_rds.transaction_logs_usage'. It uses the
# same rule group ('aws_rds_disk_usage') as the binary log and replication
# slot usage checks.
# NOTE(review): the inventory lambda forwards the parsed section and the
# required metric names to inventory_aws_generic; presumably only instances
# providing both are discovered - confirm in aws.include.
check_info['aws_rds.transaction_logs_usage'] = {
    'inventory_function': lambda p: inventory_aws_generic(
        p, ['TransactionLogsDiskUsage', 'AllocatedStorage']),
    'check_function': check_aws_rds_transaction_logs_usage,
    'service_description': 'AWS/RDS %s Transaction Logs Usage',
    'includes': ['aws.include'],
    'has_perfdata': True,
    'group': 'aws_rds_disk_usage',
}
301 # .--replication slot usage----------------------------------------------.
303 # | _ __ ___ _ __ | (_) ___ __ _| |_(_) ___ _ __ ___| | ___ | |_ |
304 # | | '__/ _ \ '_ \| | |/ __/ _` | __| |/ _ \| '_ \ / __| |/ _ \| __| |
305 # | | | | __/ |_) | | | (_| (_| | |_| | (_) | | | | \__ \ | (_) | |_ |
306 # | |_| \___| .__/|_|_|\___\__,_|\__|_|\___/|_| |_| |___/_|\___/ \__| |
309 # | _ _ ___ __ _ __ _ ___ |
310 # | | | | / __|/ _` |/ _` |/ _ \ |
311 # | | |_| \__ \ (_| | (_| | __/ |
312 # | \__,_|___/\__,_|\__, |\___| |
314 # '----------------------------------------------------------------------'
317 @get_parsed_item_data
318 def check_aws_rds_replication_slot_usage(item
, params
, metrics
):
319 replication_slot_space
= metrics
['ReplicationSlotDiskUsage']
320 yield 0, get_bytes_human_readable(replication_slot_space
)
323 usage
= 100.0 * replication_slot_space
/ metrics
['AllocatedStorage']
324 except (KeyError, ZeroDivisionError):
325 yield 1, 'Cannot calculate usage'
329 "aws_rds_replication_slot_disk_usage",
330 params
.get('levels', (None, None)),
331 human_readable_func
=get_percent_human_readable
)
# Registration of the sub-check 'aws_rds.replication_slot_usage'. It uses the
# same rule group ('aws_rds_disk_usage') as the binary log and transaction
# log usage checks.
# NOTE(review): the inventory lambda forwards the parsed section and the
# required metric names to inventory_aws_generic; presumably only instances
# providing both are discovered - confirm in aws.include.
check_info['aws_rds.replication_slot_usage'] = {
    'inventory_function': lambda p: inventory_aws_generic(
        p, ['ReplicationSlotDiskUsage', 'AllocatedStorage']),
    'check_function': check_aws_rds_replication_slot_usage,
    'service_description': 'AWS/RDS %s Replication Slot Usage',
    'includes': ['aws.include'],
    'has_perfdata': True,
    'group': 'aws_rds_disk_usage',
}
345 # .--disk IO-------------------------------------------------------------.
347 # | __| (_)___| | __ |_ _/ _ \ |
348 # | / _` | / __| |/ / | | | | | |
349 # | | (_| | \__ \ < | | |_| | |
350 # | \__,_|_|___/_|\_\ |___\___/ |
352 # '----------------------------------------------------------------------'
355 def check_aws_rds_disk_io(item
, params
, parsed
):
358 for disk_name
, metrics
in parsed
.iteritems():
361 "read_ios": get_rate("aws_rds_disk_io_read_ios.%s" % item
, now
,
362 metrics
['ReadIOPS']),
363 "write_ios": get_rate("aws_rds_disk_io_write_ios.%s" % item
, now
,
364 metrics
['WriteIOPS']),
365 "read_throughput": get_rate("aws_rds_disk_io_read_throughput.%s" % item
, now
,
366 metrics
['ReadThroughput']),
367 "write_throughput": get_rate("aws_rds_disk_io_write_throughput.%s" % item
, now
,
368 metrics
['WriteThroughput']),
369 "read_latency": metrics
['ReadLatency'] * 1000.0,
370 "write_latency": metrics
['WriteLatency'] * 1000.0,
372 return check_diskstat_dict(item
, params
, disks
)
375 check_info
['aws_rds.disk_io'] = {
376 'inventory_function': lambda p
:\
377 inventory_aws_generic(p
, ['DiskQueueDepth', 'ReadIOPS', 'ReadLatency', 'ReadThroughput', 'WriteIOPS', 'WriteLatency', 'WriteThroughput']),
378 'check_function': check_aws_rds_disk_io
,
379 'service_description': 'AWS/RDS %s Disk IO',
380 'includes': ['aws.include', 'diskstat.include'],
382 'has_perfdata': True,
386 # .--connections---------------------------------------------------------.
388 # | ___ ___ _ __ _ __ ___ ___| |_(_) ___ _ __ ___ |
389 # | / __/ _ \| '_ \| '_ \ / _ \/ __| __| |/ _ \| '_ \/ __| |
390 # | | (_| (_) | | | | | | | __/ (__| |_| | (_) | | | \__ \ |
391 # | \___\___/|_| |_|_| |_|\___|\___|\__|_|\___/|_| |_|___/ |
393 # '----------------------------------------------------------------------'
396 @get_parsed_item_data
397 def check_aws_rds_connections(item
, params
, metrics
):
399 metrics
['DatabaseConnections'],
400 "aws_rds_connections",
401 params
.get('levels', (None, None)),
# Registration of the sub-check 'aws_rds.connections'.
# NOTE(review): the inventory lambda forwards the parsed section and the
# required metric name to inventory_aws_generic; presumably only instances
# providing 'DatabaseConnections' are discovered - confirm in aws.include.
check_info['aws_rds.connections'] = {
    'inventory_function': lambda p: inventory_aws_generic(p, ['DatabaseConnections']),
    'check_function': check_aws_rds_connections,
    'service_description': 'AWS/RDS %s Connections',
    'includes': ['aws.include'],
    'has_perfdata': True,
    'group': 'aws_rds_connections',
}
416 # .--agent jobs----------------------------------------------------------.
418 # | __ _ __ _ ___ _ __ | |_ (_) ___ | |__ ___ |
419 # | / _` |/ _` |/ _ \ '_ \| __| | |/ _ \| '_ \/ __| |
420 # | | (_| | (_| | __/ | | | |_ | | (_) | |_) \__ \ |
421 # | \__,_|\__, |\___|_| |_|\__| _/ |\___/|_.__/|___/ |
423 # '----------------------------------------------------------------------'
426 @get_parsed_item_data
427 def check_aws_rds_agent_jobs(item
, params
, metrics
):
428 failed_agent_jobs
= metrics
['FailedSQLServerAgentJobsCount']
429 if failed_agent_jobs
> 0:
433 yield state
, "Failed jobs during the last minute: %s" % failed_agent_jobs
# Registration of the sub-check 'aws_rds.agent_jobs'. Unlike the other
# sub-checks in this file it declares neither 'has_perfdata' nor a rule
# 'group'.
# NOTE(review): the inventory lambda forwards the parsed section and the
# required metric name to inventory_aws_generic; presumably only instances
# providing 'FailedSQLServerAgentJobsCount' are discovered - confirm in
# aws.include.
check_info['aws_rds.agent_jobs'] = {
    'inventory_function': lambda p: inventory_aws_generic(
        p, ['FailedSQLServerAgentJobsCount']),
    'check_function': check_aws_rds_agent_jobs,
    'service_description': 'AWS/RDS %s SQL Server Agent Jobs',
    'includes': ['aws.include'],
}
445 # .--replica lag---------------------------------------------------------.
447 # | _ __ ___ _ __ | (_) ___ __ _ | | __ _ __ _ |
448 # | | '__/ _ \ '_ \| | |/ __/ _` | | |/ _` |/ _` | |
449 # | | | | __/ |_) | | | (_| (_| | | | (_| | (_| | |
450 # | |_| \___| .__/|_|_|\___\__,_| |_|\__,_|\__, | |
452 # '----------------------------------------------------------------------'
455 @get_parsed_item_data
456 def check_aws_rds_replica_lag(item
, params
, metrics
):
458 metrics
['ReplicaLag'],
459 "aws_rds_replica_lag",
460 params
.get('lag_levels', (None, None)),
461 human_readable_func
=get_age_human_readable
,
464 oldest_replica_lag_space
= metrics
.get('OldestReplicationSlotLag')
465 if oldest_replica_lag_space
is not None:
467 oldest_replica_lag_space
,
468 "aws_rds_oldest_replication_slot_lag",
469 params
.get('slot_levels', (None, None)),
470 human_readable_func
=get_bytes_human_readable
,
471 infoname
="Oldest replication slot lag")
474 check_info
['aws_rds.replica_lag'] = {
475 'inventory_function': lambda p
:\
476 inventory_aws_generic(p
, ['ReplicaLag']),
477 'check_function': check_aws_rds_replica_lag
,
478 'service_description': 'AWS/RDS %s Replica Lag',
479 'includes': ['aws.include'],
480 'has_perfdata': True,
481 'group': 'aws_rds_replica_lag',