checks/aws_rds

   1 #!/usr/bin/python
   2 # -*- encoding: utf-8; py-indent-offset: 4 -*-
   3 # +------------------------------------------------------------------+
   4 # |             ____ _               _        __  __ _  __           |
   5 # |            / ___| |__   ___  ___| | __   |  \/  | |/ /           |
   6 # |           | |   | '_ \ / _ \/ __| |/ /   | |\/| | ' /            |
   7 # |           | |___| | | |  __/ (__|   <    | |  | | . \            |
   8 # |            \____|_| |_|\___|\___|_|\_\___|_|  |_|_|\_\           |
   9 # |                                                                  |
  10 # | Copyright Mathias Kettner 2019             mk@mathias-kettner.de |
  11 # +------------------------------------------------------------------+
  12 #
  13 # This file is part of Check_MK.
  14 # The official homepage is at http://mathias-kettner.de/check_mk.
  15 #
  16 # check_mk is free software;  you can redistribute it and/or modify it
  17 # under the  terms of the  GNU General Public License  as published by
  18 # the Free Software Foundation in version 2.  check_mk is  distributed
  19 # in the hope that it will be useful, but WITHOUT ANY WARRANTY;  with-
  20 # out even the implied warranty of  MERCHANTABILITY  or  FITNESS FOR A
  21 # PARTICULAR PURPOSE. See the  GNU General Public License for more de-
  22 # tails. You should have  received  a copy of the  GNU  General Public
  23 # License along with GNU Make; see the file  COPYING.  If  not,  write
  24 # to the Free Software Foundation, Inc., 51 Franklin St,  Fifth Floor,
  25 # Boston, MA 02110-1301 USA.
  26
  27
  28 def parse_aws_rds(info):
  29     parsed = {}
  30     for metrics in _extract_aws_metrics_by_labels(
  31         [
  32             "CPUUtilization",
  33             "CPUCreditUsage",
  34             "CPUCreditBalance",
  35             "DatabaseConnections",
  36             "FailedSQLServerAgentJobsCount",
  37             "BinLogDiskUsage",
  38             "OldestReplicationSlotLag",
  39             "ReplicaLag",
  40             "ReplicationSlotDiskUsage",
  41             "TransactionLogsDiskUsage",
  42             "TransactionLogsGeneration",
  43             "NetworkReceiveThroughput",
  44             "NetworkTransmitThroughput",
  45             "DiskQueueDepth",
  46             "WriteIOPS",
  47             "WriteLatency",
  48             "WriteThroughput",
  49             "ReadIOPS",
  50             "ReadLatency",
  51             "ReadThroughput",
  52             "BurstBalance",
  53             #"FreeableMemory",
  54             #"SwapUsage",
  55             #"FreeStorageSpace",
  56             #"MaximumUsedTransactionIDs",
  57             "AllocatedStorage",
  58         ],
  59             parse_aws(info),
  60             extra_keys=['DBInstanceIdentifier']).itervalues():
  61
  62         for key, factor in [
  63             ('AllocatedStorage', 1.074e+9),
  64             ('TransactionLogsDiskUsage', 1024**3),
  65             ('ReplicationSlotDiskUsage', 1024**3),
  66             ('OldestReplicationSlotLag', 1024**3),
  67         ]:
  68             try:
  69                 metrics[key] *= factor
  70             except KeyError:
  71                 pass
  72         parsed.setdefault(metrics['DBInstanceIdentifier'], metrics)
  73     return parsed
  74
  75
  76 #   .--CPU utilization-----------------------------------------------------.
  77 #   |    ____ ____  _   _         _   _ _ _          _   _                 |
  78 #   |   / ___|  _ \| | | |  _   _| |_(_) (_)______ _| |_(_) ___  _ __      |
  79 #   |  | |   | |_) | | | | | | | | __| | | |_  / _` | __| |/ _ \| '_ \     |
  80 #   |  | |___|  __/| |_| | | |_| | |_| | | |/ / (_| | |_| | (_) | | | |    |
  81 #   |   \____|_|    \___/   \__,_|\__|_|_|_/___\__,_|\__|_|\___/|_| |_|    |
  82 #   |                                                                      |
  83 #   '----------------------------------------------------------------------'
  84
  85 factory_settings['aws_rds_cpu_util'] = {
  86     'levels': (80.0, 90.0),
  87 }
  88
  89
  90 @get_parsed_item_data
  91 def check_aws_rds(item, params, metrics):
  92     return check_cpu_util(metrics['CPUUtilization'], params, time.time())
  93
  94
  95 check_info['aws_rds'] = {
  96     'parse_function': parse_aws_rds,
  97     'inventory_function': lambda p:\
  98         inventory_aws_generic(p, ['CPUUtilization']),
  99     'check_function': check_aws_rds,
 100     'service_description': 'AWS/RDS %s CPU Utilization',
 101     'includes': ['cpu_util.include', 'aws.include'],
 102     'group': 'cpu_utilization_multiitem',
 103     'default_levels_variable': 'aws_rds_cpu_util',
 104     'has_perfdata': True,
 105 }
 106
 107 #.
 108 #   .--CPU credits---------------------------------------------------------.
 109 #   |           ____ ____  _   _                     _ _ _                 |
 110 #   |          / ___|  _ \| | | |   ___ _ __ ___  __| (_) |_ ___           |
 111 #   |         | |   | |_) | | | |  / __| '__/ _ \/ _` | | __/ __|          |
 112 #   |         | |___|  __/| |_| | | (__| | |  __/ (_| | | |_\__ \          |
 113 #   |          \____|_|    \___/   \___|_|  \___|\__,_|_|\__|___/          |
 114 #   |                                                                      |
 115 #   '----------------------------------------------------------------------'
 116
 117 # CPU credit balance:
 118 # For standard T2 instances with bursting, a burst can continue only as long as
 119 # there are available CPU credits, so it’s important to monitor your instance’s
 120 # balance. Credits are earned any time the instance is running below its baseline
 121 # CPU performance level. The initial balance, accrual rate, and maximum possible
 122 # balance are all dependent on the instance level.
 123 #
 124 # CPU credit usage:
 125 # One CPU credit is equivalent to one minute of 100 percent CPU utilization (or
 126 # two minutes at 50 percent, etc.). Whenever an instance requires CPU performance
 127 # above that instance type’s baseline, it will burst, consuming CPU credits until
 128 # the demand lessens or the credit balance runs out. Keeping an eye on your
 129 # instances’ credit usage will help you identify whether you might need to switch
 130 # to an instance type that is optimized for CPU-intensive workloads. Or, you can
 131 # create an alert for when your credit balance drops below a threshold while CPU
 132 # usage remains above baseline.
 133
 134
 135 @get_parsed_item_data
 136 def check_aws_rds_cpu_credits(item, params, metrics):
 137     yield 0, "Usage: %.2f" % metrics['CPUCreditUsage']
 138     warn, crit = params.get("balance_levels_lower", (None, None))
 139     yield check_levels(
 140         metrics['CPUCreditBalance'],
 141         "aws_cpu_credit_balance", (None, None, warn, crit),
 142         human_readable_func=lambda x: "%.2f" % x,
 143         infoname='Balance')
 144
 145     burst_balance = metrics.get('BurstBalance')
 146     if burst_balance is not None:
 147         warn, crit = params.get("burst_balance_levels_lower", (None, None))
 148         yield check_levels(
 149             metrics['BurstBalance'],
 150             "aws_burst_balance", (None, None, warn, crit),
 151             human_readable_func=get_percent_human_readable,
 152             infoname='Balance')
 153
 154
 155 check_info['aws_rds.cpu_credits'] = {
 156     'inventory_function': lambda p:\
 157         inventory_aws_generic(p, ['CPUCreditUsage', 'CPUCreditBalance']),
 158     'check_function': check_aws_rds_cpu_credits,
 159     'service_description': 'AWS/RDS %s CPU Credits',
 160     'includes': ['aws.include'],
 161     'group': 'aws_rds_cpu_credits',
 162     'has_perfdata': True,
 163 }
 164
 165 #.
 166 #   .--network IO----------------------------------------------------------.
 167 #   |                     _                      _      ___ ___            |
 168 #   |          _ __   ___| |___      _____  _ __| | __ |_ _/ _ \           |
 169 #   |         | '_ \ / _ \ __\ \ /\ / / _ \| '__| |/ /  | | | | |          |
 170 #   |         | | | |  __/ |_ \ V  V / (_) | |  |   <   | | |_| |          |
 171 #   |         |_| |_|\___|\__| \_/\_/ \___/|_|  |_|\_\ |___\___/           |
 172 #   |                                                                      |
 173 #   '----------------------------------------------------------------------'
 174
 175
 176 @get_parsed_item_data
 177 def check_aws_rds_network_io(item, params, metrics):
 178     interfaces = [[
 179         "0",
 180         item,
 181         "1",
 182         "",
 183         "1",
 184         metrics['NetworkReceiveThroughput'],
 185         "",
 186         "",
 187         "",
 188         "",
 189         "",
 190         metrics['NetworkTransmitThroughput'],
 191         "",
 192         "",
 193         "",
 194         "",
 195         "",
 196         "",
 197         metrics.get('DBInstanceIdentifier', item),
 198         "",
 199     ]]
 200     return check_if_common_single(item, params, interfaces)
 201
 202
 203 check_info['aws_rds.network_io'] = {
 204     'inventory_function': lambda p:\
 205         inventory_aws_generic(p, ['NetworkReceiveThroughput', 'NetworkTransmitThroughput']),
 206     'check_function': check_aws_rds_network_io,
 207     'service_description': 'AWS/RDS %s Network IO',
 208     'includes': ['aws.include', "if.include"],
 209     'default_levels_variable': "if_default_levels",
 210     'group': 'if',
 211     'has_perfdata': True,
 212 }
 213
 214 #.
 215 #   .--bin log usage-------------------------------------------------------.
 216 #   |     _     _         _                                                |
 217 #   |    | |__ (_)_ __   | | ___   __ _   _   _ ___  __ _  __ _  ___       |
 218 #   |    | '_ \| | '_ \  | |/ _ \ / _` | | | | / __|/ _` |/ _` |/ _ \      |
 219 #   |    | |_) | | | | | | | (_) | (_| | | |_| \__ \ (_| | (_| |  __/      |
 220 #   |    |_.__/|_|_| |_| |_|\___/ \__, |  \__,_|___/\__,_|\__, |\___|      |
 221 #   |                             |___/                   |___/            |
 222 #   '----------------------------------------------------------------------'
 223
 224
 225 @get_parsed_item_data
 226 def check_aws_rds_bin_log_usage(item, params, metrics):
 227     bin_log_usage = metrics['BinLogDiskUsage']
 228     yield 0, get_bytes_human_readable(bin_log_usage)
 229
 230     try:
 231         usage = 100.0 * bin_log_usage / metrics['AllocatedStorage']
 232     except (KeyError, ZeroDivisionError):
 233         yield 1, 'Cannot calculate usage'
 234     else:
 235         yield check_levels(
 236             usage,
 237             "aws_rds_bin_log_disk_usage",
 238             params.get('levels', (None, None)),
 239             human_readable_func=get_percent_human_readable)
 240
 241
 242 check_info['aws_rds.bin_log_usage'] = {
 243     'inventory_function': lambda p:\
 244         inventory_aws_generic(p, ['BinLogDiskUsage', 'AllocatedStorage']),
 245     'check_function': check_aws_rds_bin_log_usage,
 246     'service_description': 'AWS/RDS %s Binary Log Usage',
 247     'includes': ['aws.include'],
 248     'has_perfdata': True,
 249     'group': 'aws_rds_disk_usage',
 250 }
 251
 252 #.
 253 #   .--transaction logs usage----------------------------------------------.
 254 #   |        _                                  _   _                      |
 255 #   |       | |_ _ __ __ _ _ __  ___  __ _  ___| |_(_) ___  _ __           |
 256 #   |       | __| '__/ _` | '_ \/ __|/ _` |/ __| __| |/ _ \| '_ \          |
 257 #   |       | |_| | | (_| | | | \__ \ (_| | (__| |_| | (_) | | | |         |
 258 #   |        \__|_|  \__,_|_| |_|___/\__,_|\___|\__|_|\___/|_| |_|         |
 259 #   |                                                                      |
 260 #   |           _                                                          |
 261 #   |          | | ___   __ _ ___   _   _ ___  __ _  __ _  ___             |
 262 #   |          | |/ _ \ / _` / __| | | | / __|/ _` |/ _` |/ _ \            |
 263 #   |          | | (_) | (_| \__ \ | |_| \__ \ (_| | (_| |  __/            |
 264 #   |          |_|\___/ \__, |___/  \__,_|___/\__,_|\__, |\___|            |
 265 #   |                   |___/                       |___/                  |
 266 #   '----------------------------------------------------------------------'
 267
 268
 269 @get_parsed_item_data
 270 def check_aws_rds_transaction_logs_usage(item, params, metrics):
 271     transaction_logs_space = metrics['TransactionLogsDiskUsage']
 272     yield 0, get_bytes_human_readable(transaction_logs_space)
 273
 274     try:
 275         usage = 100.0 * transaction_logs_space / metrics['AllocatedStorage']
 276     except (KeyError, ZeroDivisionError):
 277         yield 1, 'Cannot calculate usage'
 278     else:
 279         yield check_levels(
 280             usage,
 281             "aws_rds_transaction_logs_disk_usage",
 282             params.get('levels', (None, None)),
 283             human_readable_func=get_percent_human_readable)
 284
 285     generation = metrics.get('TransactionLogsGeneration')
 286     if generation:
 287         yield 0, 'Size of transaction logs: %s/s' % generation
 288
 289
 290 check_info['aws_rds.transaction_logs_usage'] = {
 291     'inventory_function': lambda p:\
 292         inventory_aws_generic(p, ['TransactionLogsDiskUsage', 'AllocatedStorage']),
 293     'check_function': check_aws_rds_transaction_logs_usage,
 294     'service_description': 'AWS/RDS %s Transaction Logs Usage',
 295     'includes': ['aws.include'],
 296     'has_perfdata': True,
 297     'group': 'aws_rds_disk_usage',
 298 }
 299
 300 #.
 301 #   .--replication slot usage----------------------------------------------.
 302 #   |                 _ _           _   _                   _       _      |
 303 #   |  _ __ ___ _ __ | (_) ___ __ _| |_(_) ___  _ __    ___| | ___ | |_    |
 304 #   | | '__/ _ \ '_ \| | |/ __/ _` | __| |/ _ \| '_ \  / __| |/ _ \| __|   |
 305 #   | | | |  __/ |_) | | | (_| (_| | |_| | (_) | | | | \__ \ | (_) | |_    |
 306 #   | |_|  \___| .__/|_|_|\___\__,_|\__|_|\___/|_| |_| |___/_|\___/ \__|   |
 307 #   |          |_|                                                         |
 308 #   |                                                                      |
 309 #   |                     _   _ ___  __ _  __ _  ___                       |
 310 #   |                    | | | / __|/ _` |/ _` |/ _ \                      |
 311 #   |                    | |_| \__ \ (_| | (_| |  __/                      |
 312 #   |                     \__,_|___/\__,_|\__, |\___|                      |
 313 #   |                                     |___/                            |
 314 #   '----------------------------------------------------------------------'
 315
 316
 317 @get_parsed_item_data
 318 def check_aws_rds_replication_slot_usage(item, params, metrics):
 319     replication_slot_space = metrics['ReplicationSlotDiskUsage']
 320     yield 0, get_bytes_human_readable(replication_slot_space)
 321
 322     try:
 323         usage = 100.0 * replication_slot_space / metrics['AllocatedStorage']
 324     except (KeyError, ZeroDivisionError):
 325         yield 1, 'Cannot calculate usage'
 326     else:
 327         yield check_levels(
 328             usage,
 329             "aws_rds_replication_slot_disk_usage",
 330             params.get('levels', (None, None)),
 331             human_readable_func=get_percent_human_readable)
 332
 333
 334 check_info['aws_rds.replication_slot_usage'] = {
 335     'inventory_function': lambda p:\
 336         inventory_aws_generic(p, ['ReplicationSlotDiskUsage', 'AllocatedStorage']),
 337     'check_function': check_aws_rds_replication_slot_usage,
 338     'service_description': 'AWS/RDS %s Replication Slot Usage',
 339     'includes': ['aws.include'],
 340     'has_perfdata': True,
 341     'group': 'aws_rds_disk_usage',
 342 }
 343
 344 #.
 345 #   .--disk IO-------------------------------------------------------------.
 346 #   |                         _ _     _      ___ ___                       |
 347 #   |                      __| (_)___| | __ |_ _/ _ \                      |
 348 #   |                     / _` | / __| |/ /  | | | | |                     |
 349 #   |                    | (_| | \__ \   <   | | |_| |                     |
 350 #   |                     \__,_|_|___/_|\_\ |___\___/                      |
 351 #   |                                                                      |
 352 #   '----------------------------------------------------------------------'
 353
 354
 355 def check_aws_rds_disk_io(item, params, parsed):
 356     now = time.time()
 357     disks = {}
 358     for disk_name, metrics in parsed.iteritems():
 359         disks.setdefault(
 360             disk_name, {
 361                 "read_ios": get_rate("aws_rds_disk_io_read_ios.%s" % item, now,
 362                                      metrics['ReadIOPS']),
 363                 "write_ios": get_rate("aws_rds_disk_io_write_ios.%s" % item, now,
 364                                       metrics['WriteIOPS']),
 365                 "read_throughput": get_rate("aws_rds_disk_io_read_throughput.%s" % item, now,
 366                                             metrics['ReadThroughput']),
 367                 "write_throughput": get_rate("aws_rds_disk_io_write_throughput.%s" % item, now,
 368                                              metrics['WriteThroughput']),
 369                 "read_latency": metrics['ReadLatency'] * 1000.0,
 370                 "write_latency": metrics['WriteLatency'] * 1000.0,
 371             })
 372     return check_diskstat_dict(item, params, disks)
 373
 374
 375 check_info['aws_rds.disk_io'] = {
 376     'inventory_function': lambda p:\
 377         inventory_aws_generic(p, ['DiskQueueDepth', 'ReadIOPS', 'ReadLatency', 'ReadThroughput', 'WriteIOPS', 'WriteLatency', 'WriteThroughput']),
 378     'check_function': check_aws_rds_disk_io,
 379     'service_description': 'AWS/RDS %s Disk IO',
 380     'includes': ['aws.include', 'diskstat.include'],
 381     'group': 'diskstat',
 382     'has_perfdata': True,
 383 }
 384
 385 #.
 386 #   .--connections---------------------------------------------------------.
 387 #   |                                        _   _                         |
 388 #   |         ___ ___  _ __  _ __   ___  ___| |_(_) ___  _ __  ___         |
 389 #   |        / __/ _ \| '_ \| '_ \ / _ \/ __| __| |/ _ \| '_ \/ __|        |
 390 #   |       | (_| (_) | | | | | | |  __/ (__| |_| | (_) | | | \__ \        |
 391 #   |        \___\___/|_| |_|_| |_|\___|\___|\__|_|\___/|_| |_|___/        |
 392 #   |                                                                      |
 393 #   '----------------------------------------------------------------------'
 394
 395
 396 @get_parsed_item_data
 397 def check_aws_rds_connections(item, params, metrics):
 398     yield check_levels(
 399         metrics['DatabaseConnections'],
 400         "aws_rds_connections",
 401         params.get('levels', (None, None)),
 402         infoname="In use")
 403
 404
 405 check_info['aws_rds.connections'] = {
 406     'inventory_function': lambda p:\
 407         inventory_aws_generic(p, ['DatabaseConnections']),
 408     'check_function': check_aws_rds_connections,
 409     'service_description': 'AWS/RDS %s Connections',
 410     'includes': ['aws.include'],
 411     'has_perfdata': True,
 412     'group': 'aws_rds_connections',
 413 }
 414
 415 #.
 416 #   .--agent jobs----------------------------------------------------------.
 417 #   |                                 _       _       _                    |
 418 #   |           __ _  __ _  ___ _ __ | |_    (_) ___ | |__  ___            |
 419 #   |          / _` |/ _` |/ _ \ '_ \| __|   | |/ _ \| '_ \/ __|           |
 420 #   |         | (_| | (_| |  __/ | | | |_    | | (_) | |_) \__ \           |
 421 #   |          \__,_|\__, |\___|_| |_|\__|  _/ |\___/|_.__/|___/           |
 422 #   |                |___/                 |__/                            |
 423 #   '----------------------------------------------------------------------'
 424
 425
 426 @get_parsed_item_data
 427 def check_aws_rds_agent_jobs(item, params, metrics):
 428     failed_agent_jobs = metrics['FailedSQLServerAgentJobsCount']
 429     if failed_agent_jobs > 0:
 430         state = 1
 431     else:
 432         state = 0
 433     yield state, "Failed jobs during the last minute: %s" % failed_agent_jobs
 434
 435
 436 check_info['aws_rds.agent_jobs'] = {
 437     'inventory_function': lambda p:\
 438         inventory_aws_generic(p, ['FailedSQLServerAgentJobsCount']),
 439     'check_function': check_aws_rds_agent_jobs,
 440     'service_description': 'AWS/RDS %s SQL Server Agent Jobs',
 441     'includes': ['aws.include'],
 442 }
 443
 444 #.
 445 #   .--replica lag---------------------------------------------------------.
 446 #   |                           _ _             _                          |
 447 #   |            _ __ ___ _ __ | (_) ___ __ _  | | __ _  __ _              |
 448 #   |           | '__/ _ \ '_ \| | |/ __/ _` | | |/ _` |/ _` |             |
 449 #   |           | | |  __/ |_) | | | (_| (_| | | | (_| | (_| |             |
 450 #   |           |_|  \___| .__/|_|_|\___\__,_| |_|\__,_|\__, |             |
 451 #   |                    |_|                            |___/              |
 452 #   '----------------------------------------------------------------------'
 453
 454
 455 @get_parsed_item_data
 456 def check_aws_rds_replica_lag(item, params, metrics):
 457     yield check_levels(
 458         metrics['ReplicaLag'],
 459         "aws_rds_replica_lag",
 460         params.get('lag_levels', (None, None)),
 461         human_readable_func=get_age_human_readable,
 462         infoname="Lag")
 463
 464     oldest_replica_lag_space = metrics.get('OldestReplicationSlotLag')
 465     if oldest_replica_lag_space is not None:
 466         yield check_levels(
 467             oldest_replica_lag_space,
 468             "aws_rds_oldest_replication_slot_lag",
 469             params.get('slot_levels', (None, None)),
 470             human_readable_func=get_bytes_human_readable,
 471             infoname="Oldest replication slot lag")
 472
 473
 474 check_info['aws_rds.replica_lag'] = {
 475     'inventory_function': lambda p:\
 476         inventory_aws_generic(p, ['ReplicaLag']),
 477     'check_function': check_aws_rds_replica_lag,
 478     'service_description': 'AWS/RDS %s Replica Lag',
 479     'includes': ['aws.include'],
 480     'has_perfdata': True,
 481     'group': 'aws_rds_replica_lag',
 482 }