Cleanup config.nodes_of
[check_mk.git] / checks / smart
blob4a565891ee1f587dab6bcdd672f96fc101160619
1 #!/usr/bin/python
2 # -*- encoding: utf-8; py-indent-offset: 4 -*-
3 # +------------------------------------------------------------------+
4 # | ____ _ _ __ __ _ __ |
5 # | / ___| |__ ___ ___| | __ | \/ | |/ / |
6 # | | | | '_ \ / _ \/ __| |/ / | |\/| | ' / |
7 # | | |___| | | | __/ (__| < | | | | . \ |
8 # | \____|_| |_|\___|\___|_|\_\___|_| |_|_|\_\ |
9 # | |
10 # | Copyright Mathias Kettner 2014 mk@mathias-kettner.de |
11 # +------------------------------------------------------------------+
13 # This file is part of Check_MK.
14 # The official homepage is at http://mathias-kettner.de/check_mk.
16 # check_mk is free software; you can redistribute it and/or modify it
17 # under the terms of the GNU General Public License as published by
18 # the Free Software Foundation in version 2. check_mk is distributed
19 # in the hope that it will be useful, but WITHOUT ANY WARRANTY; with-
20 # out even the implied warranty of MERCHANTABILITY or FITNESS FOR A
21 # PARTICULAR PURPOSE. See the GNU General Public License for more de-
22 # tails. You should have received a copy of the GNU General Public
23 # License along with GNU Make; see the file COPYING. If not, write
24 # to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
25 # Boston, MA 02110-1301 USA.
27 # EXAMPLE DATA FROM: WDC SSC-D0128SC-2100
28 #<<<smart>>>
29 #/dev/sda ATA WDC_SSC-D0128SC- 1 Raw_Read_Error_Rate 0x000b 100 100 050 Pre-fail Always - 16777215
30 #/dev/sda ATA WDC_SSC-D0128SC- 3 Spin_Up_Time 0x0007 100 100 050 Pre-fail Always - 0
31 #/dev/sda ATA WDC_SSC-D0128SC- 5 Reallocated_Sector_Ct 0x0013 100 100 050 Pre-fail Always - 0
32 #/dev/sda ATA WDC_SSC-D0128SC- 7 Seek_Error_Rate 0x000b 100 100 050 Pre-fail Always - 0
33 #/dev/sda ATA WDC_SSC-D0128SC- 9 Power_On_Hours 0x0012 100 100 000 Old_age Always - 1408
34 #/dev/sda ATA WDC_SSC-D0128SC- 10 Spin_Retry_Count 0x0013 100 100 050 Pre-fail Always - 0
35 #/dev/sda ATA WDC_SSC-D0128SC- 12 Power_Cycle_Count 0x0012 100 100 000 Old_age Always - 523
36 #/dev/sda ATA WDC_SSC-D0128SC- 168 Unknown_Attribute 0x0012 100 100 000 Old_age Always - 1
37 #/dev/sda ATA WDC_SSC-D0128SC- 175 Program_Fail_Count_Chip 0x0003 100 100 010 Pre-fail Always - 0
38 #/dev/sda ATA WDC_SSC-D0128SC- 192 Power-Off_Retract_Count 0x0012 100 100 000 Old_age Always - 0
39 #/dev/sda ATA WDC_SSC-D0128SC- 194 Temperature_Celsius 0x0022 040 100 000 Old_age Always - 40 (Lifetime Min/Max 30/60)
40 #/dev/sda ATA WDC_SSC-D0128SC- 197 Current_Pending_Sector 0x0012 100 100 000 Old_age Always - 0
41 #/dev/sda ATA WDC_SSC-D0128SC- 240 Head_Flying_Hours 0x0013 100 100 050 Pre-fail Always - 0
42 #/dev/sda ATA WDC_SSC-D0128SC- 170 Unknown_Attribute 0x0003 100 100 010 Pre-fail Always - 1769478
43 #/dev/sda ATA WDC_SSC-D0128SC- 173 Unknown_Attribute 0x0012 100 100 000 Old_age Always - 4217788040605
# TODO: The smart check needs a complete rework: identify attributes by their
# IDs instead of by their changing descriptions. But be careful: there is no
# standard for either the IDs or the descriptions, so only rely on those that
# are commonly agreed upon.
# Default parameters for the smart.temp subcheck; the check registration
# refers to this entry by name through 'default_levels_variable'.
# NOTE(review): presumably (warn, crit) in degrees Celsius as interpreted by
# check_temperature -- confirm against temperature.include.
factory_settings["smart_temp_default_levels"] = {"levels": (35, 40)}
# Per-counter level pairs for the SMART statistics.
# NOTE(review): nothing in the visible part of this file reads this dict;
# the pairs look like (warn, crit) thresholds -- confirm before relying on it.
smart_stats_default_levels = {
    'realloc_events': (1, 1),
    'realloc_sectors': (1, 1),
    'spin_retries': (1, 1),
    'pending_retries': (1, 1),
    'pending_sectors': (1, 1),
    'cmd_timeouts': (5, 10),
    'e2e_errs': (1, 1),
    'uncorr_errs': (1, 1),
    'udma_crcs': (1, 1),
}
def parse_smart_raw_values(info):
    """Collect the raw SMART attribute values per disk.

    `info` is the agent section as a list of already split lines. Lines
    with fewer than 13 columns are ignored. Column 0 names the disk,
    column 4 the attribute and column 12 holds the raw value, which is
    converted with saveint().

    Returns a dict: disk name -> {attribute name: raw value}. Attributes
    reported as "Unknown_Attribute" are left out.
    """
    parsed = {}
    current_name = None
    current_attrs = None

    for row in info:
        if len(row) < 13:
            continue

        device = row[0]
        if device != current_name:
            # A different device name starts a fresh attribute dict, even
            # if that device has been seen earlier in the section.
            current_name = device
            current_attrs = parsed[device] = {}

        attribute = row[4]
        if attribute == "Unknown_Attribute":
            continue

        current_attrs[attribute] = saveint(row[12])

    return parsed
def parse_smart_normalized_values(info):
    """Collect the normalized SMART values and thresholds per disk.

    `info` is the agent section as a list of already split lines. Lines
    with fewer than 13 columns are ignored. Column 0 names the disk,
    column 4 the attribute, column 6 holds the normalized value and
    column 8 the threshold.

    Returns a dict: disk name -> {attribute name: (value, threshold)}.
    `threshold` is None when column 8 is not a number. Attributes reported
    as "Unknown_Attribute" are left out.

    Raises ValueError if the normalized value in column 6 is not numeric.
    """
    disks = {}
    disk_name = None
    disk = None

    for line in info:
        if len(line) < 13:
            continue

        if line[0] != disk_name:
            disk_name = line[0]
            disk = disks[disk_name] = {}

        field = line[4]
        if field == "Unknown_Attribute":
            continue

        value = int(line[6])
        # The columns are strings, so an isinstance(..., int) test can never
        # succeed. Convert and fall back to None for non-numeric thresholds.
        try:
            threshold = int(line[8])
        except ValueError:
            threshold = None
        disk[field] = value, threshold

    return disks
# SMART attribute names that make a disk eligible for the smart.stats
# service. Their raw values are stored as the discovery-time snapshot.
smart_stats_fields = [
    'Reallocated_Sector_Ct',
    'Spin_Retry_Count',
    'Reallocated_Event_Count',
    'Current_Pending_Sector',
    'Command_Timeout',
    'End-to-End_Error',
    'Reported_Uncorrect',
    'Uncorrectable_Error_Cnt',
    'UDMA_CRC_Error_Count',
    'CRC_Error_Count',
]
def inventory_smart_stats(info):
    """Discover one smart.stats service per disk with relevant counters.

    A disk is inventorized when it reports at least one attribute listed
    in smart_stats_fields. The service parameters are a snapshot of the
    raw values of exactly those attributes at discovery time.

    Returns a list of (disk name, snapshot dict) tuples.
    """
    found = []
    for name, attrs in parse_smart_raw_values(info).items():
        snapshot = dict((key, attrs[key])
                        for key in smart_stats_fields if key in attrs)
        # A non-empty snapshot means at least one interesting field exists.
        if snapshot:
            found.append((name, snapshot))
    return found
def check_smart_stats(item, params, info):
    """Compare the current SMART counters of a disk with the discovery snapshot.

    item   -- disk name as found in column 0 of the agent section
    params -- dict: attribute name -> raw value at the time of inventory
    info   -- agent section as a list of already split lines

    Returns (state, text, perfdata). The state is 3 with the text
    "Disk not found" when the disk is missing from `info`. It becomes 2 as
    soon as one counter has grown beyond its snapshot value. For
    Reallocated_Event_Count the normalized value is compared with its
    threshold instead of looking at the raw counter.
    """
    # params is a snapshot of all counters at the point of time of inventory
    disks = parse_smart_raw_values(info)
    if item not in disks:
        return 3, "Disk not found"
    disk = disks[item]
    normalized = parse_smart_normalized_values(info).get(item, {})

    state = 0
    infos = []
    perfdata = []

    for unit, field, descr in [
        (' hours', 'Power_On_Hours', 'Powered on'),
        ('', 'Power_Cycle_Count', 'Power cycles'),
        ('', 'Reallocated_Sector_Ct', 'Reallocated sectors'),
        ('', 'Reallocated_Event_Count', 'Reallocated events'),
        ('', 'Spin_Retry_Count', 'Spin retries'),
        ('', 'Current_Pending_Sector', 'Pending sectors'),
        ('', 'Command_Timeout', 'Command timeouts'),
        ('', 'End-to-End_Error', 'End-to-End errors'),
        ('', 'Reported_Uncorrect', 'Uncorrectable errors'),
        ('', 'Uncorrectable_Error_Cnt', 'Uncorrectable errors'),
        ('', 'UDMA_CRC_Error_Count', 'UDMA CRC errors'),
        ('', 'CRC_Error_Count', 'UDMA CRC errors'),
    ]:
        if field not in disk:
            continue

        value = disk[field]
        infos.append("%s: %d%s" % (descr, value, unit))
        perfdata.append((field, value))

        # Fields without a snapshot value are reported but never alerted on.
        if field not in params:
            continue
        ref_value = params[field]

        # For reallocated event counts we experienced too many reported
        # errors for disks which still seem to be OK. The raw value increased
        # by a small amount but the aggregated value remained at its
        # initial/ok state. So we use the aggregated value now. Only for this
        # field.
        if field == "Reallocated_Event_Count":
            norm_value, norm_threshold = normalized.get(field, (None, None))
            # Without a usable threshold there is nothing to compare against,
            # so the value is treated as OK instead of comparing with None.
            if norm_value is not None and norm_threshold is not None \
               and norm_value <= norm_threshold:
                state = 2
                infos[-1] += "(!!) (was %d during discovery; normalized value %d <= threshold %d)" % (
                    ref_value, norm_value, norm_threshold)
            else:
                infos[-1] += " (was %d during discovery; normalized value looks OK)" % ref_value

        elif value > ref_value:
            state = 2
            infos[-1] += "(!!) (was %d during discovery)" % ref_value

    return state, ", ".join(infos), perfdata
# Registration of the smart.stats subcheck.
check_info["smart.stats"] = {
    'check_function': check_smart_stats,
    'inventory_function': inventory_smart_stats,
    'has_perfdata': True,
    'service_description': 'SMART %s Stats',
}
def inventory_smart_temp(info):
    """Discover one smart.temp service per disk that reports a temperature.

    Yields (disk name, {}) for every disk that has a Temperature_Celsius
    or Temperature_Internal attribute.
    """
    disks = parse_smart_raw_values(info)
    # items() instead of the Python-2-only iteritems(), matching the
    # iteration already used in inventory_smart_stats.
    for disk_name, disk in disks.items():
        if "Temperature_Celsius" in disk or "Temperature_Internal" in disk:
            yield disk_name, {}
def check_smart_temp(item, params, info):
    """Check the temperature reported by a disk via SMART.

    item   -- disk name as found in column 0 of the agent section
    params -- parameters handed through to check_temperature
    info   -- agent section as a list of already split lines

    Temperature_Celsius is preferred over Temperature_Internal. Returns the
    result of check_temperature, or None when the disk is missing from the
    agent output or reports neither attribute.
    """
    disk = parse_smart_raw_values(info).get(item)
    if disk is None:
        # A vanished disk is treated like a disk without temperature data
        # instead of failing with a KeyError.
        return

    for field in ("Temperature_Celsius", "Temperature_Internal"):
        if field in disk:
            return check_temperature(disk[field], params, "smart_%s" % item)

    return
# Registration of the smart.temp subcheck. The default parameters are taken
# from factory_settings["smart_temp_default_levels"].
check_info["smart.temp"] = {
    'check_function': check_smart_temp,
    'inventory_function': inventory_smart_temp,
    'service_description': 'Temperature SMART %s',
    'group': 'temperature',
    'has_perfdata': True,
    'includes': ['temperature.include'],
    'default_levels_variable': "smart_temp_default_levels"
}