Optimize RAIDZ expansion
[zfs.git] / cmd / zpool / zpool.d / smart
blob8ad3e107f09149e62b8c8ed62827e95c540700cd
#!/bin/sh
#
# Show SMART stats
#

# Help text, one entry per line in the form "name:<TAB>description".
# The -h handler selects the line for the invoked script name and prints the
# fields after the first tab (cut -f 2-), so the separator must be a tab.
helpstr="
smart:	Show SMART temperature and error stats (specific to drive type)
smartx:	Show SMART extended drive stats (specific to drive type).
temp:	Show SMART drive temperature in celsius (all drives).
health:	Show reported SMART status (all drives).
r_proc:	Show SMART read GBytes processed over drive lifetime (SAS).
w_proc:	Show SMART write GBytes processed over drive lifetime (SAS).
r_ucor:	Show SMART read uncorrectable errors (SAS).
w_ucor:	Show SMART write uncorrectable errors (SAS).
nonmed:	Show SMART non-medium errors (SAS).
defect:	Show SMART grown defect list (SAS).
hours_on:	Show number of hours drive powered on (all drives).
realloc:	Show SMART reallocated sectors count (ATA).
rep_ucor:	Show SMART reported uncorrectable count (ATA).
cmd_to:	Show SMART command timeout count (ATA).
pend_sec:	Show SMART current pending sector count (ATA).
off_ucor:	Show SMART offline uncorrectable errors (ATA).
ata_err:	Show SMART ATA errors (ATA).
pwr_cyc:	Show SMART power cycle count (ATA).
serial:	Show disk serial number.
nvme_err:	Show SMART NVMe errors (NVMe).
smart_test:	Show SMART self-test results summary.
test_type:	Show SMART self-test type (short, long... ).
test_status:	Show SMART self-test status.
test_progress:	Show SMART self-test percentage done.
test_ended:	Show when the last SMART self-test ended (if supported).
"
# Hack for developer testing
#
# If you set $samples to a directory containing smartctl output text files,
# we will use them instead of running smartctl on the vdevs.  This can be
# useful if you want to test a bunch of different smartctl outputs.  Also, if
# $samples is set, an additional 'file' column is added to the zpool output
# showing the filename.
samples=""
# get_filename_from_dir DIR
#
# Print the name (without the directory part) of one regular file located
# directly in DIR.  The file is chosen quasi-sequentially (based off our PID).
# This allows us to return a different filename every time this script is
# invoked (which we do for each vdev), without having to maintain state.
#
# Prints nothing and returns 1 if DIR contains no regular files.
get_filename_from_dir()
{
	dir=$1
	pid="$$"

	# Count and enumerate with the same find expression, so the index
	# computed below always refers to a file the loop will actually visit.
	# The arithmetic expansion strips any padding wc may add.
	num_files=$(( $(find "$dir" -maxdepth 1 -type f | wc -l) ))
	if [ "$num_files" -eq 0 ] ; then
		# Nothing to choose from; also avoids a modulo by zero below.
		return 1
	fi

	mod=$((pid % num_files))
	i=0
	find "$dir" -maxdepth 1 -type f | while IFS= read -r path ; do
		if [ "$mod" -eq "$i" ] ; then
			# Strip the directory part, leaving only the file name.
			printf '%s\n' "${path##*/}"
			break
		fi
		i=$((i+1))
	done
}
# Name this script was invoked as (directory part stripped).  It selects
# which statistic(s) get printed.
script="${0##*/}"

# smart_help NAME
#
# Print the description for NAME from $helpstr.  Only a line that starts
# with "NAME:" is selected, and the name plus any blanks or tabs following
# the colon are removed, so the result does not depend on which whitespace
# separates the name from its description.  Prints nothing if NAME has no
# entry.
smart_help()
{
	printf '%s\n' "$helpstr" | sed -n "s/^$1:[[:space:]]*//p"
}

if [ "$1" = "-h" ] ; then
	smart_help "$script"
	exit
fi
# Pick the device path to query: use VDEV_UPATH when it is a block device,
# otherwise fall back to VDEV_PATH.
#
# Sometimes, UPATH ends up /dev/(null).
# That should be corrected, but for now...
# shellcheck disable=SC2154
if [ -b "$VDEV_UPATH" ]; then
	somepath="${VDEV_UPATH}"
else
	somepath="${VDEV_PATH}"
fi
# smart_parse
#
# Read "smartctl -a" output on stdin and print one "name=value" line for
# every statistic recognized in it, followed by a "type=" line and an empty
# line.  The type is "sas", "sata" or "nvme", or empty when none of the
# markers below was seen.
#
# What kind of drive are we?  Look for the right line in smartctl:
#
# SAS:
#	Transport protocol:   SAS
#
# SATA:
#	ATA Version is:   8
#
# NVMe:
#	SMART/Health Information (NVMe Log 0xnn, NSID 0xnn)
#
# The awk program is passed in single quotes, so it must not contain any
# single quote characters (not even in comments).
smart_parse()
{
	awk '
	# SAS specific
	/read:/{print "rrd="$4"\nr_cor="$5"\nr_proc="$7"\nr_ucor="$8}
	/write:/{print "rwr="$4"\nw_cor="$5"\nw_proc="$7"\nw_ucor="$8}
	/Non-medium error count/{print "nonmed="$4}
	/Elements in grown defect list/{print "defect="$6}

	# SAS common
	/SAS/{type="sas"}
	/Drive Temperature:/{print "temp="$4}
	# Status can be a long string, substitute spaces with underscores
	/SMART Health Status:/{printf "health="; for(i=4;i<=NF-1;i++){printf "%s_", $i}; printf "%s\n", $i}
	/number of hours powered up/{print "hours_on="$7; hours_on=int($7)}
	/Serial number:/{print "serial="$3}

	# SATA specific
	/Reallocated_Sector_Ct/{print "realloc="$10}
	/Reported_Uncorrect/{print "rep_ucor="$10}
	/Command_Timeout/{print "cmd_to="$10}
	/Current_Pending_Sector/{print "pend_sec="$10}
	/Offline_Uncorrectable/{print "off_ucor="$10}
	/ATA Error Count:/{print "ata_err="$4}
	/Power_Cycle_Count/{print "pwr_cyc="$10}

	# SATA common
	/SATA/{type="sata"}
	/Temperature_Celsius/{print "temp="$10}
	/Airflow_Temperature_Cel/{print "temp="$10}
	/Current Temperature:/{print "temp="$3}
	/SMART overall-health self-assessment test result:/{print "health="$6}
	/Power_On_Hours/{print "hours_on="$10; hours_on=int($10)}
	/Serial Number:/{print "serial="$3}

	# NVMe common
	/NVMe/{type="nvme"}
	/Temperature:/{print "temp="$2}
	/SMART overall-health self-assessment test result:/{print "health="$6}
	/Power On Hours:/{gsub("[^0-9]","",$4); print "hours_on="$4}
	/Serial Number:/{print "serial="$3}
	/Power Cycles:/{print "pwr_cyc="$3}

	# NVMe specific
	/Media and Data Integrity Errors:/{print "nvme_err="$6}

	# SMART self-test info
	/Self-test execution status:/{progress=tolower($4)}	# SAS
	/SMART Self-test log/{test_seen=1}			# SAS
	/SMART Extended Self-test Log/{test_seen=1}		# SATA
	# Entry "# 1" of the self-test log
	/# 1/{
		test_type=tolower($3"_"$4);
		# Status could be one word ("Completed") or multiple ("Completed: read
		# failure").  Look for the ":" to see if we need to grab more words.

		if ($5 ~ ":")
			status=tolower($5""$6"_"$7)
		else
			status=tolower($5)
		if (status=="self")
			status="running";

		if (type == "sas") {
			hours=int($(NF-4))
		} else {
			hours=int($(NF-1))
			# SATA reports percent remaining, rather than percent done
			# Convert it to percent done.
			progress=(100-int($(NF-2)))"%"
		}
		# When we int()-ify "hours", it converts stuff like "NOW" and "-" into
		# 0.  In those cases, set it to hours_on, so they will cancel out in
		# the "hours_ago" calculation later on.
		if (hours == 0)
			hours=hours_on

		if (test_seen) {
			# NOTE(review): the "test" key carries the hours_on value;
			# confirm this is intended.
			print "test="hours_on
			print "test_type="test_type
			print "test_status="status
			print "test_progress="progress
		}
		# Not all drives report hours_on
		if (hours_on && hours) {
			total_hours_ago=(hours_on-hours)
			days_ago=int(total_hours_ago/24)
			hours_ago=(total_hours_ago % 24)
			if (days_ago != 0)
				ago_str=days_ago"d"
			if (hours_ago !=0)
				ago_str=ago_str""hours_ago"h"
			print "test_ended="ago_str
		}
	}

	END {print "type="type; ORS="\n"; print ""}
	'
}

# Collect the statistics into $out when we either have a block device and
# smartctl is available (/usr/sbin is added to the search path for the
# lookup), or when developer sample files are in use.
if [ -b "$somepath" ] && PATH="/usr/sbin:$PATH" command -v smartctl > /dev/null || [ -n "$samples" ] ; then
	if [ -n "$samples" ] ; then
		# cat a smartctl output text file instead of running smartctl
		# on a vdev (only used for developer testing).
		file=$(get_filename_from_dir "$samples")
		echo "file=$file"
		raw_out=$(cat "$samples/$file")
	else
		raw_out=$(sudo smartctl -a "$somepath")
	fi

	# printf rather than echo, so backslashes in the text are passed
	# through untouched.
	out=$(printf '%s\n' "$raw_out" | smart_parse)
fi
# Extract the drive type from the "type=" line of the parsed output.
type=$(printf '%s\n' "$out" | grep '^type=' | cut -d '=' -f 2)

# If type is not set by now, either we don't have a block device
# or smartctl failed.  Either way, default to ATA and set $out to
# nothing.
if [ -z "$type" ]; then
	type="sata"
	out=
fi
# smart_columns SCRIPT TYPE
#
# Print the '|'-separated list of statistic names to show when this script
# is invoked as SCRIPT for a drive of type TYPE ("sas", "sata" or "nvme").
# The "smart" and "smartx" lists depend on TYPE (nothing is printed for any
# other TYPE); "smart_test" has a fixed list; any other SCRIPT is taken to
# be the name of a single statistic and is printed as-is.
smart_columns()
{
	case $1 in
	smart)
		# Print temperature plus common predictors of drive failure
		case $2 in
		sas)
			printf '%s\n' "temp|health|r_ucor|w_ucor"
			;;
		sata)
			printf '%s\n' "temp|health|ata_err|realloc|rep_ucor|cmd_to|pend_sec|off_ucor"
			;;
		nvme)
			printf '%s\n' "temp|health|nvme_err"
			;;
		esac
		;;
	smartx)
		# Print some other interesting stats
		case $2 in
		sas)
			printf '%s\n' "hours_on|defect|nonmed|r_proc|w_proc"
			;;
		sata | nvme)
			printf '%s\n' "hours_on|pwr_cyc"
			;;
		esac
		;;
	smart_test)
		printf '%s\n' "test_type|test_status|test_progress|test_ended"
		;;
	*)
		printf '%s\n' "$1"
		;;
	esac
}

scripts=$(smart_columns "$script" "$type")
# smart_print OUT SCRIPTS
#
# OUT is a list of "name=value" lines and SCRIPTS a '|'-separated list of
# statistic names.  Print every line of OUT whose name is one of SCRIPTS,
# then an empty "name=" line for each name in SCRIPTS that had no line in
# OUT.
#
# Lines are selected on the complete name in front of the '=', so neither a
# value that happens to contain a statistic name nor a longer name that
# merely contains one can be picked up by mistake.
smart_print()
{
	with_vals=$(printf '%s\n' "$1" | grep -E "^($2)=")
	if [ -n "$with_vals" ]; then
		printf '%s\n' "$with_vals"
		# Names that did get a value, one per line; grep takes them
		# as a list of fixed strings that must match a whole line.
		found=$(printf '%s\n' "$with_vals" | awk -F "=" '{print $1}')
		without_vals=$(printf '%s\n' "$2" | tr '|' '\n' |
		    grep -v -x -F -e "$found" | awk '{print $0"="}')
	else
		without_vals=$(printf '%s\n' "$2" | tr '|' '\n' | awk '{print $0"="}')
	fi

	if [ -n "$without_vals" ]; then
		printf '%s\n' "$without_vals"
	fi
}

smart_print "$out" "$scripts"