4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
25 #pragma dictionary "DISK"
33 * Overall comments for this file:
34 * <disk-as-detector> The disk-as-detector DE provides the mapping between
35 * ereports generated by a kernel disk driver sd(7D) and resulting faults.
39 * SERD engine for media error fault propagation:
41 * This strategy is designed to give a file system, like ZFS, the
42 * ability to attempt data recovery/relocation without faulting a disk.
43 * This implementation depends on a file system retry to the same lba
44 * to trigger a fault when recovery/relocation is not possible.
46 * We let the engine propagate only one error per minute, and then if we
47 * still get 2 or more errors within 24 hours for the same LBA, there is a fault.
49 engine serd.io.scsi.cmd.disk.dev.rqs.merr@P, N=1, T=24h;  /* SERD engine referenced by the engine= property of fault.io.scsi.cmd.disk.dev.rqs.merr; parameters N=1, T=24h */
52 * disk-as-detector: fault events.
54 event fault.io.scsi.cmd.disk.dev.rqs.derr@P;  /* derr fault: declared with no SERD engine attached */
55 event fault.io.scsi.cmd.disk.dev.rqs.merr@P,  /* merr (media error) fault */
56 engine=serd.io.scsi.cmd.disk.dev.rqs.merr@P;  /* tied to the SERD engine serd.io.scsi.cmd.disk.dev.rqs.merr */
59 * The uderr fault will be defined at some future time.
60 * event fault.io.scsi.cmd.disk.dev.uderr@P;
64 * disk-as-detector: upset events.
65 * NOTE: For now we define an upset to implement discard.
67 event upset.io.scsi.cmd.disk.dev.rqs.derr@P;  /* propagates to the derr ereport when driver-assessment is not fatal */
68 event upset.io.scsi.cmd.disk.dev.rqs.merr@P;  /* propagates to the merr ereport when driver-assessment is not fatal */
69 event upset.io.scsi.cmd.disk.dev.uderr@P;  /* propagates to the uderr ereport with no constraint */
70 event upset.io.scsi.cmd.disk.dev.serr@P;  /* propagates to the serr ereport with no constraint */
71 event upset.io.scsi.cmd.disk.tran@P;  /* propagates to the tran ereport with no constraint */
72 event upset.io.scsi.cmd.disk.recovered@P;  /* propagates to the recovered ereport with no constraint */
75 * disk-as-detector: ereports from the kernel.
77 * We don't know the topology for all scsi disks, but the kernel will always
78 * generate ereport telemetry assuming that we do. We define these ereports
79 * with 'discard_if_config_unknown=1', which permits ereports against things
80 * with unknown topology to be silently discarded. The ereport data is logged
81 * in either case, and can be viewed via 'fmdump -eV'.
83 event ereport.io.scsi.cmd.disk.dev.rqs.derr@P, discard_if_config_unknown=1;  /* reached from either the derr fault or the derr upset, chosen by driver-assessment */
84 event ereport.io.scsi.cmd.disk.dev.rqs.merr@P, discard_if_config_unknown=1;  /* reached from either the merr fault or the merr upset, chosen by driver-assessment */
85 event ereport.io.scsi.cmd.disk.dev.serr@P, discard_if_config_unknown=1;  /* reached only from upset.io.scsi.cmd.disk.dev.serr */
86 event ereport.io.scsi.cmd.disk.dev.uderr@P, discard_if_config_unknown=1;  /* reached only from upset.io.scsi.cmd.disk.dev.uderr */
87 event ereport.io.scsi.cmd.disk.recovered@P, discard_if_config_unknown=1;  /* reached only from upset.io.scsi.cmd.disk.recovered */
88 event ereport.io.scsi.cmd.disk.tran@P, discard_if_config_unknown=1;  /* reached only from upset.io.scsi.cmd.disk.tran */
91 * For some ereports we let the 'driver-assessment', communicated as part of
92 * the ereport payload, determine fault vs. upset via propagation constraints.
94 #define DRIVER_ASSESSMENT_FATAL \
95 (payloadprop_contains("driver-assessment", "fatal"))  /* constraint: tests the "driver-assessment" payload property against "fatal" */
96 #define DRIVER_ASSESSMENT_NONFATAL (!DRIVER_ASSESSMENT_FATAL)  /* logical negation of DRIVER_ASSESSMENT_FATAL */
99 * disk-as-detector: propagations from faults (based on
100 * DRIVER_ASSESSMENT_FATAL).
101 * We need to set additional fault payload properties to indicate fault details.
102 * The payload properties we may need are listed as follows:
103 * fault.io.scsi.cmd.disk.dev.rqs.derr
104 * op_code, key, asc, ascq
105 * fault.io.scsi.cmd.disk.dev.rqs.merr
106 * op_code, key, asc, ascq, lba
108 prop fault.io.scsi.cmd.disk.dev.rqs.derr@P->  /* derr fault propagates to the derr ereport */
109 ereport.io.scsi.cmd.disk.dev.rqs.derr@P{ DRIVER_ASSESSMENT_FATAL &&  /* first conjunct: only when driver-assessment is fatal */
110 setpayloadprop("key", payloadprop("key")) &&  /* set fault payload "key" from the ereport's "key" */
111 setpayloadprop("asc", payloadprop("asc")) &&  /* set fault payload "asc" from the ereport's "asc" */
112 setpayloadprop("ascq", payloadprop("ascq"))};  /* set fault payload "ascq"; NOTE(review): op_code is listed in the preceding comment but is not set here - confirm */
115 * Use setserdsuffix with the specific LBA, so that
116 * the SERD engine only triggers if the fault recurs on the same LBA.
118 prop fault.io.scsi.cmd.disk.dev.rqs.merr@P->  /* merr fault propagates to the merr ereport */
119 ereport.io.scsi.cmd.disk.dev.rqs.merr@P{ DRIVER_ASSESSMENT_FATAL &&  /* first conjunct: only when driver-assessment is fatal */
120 setserdsuffix(payloadprop("lba")) &&  /* SERD suffix taken from the ereport's "lba" payload property */
121 setpayloadprop("key", payloadprop("key")) &&  /* set fault payload "key" from the ereport's "key" */
122 setpayloadprop("asc", payloadprop("asc")) &&  /* set fault payload "asc" from the ereport's "asc" */
123 setpayloadprop("ascq", payloadprop("ascq")) &&  /* set fault payload "ascq" from the ereport's "ascq" */
124 setpayloadprop("lba", payloadprop("lba"))};  /* set fault payload "lba"; NOTE(review): op_code is listed in the earlier payload comment but is not set here - confirm */
127 * NOTE: this propagation uses the "may" propagation of eversholt.
128 * The ereport need never exist. It is just a way of making
129 * the diagnosis wait for the within() time on that ereport
130 * to complete. Once it has completed, the diagnosis continues
131 * even though the dummy ereport didn't occur.
133 event ereport.io.scsi.cmd.disk.dev.rqs.merr.dummy@P {within(60s)};  /* placeholder ereport carrying only a within(60s) constraint */
134 prop fault.io.scsi.cmd.disk.dev.rqs.merr@P (0) ->  /* (0): the "may" propagation, so the dummy ereport is not required to occur */
135 ereport.io.scsi.cmd.disk.dev.rqs.merr.dummy@P;  /* target is the dummy ereport declared just above */
138 * The uderr fault will be propagated at some future time.
139 * prop fault.io.scsi.cmd.disk.dev.uderr@P->
140 * ereport.io.scsi.cmd.disk.dev.uderr@P{ DRIVER_ASSESSMENT_FATAL };
144 * disk-as-detector: propagations from upsets (based on
145 * DRIVER_ASSESSMENT_NONFATAL).
147 prop upset.io.scsi.cmd.disk.dev.rqs.derr@P->  /* derr upset propagates to the derr ereport */
148 ereport.io.scsi.cmd.disk.dev.rqs.derr@P{ DRIVER_ASSESSMENT_NONFATAL };  /* only when driver-assessment is not fatal */
150 prop upset.io.scsi.cmd.disk.dev.rqs.merr@P->  /* merr upset propagates to the merr ereport */
151 ereport.io.scsi.cmd.disk.dev.rqs.merr@P{ DRIVER_ASSESSMENT_NONFATAL };  /* only when driver-assessment is not fatal */
154 * disk-as-detector: propagations from upsets (independent of the driver-assessment).
158 prop upset.io.scsi.cmd.disk.dev.serr@P->  /* serr upset propagates to the serr ereport */
159 ereport.io.scsi.cmd.disk.dev.serr@P;  /* no constraint attached */
161 prop upset.io.scsi.cmd.disk.dev.uderr@P->  /* uderr upset propagates to the uderr ereport */
162 ereport.io.scsi.cmd.disk.dev.uderr@P;  /* no constraint attached */
164 prop upset.io.scsi.cmd.disk.recovered@P->  /* recovered upset propagates to the recovered ereport */
165 ereport.io.scsi.cmd.disk.recovered@P;  /* no constraint attached */
167 prop upset.io.scsi.cmd.disk.tran@P->  /* tran upset propagates to the tran ereport */
168 ereport.io.scsi.cmd.disk.tran@P;  /* no constraint attached */
171 * --------------------------------------
172 * The remainder of this file contains rules associated with the operation of
173 * cmd/fm/modules/common/disk-monitor/disk_monitor.c code.
175 * The disk DE provides a very simple 1-to-1 mapping between SCSI disk events
176 * generated by the disk-transport fmd module, and the resulting faults.
182 event fault.io.disk.over-temperature@P,  /* fault that propagates to ereport.io.scsi.disk.over-temperature */
183 FITrate=10, FRU=P, ASRU=P;  /* FITrate=10; FRU and ASRU are both P */
184 event fault.io.disk.predictive-failure@P,  /* fault that propagates to ereport.io.scsi.disk.predictive-failure */
185 FITrate=10, FRU=P, ASRU=P;  /* FITrate=10 stated once, as in fault.io.disk.over-temperature; FRU and ASRU are both P */
186 event fault.io.disk.self-test-failure@P,  /* fault that propagates to ereport.io.scsi.disk.self-test-failure */
187 FITrate=10, FRU=P, ASRU=P;  /* FITrate=10 stated once, as in fault.io.disk.over-temperature; FRU and ASRU are both P */
192 event ereport.io.scsi.disk.over-temperature@P;  /* reached from fault.io.disk.over-temperature */
193 event ereport.io.scsi.disk.predictive-failure@P;  /* reached from fault.io.disk.predictive-failure */
194 event ereport.io.scsi.disk.self-test-failure@P;  /* reached from fault.io.disk.self-test-failure */
199 prop fault.io.disk.over-temperature@P ->  /* over-temperature fault propagates to its ereport */
200 ereport.io.scsi.disk.over-temperature@P;  /* no constraint, no payload copied */
202 prop fault.io.disk.self-test-failure@P ->  /* self-test-failure fault propagates to its ereport */
203 ereport.io.scsi.disk.self-test-failure@P;  /* no constraint, no payload copied */
205 prop fault.io.disk.predictive-failure@P ->  /* predictive-failure fault propagates to its ereport */
206 ereport.io.scsi.disk.predictive-failure@P {  /* constraint body only sets fault payload properties */
207 setpayloadprop("asc", payloadprop("additional-sense-code")) &&  /* fault "asc" taken from the ereport's "additional-sense-code" */
208 setpayloadprop("ascq", payloadprop("additional-sense-code-qualifier")) };  /* fault "ascq" taken from the ereport's "additional-sense-code-qualifier" */