/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
 */
/*
 * The ZFS retire agent is responsible for managing hot spares across all
 * pools.  When we see a device fault or a device removal, we try to open the
 * associated pool and look for any hot spares.  We iterate over any available
 * hot spares and attempt a 'zpool replace' for each one.
 *
 * For vdevs diagnosed as faulty, the agent is also responsible for proactively
 * marking the vdev FAULTY (for I/O errors) or DEGRADED (for checksum errors).
 */
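/*
 * Event classes handled below: 'resource.fs.zfs.removed' (device removal,
 * which triggers hot-spare substitution), 'resource.fs.zfs.statechange' and
 * 'resource.sysevent.EC_zfs.ESC_ZFS_vdev_remove' (candidates for repair),
 * list.repaired/list.resolved events, and suspect lists carrying
 * 'fault.fs.zfs.*' or 'fault.io.*' faults.  See zfs_retire_recv().
 */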
#include <fm/fmd_api.h>
#include <sys/fs/zfs.h>
#include <sys/fm/protocol.h>
#include <sys/fm/fs/zfs.h>
#include <libzfs.h>
#include <fm/libtopo.h>
#include <string.h>
typedef struct zfs_retire_repaired {
	struct zfs_retire_repaired	*zrr_next;
	uint64_t			zrr_pool;
	uint64_t			zrr_vdev;
} zfs_retire_repaired_t;

typedef struct zfs_retire_data {
	libzfs_handle_t			*zrd_hdl;
	zfs_retire_repaired_t		*zrd_repaired;
} zfs_retire_data_t;
static void
zfs_retire_clear_data(fmd_hdl_t *hdl, zfs_retire_data_t *zdp)
{
	zfs_retire_repaired_t *zrp;

	while ((zrp = zdp->zrd_repaired) != NULL) {
		zdp->zrd_repaired = zrp->zrr_next;
		fmd_hdl_free(hdl, zrp, sizeof (zfs_retire_repaired_t));
	}
}
/*
 * Find a pool with a matching GUID.
 */
typedef struct find_cbdata {
	uint64_t	cb_guid;
	const char	*cb_fru;
	zpool_handle_t	*cb_zhp;
	nvlist_t	*cb_vdev;
} find_cbdata_t;

static int
find_pool(zpool_handle_t *zhp, void *data)
{
	find_cbdata_t *cbp = data;

	if (cbp->cb_guid ==
	    zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL)) {
		cbp->cb_zhp = zhp;
		return (1);
	}

	zpool_close(zhp);
	return (0);
}
/*
 * Find a vdev within a tree with a matching GUID.
 */
static nvlist_t *
find_vdev(libzfs_handle_t *zhdl, nvlist_t *nv, const char *search_fru,
    uint64_t search_guid)
{
	uint64_t guid;
	nvlist_t **child;
	uint_t c, children;
	nvlist_t *ret;
	char *fru;

	if (search_fru != NULL) {
		if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &fru) == 0 &&
		    libzfs_fru_compare(zhdl, fru, search_fru))
			return (nv);
	} else {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0 &&
		    guid == search_guid)
			return (nv);
	}

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) != 0)
		return (NULL);

	for (c = 0; c < children; c++) {
		if ((ret = find_vdev(zhdl, child[c], search_fru,
		    search_guid)) != NULL)
			return (ret);
	}

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
	    &child, &children) != 0)
		return (NULL);

	for (c = 0; c < children; c++) {
		if ((ret = find_vdev(zhdl, child[c], search_fru,
		    search_guid)) != NULL)
			return (ret);
	}

	return (NULL);
}
/*
 * Given a (pool, vdev) GUID pair, find the matching pool and vdev.
 */
static zpool_handle_t *
find_by_guid(libzfs_handle_t *zhdl, uint64_t pool_guid, uint64_t vdev_guid,
    nvlist_t **vdevp)
{
	find_cbdata_t cb;
	zpool_handle_t *zhp;
	nvlist_t *config, *nvroot;

	/*
	 * Find the corresponding pool and make sure the vdev still exists.
	 */
	cb.cb_guid = pool_guid;
	if (zpool_iter(zhdl, find_pool, &cb) != 1)
		return (NULL);

	zhp = cb.cb_zhp;
	config = zpool_get_config(zhp, NULL);
	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvroot) != 0) {
		zpool_close(zhp);
		return (NULL);
	}

	if (vdev_guid != 0) {
		if ((*vdevp = find_vdev(zhdl, nvroot, NULL,
		    vdev_guid)) == NULL) {
			zpool_close(zhp);
			return (NULL);
		}
	}

	return (zhp);
}
static int
search_pool(zpool_handle_t *zhp, void *data)
{
	find_cbdata_t *cbp = data;
	nvlist_t *config;
	nvlist_t *nvroot;

	config = zpool_get_config(zhp, NULL);
	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvroot) != 0) {
		zpool_close(zhp);
		return (0);
	}

	if ((cbp->cb_vdev = find_vdev(zpool_get_handle(zhp), nvroot,
	    cbp->cb_fru, 0)) != NULL) {
		cbp->cb_zhp = zhp;
		return (1);
	}

	zpool_close(zhp);
	return (0);
}
/*
 * Given a FRU FMRI, find the matching pool and vdev.
 */
static zpool_handle_t *
find_by_fru(libzfs_handle_t *zhdl, const char *fru, nvlist_t **vdevp)
{
	find_cbdata_t cb;

	cb.cb_fru = fru;
	cb.cb_zhp = NULL;
	if (zpool_iter(zhdl, search_pool, &cb) != 1)
		return (NULL);

	*vdevp = cb.cb_vdev;
	return (cb.cb_zhp);
}
/*
 * Given a vdev, attempt to replace it with every known spare until one
 * succeeds.
 */
static void
replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev)
{
	nvlist_t *config, *nvroot, *replacement;
	nvlist_t **spares;
	uint_t s, nspares;
	char *dev_name;

	config = zpool_get_config(zhp, NULL);
	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvroot) != 0)
		return;

	/*
	 * Find out if there are any hot spares available in the pool.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) != 0)
		return;

	replacement = fmd_nvl_alloc(hdl, FMD_SLEEP);

	(void) nvlist_add_string(replacement, ZPOOL_CONFIG_TYPE,
	    VDEV_TYPE_ROOT);

	dev_name = zpool_vdev_name(NULL, zhp, vdev, B_FALSE);

	/*
	 * Try to replace each spare, ending when we successfully
	 * replace it.
	 */
	for (s = 0; s < nspares; s++) {
		char *spare_name;

		if (nvlist_lookup_string(spares[s], ZPOOL_CONFIG_PATH,
		    &spare_name) != 0)
			continue;

		(void) nvlist_add_nvlist_array(replacement,
		    ZPOOL_CONFIG_CHILDREN, &spares[s], 1);

		if (zpool_vdev_attach(zhp, dev_name, spare_name,
		    replacement, B_TRUE) == 0)
			break;
	}

	free(dev_name);
	nvlist_free(replacement);
}
/*
 * Repair this vdev if we had diagnosed a 'fault.fs.zfs.device' and
 * the ASRU is now usable.  ZFS has found the device to be present and
 * functioning.
 */
/*ARGSUSED*/
static void
zfs_vdev_repair(fmd_hdl_t *hdl, nvlist_t *nvl)
{
	zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl);
	zfs_retire_repaired_t *zrp;
	uint64_t pool_guid, vdev_guid;
	nvlist_t *asru;

	if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
	    &pool_guid) != 0 || nvlist_lookup_uint64(nvl,
	    FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0)
		return;

	/*
	 * Before checking the state of the ASRU, go through and see if we've
	 * already made an attempt to repair this ASRU.  This list is cleared
	 * whenever we receive any kind of list event, and is designed to
	 * prevent us from generating a feedback loop when we attempt repairs
	 * against a faulted pool.  The problem is that checking the unusable
	 * state of the ASRU can involve opening the pool, which can post
	 * statechange events but otherwise leave the pool in the faulted
	 * state.  This list allows us to detect when a statechange event is
	 * due to our own request.
	 */
	for (zrp = zdp->zrd_repaired; zrp != NULL; zrp = zrp->zrr_next) {
		if (zrp->zrr_pool == pool_guid &&
		    zrp->zrr_vdev == vdev_guid)
			return;
	}

	asru = fmd_nvl_alloc(hdl, FMD_SLEEP);

	(void) nvlist_add_uint8(asru, FM_VERSION, ZFS_SCHEME_VERSION0);
	(void) nvlist_add_string(asru, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS);
	(void) nvlist_add_uint64(asru, FM_FMRI_ZFS_POOL, pool_guid);
	(void) nvlist_add_uint64(asru, FM_FMRI_ZFS_VDEV, vdev_guid);

	/*
	 * We explicitly check for the unusable state here to make sure we
	 * aren't responding to a transient state change.  As part of opening a
	 * vdev, it's possible to see the 'statechange' event, only to be
	 * followed by a vdev failure later.  If we don't check the current
	 * state of the vdev (or pool) before marking it repaired, then we risk
	 * generating spurious repair events followed immediately by the same
	 * diagnosis.
	 *
	 * This assumes that the ZFS scheme code associates unusable (i.e.
	 * isolated) with its own definition of faulty state.  In the case of a
	 * DEGRADED leaf vdev (due to checksum errors), this is not the case.
	 * This works, however, because the transient state change is not
	 * posted in this case.  This could be made more explicit by not
	 * relying on the scheme's unusable callback and instead directly
	 * checking the vdev state, where we could correctly account for
	 * DEGRADED state.
	 */
	if (!fmd_nvl_fmri_unusable(hdl, asru) && fmd_nvl_fmri_has_fault(hdl,
	    asru, FMD_HAS_FAULT_ASRU, NULL)) {
		topo_hdl_t *thp;
		char *fmri = NULL;
		int err;

		thp = fmd_hdl_topo_hold(hdl, TOPO_VERSION);
		if (topo_fmri_nvl2str(thp, asru, &fmri, &err) == 0)
			(void) fmd_repair_asru(hdl, fmri);
		fmd_hdl_topo_rele(hdl, thp);

		topo_hdl_strfree(thp, fmri);
	}

	zrp = fmd_hdl_alloc(hdl, sizeof (zfs_retire_repaired_t), FMD_SLEEP);
	zrp->zrr_next = zdp->zrd_repaired;
	zrp->zrr_pool = pool_guid;
	zrp->zrr_vdev = vdev_guid;
	zdp->zrd_repaired = zrp;

	nvlist_free(asru);
}
/*ARGSUSED*/
static void
zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
    const char *class)
{
	uint64_t pool_guid, vdev_guid;
	zpool_handle_t *zhp;
	nvlist_t *resource, *fault, *fru;
	nvlist_t **faults;
	uint_t f, nfaults;
	zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl);
	libzfs_handle_t *zhdl = zdp->zrd_hdl;
	boolean_t fault_device, degrade_device;
	boolean_t is_repair;
	char *scheme, *fmri;
	nvlist_t *vdev;
	char *uuid;
	int repair_done = 0;
	boolean_t retire;
	boolean_t is_disk;
	vdev_aux_t aux;
	topo_hdl_t *thp;
	int err;

	/*
	 * If this is a resource notifying us of device removal, then simply
	 * check for an available spare and continue.
	 */
	if (strcmp(class, "resource.fs.zfs.removed") == 0) {
		if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
		    &pool_guid) != 0 ||
		    nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
		    &vdev_guid) != 0)
			return;

		if ((zhp = find_by_guid(zhdl, pool_guid, vdev_guid,
		    &vdev)) == NULL)
			return;

		if (fmd_prop_get_int32(hdl, "spare_on_remove"))
			replace_with_spare(hdl, zhp, vdev);
		zpool_close(zhp);
		return;
	}

	if (strcmp(class, FM_LIST_RESOLVED_CLASS) == 0)
		return;

	if (strcmp(class, "resource.fs.zfs.statechange") == 0 ||
	    strcmp(class,
	    "resource.sysevent.EC_zfs.ESC_ZFS_vdev_remove") == 0) {
		zfs_vdev_repair(hdl, nvl);
		return;
	}

	zfs_retire_clear_data(hdl, zdp);

	if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0)
		is_repair = B_TRUE;
	else
		is_repair = B_FALSE;

	/*
	 * We subscribe to zfs faults as well as all repair events.
	 */
	if (nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST,
	    &faults, &nfaults) != 0)
		return;

	for (f = 0; f < nfaults; f++) {
		fault = faults[f];

		fault_device = B_FALSE;
		degrade_device = B_FALSE;
		is_disk = B_FALSE;

		if (nvlist_lookup_boolean_value(fault, FM_SUSPECT_RETIRE,
		    &retire) == 0 && retire == 0)
			continue;

		/*
		 * While we subscribe to fault.fs.zfs.*, we only take action
		 * for faults targeting a specific vdev (open failure or SERD
		 * failure).  We also subscribe to fault.io.* events, so that
		 * faulty disks will be faulted in the ZFS configuration.
		 */
		if (fmd_nvl_class_match(hdl, fault, "fault.fs.zfs.vdev.io")) {
			fault_device = B_TRUE;
		} else if (fmd_nvl_class_match(hdl, fault,
		    "fault.fs.zfs.vdev.checksum")) {
			degrade_device = B_TRUE;
		} else if (fmd_nvl_class_match(hdl, fault,
		    "fault.fs.zfs.device")) {
			fault_device = B_FALSE;
		} else if (fmd_nvl_class_match(hdl, fault, "fault.io.*")) {
			is_disk = B_TRUE;
			fault_device = B_TRUE;
		} else {
			continue;
		}

		if (is_disk) {
			/*
			 * This is a disk fault.  Lookup the FRU, convert it to
			 * an FMRI string, and attempt to find a matching vdev.
			 */
			if (nvlist_lookup_nvlist(fault, FM_FAULT_FRU,
			    &fru) != 0 ||
			    nvlist_lookup_string(fru, FM_FMRI_SCHEME,
			    &scheme) != 0)
				continue;

			if (strcmp(scheme, FM_FMRI_SCHEME_HC) != 0)
				continue;

			thp = fmd_hdl_topo_hold(hdl, TOPO_VERSION);
			if (topo_fmri_nvl2str(thp, fru, &fmri, &err) != 0) {
				fmd_hdl_topo_rele(hdl, thp);
				continue;
			}

			zhp = find_by_fru(zhdl, fmri, &vdev);
			topo_hdl_strfree(thp, fmri);
			fmd_hdl_topo_rele(hdl, thp);

			if (zhp == NULL)
				continue;

			(void) nvlist_lookup_uint64(vdev,
			    ZPOOL_CONFIG_GUID, &vdev_guid);
			aux = VDEV_AUX_EXTERNAL;
		} else {
			/*
			 * This is a ZFS fault.  Lookup the resource, and
			 * attempt to find the matching vdev.
			 */
			if (nvlist_lookup_nvlist(fault, FM_FAULT_RESOURCE,
			    &resource) != 0 ||
			    nvlist_lookup_string(resource, FM_FMRI_SCHEME,
			    &scheme) != 0)
				continue;

			if (strcmp(scheme, FM_FMRI_SCHEME_ZFS) != 0)
				continue;

			if (nvlist_lookup_uint64(resource, FM_FMRI_ZFS_POOL,
			    &pool_guid) != 0)
				continue;

			if (nvlist_lookup_uint64(resource, FM_FMRI_ZFS_VDEV,
			    &vdev_guid) != 0) {
				if (is_repair)
					vdev_guid = 0;
				else
					continue;
			}

			if ((zhp = find_by_guid(zhdl, pool_guid, vdev_guid,
			    &vdev)) == NULL)
				continue;

			aux = VDEV_AUX_ERR_EXCEEDED;
		}

		if (vdev_guid == 0) {
			/*
			 * For pool-level repair events, clear the entire pool.
			 */
			(void) zpool_clear(zhp, NULL, NULL);
			zpool_close(zhp);
			continue;
		}

		/*
		 * If this is a repair event, then mark the vdev as repaired
		 * and move on.
		 */
		if (is_repair) {
			repair_done = 1;
			(void) zpool_vdev_clear(zhp, vdev_guid);
			zpool_close(zhp);
			continue;
		}

		/*
		 * Actively fault the device if needed.
		 */
		if (fault_device)
			(void) zpool_vdev_fault(zhp, vdev_guid, aux);
		if (degrade_device)
			(void) zpool_vdev_degrade(zhp, vdev_guid, aux);

		/*
		 * Attempt to substitute a hot spare.
		 */
		replace_with_spare(hdl, zhp, vdev);
		zpool_close(zhp);
	}

	if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0 && repair_done &&
	    nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid) == 0)
		fmd_case_uuresolved(hdl, uuid);
}
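/*
 * Module registration: zfs_retire_recv() is the only callback this agent
 * provides; the remaining fmd entry points are unused.
 */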
static const fmd_hdl_ops_t fmd_ops = {
	zfs_retire_recv,	/* fmdo_recv */
	NULL,			/* fmdo_timeout */
	NULL,			/* fmdo_close */
	NULL,			/* fmdo_stats */
	NULL,			/* fmdo_gc */
};
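/*
 * Tunable properties.  'spare_on_remove' (default "true") controls whether a
 * hot spare is substituted when a device-removal event is received (see
 * zfs_retire_recv()); like other fmd module properties, it can be overridden
 * in the agent's .conf file.
 */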
static const fmd_prop_t fmd_props[] = {
	{ "spare_on_remove", FMD_TYPE_BOOL, "true" },
	{ NULL, 0, NULL }
};
static const fmd_hdl_info_t fmd_info = {
	"ZFS Retire Agent", "1.0", &fmd_ops, fmd_props
};
void
_fmd_init(fmd_hdl_t *hdl)
{
	zfs_retire_data_t *zdp;
	libzfs_handle_t *zhdl;

	if ((zhdl = libzfs_init()) == NULL)
		return;

	if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) {
		libzfs_fini(zhdl);
		return;
	}

	zdp = fmd_hdl_zalloc(hdl, sizeof (zfs_retire_data_t), FMD_SLEEP);
	zdp->zrd_hdl = zhdl;

	fmd_hdl_setspecific(hdl, zdp);
}
void
_fmd_fini(fmd_hdl_t *hdl)
{
	zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl);

	if (zdp != NULL) {
		zfs_retire_clear_data(hdl, zdp);
		libzfs_fini(zdp->zrd_hdl);
		fmd_hdl_free(hdl, zdp, sizeof (zfs_retire_data_t));
	}
}