4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2016, Intel Corporation.
25 * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>
29 * The ZFS retire agent is responsible for managing hot spares across all pools.
30 * When we see a device fault or a device removal, we try to open the associated
31 * pool and look for any hot spares. We iterate over any available hot spares
32 * and attempt a 'zpool replace' for each one.
34 * For vdevs diagnosed as faulty, the agent is also responsible for proactively
35 * marking the vdev FAULTY (for I/O errors) or DEGRADED (for checksum errors).
38 #include <sys/fs/zfs.h>
39 #include <sys/fm/protocol.h>
40 #include <sys/fm/fs/zfs.h>
46 #include "zfs_agents.h"
50 typedef struct zfs_retire_repaired
{
51 struct zfs_retire_repaired
*zrr_next
;
54 } zfs_retire_repaired_t
;
56 typedef struct zfs_retire_data
{
57 libzfs_handle_t
*zrd_hdl
;
58 zfs_retire_repaired_t
*zrd_repaired
;
62 zfs_retire_clear_data(fmd_hdl_t
*hdl
, zfs_retire_data_t
*zdp
)
64 zfs_retire_repaired_t
*zrp
;
66 while ((zrp
= zdp
->zrd_repaired
) != NULL
) {
67 zdp
->zrd_repaired
= zrp
->zrr_next
;
68 fmd_hdl_free(hdl
, zrp
, sizeof (zfs_retire_repaired_t
));
73 * Find a pool with a matching GUID.
75 typedef struct find_cbdata
{
77 zpool_handle_t
*cb_zhp
;
79 uint64_t cb_vdev_guid
;
80 uint64_t cb_num_spares
;
84 find_pool(zpool_handle_t
*zhp
, void *data
)
86 find_cbdata_t
*cbp
= data
;
89 zpool_get_prop_int(zhp
, ZPOOL_PROP_GUID
, NULL
)) {
99 * Find a vdev within a tree with a matching GUID.
102 find_vdev(libzfs_handle_t
*zhdl
, nvlist_t
*nv
, uint64_t search_guid
)
109 if (nvlist_lookup_uint64(nv
, ZPOOL_CONFIG_GUID
, &guid
) == 0 &&
110 guid
== search_guid
) {
111 fmd_hdl_debug(fmd_module_hdl("zfs-retire"),
112 "matched vdev %llu", guid
);
116 if (nvlist_lookup_nvlist_array(nv
, ZPOOL_CONFIG_CHILDREN
,
117 &child
, &children
) != 0)
120 for (c
= 0; c
< children
; c
++) {
121 if ((ret
= find_vdev(zhdl
, child
[c
], search_guid
)) != NULL
)
125 if (nvlist_lookup_nvlist_array(nv
, ZPOOL_CONFIG_L2CACHE
,
126 &child
, &children
) != 0)
129 for (c
= 0; c
< children
; c
++) {
130 if ((ret
= find_vdev(zhdl
, child
[c
], search_guid
)) != NULL
)
134 if (nvlist_lookup_nvlist_array(nv
, ZPOOL_CONFIG_SPARES
,
135 &child
, &children
) != 0)
138 for (c
= 0; c
< children
; c
++) {
139 if ((ret
= find_vdev(zhdl
, child
[c
], search_guid
)) != NULL
)
147 remove_spares(zpool_handle_t
*zhp
, void *data
)
149 nvlist_t
*config
, *nvroot
;
153 find_cbdata_t
*cbp
= data
;
154 uint64_t spareguid
= 0;
158 config
= zpool_get_config(zhp
, NULL
);
159 if (nvlist_lookup_nvlist(config
,
160 ZPOOL_CONFIG_VDEV_TREE
, &nvroot
) != 0) {
165 if (nvlist_lookup_nvlist_array(nvroot
, ZPOOL_CONFIG_SPARES
,
166 &spares
, &nspares
) != 0) {
171 for (int i
= 0; i
< nspares
; i
++) {
172 if (nvlist_lookup_uint64(spares
[i
], ZPOOL_CONFIG_GUID
,
173 &spareguid
) == 0 && spareguid
== cbp
->cb_vdev_guid
) {
174 devname
= zpool_vdev_name(NULL
, zhp
, spares
[i
],
176 nvlist_lookup_uint64_array(spares
[i
],
177 ZPOOL_CONFIG_VDEV_STATS
, (uint64_t **)&vs
, &c
);
178 if (vs
->vs_state
!= VDEV_STATE_REMOVED
&&
179 zpool_vdev_remove_wanted(zhp
, devname
) == 0)
180 cbp
->cb_num_spares
++;
190 * Given a vdev guid, find and remove all spares associated with it.
193 find_and_remove_spares(libzfs_handle_t
*zhdl
, uint64_t vdev_guid
)
197 cb
.cb_num_spares
= 0;
198 cb
.cb_vdev_guid
= vdev_guid
;
199 zpool_iter(zhdl
, remove_spares
, &cb
);
201 return (cb
.cb_num_spares
);
205 * Given a (pool, vdev) GUID pair, find the matching pool and vdev.
207 static zpool_handle_t
*
208 find_by_guid(libzfs_handle_t
*zhdl
, uint64_t pool_guid
, uint64_t vdev_guid
,
213 nvlist_t
*config
, *nvroot
;
216 * Find the corresponding pool and make sure the vdev still exists.
218 cb
.cb_guid
= pool_guid
;
219 if (zpool_iter(zhdl
, find_pool
, &cb
) != 1)
223 config
= zpool_get_config(zhp
, NULL
);
224 if (nvlist_lookup_nvlist(config
, ZPOOL_CONFIG_VDEV_TREE
,
230 if (vdev_guid
!= 0) {
231 if ((*vdevp
= find_vdev(zhdl
, nvroot
, vdev_guid
)) == NULL
) {
241 * Given a vdev, attempt to replace it with every known spare until one
242 * succeeds or we run out of devices to try.
243 * Return whether we were successful or not in replacing the device.
246 replace_with_spare(fmd_hdl_t
*hdl
, zpool_handle_t
*zhp
, nvlist_t
*vdev
)
248 nvlist_t
*config
, *nvroot
, *replacement
;
252 zprop_source_t source
;
255 config
= zpool_get_config(zhp
, NULL
);
256 if (nvlist_lookup_nvlist(config
, ZPOOL_CONFIG_VDEV_TREE
,
261 * Find out if there are any hot spares available in the pool.
263 if (nvlist_lookup_nvlist_array(nvroot
, ZPOOL_CONFIG_SPARES
,
264 &spares
, &nspares
) != 0)
268 * Look up the "ashift" pool property; we may need it for the replacement
270 ashift
= zpool_get_prop_int(zhp
, ZPOOL_PROP_ASHIFT
, &source
);
272 replacement
= fmd_nvl_alloc(hdl
, FMD_SLEEP
);
274 (void) nvlist_add_string(replacement
, ZPOOL_CONFIG_TYPE
,
277 dev_name
= zpool_vdev_name(NULL
, zhp
, vdev
, B_FALSE
);
280 * Try to replace each spare, ending when we successfully
283 for (s
= 0; s
< nspares
; s
++) {
284 boolean_t rebuild
= B_FALSE
;
285 const char *spare_name
, *type
;
287 if (nvlist_lookup_string(spares
[s
], ZPOOL_CONFIG_PATH
,
291 /* prefer sequential resilvering for distributed spares */
292 if ((nvlist_lookup_string(spares
[s
], ZPOOL_CONFIG_TYPE
,
293 &type
) == 0) && strcmp(type
, VDEV_TYPE_DRAID_SPARE
) == 0)
296 /* if set, add the "ashift" pool property to the spare nvlist */
297 if (source
!= ZPROP_SRC_DEFAULT
)
298 (void) nvlist_add_uint64(spares
[s
],
299 ZPOOL_CONFIG_ASHIFT
, ashift
);
301 (void) nvlist_add_nvlist_array(replacement
,
302 ZPOOL_CONFIG_CHILDREN
, (const nvlist_t
**)&spares
[s
], 1);
304 fmd_hdl_debug(hdl
, "zpool_vdev_replace '%s' with spare '%s'",
305 dev_name
, zfs_basename(spare_name
));
307 if (zpool_vdev_attach(zhp
, dev_name
, spare_name
,
308 replacement
, B_TRUE
, rebuild
) == 0) {
310 nvlist_free(replacement
);
316 nvlist_free(replacement
);
322 * Repair this vdev if we had diagnosed a 'fault.fs.zfs.device' and
323 * ASRU is now usable. ZFS has found the device to be present and
327 zfs_vdev_repair(fmd_hdl_t
*hdl
, nvlist_t
*nvl
)
329 zfs_retire_data_t
*zdp
= fmd_hdl_getspecific(hdl
);
330 zfs_retire_repaired_t
*zrp
;
331 uint64_t pool_guid
, vdev_guid
;
332 if (nvlist_lookup_uint64(nvl
, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID
,
333 &pool_guid
) != 0 || nvlist_lookup_uint64(nvl
,
334 FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID
, &vdev_guid
) != 0)
338 * Before checking the state of the ASRU, go through and see if we've
339 * already made an attempt to repair this ASRU. This list is cleared
340 * whenever we receive any kind of list event, and is designed to
341 * prevent us from generating a feedback loop when we attempt repairs
342 * against a faulted pool. The problem is that checking the unusable
343 * state of the ASRU can involve opening the pool, which can post
344 * statechange events but otherwise leave the pool in the faulted
345 * state. This list allows us to detect when a statechange event is
346 * due to our own request.
348 for (zrp
= zdp
->zrd_repaired
; zrp
!= NULL
; zrp
= zrp
->zrr_next
) {
349 if (zrp
->zrr_pool
== pool_guid
&&
350 zrp
->zrr_vdev
== vdev_guid
)
354 zrp
= fmd_hdl_alloc(hdl
, sizeof (zfs_retire_repaired_t
), FMD_SLEEP
);
355 zrp
->zrr_next
= zdp
->zrd_repaired
;
356 zrp
->zrr_pool
= pool_guid
;
357 zrp
->zrr_vdev
= vdev_guid
;
358 zdp
->zrd_repaired
= zrp
;
360 fmd_hdl_debug(hdl
, "marking repaired vdev %llu on pool %llu",
361 vdev_guid
, pool_guid
);
365 zfs_retire_recv(fmd_hdl_t
*hdl
, fmd_event_t
*ep
, nvlist_t
*nvl
,
369 uint64_t pool_guid
, vdev_guid
;
371 nvlist_t
*resource
, *fault
;
374 zfs_retire_data_t
*zdp
= fmd_hdl_getspecific(hdl
);
375 libzfs_handle_t
*zhdl
= zdp
->zrd_hdl
;
376 boolean_t fault_device
, degrade_device
;
378 boolean_t l2arc
= B_FALSE
;
379 boolean_t spare
= B_FALSE
;
381 nvlist_t
*vdev
= NULL
;
391 fmd_hdl_debug(hdl
, "zfs_retire_recv: '%s'", class);
393 (void) nvlist_lookup_uint64(nvl
, FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE
,
397 * If this is a resource notifying us of device removal then simply
398 * check for an available spare and continue unless the device is an
399 * l2arc vdev, in which case we just offline it.
401 if (strcmp(class, "resource.fs.zfs.removed") == 0 ||
402 (strcmp(class, "resource.fs.zfs.statechange") == 0 &&
403 (state
== VDEV_STATE_REMOVED
|| state
== VDEV_STATE_FAULTED
))) {
407 if (nvlist_lookup_string(nvl
, FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE
,
409 if (strcmp(devtype
, VDEV_TYPE_SPARE
) == 0)
411 else if (strcmp(devtype
, VDEV_TYPE_L2CACHE
) == 0)
415 if (nvlist_lookup_uint64(nvl
,
416 FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID
, &vdev_guid
) != 0)
419 if (vdev_guid
== 0) {
420 fmd_hdl_debug(hdl
, "Got a zero GUID");
425 int nspares
= find_and_remove_spares(zhdl
, vdev_guid
);
426 fmd_hdl_debug(hdl
, "%d spares removed", nspares
);
430 if (nvlist_lookup_uint64(nvl
, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID
,
434 if ((zhp
= find_by_guid(zhdl
, pool_guid
, vdev_guid
,
438 devname
= zpool_vdev_name(NULL
, zhp
, vdev
, B_FALSE
);
440 nvlist_lookup_uint64_array(vdev
, ZPOOL_CONFIG_VDEV_STATS
,
441 (uint64_t **)&vs
, &c
);
444 * If state removed is requested for an already removed vdev,
445 * it's a loopback event from spa_async_remove(). Just
448 if (vs
->vs_state
== VDEV_STATE_REMOVED
&&
449 state
== VDEV_STATE_REMOVED
)
452 /* Remove the vdev since device is unplugged */
453 int remove_status
= 0;
454 if (l2arc
|| (strcmp(class, "resource.fs.zfs.removed") == 0)) {
455 remove_status
= zpool_vdev_remove_wanted(zhp
, devname
);
456 fmd_hdl_debug(hdl
, "zpool_vdev_remove_wanted '%s'"
457 ", err:%d", devname
, libzfs_errno(zhdl
));
460 /* Replace the vdev with a spare if it's not an l2arc */
461 if (!l2arc
&& !remove_status
&&
462 (!fmd_prop_get_int32(hdl
, "spare_on_remove") ||
463 replace_with_spare(hdl
, zhp
, vdev
) == B_FALSE
)) {
464 /* Could not handle with spare */
465 fmd_hdl_debug(hdl
, "no spare for '%s'", devname
);
473 if (strcmp(class, FM_LIST_RESOLVED_CLASS
) == 0)
477 * Note: on Linux, statechange events are more than just
478 * healthy ones, so we need to confirm the actual state value.
480 if (strcmp(class, "resource.fs.zfs.statechange") == 0 &&
481 state
== VDEV_STATE_HEALTHY
) {
482 zfs_vdev_repair(hdl
, nvl
);
485 if (strcmp(class, "sysevent.fs.zfs.vdev_remove") == 0) {
486 zfs_vdev_repair(hdl
, nvl
);
490 zfs_retire_clear_data(hdl
, zdp
);
492 if (strcmp(class, FM_LIST_REPAIRED_CLASS
) == 0)
498 * We subscribe to zfs faults as well as all repair events.
500 if (nvlist_lookup_nvlist_array(nvl
, FM_SUSPECT_FAULT_LIST
,
501 &faults
, &nfaults
) != 0)
504 for (f
= 0; f
< nfaults
; f
++) {
507 fault_device
= B_FALSE
;
508 degrade_device
= B_FALSE
;
511 if (nvlist_lookup_boolean_value(fault
, FM_SUSPECT_RETIRE
,
512 &retire
) == 0 && retire
== 0)
516 * While we subscribe to fault.fs.zfs.*, we only take action
517 * for faults targeting a specific vdev (open failure or SERD
518 * failure). We also subscribe to fault.io.* events, so that
519 * faulty disks will be faulted in the ZFS configuration.
521 if (fmd_nvl_class_match(hdl
, fault
, "fault.fs.zfs.vdev.io")) {
522 fault_device
= B_TRUE
;
523 } else if (fmd_nvl_class_match(hdl
, fault
,
524 "fault.fs.zfs.vdev.checksum")) {
525 degrade_device
= B_TRUE
;
526 } else if (fmd_nvl_class_match(hdl
, fault
,
527 "fault.fs.zfs.device")) {
528 fault_device
= B_FALSE
;
529 } else if (fmd_nvl_class_match(hdl
, fault
, "fault.io.*")) {
531 fault_device
= B_TRUE
;
540 * This is a ZFS fault. Lookup the resource, and
541 * attempt to find the matching vdev.
543 if (nvlist_lookup_nvlist(fault
, FM_FAULT_RESOURCE
,
545 nvlist_lookup_string(resource
, FM_FMRI_SCHEME
,
549 if (strcmp(scheme
, FM_FMRI_SCHEME_ZFS
) != 0)
552 if (nvlist_lookup_uint64(resource
, FM_FMRI_ZFS_POOL
,
556 if (nvlist_lookup_uint64(resource
, FM_FMRI_ZFS_VDEV
,
564 if ((zhp
= find_by_guid(zhdl
, pool_guid
, vdev_guid
,
568 aux
= VDEV_AUX_ERR_EXCEEDED
;
571 if (vdev_guid
== 0) {
573 * For pool-level repair events, clear the entire pool.
575 fmd_hdl_debug(hdl
, "zpool_clear of pool '%s'",
576 zpool_get_name(zhp
));
577 (void) zpool_clear(zhp
, NULL
, NULL
);
583 * If this is a repair event, then mark the vdev as repaired and
588 fmd_hdl_debug(hdl
, "zpool_clear of pool '%s' vdev %llu",
589 zpool_get_name(zhp
), vdev_guid
);
590 (void) zpool_vdev_clear(zhp
, vdev_guid
);
596 * Actively fault the device if needed.
599 (void) zpool_vdev_fault(zhp
, vdev_guid
, aux
);
601 (void) zpool_vdev_degrade(zhp
, vdev_guid
, aux
);
603 if (fault_device
|| degrade_device
)
604 fmd_hdl_debug(hdl
, "zpool_vdev_%s: vdev %llu on '%s'",
605 fault_device
? "fault" : "degrade", vdev_guid
,
606 zpool_get_name(zhp
));
609 * Attempt to substitute a hot spare.
611 (void) replace_with_spare(hdl
, zhp
, vdev
);
616 if (strcmp(class, FM_LIST_REPAIRED_CLASS
) == 0 && repair_done
&&
617 nvlist_lookup_string(nvl
, FM_SUSPECT_UUID
, &uuid
) == 0)
618 fmd_case_uuresolved(hdl
, uuid
);
621 static const fmd_hdl_ops_t fmd_ops
= {
622 zfs_retire_recv
, /* fmdo_recv */
623 NULL
, /* fmdo_timeout */
624 NULL
, /* fmdo_close */
625 NULL
, /* fmdo_stats */
629 static const fmd_prop_t fmd_props
[] = {
630 { "spare_on_remove", FMD_TYPE_BOOL
, "true" },
634 static const fmd_hdl_info_t fmd_info
= {
635 "ZFS Retire Agent", "1.0", &fmd_ops
, fmd_props
639 _zfs_retire_init(fmd_hdl_t
*hdl
)
641 zfs_retire_data_t
*zdp
;
642 libzfs_handle_t
*zhdl
;
644 if ((zhdl
= libzfs_init()) == NULL
)
647 if (fmd_hdl_register(hdl
, FMD_API_VERSION
, &fmd_info
) != 0) {
652 zdp
= fmd_hdl_zalloc(hdl
, sizeof (zfs_retire_data_t
), FMD_SLEEP
);
655 fmd_hdl_setspecific(hdl
, zdp
);
659 _zfs_retire_fini(fmd_hdl_t
*hdl
)
661 zfs_retire_data_t
*zdp
= fmd_hdl_getspecific(hdl
);
664 zfs_retire_clear_data(hdl
, zdp
);
665 libzfs_fini(zdp
->zrd_hdl
);
666 fmd_hdl_free(hdl
, zdp
, sizeof (zfs_retire_data_t
));