/*
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 */

/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2016, Intel Corporation.
 * Copyright (c) 2023, Klara Inc.
 */

#include <stddef.h>
#include <string.h>
#include <libuutil.h>
#include <libzfs.h>
#include <sys/types.h>
#include <sys/time.h>
#include <sys/fs/zfs.h>
#include <sys/fm/protocol.h>
#include <sys/fm/fs/zfs.h>
#include <sys/zio.h>

#include "zfs_agents.h"
#include "fmd_api.h"
#include "fmd_serd.h"

/*
 * Default values for the serd engine when processing checksum or io errors. The
 * semantics are N <events> in T <seconds>.
 */
#define	DEFAULT_CHECKSUM_N	10	/* events */
#define	DEFAULT_CHECKSUM_T	600	/* seconds */
#define	DEFAULT_IO_N		10	/* events */
#define	DEFAULT_IO_T		600	/* seconds */
#define	DEFAULT_SLOW_IO_N	10	/* events */
#define	DEFAULT_SLOW_IO_T	30	/* seconds */

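/*
 * For example, with the defaults above the io engine fires only after
 * 10 io errors are recorded within a 600-second window, and the slow io
 * engine after 10 delay events within 30 seconds.
 */
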
#define	CASE_GC_TIMEOUT_SECS	43200	/* 12 hours */

/*
 * Our serd engines are named in the following format:
 *     'zfs_<pool_guid>_<vdev_guid>_{checksum,io,slow_io}'
 * This #define reserves enough space for two 64-bit hex values plus the
 * length of the longest string.
 */
#define	MAX_SERDLEN	(16 * 2 + sizeof ("zfs___checksum"))

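/*
 * Each 64-bit guid prints as at most 16 hex digits, and
 * sizeof ("zfs___checksum") already counts the "zfs" prefix, the three
 * underscores, the longest type suffix, and the trailing NUL.
 */
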
/*
 * On-disk case structure. This must maintain backwards compatibility with
 * previous versions of the DE. By default, any members appended to the end
 * will be filled with zeros if they don't exist in a previous version.
 */
typedef struct zfs_case_data {
	uint64_t	zc_version;
	uint64_t	zc_ena;
	uint64_t	zc_pool_guid;
	uint64_t	zc_vdev_guid;
	uint64_t	zc_parent_guid;
	int		zc_pool_state;
	char		zc_serd_checksum[MAX_SERDLEN];
	char		zc_serd_io[MAX_SERDLEN];
	char		zc_serd_slow_io[MAX_SERDLEN];
	int		zc_has_remove_timer;
} zfs_case_data_t;

typedef struct er_timeval {
	uint64_t	ertv_sec;
	uint64_t	ertv_nsec;
} er_timeval_t;

/*
 * In-core case structure.
 */
typedef struct zfs_case {
	boolean_t	zc_present;
	zfs_case_data_t	zc_data;
	fmd_case_t	*zc_case;
	uu_list_node_t	zc_node;
	id_t		zc_remove_timer;
	er_timeval_t	zc_when;
} zfs_case_t;

#define	CASE_DATA			"data"
#define	CASE_FRU			"fru"
#define	CASE_DATA_VERSION_INITIAL	1
#define	CASE_DATA_VERSION_SERD		2

typedef struct zfs_de_stats {
	fmd_stat_t	old_drops;
	fmd_stat_t	dev_drops;
	fmd_stat_t	vdev_drops;
	fmd_stat_t	import_drops;
	fmd_stat_t	resource_drops;
} zfs_de_stats_t;

zfs_de_stats_t zfs_stats = {
	{ "old_drops", FMD_TYPE_UINT64, "ereports dropped (from before load)" },
	{ "dev_drops", FMD_TYPE_UINT64, "ereports dropped (dev during open)" },
	{ "vdev_drops", FMD_TYPE_UINT64, "ereports dropped (weird vdev types)" },
	{ "import_drops", FMD_TYPE_UINT64, "ereports dropped (during import)" },
	{ "resource_drops", FMD_TYPE_UINT64, "resource related ereports" }
};

/* wait 15 seconds after a removal */
static hrtime_t zfs_remove_timeout = SEC2NSEC(15);

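/*
 * For example, a burst of io errors followed within this window by a
 * 'resource.fs.zfs.removed' event is attributed to the removal rather
 * than being diagnosed as a vdev fault.
 */
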
uu_list_pool_t *zfs_case_pool;
uu_list_t *zfs_cases;

#define	ZFS_MAKE_RSRC(type)	\
	FM_RSRC_CLASS "." ZFS_ERROR_CLASS "." type
#define	ZFS_MAKE_EREPORT(type)	\
	FM_EREPORT_CLASS "." ZFS_ERROR_CLASS "." type

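/*
 * For example, ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO) expands to the class
 * string "ereport.fs.zfs.io".
 */
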
static void zfs_purge_cases(fmd_hdl_t *hdl);

/*
 * Write out the persistent representation of an active case.
 */
static void
zfs_case_serialize(zfs_case_t *zcp)
{
	zcp->zc_data.zc_version = CASE_DATA_VERSION_SERD;
}

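/*
 * zfs_case_unserialize() below uses this version stamp to reject case
 * buffers that were written by a newer version of the DE.
 */
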
/*
 * Read back the persistent representation of an active case.
 */
static zfs_case_t *
zfs_case_unserialize(fmd_hdl_t *hdl, fmd_case_t *cp)
{
	zfs_case_t *zcp;

	zcp = fmd_hdl_zalloc(hdl, sizeof (zfs_case_t), FMD_SLEEP);
	zcp->zc_case = cp;

	fmd_buf_read(hdl, cp, CASE_DATA, &zcp->zc_data,
	    sizeof (zcp->zc_data));

	if (zcp->zc_data.zc_version > CASE_DATA_VERSION_SERD) {
		fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t));
		return (NULL);
	}

	/*
	 * fmd_buf_read() will have already zeroed out the remainder of the
	 * buffer, so we don't have to do anything special if the version
	 * doesn't include the SERD engine name.
	 */
	if (zcp->zc_data.zc_has_remove_timer)
		zcp->zc_remove_timer = fmd_timer_install(hdl, zcp,
		    NULL, zfs_remove_timeout);

	uu_list_node_init(zcp, &zcp->zc_node, zfs_case_pool);
	(void) uu_list_insert_before(zfs_cases, NULL, zcp);

	fmd_case_setspecific(hdl, cp, zcp);

	return (zcp);
}

/*
 * Return count of other unique SERD cases under same vdev parent
 */
static uint_t
zfs_other_serd_cases(fmd_hdl_t *hdl, const zfs_case_data_t *zfs_case)
{
	zfs_case_t *zcp;
	uint_t cases = 0;
	static hrtime_t next_check = 0;

	/*
	 * Note that plumbing in some external GC would require adding locking,
	 * since most of this module code is not thread safe and assumes there
	 * is only one thread running against the module. So we perform GC here
	 * inline periodically so that future delay induced faults will be
	 * possible once the issue causing multiple vdev delays is resolved.
	 */
	if (gethrestime_sec() > next_check) {
		/* Periodically purge old SERD entries and stale cases */
		fmd_serd_gc(hdl);
		zfs_purge_cases(hdl);
		next_check = gethrestime_sec() + CASE_GC_TIMEOUT_SECS;
	}

	for (zcp = uu_list_first(zfs_cases); zcp != NULL;
	    zcp = uu_list_next(zfs_cases, zcp)) {
		zfs_case_data_t *zcd = &zcp->zc_data;

		/*
		 * must be same pool and parent vdev but different leaf vdev
		 */
		if (zcd->zc_pool_guid != zfs_case->zc_pool_guid ||
		    zcd->zc_parent_guid != zfs_case->zc_parent_guid ||
		    zcd->zc_vdev_guid == zfs_case->zc_vdev_guid) {
			continue;
		}

		/*
		 * Check if there is another active serd case besides zfs_case.
		 *
		 * Only one serd engine will be assigned to the case.
		 */
		if (zcd->zc_serd_checksum[0] == zfs_case->zc_serd_checksum[0] &&
		    fmd_serd_active(hdl, zcd->zc_serd_checksum)) {
			cases++;
		}
		if (zcd->zc_serd_io[0] == zfs_case->zc_serd_io[0] &&
		    fmd_serd_active(hdl, zcd->zc_serd_io)) {
			cases++;
		}
		if (zcd->zc_serd_slow_io[0] == zfs_case->zc_serd_slow_io[0] &&
		    fmd_serd_active(hdl, zcd->zc_serd_slow_io)) {
			cases++;
		}
	}
	return (cases);
}

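/*
 * For example, if two other leaf vdevs under the same parent each have
 * an active io SERD engine, the count returned above is 2 and the
 * caller retires the case instead of faulting a single disk.
 */
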
/*
 * Iterate over any active cases. If any cases are associated with a pool or
 * vdev which is no longer present on the system, close the associated case.
 */
static void
zfs_mark_vdev(uint64_t pool_guid, nvlist_t *vd, er_timeval_t *loaded)
{
	uint64_t vdev_guid = 0;
	uint_t c, children;
	nvlist_t **child;
	zfs_case_t *zcp;

	(void) nvlist_lookup_uint64(vd, ZPOOL_CONFIG_GUID, &vdev_guid);

	/*
	 * Mark any cases associated with this (pool, vdev) pair.
	 */
	for (zcp = uu_list_first(zfs_cases); zcp != NULL;
	    zcp = uu_list_next(zfs_cases, zcp)) {
		if (zcp->zc_data.zc_pool_guid == pool_guid &&
		    zcp->zc_data.zc_vdev_guid == vdev_guid) {
			zcp->zc_present = B_TRUE;
			zcp->zc_when = *loaded;
		}
	}

	/*
	 * Iterate over all children.
	 */
	if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_CHILDREN, &child,
	    &children) == 0) {
		for (c = 0; c < children; c++)
			zfs_mark_vdev(pool_guid, child[c], loaded);
	}

	if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_L2CACHE, &child,
	    &children) == 0) {
		for (c = 0; c < children; c++)
			zfs_mark_vdev(pool_guid, child[c], loaded);
	}

	if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_SPARES, &child,
	    &children) == 0) {
		for (c = 0; c < children; c++)
			zfs_mark_vdev(pool_guid, child[c], loaded);
	}
}

static int
zfs_mark_pool(zpool_handle_t *zhp, void *unused)
{
	(void) unused;
	zfs_case_t *zcp;
	uint64_t pool_guid;
	uint64_t *tod;
	er_timeval_t loaded = { 0 };
	nvlist_t *config, *vd;
	uint_t nelem = 0;
	int ret;

	pool_guid = zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL);
	/*
	 * Mark any cases associated with just this pool.
	 */
	for (zcp = uu_list_first(zfs_cases); zcp != NULL;
	    zcp = uu_list_next(zfs_cases, zcp)) {
		if (zcp->zc_data.zc_pool_guid == pool_guid &&
		    zcp->zc_data.zc_vdev_guid == 0)
			zcp->zc_present = B_TRUE;
	}

	if ((config = zpool_get_config(zhp, NULL)) == NULL) {
		zpool_close(zhp);
		return (-1);
	}

	(void) nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_LOADED_TIME,
	    &tod, &nelem);
	if (nelem == 2) {
		loaded.ertv_sec = tod[0];
		loaded.ertv_nsec = tod[1];
		for (zcp = uu_list_first(zfs_cases); zcp != NULL;
		    zcp = uu_list_next(zfs_cases, zcp)) {
			if (zcp->zc_data.zc_pool_guid == pool_guid &&
			    zcp->zc_data.zc_vdev_guid == 0) {
				zcp->zc_when = loaded;
			}
		}
	}

	ret = nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &vd);
	if (ret) {
		zpool_close(zhp);
		return (-1);
	}

	zfs_mark_vdev(pool_guid, vd, &loaded);

	zpool_close(zhp);

	return (0);
}

struct load_time_arg {
	uint64_t	lt_guid;
	er_timeval_t	*lt_time;
	boolean_t	lt_found;
};

static int
zpool_find_load_time(zpool_handle_t *zhp, void *arg)
{
	struct load_time_arg *lta = arg;
	uint64_t pool_guid;
	uint64_t *tod;
	nvlist_t *config;
	uint_t nelem;

	if (lta->lt_found) {
		zpool_close(zhp);
		return (0);
	}

	pool_guid = zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL);
	if (pool_guid != lta->lt_guid) {
		zpool_close(zhp);
		return (0);
	}

	if ((config = zpool_get_config(zhp, NULL)) == NULL) {
		zpool_close(zhp);
		return (-1);
	}

	if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_LOADED_TIME,
	    &tod, &nelem) == 0 && nelem == 2) {
		lta->lt_found = B_TRUE;
		lta->lt_time->ertv_sec = tod[0];
		lta->lt_time->ertv_nsec = tod[1];
	}

	zpool_close(zhp);

	return (0);
}

static void
zfs_purge_cases(fmd_hdl_t *hdl)
{
	zfs_case_t *zcp;
	uu_list_walk_t *walk;
	libzfs_handle_t *zhdl = fmd_hdl_getspecific(hdl);

	/*
	 * There is no way to open a pool by GUID, or lookup a vdev by GUID. No
	 * matter what we do, we're going to have to stomach an O(vdevs * cases)
	 * algorithm. In reality, both quantities are likely so small that
	 * neither will matter. Given that iterating over pools is more
	 * expensive than iterating over the in-memory case list, we opt for a
	 * 'present' flag in each case that starts off cleared. We then iterate
	 * over all pools, marking those that are still present, and removing
	 * those that aren't found.
	 *
	 * Note that we could also construct an FMRI and rely on
	 * fmd_nvl_fmri_present(), but this would end up doing the same search.
	 */

	/*
	 * Mark the cases as not present.
	 */
	for (zcp = uu_list_first(zfs_cases); zcp != NULL;
	    zcp = uu_list_next(zfs_cases, zcp))
		zcp->zc_present = B_FALSE;

	/*
	 * Iterate over all pools and mark the pools and vdevs found. If this
	 * fails (most probably because we're out of memory), then don't close
	 * any of the cases and we cannot be sure they are accurate.
	 */
	if (zpool_iter(zhdl, zfs_mark_pool, NULL) != 0)
		return;

	/*
	 * Remove those cases which were not found.
	 */
	walk = uu_list_walk_start(zfs_cases, UU_WALK_ROBUST);
	while ((zcp = uu_list_walk_next(walk)) != NULL) {
		if (!zcp->zc_present)
			fmd_case_close(hdl, zcp->zc_case);
	}
	uu_list_walk_end(walk);
}

/*
 * Construct the name of a serd engine given the pool/vdev GUID and type (io,
 * checksum, or slow_io).
 */
static void
zfs_serd_name(char *buf, uint64_t pool_guid, uint64_t vdev_guid,
    const char *type)
{
	(void) snprintf(buf, MAX_SERDLEN, "zfs_%llx_%llx_%s",
	    (long long unsigned int)pool_guid,
	    (long long unsigned int)vdev_guid, type);
}

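/*
 * For example, pool guid 0xcafe and vdev guid 0xbeef with type "io"
 * produce the engine name "zfs_cafe_beef_io".
 */
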
static void
zfs_case_retire(fmd_hdl_t *hdl, zfs_case_t *zcp)
{
	fmd_hdl_debug(hdl, "retiring case");

	fmd_case_close(hdl, zcp->zc_case);
}

/*
 * Solve a given ZFS case. This first checks to make sure the diagnosis is
 * still valid, as well as cleaning up any pending timer associated with the
 * case.
 */
static void
zfs_case_solve(fmd_hdl_t *hdl, zfs_case_t *zcp, const char *faultname)
{
	nvlist_t *detector, *fault;
	nvlist_t *fru = NULL;
	fmd_hdl_debug(hdl, "solving fault '%s'", faultname);

	/*
	 * Construct the detector from the case data. The detector is in the
	 * ZFS scheme, and is either the pool or the vdev, depending on whether
	 * this is a vdev or pool fault.
	 */
	detector = fmd_nvl_alloc(hdl, FMD_SLEEP);

	(void) nvlist_add_uint8(detector, FM_VERSION, ZFS_SCHEME_VERSION0);
	(void) nvlist_add_string(detector, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS);
	(void) nvlist_add_uint64(detector, FM_FMRI_ZFS_POOL,
	    zcp->zc_data.zc_pool_guid);
	if (zcp->zc_data.zc_vdev_guid != 0) {
		(void) nvlist_add_uint64(detector, FM_FMRI_ZFS_VDEV,
		    zcp->zc_data.zc_vdev_guid);
	}

	fault = fmd_nvl_create_fault(hdl, faultname, 100, detector,
	    fru, detector);
	fmd_case_add_suspect(hdl, zcp->zc_case, fault);

	fmd_case_solve(hdl, zcp->zc_case);

	if (zcp->zc_data.zc_has_remove_timer) {
		fmd_timer_remove(hdl, zcp->zc_remove_timer);
		zcp->zc_data.zc_has_remove_timer = 0;
		zfs_case_serialize(zcp);
	}

	nvlist_free(detector);
}

static boolean_t
timeval_earlier(er_timeval_t *a, er_timeval_t *b)
{
	return (a->ertv_sec < b->ertv_sec ||
	    (a->ertv_sec == b->ertv_sec && a->ertv_nsec < b->ertv_nsec));
}

static void
zfs_ereport_when(fmd_hdl_t *hdl, nvlist_t *nvl, er_timeval_t *when)
{
	int64_t *tod;
	uint_t nelem;

	(void) hdl;

	if (nvlist_lookup_int64_array(nvl, FM_EREPORT_TIME, &tod,
	    &nelem) == 0 && nelem == 2) {
		when->ertv_sec = tod[0];
		when->ertv_nsec = tod[1];
	} else {
		when->ertv_sec = when->ertv_nsec = UINT64_MAX;
	}
}

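/*
 * Ereports that carry no usable timestamp are assigned UINT64_MAX above,
 * so timeval_earlier() never treats them as predating the pool load and
 * they are not dropped as stale.
 */
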
/*
 * Record the specified event in the SERD engine and return a
 * boolean value indicating whether or not the engine fired as
 * the result of inserting this event.
 *
 * When the pool has similar active cases on other vdevs, then
 * the fired state is disregarded and the case is retired.
 */
static int
zfs_fm_serd_record(fmd_hdl_t *hdl, const char *name, fmd_event_t *ep,
    zfs_case_t *zcp, const char *err_type)
{
	int fired = fmd_serd_record(hdl, name, ep);
	int peers = 0;

	if (fired && (peers = zfs_other_serd_cases(hdl, &zcp->zc_data)) > 0) {
		fmd_hdl_debug(hdl, "pool %llu is tracking %d other %s cases "
		    "-- skip faulting the vdev %llu",
		    (u_longlong_t)zcp->zc_data.zc_pool_guid,
		    peers, err_type,
		    (u_longlong_t)zcp->zc_data.zc_vdev_guid);
		zfs_case_retire(hdl, zcp);
		fired = 0;
	}

	return (fired);
}

/*
 * Main fmd entry point.
 */
static void
zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
{
	zfs_case_t *zcp, *dcp;
	int32_t pool_state;
	uint64_t ena, pool_guid, vdev_guid, parent_guid;
	uint64_t checksum_n, checksum_t;
	uint64_t io_n, io_t;
	er_timeval_t pool_load;
	er_timeval_t er_when;
	nvlist_t *detector;
	boolean_t pool_found = B_FALSE;
	boolean_t isresource;
	const char *type;

	/*
	 * We subscribe to notifications for vdev or pool removal. In these
	 * cases, there may be open cases that no longer apply; purge any such
	 * cases.
	 */
	if (fmd_nvl_class_match(hdl, nvl, "sysevent.fs.zfs.*")) {
		fmd_hdl_debug(hdl, "purging orphaned cases from %s",
		    strrchr(class, '.') + 1);
		zfs_purge_cases(hdl);
		zfs_stats.resource_drops.fmds_value.ui64++;
		return;
	}

	isresource = fmd_nvl_class_match(hdl, nvl, "resource.fs.zfs.*");

	if (isresource) {
		/*
		 * For resources, we don't have a normal payload.
		 */
		if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
		    &vdev_guid) != 0)
			pool_state = SPA_LOAD_OPEN;
		else
			pool_state = SPA_LOAD_NONE;
		detector = NULL;
	} else {
		(void) nvlist_lookup_nvlist(nvl,
		    FM_EREPORT_DETECTOR, &detector);
		(void) nvlist_lookup_int32(nvl,
		    FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, &pool_state);
	}

	/*
	 * We also ignore all ereports generated during an import of a pool,
	 * since the only possible fault (.pool) would result in import failure,
	 * and hence no persistent fault. Some day we may want to do something
	 * with these ereports, so we continue generating them internally.
	 */
	if (pool_state == SPA_LOAD_IMPORT) {
		zfs_stats.import_drops.fmds_value.ui64++;
		fmd_hdl_debug(hdl, "ignoring '%s' during import", class);
		return;
	}

	/*
	 * Device I/O errors are ignored during pool open.
	 */
	if (pool_state == SPA_LOAD_OPEN &&
	    (fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM)) ||
	    fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO)) ||
	    fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE)))) {
		fmd_hdl_debug(hdl, "ignoring '%s' during pool open", class);
		zfs_stats.dev_drops.fmds_value.ui64++;
		return;
	}

	/*
	 * We ignore ereports for anything except disks and files.
	 */
	if (nvlist_lookup_string(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
	    &type) == 0) {
		if (strcmp(type, VDEV_TYPE_DISK) != 0 &&
		    strcmp(type, VDEV_TYPE_FILE) != 0) {
			zfs_stats.vdev_drops.fmds_value.ui64++;
			return;
		}
	}

	/*
	 * Determine if this ereport corresponds to an open case.
	 * Each vdev or pool can have a single case.
	 */
	(void) nvlist_lookup_uint64(nvl,
	    FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, &pool_guid);
	if (nvlist_lookup_uint64(nvl,
	    FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0)
		vdev_guid = 0;
	if (nvlist_lookup_uint64(nvl,
	    FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID, &parent_guid) != 0)
		parent_guid = 0;
	if (nvlist_lookup_uint64(nvl, FM_EREPORT_ENA, &ena) != 0)
		ena = 0;

	zfs_ereport_when(hdl, nvl, &er_when);

	for (zcp = uu_list_first(zfs_cases); zcp != NULL;
	    zcp = uu_list_next(zfs_cases, zcp)) {
		if (zcp->zc_data.zc_pool_guid == pool_guid) {
			pool_found = B_TRUE;
			pool_load = zcp->zc_when;
		}
		if (zcp->zc_data.zc_vdev_guid == vdev_guid)
			break;
	}

	/*
	 * Avoid falsely accusing a pool of being faulty. Do so by
	 * not replaying ereports that were generated prior to the
	 * current import. If the failure that generated them was
	 * transient because the device was actually removed but we
	 * didn't receive the normal asynchronous notification, we
	 * don't want to mark it as faulted and potentially panic. If
	 * there is still a problem we'd expect not to be able to
	 * import the pool, or that new ereports will be generated
	 * once the pool is used.
	 */
	if (pool_found && timeval_earlier(&er_when, &pool_load)) {
		fmd_hdl_debug(hdl, "ignoring pool %llx, "
		    "ereport time %lld.%lld, pool load time = %lld.%lld",
		    pool_guid, er_when.ertv_sec, er_when.ertv_nsec,
		    pool_load.ertv_sec, pool_load.ertv_nsec);
		zfs_stats.old_drops.fmds_value.ui64++;
		return;
	}

	if (!pool_found) {
		/*
		 * Haven't yet seen this pool, but same situation
		 * may apply.
		 */
		libzfs_handle_t *zhdl = fmd_hdl_getspecific(hdl);
		struct load_time_arg la;

		la.lt_guid = pool_guid;
		la.lt_time = &pool_load;
		la.lt_found = B_FALSE;

		if (zhdl != NULL &&
		    zpool_iter(zhdl, zpool_find_load_time, &la) == 0 &&
		    la.lt_found == B_TRUE) {
			pool_found = B_TRUE;

			if (timeval_earlier(&er_when, &pool_load)) {
				fmd_hdl_debug(hdl, "ignoring pool %llx, "
				    "ereport time %lld.%lld, "
				    "pool load time = %lld.%lld",
				    pool_guid, er_when.ertv_sec,
				    er_when.ertv_nsec, pool_load.ertv_sec,
				    pool_load.ertv_nsec);
				zfs_stats.old_drops.fmds_value.ui64++;
				return;
			}
		}
	}

	if (zcp == NULL) {
		fmd_case_t *cs;
		zfs_case_data_t data = { 0 };

		/*
		 * If this is one of our 'fake' resource ereports, and there is
		 * no case open, simply discard it.
		 */
		if (isresource) {
			zfs_stats.resource_drops.fmds_value.ui64++;
			fmd_hdl_debug(hdl, "discarding '%s' for vdev %llu",
			    class, vdev_guid);
			return;
		}

		/*
		 * Skip tracking some ereports
		 */
		if (strcmp(class,
		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DATA)) == 0 ||
		    strcmp(class,
		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE)) == 0) {
			zfs_stats.resource_drops.fmds_value.ui64++;
			return;
		}

		cs = fmd_case_open(hdl, NULL);

		fmd_hdl_debug(hdl, "opening case for vdev %llu due to '%s'",
		    vdev_guid, class);

		/*
		 * Initialize the case buffer. To commonize code, we actually
		 * create the buffer with existing data, and then call
		 * zfs_case_unserialize() to instantiate the in-core structure.
		 */
		fmd_buf_create(hdl, cs, CASE_DATA, sizeof (zfs_case_data_t));

		data.zc_version = CASE_DATA_VERSION_SERD;
		data.zc_ena = ena;
		data.zc_pool_guid = pool_guid;
		data.zc_vdev_guid = vdev_guid;
		data.zc_parent_guid = parent_guid;
		data.zc_pool_state = (int)pool_state;

		fmd_buf_write(hdl, cs, CASE_DATA, &data, sizeof (data));

		zcp = zfs_case_unserialize(hdl, cs);
		if (pool_found)
			zcp->zc_when = pool_load;
	}

	if (isresource) {
		fmd_hdl_debug(hdl, "resource event '%s'", class);

		if (fmd_nvl_class_match(hdl, nvl,
		    ZFS_MAKE_RSRC(FM_RESOURCE_AUTOREPLACE))) {
			/*
			 * The 'resource.fs.zfs.autoreplace' event indicates
			 * that the pool was loaded with the 'autoreplace'
			 * property set. In this case, any pending device
			 * failures should be ignored, as the asynchronous
			 * autoreplace handling will take care of them.
			 */
			fmd_case_close(hdl, zcp->zc_case);
		} else if (fmd_nvl_class_match(hdl, nvl,
		    ZFS_MAKE_RSRC(FM_RESOURCE_REMOVED))) {
			/*
			 * The 'resource.fs.zfs.removed' event indicates that
			 * device removal was detected, and the device was
			 * closed asynchronously. If this is the case, we
			 * assume that any recent I/O errors were due to the
			 * device removal, not any fault of the device itself.
			 * We reset the SERD engines, and cancel any pending
			 * timers.
			 */
			if (zcp->zc_data.zc_has_remove_timer) {
				fmd_timer_remove(hdl, zcp->zc_remove_timer);
				zcp->zc_data.zc_has_remove_timer = 0;
				zfs_case_serialize(zcp);
			}
			if (zcp->zc_data.zc_serd_io[0] != '\0')
				fmd_serd_reset(hdl, zcp->zc_data.zc_serd_io);
			if (zcp->zc_data.zc_serd_checksum[0] != '\0')
				fmd_serd_reset(hdl,
				    zcp->zc_data.zc_serd_checksum);
			if (zcp->zc_data.zc_serd_slow_io[0] != '\0')
				fmd_serd_reset(hdl,
				    zcp->zc_data.zc_serd_slow_io);
		} else if (fmd_nvl_class_match(hdl, nvl,
		    ZFS_MAKE_RSRC(FM_RESOURCE_STATECHANGE))) {
			uint64_t state = 0;

			if (nvlist_lookup_uint64(nvl,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, &state) == 0 &&
			    state == VDEV_STATE_HEALTHY) {
				fmd_hdl_debug(hdl, "closing case after a "
				    "device statechange to healthy");
				fmd_case_close(hdl, zcp->zc_case);
			}
		}
		zfs_stats.resource_drops.fmds_value.ui64++;
		return;
	}

	/*
	 * Associate the ereport with this case.
	 */
	fmd_case_add_ereport(hdl, zcp->zc_case, ep);

	/*
	 * Don't do anything else if this case is already solved.
	 */
	if (fmd_case_solved(hdl, zcp->zc_case))
		return;

	if (vdev_guid)
		fmd_hdl_debug(hdl, "error event '%s', vdev %llu", class,
		    vdev_guid);
	else
		fmd_hdl_debug(hdl, "error event '%s'", class);

	/*
	 * Determine if we should solve the case and generate a fault. We solve
	 * a case if:
	 *
	 *	a. A pool failed to open (ereport.fs.zfs.pool)
	 *	b. A device failed to open (ereport.fs.zfs.vdev.*) while a pool
	 *	   was up and running.
	 *
	 * We may see a series of ereports associated with a pool open, all
	 * chained together by the same ENA. If the pool open succeeds, then
	 * we'll see no further ereports. To detect when a pool open has
	 * succeeded, we associate a timer with the event. When it expires, we
	 * close the case.
	 */
	if (fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_POOL))) {
		/*
		 * Pool level fault. Before solving the case, go through and
		 * close any open device cases that may be pending.
		 */
		for (dcp = uu_list_first(zfs_cases); dcp != NULL;
		    dcp = uu_list_next(zfs_cases, dcp)) {
			if (dcp->zc_data.zc_pool_guid ==
			    zcp->zc_data.zc_pool_guid &&
			    dcp->zc_data.zc_vdev_guid != 0)
				fmd_case_close(hdl, dcp->zc_case);
		}

		zfs_case_solve(hdl, zcp, "fault.fs.zfs.pool");
	} else if (fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_LOG_REPLAY))) {
		/*
		 * Pool level fault for reading the intent logs.
		 */
		zfs_case_solve(hdl, zcp, "fault.fs.zfs.log_replay");
	} else if (fmd_nvl_class_match(hdl, nvl, "ereport.fs.zfs.vdev.*")) {
		/*
		 * Device fault.
		 */
		zfs_case_solve(hdl, zcp, "fault.fs.zfs.device");
	} else if (fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO)) ||
	    fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM)) ||
	    fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO_FAILURE)) ||
	    fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY)) ||
	    fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) {
		const char *failmode = NULL;
		boolean_t checkremove = B_FALSE;
		uint32_t pri = 0;

		/*
		 * If this is a checksum or I/O error, then toss it into the
		 * appropriate SERD engine and check to see if it has fired.
		 * Ideally, we want to do something more sophisticated,
		 * (persistent errors for a single data block, etc). For now,
		 * a single SERD engine is sufficient.
		 */
		if (fmd_nvl_class_match(hdl, nvl,
		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO))) {
			if (zcp->zc_data.zc_serd_io[0] == '\0') {
				if (nvlist_lookup_uint64(nvl,
				    FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_N,
				    &io_n) != 0) {
					io_n = DEFAULT_IO_N;
				}
				if (nvlist_lookup_uint64(nvl,
				    FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_T,
				    &io_t) != 0) {
					io_t = DEFAULT_IO_T;
				}
				zfs_serd_name(zcp->zc_data.zc_serd_io,
				    pool_guid, vdev_guid, "io");
				fmd_serd_create(hdl, zcp->zc_data.zc_serd_io,
				    io_n, SEC2NSEC(io_t));
				zfs_case_serialize(zcp);
			}
			if (zfs_fm_serd_record(hdl, zcp->zc_data.zc_serd_io,
			    ep, zcp, "io error")) {
				checkremove = B_TRUE;
			}
		} else if (fmd_nvl_class_match(hdl, nvl,
		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY))) {
			uint64_t slow_io_n, slow_io_t;

			/*
			 * Create a slow io SERD engine when the VDEV has the
			 * 'vdev_slow_io_n' and 'vdev_slow_io_t' properties.
			 */
			if (zcp->zc_data.zc_serd_slow_io[0] == '\0' &&
			    nvlist_lookup_uint64(nvl,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_N,
			    &slow_io_n) == 0 &&
			    nvlist_lookup_uint64(nvl,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_T,
			    &slow_io_t) == 0) {
				zfs_serd_name(zcp->zc_data.zc_serd_slow_io,
				    pool_guid, vdev_guid, "slow_io");
				fmd_serd_create(hdl,
				    zcp->zc_data.zc_serd_slow_io,
				    slow_io_n,
				    SEC2NSEC(slow_io_t));
				zfs_case_serialize(zcp);
			}
			/* Pass event to SERD engine and see if this triggers */
			if (zcp->zc_data.zc_serd_slow_io[0] != '\0' &&
			    zfs_fm_serd_record(hdl,
			    zcp->zc_data.zc_serd_slow_io, ep, zcp, "slow io")) {
				zfs_case_solve(hdl, zcp,
				    "fault.fs.zfs.vdev.slow_io");
			}
		} else if (fmd_nvl_class_match(hdl, nvl,
		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM))) {
			uint64_t flags = 0;
			int32_t flags32 = 0;
			/*
			 * We ignore ereports for checksum errors generated by
			 * scrub/resilver I/O to avoid potentially further
			 * degrading the pool while it's being repaired.
			 *
			 * Note that FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS used to
			 * be int32. To allow newer zed to work on older
			 * kernels, if we don't find the flags, we look for
			 * the older ones too.
			 */
			if (((nvlist_lookup_uint32(nvl,
			    FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY, &pri) == 0) &&
			    (pri == ZIO_PRIORITY_SCRUB ||
			    pri == ZIO_PRIORITY_REBUILD)) ||
			    ((nvlist_lookup_uint64(nvl,
			    FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS, &flags) == 0) &&
			    (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) ||
			    ((nvlist_lookup_int32(nvl,
			    FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS, &flags32) == 0) &&
			    (flags32 & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)))) {
				fmd_hdl_debug(hdl, "ignoring '%s' for "
				    "scrub/resilver I/O", class);
				return;
			}

			if (zcp->zc_data.zc_serd_checksum[0] == '\0') {
				if (nvlist_lookup_uint64(nvl,
				    FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_N,
				    &checksum_n) != 0) {
					checksum_n = DEFAULT_CHECKSUM_N;
				}
				if (nvlist_lookup_uint64(nvl,
				    FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_T,
				    &checksum_t) != 0) {
					checksum_t = DEFAULT_CHECKSUM_T;
				}

				zfs_serd_name(zcp->zc_data.zc_serd_checksum,
				    pool_guid, vdev_guid, "checksum");
				fmd_serd_create(hdl,
				    zcp->zc_data.zc_serd_checksum,
				    checksum_n,
				    SEC2NSEC(checksum_t));
				zfs_case_serialize(zcp);
			}
			if (zfs_fm_serd_record(hdl,
			    zcp->zc_data.zc_serd_checksum, ep, zcp,
			    "checksum")) {
				zfs_case_solve(hdl, zcp,
				    "fault.fs.zfs.vdev.checksum");
			}
		} else if (fmd_nvl_class_match(hdl, nvl,
		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO_FAILURE)) &&
		    (nvlist_lookup_string(nvl,
		    FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE, &failmode) == 0) &&
		    failmode != NULL) {
			if (strncmp(failmode, FM_EREPORT_FAILMODE_CONTINUE,
			    strlen(FM_EREPORT_FAILMODE_CONTINUE)) == 0) {
				zfs_case_solve(hdl, zcp,
				    "fault.fs.zfs.io_failure_continue");
			} else if (strncmp(failmode, FM_EREPORT_FAILMODE_WAIT,
			    strlen(FM_EREPORT_FAILMODE_WAIT)) == 0) {
				zfs_case_solve(hdl, zcp,
				    "fault.fs.zfs.io_failure_wait");
			}
		} else if (fmd_nvl_class_match(hdl, nvl,
		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) {
#ifndef __linux__
			/* This causes an unexpected fault diagnosis on linux */
			checkremove = B_TRUE;
#endif
		}

		/*
		 * Because I/O errors may be due to device removal, we postpone
		 * any diagnosis until we're sure that we aren't about to
		 * receive a 'resource.fs.zfs.removed' event.
		 */
		if (checkremove) {
			if (zcp->zc_data.zc_has_remove_timer)
				fmd_timer_remove(hdl, zcp->zc_remove_timer);
			zcp->zc_remove_timer = fmd_timer_install(hdl, zcp, NULL,
			    zfs_remove_timeout);
			if (!zcp->zc_data.zc_has_remove_timer) {
				zcp->zc_data.zc_has_remove_timer = 1;
				zfs_case_serialize(zcp);
			}
		}
	}
}

/*
 * The timeout is fired when we diagnosed an I/O error, and it was not due to
 * device removal (which would cause the timeout to be cancelled).
 */
static void
zfs_fm_timeout(fmd_hdl_t *hdl, id_t id, void *data)
{
	zfs_case_t *zcp = data;

	if (id == zcp->zc_remove_timer)
		zfs_case_solve(hdl, zcp, "fault.fs.zfs.vdev.io");
}

/*
 * The specified case has been closed and any case-specific
 * data structures should be deallocated.
 */
static void
zfs_fm_close(fmd_hdl_t *hdl, fmd_case_t *cs)
{
	zfs_case_t *zcp = fmd_case_getspecific(hdl, cs);

	if (zcp->zc_data.zc_serd_checksum[0] != '\0')
		fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_checksum);
	if (zcp->zc_data.zc_serd_io[0] != '\0')
		fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_io);
	if (zcp->zc_data.zc_serd_slow_io[0] != '\0')
		fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_slow_io);
	if (zcp->zc_data.zc_has_remove_timer)
		fmd_timer_remove(hdl, zcp->zc_remove_timer);

	uu_list_remove(zfs_cases, zcp);
	uu_list_node_fini(zcp, &zcp->zc_node, zfs_case_pool);
	fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t));
}

static const fmd_hdl_ops_t fmd_ops = {
	zfs_fm_recv,	/* fmdo_recv */
	zfs_fm_timeout,	/* fmdo_timeout */
	zfs_fm_close,	/* fmdo_close */
	NULL,		/* fmdo_stats */
	NULL,		/* fmdo_gc */
};

static const fmd_prop_t fmd_props[] = {
	{ NULL, 0, NULL }
};

static const fmd_hdl_info_t fmd_info = {
	"ZFS Diagnosis Engine", "1.0", &fmd_ops, fmd_props
};

void
_zfs_diagnosis_init(fmd_hdl_t *hdl)
{
	libzfs_handle_t *zhdl;

	if ((zhdl = libzfs_init()) == NULL)
		return;

	if ((zfs_case_pool = uu_list_pool_create("zfs_case_pool",
	    sizeof (zfs_case_t), offsetof(zfs_case_t, zc_node),
	    NULL, UU_LIST_POOL_DEBUG)) == NULL) {
		libzfs_fini(zhdl);
		return;
	}

	if ((zfs_cases = uu_list_create(zfs_case_pool, NULL,
	    UU_LIST_DEBUG)) == NULL) {
		uu_list_pool_destroy(zfs_case_pool);
		libzfs_fini(zhdl);
		return;
	}

	if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) {
		uu_list_destroy(zfs_cases);
		uu_list_pool_destroy(zfs_case_pool);
		libzfs_fini(zhdl);
		return;
	}

	fmd_hdl_setspecific(hdl, zhdl);

	(void) fmd_stat_create(hdl, FMD_STAT_NOALLOC, sizeof (zfs_stats) /
	    sizeof (fmd_stat_t), (fmd_stat_t *)&zfs_stats);
}

void
_zfs_diagnosis_fini(fmd_hdl_t *hdl)
{
	zfs_case_t *zcp;
	uu_list_walk_t *walk;
	libzfs_handle_t *zhdl;

	/*
	 * Remove all active cases.
	 */
	walk = uu_list_walk_start(zfs_cases, UU_WALK_ROBUST);
	while ((zcp = uu_list_walk_next(walk)) != NULL) {
		fmd_hdl_debug(hdl, "removing case ena %llu",
		    (long long unsigned)zcp->zc_data.zc_ena);
		uu_list_remove(zfs_cases, zcp);
		uu_list_node_fini(zcp, &zcp->zc_node, zfs_case_pool);
		fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t));
	}
	uu_list_walk_end(walk);

	uu_list_destroy(zfs_cases);
	uu_list_pool_destroy(zfs_case_pool);

	zhdl = fmd_hdl_getspecific(hdl);
	libzfs_fini(zhdl);
}