4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
24 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
32 #include <fm/fmd_api.h>
33 #include <fm/libtopo.h>
34 #include <sys/types.h>
36 #include <sys/fs/zfs.h>
37 #include <sys/fm/protocol.h>
38 #include <sys/fm/fs/zfs.h>
41 * Our serd engines are named 'zfs_<pool_guid>_<vdev_guid>_{checksum,io}'. This
42 * #define reserves enough space for two 64-bit hex values plus the length of
43 * the fixed "zfs___checksum" text (the longest variant), including its NUL.
45 #define MAX_SERDLEN (16 * 2 + sizeof ("zfs___checksum"))
48 * On-disk case structure. This must maintain backwards compatibility with
49 * previous versions of the DE. By default, any members appended to the end
50 * will be filled with zeros if they don't exist in a previous version.
52 typedef struct zfs_case_data
{
55 uint64_t zc_pool_guid
;
56 uint64_t zc_vdev_guid
;
57 int zc_has_timer
; /* defunct */
59 char zc_serd_checksum
[MAX_SERDLEN
];
60 char zc_serd_io
[MAX_SERDLEN
];
61 int zc_has_remove_timer
;
67 typedef struct er_timeval
{
73 * In-core case structure.
75 typedef struct zfs_case
{
78 zfs_case_data_t zc_data
;
80 uu_list_node_t zc_node
;
86 #define CASE_DATA "data"
87 #define CASE_FRU "fru"
88 #define CASE_DATA_VERSION_INITIAL 1
89 #define CASE_DATA_VERSION_SERD 2
91 typedef struct zfs_de_stats
{
94 fmd_stat_t vdev_drops
;
95 fmd_stat_t import_drops
;
96 fmd_stat_t resource_drops
;
99 zfs_de_stats_t zfs_stats
= {
100 { "old_drops", FMD_TYPE_UINT64
, "ereports dropped (from before load)" },
101 { "dev_drops", FMD_TYPE_UINT64
, "ereports dropped (dev during open)"},
102 { "vdev_drops", FMD_TYPE_UINT64
, "ereports dropped (weird vdev types)"},
103 { "import_drops", FMD_TYPE_UINT64
, "ereports dropped (during import)" },
104 { "resource_drops", FMD_TYPE_UINT64
, "resource related ereports" }
107 static hrtime_t zfs_remove_timeout
;
109 uu_list_pool_t
*zfs_case_pool
;
110 uu_list_t
*zfs_cases
;
112 #define ZFS_MAKE_RSRC(type) \
113 FM_RSRC_CLASS "." ZFS_ERROR_CLASS "." type
114 #define ZFS_MAKE_EREPORT(type) \
115 FM_EREPORT_CLASS "." ZFS_ERROR_CLASS "." type
118 * Write out the persistent representation of an active case.
121 zfs_case_serialize(fmd_hdl_t
*hdl
, zfs_case_t
*zcp
)
124 * Always update cases to the latest version, even if they were the
125 * previous version when unserialized.
127 zcp
->zc_data
.zc_version
= CASE_DATA_VERSION_SERD
;
128 fmd_buf_write(hdl
, zcp
->zc_case
, CASE_DATA
, &zcp
->zc_data
,
129 sizeof (zcp
->zc_data
));
131 if (zcp
->zc_fru
!= NULL
)
132 fmd_buf_write(hdl
, zcp
->zc_case
, CASE_FRU
, zcp
->zc_fru
,
133 strlen(zcp
->zc_fru
));
137 * Read back the persistent representation of an active case.
140 zfs_case_unserialize(fmd_hdl_t
*hdl
, fmd_case_t
*cp
)
145 zcp
= fmd_hdl_zalloc(hdl
, sizeof (zfs_case_t
), FMD_SLEEP
);
148 fmd_buf_read(hdl
, cp
, CASE_DATA
, &zcp
->zc_data
,
149 sizeof (zcp
->zc_data
));
151 if (zcp
->zc_data
.zc_version
> CASE_DATA_VERSION_SERD
) {
152 fmd_hdl_free(hdl
, zcp
, sizeof (zfs_case_t
));
156 if ((frulen
= fmd_buf_size(hdl
, zcp
->zc_case
, CASE_FRU
)) > 0) {
157 zcp
->zc_fru
= fmd_hdl_alloc(hdl
, frulen
+ 1, FMD_SLEEP
);
158 fmd_buf_read(hdl
, zcp
->zc_case
, CASE_FRU
, zcp
->zc_fru
,
160 zcp
->zc_fru
[frulen
] = '\0';
164 * fmd_buf_read() will have already zeroed out the remainder of the
165 * buffer, so we don't have to do anything special if the version
166 * doesn't include the SERD engine name.
169 if (zcp
->zc_data
.zc_has_remove_timer
)
170 zcp
->zc_remove_timer
= fmd_timer_install(hdl
, zcp
,
171 NULL
, zfs_remove_timeout
);
173 (void) uu_list_insert_before(zfs_cases
, NULL
, zcp
);
175 fmd_case_setspecific(hdl
, cp
, zcp
);
181 * Iterate over any active cases. If any cases are associated with a pool or
182 * vdev which is no longer present on the system, close the associated case.
185 zfs_mark_vdev(uint64_t pool_guid
, nvlist_t
*vd
, er_timeval_t
*loaded
)
193 ret
= nvlist_lookup_uint64(vd
, ZPOOL_CONFIG_GUID
, &vdev_guid
);
197 * Mark any cases associated with this (pool, vdev) pair.
199 for (zcp
= uu_list_first(zfs_cases
); zcp
!= NULL
;
200 zcp
= uu_list_next(zfs_cases
, zcp
)) {
201 if (zcp
->zc_data
.zc_pool_guid
== pool_guid
&&
202 zcp
->zc_data
.zc_vdev_guid
== vdev_guid
) {
203 zcp
->zc_present
= B_TRUE
;
204 zcp
->zc_when
= *loaded
;
209 * Iterate over all children.
211 if (nvlist_lookup_nvlist_array(vd
, ZPOOL_CONFIG_CHILDREN
, &child
,
213 for (c
= 0; c
< children
; c
++)
214 zfs_mark_vdev(pool_guid
, child
[c
], loaded
);
217 if (nvlist_lookup_nvlist_array(vd
, ZPOOL_CONFIG_L2CACHE
, &child
,
219 for (c
= 0; c
< children
; c
++)
220 zfs_mark_vdev(pool_guid
, child
[c
], loaded
);
223 if (nvlist_lookup_nvlist_array(vd
, ZPOOL_CONFIG_SPARES
, &child
,
225 for (c
= 0; c
< children
; c
++)
226 zfs_mark_vdev(pool_guid
, child
[c
], loaded
);
232 zfs_mark_pool(zpool_handle_t
*zhp
, void *unused
)
237 er_timeval_t loaded
= { 0 };
238 nvlist_t
*config
, *vd
;
242 pool_guid
= zpool_get_prop_int(zhp
, ZPOOL_PROP_GUID
, NULL
);
244 * Mark any cases associated with just this pool.
246 for (zcp
= uu_list_first(zfs_cases
); zcp
!= NULL
;
247 zcp
= uu_list_next(zfs_cases
, zcp
)) {
248 if (zcp
->zc_data
.zc_pool_guid
== pool_guid
&&
249 zcp
->zc_data
.zc_vdev_guid
== 0)
250 zcp
->zc_present
= B_TRUE
;
253 if ((config
= zpool_get_config(zhp
, NULL
)) == NULL
) {
258 (void) nvlist_lookup_uint64_array(config
, ZPOOL_CONFIG_LOADED_TIME
,
261 loaded
.ertv_sec
= tod
[0];
262 loaded
.ertv_nsec
= tod
[1];
263 for (zcp
= uu_list_first(zfs_cases
); zcp
!= NULL
;
264 zcp
= uu_list_next(zfs_cases
, zcp
)) {
265 if (zcp
->zc_data
.zc_pool_guid
== pool_guid
&&
266 zcp
->zc_data
.zc_vdev_guid
== 0) {
267 zcp
->zc_when
= loaded
;
272 ret
= nvlist_lookup_nvlist(config
, ZPOOL_CONFIG_VDEV_TREE
, &vd
);
275 zfs_mark_vdev(pool_guid
, vd
, &loaded
);
282 struct load_time_arg
{
284 er_timeval_t
*lt_time
;
289 zpool_find_load_time(zpool_handle_t
*zhp
, void *arg
)
291 struct load_time_arg
*lta
= arg
;
302 pool_guid
= zpool_get_prop_int(zhp
, ZPOOL_PROP_GUID
, NULL
);
303 if (pool_guid
!= lta
->lt_guid
) {
308 if ((config
= zpool_get_config(zhp
, NULL
)) == NULL
) {
313 if (nvlist_lookup_uint64_array(config
, ZPOOL_CONFIG_LOADED_TIME
,
314 &tod
, &nelem
) == 0 && nelem
== 2) {
315 lta
->lt_found
= B_TRUE
;
316 lta
->lt_time
->ertv_sec
= tod
[0];
317 lta
->lt_time
->ertv_nsec
= tod
[1];
326 zfs_purge_cases(fmd_hdl_t
*hdl
)
329 uu_list_walk_t
*walk
;
330 libzfs_handle_t
*zhdl
= fmd_hdl_getspecific(hdl
);
333 * There is no way to open a pool by GUID, or lookup a vdev by GUID. No
334 * matter what we do, we're going to have to stomach a O(vdevs * cases)
335 * algorithm. In reality, both quantities are likely so small that
336 * neither will matter. Given that iterating over pools is more
337 * expensive than iterating over the in-memory case list, we opt for a
338 * 'present' flag in each case that starts off cleared. We then iterate
339 * over all pools, marking those that are still present, and removing
340 * those that aren't found.
342 * Note that we could also construct an FMRI and rely on
343 * fmd_nvl_fmri_present(), but this would end up doing the same search.
347 * Mark the cases an not present.
349 for (zcp
= uu_list_first(zfs_cases
); zcp
!= NULL
;
350 zcp
= uu_list_next(zfs_cases
, zcp
))
351 zcp
->zc_present
= B_FALSE
;
354 * Iterate over all pools and mark the pools and vdevs found. If this
355 * fails (most probably because we're out of memory), then don't close
356 * any of the cases and we cannot be sure they are accurate.
358 if (zpool_iter(zhdl
, zfs_mark_pool
, NULL
) != 0)
362 * Remove those cases which were not found.
364 walk
= uu_list_walk_start(zfs_cases
, UU_WALK_ROBUST
);
365 while ((zcp
= uu_list_walk_next(walk
)) != NULL
) {
366 if (!zcp
->zc_present
)
367 fmd_case_close(hdl
, zcp
->zc_case
);
369 uu_list_walk_end(walk
);
373 * Construct the name of a serd engine given the pool/vdev GUID and type (io or
377 zfs_serd_name(char *buf
, uint64_t pool_guid
, uint64_t vdev_guid
,
380 (void) snprintf(buf
, MAX_SERDLEN
, "zfs_%llx_%llx_%s", pool_guid
,
385 * Solve a given ZFS case. This first checks to make sure the diagnosis is
386 * still valid, as well as cleaning up any pending timer associated with the
390 zfs_case_solve(fmd_hdl_t
*hdl
, zfs_case_t
*zcp
, const char *faultname
,
391 boolean_t checkunusable
)
393 libzfs_handle_t
*zhdl
= fmd_hdl_getspecific(hdl
);
394 nvlist_t
*detector
, *fault
;
396 nvlist_t
*fmri
, *fru
;
401 * Construct the detector from the case data. The detector is in the
402 * ZFS scheme, and is either the pool or the vdev, depending on whether
403 * this is a vdev or pool fault.
405 detector
= fmd_nvl_alloc(hdl
, FMD_SLEEP
);
407 (void) nvlist_add_uint8(detector
, FM_VERSION
, ZFS_SCHEME_VERSION0
);
408 (void) nvlist_add_string(detector
, FM_FMRI_SCHEME
, FM_FMRI_SCHEME_ZFS
);
409 (void) nvlist_add_uint64(detector
, FM_FMRI_ZFS_POOL
,
410 zcp
->zc_data
.zc_pool_guid
);
411 if (zcp
->zc_data
.zc_vdev_guid
!= 0) {
412 (void) nvlist_add_uint64(detector
, FM_FMRI_ZFS_VDEV
,
413 zcp
->zc_data
.zc_vdev_guid
);
417 * We also want to make sure that the detector (pool or vdev) properly
418 * reflects the diagnosed state, when the fault corresponds to internal
419 * ZFS state (i.e. not checksum or I/O error-induced). Otherwise, a
420 * device which was unavailable early in boot (because the driver/file
421 * wasn't available) and is now healthy will be mis-diagnosed.
423 if (!fmd_nvl_fmri_present(hdl
, detector
) ||
424 (checkunusable
&& !fmd_nvl_fmri_unusable(hdl
, detector
))) {
425 fmd_case_close(hdl
, zcp
->zc_case
);
426 nvlist_free(detector
);
432 if (zcp
->zc_fru
!= NULL
&&
433 (thp
= fmd_hdl_topo_hold(hdl
, TOPO_VERSION
)) != NULL
) {
435 * If the vdev had an associated FRU, then get the FRU nvlist
436 * from the topo handle and use that in the suspect list. We
437 * explicitly lookup the FRU because the fmri reported from the
438 * kernel may not have up to date details about the disk itself
439 * (serial, part, etc).
441 if (topo_fmri_str2nvl(thp
, zcp
->zc_fru
, &fmri
, &err
) == 0) {
443 * If the disk is part of the system chassis, but the
444 * FRU indicates a different chassis ID than our
445 * current system, then ignore the error. This
446 * indicates that the device was part of another
447 * cluster head, and for obvious reasons cannot be
448 * imported on this system.
450 if (libzfs_fru_notself(zhdl
, zcp
->zc_fru
)) {
451 fmd_case_close(hdl
, zcp
->zc_case
);
453 fmd_hdl_topo_rele(hdl
, thp
);
454 nvlist_free(detector
);
459 * If the device is no longer present on the system, or
460 * topo_fmri_fru() fails for other reasons, then fall
461 * back to the fmri specified in the vdev.
463 if (topo_fmri_fru(thp
, fmri
, &fru
, &err
) != 0)
464 fru
= fmd_nvl_dup(hdl
, fmri
, FMD_SLEEP
);
468 fmd_hdl_topo_rele(hdl
, thp
);
471 fault
= fmd_nvl_create_fault(hdl
, faultname
, 100, detector
,
473 fmd_case_add_suspect(hdl
, zcp
->zc_case
, fault
);
477 fmd_case_solve(hdl
, zcp
->zc_case
);
480 if (zcp
->zc_data
.zc_has_remove_timer
) {
481 fmd_timer_remove(hdl
, zcp
->zc_remove_timer
);
482 zcp
->zc_data
.zc_has_remove_timer
= 0;
486 zfs_case_serialize(hdl
, zcp
);
488 nvlist_free(detector
);
492 * This #define and function access a private interface of the FMA
493 * framework. Ereports include a time-of-day upper bound.
494 * We want to look at that so we can compare it to when pools get
495 * loaded.
497 #define FMD_EVN_TOD "__tod"
500 timeval_earlier(er_timeval_t
*a
, er_timeval_t
*b
)
502 return (a
->ertv_sec
< b
->ertv_sec
||
503 (a
->ertv_sec
== b
->ertv_sec
&& a
->ertv_nsec
< b
->ertv_nsec
));
508 zfs_ereport_when(fmd_hdl_t
*hdl
, nvlist_t
*nvl
, er_timeval_t
*when
)
513 if (nvlist_lookup_uint64_array(nvl
, FMD_EVN_TOD
, &tod
, &nelem
) == 0 &&
515 when
->ertv_sec
= tod
[0];
516 when
->ertv_nsec
= tod
[1];
518 when
->ertv_sec
= when
->ertv_nsec
= UINT64_MAX
;
523 * Main fmd entry point.
527 zfs_fm_recv(fmd_hdl_t
*hdl
, fmd_event_t
*ep
, nvlist_t
*nvl
, const char *class)
529 zfs_case_t
*zcp
, *dcp
;
531 uint64_t ena
, pool_guid
, vdev_guid
;
532 er_timeval_t pool_load
;
533 er_timeval_t er_when
;
535 boolean_t pool_found
= B_FALSE
;
536 boolean_t isresource
;
540 * We subscribe to notifications for vdev or pool removal. In these
541 * cases, there may be cases that no longer apply. Purge any cases
542 * that no longer apply.
544 if (fmd_nvl_class_match(hdl
, nvl
, "resource.sysevent.EC_zfs.*")) {
545 zfs_purge_cases(hdl
);
546 zfs_stats
.resource_drops
.fmds_value
.ui64
++;
550 isresource
= fmd_nvl_class_match(hdl
, nvl
, "resource.fs.zfs.*");
554 * For resources, we don't have a normal payload.
556 if (nvlist_lookup_uint64(nvl
, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID
,
558 pool_state
= SPA_LOAD_OPEN
;
560 pool_state
= SPA_LOAD_NONE
;
563 (void) nvlist_lookup_nvlist(nvl
,
564 FM_EREPORT_DETECTOR
, &detector
);
565 (void) nvlist_lookup_int32(nvl
,
566 FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT
, &pool_state
);
570 * We also ignore all ereports generated during an import of a pool,
571 * since the only possible fault (.pool) would result in import failure,
572 * and hence no persistent fault. Some day we may want to do something
573 * with these ereports, so we continue generating them internally.
575 if (pool_state
== SPA_LOAD_IMPORT
) {
576 zfs_stats
.import_drops
.fmds_value
.ui64
++;
581 * Device I/O errors are ignored during pool open.
583 if (pool_state
== SPA_LOAD_OPEN
&&
584 (fmd_nvl_class_match(hdl
, nvl
,
585 ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM
)) ||
586 fmd_nvl_class_match(hdl
, nvl
,
587 ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO
)) ||
588 fmd_nvl_class_match(hdl
, nvl
,
589 ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE
)))) {
590 zfs_stats
.dev_drops
.fmds_value
.ui64
++;
595 * We ignore ereports for anything except disks and files.
597 if (nvlist_lookup_string(nvl
, FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE
,
599 if (strcmp(type
, VDEV_TYPE_DISK
) != 0 &&
600 strcmp(type
, VDEV_TYPE_FILE
) != 0) {
601 zfs_stats
.vdev_drops
.fmds_value
.ui64
++;
607 * Determine if this ereport corresponds to an open case. Previous
608 * incarnations of this DE used the ENA to chain events together as
609 * part of the same case. The problem with this is that we rely on
610 * global uniqueness of cases based on (pool_guid, vdev_guid) pair when
611 * generating SERD engines. Instead, we have a case for each vdev or
612 * pool, regardless of the ENA.
614 (void) nvlist_lookup_uint64(nvl
,
615 FM_EREPORT_PAYLOAD_ZFS_POOL_GUID
, &pool_guid
);
616 if (nvlist_lookup_uint64(nvl
,
617 FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID
, &vdev_guid
) != 0)
619 if (nvlist_lookup_uint64(nvl
, FM_EREPORT_ENA
, &ena
) != 0)
622 zfs_ereport_when(hdl
, nvl
, &er_when
);
624 for (zcp
= uu_list_first(zfs_cases
); zcp
!= NULL
;
625 zcp
= uu_list_next(zfs_cases
, zcp
)) {
626 if (zcp
->zc_data
.zc_pool_guid
== pool_guid
) {
628 pool_load
= zcp
->zc_when
;
630 if (zcp
->zc_data
.zc_vdev_guid
== vdev_guid
)
635 fmd_hdl_debug(hdl
, "pool %llx, "
636 "ereport time %lld.%lld, pool load time = %lld.%lld\n",
637 pool_guid
, er_when
.ertv_sec
, er_when
.ertv_nsec
,
638 pool_load
.ertv_sec
, pool_load
.ertv_nsec
);
642 * Avoid falsely accusing a pool of being faulty. Do so by
643 * not replaying ereports that were generated prior to the
644 * current import. If the failure that generated them was
645 * transient because the device was actually removed but we
646 * didn't receive the normal asynchronous notification, we
647 * don't want to mark it as faulted and potentially panic. If
648 * there is still a problem we'd expect not to be able to
649 * import the pool, or that new ereports will be generated
650 * once the pool is used.
652 if (pool_found
&& timeval_earlier(&er_when
, &pool_load
)) {
653 zfs_stats
.old_drops
.fmds_value
.ui64
++;
659 * Haven't yet seen this pool, but same situation
662 libzfs_handle_t
*zhdl
= fmd_hdl_getspecific(hdl
);
663 struct load_time_arg la
;
665 la
.lt_guid
= pool_guid
;
666 la
.lt_time
= &pool_load
;
667 la
.lt_found
= B_FALSE
;
670 zpool_iter(zhdl
, zpool_find_load_time
, &la
) == 0 &&
671 la
.lt_found
== B_TRUE
) {
673 fmd_hdl_debug(hdl
, "pool %llx, "
674 "ereport time %lld.%lld, "
675 "pool load time = %lld.%lld\n",
676 pool_guid
, er_when
.ertv_sec
, er_when
.ertv_nsec
,
677 pool_load
.ertv_sec
, pool_load
.ertv_nsec
);
678 if (timeval_earlier(&er_when
, &pool_load
)) {
679 zfs_stats
.old_drops
.fmds_value
.ui64
++;
687 zfs_case_data_t data
= { 0 };
690 * If this is one of our 'fake' resource ereports, and there is
691 * no case open, simply discard it.
694 zfs_stats
.resource_drops
.fmds_value
.ui64
++;
701 cs
= fmd_case_open(hdl
, NULL
);
704 * Initialize the case buffer. To commonize code, we actually
705 * create the buffer with existing data, and then call
706 * zfs_case_unserialize() to instantiate the in-core structure.
708 fmd_buf_create(hdl
, cs
, CASE_DATA
,
709 sizeof (zfs_case_data_t
));
711 data
.zc_version
= CASE_DATA_VERSION_SERD
;
713 data
.zc_pool_guid
= pool_guid
;
714 data
.zc_vdev_guid
= vdev_guid
;
715 data
.zc_pool_state
= (int)pool_state
;
717 fmd_buf_write(hdl
, cs
, CASE_DATA
, &data
, sizeof (data
));
719 zcp
= zfs_case_unserialize(hdl
, cs
);
722 zcp
->zc_when
= pool_load
;
727 * If this is an ereport for a case with an associated vdev FRU, make
728 * sure it is accurate and up to date.
730 if (nvlist_lookup_string(nvl
, FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU
,
732 topo_hdl_t
*thp
= fmd_hdl_topo_hold(hdl
, TOPO_VERSION
);
733 if (zcp
->zc_fru
== NULL
||
734 !topo_fmri_strcmp(thp
, zcp
->zc_fru
, fru
)) {
735 if (zcp
->zc_fru
!= NULL
) {
736 fmd_hdl_strfree(hdl
, zcp
->zc_fru
);
737 fmd_buf_destroy(hdl
, zcp
->zc_case
, CASE_FRU
);
739 zcp
->zc_fru
= fmd_hdl_strdup(hdl
, fru
, FMD_SLEEP
);
740 zfs_case_serialize(hdl
, zcp
);
742 fmd_hdl_topo_rele(hdl
, thp
);
746 if (fmd_nvl_class_match(hdl
, nvl
,
747 ZFS_MAKE_RSRC(FM_RESOURCE_AUTOREPLACE
))) {
749 * The 'resource.fs.zfs.autoreplace' event indicates
750 * that the pool was loaded with the 'autoreplace'
751 * property set. In this case, any pending device
752 * failures should be ignored, as the asynchronous
753 * autoreplace handling will take care of them.
755 fmd_case_close(hdl
, zcp
->zc_case
);
756 } else if (fmd_nvl_class_match(hdl
, nvl
,
757 ZFS_MAKE_RSRC(FM_RESOURCE_REMOVED
))) {
759 * The 'resource.fs.zfs.removed' event indicates that
760 * device removal was detected, and the device was
761 * closed asynchronously. If this is the case, we
762 * assume that any recent I/O errors were due to the
763 * device removal, not any fault of the device itself.
764 * We reset the SERD engine, and cancel any pending
767 if (zcp
->zc_data
.zc_has_remove_timer
) {
768 fmd_timer_remove(hdl
, zcp
->zc_remove_timer
);
769 zcp
->zc_data
.zc_has_remove_timer
= 0;
770 zfs_case_serialize(hdl
, zcp
);
772 if (zcp
->zc_data
.zc_serd_io
[0] != '\0')
774 zcp
->zc_data
.zc_serd_io
);
775 if (zcp
->zc_data
.zc_serd_checksum
[0] != '\0')
777 zcp
->zc_data
.zc_serd_checksum
);
779 zfs_stats
.resource_drops
.fmds_value
.ui64
++;
784 * Associate the ereport with this case.
786 fmd_case_add_ereport(hdl
, zcp
->zc_case
, ep
);
789 * Don't do anything else if this case is already solved.
791 if (fmd_case_solved(hdl
, zcp
->zc_case
))
795 * Determine if we should solve the case and generate a fault. We solve
798 * a. A pool failed to open (ereport.fs.zfs.pool)
799 * b. A device failed to open (ereport.fs.zfs.pool) while a pool
800 * was up and running.
802 * We may see a series of ereports associated with a pool open, all
803 * chained together by the same ENA. If the pool open succeeds, then
804 * we'll see no further ereports. To detect when a pool open has
805 * succeeded, we associate a timer with the event. When it expires, we
808 if (fmd_nvl_class_match(hdl
, nvl
,
809 ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_POOL
))) {
811 * Pool level fault. Before solving the case, go through and
812 * close any open device cases that may be pending.
814 for (dcp
= uu_list_first(zfs_cases
); dcp
!= NULL
;
815 dcp
= uu_list_next(zfs_cases
, dcp
)) {
816 if (dcp
->zc_data
.zc_pool_guid
==
817 zcp
->zc_data
.zc_pool_guid
&&
818 dcp
->zc_data
.zc_vdev_guid
!= 0)
819 fmd_case_close(hdl
, dcp
->zc_case
);
822 zfs_case_solve(hdl
, zcp
, "fault.fs.zfs.pool", B_TRUE
);
823 } else if (fmd_nvl_class_match(hdl
, nvl
,
824 ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_LOG_REPLAY
))) {
826 * Pool level fault for reading the intent logs.
828 zfs_case_solve(hdl
, zcp
, "fault.fs.zfs.log_replay", B_TRUE
);
829 } else if (fmd_nvl_class_match(hdl
, nvl
, "ereport.fs.zfs.vdev.*")) {
833 zfs_case_solve(hdl
, zcp
, "fault.fs.zfs.device", B_TRUE
);
834 } else if (fmd_nvl_class_match(hdl
, nvl
,
835 ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO
)) ||
836 fmd_nvl_class_match(hdl
, nvl
,
837 ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM
)) ||
838 fmd_nvl_class_match(hdl
, nvl
,
839 ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO_FAILURE
)) ||
840 fmd_nvl_class_match(hdl
, nvl
,
841 ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE
))) {
842 char *failmode
= NULL
;
843 boolean_t checkremove
= B_FALSE
;
846 * If this is a checksum or I/O error, then toss it into the
847 * appropriate SERD engine and check to see if it has fired.
848 * Ideally, we want to do something more sophisticated,
849 * (persistent errors for a single data block, etc). For now,
850 * a single SERD engine is sufficient.
852 if (fmd_nvl_class_match(hdl
, nvl
,
853 ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO
))) {
854 if (zcp
->zc_data
.zc_serd_io
[0] == '\0') {
855 zfs_serd_name(zcp
->zc_data
.zc_serd_io
,
856 pool_guid
, vdev_guid
, "io");
857 fmd_serd_create(hdl
, zcp
->zc_data
.zc_serd_io
,
858 fmd_prop_get_int32(hdl
, "io_N"),
859 fmd_prop_get_int64(hdl
, "io_T"));
860 zfs_case_serialize(hdl
, zcp
);
862 if (fmd_serd_record(hdl
, zcp
->zc_data
.zc_serd_io
, ep
))
863 checkremove
= B_TRUE
;
864 } else if (fmd_nvl_class_match(hdl
, nvl
,
865 ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM
))) {
866 if (zcp
->zc_data
.zc_serd_checksum
[0] == '\0') {
867 zfs_serd_name(zcp
->zc_data
.zc_serd_checksum
,
868 pool_guid
, vdev_guid
, "checksum");
870 zcp
->zc_data
.zc_serd_checksum
,
871 fmd_prop_get_int32(hdl
, "checksum_N"),
872 fmd_prop_get_int64(hdl
, "checksum_T"));
873 zfs_case_serialize(hdl
, zcp
);
875 if (fmd_serd_record(hdl
,
876 zcp
->zc_data
.zc_serd_checksum
, ep
)) {
877 zfs_case_solve(hdl
, zcp
,
878 "fault.fs.zfs.vdev.checksum", B_FALSE
);
880 } else if (fmd_nvl_class_match(hdl
, nvl
,
881 ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO_FAILURE
)) &&
882 (nvlist_lookup_string(nvl
,
883 FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE
, &failmode
) == 0) &&
885 if (strncmp(failmode
, FM_EREPORT_FAILMODE_CONTINUE
,
886 strlen(FM_EREPORT_FAILMODE_CONTINUE
)) == 0) {
887 zfs_case_solve(hdl
, zcp
,
888 "fault.fs.zfs.io_failure_continue",
890 } else if (strncmp(failmode
, FM_EREPORT_FAILMODE_WAIT
,
891 strlen(FM_EREPORT_FAILMODE_WAIT
)) == 0) {
892 zfs_case_solve(hdl
, zcp
,
893 "fault.fs.zfs.io_failure_wait", B_FALSE
);
895 } else if (fmd_nvl_class_match(hdl
, nvl
,
896 ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE
))) {
897 checkremove
= B_TRUE
;
901 * Because I/O errors may be due to device removal, we postpone
902 * any diagnosis until we're sure that we aren't about to
903 * receive a 'resource.fs.zfs.removed' event.
906 if (zcp
->zc_data
.zc_has_remove_timer
)
907 fmd_timer_remove(hdl
, zcp
->zc_remove_timer
);
908 zcp
->zc_remove_timer
= fmd_timer_install(hdl
, zcp
, NULL
,
910 if (!zcp
->zc_data
.zc_has_remove_timer
) {
911 zcp
->zc_data
.zc_has_remove_timer
= 1;
912 zfs_case_serialize(hdl
, zcp
);
919 * The timeout is fired when we diagnosed an I/O error, and it was not due to
920 * device removal (which would cause the timeout to be cancelled).
924 zfs_fm_timeout(fmd_hdl_t
*hdl
, id_t id
, void *data
)
926 zfs_case_t
*zcp
= data
;
928 if (id
== zcp
->zc_remove_timer
)
929 zfs_case_solve(hdl
, zcp
, "fault.fs.zfs.vdev.io", B_FALSE
);
933 zfs_fm_close(fmd_hdl_t
*hdl
, fmd_case_t
*cs
)
935 zfs_case_t
*zcp
= fmd_case_getspecific(hdl
, cs
);
937 if (zcp
->zc_data
.zc_serd_checksum
[0] != '\0')
938 fmd_serd_destroy(hdl
, zcp
->zc_data
.zc_serd_checksum
);
939 if (zcp
->zc_data
.zc_serd_io
[0] != '\0')
940 fmd_serd_destroy(hdl
, zcp
->zc_data
.zc_serd_io
);
941 if (zcp
->zc_data
.zc_has_remove_timer
)
942 fmd_timer_remove(hdl
, zcp
->zc_remove_timer
);
943 uu_list_remove(zfs_cases
, zcp
);
944 fmd_hdl_free(hdl
, zcp
, sizeof (zfs_case_t
));
948 * We use the fmd gc entry point to look for old cases that no longer apply.
949 * This allows us to keep our set of case data small in a long running system.
952 zfs_fm_gc(fmd_hdl_t
*hdl
)
954 zfs_purge_cases(hdl
);
957 static const fmd_hdl_ops_t fmd_ops
= {
958 zfs_fm_recv
, /* fmdo_recv */
959 zfs_fm_timeout
, /* fmdo_timeout */
960 zfs_fm_close
, /* fmdo_close */
961 NULL
, /* fmdo_stats */
962 zfs_fm_gc
, /* fmdo_gc */
965 static const fmd_prop_t fmd_props
[] = {
966 { "checksum_N", FMD_TYPE_UINT32
, "10" },
967 { "checksum_T", FMD_TYPE_TIME
, "10min" },
968 { "io_N", FMD_TYPE_UINT32
, "10" },
969 { "io_T", FMD_TYPE_TIME
, "10min" },
970 { "remove_timeout", FMD_TYPE_TIME
, "15sec" },
974 static const fmd_hdl_info_t fmd_info
= {
975 "ZFS Diagnosis Engine", "1.0", &fmd_ops
, fmd_props
979 _fmd_init(fmd_hdl_t
*hdl
)
982 libzfs_handle_t
*zhdl
;
984 if ((zhdl
= libzfs_init()) == NULL
)
987 if ((zfs_case_pool
= uu_list_pool_create("zfs_case_pool",
988 sizeof (zfs_case_t
), offsetof(zfs_case_t
, zc_node
),
994 if ((zfs_cases
= uu_list_create(zfs_case_pool
, NULL
, 0)) == NULL
) {
995 uu_list_pool_destroy(zfs_case_pool
);
1000 if (fmd_hdl_register(hdl
, FMD_API_VERSION
, &fmd_info
) != 0) {
1001 uu_list_destroy(zfs_cases
);
1002 uu_list_pool_destroy(zfs_case_pool
);
1007 fmd_hdl_setspecific(hdl
, zhdl
);
1009 (void) fmd_stat_create(hdl
, FMD_STAT_NOALLOC
, sizeof (zfs_stats
) /
1010 sizeof (fmd_stat_t
), (fmd_stat_t
*)&zfs_stats
);
1013 * Iterate over all active cases and unserialize the associated buffers,
1014 * adding them to our list of open cases.
1016 for (cp
= fmd_case_next(hdl
, NULL
);
1017 cp
!= NULL
; cp
= fmd_case_next(hdl
, cp
))
1018 (void) zfs_case_unserialize(hdl
, cp
);
1021 * Clear out any old cases that are no longer valid.
1023 zfs_purge_cases(hdl
);
1025 zfs_remove_timeout
= fmd_prop_get_int64(hdl
, "remove_timeout");
1029 _fmd_fini(fmd_hdl_t
*hdl
)
1032 uu_list_walk_t
*walk
;
1033 libzfs_handle_t
*zhdl
;
1036 * Remove all active cases.
1038 walk
= uu_list_walk_start(zfs_cases
, UU_WALK_ROBUST
);
1039 while ((zcp
= uu_list_walk_next(walk
)) != NULL
) {
1040 uu_list_remove(zfs_cases
, zcp
);
1041 fmd_hdl_free(hdl
, zcp
, sizeof (zfs_case_t
));
1043 uu_list_walk_end(walk
);
1045 uu_list_destroy(zfs_cases
);
1046 uu_list_pool_destroy(zfs_case_pool
);
1048 zhdl
= fmd_hdl_getspecific(hdl
);