/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2016, Intel Corporation.
 */

#include <stddef.h>
#include <string.h>
#include <libuutil.h>
#include <libzfs.h>
#include <sys/types.h>
#include <sys/time.h>
#include <sys/fs/zfs.h>
#include <sys/fm/protocol.h>
#include <sys/fm/fs/zfs.h>

#include "zfs_agents.h"
#include "fmd_api.h"
#include "fmd_serd.h"

/*
 * Default values for the serd engine when processing checksum or io errors.
 * The semantics are N <events> in T <seconds>.
 */
#define	DEFAULT_CHECKSUM_N	10	/* events */
#define	DEFAULT_CHECKSUM_T	600	/* seconds */
#define	DEFAULT_IO_N		10	/* events */
#define	DEFAULT_IO_T		600	/* seconds */
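
/*
 * These defaults apply only when the ereport payload does not carry
 * per-vdev values; see the FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_N/_T and
 * FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_N/_T lookups in zfs_fm_recv() below.
 */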

/*
 * Our serd engines are named 'zfs_<pool_guid>_<vdev_guid>_{checksum,io}'.  This
 * #define reserves enough space for two 64-bit hex values plus the length of
 * the longest string.
 */
#define	MAX_SERDLEN	(16 * 2 + sizeof ("zfs___checksum"))
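
/*
 * Worst case: sizeof ("zfs___checksum") is 15 (14 characters plus the
 * terminating NUL), so MAX_SERDLEN is 16 * 2 + 15 = 47 bytes -- exactly the
 * format "zfs_%llx_%llx_checksum" with both GUIDs at their full 16 hex digits.
 */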

/*
 * On-disk case structure.  This must maintain backwards compatibility with
 * previous versions of the DE.  By default, any members appended to the end
 * will be filled with zeros if they don't exist in a previous version.
 */
typedef struct zfs_case_data {
	uint64_t	zc_version;
	uint64_t	zc_ena;
	uint64_t	zc_pool_guid;
	uint64_t	zc_vdev_guid;
	int		zc_pool_state;
	char		zc_serd_checksum[MAX_SERDLEN];
	char		zc_serd_io[MAX_SERDLEN];
	int		zc_has_remove_timer;
} zfs_case_data_t;

typedef struct er_timeval {
	uint64_t	ertv_sec;
	uint64_t	ertv_nsec;
} er_timeval_t;

/*
 * In-core case structure.
 */
typedef struct zfs_case {
	boolean_t	zc_present;
	uint32_t	zc_version;
	zfs_case_data_t	zc_data;
	fmd_case_t	*zc_case;
	uu_list_node_t	zc_node;
	id_t		zc_remove_timer;
	char		*zc_fru;
	er_timeval_t	zc_when;
} zfs_case_t;

#define	CASE_DATA			"data"
#define	CASE_FRU			"fru"
#define	CASE_DATA_VERSION_INITIAL	1
#define	CASE_DATA_VERSION_SERD		2
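
/*
 * CASE_DATA and CASE_FRU name the fmd buffers attached to each case.
 * Version 1 of the case data predates the SERD engine names; since
 * fmd_buf_read() zero-fills the tail of the structure, version-1 cases
 * unserialize cleanly (see zfs_case_unserialize() below).
 */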

typedef struct zfs_de_stats {
	fmd_stat_t	old_drops;
	fmd_stat_t	dev_drops;
	fmd_stat_t	vdev_drops;
	fmd_stat_t	import_drops;
	fmd_stat_t	resource_drops;
} zfs_de_stats_t;

zfs_de_stats_t zfs_stats = {
	{ "old_drops", FMD_TYPE_UINT64, "ereports dropped (from before load)" },
	{ "dev_drops", FMD_TYPE_UINT64, "ereports dropped (dev during open)"},
	{ "vdev_drops", FMD_TYPE_UINT64, "ereports dropped (weird vdev types)"},
	{ "import_drops", FMD_TYPE_UINT64, "ereports dropped (during import)" },
	{ "resource_drops", FMD_TYPE_UINT64, "resource related ereports" }
};

static hrtime_t zfs_remove_timeout;

uu_list_pool_t *zfs_case_pool;
uu_list_t *zfs_cases;

#define	ZFS_MAKE_RSRC(type)	\
	FM_RSRC_CLASS "." ZFS_ERROR_CLASS "." type
#define	ZFS_MAKE_EREPORT(type)	\
	FM_EREPORT_CLASS "." ZFS_ERROR_CLASS "." type
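
/*
 * For example, given the standard FMA class strings ("ereport", "resource",
 * and "fs.zfs"), ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO) expands to
 * "ereport.fs.zfs.io" and ZFS_MAKE_RSRC(FM_RESOURCE_REMOVED) to
 * "resource.fs.zfs.removed".
 */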

/*
 * Write out the persistent representation of an active case.
 */
static void
zfs_case_serialize(zfs_case_t *zcp)
{
	zcp->zc_data.zc_version = CASE_DATA_VERSION_SERD;
}
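
/*
 * Only the version stamp needs updating here; the case contents live in the
 * CASE_DATA fmd buffer written by zfs_fm_recv() and read back by
 * zfs_case_unserialize().
 */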

/*
 * Read back the persistent representation of an active case.
 */
static zfs_case_t *
zfs_case_unserialize(fmd_hdl_t *hdl, fmd_case_t *cp)
{
	zfs_case_t *zcp;

	zcp = fmd_hdl_zalloc(hdl, sizeof (zfs_case_t), FMD_SLEEP);
	zcp->zc_case = cp;

	fmd_buf_read(hdl, cp, CASE_DATA, &zcp->zc_data,
	    sizeof (zcp->zc_data));

	if (zcp->zc_data.zc_version > CASE_DATA_VERSION_SERD) {
		fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t));
		return (NULL);
	}

	/*
	 * fmd_buf_read() will have already zeroed out the remainder of the
	 * buffer, so we don't have to do anything special if the version
	 * doesn't include the SERD engine name.
	 */

	if (zcp->zc_data.zc_has_remove_timer)
		zcp->zc_remove_timer = fmd_timer_install(hdl, zcp,
		    NULL, zfs_remove_timeout);

	uu_list_node_init(zcp, &zcp->zc_node, zfs_case_pool);
	(void) uu_list_insert_before(zfs_cases, NULL, zcp);

	fmd_case_setspecific(hdl, cp, zcp);

	return (zcp);
}
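
/*
 * Note that zfs_fm_recv() also routes newly opened cases through this
 * function: it writes an initial CASE_DATA buffer and then calls
 * zfs_case_unserialize() to instantiate the in-core structure, so the
 * restore and create paths share one code path.
 */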

/*
 * Iterate over any active cases.  If any cases are associated with a pool or
 * vdev which is no longer present on the system, close the associated case.
 */
static void
zfs_mark_vdev(uint64_t pool_guid, nvlist_t *vd, er_timeval_t *loaded)
{
	uint64_t vdev_guid = 0;
	uint_t c, children;
	nvlist_t **child;
	zfs_case_t *zcp;

	(void) nvlist_lookup_uint64(vd, ZPOOL_CONFIG_GUID, &vdev_guid);

	/*
	 * Mark any cases associated with this (pool, vdev) pair.
	 */
	for (zcp = uu_list_first(zfs_cases); zcp != NULL;
	    zcp = uu_list_next(zfs_cases, zcp)) {
		if (zcp->zc_data.zc_pool_guid == pool_guid &&
		    zcp->zc_data.zc_vdev_guid == vdev_guid) {
			zcp->zc_present = B_TRUE;
			zcp->zc_when = *loaded;
		}
	}

	/*
	 * Iterate over all children.
	 */
	if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_CHILDREN, &child,
	    &children) == 0) {
		for (c = 0; c < children; c++)
			zfs_mark_vdev(pool_guid, child[c], loaded);
	}

	if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_L2CACHE, &child,
	    &children) == 0) {
		for (c = 0; c < children; c++)
			zfs_mark_vdev(pool_guid, child[c], loaded);
	}

	if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_SPARES, &child,
	    &children) == 0) {
		for (c = 0; c < children; c++)
			zfs_mark_vdev(pool_guid, child[c], loaded);
	}
}

static int
zfs_mark_pool(zpool_handle_t *zhp, void *unused)
{
	(void) unused;
	zfs_case_t *zcp;
	uint64_t pool_guid;
	uint64_t *tod;
	er_timeval_t loaded = { 0 };
	nvlist_t *config, *vd;
	uint_t nelem = 0;
	int ret;

	pool_guid = zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL);
	/*
	 * Mark any cases associated with just this pool.
	 */
	for (zcp = uu_list_first(zfs_cases); zcp != NULL;
	    zcp = uu_list_next(zfs_cases, zcp)) {
		if (zcp->zc_data.zc_pool_guid == pool_guid &&
		    zcp->zc_data.zc_vdev_guid == 0)
			zcp->zc_present = B_TRUE;
	}

	if ((config = zpool_get_config(zhp, NULL)) == NULL) {
		zpool_close(zhp);
		return (-1);
	}

	(void) nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_LOADED_TIME,
	    &tod, &nelem);
	if (nelem == 2) {
		loaded.ertv_sec = tod[0];
		loaded.ertv_nsec = tod[1];
		for (zcp = uu_list_first(zfs_cases); zcp != NULL;
		    zcp = uu_list_next(zfs_cases, zcp)) {
			if (zcp->zc_data.zc_pool_guid == pool_guid &&
			    zcp->zc_data.zc_vdev_guid == 0) {
				zcp->zc_when = loaded;
			}
		}
	}

	ret = nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &vd);
	if (ret) {
		zpool_close(zhp);
		return (-1);
	}

	zfs_mark_vdev(pool_guid, vd, &loaded);

	zpool_close(zhp);

	return (0);
}

struct load_time_arg {
	uint64_t	lt_guid;
	er_timeval_t	*lt_time;
	boolean_t	lt_found;
};

static int
zpool_find_load_time(zpool_handle_t *zhp, void *arg)
{
	struct load_time_arg *lta = arg;
	uint64_t pool_guid;
	uint64_t *tod;
	nvlist_t *config;
	uint_t nelem;

	if (lta->lt_found) {
		zpool_close(zhp);
		return (0);
	}

	pool_guid = zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL);
	if (pool_guid != lta->lt_guid) {
		zpool_close(zhp);
		return (0);
	}

	if ((config = zpool_get_config(zhp, NULL)) == NULL) {
		zpool_close(zhp);
		return (-1);
	}

	if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_LOADED_TIME,
	    &tod, &nelem) == 0 && nelem == 2) {
		lta->lt_found = B_TRUE;
		lta->lt_time->ertv_sec = tod[0];
		lta->lt_time->ertv_nsec = tod[1];
	}

	zpool_close(zhp);

	return (0);
}
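
/*
 * As a zpool_iter() callback, this must close each handle it is given and
 * return 0 to continue iteration (or nonzero to abort); once lt_found is
 * set, later invocations return immediately without re-reading any config.
 */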

static void
zfs_purge_cases(fmd_hdl_t *hdl)
{
	zfs_case_t *zcp;
	uu_list_walk_t *walk;
	libzfs_handle_t *zhdl = fmd_hdl_getspecific(hdl);

	/*
	 * There is no way to open a pool by GUID, or lookup a vdev by GUID.  No
	 * matter what we do, we're going to have to stomach an O(vdevs * cases)
	 * algorithm.  In reality, both quantities are likely so small that
	 * neither will matter.  Given that iterating over pools is more
	 * expensive than iterating over the in-memory case list, we opt for a
	 * 'present' flag in each case that starts off cleared.  We then iterate
	 * over all pools, marking those that are still present, and removing
	 * those that aren't found.
	 *
	 * Note that we could also construct an FMRI and rely on
	 * fmd_nvl_fmri_present(), but this would end up doing the same search.
	 */

	/*
	 * Mark the cases as not present.
	 */
	for (zcp = uu_list_first(zfs_cases); zcp != NULL;
	    zcp = uu_list_next(zfs_cases, zcp))
		zcp->zc_present = B_FALSE;

	/*
	 * Iterate over all pools and mark the pools and vdevs found.  If this
	 * fails (most probably because we're out of memory), then don't close
	 * any of the cases, since we cannot be sure the flags are accurate.
	 */
	if (zpool_iter(zhdl, zfs_mark_pool, NULL) != 0)
		return;

	/*
	 * Remove those cases which were not found.
	 */
	walk = uu_list_walk_start(zfs_cases, UU_WALK_ROBUST);
	while ((zcp = uu_list_walk_next(walk)) != NULL) {
		if (!zcp->zc_present)
			fmd_case_close(hdl, zcp->zc_case);
	}
	uu_list_walk_end(walk);
}

/*
 * Construct the name of a serd engine given the pool/vdev GUID and type (io or
 * checksum).
 */
static void
zfs_serd_name(char *buf, uint64_t pool_guid, uint64_t vdev_guid,
    const char *type)
{
	(void) snprintf(buf, MAX_SERDLEN, "zfs_%llx_%llx_%s",
	    (long long unsigned int)pool_guid,
	    (long long unsigned int)vdev_guid, type);
}
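
/*
 * For example, pool GUID 0x1234 and vdev GUID 0xabcd with type "io" yield
 * the engine name "zfs_1234_abcd_io".
 */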

/*
 * Solve a given ZFS case.  This first checks to make sure the diagnosis is
 * still valid, as well as cleaning up any pending timer associated with the
 * case.
 */
static void
zfs_case_solve(fmd_hdl_t *hdl, zfs_case_t *zcp, const char *faultname)
{
	nvlist_t *detector, *fault;
	boolean_t serialize;
	nvlist_t *fru = NULL;
	fmd_hdl_debug(hdl, "solving fault '%s'", faultname);

	/*
	 * Construct the detector from the case data.  The detector is in the
	 * ZFS scheme, and is either the pool or the vdev, depending on whether
	 * this is a vdev or pool fault.
	 */
	detector = fmd_nvl_alloc(hdl, FMD_SLEEP);

	(void) nvlist_add_uint8(detector, FM_VERSION, ZFS_SCHEME_VERSION0);
	(void) nvlist_add_string(detector, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS);
	(void) nvlist_add_uint64(detector, FM_FMRI_ZFS_POOL,
	    zcp->zc_data.zc_pool_guid);
	if (zcp->zc_data.zc_vdev_guid != 0) {
		(void) nvlist_add_uint64(detector, FM_FMRI_ZFS_VDEV,
		    zcp->zc_data.zc_vdev_guid);
	}

	fault = fmd_nvl_create_fault(hdl, faultname, 100, detector,
	    fru, detector);
	fmd_case_add_suspect(hdl, zcp->zc_case, fault);

	nvlist_free(fru);

	fmd_case_solve(hdl, zcp->zc_case);

	serialize = B_FALSE;
	if (zcp->zc_data.zc_has_remove_timer) {
		fmd_timer_remove(hdl, zcp->zc_remove_timer);
		zcp->zc_data.zc_has_remove_timer = 0;
		serialize = B_TRUE;
	}
	if (serialize)
		zfs_case_serialize(zcp);

	nvlist_free(detector);
}

static boolean_t
timeval_earlier(er_timeval_t *a, er_timeval_t *b)
{
	return (a->ertv_sec < b->ertv_sec ||
	    (a->ertv_sec == b->ertv_sec && a->ertv_nsec < b->ertv_nsec));
}

static void
zfs_ereport_when(fmd_hdl_t *hdl, nvlist_t *nvl, er_timeval_t *when)
{
	(void) hdl;
	int64_t *tod;
	uint_t	nelem;

	if (nvlist_lookup_int64_array(nvl, FM_EREPORT_TIME, &tod,
	    &nelem) == 0 && nelem == 2) {
		when->ertv_sec = tod[0];
		when->ertv_nsec = tod[1];
	} else {
		when->ertv_sec = when->ertv_nsec = UINT64_MAX;
	}
}
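
/*
 * Setting both fields to UINT64_MAX when the ereport carries no timestamp
 * makes timeval_earlier(&er_when, ...) evaluate false against any real pool
 * load time, so such ereports are never dropped as stale.
 */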

/*
 * Main fmd entry point.
 */
static void
zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
{
	zfs_case_t *zcp, *dcp;
	int32_t pool_state;
	uint64_t ena, pool_guid, vdev_guid;
	uint64_t checksum_n, checksum_t;
	uint64_t io_n, io_t;
	er_timeval_t pool_load;
	er_timeval_t er_when;
	nvlist_t *detector;
	boolean_t pool_found = B_FALSE;
	boolean_t isresource;
	const char *type;

	/*
	 * We subscribe to notifications for vdev or pool removal.  In these
	 * cases, there may be open cases that no longer apply; purge any such
	 * cases.
	 */
	if (fmd_nvl_class_match(hdl, nvl, "sysevent.fs.zfs.*")) {
		fmd_hdl_debug(hdl, "purging orphaned cases from %s",
		    strrchr(class, '.') + 1);
		zfs_purge_cases(hdl);
		zfs_stats.resource_drops.fmds_value.ui64++;
		return;
	}

	isresource = fmd_nvl_class_match(hdl, nvl, "resource.fs.zfs.*");

	if (isresource) {
		/*
		 * For resources, we don't have a normal payload.
		 */
		if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
		    &vdev_guid) != 0)
			pool_state = SPA_LOAD_OPEN;
		else
			pool_state = SPA_LOAD_NONE;
		detector = NULL;
	} else {
		(void) nvlist_lookup_nvlist(nvl,
		    FM_EREPORT_DETECTOR, &detector);
		(void) nvlist_lookup_int32(nvl,
		    FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, &pool_state);
	}

	/*
	 * We also ignore all ereports generated during an import of a pool,
	 * since the only possible fault (.pool) would result in import failure,
	 * and hence no persistent fault.  Some day we may want to do something
	 * with these ereports, so we continue generating them internally.
	 */
	if (pool_state == SPA_LOAD_IMPORT) {
		zfs_stats.import_drops.fmds_value.ui64++;
		fmd_hdl_debug(hdl, "ignoring '%s' during import", class);
		return;
	}

	/*
	 * Device I/O errors are ignored during pool open.
	 */
	if (pool_state == SPA_LOAD_OPEN &&
	    (fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM)) ||
	    fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO)) ||
	    fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE)))) {
		fmd_hdl_debug(hdl, "ignoring '%s' during pool open", class);
		zfs_stats.dev_drops.fmds_value.ui64++;
		return;
	}

	/*
	 * We ignore ereports for anything except disks and files.
	 */
	if (nvlist_lookup_string(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
	    &type) == 0) {
		if (strcmp(type, VDEV_TYPE_DISK) != 0 &&
		    strcmp(type, VDEV_TYPE_FILE) != 0) {
			zfs_stats.vdev_drops.fmds_value.ui64++;
			return;
		}
	}

	/*
	 * Determine if this ereport corresponds to an open case.
	 * Each vdev or pool can have a single case.
	 */
	(void) nvlist_lookup_uint64(nvl,
	    FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, &pool_guid);
	if (nvlist_lookup_uint64(nvl,
	    FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0)
		vdev_guid = 0;
	if (nvlist_lookup_uint64(nvl, FM_EREPORT_ENA, &ena) != 0)
		ena = 0;

	zfs_ereport_when(hdl, nvl, &er_when);

	for (zcp = uu_list_first(zfs_cases); zcp != NULL;
	    zcp = uu_list_next(zfs_cases, zcp)) {
		if (zcp->zc_data.zc_pool_guid == pool_guid) {
			pool_found = B_TRUE;
			pool_load = zcp->zc_when;
		}
		if (zcp->zc_data.zc_vdev_guid == vdev_guid)
			break;
	}

	/*
	 * Avoid falsely accusing a pool of being faulty.  Do so by
	 * not replaying ereports that were generated prior to the
	 * current import.  If the failure that generated them was
	 * transient because the device was actually removed but we
	 * didn't receive the normal asynchronous notification, we
	 * don't want to mark it as faulted and potentially panic. If
	 * there is still a problem we'd expect not to be able to
	 * import the pool, or that new ereports will be generated
	 * once the pool is used.
	 */
	if (pool_found && timeval_earlier(&er_when, &pool_load)) {
		fmd_hdl_debug(hdl, "ignoring pool %llx, "
		    "ereport time %lld.%lld, pool load time = %lld.%lld",
		    pool_guid, er_when.ertv_sec, er_when.ertv_nsec,
		    pool_load.ertv_sec, pool_load.ertv_nsec);
		zfs_stats.old_drops.fmds_value.ui64++;
		return;
	}

	if (!pool_found) {
		/*
		 * Haven't yet seen this pool, but same situation
		 * may apply.
		 */
		libzfs_handle_t *zhdl = fmd_hdl_getspecific(hdl);
		struct load_time_arg la;

		la.lt_guid = pool_guid;
		la.lt_time = &pool_load;
		la.lt_found = B_FALSE;

		if (zhdl != NULL &&
		    zpool_iter(zhdl, zpool_find_load_time, &la) == 0 &&
		    la.lt_found == B_TRUE) {
			pool_found = B_TRUE;

			if (timeval_earlier(&er_when, &pool_load)) {
				fmd_hdl_debug(hdl, "ignoring pool %llx, "
				    "ereport time %lld.%lld, "
				    "pool load time = %lld.%lld",
				    pool_guid, er_when.ertv_sec,
				    er_when.ertv_nsec, pool_load.ertv_sec,
				    pool_load.ertv_nsec);
				zfs_stats.old_drops.fmds_value.ui64++;
				return;
			}
		}
	}

	if (zcp == NULL) {
		fmd_case_t *cs;
		zfs_case_data_t data = { 0 };

		/*
		 * If this is one of our 'fake' resource ereports, and there is
		 * no case open, simply discard it.
		 */
		if (isresource) {
			zfs_stats.resource_drops.fmds_value.ui64++;
			fmd_hdl_debug(hdl, "discarding '%s' for vdev %llu",
			    class, vdev_guid);
			return;
		}

		/*
		 * Skip tracking some ereports
		 */
		if (strcmp(class,
		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DATA)) == 0 ||
		    strcmp(class,
		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE)) == 0 ||
		    strcmp(class,
		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY)) == 0) {
			zfs_stats.resource_drops.fmds_value.ui64++;
			return;
		}

		/*
		 * Open a new case.
		 */
		cs = fmd_case_open(hdl, NULL);

		fmd_hdl_debug(hdl, "opening case for vdev %llu due to '%s'",
		    vdev_guid, class);

		/*
		 * Initialize the case buffer.  To commonize code, we actually
		 * create the buffer with existing data, and then call
		 * zfs_case_unserialize() to instantiate the in-core structure.
		 */
		fmd_buf_create(hdl, cs, CASE_DATA, sizeof (zfs_case_data_t));

		data.zc_version = CASE_DATA_VERSION_SERD;
		data.zc_ena = ena;
		data.zc_pool_guid = pool_guid;
		data.zc_vdev_guid = vdev_guid;
		data.zc_pool_state = (int)pool_state;

		fmd_buf_write(hdl, cs, CASE_DATA, &data, sizeof (data));

		zcp = zfs_case_unserialize(hdl, cs);

		if (pool_found)
			zcp->zc_when = pool_load;
	}

	if (isresource) {
		fmd_hdl_debug(hdl, "resource event '%s'", class);

		if (fmd_nvl_class_match(hdl, nvl,
		    ZFS_MAKE_RSRC(FM_RESOURCE_AUTOREPLACE))) {
			/*
			 * The 'resource.fs.zfs.autoreplace' event indicates
			 * that the pool was loaded with the 'autoreplace'
			 * property set.  In this case, any pending device
			 * failures should be ignored, as the asynchronous
			 * autoreplace handling will take care of them.
			 */
			fmd_case_close(hdl, zcp->zc_case);
		} else if (fmd_nvl_class_match(hdl, nvl,
		    ZFS_MAKE_RSRC(FM_RESOURCE_REMOVED))) {
			/*
			 * The 'resource.fs.zfs.removed' event indicates that
			 * device removal was detected, and the device was
			 * closed asynchronously.  If this is the case, we
			 * assume that any recent I/O errors were due to the
			 * device removal, not any fault of the device itself.
			 * We reset the SERD engine, and cancel any pending
			 * timers.
			 */
			if (zcp->zc_data.zc_has_remove_timer) {
				fmd_timer_remove(hdl, zcp->zc_remove_timer);
				zcp->zc_data.zc_has_remove_timer = 0;
				zfs_case_serialize(zcp);
			}
			if (zcp->zc_data.zc_serd_io[0] != '\0')
				fmd_serd_reset(hdl, zcp->zc_data.zc_serd_io);
			if (zcp->zc_data.zc_serd_checksum[0] != '\0')
				fmd_serd_reset(hdl,
				    zcp->zc_data.zc_serd_checksum);
		} else if (fmd_nvl_class_match(hdl, nvl,
		    ZFS_MAKE_RSRC(FM_RESOURCE_STATECHANGE))) {
			uint64_t state = 0;

			if (zcp != NULL &&
			    nvlist_lookup_uint64(nvl,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, &state) == 0 &&
			    state == VDEV_STATE_HEALTHY) {
				fmd_hdl_debug(hdl, "closing case after a "
				    "device statechange to healthy");
				fmd_case_close(hdl, zcp->zc_case);
			}
		}
		zfs_stats.resource_drops.fmds_value.ui64++;
		return;
	}

	/*
	 * Associate the ereport with this case.
	 */
	fmd_case_add_ereport(hdl, zcp->zc_case, ep);

	/*
	 * Don't do anything else if this case is already solved.
	 */
	if (fmd_case_solved(hdl, zcp->zc_case))
		return;

	fmd_hdl_debug(hdl, "error event '%s'", class);

	/*
	 * Determine if we should solve the case and generate a fault.  We solve
	 * a case if:
	 *
	 * 	a. A pool failed to open (ereport.fs.zfs.pool)
	 * 	b. A device failed to open (ereport.fs.zfs.pool) while a pool
	 *	   was up and running.
	 *
	 * We may see a series of ereports associated with a pool open, all
	 * chained together by the same ENA.  If the pool open succeeds, then
	 * we'll see no further ereports.  To detect when a pool open has
	 * succeeded, we associate a timer with the event.  When it expires, we
	 * close the case.
	 */
	if (fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_POOL))) {
		/*
		 * Pool level fault.  Before solving the case, go through and
		 * close any open device cases that may be pending.
		 */
		for (dcp = uu_list_first(zfs_cases); dcp != NULL;
		    dcp = uu_list_next(zfs_cases, dcp)) {
			if (dcp->zc_data.zc_pool_guid ==
			    zcp->zc_data.zc_pool_guid &&
			    dcp->zc_data.zc_vdev_guid != 0)
				fmd_case_close(hdl, dcp->zc_case);
		}

		zfs_case_solve(hdl, zcp, "fault.fs.zfs.pool");
	} else if (fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_LOG_REPLAY))) {
		/*
		 * Pool level fault for reading the intent logs.
		 */
		zfs_case_solve(hdl, zcp, "fault.fs.zfs.log_replay");
	} else if (fmd_nvl_class_match(hdl, nvl, "ereport.fs.zfs.vdev.*")) {
		/*
		 * Device fault.
		 */
		zfs_case_solve(hdl, zcp, "fault.fs.zfs.device");
	} else if (fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO)) ||
	    fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM)) ||
	    fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO_FAILURE)) ||
	    fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) {
		const char *failmode = NULL;
		boolean_t checkremove = B_FALSE;
		uint32_t pri = 0;
		int32_t flags = 0;

		/*
		 * If this is a checksum or I/O error, then toss it into the
		 * appropriate SERD engine and check to see if it has fired.
		 * Ideally, we want to do something more sophisticated,
		 * (persistent errors for a single data block, etc).  For now,
		 * a single SERD engine is sufficient.
		 */
		if (fmd_nvl_class_match(hdl, nvl,
		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO))) {
			if (zcp->zc_data.zc_serd_io[0] == '\0') {
				if (nvlist_lookup_uint64(nvl,
				    FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_N,
				    &io_n) != 0) {
					io_n = DEFAULT_IO_N;
				}
				if (nvlist_lookup_uint64(nvl,
				    FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_T,
				    &io_t) != 0) {
					io_t = DEFAULT_IO_T;
				}
				zfs_serd_name(zcp->zc_data.zc_serd_io,
				    pool_guid, vdev_guid, "io");
				fmd_serd_create(hdl, zcp->zc_data.zc_serd_io,
				    io_n, SEC2NSEC(io_t));
				zfs_case_serialize(zcp);
			}
			if (fmd_serd_record(hdl, zcp->zc_data.zc_serd_io, ep))
				checkremove = B_TRUE;
		} else if (fmd_nvl_class_match(hdl, nvl,
		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM))) {
			/*
			 * We ignore ereports for checksum errors generated by
			 * scrub/resilver I/O to avoid potentially further
			 * degrading the pool while it's being repaired.
			 */
			if (((nvlist_lookup_uint32(nvl,
			    FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY, &pri) == 0) &&
			    (pri == ZIO_PRIORITY_SCRUB ||
			    pri == ZIO_PRIORITY_REBUILD)) ||
			    ((nvlist_lookup_int32(nvl,
			    FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS, &flags) == 0) &&
			    (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)))) {
				fmd_hdl_debug(hdl, "ignoring '%s' for "
				    "scrub/resilver I/O", class);
				return;
			}

			if (zcp->zc_data.zc_serd_checksum[0] == '\0') {
				if (nvlist_lookup_uint64(nvl,
				    FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_N,
				    &checksum_n) != 0) {
					checksum_n = DEFAULT_CHECKSUM_N;
				}
				if (nvlist_lookup_uint64(nvl,
				    FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_T,
				    &checksum_t) != 0) {
					checksum_t = DEFAULT_CHECKSUM_T;
				}

				zfs_serd_name(zcp->zc_data.zc_serd_checksum,
				    pool_guid, vdev_guid, "checksum");
				fmd_serd_create(hdl,
				    zcp->zc_data.zc_serd_checksum,
				    checksum_n,
				    SEC2NSEC(checksum_t));
				zfs_case_serialize(zcp);
			}
			if (fmd_serd_record(hdl,
			    zcp->zc_data.zc_serd_checksum, ep)) {
				zfs_case_solve(hdl, zcp,
				    "fault.fs.zfs.vdev.checksum");
			}
		} else if (fmd_nvl_class_match(hdl, nvl,
		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO_FAILURE)) &&
		    (nvlist_lookup_string(nvl,
		    FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE, &failmode) == 0) &&
		    failmode != NULL) {
			if (strncmp(failmode, FM_EREPORT_FAILMODE_CONTINUE,
			    strlen(FM_EREPORT_FAILMODE_CONTINUE)) == 0) {
				zfs_case_solve(hdl, zcp,
				    "fault.fs.zfs.io_failure_continue");
			} else if (strncmp(failmode, FM_EREPORT_FAILMODE_WAIT,
			    strlen(FM_EREPORT_FAILMODE_WAIT)) == 0) {
				zfs_case_solve(hdl, zcp,
				    "fault.fs.zfs.io_failure_wait");
			}
		} else if (fmd_nvl_class_match(hdl, nvl,
		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) {
#ifndef __linux__
			/* This causes an unexpected fault diagnosis on linux */
			checkremove = B_TRUE;
#endif
		}

		/*
		 * Because I/O errors may be due to device removal, we postpone
		 * any diagnosis until we're sure that we aren't about to
		 * receive a 'resource.fs.zfs.removed' event.
		 */
		if (checkremove) {
			if (zcp->zc_data.zc_has_remove_timer)
				fmd_timer_remove(hdl, zcp->zc_remove_timer);
			zcp->zc_remove_timer = fmd_timer_install(hdl, zcp, NULL,
			    zfs_remove_timeout);
			if (!zcp->zc_data.zc_has_remove_timer) {
				zcp->zc_data.zc_has_remove_timer = 1;
				zfs_case_serialize(zcp);
			}
		}
	}
}

/*
 * The timeout fires when we diagnosed an I/O error that was not due to
 * device removal (which would have caused the timeout to be cancelled).
 */
static void
zfs_fm_timeout(fmd_hdl_t *hdl, id_t id, void *data)
{
	zfs_case_t *zcp = data;

	if (id == zcp->zc_remove_timer)
		zfs_case_solve(hdl, zcp, "fault.fs.zfs.vdev.io");
}

/*
 * The specified case has been closed and any case-specific
 * data structures should be deallocated.
 */
static void
zfs_fm_close(fmd_hdl_t *hdl, fmd_case_t *cs)
{
	zfs_case_t *zcp = fmd_case_getspecific(hdl, cs);

	if (zcp->zc_data.zc_serd_checksum[0] != '\0')
		fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_checksum);
	if (zcp->zc_data.zc_serd_io[0] != '\0')
		fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_io);
	if (zcp->zc_data.zc_has_remove_timer)
		fmd_timer_remove(hdl, zcp->zc_remove_timer);

	uu_list_remove(zfs_cases, zcp);
	uu_list_node_fini(zcp, &zcp->zc_node, zfs_case_pool);
	fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t));
}

/*
 * We use the fmd gc entry point to look for old cases that no longer apply.
 * This allows us to keep our set of case data small in a long running system.
 */
static void
zfs_fm_gc(fmd_hdl_t *hdl)
{
	zfs_purge_cases(hdl);
}

static const fmd_hdl_ops_t fmd_ops = {
	zfs_fm_recv,	/* fmdo_recv */
	zfs_fm_timeout,	/* fmdo_timeout */
	zfs_fm_close,	/* fmdo_close */
	NULL,		/* fmdo_stats */
	zfs_fm_gc,	/* fmdo_gc */
};

static const fmd_prop_t fmd_props[] = {
	{ "checksum_N", FMD_TYPE_UINT32, "10" },
	{ "checksum_T", FMD_TYPE_TIME, "10min" },
	{ "io_N", FMD_TYPE_UINT32, "10" },
	{ "io_T", FMD_TYPE_TIME, "10min" },
	{ "remove_timeout", FMD_TYPE_TIME, "15sec" },
	{ NULL, 0, NULL }
};

static const fmd_hdl_info_t fmd_info = {
	"ZFS Diagnosis Engine", "1.0", &fmd_ops, fmd_props
};

void
_zfs_diagnosis_init(fmd_hdl_t *hdl)
{
	libzfs_handle_t *zhdl;

	if ((zhdl = libzfs_init()) == NULL)
		return;

	if ((zfs_case_pool = uu_list_pool_create("zfs_case_pool",
	    sizeof (zfs_case_t), offsetof(zfs_case_t, zc_node),
	    NULL, UU_LIST_POOL_DEBUG)) == NULL) {
		libzfs_fini(zhdl);
		return;
	}

	if ((zfs_cases = uu_list_create(zfs_case_pool, NULL,
	    UU_LIST_DEBUG)) == NULL) {
		uu_list_pool_destroy(zfs_case_pool);
		libzfs_fini(zhdl);
		return;
	}

	if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) {
		uu_list_destroy(zfs_cases);
		uu_list_pool_destroy(zfs_case_pool);
		libzfs_fini(zhdl);
		return;
	}

	fmd_hdl_setspecific(hdl, zhdl);

	(void) fmd_stat_create(hdl, FMD_STAT_NOALLOC, sizeof (zfs_stats) /
	    sizeof (fmd_stat_t), (fmd_stat_t *)&zfs_stats);

	zfs_remove_timeout = fmd_prop_get_int64(hdl, "remove_timeout");
}

void
_zfs_diagnosis_fini(fmd_hdl_t *hdl)
{
	zfs_case_t *zcp;
	uu_list_walk_t *walk;
	libzfs_handle_t *zhdl;

	/*
	 * Remove all active cases.
	 */
	walk = uu_list_walk_start(zfs_cases, UU_WALK_ROBUST);
	while ((zcp = uu_list_walk_next(walk)) != NULL) {
		fmd_hdl_debug(hdl, "removing case ena %llu",
		    (long long unsigned)zcp->zc_data.zc_ena);
		uu_list_remove(zfs_cases, zcp);
		uu_list_node_fini(zcp, &zcp->zc_node, zfs_case_pool);
		fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t));
	}
	uu_list_walk_end(walk);

	uu_list_destroy(zfs_cases);
	uu_list_pool_destroy(zfs_case_pool);

	zhdl = fmd_hdl_getspecific(hdl);
	libzfs_fini(zhdl);
}