/*
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/dmu_tx.h>
#include <sys/vdev_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/space_map.h>
#include <sys/fs/zfs.h>
/*
 * Virtual device management.
 */

static vdev_ops_t *vdev_ops_table[] = {

/* maximum scrub/resilver I/O queue per leaf vdev */
int zfs_scrub_limit = 10;
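/*
 * Note: each leaf vdev contributes zfs_scrub_limit slots to
 * spa_scrub_maxinflight; see vdev_add_child() and vdev_remove_child() below.
 */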
/*
 * Given a vdev type, return the appropriate ops vector.
 */
vdev_getops(const char *type)
	vdev_ops_t *ops, **opspp;

	for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
		if (strcmp(ops->vdev_op_type, type) == 0)
/*
 * Default asize function: return the MAX of psize with the asize of
 * all children.  This is what's used by anything other than RAID-Z.
 */
vdev_default_asize(vdev_t *vd, uint64_t psize)
	uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);

	for (c = 0; c < vd->vdev_children; c++) {
		csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
		asize = MAX(asize, csize);
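	/*
	 * Worked example: with a top-level vdev_ashift of 9 (512-byte
	 * sectors), P2ROUNDUP(1000, 1ULL << 9) rounds psize up to 1024,
	 * and any larger child asize then wins through the MAX() above.
	 */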
/*
 * Get the replaceable or attachable device size.
 * If the parent is a mirror or raidz, the replaceable size is the minimum
 * psize of all its children. For the rest, just return our own psize.
 */
vdev_get_rsize(vdev_t *vd)
	pvd = vd->vdev_parent;

	/*
	 * If our parent is NULL or the root, just return our own psize.
	 */
	if (pvd == NULL || pvd->vdev_parent == NULL)
		return (vd->vdev_psize);

	for (c = 0; c < pvd->vdev_children; c++) {
		cvd = pvd->vdev_child[c];
		rsize = MIN(rsize - 1, cvd->vdev_psize - 1) + 1;
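	/*
	 * Note: assuming rsize starts at 0 as in the full function, the
	 * unsigned wraparound in MIN(rsize - 1, cvd->vdev_psize - 1) + 1
	 * lets the first child's psize win outright, while later children
	 * can only lower the result.
	 */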
vdev_lookup_top(spa_t *spa, uint64_t vdev)
	vdev_t *rvd = spa->spa_root_vdev;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);

	if (vdev < rvd->vdev_children) {
		ASSERT(rvd->vdev_child[vdev] != NULL);
		return (rvd->vdev_child[vdev]);
vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
	if (vd->vdev_guid == guid)

	for (c = 0; c < vd->vdev_children; c++)
		if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
vdev_add_child(vdev_t *pvd, vdev_t *cvd)
	size_t oldsize, newsize;
	uint64_t id = cvd->vdev_id;

	ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
	ASSERT(cvd->vdev_parent == NULL);

	cvd->vdev_parent = pvd;

	ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);

	oldsize = pvd->vdev_children * sizeof (vdev_t *);
	pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
	newsize = pvd->vdev_children * sizeof (vdev_t *);

	newchild = kmem_zalloc(newsize, KM_SLEEP);
	if (pvd->vdev_child != NULL) {
		bcopy(pvd->vdev_child, newchild, oldsize);
		kmem_free(pvd->vdev_child, oldsize);

	pvd->vdev_child = newchild;
	pvd->vdev_child[id] = cvd;

	cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top : cvd);
	ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);

	/*
	 * Walk up all ancestors to update guid sum.
	 */
	for (; pvd != NULL; pvd = pvd->vdev_parent)
		pvd->vdev_guid_sum += cvd->vdev_guid_sum;

	if (cvd->vdev_ops->vdev_op_leaf)
		cvd->vdev_spa->spa_scrub_maxinflight += zfs_scrub_limit;
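	/*
	 * Note: vdev_guid_sum is a vdev's own guid plus the guid sums of all
	 * of its children, so the ancestor walk above keeps every ancestor's
	 * sum consistent once the new child is linked in.
	 */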
vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
	uint_t id = cvd->vdev_id;

	ASSERT(cvd->vdev_parent == pvd);

	ASSERT(id < pvd->vdev_children);
	ASSERT(pvd->vdev_child[id] == cvd);

	pvd->vdev_child[id] = NULL;
	cvd->vdev_parent = NULL;

	for (c = 0; c < pvd->vdev_children; c++)
		if (pvd->vdev_child[c])

	if (c == pvd->vdev_children) {
		kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
		pvd->vdev_child = NULL;
		pvd->vdev_children = 0;

	/*
	 * Walk up all ancestors to update guid sum.
	 */
	for (; pvd != NULL; pvd = pvd->vdev_parent)
		pvd->vdev_guid_sum -= cvd->vdev_guid_sum;

	if (cvd->vdev_ops->vdev_op_leaf)
		cvd->vdev_spa->spa_scrub_maxinflight -= zfs_scrub_limit;
/*
 * Remove any holes in the child array.
 */
vdev_compact_children(vdev_t *pvd)
	vdev_t **newchild, *cvd;
	int oldc = pvd->vdev_children;

	ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	for (c = newc = 0; c < oldc; c++)
		if (pvd->vdev_child[c])

	newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP);

	for (c = newc = 0; c < oldc; c++) {
		if ((cvd = pvd->vdev_child[c]) != NULL) {
			newchild[newc] = cvd;
			cvd->vdev_id = newc++;

	kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
	pvd->vdev_child = newchild;
	pvd->vdev_children = newc;
/*
 * Allocate and minimally initialize a vdev_t.
 */
vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
	vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);

	if (spa->spa_root_vdev == NULL) {
		ASSERT(ops == &vdev_root_ops);
		spa->spa_root_vdev = vd;

	if (spa->spa_root_vdev == vd) {
		/*
		 * The root vdev's guid will also be the pool guid,
		 * which must be unique among all pools.
		 */
		while (guid == 0 || spa_guid_exists(guid, 0))
			guid = spa_get_random(-1ULL);
		/*
		 * Any other vdev's guid must be unique within the pool.
		 */
		    spa_guid_exists(spa_guid(spa), guid))
			guid = spa_get_random(-1ULL);

	ASSERT(!spa_guid_exists(spa_guid(spa), guid));

	vd->vdev_guid = guid;
	vd->vdev_guid_sum = guid;
	vd->vdev_state = VDEV_STATE_CLOSED;

	mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
	space_map_create(&vd->vdev_dtl_map, 0, -1ULL, 0, &vd->vdev_dtl_lock);
	space_map_create(&vd->vdev_dtl_scrub, 0, -1ULL, 0, &vd->vdev_dtl_lock);
	txg_list_create(&vd->vdev_ms_list,
	    offsetof(struct metaslab, ms_txg_node));
	txg_list_create(&vd->vdev_dtl_list,
	    offsetof(struct vdev, vdev_dtl_node));
	vd->vdev_stat.vs_timestamp = gethrtime();
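	/*
	 * Note: the two DTL space maps created above are keyed by txg rather
	 * than by device offset, which is why they span the full 0..-1ULL
	 * range with a shift of 0.
	 */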
/*
 * Allocate a new vdev.  The 'alloctype' is used to control whether we are
 * creating a new vdev or loading an existing one - the behavior is slightly
 * different for each case.
 */
vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
	uint64_t guid = 0, islog, nparity;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)

	if ((ops = vdev_getops(type)) == NULL)

	/*
	 * If this is a load, get the vdev guid from the nvlist.
	 * Otherwise, vdev_alloc_common() will generate one for us.
	 */
	if (alloctype == VDEV_ALLOC_LOAD) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
	} else if (alloctype == VDEV_ALLOC_SPARE) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
	} else if (alloctype == VDEV_ALLOC_L2CACHE) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)

	/*
	 * The first allocated vdev must be of type 'root'.
	 */
	if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL)

	/*
	 * Determine whether we're a log vdev.
	 */
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog);
	if (islog && spa_version(spa) < SPA_VERSION_SLOGS)

	/*
	 * Set the nparity property for RAID-Z vdevs.
	 */
	if (ops == &vdev_raidz_ops) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
			/*
			 * Currently, we can only support 2 parity devices.
			 */
			if (nparity == 0 || nparity > 2)
			/*
			 * Older versions can only support 1 parity device.
			 */
			    spa_version(spa) < SPA_VERSION_RAID6)
			/*
			 * We require the parity to be specified for SPAs that
			 * support multiple parity levels.
			 */
			if (spa_version(spa) >= SPA_VERSION_RAID6)
			/*
			 * Otherwise, we default to 1 parity device for RAID-Z.
			 */
	ASSERT(nparity != -1ULL);

	vd = vdev_alloc_common(spa, id, guid, ops);

	vd->vdev_islog = islog;
	vd->vdev_nparity = nparity;

	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
		vd->vdev_path = spa_strdup(vd->vdev_path);
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
		vd->vdev_devid = spa_strdup(vd->vdev_devid);
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH,
	    &vd->vdev_physpath) == 0)
		vd->vdev_physpath = spa_strdup(vd->vdev_physpath);

	/*
	 * Set the whole_disk property.  If it's not specified, leave the value
	 */
	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
	    &vd->vdev_wholedisk) != 0)
		vd->vdev_wholedisk = -1ULL;

	/*
	 * Look for the 'not present' flag.  This will only be set if the device
	 * was not present at the time of import.
	 */
	if (!spa->spa_import_faulted)
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
		    &vd->vdev_not_present);

	/*
	 * Get the alignment requirement.
	 */
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);

	/*
	 * If we're a top-level vdev, try to load the allocation parameters.
	 */
	if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) {
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,

	/*
	 * If we're a leaf vdev, try to load the DTL object and other state.
	 */
	if (vd->vdev_ops->vdev_op_leaf &&
	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE)) {
		if (alloctype == VDEV_ALLOC_LOAD) {
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
			    &vd->vdev_dtl.smo_object);
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,

		/*
		 * When importing a pool, we want to ignore the persistent fault
		 * state, as the diagnosis made on another system may not be
		 * valid in the current context.
		 */
		if (spa->spa_load_state == SPA_LOAD_OPEN) {
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED,
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED,
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED,

	/*
	 * Add ourselves to the parent's list of children.
	 */
	vdev_add_child(parent, vd);
vdev_free(vdev_t *vd)
	spa_t *spa = vd->vdev_spa;

	/*
	 * vdev_free() implies closing the vdev first.  This is simpler than
	 * trying to ensure complicated semantics for all callers.
	 */
	ASSERT(!list_link_active(&vd->vdev_config_dirty_node));

	for (c = 0; c < vd->vdev_children; c++)
		vdev_free(vd->vdev_child[c]);

	ASSERT(vd->vdev_child == NULL);
	ASSERT(vd->vdev_guid_sum == vd->vdev_guid);

	/*
	 * Discard allocation state.
	 */
	if (vd == vd->vdev_top)
		vdev_metaslab_fini(vd);

	ASSERT3U(vd->vdev_stat.vs_space, ==, 0);
	ASSERT3U(vd->vdev_stat.vs_dspace, ==, 0);
	ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0);

	/*
	 * Remove this vdev from its parent's child list.
	 */
	vdev_remove_child(vd->vdev_parent, vd);

	ASSERT(vd->vdev_parent == NULL);

	/*
	 * Clean up vdev structure.
	 */
	spa_strfree(vd->vdev_path);
	spa_strfree(vd->vdev_devid);
	if (vd->vdev_physpath)
		spa_strfree(vd->vdev_physpath);

	if (vd->vdev_isspare)
		spa_spare_remove(vd);
	if (vd->vdev_isl2cache)
		spa_l2cache_remove(vd);

	txg_list_destroy(&vd->vdev_ms_list);
	txg_list_destroy(&vd->vdev_dtl_list);
	mutex_enter(&vd->vdev_dtl_lock);
	space_map_unload(&vd->vdev_dtl_map);
	space_map_destroy(&vd->vdev_dtl_map);
	space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
	space_map_destroy(&vd->vdev_dtl_scrub);
	mutex_exit(&vd->vdev_dtl_lock);
	mutex_destroy(&vd->vdev_dtl_lock);
	mutex_destroy(&vd->vdev_stat_lock);
	mutex_destroy(&vd->vdev_probe_lock);

	if (vd == spa->spa_root_vdev)
		spa->spa_root_vdev = NULL;

	kmem_free(vd, sizeof (vdev_t));
/*
 * Transfer top-level vdev state from svd to tvd.
 */
vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
	spa_t *spa = svd->vdev_spa;

	ASSERT(tvd == tvd->vdev_top);

	tvd->vdev_ms_array = svd->vdev_ms_array;
	tvd->vdev_ms_shift = svd->vdev_ms_shift;
	tvd->vdev_ms_count = svd->vdev_ms_count;

	svd->vdev_ms_array = 0;
	svd->vdev_ms_shift = 0;
	svd->vdev_ms_count = 0;

	tvd->vdev_mg = svd->vdev_mg;
	tvd->vdev_ms = svd->vdev_ms;

	if (tvd->vdev_mg != NULL)
		tvd->vdev_mg->mg_vd = tvd;

	tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
	tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
	tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace;

	svd->vdev_stat.vs_alloc = 0;
	svd->vdev_stat.vs_space = 0;
	svd->vdev_stat.vs_dspace = 0;

	for (t = 0; t < TXG_SIZE; t++) {
		while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
			(void) txg_list_add(&tvd->vdev_ms_list, msp, t);
		while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
			(void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
		if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
			(void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);

	if (list_link_active(&svd->vdev_config_dirty_node)) {
		vdev_config_clean(svd);
		vdev_config_dirty(tvd);

	if (list_link_active(&svd->vdev_state_dirty_node)) {
		vdev_state_clean(svd);
		vdev_state_dirty(tvd);

	tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio;
	svd->vdev_deflate_ratio = 0;

	tvd->vdev_islog = svd->vdev_islog;
vdev_top_update(vdev_t *tvd, vdev_t *vd)
	for (c = 0; c < vd->vdev_children; c++)
		vdev_top_update(tvd, vd->vdev_child[c]);
/*
 * Add a mirror/replacing vdev above an existing vdev.
 */
vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
	spa_t *spa = cvd->vdev_spa;
	vdev_t *pvd = cvd->vdev_parent;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);

	mvd->vdev_asize = cvd->vdev_asize;
	mvd->vdev_ashift = cvd->vdev_ashift;
	mvd->vdev_state = cvd->vdev_state;

	vdev_remove_child(pvd, cvd);
	vdev_add_child(pvd, mvd);
	cvd->vdev_id = mvd->vdev_children;
	vdev_add_child(mvd, cvd);
	vdev_top_update(cvd->vdev_top, cvd->vdev_top);

	if (mvd == mvd->vdev_top)
		vdev_top_transfer(cvd, mvd);
/*
 * Remove a 1-way mirror/replacing vdev from the tree.
 */
vdev_remove_parent(vdev_t *cvd)
	vdev_t *mvd = cvd->vdev_parent;
	vdev_t *pvd = mvd->vdev_parent;

	ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	ASSERT(mvd->vdev_children == 1);
	ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
	    mvd->vdev_ops == &vdev_replacing_ops ||
	    mvd->vdev_ops == &vdev_spare_ops);
	cvd->vdev_ashift = mvd->vdev_ashift;

	vdev_remove_child(mvd, cvd);
	vdev_remove_child(pvd, mvd);
	/*
	 * If cvd will replace mvd as a top-level vdev, preserve mvd's guid.
	 * Otherwise, we could have detached an offline device, and when we
	 * go to import the pool we'll think we have two top-level vdevs,
	 * instead of a different version of the same top-level vdev.
	 */
	if (mvd->vdev_top == mvd)
		cvd->vdev_guid = cvd->vdev_guid_sum = mvd->vdev_guid;
	cvd->vdev_id = mvd->vdev_id;
	vdev_add_child(pvd, cvd);
	vdev_top_update(cvd->vdev_top, cvd->vdev_top);

	if (cvd == cvd->vdev_top)
		vdev_top_transfer(mvd, cvd);

	ASSERT(mvd->vdev_children == 0);
vdev_metaslab_init(vdev_t *vd, uint64_t txg)
	spa_t *spa = vd->vdev_spa;
	objset_t *mos = spa->spa_meta_objset;
	metaslab_class_t *mc;
	uint64_t oldc = vd->vdev_ms_count;
	uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;

	if (vd->vdev_ms_shift == 0)	/* not being allocated from yet */

	ASSERT(oldc <= newc);

		mc = spa->spa_log_class;
		mc = spa->spa_normal_class;

	if (vd->vdev_mg == NULL)
		vd->vdev_mg = metaslab_group_create(mc, vd);

	mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);

		bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp));
		kmem_free(vd->vdev_ms, oldc * sizeof (*mspp));

	vd->vdev_ms_count = newc;

	for (m = oldc; m < newc; m++) {
		space_map_obj_t smo = { 0, 0, 0 };
			error = dmu_read(mos, vd->vdev_ms_array,
			    m * sizeof (uint64_t), sizeof (uint64_t), &object);
			error = dmu_bonus_hold(mos, object, FTAG, &db);
			ASSERT3U(db->db_size, >=, sizeof (smo));
			bcopy(db->db_data, &smo, sizeof (smo));
			ASSERT3U(smo.smo_object, ==, object);
			dmu_buf_rele(db, FTAG);

		vd->vdev_ms[m] = metaslab_init(vd->vdev_mg, &smo,
		    m << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg);
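	/*
	 * Note: newc is vdev_asize divided by the metaslab size
	 * (1 << vdev_ms_shift), so when a vdev grows, the oldc existing
	 * metaslabs are carried over by the bcopy() above and only the new
	 * slots in [oldc, newc) are initialized from the MOS object array.
	 */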
vdev_metaslab_fini(vdev_t *vd)
	uint64_t count = vd->vdev_ms_count;

	if (vd->vdev_ms != NULL) {
		for (m = 0; m < count; m++)
			if (vd->vdev_ms[m] != NULL)
				metaslab_fini(vd->vdev_ms[m]);
		kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
typedef struct vdev_probe_stats {
	boolean_t	vps_readable;
	boolean_t	vps_writeable;
} vdev_probe_stats_t;
vdev_probe_done(zio_t *zio)
	vdev_probe_stats_t *vps = zio->io_private;
	vdev_t *vd = vps->vps_vd;

	if (zio->io_type == ZIO_TYPE_READ) {
		ASSERT(zio->io_vd == vd);
		if (zio->io_error == 0)
			vps->vps_readable = 1;
		if (zio->io_error == 0 && (spa_mode & FWRITE)) {
			zio_nowait(zio_write_phys(vps->vps_root, vd,
			    zio->io_offset, zio->io_size, zio->io_data,
			    ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
			    ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE));
			zio_buf_free(zio->io_data, zio->io_size);
	} else if (zio->io_type == ZIO_TYPE_WRITE) {
		ASSERT(zio->io_vd == vd);
		if (zio->io_error == 0)
			vps->vps_writeable = 1;
		zio_buf_free(zio->io_data, zio->io_size);
	} else if (zio->io_type == ZIO_TYPE_NULL) {
		ASSERT(zio->io_vd == NULL);
		ASSERT(zio == vps->vps_root);

		vd->vdev_cant_read |= !vps->vps_readable;
		vd->vdev_cant_write |= !vps->vps_writeable;

		if (vdev_readable(vd) &&
		    (vdev_writeable(vd) || !(spa_mode & FWRITE))) {
			ASSERT(zio->io_error != 0);
			zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
			    zio->io_spa, vd, NULL, 0, 0);
			zio->io_error = ENXIO;

		kmem_free(vps, sizeof (*vps));
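	/*
	 * Note: when the pool is opened for write, each successful label-pad
	 * read above is immediately reissued as a write of the same data to
	 * the same offset, so a single probe exercises both directions of
	 * the device.
	 */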
/*
 * Determine whether this device is accessible by reading and writing
 * to several known locations: the pad regions of each vdev label
 * but the first (which we leave alone in case it contains a VTOC).
 */
vdev_probe(vdev_t *vd, zio_t *pio)
	spa_t *spa = vd->vdev_spa;
	vdev_probe_stats_t *vps;

	vps = kmem_zalloc(sizeof (*vps), KM_SLEEP);

	vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
	    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_DONT_RETRY;

	if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
		/*
		 * vdev_cant_read and vdev_cant_write can only transition
		 * from TRUE to FALSE when we have the SCL_ZIO lock as writer;
		 * otherwise they can only transition from FALSE to TRUE.
		 * This ensures that any zio looking at these values can
		 * assume that failures persist for the life of the I/O.
		 * That's important because when a device has intermittent
		 * connectivity problems, we want to ensure that they're
		 * ascribed to the device (ENXIO) and not the zio (EIO).
		 *
		 * Since we hold SCL_ZIO as writer here, clear both values
		 * so the probe can reevaluate from first principles.
		 */
		vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER;
		vd->vdev_cant_read = B_FALSE;
		vd->vdev_cant_write = B_FALSE;

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	zio = zio_null(pio, spa, vdev_probe_done, vps, vps->vps_flags);

	for (int l = 1; l < VDEV_LABELS; l++) {
		zio_nowait(zio_read_phys(zio, vd,
		    vdev_label_offset(vd->vdev_psize, l,
		    offsetof(vdev_label_t, vl_pad)),
		    VDEV_SKIP_SIZE, zio_buf_alloc(VDEV_SKIP_SIZE),
		    ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
		    ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE));
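	/*
	 * Note: the null zio created above is the parent that collects the
	 * probe results; its ZIO_TYPE_NULL completion in vdev_probe_done()
	 * folds vps_readable/vps_writeable into vdev_cant_read/vdev_cant_write.
	 */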
/*
 * Prepare a virtual device for access.
 */
vdev_open(vdev_t *vd)
	uint64_t asize, psize;

	ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
	    vd->vdev_state == VDEV_STATE_CANT_OPEN ||
	    vd->vdev_state == VDEV_STATE_OFFLINE);

	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;

	if (!vd->vdev_removed && vd->vdev_faulted) {
		ASSERT(vd->vdev_children == 0);
		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
		    VDEV_AUX_ERR_EXCEEDED);
	} else if (vd->vdev_offline) {
		ASSERT(vd->vdev_children == 0);
		vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE);

	error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift);
	dprintf("vdev_op_open size %" PRIu64 " %d\n", osize, zio_injection_enabled);
	if (zio_injection_enabled && error == 0)
		error = zio_handle_device_injection(vd, ENXIO);

		if (vd->vdev_removed &&
		    vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED)
			vd->vdev_removed = B_FALSE;

		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    vd->vdev_stat.vs_aux);

	vd->vdev_removed = B_FALSE;

	if (vd->vdev_degraded) {
		ASSERT(vd->vdev_children == 0);
		vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
		    VDEV_AUX_ERR_EXCEEDED);
		vd->vdev_state = VDEV_STATE_HEALTHY;

	for (c = 0; c < vd->vdev_children; c++)
		if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,

	osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));

	if (vd->vdev_children == 0) {
		if (osize < SPA_MINDEVSIZE) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
		if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE -
		    (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,

	vd->vdev_psize = psize;

	if (vd->vdev_asize == 0) {
		/*
		 * This is the first-ever open, so use the computed values.
		 * For testing purposes, a higher ashift can be requested.
		 */
		vd->vdev_asize = asize;
		vd->vdev_ashift = MAX(ashift, vd->vdev_ashift);
		/*
		 * Make sure the alignment requirement hasn't increased.
		 */
		if (ashift > vd->vdev_top->vdev_ashift) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_BAD_LABEL);

		/*
		 * Make sure the device hasn't shrunk.
		 */
		if (asize < vd->vdev_asize) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_BAD_LABEL);

		/*
		 * If all children are healthy and the asize has increased,
		 * then we've experienced dynamic LUN growth.
		 */
		if (vd->vdev_state == VDEV_STATE_HEALTHY &&
		    asize > vd->vdev_asize) {
			vd->vdev_asize = asize;

	/*
	 * Ensure we can issue some IO before declaring the
	 * vdev open for business.
	 */
	if (vd->vdev_ops->vdev_op_leaf &&
	    (error = zio_wait(vdev_probe(vd, NULL))) != 0) {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_IO_FAILURE);

	/*
	 * If this is a top-level vdev, compute the raidz-deflation
	 * ratio.  Note, we hard-code in 128k (1<<17) because it is the
	 * current "typical" blocksize.  Even if SPA_MAXBLOCKSIZE
	 * changes, this algorithm must never change, or we will
	 * inconsistently account for existing bp's.
	 */
	if (vd->vdev_top == vd) {
		vd->vdev_deflate_ratio = (1<<17) /
		    (vdev_psize_to_asize(vd, 1<<17) >> SPA_MINBLOCKSHIFT);
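		/*
		 * Worked example: SPA_MINBLOCKSHIFT is 9, so for a vdev with
		 * no space expansion (asize == psize) this is
		 * 128K / (128K >> 9) = 512; a RAID-Z vdev reports a larger
		 * asize for the same psize and therefore gets a
		 * proportionally smaller deflate ratio.
		 */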
	/*
	 * If a leaf vdev has a DTL, and seems healthy, then kick off a
	 * resilver.  But don't do this if we are doing a reopen for a
	 * scrub, since this would just restart the scrub we are already
	 * doing.
	 */
	if (vd->vdev_children == 0 && !vd->vdev_spa->spa_scrub_reopen) {
		mutex_enter(&vd->vdev_dtl_lock);
		if (vd->vdev_dtl_map.sm_space != 0 && vdev_writeable(vd))
			spa_async_request(vd->vdev_spa, SPA_ASYNC_RESILVER);
		mutex_exit(&vd->vdev_dtl_lock);
/*
 * Called once the vdevs are all opened, this routine validates the label
 * contents.  This needs to be done before vdev_load() so that we don't
 * inadvertently do repair I/Os to the wrong device.
 *
 * This function will only return failure if one of the vdevs indicates that it
 * has since been destroyed or exported.  This is only possible if
 * /etc/zfs/zpool.cache was readonly at the time.  Otherwise, the vdev state
 * will be updated but the function will return 0.
 */
vdev_validate(vdev_t *vd)
	spa_t *spa = vd->vdev_spa;
	uint64_t guid, top_guid;

	for (c = 0; c < vd->vdev_children; c++)
		if (vdev_validate(vd->vdev_child[c]) != 0)

	/*
	 * If the device has already failed, or was marked offline, don't do
	 * any further validation.  Otherwise, label I/O will fail and we will
	 * overwrite the previous state.
	 */
	if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {

		if ((label = vdev_label_read_config(vd)) == NULL) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_BAD_LABEL);

		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
		    &guid) != 0 || guid != spa_guid(spa)) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);

		/*
		 * If this vdev just became a top-level vdev because its
		 * sibling was detached, it will have adopted the parent's
		 * vdev guid -- but the label may or may not be on disk yet.
		 * Fortunately, either version of the label will have the
		 * same top guid, so if we're a top-level vdev, we can
		 * safely compare to that instead.
		 */
		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
		    nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID,
		    (vd->vdev_guid != guid &&
		    (vd->vdev_guid != top_guid || vd != vd->vdev_top))) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);

		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);

		if (spa->spa_load_state == SPA_LOAD_OPEN &&
		    state != POOL_STATE_ACTIVE)

		/*
		 * If we were able to open and validate a vdev that was
		 * previously marked permanently unavailable, clear that state
		 */
		if (vd->vdev_not_present)
			vd->vdev_not_present = 0;
/*
 * Close a virtual device.
 */
vdev_close(vdev_t *vd)
	vd->vdev_ops->vdev_op_close(vd);

	vdev_cache_purge(vd);

	/*
	 * We record the previous state before we close it, so that if we are
	 * doing a reopen(), we don't generate FMA ereports if we notice that
	 * it's still faulted.
	 */
	vd->vdev_prevstate = vd->vdev_state;

	if (vd->vdev_offline)
		vd->vdev_state = VDEV_STATE_OFFLINE;

		vd->vdev_state = VDEV_STATE_CLOSED;

	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
vdev_reopen(vdev_t *vd)
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);

	(void) vdev_open(vd);

	/*
	 * Call vdev_validate() here to make sure we have the same device.
	 * Otherwise, a device with an invalid label could be successfully
	 * opened in response to vdev_reopen().
	 */
		(void) vdev_validate_aux(vd);
		if (vdev_readable(vd) && vdev_writeable(vd) &&
		    !l2arc_vdev_present(vd)) {
			uint64_t size = vdev_get_rsize(vd);
			l2arc_add_vdev(spa, vd,
			    VDEV_LABEL_START_SIZE,
			    size - VDEV_LABEL_START_SIZE);
		(void) vdev_validate(vd);

	/*
	 * Reassess parent vdev's health.
	 */
	vdev_propagate_state(vd);
vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
	/*
	 * Normally, partial opens (e.g. of a mirror) are allowed.
	 * For a create, however, we want to fail the request if
	 * there are any components we can't open.
	 */
	error = vdev_open(vd);

	if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
		return (error ? error : ENXIO);

	/*
	 * Recursively initialize all labels.
	 */
	if ((error = vdev_label_init(vd, txg, isreplacing ?
	    VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) {
/*
 * This is the latter half of vdev_create().  It is distinct because it
 * involves initiating transactions in order to do metaslab creation.
 * For creation, we want to try to create all vdevs at once and then undo it
 * if anything fails; this is much harder if we have pending transactions.
 */
vdev_init(vdev_t *vd, uint64_t txg)
	/*
	 * Aim for roughly 200 metaslabs per vdev.
	 */
	vd->vdev_ms_shift = highbit(vd->vdev_asize / 200);
	vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);

	/*
	 * Initialize the vdev's metaslabs.  This can't fail because
	 * there's nothing to read when creating all new metaslabs.
	 */
	VERIFY(vdev_metaslab_init(vd, txg) == 0);
vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
	ASSERT(vd == vd->vdev_top);
	ASSERT(ISP2(flags));

	if (flags & VDD_METASLAB)
		(void) txg_list_add(&vd->vdev_ms_list, arg, txg);

	if (flags & VDD_DTL)
		(void) txg_list_add(&vd->vdev_dtl_list, arg, txg);

	(void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size)
	mutex_enter(sm->sm_lock);
	if (!space_map_contains(sm, txg, size))
		space_map_add(sm, txg, size);
	mutex_exit(sm->sm_lock);

vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size)
	/*
	 * Quick test without the lock -- covers the common case that
	 * there are no dirty time segments.
	 */
	if (sm->sm_space == 0)

	mutex_enter(sm->sm_lock);
	dirty = space_map_contains(sm, txg, size);
	mutex_exit(sm->sm_lock);
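/*
 * Note: a DTL (dirty time log) space map records transaction groups, not
 * byte offsets -- an entry covering [txg, txg + size) means the vdev may be
 * missing writes from those txgs.
 */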
/*
 * Reassess DTLs after a config change or scrub completion.
 */
vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));

	if (vd->vdev_children == 0) {
		mutex_enter(&vd->vdev_dtl_lock);
		if (scrub_txg != 0 &&
		    (spa->spa_scrub_started || spa->spa_scrub_errors == 0)) {
			/* XXX should check scrub_done? */
			/*
			 * We completed a scrub up to scrub_txg.  If we
			 * did it without rebooting, then the scrub dtl
			 * will be valid, so excise the old region and
			 * fold in the scrub dtl.  Otherwise, leave the
			 * dtl as-is if there was an error.
			 */
			space_map_excise(&vd->vdev_dtl_map, 0, scrub_txg);
			space_map_union(&vd->vdev_dtl_map, &vd->vdev_dtl_scrub);

			space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
		mutex_exit(&vd->vdev_dtl_lock);

			vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);

	/*
	 * Make sure the DTLs are always correct under the scrub lock.
	 */
	if (vd == spa->spa_root_vdev)
		mutex_enter(&spa->spa_scrub_lock);

	mutex_enter(&vd->vdev_dtl_lock);
	space_map_vacate(&vd->vdev_dtl_map, NULL, NULL);
	space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
	mutex_exit(&vd->vdev_dtl_lock);

	for (c = 0; c < vd->vdev_children; c++) {
		vdev_t *cvd = vd->vdev_child[c];
		vdev_dtl_reassess(cvd, txg, scrub_txg, scrub_done);
		mutex_enter(&vd->vdev_dtl_lock);
		space_map_union(&vd->vdev_dtl_map, &cvd->vdev_dtl_map);
		space_map_union(&vd->vdev_dtl_scrub, &cvd->vdev_dtl_scrub);
		mutex_exit(&vd->vdev_dtl_lock);

	if (vd == spa->spa_root_vdev)
		mutex_exit(&spa->spa_scrub_lock);
vdev_dtl_load(vdev_t *vd)
	spa_t *spa = vd->vdev_spa;
	space_map_obj_t *smo = &vd->vdev_dtl;
	objset_t *mos = spa->spa_meta_objset;

	ASSERT(vd->vdev_children == 0);

	if (smo->smo_object == 0)

	if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0)

	ASSERT3U(db->db_size, >=, sizeof (*smo));
	bcopy(db->db_data, smo, sizeof (*smo));
	dmu_buf_rele(db, FTAG);

	mutex_enter(&vd->vdev_dtl_lock);
	error = space_map_load(&vd->vdev_dtl_map, NULL, SM_ALLOC, smo, mos);
	mutex_exit(&vd->vdev_dtl_lock);
vdev_dtl_sync(vdev_t *vd, uint64_t txg)
	spa_t *spa = vd->vdev_spa;
	space_map_obj_t *smo = &vd->vdev_dtl;
	space_map_t *sm = &vd->vdev_dtl_map;
	objset_t *mos = spa->spa_meta_objset;

	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);

	if (vd->vdev_detached) {
		if (smo->smo_object != 0) {
			int err = dmu_object_free(mos, smo->smo_object, tx);
			ASSERT3U(err, ==, 0);
			smo->smo_object = 0;

	if (smo->smo_object == 0) {
		ASSERT(smo->smo_objsize == 0);
		ASSERT(smo->smo_alloc == 0);
		smo->smo_object = dmu_object_alloc(mos,
		    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
		    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
		ASSERT(smo->smo_object != 0);
		vdev_config_dirty(vd->vdev_top);

	mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL);

	space_map_create(&smsync, sm->sm_start, sm->sm_size, sm->sm_shift,

	mutex_enter(&smlock);

	mutex_enter(&vd->vdev_dtl_lock);
	space_map_walk(sm, space_map_add, &smsync);
	mutex_exit(&vd->vdev_dtl_lock);

	space_map_truncate(smo, mos, tx);
	space_map_sync(&smsync, SM_ALLOC, smo, mos, tx);

	space_map_destroy(&smsync);

	mutex_exit(&smlock);
	mutex_destroy(&smlock);

	VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
	dmu_buf_will_dirty(db, tx);
	ASSERT3U(db->db_size, >=, sizeof (*smo));
	bcopy(smo, db->db_data, sizeof (*smo));
	dmu_buf_rele(db, FTAG);
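/*
 * Note: the sync path above copies the in-core DTL into a private smsync map
 * under a throwaway lock, truncates the on-disk space map object, rewrites it
 * in full, and finally updates the object's bonus buffer with the new
 * space_map_obj_t header.
 */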
/*
 * Determine if resilver is needed, and if so the txg range.
 */
vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
	boolean_t needed = B_FALSE;
	uint64_t thismin = UINT64_MAX;
	uint64_t thismax = 0;

	if (vd->vdev_children == 0) {
		mutex_enter(&vd->vdev_dtl_lock);
		if (vd->vdev_dtl_map.sm_space != 0 && vdev_writeable(vd)) {

			ss = avl_first(&vd->vdev_dtl_map.sm_root);
			thismin = ss->ss_start - 1;
			ss = avl_last(&vd->vdev_dtl_map.sm_root);
			thismax = ss->ss_end;

		mutex_exit(&vd->vdev_dtl_lock);

		for (c = 0; c < vd->vdev_children; c++) {
			vdev_t *cvd = vd->vdev_child[c];
			uint64_t cmin, cmax;

			if (vdev_resilver_needed(cvd, &cmin, &cmax)) {
				thismin = MIN(thismin, cmin);
				thismax = MAX(thismax, cmax);

	if (needed && minp) {
vdev_load(vdev_t *vd)
	/*
	 * Recursively load all children.
	 */
	for (c = 0; c < vd->vdev_children; c++)
		vdev_load(vd->vdev_child[c]);

	/*
	 * If this is a top-level vdev, initialize its metaslabs.
	 */
	if (vd == vd->vdev_top &&
	    (vd->vdev_ashift == 0 || vd->vdev_asize == 0 ||
	    vdev_metaslab_init(vd, 0) != 0))
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);

	/*
	 * If this is a leaf vdev, load its DTL.
	 */
	if (vd->vdev_ops->vdev_op_leaf && vdev_dtl_load(vd) != 0)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
/*
 * The special vdev case is used for hot spares and l2cache devices.  Its
 * sole purpose is to set the vdev state for the associated vdev.  To do this,
 * we make sure that we can open the underlying device, then try to read the
 * label, and make sure that the label is sane and that it hasn't been
 * repurposed to another pool.
 */
vdev_validate_aux(vdev_t *vd)
	uint64_t guid, version;

	if (!vdev_readable(vd))

	if ((label = vdev_label_read_config(vd)) == NULL) {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);

	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 ||
	    version > SPA_VERSION ||
	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 ||
	    guid != vd->vdev_guid ||
	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);

	/*
	 * We don't actually check the pool state here.  If it's in fact in
	 * use by another pool, we update this fact on the fly when requested.
	 */
vdev_sync_done(vdev_t *vd, uint64_t txg)
	while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
		metaslab_sync_done(msp, txg);

vdev_sync(vdev_t *vd, uint64_t txg)
	spa_t *spa = vd->vdev_spa;

	if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) {
		ASSERT(vd == vd->vdev_top);
		tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
		vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
		    DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
		ASSERT(vd->vdev_ms_array != 0);
		vdev_config_dirty(vd);

	while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
		metaslab_sync(msp, txg);
		(void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));

	while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
		vdev_dtl_sync(lvd, txg);

	(void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
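/*
 * Note: metaslabs synced in this txg are re-added to the TXG_CLEAN(txg) list
 * above so that vdev_sync_done() can run metaslab_sync_done() on them once
 * the txg has fully synced.
 */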
vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
	return (vd->vdev_ops->vdev_op_asize(vd, psize));
/*
 * Mark the given vdev faulted.  A faulted vdev behaves as if the device could
 * not be opened, and no I/O is attempted.
 */
vdev_fault(spa_t *spa, uint64_t guid)
	spa_vdev_state_enter(spa);

	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
		return (spa_vdev_state_exit(spa, NULL, ENODEV));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));

	/*
	 * Faulted state takes precedence over degraded.
	 */
	vd->vdev_faulted = 1ULL;
	vd->vdev_degraded = 0ULL;
	vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, VDEV_AUX_ERR_EXCEEDED);

	/*
	 * If marking the vdev as faulted causes the top-level vdev to become
	 * unavailable, then back off and simply mark the vdev as degraded
	 */
	if (vdev_is_dead(vd->vdev_top) && vd->vdev_aux == NULL) {
		vd->vdev_degraded = 1ULL;
		vd->vdev_faulted = 0ULL;

		/*
		 * If we reopen the device and it's not dead, only then do we
		 */
		if (vdev_readable(vd)) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
			    VDEV_AUX_ERR_EXCEEDED);

	return (spa_vdev_state_exit(spa, vd, 0));
1670 * user that something is wrong. The vdev continues to operate as normal as far
1671 * as I/O is concerned.
1674 vdev_degrade(spa_t
*spa
, uint64_t guid
)
1678 spa_vdev_state_enter(spa
);
1680 if ((vd
= spa_lookup_by_guid(spa
, guid
, B_TRUE
)) == NULL
)
1681 return (spa_vdev_state_exit(spa
, NULL
, ENODEV
));
1683 if (!vd
->vdev_ops
->vdev_op_leaf
)
1684 return (spa_vdev_state_exit(spa
, NULL
, ENOTSUP
));
1687 * If the vdev is already faulted, then don't do anything.
1689 if (vd
->vdev_faulted
|| vd
->vdev_degraded
)
1690 return (spa_vdev_state_exit(spa
, NULL
, 0));
1692 vd
->vdev_degraded
= 1ULL;
1693 if (!vdev_is_dead(vd
))
1694 vdev_set_state(vd
, B_FALSE
, VDEV_STATE_DEGRADED
,
1695 VDEV_AUX_ERR_EXCEEDED
);
1697 return (spa_vdev_state_exit(spa
, vd
, 0));
/*
 * Online the given vdev.  If 'unspare' is set, it implies two things.  First,
 * any attached spare device should be detached when the device finishes
 * resilvering.  Second, the online should be treated like a 'test' online case,
 * so no FMA events are generated if the device fails to open.
 */
vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
	spa_vdev_state_enter(spa);

	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
		return (spa_vdev_state_exit(spa, NULL, ENODEV));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));

	vd->vdev_offline = B_FALSE;
	vd->vdev_tmpoffline = B_FALSE;
	vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE);
	vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT);
	vdev_reopen(vd->vdev_top);
	vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE;

	*newstate = vd->vdev_state;
	if ((flags & ZFS_ONLINE_UNSPARE) &&
	    !vdev_is_dead(vd) && vd->vdev_parent &&
	    vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
	    vd->vdev_parent->vdev_child[0] == vd)
		vd->vdev_unspare = B_TRUE;

	(void) spa_vdev_state_exit(spa, vd, 0);

	VERIFY3U(spa_scrub(spa, POOL_SCRUB_RESILVER), ==, 0);
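/*
 * Note: the ZFS_ONLINE_UNSPARE handling above only applies when the device
 * being onlined is child 0 of a spare vdev -- presumably the original device
 * rather than the hot spare -- so the spare can be detached once resilvering
 * completes.
 */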
vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
	spa_vdev_state_enter(spa);

	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
		return (spa_vdev_state_exit(spa, NULL, ENODEV));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));

	/*
	 * If the device isn't already offline, try to offline it.
	 */
	if (!vd->vdev_offline) {
		/*
		 * If this device's top-level vdev has a non-empty DTL,
		 * don't allow the device to be offlined.
		 *
		 * XXX -- make this more precise by allowing the offline
		 * as long as the remaining devices don't have any DTL holes.
		 */
		if (vd->vdev_top->vdev_dtl_map.sm_space != 0)
			return (spa_vdev_state_exit(spa, NULL, EBUSY));

		/*
		 * Offline this device and reopen its top-level vdev.
		 * If this action results in the top-level vdev becoming
		 * unusable, undo it and fail the request.
		 */
		vd->vdev_offline = B_TRUE;
		vdev_reopen(vd->vdev_top);
		if (vdev_is_dead(vd->vdev_top) && vd->vdev_aux == NULL) {
			vd->vdev_offline = B_FALSE;
			vdev_reopen(vd->vdev_top);
			return (spa_vdev_state_exit(spa, NULL, EBUSY));

	vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY);

	return (spa_vdev_state_exit(spa, vd, 0));
/*
 * Clear the error counts associated with this vdev.  Unlike vdev_online() and
 * vdev_offline(), we assume the spa config is locked.  We also clear all
 * children.  If 'vd' is NULL, then the user wants to clear all vdevs.
 */
vdev_clear(spa_t *spa, vdev_t *vd)
	vdev_t *rvd = spa->spa_root_vdev;

	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);

	vd->vdev_stat.vs_read_errors = 0;
	vd->vdev_stat.vs_write_errors = 0;
	vd->vdev_stat.vs_checksum_errors = 0;

	for (int c = 0; c < vd->vdev_children; c++)
		vdev_clear(spa, vd->vdev_child[c]);

	/*
	 * If we're in the FAULTED state or have experienced failed I/O, then
	 * clear the persistent state and attempt to reopen the device.  We
	 * also mark the vdev config dirty, so that the new faulted state is
	 * written out to disk.
	 */
	if (vd->vdev_faulted || vd->vdev_degraded ||
	    !vdev_readable(vd) || !vdev_writeable(vd)) {

		vd->vdev_faulted = vd->vdev_degraded = 0;
		vd->vdev_cant_read = B_FALSE;
		vd->vdev_cant_write = B_FALSE;

		vdev_state_dirty(vd->vdev_top);

		if (vd->vdev_aux == NULL && !vdev_is_dead(vd))
			spa_async_request(spa, SPA_ASYNC_RESILVER);

		spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR);
vdev_is_dead(vdev_t *vd)
	return (vd->vdev_state < VDEV_STATE_DEGRADED);

vdev_readable(vdev_t *vd)
	return (!vdev_is_dead(vd) && !vd->vdev_cant_read);

vdev_writeable(vdev_t *vd)
	return (!vdev_is_dead(vd) && !vd->vdev_cant_write);
vdev_allocatable(vdev_t *vd)
	/*
	 * We currently allow allocations from vdevs which may be in the
	 * process of reopening (i.e. VDEV_STATE_CLOSED).  If the device
	 * fails to reopen then we'll catch it later when we're holding
	 */
	return (!(vdev_is_dead(vd) && vd->vdev_state != VDEV_STATE_CLOSED) &&
	    !vd->vdev_cant_write);
vdev_accessible(vdev_t *vd, zio_t *zio)
	ASSERT(zio->io_vd == vd);

	if (vdev_is_dead(vd) || vd->vdev_remove_wanted)

	if (zio->io_type == ZIO_TYPE_READ)
		return (!vd->vdev_cant_read);

	if (zio->io_type == ZIO_TYPE_WRITE)
		return (!vd->vdev_cant_write);
/*
 * Get statistics for the given vdev.
 */
vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
	vdev_t *rvd = vd->vdev_spa->spa_root_vdev;

	mutex_enter(&vd->vdev_stat_lock);
	bcopy(&vd->vdev_stat, vs, sizeof (*vs));
	vs->vs_scrub_errors = vd->vdev_spa->spa_scrub_errors;
	vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
	vs->vs_state = vd->vdev_state;
	vs->vs_rsize = vdev_get_rsize(vd);
	mutex_exit(&vd->vdev_stat_lock);

	/*
	 * If we're getting stats on the root vdev, aggregate the I/O counts
	 * over all top-level vdevs (i.e. the direct children of the root).
	 */
		for (int c = 0; c < rvd->vdev_children; c++) {
			vdev_t *cvd = rvd->vdev_child[c];
			vdev_stat_t *cvs = &cvd->vdev_stat;

			mutex_enter(&vd->vdev_stat_lock);
			for (int t = 0; t < ZIO_TYPES; t++) {
				vs->vs_ops[t] += cvs->vs_ops[t];
				vs->vs_bytes[t] += cvs->vs_bytes[t];
			vs->vs_scrub_examined += cvs->vs_scrub_examined;
			mutex_exit(&vd->vdev_stat_lock);
vdev_clear_stats(vdev_t *vd)
	mutex_enter(&vd->vdev_stat_lock);
	vd->vdev_stat.vs_space = 0;
	vd->vdev_stat.vs_dspace = 0;
	vd->vdev_stat.vs_alloc = 0;
	mutex_exit(&vd->vdev_stat_lock);
vdev_stat_update(zio_t *zio, uint64_t psize)
	vdev_t *rvd = zio->io_spa->spa_root_vdev;
	vdev_t *vd = zio->io_vd ? zio->io_vd : rvd;
	uint64_t txg = zio->io_txg;
	vdev_stat_t *vs = &vd->vdev_stat;
	zio_type_t type = zio->io_type;
	int flags = zio->io_flags;

	/*
	 * If this i/o is a gang leader, it didn't do any actual work.
	 */
	if (zio->io_gang_tree)

	if (zio->io_error == 0) {
		/*
		 * If this is a root i/o, don't count it -- we've already
		 * counted the top-level vdevs, and vdev_get_stats() will
		 * aggregate them when asked.  This reduces contention on
		 * the root vdev_stat_lock and implicitly handles blocks
		 * that compress away to holes, for which there is no i/o.
		 * (Holes never create vdev children, so all the counters
		 * remain zero, which is what we want.)
		 *
		 * Note: this only applies to successful i/o (io_error == 0)
		 * because unlike i/o counts, errors are not additive.
		 * When reading a ditto block, for example, failure of
		 * one top-level vdev does not imply a root-level error.
		 */
		ASSERT(vd == zio->io_vd);
		if (!(flags & ZIO_FLAG_IO_BYPASS)) {
			mutex_enter(&vd->vdev_stat_lock);
			vs->vs_bytes[type] += psize;
			mutex_exit(&vd->vdev_stat_lock);

		if (flags & ZIO_FLAG_IO_REPAIR) {
			ASSERT(zio->io_delegate_list == NULL);
			mutex_enter(&vd->vdev_stat_lock);
			if (flags & ZIO_FLAG_SCRUB_THREAD)
				vs->vs_scrub_repaired += psize;
				vs->vs_self_healed += psize;
			mutex_exit(&vd->vdev_stat_lock);

	if (flags & ZIO_FLAG_SPECULATIVE)

	mutex_enter(&vd->vdev_stat_lock);
	if (type == ZIO_TYPE_READ) {
		if (zio->io_error == ECKSUM)
			vs->vs_checksum_errors++;
			vs->vs_read_errors++;

	if (type == ZIO_TYPE_WRITE)
		vs->vs_write_errors++;
	mutex_exit(&vd->vdev_stat_lock);

	if (type == ZIO_TYPE_WRITE && txg != 0 && vd->vdev_children == 0) {
		if (flags & ZIO_FLAG_SCRUB_THREAD) {
			ASSERT(flags & ZIO_FLAG_IO_REPAIR);
			for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
				vdev_dtl_dirty(&pvd->vdev_dtl_scrub, txg, 1);

		if (!(flags & ZIO_FLAG_IO_REPAIR)) {
			if (vdev_dtl_contains(&vd->vdev_dtl_map, txg, 1))
			vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
			for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
				vdev_dtl_dirty(&pvd->vdev_dtl_map, txg, 1);
vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete)
	vdev_stat_t *vs = &vd->vdev_stat;

	for (c = 0; c < vd->vdev_children; c++)
		vdev_scrub_stat_update(vd->vdev_child[c], type, complete);

	mutex_enter(&vd->vdev_stat_lock);

	if (type == POOL_SCRUB_NONE) {
		/*
		 * Update completion and end time.  Leave everything else alone
		 * so we can report what happened during the previous scrub.
		 */
		vs->vs_scrub_complete = complete;
		vs->vs_scrub_end = gethrestime_sec();
		vs->vs_scrub_type = type;
		vs->vs_scrub_complete = 0;
		vs->vs_scrub_examined = 0;
		vs->vs_scrub_repaired = 0;
		vs->vs_scrub_start = gethrestime_sec();
		vs->vs_scrub_end = 0;

	mutex_exit(&vd->vdev_stat_lock);
/*
 * Update the in-core space usage stats for this vdev and the root vdev.
 */
vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta,
    boolean_t update_root)
	int64_t dspace_delta = space_delta;
	spa_t *spa = vd->vdev_spa;
	vdev_t *rvd = spa->spa_root_vdev;

	ASSERT(vd == vd->vdev_top);

	/*
	 * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion
	 * factor.  We must calculate this here and not at the root vdev
	 * because the root vdev's psize-to-asize is simply the max of its
	 * childrens', thus not accurate enough for us.
	 */
	ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0);
	dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) *
	    vd->vdev_deflate_ratio;

	mutex_enter(&vd->vdev_stat_lock);
	vd->vdev_stat.vs_space += space_delta;
	vd->vdev_stat.vs_alloc += alloc_delta;
	vd->vdev_stat.vs_dspace += dspace_delta;
	mutex_exit(&vd->vdev_stat_lock);

		ASSERT(rvd == vd->vdev_parent);
		ASSERT(vd->vdev_ms_count != 0);

		/*
		 * Don't count non-normal (e.g. intent log) space as part of
		 * the pool's capacity.
		 */
		if (vd->vdev_mg->mg_class != spa->spa_normal_class)

		mutex_enter(&rvd->vdev_stat_lock);
		rvd->vdev_stat.vs_space += space_delta;
		rvd->vdev_stat.vs_alloc += alloc_delta;
		rvd->vdev_stat.vs_dspace += dspace_delta;
		mutex_exit(&rvd->vdev_stat_lock);
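/*
 * Note: with the deflate ratio computed in vdev_open() (512 when
 * asize == psize), (space_delta >> SPA_MINBLOCKSHIFT) * vdev_deflate_ratio
 * leaves the delta unchanged; RAID-Z top-level vdevs shrink it to the
 * post-parity ("deflated") equivalent.
 */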
/*
 * Mark a top-level vdev's config as dirty, placing it on the dirty list
 * so that it will be written out next time the vdev configuration is synced.
 * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
 */
vdev_config_dirty(vdev_t *vd)
	spa_t *spa = vd->vdev_spa;
	vdev_t *rvd = spa->spa_root_vdev;

	/*
	 * If this is an aux vdev (as with l2cache devices), then we update the
	 * vdev config manually and set the sync flag.
	 */
	if (vd->vdev_aux != NULL) {
		spa_aux_vdev_t *sav = vd->vdev_aux;

		for (c = 0; c < sav->sav_count; c++) {
			if (sav->sav_vdevs[c] == vd)

		if (c == sav->sav_count) {
			/*
			 * We're being removed.  There's nothing more to do.
			 */
			ASSERT(sav->sav_sync == B_TRUE);

		sav->sav_sync = B_TRUE;

		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
		    ZPOOL_CONFIG_L2CACHE, &aux, &naux) == 0);

		/*
		 * Setting the nvlist in the middle of the array is a little
		 * sketchy, but it will work.
		 */
		nvlist_free(aux[c]);
		aux[c] = vdev_config_generate(spa, vd, B_TRUE, B_FALSE, B_TRUE);

	/*
	 * The dirty list is protected by the SCL_CONFIG lock.  The caller
	 * must either hold SCL_CONFIG as writer, or must be the sync thread
	 * (which holds SCL_CONFIG as reader).  There's only one sync thread,
	 * so this is sufficient to ensure mutual exclusion.
	 */
	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
	    spa_config_held(spa, SCL_CONFIG, RW_READER)));

		for (c = 0; c < rvd->vdev_children; c++)
			vdev_config_dirty(rvd->vdev_child[c]);

		ASSERT(vd == vd->vdev_top);

		if (!list_link_active(&vd->vdev_config_dirty_node))
			list_insert_head(&spa->spa_config_dirty_list, vd);
vdev_config_clean(vdev_t *vd)
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
	    spa_config_held(spa, SCL_CONFIG, RW_READER)));

	ASSERT(list_link_active(&vd->vdev_config_dirty_node));
	list_remove(&spa->spa_config_dirty_list, vd);
/*
 * Mark a top-level vdev's state as dirty, so that the next pass of
 * spa_sync() can convert this into vdev_config_dirty().  We distinguish
 * the state changes from larger config changes because they require
 * much less locking, and are often needed for administrative actions.
 */
vdev_state_dirty(vdev_t *vd)
	spa_t *spa = vd->vdev_spa;

	ASSERT(vd == vd->vdev_top);

	/*
	 * The state list is protected by the SCL_STATE lock.  The caller
	 * must either hold SCL_STATE as writer, or must be the sync thread
	 * (which holds SCL_STATE as reader).  There's only one sync thread,
	 * so this is sufficient to ensure mutual exclusion.
	 */
	ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
	    spa_config_held(spa, SCL_STATE, RW_READER)));

	if (!list_link_active(&vd->vdev_state_dirty_node))
		list_insert_head(&spa->spa_state_dirty_list, vd);
vdev_state_clean(vdev_t *vd)
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
	    spa_config_held(spa, SCL_STATE, RW_READER)));

	ASSERT(list_link_active(&vd->vdev_state_dirty_node));
	list_remove(&spa->spa_state_dirty_list, vd);
/*
 * Propagate vdev state up from children to parent.
 */
vdev_propagate_state(vdev_t *vd)
	vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
	int degraded = 0, faulted = 0;

	if (vd->vdev_children > 0) {
		for (c = 0; c < vd->vdev_children; c++) {
			child = vd->vdev_child[c];

			if (!vdev_readable(child) ||
			    (!vdev_writeable(child) && (spa_mode & FWRITE))) {
				/*
				 * Root special: if there is a top-level log
				 * device, treat the root vdev as if it were
				 */
				if (child->vdev_islog && vd == rvd)
			} else if (child->vdev_state <= VDEV_STATE_DEGRADED) {

			if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)

		vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded);

		/*
		 * Root special: if there is a top-level vdev that cannot be
		 * opened due to corrupted metadata, then propagate the root
		 * vdev's aux state as 'corrupt' rather than 'insufficient
		 */
		if (corrupted && vd == rvd &&
		    rvd->vdev_state == VDEV_STATE_CANT_OPEN)
			vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);

	if (vd->vdev_parent)
		vdev_propagate_state(vd->vdev_parent);
/*
 * Set a vdev's state.  If this is during an open, we don't update the parent
 * state, because we're in the process of opening children depth-first.
 * Otherwise, we propagate the change to the parent.
 *
 * If this routine places a device in a faulted state, an appropriate ereport is
 */
vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
	uint64_t save_state;
	spa_t *spa = vd->vdev_spa;

	if (state == vd->vdev_state) {
		vd->vdev_stat.vs_aux = aux;

	save_state = vd->vdev_state;

	vd->vdev_state = state;
	vd->vdev_stat.vs_aux = aux;

	/*
	 * If we are setting the vdev state to anything but an open state, then
	 * always close the underlying device.  Otherwise, we keep accessible
	 * but invalid devices open forever.  We don't call vdev_close() itself,
	 * because that implies some extra checks (offline, etc) that we don't
	 * want here.  This is limited to leaf devices, because otherwise
	 * closing the device will affect other children.
	 */
	if (vdev_is_dead(vd) && vd->vdev_ops->vdev_op_leaf)
		vd->vdev_ops->vdev_op_close(vd);

	if (vd->vdev_removed &&
	    state == VDEV_STATE_CANT_OPEN &&
	    (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) {
		/*
		 * If the previous state is set to VDEV_STATE_REMOVED, then this
		 * device was previously marked removed and someone attempted to
		 * reopen it.  If this failed due to a nonexistent device, then
		 * keep the device in the REMOVED state.  We also let this be if
		 * it is one of our special test online cases, which is only
		 * attempting to online the device and shouldn't generate an FMA
		 */
		vd->vdev_state = VDEV_STATE_REMOVED;
		vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
	} else if (state == VDEV_STATE_REMOVED) {
		/*
		 * Indicate to the ZFS DE that this device has been removed, and
		 * any recent errors should be ignored.
		 */
		zfs_post_remove(spa, vd);
		vd->vdev_removed = B_TRUE;
	} else if (state == VDEV_STATE_CANT_OPEN) {
		/*
		 * If we fail to open a vdev during an import, we mark it as
		 * "not available", which signifies that it was never there to
		 * begin with.  Failure to open such a device is not considered
		 */
		if (spa->spa_load_state == SPA_LOAD_IMPORT &&
		    !spa->spa_import_faulted &&
		    vd->vdev_ops->vdev_op_leaf)
			vd->vdev_not_present = 1;

		/*
		 * Post the appropriate ereport.  If the 'prevstate' field is
		 * set to something other than VDEV_STATE_UNKNOWN, it indicates
		 * that this is part of a vdev_reopen().  In this case, we don't
		 * want to post the ereport if the device was already in the
		 * CANT_OPEN state beforehand.
		 *
		 * If the 'checkremove' flag is set, then this is an attempt to
		 * online the device in response to an insertion event.  If we
		 * hit this case, then we have detected an insertion event for a
		 * faulted or offline device that wasn't in the removed state.
		 * In this scenario, we don't post an ereport because we are
		 * about to replace the device, or attempt an online with
		 * vdev_forcefault, which will generate the fault for us.
		 */
		if ((vd->vdev_prevstate != state || vd->vdev_forcefault) &&
		    !vd->vdev_not_present && !vd->vdev_checkremove &&
		    vd != spa->spa_root_vdev) {

			case VDEV_AUX_OPEN_FAILED:
				class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED;
			case VDEV_AUX_CORRUPT_DATA:
				class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA;
			case VDEV_AUX_NO_REPLICAS:
				class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS;
			case VDEV_AUX_BAD_GUID_SUM:
				class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM;
			case VDEV_AUX_TOO_SMALL:
				class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL;
			case VDEV_AUX_BAD_LABEL:
				class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
			case VDEV_AUX_IO_FAILURE:
				class = FM_EREPORT_ZFS_IO_FAILURE;
				class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;

			zfs_ereport_post(class, spa, vd, NULL, save_state, 0);

		/* Erase any notion of persistent removed state */
		vd->vdev_removed = B_FALSE;
		vd->vdev_removed = B_FALSE;

	vdev_propagate_state(vd);
/*
 * Check the vdev configuration to ensure that it's capable of supporting
 * a root pool. Currently, we do not support RAID-Z or partial configuration.
 * In addition, only a single top-level vdev is allowed and none of the leaves
 * can be wholedisks.
 */
vdev_is_bootable(vdev_t *vd)
	if (!vd->vdev_ops->vdev_op_leaf) {
		char *vdev_type = vd->vdev_ops->vdev_op_type;

		if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 &&
		    vd->vdev_children > 1) {
		} else if (strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 ||
		    strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) {
		} else if (vd->vdev_wholedisk == 1) {

	for (c = 0; c < vd->vdev_children; c++) {
		if (!vdev_is_bootable(vd->vdev_child[c]))