/*
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 */
#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_scan.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_draid.h>
#include <sys/zio.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
static kstat_t *mirror_ksp = NULL;
typedef struct mirror_stats {
	kstat_named_t vdev_mirror_stat_rotating_linear;
	kstat_named_t vdev_mirror_stat_rotating_offset;
	kstat_named_t vdev_mirror_stat_rotating_seek;
	kstat_named_t vdev_mirror_stat_non_rotating_linear;
	kstat_named_t vdev_mirror_stat_non_rotating_seek;
	kstat_named_t vdev_mirror_stat_preferred_found;
	kstat_named_t vdev_mirror_stat_preferred_not_found;
} mirror_stats_t;
static mirror_stats_t mirror_stats = {
	/* New I/O follows directly the last I/O */
	{ "rotating_linear", KSTAT_DATA_UINT64 },
	/* New I/O is within zfs_vdev_mirror_rotating_seek_offset of the last */
	{ "rotating_offset", KSTAT_DATA_UINT64 },
	/* New I/O requires random seek */
	{ "rotating_seek", KSTAT_DATA_UINT64 },
	/* New I/O follows directly the last I/O (nonrot) */
	{ "non_rotating_linear", KSTAT_DATA_UINT64 },
	/* New I/O requires random seek (nonrot) */
	{ "non_rotating_seek", KSTAT_DATA_UINT64 },
	/* Preferred child vdev found */
	{ "preferred_found", KSTAT_DATA_UINT64 },
	/* Preferred child vdev not found or equal load */
	{ "preferred_not_found", KSTAT_DATA_UINT64 },
};
#define MIRROR_STAT(stat) (mirror_stats.stat.value.ui64)
#define MIRROR_INCR(stat, val) atomic_add_64(&MIRROR_STAT(stat), val)
#define MIRROR_BUMP(stat) MIRROR_INCR(stat, 1)
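/*
 * These helpers update the "vdev_mirror_stats" kstat counters defined above.
 * MIRROR_INCR() uses atomic_add_64(), so callers in the I/O paths below
 * (e.g. MIRROR_BUMP(vdev_mirror_stat_rotating_seek)) can update the counters
 * concurrently without additional locking.
 */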
void
vdev_mirror_stat_init(void)
{
	mirror_ksp = kstat_create("zfs", 0, "vdev_mirror_stats",
	    "misc", KSTAT_TYPE_NAMED,
	    sizeof (mirror_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
	if (mirror_ksp != NULL) {
		mirror_ksp->ks_data = &mirror_stats;
		kstat_install(mirror_ksp);
	}
}
void
vdev_mirror_stat_fini(void)
{
	if (mirror_ksp != NULL) {
		kstat_delete(mirror_ksp);
	}
}
/*
 * Virtual device vector for mirroring.
 */
typedef struct mirror_child {
	vdev_t		*mc_vd;
	uint64_t	mc_offset;
	int		mc_error;
	int		mc_load;
	uint8_t		mc_tried;
	uint8_t		mc_skipped;
	uint8_t		mc_speculative;
	uint8_t		mc_rebuilding;
} mirror_child_t;
typedef struct mirror_map {
	int		*mm_preferred;
	int		mm_preferred_cnt;
	int		mm_children;
	boolean_t	mm_resilvering;
	boolean_t	mm_rebuilding;
	boolean_t	mm_root;
	mirror_child_t	mm_child[];
} mirror_map_t;
static int vdev_mirror_shift = 21;
/*
 * The load configuration settings below are tuned by default for
 * the case where all devices are of the same rotational type.
 *
 * If there is a mixture of rotating and non-rotating media, setting
 * zfs_vdev_mirror_non_rotating_seek_inc to 0 may well provide better results
 * as it will direct more reads to the non-rotating vdevs, which are likely
 * to offer higher performance.
 */
/* Rotating media load calculation configuration. */
static int zfs_vdev_mirror_rotating_inc = 0;
static int zfs_vdev_mirror_rotating_seek_inc = 5;
static int zfs_vdev_mirror_rotating_seek_offset = 1 * 1024 * 1024;

/* Non-rotating media load calculation configuration. */
static int zfs_vdev_mirror_non_rotating_inc = 0;
static int zfs_vdev_mirror_non_rotating_seek_inc = 1;
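/*
 * Worked example (illustrative only, using the default values above): with
 * three I/Os pending on a rotating child, vdev_mirror_load() starts from a
 * base load of 3.  A read that lands exactly at the last queued offset adds
 * zfs_vdev_mirror_rotating_inc (0) for a load of 3; a read within 1 MiB of
 * the last offset adds zfs_vdev_mirror_rotating_seek_inc / 2 (5 / 2 = 2 with
 * integer division) for a load of 5; any other read adds the full seek
 * increment for a load of 8.  The child with the lowest load wins in
 * vdev_mirror_child_select().
 */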
static int
vdev_mirror_map_size(int children)
{
	return (offsetof(mirror_map_t, mm_child[children]) +
	    sizeof (int) * children);
}
static inline mirror_map_t *
vdev_mirror_map_alloc(int children, boolean_t resilvering, boolean_t root)
{
	mirror_map_t *mm;

	mm = kmem_zalloc(vdev_mirror_map_size(children), KM_SLEEP);
	mm->mm_children = children;
	mm->mm_resilvering = resilvering;
	mm->mm_root = root;
	mm->mm_preferred = (int *)((uintptr_t)mm +
	    offsetof(mirror_map_t, mm_child[children]));

	return (mm);
}
static void
vdev_mirror_map_free(zio_t *zio)
{
	mirror_map_t *mm = zio->io_vsd;

	kmem_free(mm, vdev_mirror_map_size(mm->mm_children));
}
static const zio_vsd_ops_t vdev_mirror_vsd_ops = {
	.vsd_free = vdev_mirror_map_free,
};
static int
vdev_mirror_load(mirror_map_t *mm, vdev_t *vd, uint64_t zio_offset)
{
	uint64_t last_offset;
	int64_t offset_diff;
	int load;

	/* All DVAs have equal weight at the root. */
	if (mm->mm_root)
		return (INT_MAX);

	/*
	 * We don't return INT_MAX if the device is resilvering (i.e.
	 * vdev_resilver_txg != 0) because, when tested, overall performance
	 * was slightly worse when resilvering with that check than without it.
	 */

	/* Fix zio_offset for leaf vdevs */
	if (vd->vdev_ops->vdev_op_leaf)
		zio_offset += VDEV_LABEL_START_SIZE;

	/* Standard load based on pending queue length. */
	load = vdev_queue_length(vd);
	last_offset = vdev_queue_last_offset(vd);

	if (vd->vdev_nonrot) {
		/* Non-rotating media. */
		if (last_offset == zio_offset) {
			MIRROR_BUMP(vdev_mirror_stat_non_rotating_linear);
			return (load + zfs_vdev_mirror_non_rotating_inc);
		}

		/*
		 * Apply a seek penalty even for non-rotating devices as
		 * sequential I/O's can be aggregated into fewer operations on
		 * the device, thus avoiding unnecessary per-command overhead
		 * and boosting performance.
		 */
		MIRROR_BUMP(vdev_mirror_stat_non_rotating_seek);
		return (load + zfs_vdev_mirror_non_rotating_seek_inc);
	}

	/* Rotating media I/O's which directly follow the last I/O. */
	if (last_offset == zio_offset) {
		MIRROR_BUMP(vdev_mirror_stat_rotating_linear);
		return (load + zfs_vdev_mirror_rotating_inc);
	}

	/*
	 * Apply half the seek increment to I/O's within seek offset
	 * of the last I/O issued to this vdev as they should incur less
	 * of a seek increment.
	 */
	offset_diff = (int64_t)(last_offset - zio_offset);
	if (ABS(offset_diff) < zfs_vdev_mirror_rotating_seek_offset) {
		MIRROR_BUMP(vdev_mirror_stat_rotating_offset);
		return (load + (zfs_vdev_mirror_rotating_seek_inc / 2));
	}

	/* Apply the full seek increment to all other I/O's. */
	MIRROR_BUMP(vdev_mirror_stat_rotating_seek);
	return (load + zfs_vdev_mirror_rotating_seek_inc);
}
static boolean_t
vdev_mirror_rebuilding(vdev_t *vd)
{
	if (vd->vdev_ops->vdev_op_leaf && vd->vdev_rebuild_txg)
		return (B_TRUE);

	for (int i = 0; i < vd->vdev_children; i++) {
		if (vdev_mirror_rebuilding(vd->vdev_child[i])) {
			return (B_TRUE);
		}
	}

	return (B_FALSE);
}
/*
 * Avoid inlining the function to keep vdev_mirror_io_start(), which
 * is this function's only caller, as small as possible on the stack.
 */
noinline static mirror_map_t *
vdev_mirror_map_init(zio_t *zio)
{
	mirror_map_t *mm = NULL;
	mirror_child_t *mc;
	int c;
	vdev_t *vd = zio->io_vd;

	if (vd == NULL) {
		dva_t *dva = zio->io_bp->blk_dva;
		spa_t *spa = zio->io_spa;
		dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
		dva_t dva_copy[SPA_DVAS_PER_BP];
		/*
		 * The sequential scrub code sorts and issues all DVAs
		 * of a bp separately. Each of these IOs includes all
		 * original DVA copies so that repairs can be performed
		 * in the event of an error, but we only actually want
		 * to check the first DVA since the others will be
		 * checked by their respective sorted IOs. Only if we
		 * hit an error will we try all DVAs upon retrying.
		 *
		 * Note: This check is safe even if the user switches
		 * from a legacy scrub to a sequential one in the middle
		 * of processing, since scn_is_sorted isn't updated until
		 * all outstanding IOs from the previous scrub pass
		 * complete.
		 */
		if ((zio->io_flags & ZIO_FLAG_SCRUB) &&
		    !(zio->io_flags & ZIO_FLAG_IO_RETRY) &&
		    dsl_scan_scrubbing(spa->spa_dsl_pool) &&
		    scn->scn_is_sorted) {
			c = 1;
		} else {
			c = BP_GET_NDVAS(zio->io_bp);
		}
		/*
		 * If the pool cannot be written to, then infer that some
		 * DVAs might be invalid or point to vdevs that do not exist.
		 */
		if (!spa_writeable(spa)) {
			ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
			int j = 0;

			for (int i = 0; i < c; i++) {
				if (zfs_dva_valid(spa, &dva[i], zio->io_bp))
					dva_copy[j++] = dva[i];
			}
			if (j == 0) {
				zio->io_error = ENXIO;
				return (NULL);
			}
			if (j < c) {
				dva = dva_copy;
				c = j;
			}
		}
		mm = vdev_mirror_map_alloc(c, B_FALSE, B_TRUE);
		for (c = 0; c < mm->mm_children; c++) {
			mc = &mm->mm_child[c];

			mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c]));
			mc->mc_offset = DVA_GET_OFFSET(&dva[c]);
			if (mc->mc_vd == NULL) {
				kmem_free(mm, vdev_mirror_map_size(
				    mm->mm_children));
				zio->io_error = ENXIO;
				return (NULL);
			}
		}
	} else {
		/*
		 * If we are resilvering, then we should handle scrub reads
		 * differently; we shouldn't issue them to the resilvering
		 * device because it might not have those blocks.
		 *
		 * We are resilvering iff:
		 * 1) We are a replacing vdev (i.e. our name is "replacing-1"
		 *    or "spare-1" or something like that), and
		 * 2) The pool is currently being resilvered.
		 *
		 * We cannot simply check vd->vdev_resilver_txg, because it's
		 * not set in this path.
		 *
		 * Nor can we just check our vdev_ops; there are cases (such as
		 * when a user types "zpool replace pool odev spare_dev" and
		 * spare_dev is in the spare list, or when a spare device is
		 * automatically used to replace a DEGRADED device) when
		 * resilvering is complete but both the original vdev and the
		 * spare vdev remain in the pool.  That behavior is intentional.
		 * It helps implement the policy that a spare should be
		 * automatically removed from the pool after the user replaces
		 * the device that originally failed.
		 *
		 * If a spa load is in progress, then spa_dsl_pool may be
		 * uninitialized.  But we shouldn't be resilvering during a spa
		 * load, so this is safe.
		 */
		boolean_t replacing = (vd->vdev_ops == &vdev_replacing_ops ||
		    vd->vdev_ops == &vdev_spare_ops) &&
		    spa_load_state(vd->vdev_spa) == SPA_LOAD_NONE &&
		    dsl_scan_resilvering(vd->vdev_spa->spa_dsl_pool);
		mm = vdev_mirror_map_alloc(vd->vdev_children, replacing,
		    B_FALSE);
		for (c = 0; c < mm->mm_children; c++) {
			mc = &mm->mm_child[c];
			mc->mc_vd = vd->vdev_child[c];
			mc->mc_offset = zio->io_offset;

			if (vdev_mirror_rebuilding(mc->mc_vd))
				mm->mm_rebuilding = mc->mc_rebuilding = B_TRUE;
		}
	}

	return (mm);
}
static int
vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
    uint64_t *logical_ashift, uint64_t *physical_ashift)
{
	int numerrors = 0;
	int lasterror = 0;

	if (vd->vdev_children == 0) {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (SET_ERROR(EINVAL));
	}

	vdev_open_children(vd);

	for (int c = 0; c < vd->vdev_children; c++) {
		vdev_t *cvd = vd->vdev_child[c];

		if (cvd->vdev_open_error) {
			lasterror = cvd->vdev_open_error;
			numerrors++;
			continue;
		}

		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
		*logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
		*physical_ashift = MAX(*physical_ashift,
		    cvd->vdev_physical_ashift);
	}

	if (numerrors == vd->vdev_children) {
		if (vdev_children_are_offline(vd))
			vd->vdev_stat.vs_aux = VDEV_AUX_CHILDREN_OFFLINE;
		else
			vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
		return (lasterror);
	}

	return (0);
}
static void
vdev_mirror_close(vdev_t *vd)
{
	for (int c = 0; c < vd->vdev_children; c++)
		vdev_close(vd->vdev_child[c]);
}
static void
vdev_mirror_child_done(zio_t *zio)
{
	mirror_child_t *mc = zio->io_private;

	mc->mc_error = zio->io_error;
	mc->mc_tried = 1;
	mc->mc_skipped = 0;
}
static void
vdev_mirror_scrub_done(zio_t *zio)
{
	mirror_child_t *mc = zio->io_private;

	if (zio->io_error == 0) {
		zio_t *pio;
		zio_link_t *zl = NULL;

		mutex_enter(&zio->io_lock);
		while ((pio = zio_walk_parents(zio, &zl)) != NULL) {
			mutex_enter(&pio->io_lock);
			ASSERT3U(zio->io_size, >=, pio->io_size);
			abd_copy(pio->io_abd, zio->io_abd, pio->io_size);
			mutex_exit(&pio->io_lock);
		}
		mutex_exit(&zio->io_lock);
	}

	abd_free(zio->io_abd);

	mc->mc_error = zio->io_error;
	mc->mc_tried = 1;
	mc->mc_skipped = 0;
}
/*
 * Check the other, lower-index DVAs to see if they're on the same
 * vdev as the child we picked.  If they are, use them since they
 * are likely to have been allocated from the primary metaslab in
 * use at the time, and hence are more likely to have locality with
 * single-copy data.
 */
static int
vdev_mirror_dva_select(zio_t *zio, int p)
{
	dva_t *dva = zio->io_bp->blk_dva;
	mirror_map_t *mm = zio->io_vsd;
	int preferred;
	int c;

	preferred = mm->mm_preferred[p];
	for (p--; p >= 0; p--) {
		c = mm->mm_preferred[p];
		if (DVA_GET_VDEV(&dva[c]) == DVA_GET_VDEV(&dva[preferred]))
			preferred = c;
	}

	return (preferred);
}
static int
vdev_mirror_preferred_child_randomize(zio_t *zio)
{
	mirror_map_t *mm = zio->io_vsd;
	int p;

	if (mm->mm_root) {
		p = random_in_range(mm->mm_preferred_cnt);
		return (vdev_mirror_dva_select(zio, p));
	}

	/*
	 * To ensure we don't always favour the first matching vdev,
	 * which could lead to wear leveling issues on SSD's, we
	 * use the I/O offset as a pseudo random seed into the vdevs
	 * which have the lowest load.
	 */
	p = (zio->io_offset >> vdev_mirror_shift) % mm->mm_preferred_cnt;
	return (mm->mm_preferred[p]);
}
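/*
 * Illustrative note on the offset-based selection above: with the default
 * vdev_mirror_shift of 21, zio->io_offset >> 21 changes once per 2 MiB, so
 * all reads whose offsets fall within the same 2 MiB region map to the same
 * slot in mm_preferred.  This spreads reads across equally loaded children
 * deterministically rather than purely at random.
 */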
static boolean_t
vdev_mirror_child_readable(mirror_child_t *mc)
{
	vdev_t *vd = mc->mc_vd;

	if (vd->vdev_top != NULL && vd->vdev_top->vdev_ops == &vdev_draid_ops)
		return (vdev_draid_readable(vd, mc->mc_offset));
	else
		return (vdev_readable(vd));
}
static boolean_t
vdev_mirror_child_missing(mirror_child_t *mc, uint64_t txg, uint64_t size)
{
	vdev_t *vd = mc->mc_vd;

	if (vd->vdev_top != NULL && vd->vdev_top->vdev_ops == &vdev_draid_ops)
		return (vdev_draid_missing(vd, mc->mc_offset, txg, size));
	else
		return (vdev_dtl_contains(vd, DTL_MISSING, txg, size));
}
/*
 * Try to find a vdev whose DTL doesn't contain the block we want to read,
 * preferring vdevs based on determined load. If we can't, try the read on
 * any vdev we haven't already tried.
 *
 * Distributed spares are an exception to the above load rule. They are
 * always preferred in order to detect gaps in the distributed spare which
 * are created when another disk in the dRAID fails. In order to restore
 * redundancy those gaps must be read to trigger the required repair IO.
 */
static int
vdev_mirror_child_select(zio_t *zio)
{
	mirror_map_t *mm = zio->io_vsd;
	mirror_child_t *mc;
	uint64_t txg = zio->io_txg;
	int c, lowest_load;

	ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg);

	lowest_load = INT_MAX;
	mm->mm_preferred_cnt = 0;
	for (c = 0; c < mm->mm_children; c++) {
		mc = &mm->mm_child[c];
		if (mc->mc_tried || mc->mc_skipped)
			continue;

		if (mc->mc_vd == NULL ||
		    !vdev_mirror_child_readable(mc)) {
			mc->mc_error = SET_ERROR(ENXIO);
			mc->mc_tried = 1;	/* don't even try */
			continue;
		}

		if (vdev_mirror_child_missing(mc, txg, 1)) {
			mc->mc_error = SET_ERROR(ESTALE);
			mc->mc_speculative = 1;
			continue;
		}

		if (mc->mc_vd->vdev_ops == &vdev_draid_spare_ops) {
			mm->mm_preferred[0] = c;
			mm->mm_preferred_cnt = 1;
			break;
		}

		mc->mc_load = vdev_mirror_load(mm, mc->mc_vd, mc->mc_offset);
		if (mc->mc_load > lowest_load)
			continue;

		if (mc->mc_load < lowest_load) {
			lowest_load = mc->mc_load;
			mm->mm_preferred_cnt = 0;
		}
		mm->mm_preferred[mm->mm_preferred_cnt] = c;
		mm->mm_preferred_cnt++;
	}

	if (mm->mm_preferred_cnt == 1) {
		MIRROR_BUMP(vdev_mirror_stat_preferred_found);
		return (mm->mm_preferred[0]);
	}

	if (mm->mm_preferred_cnt > 1) {
		MIRROR_BUMP(vdev_mirror_stat_preferred_not_found);
		return (vdev_mirror_preferred_child_randomize(zio));
	}

	/*
	 * Every device is either missing or has this txg in its DTL.
	 * Look for any child we haven't already tried before giving up.
	 */
	for (c = 0; c < mm->mm_children; c++) {
		if (!mm->mm_child[c].mc_tried)
			return (c);
	}

	/*
	 * Every child failed.  There's no place left to look.
	 */
	return (-1);
}
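/*
 * Summary of the selection above: a single lowest-load child is returned
 * directly ("preferred_found"), ties between equally loaded children are
 * broken by vdev_mirror_preferred_child_randomize() ("preferred_not_found"),
 * and if no child passed the DTL check we fall back to any untried child,
 * or -1 when every child has already been tried.
 */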
static void
vdev_mirror_io_start(zio_t *zio)
{
	mirror_map_t *mm;
	mirror_child_t *mc;
	int c, children;

	mm = vdev_mirror_map_init(zio);
	zio->io_vsd = mm;
	zio->io_vsd_ops = &vdev_mirror_vsd_ops;

	if (mm == NULL) {
		ASSERT(!spa_trust_config(zio->io_spa));
		ASSERT(zio->io_type == ZIO_TYPE_READ);
		zio_execute(zio);
		return;
	}

	if (zio->io_type == ZIO_TYPE_READ) {
		if (zio->io_bp != NULL &&
		    (zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering) {
			/*
			 * For scrubbing reads (if we can verify the
			 * checksum here, as indicated by io_bp being
			 * non-NULL) we need to allocate a read buffer for
			 * each child and issue reads to all children. If
			 * any child succeeds, it will copy its data into
			 * zio->io_data in vdev_mirror_scrub_done.
			 */
			for (c = 0; c < mm->mm_children; c++) {
				mc = &mm->mm_child[c];

				/* Don't issue ZIOs to offline children */
				if (!vdev_mirror_child_readable(mc)) {
					mc->mc_error = SET_ERROR(ENXIO);
					mc->mc_tried = 1;
					continue;
				}

				zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
				    mc->mc_vd, mc->mc_offset,
				    abd_alloc_sametype(zio->io_abd,
				    zio->io_size), zio->io_size,
				    zio->io_type, zio->io_priority, 0,
				    vdev_mirror_scrub_done, mc));
			}
			zio_execute(zio);
			return;
		}
		/*
		 * For normal reads just pick one child.
		 */
		c = vdev_mirror_child_select(zio);
		children = (c >= 0);
	} else {
		ASSERT(zio->io_type == ZIO_TYPE_WRITE);

		/*
		 * Writes go to all children.
		 */
		c = 0;
		children = mm->mm_children;
	}

	while (children--) {
		mc = &mm->mm_child[c];
		c++;

		/*
		 * When sequentially resilvering only issue write repair
		 * IOs to the vdev which is being rebuilt since performance
		 * is limited by the slowest child. This is an issue for
		 * faster replacement devices such as distributed spares.
		 */
		if ((zio->io_priority == ZIO_PRIORITY_REBUILD) &&
		    (zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
		    !(zio->io_flags & ZIO_FLAG_SCRUB) &&
		    mm->mm_rebuilding && !mc->mc_rebuilding) {
			continue;
		}

		zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
		    mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size,
		    zio->io_type, zio->io_priority, 0,
		    vdev_mirror_child_done, mc));
	}

	zio_execute(zio);
}
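/*
 * Note on the fan-out above: writes are issued to every child with the same
 * zio->io_abd, while scrub reads allocate a separate buffer per child via
 * abd_alloc_sametype() so each copy can be checksummed independently before
 * vdev_mirror_scrub_done() copies a good one back to the parent.
 */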
static int
vdev_mirror_worst_error(mirror_map_t *mm)
{
	int error[2] = { 0, 0 };

	for (int c = 0; c < mm->mm_children; c++) {
		mirror_child_t *mc = &mm->mm_child[c];
		int s = mc->mc_speculative;
		error[s] = zio_worst_error(error[s], mc->mc_error);
	}

	return (error[0] ? error[0] : error[1]);
}
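/*
 * Example of the precedence above: if one child returned EIO and another was
 * skipped with the speculative ESTALE set by vdev_mirror_child_select() (its
 * DTL says the block may be missing), error[0] holds EIO and error[1] holds
 * ESTALE, so the hard EIO is what gets reported for the parent zio.
 */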
static void
vdev_mirror_io_done(zio_t *zio)
{
	mirror_map_t *mm = zio->io_vsd;
	mirror_child_t *mc;
	int c;
	int good_copies = 0;
	int unexpected_errors = 0;

	if (mm == NULL)
		return;

	for (c = 0; c < mm->mm_children; c++) {
		mc = &mm->mm_child[c];

		if (mc->mc_error) {
			if (!mc->mc_skipped)
				unexpected_errors++;
		} else if (mc->mc_tried) {
			good_copies++;
		}
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		/*
		 * XXX -- for now, treat partial writes as success.
		 *
		 * Now that we support write reallocation, it would be better
		 * to treat partial failure as real failure unless there are
		 * no non-degraded top-level vdevs left, and not update DTLs
		 * if we intend to reallocate.
		 */
		if (good_copies != mm->mm_children) {
			/*
			 * Always require at least one good copy.
			 *
			 * For ditto blocks (io_vd == NULL), require
			 * all copies to be good.
			 *
			 * XXX -- for replacing vdevs, there's no great answer.
			 * If the old device is really dead, we may not even
			 * be able to access it -- so we only want to
			 * require good writes to the new device.  But if
			 * the new device turns out to be flaky, we want
			 * to be able to detach it -- which requires all
			 * writes to the old device to have succeeded.
			 */
			if (good_copies == 0 || zio->io_vd == NULL)
				zio->io_error = vdev_mirror_worst_error(mm);
		}
		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	/*
	 * If we don't have a good copy yet, keep trying other children.
	 */
	if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) {
		ASSERT(c >= 0 && c < mm->mm_children);
		mc = &mm->mm_child[c];
		zio_vdev_io_redone(zio);
		zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
		    mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size,
		    ZIO_TYPE_READ, zio->io_priority, 0,
		    vdev_mirror_child_done, mc));
		return;
	}

	if (good_copies == 0) {
		zio->io_error = vdev_mirror_worst_error(mm);
		ASSERT(zio->io_error != 0);
	}

	if (good_copies && spa_writeable(zio->io_spa) &&
	    (unexpected_errors ||
	    (zio->io_flags & ZIO_FLAG_RESILVER) ||
	    ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_resilvering))) {
		/*
		 * Use the good data we have in hand to repair damaged children.
		 */
		for (c = 0; c < mm->mm_children; c++) {
			/*
			 * Don't rewrite known good children.
			 * Not only is it unnecessary, it could
			 * actually be harmful: if the system lost
			 * power while rewriting the only good copy,
			 * there would be no good copies left!
			 */
			mc = &mm->mm_child[c];

			if (mc->mc_error == 0) {
				vdev_ops_t *ops = mc->mc_vd->vdev_ops;

				if (mc->mc_tried)
					continue;

				/*
				 * We didn't try this child.  We need to
				 * repair it if:
				 *
				 * 1. it's a scrub (in which case we have
				 * tried everything that was healthy)
				 *
				 * 2. it's an indirect or distributed spare
				 * vdev (in which case it could point to any
				 * other vdev, which might have a bad DTL)
				 *
				 * 3. the DTL indicates that this data is
				 * missing from this vdev
				 */
				if (!(zio->io_flags & ZIO_FLAG_SCRUB) &&
				    ops != &vdev_indirect_ops &&
				    ops != &vdev_draid_spare_ops &&
				    !vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL,
				    zio->io_txg, 1))
					continue;

				mc->mc_error = SET_ERROR(ESTALE);
			}

			zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
			    mc->mc_vd, mc->mc_offset,
			    zio->io_abd, zio->io_size, ZIO_TYPE_WRITE,
			    zio->io_priority == ZIO_PRIORITY_REBUILD ?
			    ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE,
			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
		}
	}
}
static void
vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded)
{
	if (faulted == vd->vdev_children) {
		if (vdev_children_are_offline(vd)) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_OFFLINE,
			    VDEV_AUX_CHILDREN_OFFLINE);
		} else {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_NO_REPLICAS);
		}
	} else if (degraded + faulted != 0) {
		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
	} else {
		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
	}
}
/*
 * Return the maximum asize for a rebuild zio in the provided range.
 */
static uint64_t
vdev_mirror_rebuild_asize(vdev_t *vd, uint64_t start, uint64_t asize,
    uint64_t max_segment)
{
	uint64_t psize = MIN(P2ROUNDUP(max_segment, 1 << vd->vdev_ashift),
	    SPA_MAXBLOCKSIZE);

	return (MIN(asize, vdev_psize_to_asize(vd, psize)));
}
vdev_ops_t vdev_mirror_ops = {
	.vdev_op_init = NULL,
	.vdev_op_fini = NULL,
	.vdev_op_open = vdev_mirror_open,
	.vdev_op_close = vdev_mirror_close,
	.vdev_op_asize = vdev_default_asize,
	.vdev_op_min_asize = vdev_default_min_asize,
	.vdev_op_min_alloc = NULL,
	.vdev_op_io_start = vdev_mirror_io_start,
	.vdev_op_io_done = vdev_mirror_io_done,
	.vdev_op_state_change = vdev_mirror_state_change,
	.vdev_op_need_resilver = vdev_default_need_resilver,
	.vdev_op_hold = NULL,
	.vdev_op_rele = NULL,
	.vdev_op_remap = NULL,
	.vdev_op_xlate = vdev_default_xlate,
	.vdev_op_rebuild_asize = vdev_mirror_rebuild_asize,
	.vdev_op_metaslab_init = NULL,
	.vdev_op_config_generate = NULL,
	.vdev_op_nparity = NULL,
	.vdev_op_ndisks = NULL,
	.vdev_op_type = VDEV_TYPE_MIRROR,	/* name of this vdev type */
	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
};
vdev_ops_t vdev_replacing_ops = {
	.vdev_op_init = NULL,
	.vdev_op_fini = NULL,
	.vdev_op_open = vdev_mirror_open,
	.vdev_op_close = vdev_mirror_close,
	.vdev_op_asize = vdev_default_asize,
	.vdev_op_min_asize = vdev_default_min_asize,
	.vdev_op_min_alloc = NULL,
	.vdev_op_io_start = vdev_mirror_io_start,
	.vdev_op_io_done = vdev_mirror_io_done,
	.vdev_op_state_change = vdev_mirror_state_change,
	.vdev_op_need_resilver = vdev_default_need_resilver,
	.vdev_op_hold = NULL,
	.vdev_op_rele = NULL,
	.vdev_op_remap = NULL,
	.vdev_op_xlate = vdev_default_xlate,
	.vdev_op_rebuild_asize = vdev_mirror_rebuild_asize,
	.vdev_op_metaslab_init = NULL,
	.vdev_op_config_generate = NULL,
	.vdev_op_nparity = NULL,
	.vdev_op_ndisks = NULL,
	.vdev_op_type = VDEV_TYPE_REPLACING,	/* name of this vdev type */
	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
};
vdev_ops_t vdev_spare_ops = {
	.vdev_op_init = NULL,
	.vdev_op_fini = NULL,
	.vdev_op_open = vdev_mirror_open,
	.vdev_op_close = vdev_mirror_close,
	.vdev_op_asize = vdev_default_asize,
	.vdev_op_min_asize = vdev_default_min_asize,
	.vdev_op_min_alloc = NULL,
	.vdev_op_io_start = vdev_mirror_io_start,
	.vdev_op_io_done = vdev_mirror_io_done,
	.vdev_op_state_change = vdev_mirror_state_change,
	.vdev_op_need_resilver = vdev_default_need_resilver,
	.vdev_op_hold = NULL,
	.vdev_op_rele = NULL,
	.vdev_op_remap = NULL,
	.vdev_op_xlate = vdev_default_xlate,
	.vdev_op_rebuild_asize = vdev_mirror_rebuild_asize,
	.vdev_op_metaslab_init = NULL,
	.vdev_op_config_generate = NULL,
	.vdev_op_nparity = NULL,
	.vdev_op_ndisks = NULL,
	.vdev_op_type = VDEV_TYPE_SPARE,	/* name of this vdev type */
	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
};
ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_inc, INT, ZMOD_RW,
	"Rotating media load increment for non-seeking I/O's");

ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_seek_inc, INT,
	ZMOD_RW, "Rotating media load increment for seeking I/O's");

ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_seek_offset, INT,
	ZMOD_RW, "Offset in bytes from the last I/O which triggers "
	"a reduced rotating media seek increment");

ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, non_rotating_inc, INT,
	ZMOD_RW, "Non-rotating media load increment for non-seeking I/O's");

ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, non_rotating_seek_inc, INT,
	ZMOD_RW, "Non-rotating media load increment for seeking I/O's");
979 "Non-rotating media load increment for seeking I/O's");