/*
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 */
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_scan.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_draid.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
static kstat_t *mirror_ksp = NULL;
typedef struct mirror_stats {
	kstat_named_t vdev_mirror_stat_rotating_linear;
	kstat_named_t vdev_mirror_stat_rotating_offset;
	kstat_named_t vdev_mirror_stat_rotating_seek;
	kstat_named_t vdev_mirror_stat_non_rotating_linear;
	kstat_named_t vdev_mirror_stat_non_rotating_seek;

	kstat_named_t vdev_mirror_stat_preferred_found;
	kstat_named_t vdev_mirror_stat_preferred_not_found;
} mirror_stats_t;
static mirror_stats_t mirror_stats = {
	/* New I/O follows directly the last I/O */
	{ "rotating_linear", KSTAT_DATA_UINT64 },
	/* New I/O is within zfs_vdev_mirror_rotating_seek_offset of the last */
	{ "rotating_offset", KSTAT_DATA_UINT64 },
	/* New I/O requires random seek */
	{ "rotating_seek", KSTAT_DATA_UINT64 },
	/* New I/O follows directly the last I/O (nonrot) */
	{ "non_rotating_linear", KSTAT_DATA_UINT64 },
	/* New I/O requires random seek (nonrot) */
	{ "non_rotating_seek", KSTAT_DATA_UINT64 },
	/* Preferred child vdev found */
	{ "preferred_found", KSTAT_DATA_UINT64 },
	/* Preferred child vdev not found or equal load */
	{ "preferred_not_found", KSTAT_DATA_UINT64 },
};
#define	MIRROR_STAT(stat)	(mirror_stats.stat.value.ui64)
#define	MIRROR_INCR(stat, val)	atomic_add_64(&MIRROR_STAT(stat), val)
#define	MIRROR_BUMP(stat)	MIRROR_INCR(stat, 1)
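/*
 * Usage note (illustrative): these counters are published through the
 * "vdev_mirror_stats" kstat created in vdev_mirror_stat_init() below; on
 * Linux builds such kstats are typically readable under
 * /proc/spl/kstat/zfs/vdev_mirror_stats, though the exact location depends
 * on the platform's kstat support.
 */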
void
vdev_mirror_stat_init(void)
{
	mirror_ksp = kstat_create("zfs", 0, "vdev_mirror_stats",
	    "misc", KSTAT_TYPE_NAMED,
	    sizeof (mirror_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
	if (mirror_ksp != NULL) {
		mirror_ksp->ks_data = &mirror_stats;
		kstat_install(mirror_ksp);
	}
}
void
vdev_mirror_stat_fini(void)
{
	if (mirror_ksp != NULL) {
		kstat_delete(mirror_ksp);
		mirror_ksp = NULL;
	}
}
/*
 * Virtual device vector for mirroring.
 */
typedef struct mirror_child {
	vdev_t		*mc_vd;
	abd_t		*mc_abd;
	uint64_t	mc_offset;
	int		mc_error;
	int		mc_load;
	uint8_t		mc_tried;
	uint8_t		mc_skipped;
	uint8_t		mc_speculative;
	uint8_t		mc_rebuilding;
} mirror_child_t;

typedef struct mirror_map {
	int		*mm_preferred;
	int		mm_preferred_cnt;
	int		mm_children;
	boolean_t	mm_resilvering;
	boolean_t	mm_rebuilding;
	boolean_t	mm_root;
	mirror_child_t	mm_child[];
} mirror_map_t;
static const int vdev_mirror_shift = 21;
/*
 * The load configuration settings below are tuned by default for
 * the case where all devices are of the same rotational type.
 *
 * If there is a mixture of rotating and non-rotating media, setting
 * zfs_vdev_mirror_non_rotating_seek_inc to 0 may well provide better results
 * as it will direct more reads to the non-rotating vdevs which are more likely
 * to have a higher performance.
 */
/* Rotating media load calculation configuration. */
static int zfs_vdev_mirror_rotating_inc = 0;
static int zfs_vdev_mirror_rotating_seek_inc = 5;
static int zfs_vdev_mirror_rotating_seek_offset = 1 * 1024 * 1024;

/* Non-rotating media load calculation configuration. */
static int zfs_vdev_mirror_non_rotating_inc = 0;
static int zfs_vdev_mirror_non_rotating_seek_inc = 1;
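/*
 * Illustrative sketch of how these tunables combine (see vdev_mirror_load()
 * below for the authoritative logic):
 *
 *	load = vdev_queue_length(vd) + increment
 *
 * where the increment is chosen from the values above based on the child's
 * rotational type and on how far the new I/O lands from the last issued
 * offset.  With the defaults, an idle non-rotating child that needs a seek
 * scores 0 + 1, while a rotating child facing a long seek scores 0 + 5 and
 * is therefore less likely to be preferred.
 */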
static int
vdev_mirror_map_size(int children)
{
	return (offsetof(mirror_map_t, mm_child[children]) +
	    sizeof (int) * children);
}
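/*
 * Layout note: a mirror map is one allocation holding the mirror_map_t
 * header, then 'children' mirror_child_t slots (the mm_child[] flexible
 * array member), then 'children' ints which back the mm_preferred list.
 * The trailing sizeof (int) * children above accounts for that final array,
 * and vdev_mirror_map_alloc() points mm_preferred just past mm_child[children].
 */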
static inline mirror_map_t *
vdev_mirror_map_alloc(int children, boolean_t resilvering, boolean_t root)
{
	mirror_map_t *mm;

	mm = kmem_zalloc(vdev_mirror_map_size(children), KM_SLEEP);
	mm->mm_children = children;
	mm->mm_resilvering = resilvering;
	mm->mm_root = root;
	mm->mm_preferred = (int *)((uintptr_t)mm +
	    offsetof(mirror_map_t, mm_child[children]));

	return (mm);
}
static void
vdev_mirror_map_free(zio_t *zio)
{
	mirror_map_t *mm = zio->io_vsd;

	kmem_free(mm, vdev_mirror_map_size(mm->mm_children));
}
static const zio_vsd_ops_t vdev_mirror_vsd_ops = {
	.vsd_free = vdev_mirror_map_free,
};
static int
vdev_mirror_load(mirror_map_t *mm, vdev_t *vd, uint64_t zio_offset)
{
	uint64_t last_offset;
	int64_t offset_diff;
	int load;

	/* All DVAs have equal weight at the root. */
	if (mm->mm_root)
		return (INT_MAX);

	/*
	 * We don't return INT_MAX if the device is resilvering i.e.
	 * vdev_resilver_txg != 0, as when tested, performance was slightly
	 * worse overall when resilvering compared to without.
	 */

	/* Fix zio_offset for leaf vdevs */
	if (vd->vdev_ops->vdev_op_leaf)
		zio_offset += VDEV_LABEL_START_SIZE;

	/* Standard load based on pending queue length. */
	load = vdev_queue_length(vd);
	last_offset = vdev_queue_last_offset(vd);

	if (vd->vdev_nonrot) {
		/* Non-rotating media. */
		if (last_offset == zio_offset) {
			MIRROR_BUMP(vdev_mirror_stat_non_rotating_linear);
			return (load + zfs_vdev_mirror_non_rotating_inc);
		}

		/*
		 * Apply a seek penalty even for non-rotating devices as
		 * sequential I/O's can be aggregated into fewer operations on
		 * the device, thus avoiding unnecessary per-command overhead
		 * and boosting performance.
		 */
		MIRROR_BUMP(vdev_mirror_stat_non_rotating_seek);
		return (load + zfs_vdev_mirror_non_rotating_seek_inc);
	}

	/* Rotating media I/O's which directly follow the last I/O. */
	if (last_offset == zio_offset) {
		MIRROR_BUMP(vdev_mirror_stat_rotating_linear);
		return (load + zfs_vdev_mirror_rotating_inc);
	}

	/*
	 * Apply half the seek increment to I/O's within seek offset
	 * of the last I/O issued to this vdev as they should incur less
	 * of a seek increment.
	 */
	offset_diff = (int64_t)(last_offset - zio_offset);
	if (ABS(offset_diff) < zfs_vdev_mirror_rotating_seek_offset) {
		MIRROR_BUMP(vdev_mirror_stat_rotating_offset);
		return (load + (zfs_vdev_mirror_rotating_seek_inc / 2));
	}

	/* Apply the full seek increment to all other I/O's. */
	MIRROR_BUMP(vdev_mirror_stat_rotating_seek);
	return (load + zfs_vdev_mirror_rotating_seek_inc);
}
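/*
 * Worked example (illustrative only, using the default tunables): a rotating
 * child with three queued I/Os whose next offset is 512KB from the previous
 * one falls inside zfs_vdev_mirror_rotating_seek_offset, so its load is
 * 3 + (5 / 2) = 5.  A non-rotating child with the same queue depth and a
 * non-contiguous offset scores 3 + 1 = 4 and would be preferred.
 */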
static boolean_t
vdev_mirror_rebuilding(vdev_t *vd)
{
	if (vd->vdev_ops->vdev_op_leaf && vd->vdev_rebuild_txg)
		return (B_TRUE);

	for (int i = 0; i < vd->vdev_children; i++) {
		if (vdev_mirror_rebuilding(vd->vdev_child[i])) {
			return (B_TRUE);
		}
	}

	return (B_FALSE);
}
/*
 * Avoid inlining the function to keep vdev_mirror_io_start(), which
 * is this function's only caller, as small as possible on the stack.
 */
noinline static mirror_map_t *
vdev_mirror_map_init(zio_t *zio)
{
	mirror_map_t *mm = NULL;
	mirror_child_t *mc;
	vdev_t *vd = zio->io_vd;
	int c;

	if (vd == NULL) {
		dva_t *dva = zio->io_bp->blk_dva;
		spa_t *spa = zio->io_spa;
		dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
		dva_t dva_copy[SPA_DVAS_PER_BP];

		/*
		 * The sequential scrub code sorts and issues all DVAs
		 * of a bp separately. Each of these IOs includes all
		 * original DVA copies so that repairs can be performed
		 * in the event of an error, but we only actually want
		 * to check the first DVA since the others will be
		 * checked by their respective sorted IOs. Only if we
		 * hit an error will we try all DVAs upon retrying.
		 *
		 * Note: This check is safe even if the user switches
		 * from a legacy scrub to a sequential one in the middle
		 * of processing, since scn_is_sorted isn't updated until
		 * all outstanding IOs from the previous scrub pass
		 * complete.
		 */
		if ((zio->io_flags & ZIO_FLAG_SCRUB) &&
		    !(zio->io_flags & ZIO_FLAG_IO_RETRY) &&
		    dsl_scan_scrubbing(spa->spa_dsl_pool) &&
		    scn->scn_is_sorted) {
			c = 1;
		} else {
			c = BP_GET_NDVAS(zio->io_bp);
		}

		/*
		 * If the pool cannot be written to, then infer that some
		 * DVAs might be invalid or point to vdevs that do not exist.
		 */
		if (!spa_writeable(spa)) {
			ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
			int j = 0;
			for (int i = 0; i < c; i++) {
				if (zfs_dva_valid(spa, &dva[i], zio->io_bp))
					dva_copy[j++] = dva[i];
			}
			if (j == 0) {
				zio->io_vsd = NULL;
				zio->io_error = ENXIO;
				return (NULL);
			}
			if (j < c) {
				dva = dva_copy;
				c = j;
			}
		}

		mm = vdev_mirror_map_alloc(c, B_FALSE, B_TRUE);
		for (c = 0; c < mm->mm_children; c++) {
			mc = &mm->mm_child[c];

			mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c]));
			mc->mc_offset = DVA_GET_OFFSET(&dva[c]);
			if (mc->mc_vd == NULL) {
				kmem_free(mm, vdev_mirror_map_size(
				    mm->mm_children));
				zio->io_vsd = NULL;
				zio->io_error = ENXIO;
				return (NULL);
			}
		}
	} else {
		/*
		 * If we are resilvering, then we should handle scrub reads
		 * differently; we shouldn't issue them to the resilvering
		 * device because it might not have those blocks.
		 *
		 * We are resilvering iff:
		 * 1) We are a replacing vdev (ie our name is "replacing-1" or
		 *    "spare-1" or something like that), and
		 * 2) The pool is currently being resilvered.
		 *
		 * We cannot simply check vd->vdev_resilver_txg, because it's
		 * not set in this path.
		 *
		 * Nor can we just check our vdev_ops; there are cases (such as
		 * when a user types "zpool replace pool odev spare_dev" and
		 * spare_dev is in the spare list, or when a spare device is
		 * automatically used to replace a DEGRADED device) when
		 * resilvering is complete but both the original vdev and the
		 * spare vdev remain in the pool.  That behavior is intentional.
		 * It helps implement the policy that a spare should be
		 * automatically removed from the pool after the user replaces
		 * the device that originally failed.
		 *
		 * If a spa load is in progress, then spa_dsl_pool may be
		 * uninitialized.  But we shouldn't be resilvering during a spa
		 * load.
		 */
		boolean_t replacing = (vd->vdev_ops == &vdev_replacing_ops ||
		    vd->vdev_ops == &vdev_spare_ops) &&
		    spa_load_state(vd->vdev_spa) == SPA_LOAD_NONE &&
		    dsl_scan_resilvering(vd->vdev_spa->spa_dsl_pool);
		mm = vdev_mirror_map_alloc(vd->vdev_children, replacing,
		    B_FALSE);
		for (c = 0; c < mm->mm_children; c++) {
			mc = &mm->mm_child[c];
			mc->mc_vd = vd->vdev_child[c];
			mc->mc_offset = zio->io_offset;

			if (vdev_mirror_rebuilding(mc->mc_vd))
				mm->mm_rebuilding = mc->mc_rebuilding = B_TRUE;
		}
	}

	return (mm);
}
static int
vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
    uint64_t *logical_ashift, uint64_t *physical_ashift)
{
	int numerrors = 0;
	int lasterror = 0;

	if (vd->vdev_children == 0) {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (SET_ERROR(EINVAL));
	}

	vdev_open_children(vd);

	for (int c = 0; c < vd->vdev_children; c++) {
		vdev_t *cvd = vd->vdev_child[c];

		if (cvd->vdev_open_error) {
			lasterror = cvd->vdev_open_error;
			numerrors++;
			continue;
		}

		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
		*logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
	}
	for (int c = 0; c < vd->vdev_children; c++) {
		vdev_t *cvd = vd->vdev_child[c];

		if (cvd->vdev_open_error)
			continue;
		*physical_ashift = vdev_best_ashift(*logical_ashift,
		    *physical_ashift, cvd->vdev_physical_ashift);
	}

	if (numerrors == vd->vdev_children) {
		if (vdev_children_are_offline(vd))
			vd->vdev_stat.vs_aux = VDEV_AUX_CHILDREN_OFFLINE;
		else
			vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
		return (lasterror);
	}

	return (0);
}
static void
vdev_mirror_close(vdev_t *vd)
{
	for (int c = 0; c < vd->vdev_children; c++)
		vdev_close(vd->vdev_child[c]);
}
static void
vdev_mirror_child_done(zio_t *zio)
{
	mirror_child_t *mc = zio->io_private;

	mc->mc_error = zio->io_error;
	mc->mc_tried = 1;
	mc->mc_skipped = 0;
}
/*
 * Check the other, lower-index DVAs to see if they're on the same
 * vdev as the child we picked.  If they are, use them since they
 * are likely to have been allocated from the primary metaslab in
 * use at the time, and hence are more likely to have locality with
 * single-copy data of the block.
 */
static int
vdev_mirror_dva_select(zio_t *zio, int p)
{
	dva_t *dva = zio->io_bp->blk_dva;
	mirror_map_t *mm = zio->io_vsd;
	int preferred;
	int c;

	preferred = mm->mm_preferred[p];
	for (p--; p >= 0; p--) {
		c = mm->mm_preferred[p];
		if (DVA_GET_VDEV(&dva[c]) == DVA_GET_VDEV(&dva[preferred]))
			preferred = c;
	}
	return (preferred);
}
static int
vdev_mirror_preferred_child_randomize(zio_t *zio)
{
	mirror_map_t *mm = zio->io_vsd;
	int p;

	if (mm->mm_root) {
		p = random_in_range(mm->mm_preferred_cnt);
		return (vdev_mirror_dva_select(zio, p));
	}

	/*
	 * To ensure we don't always favour the first matching vdev,
	 * which could lead to wear leveling issues on SSD's, we
	 * use the I/O offset as a pseudo random seed into the vdevs
	 * which have the lowest load.
	 */
	p = (zio->io_offset >> vdev_mirror_shift) % mm->mm_preferred_cnt;
	return (mm->mm_preferred[p]);
}
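/*
 * Example (illustrative only): with vdev_mirror_shift = 21 the offset-based
 * selection above reduces to (io_offset / 2MB) % mm_preferred_cnt, so reads
 * falling within the same 2MB region consistently go to the same
 * equally-loaded child while different regions are spread across them.
 */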
static boolean_t
vdev_mirror_child_readable(mirror_child_t *mc)
{
	vdev_t *vd = mc->mc_vd;

	if (vd->vdev_top != NULL && vd->vdev_top->vdev_ops == &vdev_draid_ops)
		return (vdev_draid_readable(vd, mc->mc_offset));

	return (vdev_readable(vd));
}
static boolean_t
vdev_mirror_child_missing(mirror_child_t *mc, uint64_t txg, uint64_t size)
{
	vdev_t *vd = mc->mc_vd;

	if (vd->vdev_top != NULL && vd->vdev_top->vdev_ops == &vdev_draid_ops)
		return (vdev_draid_missing(vd, mc->mc_offset, txg, size));

	return (vdev_dtl_contains(vd, DTL_MISSING, txg, size));
}
/*
 * Try to find a vdev whose DTL doesn't contain the block we want to read
 * preferring vdevs based on determined load.  If we can't, try the read on
 * any vdev we haven't already tried.
 *
 * Distributed spares are an exception to the above load rule.  They are
 * always preferred in order to detect gaps in the distributed spare which
 * are created when another disk in the dRAID fails.  In order to restore
 * redundancy those gaps must be read to trigger the required repair IO.
 */
static int
vdev_mirror_child_select(zio_t *zio)
{
	mirror_map_t *mm = zio->io_vsd;
	uint64_t txg = zio->io_txg;
	int c, lowest_load;

	ASSERT(zio->io_bp == NULL || BP_GET_BIRTH(zio->io_bp) == txg);

	lowest_load = INT_MAX;
	mm->mm_preferred_cnt = 0;
	for (c = 0; c < mm->mm_children; c++) {
		mirror_child_t *mc;

		mc = &mm->mm_child[c];
		if (mc->mc_tried || mc->mc_skipped)
			continue;

		if (mc->mc_vd == NULL ||
		    !vdev_mirror_child_readable(mc)) {
			mc->mc_error = SET_ERROR(ENXIO);
			mc->mc_tried = 1;	/* don't even try */
			mc->mc_skipped = 1;
			continue;
		}

		if (vdev_mirror_child_missing(mc, txg, 1)) {
			mc->mc_error = SET_ERROR(ESTALE);
			mc->mc_skipped = 1;
			mc->mc_speculative = 1;
			continue;
		}

		if (mc->mc_vd->vdev_ops == &vdev_draid_spare_ops) {
			mm->mm_preferred[0] = c;
			mm->mm_preferred_cnt = 1;
			break;
		}

		mc->mc_load = vdev_mirror_load(mm, mc->mc_vd, mc->mc_offset);
		if (mc->mc_load > lowest_load)
			continue;

		if (mc->mc_load < lowest_load) {
			lowest_load = mc->mc_load;
			mm->mm_preferred_cnt = 0;
		}
		mm->mm_preferred[mm->mm_preferred_cnt] = c;
		mm->mm_preferred_cnt++;
	}

	if (mm->mm_preferred_cnt == 1) {
		MIRROR_BUMP(vdev_mirror_stat_preferred_found);
		return (mm->mm_preferred[0]);
	}

	if (mm->mm_preferred_cnt > 1) {
		MIRROR_BUMP(vdev_mirror_stat_preferred_not_found);
		return (vdev_mirror_preferred_child_randomize(zio));
	}

	/*
	 * Every device is either missing or has this txg in its DTL.
	 * Look for any child we haven't already tried before giving up.
	 */
	for (c = 0; c < mm->mm_children; c++) {
		if (!mm->mm_child[c].mc_tried)
			return (c);
	}

	/*
	 * Every child failed.  There's no place left to look.
	 */
	return (-1);
}
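/*
 * Selection summary (illustrative): a dRAID distributed spare short-circuits
 * the scan above and is returned immediately; otherwise, when exactly one
 * child has the lowest load it is returned directly, and ties between
 * equally-loaded children are broken by
 * vdev_mirror_preferred_child_randomize().
 */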
static void
vdev_mirror_io_start(zio_t *zio)
{
	mirror_map_t *mm;
	mirror_child_t *mc;
	int c, children;

	mm = vdev_mirror_map_init(zio);
	zio->io_vsd = mm;
	zio->io_vsd_ops = &vdev_mirror_vsd_ops;

	if (mm == NULL) {
		ASSERT(!spa_trust_config(zio->io_spa));
		ASSERT(zio->io_type == ZIO_TYPE_READ);
		zio_execute(zio);
		return;
	}

	if (zio->io_type == ZIO_TYPE_READ) {
		if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering) {
			/*
			 * For scrubbing reads we need to issue reads to all
			 * children.  One child can reuse parent buffer, but
			 * for others we have to allocate separate ones to
			 * verify checksums if io_bp is non-NULL, or compare
			 * them in vdev_mirror_io_done() otherwise.
			 */
			boolean_t first = B_TRUE;
			for (c = 0; c < mm->mm_children; c++) {
				mc = &mm->mm_child[c];

				/* Don't issue ZIOs to offline children */
				if (!vdev_mirror_child_readable(mc)) {
					mc->mc_error = SET_ERROR(ENXIO);
					mc->mc_tried = 1;
					mc->mc_skipped = 1;
					continue;
				}

				mc->mc_abd = first ? zio->io_abd :
				    abd_alloc_sametype(zio->io_abd,
				    zio->io_size);
				zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
				    mc->mc_vd, mc->mc_offset, mc->mc_abd,
				    zio->io_size, zio->io_type,
				    zio->io_priority, 0,
				    vdev_mirror_child_done, mc));
				first = B_FALSE;
			}
			zio_execute(zio);
			return;
		}
		/*
		 * For normal reads just pick one child.
		 */
		c = vdev_mirror_child_select(zio);
		children = (c >= 0);
	} else {
		ASSERT(zio->io_type == ZIO_TYPE_WRITE);

		/*
		 * Writes go to all children.
		 */
		c = 0;
		children = mm->mm_children;
	}

	while (children--) {
		mc = &mm->mm_child[c];
		c++;

		/*
		 * When sequentially resilvering only issue write repair
		 * IOs to the vdev which is being rebuilt since performance
		 * is limited by the slowest child.  This is an issue for
		 * faster replacement devices such as distributed spares.
		 */
		if ((zio->io_priority == ZIO_PRIORITY_REBUILD) &&
		    (zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
		    !(zio->io_flags & ZIO_FLAG_SCRUB) &&
		    mm->mm_rebuilding && !mc->mc_rebuilding) {
			continue;
		}

		zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
		    mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size,
		    zio->io_type, zio->io_priority, 0,
		    vdev_mirror_child_done, mc));
	}

	zio_execute(zio);
}
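/*
 * Dispatch summary (illustrative): normal reads go to the single child chosen
 * by vdev_mirror_child_select(); scrub reads fan out to every readable child,
 * each with its own buffer so the copies can be verified or compared in
 * vdev_mirror_io_done(); writes fan out to all children, except sequential
 * rebuild repair writes which are limited to children being rebuilt.
 */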
static int
vdev_mirror_worst_error(mirror_map_t *mm)
{
	int error[2] = { 0, 0 };

	for (int c = 0; c < mm->mm_children; c++) {
		mirror_child_t *mc = &mm->mm_child[c];
		int s = mc->mc_speculative;
		error[s] = zio_worst_error(error[s], mc->mc_error);
	}

	return (error[0] ? error[0] : error[1]);
}
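/*
 * Note (illustrative): children skipped because their DTL suggests the block
 * may be missing record ESTALE with mc_speculative set, so an error from a
 * child that was actually expected to have the data takes precedence when
 * the worst error is reported.
 */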
static void
vdev_mirror_io_done(zio_t *zio)
{
	mirror_map_t *mm = zio->io_vsd;
	mirror_child_t *mc;
	int c;
	int good_copies = 0;
	int unexpected_errors = 0;
	int last_good_copy = -1;

	if (mm == NULL)
		return;

	for (c = 0; c < mm->mm_children; c++) {
		mc = &mm->mm_child[c];

		if (mc->mc_error) {
			if (!mc->mc_skipped)
				unexpected_errors++;
		} else if (mc->mc_tried) {
			last_good_copy = c;
			good_copies++;
		}
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		/*
		 * XXX -- for now, treat partial writes as success.
		 *
		 * Now that we support write reallocation, it would be better
		 * to treat partial failure as real failure unless there are
		 * no non-degraded top-level vdevs left, and not update DTLs
		 * if we intend to reallocate.
		 */
		if (good_copies != mm->mm_children) {
			/*
			 * Always require at least one good copy.
			 *
			 * For ditto blocks (io_vd == NULL), require
			 * all copies to be good.
			 *
			 * XXX -- for replacing vdevs, there's no great answer.
			 * If the old device is really dead, we may not even
			 * be able to access it -- so we only want to
			 * require good writes to the new device.  But if
			 * the new device turns out to be flaky, we want
			 * to be able to detach it -- which requires all
			 * writes to the old device to have succeeded.
			 */
			if (good_copies == 0 || zio->io_vd == NULL)
				zio->io_error = vdev_mirror_worst_error(mm);
		}
		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	/*
	 * Any Direct I/O read that has a checksum error must be treated as
	 * suspicious as the contents of the buffer could be getting
	 * manipulated while the I/O is taking place.  The checksum verify
	 * error will be reported to the top-level Mirror VDEV.
	 *
	 * There will be no attempt at reading any additional data copies.  If
	 * the buffer is still being manipulated while attempting to read from
	 * another child, there exists a possibility that the checksum could be
	 * verified as valid.  However, the buffer contents could again get
	 * manipulated after verifying the checksum.  This would lead to bad
	 * data being written out during self healing.
	 */
	if ((zio->io_flags & ZIO_FLAG_DIO_READ) &&
	    (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)) {
		zio_dio_chksum_verify_error_report(zio);
		zio->io_error = vdev_mirror_worst_error(mm);
		ASSERT3U(zio->io_error, ==, ECKSUM);
		return;
	}

	/*
	 * If we don't have a good copy yet, keep trying other children.
	 */
	if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) {
		ASSERT(c >= 0 && c < mm->mm_children);
		mc = &mm->mm_child[c];
		zio_vdev_io_redone(zio);
		zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
		    mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size,
		    ZIO_TYPE_READ, zio->io_priority, 0,
		    vdev_mirror_child_done, mc));
		return;
	}
	if (zio->io_flags & ZIO_FLAG_SCRUB && !mm->mm_resilvering) {
		abd_t *best_abd = NULL;
		if (last_good_copy >= 0)
			best_abd = mm->mm_child[last_good_copy].mc_abd;

		/*
		 * If we're scrubbing but don't have a BP available (because
		 * this vdev is under a raidz or draid vdev) then the best we
		 * can do is compare all of the copies read.  If they're not
		 * identical then return a checksum error and the most likely
		 * correct data.  The raidz code will issue a repair I/O if
		 * possible.
		 */
		if (zio->io_bp == NULL) {
			ASSERT(zio->io_vd->vdev_ops == &vdev_replacing_ops ||
			    zio->io_vd->vdev_ops == &vdev_spare_ops);

			abd_t *pref_abd = NULL;
			for (c = 0; c < last_good_copy; c++) {
				mc = &mm->mm_child[c];
				if (mc->mc_error || !mc->mc_tried)
					continue;

				if (abd_cmp(mc->mc_abd, best_abd) != 0)
					zio->io_error = SET_ERROR(ECKSUM);

				/*
				 * The distributed spare is always preferred
				 * by vdev_mirror_child_select() so it's
				 * considered to be the best candidate.
				 */
				if (pref_abd == NULL &&
				    mc->mc_vd->vdev_ops ==
				    &vdev_draid_spare_ops)
					pref_abd = mc->mc_abd;

				/*
				 * In the absence of a preferred copy, use
				 * the parent pointer to avoid a memory copy.
				 */
				if (mc->mc_abd == zio->io_abd)
					best_abd = mc->mc_abd;
			}
			if (pref_abd)
				best_abd = pref_abd;
		} else {
			/*
			 * If we have a BP available, then checksums are
			 * already verified and we just need a buffer
			 * with valid data, preferring parent one to
			 * avoid a memory copy.
			 */
			for (c = 0; c < last_good_copy; c++) {
				mc = &mm->mm_child[c];
				if (mc->mc_error || !mc->mc_tried)
					continue;
				if (mc->mc_abd == zio->io_abd) {
					best_abd = mc->mc_abd;
					break;
				}
			}
		}

		if (best_abd && best_abd != zio->io_abd)
			abd_copy(zio->io_abd, best_abd, zio->io_size);
		for (c = 0; c < mm->mm_children; c++) {
			mc = &mm->mm_child[c];
			if (mc->mc_abd != zio->io_abd)
				abd_free(mc->mc_abd);
			mc->mc_abd = NULL;
		}
	}

	if (good_copies == 0) {
		zio->io_error = vdev_mirror_worst_error(mm);
		ASSERT(zio->io_error != 0);
	}

	if (good_copies && spa_writeable(zio->io_spa) &&
	    (unexpected_errors ||
	    (zio->io_flags & ZIO_FLAG_RESILVER) ||
	    ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_resilvering))) {
		/*
		 * Use the good data we have in hand to repair damaged children.
		 */
		for (c = 0; c < mm->mm_children; c++) {
			/*
			 * Don't rewrite known good children.
			 * Not only is it unnecessary, it could
			 * actually be harmful: if the system lost
			 * power while rewriting the only good copy,
			 * there would be no good copies left!
			 */
			mc = &mm->mm_child[c];

			if (mc->mc_error == 0) {
				vdev_ops_t *ops = mc->mc_vd->vdev_ops;

				if (mc->mc_tried)
					continue;
				/*
				 * We didn't try this child.  We need to
				 * repair it if:
				 *
				 * 1. it's a scrub (in which case we have
				 * tried everything that was healthy)
				 *  - or -
				 * 2. it's an indirect or distributed spare
				 * vdev (in which case it could point to any
				 * other vdev, which might have a bad DTL)
				 *  - or -
				 * 3. the DTL indicates that this data is
				 * missing from this vdev
				 */
				if (!(zio->io_flags & ZIO_FLAG_SCRUB) &&
				    ops != &vdev_indirect_ops &&
				    ops != &vdev_draid_spare_ops &&
				    !vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL,
				    zio->io_txg, 1))
					continue;
				mc->mc_error = SET_ERROR(ESTALE);
			}

			zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
			    mc->mc_vd, mc->mc_offset,
			    zio->io_abd, zio->io_size, ZIO_TYPE_WRITE,
			    zio->io_priority == ZIO_PRIORITY_REBUILD ?
			    ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE,
			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
		}
	}
}
static void
vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded)
{
	if (faulted == vd->vdev_children) {
		if (vdev_children_are_offline(vd)) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_OFFLINE,
			    VDEV_AUX_CHILDREN_OFFLINE);
		} else {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_NO_REPLICAS);
		}
	} else if (degraded + faulted != 0) {
		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
	} else {
		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
	}
}
/*
 * Return the maximum asize for a rebuild zio in the provided range.
 */
static uint64_t
vdev_mirror_rebuild_asize(vdev_t *vd, uint64_t start, uint64_t asize,
    uint64_t max_segment)
{
	(void) start;

	uint64_t psize = MIN(P2ROUNDUP(max_segment, 1 << vd->vdev_ashift),
	    SPA_MAXBLOCKSIZE);

	return (MIN(asize, vdev_psize_to_asize(vd, psize)));
}
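/*
 * Example (illustrative only, assuming a 4KB ashift and a 1MB max_segment):
 * P2ROUNDUP() leaves the segment at 1MB, it is capped at SPA_MAXBLOCKSIZE,
 * and because a mirror does not inflate psize into a larger asize the result
 * is simply the remaining range clamped to roughly 1MB.
 */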
vdev_ops_t vdev_mirror_ops = {
	.vdev_op_init = NULL,
	.vdev_op_fini = NULL,
	.vdev_op_open = vdev_mirror_open,
	.vdev_op_close = vdev_mirror_close,
	.vdev_op_asize = vdev_default_asize,
	.vdev_op_min_asize = vdev_default_min_asize,
	.vdev_op_min_alloc = NULL,
	.vdev_op_io_start = vdev_mirror_io_start,
	.vdev_op_io_done = vdev_mirror_io_done,
	.vdev_op_state_change = vdev_mirror_state_change,
	.vdev_op_need_resilver = vdev_default_need_resilver,
	.vdev_op_hold = NULL,
	.vdev_op_rele = NULL,
	.vdev_op_remap = NULL,
	.vdev_op_xlate = vdev_default_xlate,
	.vdev_op_rebuild_asize = vdev_mirror_rebuild_asize,
	.vdev_op_metaslab_init = NULL,
	.vdev_op_config_generate = NULL,
	.vdev_op_nparity = NULL,
	.vdev_op_ndisks = NULL,
	.vdev_op_type = VDEV_TYPE_MIRROR,	/* name of this vdev type */
	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
};
vdev_ops_t vdev_replacing_ops = {
	.vdev_op_init = NULL,
	.vdev_op_fini = NULL,
	.vdev_op_open = vdev_mirror_open,
	.vdev_op_close = vdev_mirror_close,
	.vdev_op_asize = vdev_default_asize,
	.vdev_op_min_asize = vdev_default_min_asize,
	.vdev_op_min_alloc = NULL,
	.vdev_op_io_start = vdev_mirror_io_start,
	.vdev_op_io_done = vdev_mirror_io_done,
	.vdev_op_state_change = vdev_mirror_state_change,
	.vdev_op_need_resilver = vdev_default_need_resilver,
	.vdev_op_hold = NULL,
	.vdev_op_rele = NULL,
	.vdev_op_remap = NULL,
	.vdev_op_xlate = vdev_default_xlate,
	.vdev_op_rebuild_asize = vdev_mirror_rebuild_asize,
	.vdev_op_metaslab_init = NULL,
	.vdev_op_config_generate = NULL,
	.vdev_op_nparity = NULL,
	.vdev_op_ndisks = NULL,
	.vdev_op_type = VDEV_TYPE_REPLACING,	/* name of this vdev type */
	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
};
vdev_ops_t vdev_spare_ops = {
	.vdev_op_init = NULL,
	.vdev_op_fini = NULL,
	.vdev_op_open = vdev_mirror_open,
	.vdev_op_close = vdev_mirror_close,
	.vdev_op_asize = vdev_default_asize,
	.vdev_op_min_asize = vdev_default_min_asize,
	.vdev_op_min_alloc = NULL,
	.vdev_op_io_start = vdev_mirror_io_start,
	.vdev_op_io_done = vdev_mirror_io_done,
	.vdev_op_state_change = vdev_mirror_state_change,
	.vdev_op_need_resilver = vdev_default_need_resilver,
	.vdev_op_hold = NULL,
	.vdev_op_rele = NULL,
	.vdev_op_remap = NULL,
	.vdev_op_xlate = vdev_default_xlate,
	.vdev_op_rebuild_asize = vdev_mirror_rebuild_asize,
	.vdev_op_metaslab_init = NULL,
	.vdev_op_config_generate = NULL,
	.vdev_op_nparity = NULL,
	.vdev_op_ndisks = NULL,
	.vdev_op_type = VDEV_TYPE_SPARE,	/* name of this vdev type */
	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
};
ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_inc, INT, ZMOD_RW,
	"Rotating media load increment for non-seeking I/Os");

ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_seek_inc, INT,
	ZMOD_RW, "Rotating media load increment for seeking I/Os");

ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_seek_offset, INT,
	ZMOD_RW,
	"Offset in bytes from the last I/O which triggers "
	"a reduced rotating media seek increment");

ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, non_rotating_inc, INT,
	ZMOD_RW, "Non-rotating media load increment for non-seeking I/Os");

ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, non_rotating_seek_inc, INT,
	ZMOD_RW, "Non-rotating media load increment for seeking I/Os");