module/zfs/vdev_mirror.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25
  26 /*
  27  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  28  */
  29
  30 #include <sys/zfs_context.h>
  31 #include <sys/spa.h>
  32 #include <sys/spa_impl.h>
  33 #include <sys/dsl_pool.h>
  34 #include <sys/dsl_scan.h>
  35 #include <sys/vdev_impl.h>
  36 #include <sys/vdev_draid.h>
  37 #include <sys/zio.h>
  38 #include <sys/abd.h>
  39 #include <sys/fs/zfs.h>
  40
  41 /*
  42  * Vdev mirror kstats
  43  */
  44 static kstat_t *mirror_ksp = NULL;
  45
  46 typedef struct mirror_stats {
  47         kstat_named_t vdev_mirror_stat_rotating_linear;
  48         kstat_named_t vdev_mirror_stat_rotating_offset;
  49         kstat_named_t vdev_mirror_stat_rotating_seek;
  50         kstat_named_t vdev_mirror_stat_non_rotating_linear;
  51         kstat_named_t vdev_mirror_stat_non_rotating_seek;
  52
  53         kstat_named_t vdev_mirror_stat_preferred_found;
  54         kstat_named_t vdev_mirror_stat_preferred_not_found;
  55 } mirror_stats_t;
  56
  57 static mirror_stats_t mirror_stats = {
  58         /* New I/O follows directly the last I/O */
  59         { "rotating_linear",                    KSTAT_DATA_UINT64 },
  60         /* New I/O is within zfs_vdev_mirror_rotating_seek_offset of the last */
  61         { "rotating_offset",                    KSTAT_DATA_UINT64 },
  62         /* New I/O requires random seek */
  63         { "rotating_seek",                      KSTAT_DATA_UINT64 },
  64         /* New I/O follows directly the last I/O  (nonrot) */
  65         { "non_rotating_linear",                KSTAT_DATA_UINT64 },
  66         /* New I/O requires random seek (nonrot) */
  67         { "non_rotating_seek",                  KSTAT_DATA_UINT64 },
  68         /* Preferred child vdev found */
  69         { "preferred_found",                    KSTAT_DATA_UINT64 },
  70         /* Preferred child vdev not found or equal load  */
  71         { "preferred_not_found",                KSTAT_DATA_UINT64 },
  72
  73 };
  74
  75 #define MIRROR_STAT(stat)               (mirror_stats.stat.value.ui64)
  76 #define MIRROR_INCR(stat, val)          atomic_add_64(&MIRROR_STAT(stat), val)
  77 #define MIRROR_BUMP(stat)               MIRROR_INCR(stat, 1)
  78
  79 void
  80 vdev_mirror_stat_init(void)
  81 {
  82         mirror_ksp = kstat_create("zfs", 0, "vdev_mirror_stats",
  83             "misc", KSTAT_TYPE_NAMED,
  84             sizeof (mirror_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
  85         if (mirror_ksp != NULL) {
  86                 mirror_ksp->ks_data = &mirror_stats;
  87                 kstat_install(mirror_ksp);
  88         }
  89 }
  90
  91 void
  92 vdev_mirror_stat_fini(void)
  93 {
  94         if (mirror_ksp != NULL) {
  95                 kstat_delete(mirror_ksp);
  96                 mirror_ksp = NULL;
  97         }
  98 }
  99
 100 /*
 101  * Virtual device vector for mirroring.
 102  */
 103 typedef struct mirror_child {
 104         vdev_t          *mc_vd;
 105         uint64_t        mc_offset;
 106         int             mc_error;
 107         int             mc_load;
 108         uint8_t         mc_tried;
 109         uint8_t         mc_skipped;
 110         uint8_t         mc_speculative;
 111         uint8_t         mc_rebuilding;
 112 } mirror_child_t;
 113
 114 typedef struct mirror_map {
 115         int             *mm_preferred;
 116         int             mm_preferred_cnt;
 117         int             mm_children;
 118         boolean_t       mm_resilvering;
 119         boolean_t       mm_rebuilding;
 120         boolean_t       mm_root;
 121         mirror_child_t  mm_child[];
 122 } mirror_map_t;
 123
 124 static int vdev_mirror_shift = 21;
 125
 126 /*
 127  * The load configuration settings below are tuned by default for
 128  * the case where all devices are of the same rotational type.
 129  *
 130  * If there is a mixture of rotating and non-rotating media, setting
 131  * zfs_vdev_mirror_non_rotating_seek_inc to 0 may well provide better results
 132  * as it will direct more reads to the non-rotating vdevs which are more likely
 133  * to have a higher performance.
 134  */
 135
 136 /* Rotating media load calculation configuration. */
 137 static int zfs_vdev_mirror_rotating_inc = 0;
 138 static int zfs_vdev_mirror_rotating_seek_inc = 5;
 139 static int zfs_vdev_mirror_rotating_seek_offset = 1 * 1024 * 1024;
 140
 141 /* Non-rotating media load calculation configuration. */
 142 static int zfs_vdev_mirror_non_rotating_inc = 0;
 143 static int zfs_vdev_mirror_non_rotating_seek_inc = 1;
 144
 145 static inline size_t
 146 vdev_mirror_map_size(int children)
 147 {
 148         return (offsetof(mirror_map_t, mm_child[children]) +
 149             sizeof (int) * children);
 150 }
 151
 152 static inline mirror_map_t *
 153 vdev_mirror_map_alloc(int children, boolean_t resilvering, boolean_t root)
 154 {
 155         mirror_map_t *mm;
 156
 157         mm = kmem_zalloc(vdev_mirror_map_size(children), KM_SLEEP);
 158         mm->mm_children = children;
 159         mm->mm_resilvering = resilvering;
 160         mm->mm_root = root;
 161         mm->mm_preferred = (int *)((uintptr_t)mm +
 162             offsetof(mirror_map_t, mm_child[children]));
 163
 164         return (mm);
 165 }
 166
 167 static void
 168 vdev_mirror_map_free(zio_t *zio)
 169 {
 170         mirror_map_t *mm = zio->io_vsd;
 171
 172         kmem_free(mm, vdev_mirror_map_size(mm->mm_children));
 173 }
 174
 175 static const zio_vsd_ops_t vdev_mirror_vsd_ops = {
 176         .vsd_free = vdev_mirror_map_free,
 177 };
 178
 179 static int
 180 vdev_mirror_load(mirror_map_t *mm, vdev_t *vd, uint64_t zio_offset)
 181 {
 182         uint64_t last_offset;
 183         int64_t offset_diff;
 184         int load;
 185
 186         /* All DVAs have equal weight at the root. */
 187         if (mm->mm_root)
 188                 return (INT_MAX);
 189
 190         /*
 191          * We don't return INT_MAX if the device is resilvering i.e.
 192          * vdev_resilver_txg != 0 as when tested performance was slightly
 193          * worse overall when resilvering with compared to without.
 194          */
 195
 196         /* Fix zio_offset for leaf vdevs */
 197         if (vd->vdev_ops->vdev_op_leaf)
 198                 zio_offset += VDEV_LABEL_START_SIZE;
 199
 200         /* Standard load based on pending queue length. */
 201         load = vdev_queue_length(vd);
 202         last_offset = vdev_queue_last_offset(vd);
 203
 204         if (vd->vdev_nonrot) {
 205                 /* Non-rotating media. */
 206                 if (last_offset == zio_offset) {
 207                         MIRROR_BUMP(vdev_mirror_stat_non_rotating_linear);
 208                         return (load + zfs_vdev_mirror_non_rotating_inc);
 209                 }
 210
 211                 /*
 212                  * Apply a seek penalty even for non-rotating devices as
 213                  * sequential I/O's can be aggregated into fewer operations on
 214                  * the device, thus avoiding unnecessary per-command overhead
 215                  * and boosting performance.
 216                  */
 217                 MIRROR_BUMP(vdev_mirror_stat_non_rotating_seek);
 218                 return (load + zfs_vdev_mirror_non_rotating_seek_inc);
 219         }
 220
 221         /* Rotating media I/O's which directly follow the last I/O. */
 222         if (last_offset == zio_offset) {
 223                 MIRROR_BUMP(vdev_mirror_stat_rotating_linear);
 224                 return (load + zfs_vdev_mirror_rotating_inc);
 225         }
 226
 227         /*
 228          * Apply half the seek increment to I/O's within seek offset
 229          * of the last I/O issued to this vdev as they should incur less
 230          * of a seek increment.
 231          */
 232         offset_diff = (int64_t)(last_offset - zio_offset);
 233         if (ABS(offset_diff) < zfs_vdev_mirror_rotating_seek_offset) {
 234                 MIRROR_BUMP(vdev_mirror_stat_rotating_offset);
 235                 return (load + (zfs_vdev_mirror_rotating_seek_inc / 2));
 236         }
 237
 238         /* Apply the full seek increment to all other I/O's. */
 239         MIRROR_BUMP(vdev_mirror_stat_rotating_seek);
 240         return (load + zfs_vdev_mirror_rotating_seek_inc);
 241 }
 242
 243 static boolean_t
 244 vdev_mirror_rebuilding(vdev_t *vd)
 245 {
 246         if (vd->vdev_ops->vdev_op_leaf && vd->vdev_rebuild_txg)
 247                 return (B_TRUE);
 248
 249         for (int i = 0; i < vd->vdev_children; i++) {
 250                 if (vdev_mirror_rebuilding(vd->vdev_child[i])) {
 251                         return (B_TRUE);
 252                 }
 253         }
 254
 255         return (B_FALSE);
 256 }
 257
 258 /*
 259  * Avoid inlining the function to keep vdev_mirror_io_start(), which
 260  * is this functions only caller, as small as possible on the stack.
 261  */
 262 noinline static mirror_map_t *
 263 vdev_mirror_map_init(zio_t *zio)
 264 {
 265         mirror_map_t *mm = NULL;
 266         mirror_child_t *mc;
 267         vdev_t *vd = zio->io_vd;
 268         int c;
 269
 270         if (vd == NULL) {
 271                 dva_t *dva = zio->io_bp->blk_dva;
 272                 spa_t *spa = zio->io_spa;
 273                 dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
 274                 dva_t dva_copy[SPA_DVAS_PER_BP];
 275
 276                 /*
 277                  * The sequential scrub code sorts and issues all DVAs
 278                  * of a bp separately. Each of these IOs includes all
 279                  * original DVA copies so that repairs can be performed
 280                  * in the event of an error, but we only actually want
 281                  * to check the first DVA since the others will be
 282                  * checked by their respective sorted IOs. Only if we
 283                  * hit an error will we try all DVAs upon retrying.
 284                  *
 285                  * Note: This check is safe even if the user switches
 286                  * from a legacy scrub to a sequential one in the middle
 287                  * of processing, since scn_is_sorted isn't updated until
 288                  * all outstanding IOs from the previous scrub pass
 289                  * complete.
 290                  */
 291                 if ((zio->io_flags & ZIO_FLAG_SCRUB) &&
 292                     !(zio->io_flags & ZIO_FLAG_IO_RETRY) &&
 293                     dsl_scan_scrubbing(spa->spa_dsl_pool) &&
 294                     scn->scn_is_sorted) {
 295                         c = 1;
 296                 } else {
 297                         c = BP_GET_NDVAS(zio->io_bp);
 298                 }
 299
 300                 /*
 301                  * If the pool cannot be written to, then infer that some
 302                  * DVAs might be invalid or point to vdevs that do not exist.
 303                  * We skip them.
 304                  */
 305                 if (!spa_writeable(spa)) {
 306                         ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
 307                         int j = 0;
 308                         for (int i = 0; i < c; i++) {
 309                                 if (zfs_dva_valid(spa, &dva[i], zio->io_bp))
 310                                         dva_copy[j++] = dva[i];
 311                         }
 312                         if (j == 0) {
 313                                 zio->io_vsd = NULL;
 314                                 zio->io_error = ENXIO;
 315                                 return (NULL);
 316                         }
 317                         if (j < c) {
 318                                 dva = dva_copy;
 319                                 c = j;
 320                         }
 321                 }
 322
 323                 mm = vdev_mirror_map_alloc(c, B_FALSE, B_TRUE);
 324                 for (c = 0; c < mm->mm_children; c++) {
 325                         mc = &mm->mm_child[c];
 326
 327                         mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c]));
 328                         mc->mc_offset = DVA_GET_OFFSET(&dva[c]);
 329                         if (mc->mc_vd == NULL) {
 330                                 kmem_free(mm, vdev_mirror_map_size(
 331                                     mm->mm_children));
 332                                 zio->io_vsd = NULL;
 333                                 zio->io_error = ENXIO;
 334                                 return (NULL);
 335                         }
 336                 }
 337         } else {
 338                 /*
 339                  * If we are resilvering, then we should handle scrub reads
 340                  * differently; we shouldn't issue them to the resilvering
 341                  * device because it might not have those blocks.
 342                  *
 343                  * We are resilvering iff:
 344                  * 1) We are a replacing vdev (ie our name is "replacing-1" or
 345                  *    "spare-1" or something like that), and
 346                  * 2) The pool is currently being resilvered.
 347                  *
 348                  * We cannot simply check vd->vdev_resilver_txg, because it's
 349                  * not set in this path.
 350                  *
 351                  * Nor can we just check our vdev_ops; there are cases (such as
 352                  * when a user types "zpool replace pool odev spare_dev" and
 353                  * spare_dev is in the spare list, or when a spare device is
 354                  * automatically used to replace a DEGRADED device) when
 355                  * resilvering is complete but both the original vdev and the
 356                  * spare vdev remain in the pool.  That behavior is intentional.
 357                  * It helps implement the policy that a spare should be
 358                  * automatically removed from the pool after the user replaces
 359                  * the device that originally failed.
 360                  *
 361                  * If a spa load is in progress, then spa_dsl_pool may be
 362                  * uninitialized.  But we shouldn't be resilvering during a spa
 363                  * load anyway.
 364                  */
 365                 boolean_t replacing = (vd->vdev_ops == &vdev_replacing_ops ||
 366                     vd->vdev_ops == &vdev_spare_ops) &&
 367                     spa_load_state(vd->vdev_spa) == SPA_LOAD_NONE &&
 368                     dsl_scan_resilvering(vd->vdev_spa->spa_dsl_pool);
 369                 mm = vdev_mirror_map_alloc(vd->vdev_children, replacing,
 370                     B_FALSE);
 371                 for (c = 0; c < mm->mm_children; c++) {
 372                         mc = &mm->mm_child[c];
 373                         mc->mc_vd = vd->vdev_child[c];
 374                         mc->mc_offset = zio->io_offset;
 375
 376                         if (vdev_mirror_rebuilding(mc->mc_vd))
 377                                 mm->mm_rebuilding = mc->mc_rebuilding = B_TRUE;
 378                 }
 379         }
 380
 381         return (mm);
 382 }
 383
 384 static int
 385 vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
 386     uint64_t *logical_ashift, uint64_t *physical_ashift)
 387 {
 388         int numerrors = 0;
 389         int lasterror = 0;
 390
 391         if (vd->vdev_children == 0) {
 392                 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
 393                 return (SET_ERROR(EINVAL));
 394         }
 395
 396         vdev_open_children(vd);
 397
 398         for (int c = 0; c < vd->vdev_children; c++) {
 399                 vdev_t *cvd = vd->vdev_child[c];
 400
 401                 if (cvd->vdev_open_error) {
 402                         lasterror = cvd->vdev_open_error;
 403                         numerrors++;
 404                         continue;
 405                 }
 406
 407                 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
 408                 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
 409                 *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
 410                 *physical_ashift = MAX(*physical_ashift,
 411                     cvd->vdev_physical_ashift);
 412         }
 413
 414         if (numerrors == vd->vdev_children) {
 415                 if (vdev_children_are_offline(vd))
 416                         vd->vdev_stat.vs_aux = VDEV_AUX_CHILDREN_OFFLINE;
 417                 else
 418                         vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
 419                 return (lasterror);
 420         }
 421
 422         return (0);
 423 }
 424
 425 static void
 426 vdev_mirror_close(vdev_t *vd)
 427 {
 428         for (int c = 0; c < vd->vdev_children; c++)
 429                 vdev_close(vd->vdev_child[c]);
 430 }
 431
 432 static void
 433 vdev_mirror_child_done(zio_t *zio)
 434 {
 435         mirror_child_t *mc = zio->io_private;
 436
 437         mc->mc_error = zio->io_error;
 438         mc->mc_tried = 1;
 439         mc->mc_skipped = 0;
 440 }
 441
 442 static void
 443 vdev_mirror_scrub_done(zio_t *zio)
 444 {
 445         mirror_child_t *mc = zio->io_private;
 446
 447         if (zio->io_error == 0) {
 448                 zio_t *pio;
 449                 zio_link_t *zl = NULL;
 450
 451                 mutex_enter(&zio->io_lock);
 452                 while ((pio = zio_walk_parents(zio, &zl)) != NULL) {
 453                         mutex_enter(&pio->io_lock);
 454                         ASSERT3U(zio->io_size, >=, pio->io_size);
 455                         abd_copy(pio->io_abd, zio->io_abd, pio->io_size);
 456                         mutex_exit(&pio->io_lock);
 457                 }
 458                 mutex_exit(&zio->io_lock);
 459         }
 460
 461         abd_free(zio->io_abd);
 462
 463         mc->mc_error = zio->io_error;
 464         mc->mc_tried = 1;
 465         mc->mc_skipped = 0;
 466 }
 467
 468 /*
 469  * Check the other, lower-index DVAs to see if they're on the same
 470  * vdev as the child we picked.  If they are, use them since they
 471  * are likely to have been allocated from the primary metaslab in
 472  * use at the time, and hence are more likely to have locality with
 473  * single-copy data.
 474  */
 475 static int
 476 vdev_mirror_dva_select(zio_t *zio, int p)
 477 {
 478         dva_t *dva = zio->io_bp->blk_dva;
 479         mirror_map_t *mm = zio->io_vsd;
 480         int preferred;
 481         int c;
 482
 483         preferred = mm->mm_preferred[p];
 484         for (p--; p >= 0; p--) {
 485                 c = mm->mm_preferred[p];
 486                 if (DVA_GET_VDEV(&dva[c]) == DVA_GET_VDEV(&dva[preferred]))
 487                         preferred = c;
 488         }
 489         return (preferred);
 490 }
 491
 492 static int
 493 vdev_mirror_preferred_child_randomize(zio_t *zio)
 494 {
 495         mirror_map_t *mm = zio->io_vsd;
 496         int p;
 497
 498         if (mm->mm_root) {
 499                 p = random_in_range(mm->mm_preferred_cnt);
 500                 return (vdev_mirror_dva_select(zio, p));
 501         }
 502
 503         /*
 504          * To ensure we don't always favour the first matching vdev,
 505          * which could lead to wear leveling issues on SSD's, we
 506          * use the I/O offset as a pseudo random seed into the vdevs
 507          * which have the lowest load.
 508          */
 509         p = (zio->io_offset >> vdev_mirror_shift) % mm->mm_preferred_cnt;
 510         return (mm->mm_preferred[p]);
 511 }
 512
 513 static boolean_t
 514 vdev_mirror_child_readable(mirror_child_t *mc)
 515 {
 516         vdev_t *vd = mc->mc_vd;
 517
 518         if (vd->vdev_top != NULL && vd->vdev_top->vdev_ops == &vdev_draid_ops)
 519                 return (vdev_draid_readable(vd, mc->mc_offset));
 520         else
 521                 return (vdev_readable(vd));
 522 }
 523
 524 static boolean_t
 525 vdev_mirror_child_missing(mirror_child_t *mc, uint64_t txg, uint64_t size)
 526 {
 527         vdev_t *vd = mc->mc_vd;
 528
 529         if (vd->vdev_top != NULL && vd->vdev_top->vdev_ops == &vdev_draid_ops)
 530                 return (vdev_draid_missing(vd, mc->mc_offset, txg, size));
 531         else
 532                 return (vdev_dtl_contains(vd, DTL_MISSING, txg, size));
 533 }
 534
 535 /*
 536  * Try to find a vdev whose DTL doesn't contain the block we want to read
 537  * preferring vdevs based on determined load. If we can't, try the read on
 538  * any vdev we haven't already tried.
 539  *
 540  * Distributed spares are an exception to the above load rule. They are
 541  * always preferred in order to detect gaps in the distributed spare which
 542  * are created when another disk in the dRAID fails. In order to restore
 543  * redundancy those gaps must be read to trigger the required repair IO.
 544  */
 545 static int
 546 vdev_mirror_child_select(zio_t *zio)
 547 {
 548         mirror_map_t *mm = zio->io_vsd;
 549         uint64_t txg = zio->io_txg;
 550         int c, lowest_load;
 551
 552         ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg);
 553
 554         lowest_load = INT_MAX;
 555         mm->mm_preferred_cnt = 0;
 556         for (c = 0; c < mm->mm_children; c++) {
 557                 mirror_child_t *mc;
 558
 559                 mc = &mm->mm_child[c];
 560                 if (mc->mc_tried || mc->mc_skipped)
 561                         continue;
 562
 563                 if (mc->mc_vd == NULL ||
 564                     !vdev_mirror_child_readable(mc)) {
 565                         mc->mc_error = SET_ERROR(ENXIO);
 566                         mc->mc_tried = 1;       /* don't even try */
 567                         mc->mc_skipped = 1;
 568                         continue;
 569                 }
 570
 571                 if (vdev_mirror_child_missing(mc, txg, 1)) {
 572                         mc->mc_error = SET_ERROR(ESTALE);
 573                         mc->mc_skipped = 1;
 574                         mc->mc_speculative = 1;
 575                         continue;
 576                 }
 577
 578                 if (mc->mc_vd->vdev_ops == &vdev_draid_spare_ops) {
 579                         mm->mm_preferred[0] = c;
 580                         mm->mm_preferred_cnt = 1;
 581                         break;
 582                 }
 583
 584                 mc->mc_load = vdev_mirror_load(mm, mc->mc_vd, mc->mc_offset);
 585                 if (mc->mc_load > lowest_load)
 586                         continue;
 587
 588                 if (mc->mc_load < lowest_load) {
 589                         lowest_load = mc->mc_load;
 590                         mm->mm_preferred_cnt = 0;
 591                 }
 592                 mm->mm_preferred[mm->mm_preferred_cnt] = c;
 593                 mm->mm_preferred_cnt++;
 594         }
 595
 596         if (mm->mm_preferred_cnt == 1) {
 597                 MIRROR_BUMP(vdev_mirror_stat_preferred_found);
 598                 return (mm->mm_preferred[0]);
 599         }
 600
 601         if (mm->mm_preferred_cnt > 1) {
 602                 MIRROR_BUMP(vdev_mirror_stat_preferred_not_found);
 603                 return (vdev_mirror_preferred_child_randomize(zio));
 604         }
 605
 606         /*
 607          * Every device is either missing or has this txg in its DTL.
 608          * Look for any child we haven't already tried before giving up.
 609          */
 610         for (c = 0; c < mm->mm_children; c++) {
 611                 if (!mm->mm_child[c].mc_tried)
 612                         return (c);
 613         }
 614
 615         /*
 616          * Every child failed.  There's no place left to look.
 617          */
 618         return (-1);
 619 }
 620
 621 static void
 622 vdev_mirror_io_start(zio_t *zio)
 623 {
 624         mirror_map_t *mm;
 625         mirror_child_t *mc;
 626         int c, children;
 627
 628         mm = vdev_mirror_map_init(zio);
 629         zio->io_vsd = mm;
 630         zio->io_vsd_ops = &vdev_mirror_vsd_ops;
 631
 632         if (mm == NULL) {
 633                 ASSERT(!spa_trust_config(zio->io_spa));
 634                 ASSERT(zio->io_type == ZIO_TYPE_READ);
 635                 zio_execute(zio);
 636                 return;
 637         }
 638
 639         if (zio->io_type == ZIO_TYPE_READ) {
 640                 if (zio->io_bp != NULL &&
 641                     (zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering) {
 642                         /*
 643                          * For scrubbing reads (if we can verify the
 644                          * checksum here, as indicated by io_bp being
 645                          * non-NULL) we need to allocate a read buffer for
 646                          * each child and issue reads to all children.  If
 647                          * any child succeeds, it will copy its data into
 648                          * zio->io_data in vdev_mirror_scrub_done.
 649                          */
 650                         for (c = 0; c < mm->mm_children; c++) {
 651                                 mc = &mm->mm_child[c];
 652
 653                                 /* Don't issue ZIOs to offline children */
 654                                 if (!vdev_mirror_child_readable(mc)) {
 655                                         mc->mc_error = SET_ERROR(ENXIO);
 656                                         mc->mc_tried = 1;
 657                                         mc->mc_skipped = 1;
 658                                         continue;
 659                                 }
 660
 661                                 zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
 662                                     mc->mc_vd, mc->mc_offset,
 663                                     abd_alloc_sametype(zio->io_abd,
 664                                     zio->io_size), zio->io_size,
 665                                     zio->io_type, zio->io_priority, 0,
 666                                     vdev_mirror_scrub_done, mc));
 667                         }
 668                         zio_execute(zio);
 669                         return;
 670                 }
 671                 /*
 672                  * For normal reads just pick one child.
 673                  */
 674                 c = vdev_mirror_child_select(zio);
 675                 children = (c >= 0);
 676         } else {
 677                 ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 678
 679                 /*
 680                  * Writes go to all children.
 681                  */
 682                 c = 0;
 683                 children = mm->mm_children;
 684         }
 685
 686         while (children--) {
 687                 mc = &mm->mm_child[c];
 688                 c++;
 689
 690                 /*
 691                  * When sequentially resilvering only issue write repair
 692                  * IOs to the vdev which is being rebuilt since performance
 693                  * is limited by the slowest child.  This is an issue for
 694                  * faster replacement devices such as distributed spares.
 695                  */
 696                 if ((zio->io_priority == ZIO_PRIORITY_REBUILD) &&
 697                     (zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
 698                     !(zio->io_flags & ZIO_FLAG_SCRUB) &&
 699                     mm->mm_rebuilding && !mc->mc_rebuilding) {
 700                         continue;
 701                 }
 702
 703                 zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
 704                     mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size,
 705                     zio->io_type, zio->io_priority, 0,
 706                     vdev_mirror_child_done, mc));
 707         }
 708
 709         zio_execute(zio);
 710 }
 711
 712 static int
 713 vdev_mirror_worst_error(mirror_map_t *mm)
 714 {
 715         int error[2] = { 0, 0 };
 716
 717         for (int c = 0; c < mm->mm_children; c++) {
 718                 mirror_child_t *mc = &mm->mm_child[c];
 719                 int s = mc->mc_speculative;
 720                 error[s] = zio_worst_error(error[s], mc->mc_error);
 721         }
 722
 723         return (error[0] ? error[0] : error[1]);
 724 }
 725
 726 static void
 727 vdev_mirror_io_done(zio_t *zio)
 728 {
 729         mirror_map_t *mm = zio->io_vsd;
 730         mirror_child_t *mc;
 731         int c;
 732         int good_copies = 0;
 733         int unexpected_errors = 0;
 734
 735         if (mm == NULL)
 736                 return;
 737
 738         for (c = 0; c < mm->mm_children; c++) {
 739                 mc = &mm->mm_child[c];
 740
 741                 if (mc->mc_error) {
 742                         if (!mc->mc_skipped)
 743                                 unexpected_errors++;
 744                 } else if (mc->mc_tried) {
 745                         good_copies++;
 746                 }
 747         }
 748
 749         if (zio->io_type == ZIO_TYPE_WRITE) {
 750                 /*
 751                  * XXX -- for now, treat partial writes as success.
 752                  *
 753                  * Now that we support write reallocation, it would be better
 754                  * to treat partial failure as real failure unless there are
 755                  * no non-degraded top-level vdevs left, and not update DTLs
 756                  * if we intend to reallocate.
 757                  */
 758                 /* XXPOLICY */
 759                 if (good_copies != mm->mm_children) {
 760                         /*
 761                          * Always require at least one good copy.
 762                          *
 763                          * For ditto blocks (io_vd == NULL), require
 764                          * all copies to be good.
 765                          *
 766                          * XXX -- for replacing vdevs, there's no great answer.
 767                          * If the old device is really dead, we may not even
 768                          * be able to access it -- so we only want to
 769                          * require good writes to the new device.  But if
 770                          * the new device turns out to be flaky, we want
 771                          * to be able to detach it -- which requires all
 772                          * writes to the old device to have succeeded.
 773                          */
 774                         if (good_copies == 0 || zio->io_vd == NULL)
 775                                 zio->io_error = vdev_mirror_worst_error(mm);
 776                 }
 777                 return;
 778         }
 779
 780         ASSERT(zio->io_type == ZIO_TYPE_READ);
 781
 782         /*
 783          * If we don't have a good copy yet, keep trying other children.
 784          */
 785         /* XXPOLICY */
 786         if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) {
 787                 ASSERT(c >= 0 && c < mm->mm_children);
 788                 mc = &mm->mm_child[c];
 789                 zio_vdev_io_redone(zio);
 790                 zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
 791                     mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size,
 792                     ZIO_TYPE_READ, zio->io_priority, 0,
 793                     vdev_mirror_child_done, mc));
 794                 return;
 795         }
 796
 797         /* XXPOLICY */
 798         if (good_copies == 0) {
 799                 zio->io_error = vdev_mirror_worst_error(mm);
 800                 ASSERT(zio->io_error != 0);
 801         }
 802
 803         if (good_copies && spa_writeable(zio->io_spa) &&
 804             (unexpected_errors ||
 805             (zio->io_flags & ZIO_FLAG_RESILVER) ||
 806             ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_resilvering))) {
 807                 /*
 808                  * Use the good data we have in hand to repair damaged children.
 809                  */
 810                 for (c = 0; c < mm->mm_children; c++) {
 811                         /*
 812                          * Don't rewrite known good children.
 813                          * Not only is it unnecessary, it could
 814                          * actually be harmful: if the system lost
 815                          * power while rewriting the only good copy,
 816                          * there would be no good copies left!
 817                          */
 818                         mc = &mm->mm_child[c];
 819
 820                         if (mc->mc_error == 0) {
 821                                 vdev_ops_t *ops = mc->mc_vd->vdev_ops;
 822
 823                                 if (mc->mc_tried)
 824                                         continue;
 825                                 /*
 826                                  * We didn't try this child.  We need to
 827                                  * repair it if:
 828                                  * 1. it's a scrub (in which case we have
 829                                  * tried everything that was healthy)
 830                                  *  - or -
 831                                  * 2. it's an indirect or distributed spare
 832                                  * vdev (in which case it could point to any
 833                                  * other vdev, which might have a bad DTL)
 834                                  *  - or -
 835                                  * 3. the DTL indicates that this data is
 836                                  * missing from this vdev
 837                                  */
 838                                 if (!(zio->io_flags & ZIO_FLAG_SCRUB) &&
 839                                     ops != &vdev_indirect_ops &&
 840                                     ops != &vdev_draid_spare_ops &&
 841                                     !vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL,
 842                                     zio->io_txg, 1))
 843                                         continue;
 844                                 mc->mc_error = SET_ERROR(ESTALE);
 845                         }
 846
 847                         zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
 848                             mc->mc_vd, mc->mc_offset,
 849                             zio->io_abd, zio->io_size, ZIO_TYPE_WRITE,
 850                             zio->io_priority == ZIO_PRIORITY_REBUILD ?
 851                             ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE,
 852                             ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
 853                             ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
 854                 }
 855         }
 856 }
 857
 858 static void
 859 vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded)
 860 {
 861         if (faulted == vd->vdev_children) {
 862                 if (vdev_children_are_offline(vd)) {
 863                         vdev_set_state(vd, B_FALSE, VDEV_STATE_OFFLINE,
 864                             VDEV_AUX_CHILDREN_OFFLINE);
 865                 } else {
 866                         vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 867                             VDEV_AUX_NO_REPLICAS);
 868                 }
 869         } else if (degraded + faulted != 0) {
 870                 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
 871         } else {
 872                 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
 873         }
 874 }
 875
 876 /*
 877  * Return the maximum asize for a rebuild zio in the provided range.
 878  */
 879 static uint64_t
 880 vdev_mirror_rebuild_asize(vdev_t *vd, uint64_t start, uint64_t asize,
 881     uint64_t max_segment)
 882 {
 883         uint64_t psize = MIN(P2ROUNDUP(max_segment, 1 << vd->vdev_ashift),
 884             SPA_MAXBLOCKSIZE);
 885
 886         return (MIN(asize, vdev_psize_to_asize(vd, psize)));
 887 }
 888
 889 vdev_ops_t vdev_mirror_ops = {
 890         .vdev_op_init = NULL,
 891         .vdev_op_fini = NULL,
 892         .vdev_op_open = vdev_mirror_open,
 893         .vdev_op_close = vdev_mirror_close,
 894         .vdev_op_asize = vdev_default_asize,
 895         .vdev_op_min_asize = vdev_default_min_asize,
 896         .vdev_op_min_alloc = NULL,
 897         .vdev_op_io_start = vdev_mirror_io_start,
 898         .vdev_op_io_done = vdev_mirror_io_done,
 899         .vdev_op_state_change = vdev_mirror_state_change,
 900         .vdev_op_need_resilver = vdev_default_need_resilver,
 901         .vdev_op_hold = NULL,
 902         .vdev_op_rele = NULL,
 903         .vdev_op_remap = NULL,
 904         .vdev_op_xlate = vdev_default_xlate,
 905         .vdev_op_rebuild_asize = vdev_mirror_rebuild_asize,
 906         .vdev_op_metaslab_init = NULL,
 907         .vdev_op_config_generate = NULL,
 908         .vdev_op_nparity = NULL,
 909         .vdev_op_ndisks = NULL,
 910         .vdev_op_type = VDEV_TYPE_MIRROR,       /* name of this vdev type */
 911         .vdev_op_leaf = B_FALSE                 /* not a leaf vdev */
 912 };
 913
 914 vdev_ops_t vdev_replacing_ops = {
 915         .vdev_op_init = NULL,
 916         .vdev_op_fini = NULL,
 917         .vdev_op_open = vdev_mirror_open,
 918         .vdev_op_close = vdev_mirror_close,
 919         .vdev_op_asize = vdev_default_asize,
 920         .vdev_op_min_asize = vdev_default_min_asize,
 921         .vdev_op_min_alloc = NULL,
 922         .vdev_op_io_start = vdev_mirror_io_start,
 923         .vdev_op_io_done = vdev_mirror_io_done,
 924         .vdev_op_state_change = vdev_mirror_state_change,
 925         .vdev_op_need_resilver = vdev_default_need_resilver,
 926         .vdev_op_hold = NULL,
 927         .vdev_op_rele = NULL,
 928         .vdev_op_remap = NULL,
 929         .vdev_op_xlate = vdev_default_xlate,
 930         .vdev_op_rebuild_asize = vdev_mirror_rebuild_asize,
 931         .vdev_op_metaslab_init = NULL,
 932         .vdev_op_config_generate = NULL,
 933         .vdev_op_nparity = NULL,
 934         .vdev_op_ndisks = NULL,
 935         .vdev_op_type = VDEV_TYPE_REPLACING,    /* name of this vdev type */
 936         .vdev_op_leaf = B_FALSE                 /* not a leaf vdev */
 937 };
 938
 939 vdev_ops_t vdev_spare_ops = {
 940         .vdev_op_init = NULL,
 941         .vdev_op_fini = NULL,
 942         .vdev_op_open = vdev_mirror_open,
 943         .vdev_op_close = vdev_mirror_close,
 944         .vdev_op_asize = vdev_default_asize,
 945         .vdev_op_min_asize = vdev_default_min_asize,
 946         .vdev_op_min_alloc = NULL,
 947         .vdev_op_io_start = vdev_mirror_io_start,
 948         .vdev_op_io_done = vdev_mirror_io_done,
 949         .vdev_op_state_change = vdev_mirror_state_change,
 950         .vdev_op_need_resilver = vdev_default_need_resilver,
 951         .vdev_op_hold = NULL,
 952         .vdev_op_rele = NULL,
 953         .vdev_op_remap = NULL,
 954         .vdev_op_xlate = vdev_default_xlate,
 955         .vdev_op_rebuild_asize = vdev_mirror_rebuild_asize,
 956         .vdev_op_metaslab_init = NULL,
 957         .vdev_op_config_generate = NULL,
 958         .vdev_op_nparity = NULL,
 959         .vdev_op_ndisks = NULL,
 960         .vdev_op_type = VDEV_TYPE_SPARE,        /* name of this vdev type */
 961         .vdev_op_leaf = B_FALSE                 /* not a leaf vdev */
 962 };
 963
 964 /* BEGIN CSTYLED */
 965 ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_inc, INT, ZMOD_RW,
 966         "Rotating media load increment for non-seeking I/O's");
 967
 968 ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_seek_inc, INT, ZMOD_RW,
 969         "Rotating media load increment for seeking I/O's");
 970
 971 ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_seek_offset, INT, ZMOD_RW,
 972         "Offset in bytes from the last I/O which triggers "
 973         "a reduced rotating media seek increment");
 974
 975 ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, non_rotating_inc, INT, ZMOD_RW,
 976         "Non-rotating media load increment for non-seeking I/O's");
 977
 978 ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, non_rotating_seek_inc, INT, ZMOD_RW,
 979         "Non-rotating media load increment for seeking I/O's");
 980 /* END CSTYLED */