module/zfs/vdev_raidz.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or https://opensource.org/licenses/CDDL-1.0.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
  25  * Copyright (c) 2016 Gvozden Nešković. All rights reserved.
  26  */
  27
  28 #include <sys/zfs_context.h>
  29 #include <sys/spa.h>
  30 #include <sys/spa_impl.h>
  31 #include <sys/zap.h>
  32 #include <sys/vdev_impl.h>
  33 #include <sys/metaslab_impl.h>
  34 #include <sys/zio.h>
  35 #include <sys/zio_checksum.h>
  36 #include <sys/dmu_tx.h>
  37 #include <sys/abd.h>
  38 #include <sys/zfs_rlock.h>
  39 #include <sys/fs/zfs.h>
  40 #include <sys/fm/fs/zfs.h>
  41 #include <sys/vdev_raidz.h>
  42 #include <sys/vdev_raidz_impl.h>
  43 #include <sys/vdev_draid.h>
  44 #include <sys/uberblock_impl.h>
  45 #include <sys/dsl_scan.h>
  46
  47 #ifdef ZFS_DEBUG
  48 #include <sys/vdev.h>   /* For vdev_xlate() in vdev_raidz_io_verify() */
  49 #endif
  50
  51 /*
  52  * Virtual device vector for RAID-Z.
  53  *
  54  * This vdev supports single, double, and triple parity. For single parity,
  55  * we use a simple XOR of all the data columns. For double or triple parity,
  56  * we use a special case of Reed-Solomon coding. This extends the
  57  * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
  58  * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
  59  * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
  60  * former is also based. The latter is designed to provide higher performance
  61  * for writes.
  62  *
  63  * Note that the Plank paper claimed to support arbitrary N+M, but was then
  64  * amended six years later identifying a critical flaw that invalidates its
  65  * claims. Nevertheless, the technique can be adapted to work for up to
  66  * triple parity. For additional parity, the amendment "Note: Correction to
  67  * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
  68  * is viable, but the additional complexity means that write performance will
  69  * suffer.
  70  *
  71  * All of the methods above operate on a Galois field, defined over the
  72  * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements
  73  * can be expressed with a single byte. Briefly, the operations on the
  74  * field are defined as follows:
  75  *
  76  *   o addition (+) is represented by a bitwise XOR
  77  *   o subtraction (-) is therefore identical to addition: A + B = A - B
  78  *   o multiplication of A by 2 is defined by the following bitwise expression:
  79  *
  80  *      (A * 2)_7 = A_6
  81  *      (A * 2)_6 = A_5
  82  *      (A * 2)_5 = A_4
  83  *      (A * 2)_4 = A_3 + A_7
  84  *      (A * 2)_3 = A_2 + A_7
  85  *      (A * 2)_2 = A_1 + A_7
  86  *      (A * 2)_1 = A_0
  87  *      (A * 2)_0 = A_7
  88  *
  89  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
  90  * As an aside, this multiplication is derived from the error correcting
  91  * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
  92  *
  93  * Observe that any number in the field (except for 0) can be expressed as a
  94  * power of 2 -- a generator for the field. We store a table of the powers of
  95  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
  96  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
  97  * than field addition). The inverse of a field element A (A^-1) is therefore
  98  * A ^ (255 - 1) = A^254.
  99  *
 100  * The up-to-three parity columns, P, Q, R over several data columns,
 101  * D_0, ... D_n-1, can be expressed by field operations:
 102  *
 103  *      P = D_0 + D_1 + ... + D_n-2 + D_n-1
 104  *      Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
 105  *        = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
 106  *      R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
 107  *        = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
 108  *
 109  * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
 110  * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
 111  * independent coefficients. (There are no additional coefficients that have
 112  * this property which is why the uncorrected Plank method breaks down.)
 113  *
 114  * See the reconstruction code below for how P, Q and R can used individually
 115  * or in concert to recover missing data columns.
 116  */
 117
 118 #define VDEV_RAIDZ_P            0
 119 #define VDEV_RAIDZ_Q            1
 120 #define VDEV_RAIDZ_R            2
 121
 122 #define VDEV_RAIDZ_MUL_2(x)     (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
 123 #define VDEV_RAIDZ_MUL_4(x)     (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
 124
 125 /*
 126  * We provide a mechanism to perform the field multiplication operation on a
 127  * 64-bit value all at once rather than a byte at a time. This works by
 128  * creating a mask from the top bit in each byte and using that to
 129  * conditionally apply the XOR of 0x1d.
 130  */
 131 #define VDEV_RAIDZ_64MUL_2(x, mask) \
 132 { \
 133         (mask) = (x) & 0x8080808080808080ULL; \
 134         (mask) = ((mask) << 1) - ((mask) >> 7); \
 135         (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
 136             ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
 137 }
 138
 139 #define VDEV_RAIDZ_64MUL_4(x, mask) \
 140 { \
 141         VDEV_RAIDZ_64MUL_2((x), mask); \
 142         VDEV_RAIDZ_64MUL_2((x), mask); \
 143 }
 144
 145
 146 /*
 147  * Big Theory Statement for how a RAIDZ VDEV is expanded
 148  *
 149  * An existing RAIDZ VDEV can be expanded by attaching a new disk. Expansion
 150  * works with all three RAIDZ parity choices, including RAIDZ1, 2, or 3. VDEVs
 151  * that have been previously expanded can be expanded again.
 152  *
 153  * The RAIDZ VDEV must be healthy (must be able to write to all the drives in
 154  * the VDEV) when an expansion starts.  And the expansion will pause if any
 155  * disk in the VDEV fails, and resume once the VDEV is healthy again. All other
 156  * operations on the pool can continue while an expansion is in progress (e.g.
 157  * read/write, snapshot, zpool add, etc). Except zpool checkpoint, zpool trim,
 158  * and zpool initialize which can't be run during an expansion.  Following a
 159  * reboot or export/import, the expansion resumes where it left off.
 160  *
 161  * == Reflowing the Data ==
 162  *
 163  * The expansion involves reflowing (copying) the data from the current set
 164  * of disks to spread it across the new set which now has one more disk. This
 165  * reflow operation is similar to reflowing text when the column width of a
 166  * text editor window is expanded. The text doesn’t change but the location of
 167  * the text changes to accommodate the new width. An example reflow result for
 168  * a 4-wide RAIDZ1 to a 5-wide is shown below.
 169  *
 170  *                            Reflow End State
 171  *            Each letter indicates a parity group (logical stripe)
 172  *
 173  *         Before expansion                         After Expansion
 174  *     D1     D2     D3     D4               D1     D2     D3     D4     D5
 175  *  +------+------+------+------+         +------+------+------+------+------+
 176  *  |      |      |      |      |         |      |      |      |      |      |
 177  *  |  A   |  A   |  A   |  A   |         |  A   |  A   |  A   |  A   |  B   |
 178  *  |     1|     2|     3|     4|         |     1|     2|     3|     4|     5|
 179  *  +------+------+------+------+         +------+------+------+------+------+
 180  *  |      |      |      |      |         |      |      |      |      |      |
 181  *  |  B   |  B   |  C   |  C   |         |  B   |  C   |  C   |  C   |  C   |
 182  *  |     5|     6|     7|     8|         |     6|     7|     8|     9|    10|
 183  *  +------+------+------+------+         +------+------+------+------+------+
 184  *  |      |      |      |      |         |      |      |      |      |      |
 185  *  |  C   |  C   |  D   |  D   |         |  D   |  D   |  E   |  E   |  E   |
 186  *  |     9|    10|    11|    12|         |    11|    12|    13|    14|    15|
 187  *  +------+------+------+------+         +------+------+------+------+------+
 188  *  |      |      |      |      |         |      |      |      |      |      |
 189  *  |  E   |  E   |  E   |  E   |   -->   |  E   |  F   |  F   |  G   |  G   |
 190  *  |    13|    14|    15|    16|         |    16|    17|    18|p   19|    20|
 191  *  +------+------+------+------+         +------+------+------+------+------+
 192  *  |      |      |      |      |         |      |      |      |      |      |
 193  *  |  F   |  F   |  G   |  G   |         |  G   |  G   |  H   |  H   |  H   |
 194  *  |    17|    18|    19|    20|         |    21|    22|    23|    24|    25|
 195  *  +------+------+------+------+         +------+------+------+------+------+
 196  *  |      |      |      |      |         |      |      |      |      |      |
 197  *  |  G   |  G   |  H   |  H   |         |  H   |  I   |  I   |  J   |  J   |
 198  *  |    21|    22|    23|    24|         |    26|    27|    28|    29|    30|
 199  *  +------+------+------+------+         +------+------+------+------+------+
 200  *  |      |      |      |      |         |      |      |      |      |      |
 201  *  |  H   |  H   |  I   |  I   |         |  J   |  J   |      |      |  K   |
 202  *  |    25|    26|    27|    28|         |    31|    32|    33|    34|    35|
 203  *  +------+------+------+------+         +------+------+------+------+------+
 204  *
 205  * This reflow approach has several advantages. There is no need to read or
 206  * modify the block pointers or recompute any block checksums.  The reflow
 207  * doesn’t need to know where the parity sectors reside. We can read and write
 208  * data sequentially and the copy can occur in a background thread in open
 209  * context. The design also allows for fast discovery of what data to copy.
 210  *
 211  * The VDEV metaslabs are processed, one at a time, to copy the block data to
 212  * have it flow across all the disks. The metaslab is disabled for allocations
 213  * during the copy. As an optimization, we only copy the allocated data which
 214  * can be determined by looking at the metaslab range tree. During the copy we
 215  * must maintain the redundancy guarantees of the RAIDZ VDEV (i.e., we still
 216  * need to be able to survive losing parity count disks).  This means we
 217  * cannot overwrite data during the reflow that would be needed if a disk is
 218  * lost.
 219  *
 220  * After the reflow completes, all newly-written blocks will have the new
 221  * layout, i.e., they will have the parity to data ratio implied by the new
 222  * number of disks in the RAIDZ group.  Even though the reflow copies all of
 223  * the allocated space (data and parity), it is only rearranged, not changed.
 224  *
 225  * This act of reflowing the data has a few implications about blocks
 226  * that were written before the reflow completes:
 227  *
 228  *  - Old blocks will still use the same amount of space (i.e., they will have
 229  *    the parity to data ratio implied by the old number of disks in the RAIDZ
 230  *    group).
 231  *  - Reading old blocks will be slightly slower than before the reflow, for
 232  *    two reasons. First, we will have to read from all disks in the RAIDZ
 233  *    VDEV, rather than being able to skip the children that contain only
 234  *    parity of this block (because the data of a single block is now spread
 235  *    out across all the disks).  Second, in most cases there will be an extra
 236  *    bcopy, needed to rearrange the data back to its original layout in memory.
 237  *
 238  * == Scratch Area ==
 239  *
 240  * As we copy the block data, we can only progress to the point that writes
 241  * will not overlap with blocks whose progress has not yet been recorded on
 242  * disk.  Since partially-copied rows are always read from the old location,
 243  * we need to stop one row before the sector-wise overlap, to prevent any
 244  * row-wise overlap. For example, in the diagram above, when we reflow sector
 245  * B6 it will overwite the original location for B5.
 246  *
 247  * To get around this, a scratch space is used so that we can start copying
 248  * without risking data loss by overlapping the row. As an added benefit, it
 249  * improves performance at the beginning of the reflow, but that small perf
 250  * boost wouldn't be worth the complexity on its own.
 251  *
 252  * Ideally we want to copy at least 2 * (new_width)^2 so that we have a
 253  * separation of 2*(new_width+1) and a chunk size of new_width+2. With the max
 254  * RAIDZ width of 255 and 4K sectors this would be 2MB per disk. In practice
 255  * the widths will likely be single digits so we can get a substantial chuck
 256  * size using only a few MB of scratch per disk.
 257  *
 258  * The scratch area is persisted to disk which holds a large amount of reflowed
 259  * state. We can always read the partially written stripes when a disk fails or
 260  * the copy is interrupted (crash) during the initial copying phase and also
 261  * get past a small chunk size restriction.  At a minimum, the scratch space
 262  * must be large enough to get us to the point that one row does not overlap
 263  * itself when moved (i.e new_width^2).  But going larger is even better. We
 264  * use the 3.5 MiB reserved "boot" space that resides after the ZFS disk labels
 265  * as our scratch space to handle overwriting the initial part of the VDEV.
 266  *
 267  *      0     256K   512K                    4M
 268  *      +------+------+-----------------------+-----------------------------
 269  *      | VDEV | VDEV |   Boot Block (3.5M)   |  Allocatable space ...
 270  *      |  L0  |  L1  |       Reserved        |     (Metaslabs)
 271  *      +------+------+-----------------------+-------------------------------
 272  *                        Scratch Area
 273  *
 274  * == Reflow Progress Updates ==
 275  * After the initial scratch-based reflow, the expansion process works
 276  * similarly to device removal. We create a new open context thread which
 277  * reflows the data, and periodically kicks off sync tasks to update logical
 278  * state. In this case, state is the committed progress (offset of next data
 279  * to copy). We need to persist the completed offset on disk, so that if we
 280  * crash we know which format each VDEV offset is in.
 281  *
 282  * == Time Dependent Geometry ==
 283  *
 284  * In non-expanded RAIDZ, blocks are read from disk in a column by column
 285  * fashion. For a multi-row block, the second sector is in the first column
 286  * not in the second column. This allows us to issue full reads for each
 287  * column directly into the request buffer. The block data is thus laid out
 288  * sequentially in a column-by-column fashion.
 289  *
 290  * For example, in the before expansion diagram above, one logical block might
 291  * be sectors G19-H26. The parity is in G19,H23; and the data is in
 292  * G20,H24,G21,H25,G22,H26.
 293  *
 294  * After a block is reflowed, the sectors that were all in the original column
 295  * data can now reside in different columns. When reading from an expanded
 296  * VDEV, we need to know the logical stripe width for each block so we can
 297  * reconstitute the block’s data after the reads are completed. Likewise,
 298  * when we perform the combinatorial reconstruction we need to know the
 299  * original width so we can retry combinations from the past layouts.
 300  *
 301  * Time dependent geometry is what we call having blocks with different layouts
 302  * (stripe widths) in the same VDEV. This time-dependent geometry uses the
 303  * block’s birth time (+ the time expansion ended) to establish the correct
 304  * width for a given block. After an expansion completes, we record the time
 305  * for blocks written with a particular width (geometry).
 306  *
 307  * == On Disk Format Changes ==
 308  *
 309  * New pool feature flag, 'raidz_expansion' whose reference count is the number
 310  * of RAIDZ VDEVs that have been expanded.
 311  *
 312  * The blocks on expanded RAIDZ VDEV can have different logical stripe widths.
 313  *
 314  * Since the uberblock can point to arbitrary blocks, which might be on the
 315  * expanding RAIDZ, and might or might not have been expanded. We need to know
 316  * which way a block is laid out before reading it. This info is the next
 317  * offset that needs to be reflowed and we persist that in the uberblock, in
 318  * the new ub_raidz_reflow_info field, as opposed to the MOS or the vdev label.
 319  * After the expansion is complete, we then use the raidz_expand_txgs array
 320  * (see below) to determine how to read a block and the ub_raidz_reflow_info
 321  * field no longer required.
 322  *
 323  * The uberblock's ub_raidz_reflow_info field also holds the scratch space
 324  * state (i.e., active or not) which is also required before reading a block
 325  * during the initial phase of reflowing the data.
 326  *
 327  * The top-level RAIDZ VDEV has two new entries in the nvlist:
 328  *
 329  * 'raidz_expand_txgs' array: logical stripe widths by txg are recorded here
 330  *                            and used after the expansion is complete to
 331  *                            determine how to read a raidz block
 332  * 'raidz_expanding' boolean: present during reflow and removed after completion
 333  *                            used during a spa import to resume an unfinished
 334  *                            expansion
 335  *
 336  * And finally the VDEVs top zap adds the following informational entries:
 337  *   VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE
 338  *   VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME
 339  *   VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME
 340  *   VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED
 341  */
 342
 343 /*
 344  * For testing only: pause the raidz expansion after reflowing this amount.
 345  * (accessed by ZTS and ztest)
 346  */
 347 #ifdef  _KERNEL
 348 static
 349 #endif  /* _KERNEL */
 350 unsigned long raidz_expand_max_reflow_bytes = 0;
 351
 352 /*
 353  * For testing only: pause the raidz expansion at a certain point.
 354  */
 355 uint_t raidz_expand_pause_point = 0;
 356
 357 /*
 358  * Maximum amount of copy io's outstanding at once.
 359  */
 360 #ifdef _ILP32
 361 static unsigned long raidz_expand_max_copy_bytes = SPA_MAXBLOCKSIZE;
 362 #else
 363 static unsigned long raidz_expand_max_copy_bytes = 10 * SPA_MAXBLOCKSIZE;
 364 #endif
 365
 366 /*
 367  * Apply raidz map abds aggregation if the number of rows in the map is equal
 368  * or greater than the value below.
 369  */
 370 static unsigned long raidz_io_aggregate_rows = 4;
 371
 372 /*
 373  * Automatically start a pool scrub when a RAIDZ expansion completes in
 374  * order to verify the checksums of all blocks which have been copied
 375  * during the expansion.  Automatic scrubbing is enabled by default and
 376  * is strongly recommended.
 377  */
 378 static int zfs_scrub_after_expand = 1;
 379
 380 static void
 381 vdev_raidz_row_free(raidz_row_t *rr)
 382 {
 383         for (int c = 0; c < rr->rr_cols; c++) {
 384                 raidz_col_t *rc = &rr->rr_col[c];
 385
 386                 if (rc->rc_size != 0)
 387                         abd_free(rc->rc_abd);
 388                 if (rc->rc_orig_data != NULL)
 389                         abd_free(rc->rc_orig_data);
 390         }
 391
 392         if (rr->rr_abd_empty != NULL)
 393                 abd_free(rr->rr_abd_empty);
 394
 395         kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_scols]));
 396 }
 397
 398 void
 399 vdev_raidz_map_free(raidz_map_t *rm)
 400 {
 401         for (int i = 0; i < rm->rm_nrows; i++)
 402                 vdev_raidz_row_free(rm->rm_row[i]);
 403
 404         if (rm->rm_nphys_cols) {
 405                 for (int i = 0; i < rm->rm_nphys_cols; i++) {
 406                         if (rm->rm_phys_col[i].rc_abd != NULL)
 407                                 abd_free(rm->rm_phys_col[i].rc_abd);
 408                 }
 409
 410                 kmem_free(rm->rm_phys_col, sizeof (raidz_col_t) *
 411                     rm->rm_nphys_cols);
 412         }
 413
 414         ASSERT3P(rm->rm_lr, ==, NULL);
 415         kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows]));
 416 }
 417
 418 static void
 419 vdev_raidz_map_free_vsd(zio_t *zio)
 420 {
 421         raidz_map_t *rm = zio->io_vsd;
 422
 423         vdev_raidz_map_free(rm);
 424 }
 425
 426 static int
 427 vdev_raidz_reflow_compare(const void *x1, const void *x2)
 428 {
 429         const reflow_node_t *l = x1;
 430         const reflow_node_t *r = x2;
 431
 432         return (TREE_CMP(l->re_txg, r->re_txg));
 433 }
 434
 435 const zio_vsd_ops_t vdev_raidz_vsd_ops = {
 436         .vsd_free = vdev_raidz_map_free_vsd,
 437 };
 438
 439 raidz_row_t *
 440 vdev_raidz_row_alloc(int cols, zio_t *zio)
 441 {
 442         raidz_row_t *rr =
 443             kmem_zalloc(offsetof(raidz_row_t, rr_col[cols]), KM_SLEEP);
 444
 445         rr->rr_cols = cols;
 446         rr->rr_scols = cols;
 447
 448         for (int c = 0; c < cols; c++) {
 449                 raidz_col_t *rc = &rr->rr_col[c];
 450                 rc->rc_shadow_devidx = INT_MAX;
 451                 rc->rc_shadow_offset = UINT64_MAX;
 452                 /*
 453                  * We can not allow self healing to take place for Direct I/O
 454                  * reads. There is nothing that stops the buffer contents from
 455                  * being manipulated while the I/O is in flight. It is possible
 456                  * that the checksum could be verified on the buffer and then
 457                  * the contents of that buffer are manipulated afterwards. This
 458                  * could lead to bad data being written out during self
 459                  * healing.
 460                  */
 461                 if (!(zio->io_flags & ZIO_FLAG_DIO_READ))
 462                         rc->rc_allow_repair = 1;
 463         }
 464         return (rr);
 465 }
 466
 467 static void
 468 vdev_raidz_map_alloc_write(zio_t *zio, raidz_map_t *rm, uint64_t ashift)
 469 {
 470         int c;
 471         int nwrapped = 0;
 472         uint64_t off = 0;
 473         raidz_row_t *rr = rm->rm_row[0];
 474
 475         ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
 476         ASSERT3U(rm->rm_nrows, ==, 1);
 477
 478         /*
 479          * Pad any parity columns with additional space to account for skip
 480          * sectors.
 481          */
 482         if (rm->rm_skipstart < rr->rr_firstdatacol) {
 483                 ASSERT0(rm->rm_skipstart);
 484                 nwrapped = rm->rm_nskip;
 485         } else if (rr->rr_scols < (rm->rm_skipstart + rm->rm_nskip)) {
 486                 nwrapped =
 487                     (rm->rm_skipstart + rm->rm_nskip) % rr->rr_scols;
 488         }
 489
 490         /*
 491          * Optional single skip sectors (rc_size == 0) will be handled in
 492          * vdev_raidz_io_start_write().
 493          */
 494         int skipped = rr->rr_scols - rr->rr_cols;
 495
 496         /* Allocate buffers for the parity columns */
 497         for (c = 0; c < rr->rr_firstdatacol; c++) {
 498                 raidz_col_t *rc = &rr->rr_col[c];
 499
 500                 /*
 501                  * Parity columns will pad out a linear ABD to account for
 502                  * the skip sector. A linear ABD is used here because
 503                  * parity calculations use the ABD buffer directly to calculate
 504                  * parity. This avoids doing a memcpy back to the ABD after the
 505                  * parity has been calculated. By issuing the parity column
 506                  * with the skip sector we can reduce contention on the child
 507                  * VDEV queue locks (vq_lock).
 508                  */
 509                 if (c < nwrapped) {
 510                         rc->rc_abd = abd_alloc_linear(
 511                             rc->rc_size + (1ULL << ashift), B_FALSE);
 512                         abd_zero_off(rc->rc_abd, rc->rc_size, 1ULL << ashift);
 513                         skipped++;
 514                 } else {
 515                         rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
 516                 }
 517         }
 518
 519         for (off = 0; c < rr->rr_cols; c++) {
 520                 raidz_col_t *rc = &rr->rr_col[c];
 521                 abd_t *abd = abd_get_offset_struct(&rc->rc_abdstruct,
 522                     zio->io_abd, off, rc->rc_size);
 523
 524                 /*
 525                  * Generate I/O for skip sectors to improve aggregation
 526                  * continuity. We will use gang ABD's to reduce contention
 527                  * on the child VDEV queue locks (vq_lock) by issuing
 528                  * a single I/O that contains the data and skip sector.
 529                  *
 530                  * It is important to make sure that rc_size is not updated
 531                  * even though we are adding a skip sector to the ABD. When
 532                  * calculating the parity in vdev_raidz_generate_parity_row()
 533                  * the rc_size is used to iterate through the ABD's. We can
 534                  * not have zero'd out skip sectors used for calculating
 535                  * parity for raidz, because those same sectors are not used
 536                  * during reconstruction.
 537                  */
 538                 if (c >= rm->rm_skipstart && skipped < rm->rm_nskip) {
 539                         rc->rc_abd = abd_alloc_gang();
 540                         abd_gang_add(rc->rc_abd, abd, B_TRUE);
 541                         abd_gang_add(rc->rc_abd,
 542                             abd_get_zeros(1ULL << ashift), B_TRUE);
 543                         skipped++;
 544                 } else {
 545                         rc->rc_abd = abd;
 546                 }
 547                 off += rc->rc_size;
 548         }
 549
 550         ASSERT3U(off, ==, zio->io_size);
 551         ASSERT3S(skipped, ==, rm->rm_nskip);
 552 }
 553
 554 static void
 555 vdev_raidz_map_alloc_read(zio_t *zio, raidz_map_t *rm)
 556 {
 557         int c;
 558         raidz_row_t *rr = rm->rm_row[0];
 559
 560         ASSERT3U(rm->rm_nrows, ==, 1);
 561
 562         /* Allocate buffers for the parity columns */
 563         for (c = 0; c < rr->rr_firstdatacol; c++)
 564                 rr->rr_col[c].rc_abd =
 565                     abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE);
 566
 567         for (uint64_t off = 0; c < rr->rr_cols; c++) {
 568                 raidz_col_t *rc = &rr->rr_col[c];
 569                 rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
 570                     zio->io_abd, off, rc->rc_size);
 571                 off += rc->rc_size;
 572         }
 573 }
 574
 575 /*
 576  * Divides the IO evenly across all child vdevs; usually, dcols is
 577  * the number of children in the target vdev.
 578  *
 579  * Avoid inlining the function to keep vdev_raidz_io_start(), which
 580  * is this functions only caller, as small as possible on the stack.
 581  */
 582 noinline raidz_map_t *
 583 vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
 584     uint64_t nparity)
 585 {
 586         raidz_row_t *rr;
 587         /* The starting RAIDZ (parent) vdev sector of the block. */
 588         uint64_t b = zio->io_offset >> ashift;
 589         /* The zio's size in units of the vdev's minimum sector size. */
 590         uint64_t s = zio->io_size >> ashift;
 591         /* The first column for this stripe. */
 592         uint64_t f = b % dcols;
 593         /* The starting byte offset on each child vdev. */
 594         uint64_t o = (b / dcols) << ashift;
 595         uint64_t acols, scols;
 596
 597         raidz_map_t *rm =
 598             kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP);
 599         rm->rm_nrows = 1;
 600
 601         /*
 602          * "Quotient": The number of data sectors for this stripe on all but
 603          * the "big column" child vdevs that also contain "remainder" data.
 604          */
 605         uint64_t q = s / (dcols - nparity);
 606
 607         /*
 608          * "Remainder": The number of partial stripe data sectors in this I/O.
 609          * This will add a sector to some, but not all, child vdevs.
 610          */
 611         uint64_t r = s - q * (dcols - nparity);
 612
 613         /* The number of "big columns" - those which contain remainder data. */
 614         uint64_t bc = (r == 0 ? 0 : r + nparity);
 615
 616         /*
 617          * The total number of data and parity sectors associated with
 618          * this I/O.
 619          */
 620         uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));
 621
 622         /*
 623          * acols: The columns that will be accessed.
 624          * scols: The columns that will be accessed or skipped.
 625          */
 626         if (q == 0) {
 627                 /* Our I/O request doesn't span all child vdevs. */
 628                 acols = bc;
 629                 scols = MIN(dcols, roundup(bc, nparity + 1));
 630         } else {
 631                 acols = dcols;
 632                 scols = dcols;
 633         }
 634
 635         ASSERT3U(acols, <=, scols);
 636         rr = vdev_raidz_row_alloc(scols, zio);
 637         rm->rm_row[0] = rr;
 638         rr->rr_cols = acols;
 639         rr->rr_bigcols = bc;
 640         rr->rr_firstdatacol = nparity;
 641 #ifdef ZFS_DEBUG
 642         rr->rr_offset = zio->io_offset;
 643         rr->rr_size = zio->io_size;
 644 #endif
 645
 646         uint64_t asize = 0;
 647
 648         for (uint64_t c = 0; c < scols; c++) {
 649                 raidz_col_t *rc = &rr->rr_col[c];
 650                 uint64_t col = f + c;
 651                 uint64_t coff = o;
 652                 if (col >= dcols) {
 653                         col -= dcols;
 654                         coff += 1ULL << ashift;
 655                 }
 656                 rc->rc_devidx = col;
 657                 rc->rc_offset = coff;
 658
 659                 if (c >= acols)
 660                         rc->rc_size = 0;
 661                 else if (c < bc)
 662                         rc->rc_size = (q + 1) << ashift;
 663                 else
 664                         rc->rc_size = q << ashift;
 665
 666                 asize += rc->rc_size;
 667         }
 668
 669         ASSERT3U(asize, ==, tot << ashift);
 670         rm->rm_nskip = roundup(tot, nparity + 1) - tot;
 671         rm->rm_skipstart = bc;
 672
 673         /*
 674          * If all data stored spans all columns, there's a danger that parity
 675          * will always be on the same device and, since parity isn't read
 676          * during normal operation, that device's I/O bandwidth won't be
 677          * used effectively. We therefore switch the parity every 1MB.
 678          *
 679          * ... at least that was, ostensibly, the theory. As a practical
 680          * matter unless we juggle the parity between all devices evenly, we
 681          * won't see any benefit. Further, occasional writes that aren't a
 682          * multiple of the LCM of the number of children and the minimum
 683          * stripe width are sufficient to avoid pessimal behavior.
 684          * Unfortunately, this decision created an implicit on-disk format
 685          * requirement that we need to support for all eternity, but only
 686          * for single-parity RAID-Z.
 687          *
 688          * If we intend to skip a sector in the zeroth column for padding
 689          * we must make sure to note this swap. We will never intend to
 690          * skip the first column since at least one data and one parity
 691          * column must appear in each row.
 692          */
 693         ASSERT(rr->rr_cols >= 2);
 694         ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
 695
 696         if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
 697                 uint64_t devidx = rr->rr_col[0].rc_devidx;
 698                 o = rr->rr_col[0].rc_offset;
 699                 rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
 700                 rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
 701                 rr->rr_col[1].rc_devidx = devidx;
 702                 rr->rr_col[1].rc_offset = o;
 703                 if (rm->rm_skipstart == 0)
 704                         rm->rm_skipstart = 1;
 705         }
 706
 707         if (zio->io_type == ZIO_TYPE_WRITE) {
 708                 vdev_raidz_map_alloc_write(zio, rm, ashift);
 709         } else {
 710                 vdev_raidz_map_alloc_read(zio, rm);
 711         }
 712         /* init RAIDZ parity ops */
 713         rm->rm_ops = vdev_raidz_math_get_ops();
 714
 715         return (rm);
 716 }
 717
 718 /*
 719  * Everything before reflow_offset_synced should have been moved to the new
 720  * location (read and write completed).  However, this may not yet be reflected
 721  * in the on-disk format (e.g. raidz_reflow_sync() has been called but the
 722  * uberblock has not yet been written). If reflow is not in progress,
 723  * reflow_offset_synced should be UINT64_MAX. For each row, if the row is
 724  * entirely before reflow_offset_synced, it will come from the new location.
 725  * Otherwise this row will come from the old location.  Therefore, rows that
 726  * straddle the reflow_offset_synced will come from the old location.
 727  *
 728  * For writes, reflow_offset_next is the next offset to copy.  If a sector has
 729  * been copied, but not yet reflected in the on-disk progress
 730  * (reflow_offset_synced), it will also be written to the new (already copied)
 731  * offset.
 732  */
 733 noinline raidz_map_t *
 734 vdev_raidz_map_alloc_expanded(zio_t *zio,
 735     uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols,
 736     uint64_t nparity, uint64_t reflow_offset_synced,
 737     uint64_t reflow_offset_next, boolean_t use_scratch)
 738 {
 739         abd_t *abd = zio->io_abd;
 740         uint64_t offset = zio->io_offset;
 741         uint64_t size = zio->io_size;
 742
 743         /* The zio's size in units of the vdev's minimum sector size. */
 744         uint64_t s = size >> ashift;
 745
 746         /*
 747          * "Quotient": The number of data sectors for this stripe on all but
 748          * the "big column" child vdevs that also contain "remainder" data.
 749          * AKA "full rows"
 750          */
 751         uint64_t q = s / (logical_cols - nparity);
 752
 753         /*
 754          * "Remainder": The number of partial stripe data sectors in this I/O.
 755          * This will add a sector to some, but not all, child vdevs.
 756          */
 757         uint64_t r = s - q * (logical_cols - nparity);
 758
 759         /* The number of "big columns" - those which contain remainder data. */
 760         uint64_t bc = (r == 0 ? 0 : r + nparity);
 761
 762         /*
 763          * The total number of data and parity sectors associated with
 764          * this I/O.
 765          */
 766         uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));
 767
 768         /* How many rows contain data (not skip) */
 769         uint64_t rows = howmany(tot, logical_cols);
 770         int cols = MIN(tot, logical_cols);
 771
 772         raidz_map_t *rm =
 773             kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]),
 774             KM_SLEEP);
 775         rm->rm_nrows = rows;
 776         rm->rm_nskip = roundup(tot, nparity + 1) - tot;
 777         rm->rm_skipstart = bc;
 778         uint64_t asize = 0;
 779
 780         for (uint64_t row = 0; row < rows; row++) {
 781                 boolean_t row_use_scratch = B_FALSE;
 782                 raidz_row_t *rr = vdev_raidz_row_alloc(cols, zio);
 783                 rm->rm_row[row] = rr;
 784
 785                 /* The starting RAIDZ (parent) vdev sector of the row. */
 786                 uint64_t b = (offset >> ashift) + row * logical_cols;
 787
 788                 /*
 789                  * If we are in the middle of a reflow, and the copying has
 790                  * not yet completed for any part of this row, then use the
 791                  * old location of this row.  Note that reflow_offset_synced
 792                  * reflects the i/o that's been completed, because it's
 793                  * updated by a synctask, after zio_wait(spa_txg_zio[]).
 794                  * This is sufficient for our check, even if that progress
 795                  * has not yet been recorded to disk (reflected in
 796                  * spa_ubsync).  Also note that we consider the last row to
 797                  * be "full width" (`cols`-wide rather than `bc`-wide) for
 798                  * this calculation. This causes a tiny bit of unnecessary
 799                  * double-writes but is safe and simpler to calculate.
 800                  */
 801                 int row_phys_cols = physical_cols;
 802                 if (b + cols > reflow_offset_synced >> ashift)
 803                         row_phys_cols--;
 804                 else if (use_scratch)
 805                         row_use_scratch = B_TRUE;
 806
 807                 /* starting child of this row */
 808                 uint64_t child_id = b % row_phys_cols;
 809                 /* The starting byte offset on each child vdev. */
 810                 uint64_t child_offset = (b / row_phys_cols) << ashift;
 811
 812                 /*
 813                  * Note, rr_cols is the entire width of the block, even
 814                  * if this row is shorter.  This is needed because parity
 815                  * generation (for Q and R) needs to know the entire width,
 816                  * because it treats the short row as though it was
 817                  * full-width (and the "phantom" sectors were zero-filled).
 818                  *
 819                  * Another approach to this would be to set cols shorter
 820                  * (to just the number of columns that we might do i/o to)
 821                  * and have another mechanism to tell the parity generation
 822                  * about the "entire width".  Reconstruction (at least
 823                  * vdev_raidz_reconstruct_general()) would also need to
 824                  * know about the "entire width".
 825                  */
 826                 rr->rr_firstdatacol = nparity;
 827 #ifdef ZFS_DEBUG
 828                 /*
 829                  * note: rr_size is PSIZE, not ASIZE
 830                  */
 831                 rr->rr_offset = b << ashift;
 832                 rr->rr_size = (rr->rr_cols - rr->rr_firstdatacol) << ashift;
 833 #endif
 834
 835                 for (int c = 0; c < rr->rr_cols; c++, child_id++) {
 836                         if (child_id >= row_phys_cols) {
 837                                 child_id -= row_phys_cols;
 838                                 child_offset += 1ULL << ashift;
 839                         }
 840                         raidz_col_t *rc = &rr->rr_col[c];
 841                         rc->rc_devidx = child_id;
 842                         rc->rc_offset = child_offset;
 843
 844                         /*
 845                          * Get this from the scratch space if appropriate.
 846                          * This only happens if we crashed in the middle of
 847                          * raidz_reflow_scratch_sync() (while it's running,
 848                          * the rangelock prevents us from doing concurrent
 849                          * io), and even then only during zpool import or
 850                          * when the pool is imported readonly.
 851                          */
 852                         if (row_use_scratch)
 853                                 rc->rc_offset -= VDEV_BOOT_SIZE;
 854
 855                         uint64_t dc = c - rr->rr_firstdatacol;
 856                         if (c < rr->rr_firstdatacol) {
 857                                 rc->rc_size = 1ULL << ashift;
 858
 859                                 /*
 860                                  * Parity sectors' rc_abd's are set below
 861                                  * after determining if this is an aggregation.
 862                                  */
 863                         } else if (row == rows - 1 && bc != 0 && c >= bc) {
 864                                 /*
 865                                  * Past the end of the block (even including
 866                                  * skip sectors).  This sector is part of the
 867                                  * map so that we have full rows for p/q parity
 868                                  * generation.
 869                                  */
 870                                 rc->rc_size = 0;
 871                                 rc->rc_abd = NULL;
 872                         } else {
 873                                 /* "data column" (col excluding parity) */
 874                                 uint64_t off;
 875
 876                                 if (c < bc || r == 0) {
 877                                         off = dc * rows + row;
 878                                 } else {
 879                                         off = r * rows +
 880                                             (dc - r) * (rows - 1) + row;
 881                                 }
 882                                 rc->rc_size = 1ULL << ashift;
 883                                 rc->rc_abd = abd_get_offset_struct(
 884                                     &rc->rc_abdstruct, abd, off << ashift,
 885                                     rc->rc_size);
 886                         }
 887
 888                         if (rc->rc_size == 0)
 889                                 continue;
 890
 891                         /*
 892                          * If any part of this row is in both old and new
 893                          * locations, the primary location is the old
 894                          * location. If this sector was already copied to the
 895                          * new location, we need to also write to the new,
 896                          * "shadow" location.
 897                          *
 898                          * Note, `row_phys_cols != physical_cols` indicates
 899                          * that the primary location is the old location.
 900                          * `b+c < reflow_offset_next` indicates that the copy
 901                          * to the new location has been initiated. We know
 902                          * that the copy has completed because we have the
 903                          * rangelock, which is held exclusively while the
 904                          * copy is in progress.
 905                          */
 906                         if (row_use_scratch ||
 907                             (row_phys_cols != physical_cols &&
 908                             b + c < reflow_offset_next >> ashift)) {
 909                                 rc->rc_shadow_devidx = (b + c) % physical_cols;
 910                                 rc->rc_shadow_offset =
 911                                     ((b + c) / physical_cols) << ashift;
 912                                 if (row_use_scratch)
 913                                         rc->rc_shadow_offset -= VDEV_BOOT_SIZE;
 914                         }
 915
 916                         asize += rc->rc_size;
 917                 }
 918
 919                 /*
 920                  * See comment in vdev_raidz_map_alloc()
 921                  */
 922                 if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 &&
 923                     (offset & (1ULL << 20))) {
 924                         ASSERT(rr->rr_cols >= 2);
 925                         ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
 926
 927                         int devidx0 = rr->rr_col[0].rc_devidx;
 928                         uint64_t offset0 = rr->rr_col[0].rc_offset;
 929                         int shadow_devidx0 = rr->rr_col[0].rc_shadow_devidx;
 930                         uint64_t shadow_offset0 =
 931                             rr->rr_col[0].rc_shadow_offset;
 932
 933                         rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
 934                         rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
 935                         rr->rr_col[0].rc_shadow_devidx =
 936                             rr->rr_col[1].rc_shadow_devidx;
 937                         rr->rr_col[0].rc_shadow_offset =
 938                             rr->rr_col[1].rc_shadow_offset;
 939
 940                         rr->rr_col[1].rc_devidx = devidx0;
 941                         rr->rr_col[1].rc_offset = offset0;
 942                         rr->rr_col[1].rc_shadow_devidx = shadow_devidx0;
 943                         rr->rr_col[1].rc_shadow_offset = shadow_offset0;
 944                 }
 945         }
 946         ASSERT3U(asize, ==, tot << ashift);
 947
 948         /*
 949          * Determine if the block is contiguous, in which case we can use
 950          * an aggregation.
 951          */
 952         if (rows >= raidz_io_aggregate_rows) {
 953                 rm->rm_nphys_cols = physical_cols;
 954                 rm->rm_phys_col =
 955                     kmem_zalloc(sizeof (raidz_col_t) * rm->rm_nphys_cols,
 956                     KM_SLEEP);
 957
 958                 /*
 959                  * Determine the aggregate io's offset and size, and check
 960                  * that the io is contiguous.
 961                  */
 962                 for (int i = 0;
 963                     i < rm->rm_nrows && rm->rm_phys_col != NULL; i++) {
 964                         raidz_row_t *rr = rm->rm_row[i];
 965                         for (int c = 0; c < rr->rr_cols; c++) {
 966                                 raidz_col_t *rc = &rr->rr_col[c];
 967                                 raidz_col_t *prc =
 968                                     &rm->rm_phys_col[rc->rc_devidx];
 969
 970                                 if (rc->rc_size == 0)
 971                                         continue;
 972
 973                                 if (prc->rc_size == 0) {
 974                                         ASSERT0(prc->rc_offset);
 975                                         prc->rc_offset = rc->rc_offset;
 976                                 } else if (prc->rc_offset + prc->rc_size !=
 977                                     rc->rc_offset) {
 978                                         /*
 979                                          * This block is not contiguous and
 980                                          * therefore can't be aggregated.
 981                                          * This is expected to be rare, so
 982                                          * the cost of allocating and then
 983                                          * freeing rm_phys_col is not
 984                                          * significant.
 985                                          */
 986                                         kmem_free(rm->rm_phys_col,
 987                                             sizeof (raidz_col_t) *
 988                                             rm->rm_nphys_cols);
 989                                         rm->rm_phys_col = NULL;
 990                                         rm->rm_nphys_cols = 0;
 991                                         break;
 992                                 }
 993                                 prc->rc_size += rc->rc_size;
 994                         }
 995                 }
 996         }
 997         if (rm->rm_phys_col != NULL) {
 998                 /*
 999                  * Allocate aggregate ABD's.
1000                  */
1001                 for (int i = 0; i < rm->rm_nphys_cols; i++) {
1002                         raidz_col_t *prc = &rm->rm_phys_col[i];
1003
1004                         prc->rc_devidx = i;
1005
1006                         if (prc->rc_size == 0)
1007                                 continue;
1008
1009                         prc->rc_abd =
1010                             abd_alloc_linear(rm->rm_phys_col[i].rc_size,
1011                             B_FALSE);
1012                 }
1013
1014                 /*
1015                  * Point the parity abd's into the aggregate abd's.
1016                  */
1017                 for (int i = 0; i < rm->rm_nrows; i++) {
1018                         raidz_row_t *rr = rm->rm_row[i];
1019                         for (int c = 0; c < rr->rr_firstdatacol; c++) {
1020                                 raidz_col_t *rc = &rr->rr_col[c];
1021                                 raidz_col_t *prc =
1022                                     &rm->rm_phys_col[rc->rc_devidx];
1023                                 rc->rc_abd =
1024                                     abd_get_offset_struct(&rc->rc_abdstruct,
1025                                     prc->rc_abd,
1026                                     rc->rc_offset - prc->rc_offset,
1027                                     rc->rc_size);
1028                         }
1029                 }
1030         } else {
1031                 /*
1032                  * Allocate new abd's for the parity sectors.
1033                  */
1034                 for (int i = 0; i < rm->rm_nrows; i++) {
1035                         raidz_row_t *rr = rm->rm_row[i];
1036                         for (int c = 0; c < rr->rr_firstdatacol; c++) {
1037                                 raidz_col_t *rc = &rr->rr_col[c];
1038                                 rc->rc_abd =
1039                                     abd_alloc_linear(rc->rc_size,
1040                                     B_TRUE);
1041                         }
1042                 }
1043         }
1044         /* init RAIDZ parity ops */
1045         rm->rm_ops = vdev_raidz_math_get_ops();
1046
1047         return (rm);
1048 }
1049
1050 struct pqr_struct {
1051         uint64_t *p;
1052         uint64_t *q;
1053         uint64_t *r;
1054 };
1055
1056 static int
1057 vdev_raidz_p_func(void *buf, size_t size, void *private)
1058 {
1059         struct pqr_struct *pqr = private;
1060         const uint64_t *src = buf;
1061         int cnt = size / sizeof (src[0]);
1062
1063         ASSERT(pqr->p && !pqr->q && !pqr->r);
1064
1065         for (int i = 0; i < cnt; i++, src++, pqr->p++)
1066                 *pqr->p ^= *src;
1067
1068         return (0);
1069 }
1070
1071 static int
1072 vdev_raidz_pq_func(void *buf, size_t size, void *private)
1073 {
1074         struct pqr_struct *pqr = private;
1075         const uint64_t *src = buf;
1076         uint64_t mask;
1077         int cnt = size / sizeof (src[0]);
1078
1079         ASSERT(pqr->p && pqr->q && !pqr->r);
1080
1081         for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
1082                 *pqr->p ^= *src;
1083                 VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
1084                 *pqr->q ^= *src;
1085         }
1086
1087         return (0);
1088 }
1089
1090 static int
1091 vdev_raidz_pqr_func(void *buf, size_t size, void *private)
1092 {
1093         struct pqr_struct *pqr = private;
1094         const uint64_t *src = buf;
1095         uint64_t mask;
1096         int cnt = size / sizeof (src[0]);
1097
1098         ASSERT(pqr->p && pqr->q && pqr->r);
1099
1100         for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
1101                 *pqr->p ^= *src;
1102                 VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
1103                 *pqr->q ^= *src;
1104                 VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
1105                 *pqr->r ^= *src;
1106         }
1107
1108         return (0);
1109 }
1110
1111 static void
1112 vdev_raidz_generate_parity_p(raidz_row_t *rr)
1113 {
1114         uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1115
1116         for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1117                 abd_t *src = rr->rr_col[c].rc_abd;
1118
1119                 if (c == rr->rr_firstdatacol) {
1120                         abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
1121                 } else {
1122                         struct pqr_struct pqr = { p, NULL, NULL };
1123                         (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
1124                             vdev_raidz_p_func, &pqr);
1125                 }
1126         }
1127 }
1128
1129 static void
1130 vdev_raidz_generate_parity_pq(raidz_row_t *rr)
1131 {
1132         uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1133         uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
1134         uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
1135         ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
1136             rr->rr_col[VDEV_RAIDZ_Q].rc_size);
1137
1138         for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1139                 abd_t *src = rr->rr_col[c].rc_abd;
1140
1141                 uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);
1142
1143                 if (c == rr->rr_firstdatacol) {
1144                         ASSERT(ccnt == pcnt || ccnt == 0);
1145                         abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
1146                         (void) memcpy(q, p, rr->rr_col[c].rc_size);
1147
1148                         for (uint64_t i = ccnt; i < pcnt; i++) {
1149                                 p[i] = 0;
1150                                 q[i] = 0;
1151                         }
1152                 } else {
1153                         struct pqr_struct pqr = { p, q, NULL };
1154
1155                         ASSERT(ccnt <= pcnt);
1156                         (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
1157                             vdev_raidz_pq_func, &pqr);
1158
1159                         /*
1160                          * Treat short columns as though they are full of 0s.
1161                          * Note that there's therefore nothing needed for P.
1162                          */
1163                         uint64_t mask;
1164                         for (uint64_t i = ccnt; i < pcnt; i++) {
1165                                 VDEV_RAIDZ_64MUL_2(q[i], mask);
1166                         }
1167                 }
1168         }
1169 }
1170
1171 static void
1172 vdev_raidz_generate_parity_pqr(raidz_row_t *rr)
1173 {
1174         uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1175         uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
1176         uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd);
1177         uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
1178         ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
1179             rr->rr_col[VDEV_RAIDZ_Q].rc_size);
1180         ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
1181             rr->rr_col[VDEV_RAIDZ_R].rc_size);
1182
1183         for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1184                 abd_t *src = rr->rr_col[c].rc_abd;
1185
1186                 uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);
1187
1188                 if (c == rr->rr_firstdatacol) {
1189                         ASSERT(ccnt == pcnt || ccnt == 0);
1190                         abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
1191                         (void) memcpy(q, p, rr->rr_col[c].rc_size);
1192                         (void) memcpy(r, p, rr->rr_col[c].rc_size);
1193
1194                         for (uint64_t i = ccnt; i < pcnt; i++) {
1195                                 p[i] = 0;
1196                                 q[i] = 0;
1197                                 r[i] = 0;
1198                         }
1199                 } else {
1200                         struct pqr_struct pqr = { p, q, r };
1201
1202                         ASSERT(ccnt <= pcnt);
1203                         (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
1204                             vdev_raidz_pqr_func, &pqr);
1205
1206                         /*
1207                          * Treat short columns as though they are full of 0s.
1208                          * Note that there's therefore nothing needed for P.
1209                          */
1210                         uint64_t mask;
1211                         for (uint64_t i = ccnt; i < pcnt; i++) {
1212                                 VDEV_RAIDZ_64MUL_2(q[i], mask);
1213                                 VDEV_RAIDZ_64MUL_4(r[i], mask);
1214                         }
1215                 }
1216         }
1217 }
1218
1219 /*
1220  * Generate RAID parity in the first virtual columns according to the number of
1221  * parity columns available.
1222  */
1223 void
1224 vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr)
1225 {
1226         if (rr->rr_cols == 0) {
1227                 /*
1228                  * We are handling this block one row at a time (because
1229                  * this block has a different logical vs physical width,
1230                  * due to RAIDZ expansion), and this is a pad-only row,
1231                  * which has no parity.
1232                  */
1233                 return;
1234         }
1235
1236         /* Generate using the new math implementation */
1237         if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL)
1238                 return;
1239
1240         switch (rr->rr_firstdatacol) {
1241         case 1:
1242                 vdev_raidz_generate_parity_p(rr);
1243                 break;
1244         case 2:
1245                 vdev_raidz_generate_parity_pq(rr);
1246                 break;
1247         case 3:
1248                 vdev_raidz_generate_parity_pqr(rr);
1249                 break;
1250         default:
1251                 cmn_err(CE_PANIC, "invalid RAID-Z configuration");
1252         }
1253 }
1254
1255 void
1256 vdev_raidz_generate_parity(raidz_map_t *rm)
1257 {
1258         for (int i = 0; i < rm->rm_nrows; i++) {
1259                 raidz_row_t *rr = rm->rm_row[i];
1260                 vdev_raidz_generate_parity_row(rm, rr);
1261         }
1262 }
1263
1264 static int
1265 vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
1266 {
1267         (void) private;
1268         uint64_t *dst = dbuf;
1269         uint64_t *src = sbuf;
1270         int cnt = size / sizeof (src[0]);
1271
1272         for (int i = 0; i < cnt; i++) {
1273                 dst[i] ^= src[i];
1274         }
1275
1276         return (0);
1277 }
1278
1279 static int
1280 vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
1281     void *private)
1282 {
1283         (void) private;
1284         uint64_t *dst = dbuf;
1285         uint64_t *src = sbuf;
1286         uint64_t mask;
1287         int cnt = size / sizeof (dst[0]);
1288
1289         for (int i = 0; i < cnt; i++, dst++, src++) {
1290                 VDEV_RAIDZ_64MUL_2(*dst, mask);
1291                 *dst ^= *src;
1292         }
1293
1294         return (0);
1295 }
1296
1297 static int
1298 vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
1299 {
1300         (void) private;
1301         uint64_t *dst = buf;
1302         uint64_t mask;
1303         int cnt = size / sizeof (dst[0]);
1304
1305         for (int i = 0; i < cnt; i++, dst++) {
1306                 /* same operation as vdev_raidz_reconst_q_pre_func() on dst */
1307                 VDEV_RAIDZ_64MUL_2(*dst, mask);
1308         }
1309
1310         return (0);
1311 }
1312
1313 struct reconst_q_struct {
1314         uint64_t *q;
1315         int exp;
1316 };
1317
1318 static int
1319 vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
1320 {
1321         struct reconst_q_struct *rq = private;
1322         uint64_t *dst = buf;
1323         int cnt = size / sizeof (dst[0]);
1324
1325         for (int i = 0; i < cnt; i++, dst++, rq->q++) {
1326                 int j;
1327                 uint8_t *b;
1328
1329                 *dst ^= *rq->q;
1330                 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
1331                         *b = vdev_raidz_exp2(*b, rq->exp);
1332                 }
1333         }
1334
1335         return (0);
1336 }
1337
1338 struct reconst_pq_struct {
1339         uint8_t *p;
1340         uint8_t *q;
1341         uint8_t *pxy;
1342         uint8_t *qxy;
1343         int aexp;
1344         int bexp;
1345 };
1346
1347 static int
1348 vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
1349 {
1350         struct reconst_pq_struct *rpq = private;
1351         uint8_t *xd = xbuf;
1352         uint8_t *yd = ybuf;
1353
1354         for (int i = 0; i < size;
1355             i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
1356                 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
1357                     vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
1358                 *yd = *rpq->p ^ *rpq->pxy ^ *xd;
1359         }
1360
1361         return (0);
1362 }
1363
1364 static int
1365 vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
1366 {
1367         struct reconst_pq_struct *rpq = private;
1368         uint8_t *xd = xbuf;
1369
1370         for (int i = 0; i < size;
1371             i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
1372                 /* same operation as vdev_raidz_reconst_pq_func() on xd */
1373                 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
1374                     vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
1375         }
1376
1377         return (0);
1378 }
1379
1380 static void
1381 vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts)
1382 {
1383         int x = tgts[0];
1384         abd_t *dst, *src;
1385
1386         if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1387                 zfs_dbgmsg("reconstruct_p(rm=%px x=%u)", rr, x);
1388
1389         ASSERT3U(ntgts, ==, 1);
1390         ASSERT3U(x, >=, rr->rr_firstdatacol);
1391         ASSERT3U(x, <, rr->rr_cols);
1392
1393         ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size);
1394
1395         src = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
1396         dst = rr->rr_col[x].rc_abd;
1397
1398         abd_copy_from_buf(dst, abd_to_buf(src), rr->rr_col[x].rc_size);
1399
1400         for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1401                 uint64_t size = MIN(rr->rr_col[x].rc_size,
1402                     rr->rr_col[c].rc_size);
1403
1404                 src = rr->rr_col[c].rc_abd;
1405
1406                 if (c == x)
1407                         continue;
1408
1409                 (void) abd_iterate_func2(dst, src, 0, 0, size,
1410                     vdev_raidz_reconst_p_func, NULL);
1411         }
1412 }
1413
1414 static void
1415 vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts)
1416 {
1417         int x = tgts[0];
1418         int c, exp;
1419         abd_t *dst, *src;
1420
1421         if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1422                 zfs_dbgmsg("reconstruct_q(rm=%px x=%u)", rr, x);
1423
1424         ASSERT(ntgts == 1);
1425
1426         ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size);
1427
1428         for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1429                 uint64_t size = (c == x) ? 0 : MIN(rr->rr_col[x].rc_size,
1430                     rr->rr_col[c].rc_size);
1431
1432                 src = rr->rr_col[c].rc_abd;
1433                 dst = rr->rr_col[x].rc_abd;
1434
1435                 if (c == rr->rr_firstdatacol) {
1436                         abd_copy(dst, src, size);
1437                         if (rr->rr_col[x].rc_size > size) {
1438                                 abd_zero_off(dst, size,
1439                                     rr->rr_col[x].rc_size - size);
1440                         }
1441                 } else {
1442                         ASSERT3U(size, <=, rr->rr_col[x].rc_size);
1443                         (void) abd_iterate_func2(dst, src, 0, 0, size,
1444                             vdev_raidz_reconst_q_pre_func, NULL);
1445                         (void) abd_iterate_func(dst,
1446                             size, rr->rr_col[x].rc_size - size,
1447                             vdev_raidz_reconst_q_pre_tail_func, NULL);
1448                 }
1449         }
1450
1451         src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
1452         dst = rr->rr_col[x].rc_abd;
1453         exp = 255 - (rr->rr_cols - 1 - x);
1454
1455         struct reconst_q_struct rq = { abd_to_buf(src), exp };
1456         (void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size,
1457             vdev_raidz_reconst_q_post_func, &rq);
1458 }
1459
1460 static void
1461 vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts)
1462 {
1463         uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
1464         abd_t *pdata, *qdata;
1465         uint64_t xsize, ysize;
1466         int x = tgts[0];
1467         int y = tgts[1];
1468         abd_t *xd, *yd;
1469
1470         if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1471                 zfs_dbgmsg("reconstruct_pq(rm=%px x=%u y=%u)", rr, x, y);
1472
1473         ASSERT(ntgts == 2);
1474         ASSERT(x < y);
1475         ASSERT(x >= rr->rr_firstdatacol);
1476         ASSERT(y < rr->rr_cols);
1477
1478         ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size);
1479
1480         /*
1481          * Move the parity data aside -- we're going to compute parity as
1482          * though columns x and y were full of zeros -- Pxy and Qxy. We want to
1483          * reuse the parity generation mechanism without trashing the actual
1484          * parity so we make those columns appear to be full of zeros by
1485          * setting their lengths to zero.
1486          */
1487         pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
1488         qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
1489         xsize = rr->rr_col[x].rc_size;
1490         ysize = rr->rr_col[y].rc_size;
1491
1492         rr->rr_col[VDEV_RAIDZ_P].rc_abd =
1493             abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
1494         rr->rr_col[VDEV_RAIDZ_Q].rc_abd =
1495             abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
1496         rr->rr_col[x].rc_size = 0;
1497         rr->rr_col[y].rc_size = 0;
1498
1499         vdev_raidz_generate_parity_pq(rr);
1500
1501         rr->rr_col[x].rc_size = xsize;
1502         rr->rr_col[y].rc_size = ysize;
1503
1504         p = abd_to_buf(pdata);
1505         q = abd_to_buf(qdata);
1506         pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1507         qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
1508         xd = rr->rr_col[x].rc_abd;
1509         yd = rr->rr_col[y].rc_abd;
1510
1511         /*
1512          * We now have:
1513          *      Pxy = P + D_x + D_y
1514          *      Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
1515          *
1516          * We can then solve for D_x:
1517          *      D_x = A * (P + Pxy) + B * (Q + Qxy)
1518          * where
1519          *      A = 2^(x - y) * (2^(x - y) + 1)^-1
1520          *      B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
1521          *
1522          * With D_x in hand, we can easily solve for D_y:
1523          *      D_y = P + Pxy + D_x
1524          */
1525
1526         a = vdev_raidz_pow2[255 + x - y];
1527         b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)];
1528         tmp = 255 - vdev_raidz_log2[a ^ 1];
1529
1530         aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
1531         bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
1532
1533         ASSERT3U(xsize, >=, ysize);
1534         struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
1535
1536         (void) abd_iterate_func2(xd, yd, 0, 0, ysize,
1537             vdev_raidz_reconst_pq_func, &rpq);
1538         (void) abd_iterate_func(xd, ysize, xsize - ysize,
1539             vdev_raidz_reconst_pq_tail_func, &rpq);
1540
1541         abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1542         abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
1543
1544         /*
1545          * Restore the saved parity data.
1546          */
1547         rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata;
1548         rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata;
1549 }
1550
1551 /*
1552  * In the general case of reconstruction, we must solve the system of linear
1553  * equations defined by the coefficients used to generate parity as well as
1554  * the contents of the data and parity disks. This can be expressed with
1555  * vectors for the original data (D) and the actual data (d) and parity (p)
1556  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
1557  *
1558  *            __   __                     __     __
1559  *            |     |         __     __   |  p_0  |
1560  *            |  V  |         |  D_0  |   | p_m-1 |
1561  *            |     |    x    |   :   | = |  d_0  |
1562  *            |  I  |         | D_n-1 |   |   :   |
1563  *            |     |         ~~     ~~   | d_n-1 |
1564  *            ~~   ~~                     ~~     ~~
1565  *
1566  * I is simply a square identity matrix of size n, and V is a vandermonde
1567  * matrix defined by the coefficients we chose for the various parity columns
1568  * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
1569  * computation as well as linear separability.
1570  *
1571  *      __               __               __     __
1572  *      |   1   ..  1 1 1 |               |  p_0  |
1573  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
1574  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
1575  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
1576  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
1577  *      |   :       : : : |   |   :   |   |  d_2  |
1578  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
1579  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
1580  *      |   0   ..  0 0 1 |               | d_n-1 |
1581  *      ~~               ~~               ~~     ~~
1582  *
1583  * Note that I, V, d, and p are known. To compute D, we must invert the
1584  * matrix and use the known data and parity values to reconstruct the unknown
1585  * data values. We begin by removing the rows in V|I and d|p that correspond
1586  * to failed or missing columns; we then make V|I square (n x n) and d|p
1587  * sized n by removing rows corresponding to unused parity from the bottom up
1588  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
1589  * using Gauss-Jordan elimination. In the example below we use m=3 parity
1590  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
1591  *           __                               __
1592  *           |  1   1   1   1   1   1   1   1  |
1593  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
1594  *           |  19 205 116  29  64  16  4   1  |      / /
1595  *           |  1   0   0   0   0   0   0   0  |     / /
1596  *           |  0   1   0   0   0   0   0   0  | <--' /
1597  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
1598  *           |  0   0   0   1   0   0   0   0  |
1599  *           |  0   0   0   0   1   0   0   0  |
1600  *           |  0   0   0   0   0   1   0   0  |
1601  *           |  0   0   0   0   0   0   1   0  |
1602  *           |  0   0   0   0   0   0   0   1  |
1603  *           ~~                               ~~
1604  *           __                               __
1605  *           |  1   1   1   1   1   1   1   1  |
1606  *           | 128  64  32  16  8   4   2   1  |
1607  *           |  19 205 116  29  64  16  4   1  |
1608  *           |  1   0   0   0   0   0   0   0  |
1609  *           |  0   1   0   0   0   0   0   0  |
1610  *  (V|I)' = |  0   0   1   0   0   0   0   0  |
1611  *           |  0   0   0   1   0   0   0   0  |
1612  *           |  0   0   0   0   1   0   0   0  |
1613  *           |  0   0   0   0   0   1   0   0  |
1614  *           |  0   0   0   0   0   0   1   0  |
1615  *           |  0   0   0   0   0   0   0   1  |
1616  *           ~~                               ~~
1617  *
1618  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1619  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1620  * matrix is not singular.
1621  * __                                                                 __
1622  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1623  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1624  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1625  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1626  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1627  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1628  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1629  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1630  * ~~                                                                 ~~
1631  * __                                                                 __
1632  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1633  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1634  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1635  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1636  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1637  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1638  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1639  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1640  * ~~                                                                 ~~
1641  * __                                                                 __
1642  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1643  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1644  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
1645  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1646  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1647  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1648  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1649  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1650  * ~~                                                                 ~~
1651  * __                                                                 __
1652  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1653  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1654  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
1655  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1656  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1657  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1658  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1659  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1660  * ~~                                                                 ~~
1661  * __                                                                 __
1662  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1663  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1664  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1665  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1666  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1667  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1668  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1669  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1670  * ~~                                                                 ~~
1671  * __                                                                 __
1672  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1673  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
1674  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1675  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1676  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1677  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1678  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1679  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1680  * ~~                                                                 ~~
1681  *                   __                               __
1682  *                   |  0   0   1   0   0   0   0   0  |
1683  *                   | 167 100  5   41 159 169 217 208 |
1684  *                   | 166 100  4   40 158 168 216 209 |
1685  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
1686  *                   |  0   0   0   0   1   0   0   0  |
1687  *                   |  0   0   0   0   0   1   0   0  |
1688  *                   |  0   0   0   0   0   0   1   0  |
1689  *                   |  0   0   0   0   0   0   0   1  |
1690  *                   ~~                               ~~
1691  *
1692  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1693  * of the missing data.
1694  *
1695  * As is apparent from the example above, the only non-trivial rows in the
1696  * inverse matrix correspond to the data disks that we're trying to
1697  * reconstruct. Indeed, those are the only rows we need as the others would
1698  * only be useful for reconstructing data known or assumed to be valid. For
1699  * that reason, we only build the coefficients in the rows that correspond to
1700  * targeted columns.
1701  */
1702
1703 static void
1704 vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map,
1705     uint8_t **rows)
1706 {
1707         int i, j;
1708         int pow;
1709
1710         ASSERT(n == rr->rr_cols - rr->rr_firstdatacol);
1711
1712         /*
1713          * Fill in the missing rows of interest.
1714          */
1715         for (i = 0; i < nmap; i++) {
1716                 ASSERT3S(0, <=, map[i]);
1717                 ASSERT3S(map[i], <=, 2);
1718
1719                 pow = map[i] * n;
1720                 if (pow > 255)
1721                         pow -= 255;
1722                 ASSERT(pow <= 255);
1723
1724                 for (j = 0; j < n; j++) {
1725                         pow -= map[i];
1726                         if (pow < 0)
1727                                 pow += 255;
1728                         rows[i][j] = vdev_raidz_pow2[pow];
1729                 }
1730         }
1731 }
1732
1733 static void
1734 vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing,
1735     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1736 {
1737         int i, j, ii, jj;
1738         uint8_t log;
1739
1740         /*
1741          * Assert that the first nmissing entries from the array of used
1742          * columns correspond to parity columns and that subsequent entries
1743          * correspond to data columns.
1744          */
1745         for (i = 0; i < nmissing; i++) {
1746                 ASSERT3S(used[i], <, rr->rr_firstdatacol);
1747         }
1748         for (; i < n; i++) {
1749                 ASSERT3S(used[i], >=, rr->rr_firstdatacol);
1750         }
1751
1752         /*
1753          * First initialize the storage where we'll compute the inverse rows.
1754          */
1755         for (i = 0; i < nmissing; i++) {
1756                 for (j = 0; j < n; j++) {
1757                         invrows[i][j] = (i == j) ? 1 : 0;
1758                 }
1759         }
1760
1761         /*
1762          * Subtract all trivial rows from the rows of consequence.
1763          */
1764         for (i = 0; i < nmissing; i++) {
1765                 for (j = nmissing; j < n; j++) {
1766                         ASSERT3U(used[j], >=, rr->rr_firstdatacol);
1767                         jj = used[j] - rr->rr_firstdatacol;
1768                         ASSERT3S(jj, <, n);
1769                         invrows[i][j] = rows[i][jj];
1770                         rows[i][jj] = 0;
1771                 }
1772         }
1773
1774         /*
1775          * For each of the rows of interest, we must normalize it and subtract
1776          * a multiple of it from the other rows.
1777          */
1778         for (i = 0; i < nmissing; i++) {
1779                 for (j = 0; j < missing[i]; j++) {
1780                         ASSERT0(rows[i][j]);
1781                 }
1782                 ASSERT3U(rows[i][missing[i]], !=, 0);
1783
1784                 /*
1785                  * Compute the inverse of the first element and multiply each
1786                  * element in the row by that value.
1787                  */
1788                 log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1789
1790                 for (j = 0; j < n; j++) {
1791                         rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1792                         invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1793                 }
1794
1795                 for (ii = 0; ii < nmissing; ii++) {
1796                         if (i == ii)
1797                                 continue;
1798
1799                         ASSERT3U(rows[ii][missing[i]], !=, 0);
1800
1801                         log = vdev_raidz_log2[rows[ii][missing[i]]];
1802
1803                         for (j = 0; j < n; j++) {
1804                                 rows[ii][j] ^=
1805                                     vdev_raidz_exp2(rows[i][j], log);
1806                                 invrows[ii][j] ^=
1807                                     vdev_raidz_exp2(invrows[i][j], log);
1808                         }
1809                 }
1810         }
1811
1812         /*
1813          * Verify that the data that is left in the rows are properly part of
1814          * an identity matrix.
1815          */
1816         for (i = 0; i < nmissing; i++) {
1817                 for (j = 0; j < n; j++) {
1818                         if (j == missing[i]) {
1819                                 ASSERT3U(rows[i][j], ==, 1);
1820                         } else {
1821                                 ASSERT0(rows[i][j]);
1822                         }
1823                 }
1824         }
1825 }
1826
1827 static void
1828 vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing,
1829     int *missing, uint8_t **invrows, const uint8_t *used)
1830 {
1831         int i, j, x, cc, c;
1832         uint8_t *src;
1833         uint64_t ccount;
1834         uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL };
1835         uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 };
1836         uint8_t log = 0;
1837         uint8_t val;
1838         int ll;
1839         uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1840         uint8_t *p, *pp;
1841         size_t psize;
1842
1843         psize = sizeof (invlog[0][0]) * n * nmissing;
1844         p = kmem_alloc(psize, KM_SLEEP);
1845
1846         for (pp = p, i = 0; i < nmissing; i++) {
1847                 invlog[i] = pp;
1848                 pp += n;
1849         }
1850
1851         for (i = 0; i < nmissing; i++) {
1852                 for (j = 0; j < n; j++) {
1853                         ASSERT3U(invrows[i][j], !=, 0);
1854                         invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1855                 }
1856         }
1857
1858         for (i = 0; i < n; i++) {
1859                 c = used[i];
1860                 ASSERT3U(c, <, rr->rr_cols);
1861
1862                 ccount = rr->rr_col[c].rc_size;
1863                 ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0);
1864                 if (ccount == 0)
1865                         continue;
1866                 src = abd_to_buf(rr->rr_col[c].rc_abd);
1867                 for (j = 0; j < nmissing; j++) {
1868                         cc = missing[j] + rr->rr_firstdatacol;
1869                         ASSERT3U(cc, >=, rr->rr_firstdatacol);
1870                         ASSERT3U(cc, <, rr->rr_cols);
1871                         ASSERT3U(cc, !=, c);
1872
1873                         dcount[j] = rr->rr_col[cc].rc_size;
1874                         if (dcount[j] != 0)
1875                                 dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd);
1876                 }
1877
1878                 for (x = 0; x < ccount; x++, src++) {
1879                         if (*src != 0)
1880                                 log = vdev_raidz_log2[*src];
1881
1882                         for (cc = 0; cc < nmissing; cc++) {
1883                                 if (x >= dcount[cc])
1884                                         continue;
1885
1886                                 if (*src == 0) {
1887                                         val = 0;
1888                                 } else {
1889                                         if ((ll = log + invlog[cc][i]) >= 255)
1890                                                 ll -= 255;
1891                                         val = vdev_raidz_pow2[ll];
1892                                 }
1893
1894                                 if (i == 0)
1895                                         dst[cc][x] = val;
1896                                 else
1897                                         dst[cc][x] ^= val;
1898                         }
1899                 }
1900         }
1901
1902         kmem_free(p, psize);
1903 }
1904
1905 static void
1906 vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts)
1907 {
1908         int i, c, t, tt;
1909         unsigned int n;
1910         unsigned int nmissing_rows;
1911         int missing_rows[VDEV_RAIDZ_MAXPARITY];
1912         int parity_map[VDEV_RAIDZ_MAXPARITY];
1913         uint8_t *p, *pp;
1914         size_t psize;
1915         uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1916         uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1917         uint8_t *used;
1918
1919         abd_t **bufs = NULL;
1920
1921         if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1922                 zfs_dbgmsg("reconstruct_general(rm=%px ntgts=%u)", rr, ntgts);
1923         /*
1924          * Matrix reconstruction can't use scatter ABDs yet, so we allocate
1925          * temporary linear ABDs if any non-linear ABDs are found.
1926          */
1927         for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) {
1928                 ASSERT(rr->rr_col[i].rc_abd != NULL);
1929                 if (!abd_is_linear(rr->rr_col[i].rc_abd)) {
1930                         bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *),
1931                             KM_PUSHPAGE);
1932
1933                         for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1934                                 raidz_col_t *col = &rr->rr_col[c];
1935
1936                                 bufs[c] = col->rc_abd;
1937                                 if (bufs[c] != NULL) {
1938                                         col->rc_abd = abd_alloc_linear(
1939                                             col->rc_size, B_TRUE);
1940                                         abd_copy(col->rc_abd, bufs[c],
1941                                             col->rc_size);
1942                                 }
1943                         }
1944
1945                         break;
1946                 }
1947         }
1948
1949         n = rr->rr_cols - rr->rr_firstdatacol;
1950
1951         /*
1952          * Figure out which data columns are missing.
1953          */
1954         nmissing_rows = 0;
1955         for (t = 0; t < ntgts; t++) {
1956                 if (tgts[t] >= rr->rr_firstdatacol) {
1957                         missing_rows[nmissing_rows++] =
1958                             tgts[t] - rr->rr_firstdatacol;
1959                 }
1960         }
1961
1962         /*
1963          * Figure out which parity columns to use to help generate the missing
1964          * data columns.
1965          */
1966         for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1967                 ASSERT(tt < ntgts);
1968                 ASSERT(c < rr->rr_firstdatacol);
1969
1970                 /*
1971                  * Skip any targeted parity columns.
1972                  */
1973                 if (c == tgts[tt]) {
1974                         tt++;
1975                         continue;
1976                 }
1977
1978                 parity_map[i] = c;
1979                 i++;
1980         }
1981
1982         psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1983             nmissing_rows * n + sizeof (used[0]) * n;
1984         p = kmem_alloc(psize, KM_SLEEP);
1985
1986         for (pp = p, i = 0; i < nmissing_rows; i++) {
1987                 rows[i] = pp;
1988                 pp += n;
1989                 invrows[i] = pp;
1990                 pp += n;
1991         }
1992         used = pp;
1993
1994         for (i = 0; i < nmissing_rows; i++) {
1995                 used[i] = parity_map[i];
1996         }
1997
1998         for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1999                 if (tt < nmissing_rows &&
2000                     c == missing_rows[tt] + rr->rr_firstdatacol) {
2001                         tt++;
2002                         continue;
2003                 }
2004
2005                 ASSERT3S(i, <, n);
2006                 used[i] = c;
2007                 i++;
2008         }
2009
2010         /*
2011          * Initialize the interesting rows of the matrix.
2012          */
2013         vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows);
2014
2015         /*
2016          * Invert the matrix.
2017          */
2018         vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows,
2019             invrows, used);
2020
2021         /*
2022          * Reconstruct the missing data using the generated matrix.
2023          */
2024         vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows,
2025             invrows, used);
2026
2027         kmem_free(p, psize);
2028
2029         /*
2030          * copy back from temporary linear abds and free them
2031          */
2032         if (bufs) {
2033                 for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
2034                         raidz_col_t *col = &rr->rr_col[c];
2035
2036                         if (bufs[c] != NULL) {
2037                                 abd_copy(bufs[c], col->rc_abd, col->rc_size);
2038                                 abd_free(col->rc_abd);
2039                         }
2040                         col->rc_abd = bufs[c];
2041                 }
2042                 kmem_free(bufs, rr->rr_cols * sizeof (abd_t *));
2043         }
2044 }
2045
2046 static void
2047 vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr,
2048     const int *t, int nt)
2049 {
2050         int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
2051         int ntgts;
2052         int i, c, ret;
2053         int nbadparity, nbaddata;
2054         int parity_valid[VDEV_RAIDZ_MAXPARITY];
2055
2056         if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
2057                 zfs_dbgmsg("reconstruct(rm=%px nt=%u cols=%u md=%u mp=%u)",
2058                     rr, nt, (int)rr->rr_cols, (int)rr->rr_missingdata,
2059                     (int)rr->rr_missingparity);
2060         }
2061
2062         nbadparity = rr->rr_firstdatacol;
2063         nbaddata = rr->rr_cols - nbadparity;
2064         ntgts = 0;
2065         for (i = 0, c = 0; c < rr->rr_cols; c++) {
2066                 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
2067                         zfs_dbgmsg("reconstruct(rm=%px col=%u devid=%u "
2068                             "offset=%llx error=%u)",
2069                             rr, c, (int)rr->rr_col[c].rc_devidx,
2070                             (long long)rr->rr_col[c].rc_offset,
2071                             (int)rr->rr_col[c].rc_error);
2072                 }
2073                 if (c < rr->rr_firstdatacol)
2074                         parity_valid[c] = B_FALSE;
2075
2076                 if (i < nt && c == t[i]) {
2077                         tgts[ntgts++] = c;
2078                         i++;
2079                 } else if (rr->rr_col[c].rc_error != 0) {
2080                         tgts[ntgts++] = c;
2081                 } else if (c >= rr->rr_firstdatacol) {
2082                         nbaddata--;
2083                 } else {
2084                         parity_valid[c] = B_TRUE;
2085                         nbadparity--;
2086                 }
2087         }
2088
2089         ASSERT(ntgts >= nt);
2090         ASSERT(nbaddata >= 0);
2091         ASSERT(nbaddata + nbadparity == ntgts);
2092
2093         dt = &tgts[nbadparity];
2094
2095         /* Reconstruct using the new math implementation */
2096         ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata);
2097         if (ret != RAIDZ_ORIGINAL_IMPL)
2098                 return;
2099
2100         /*
2101          * See if we can use any of our optimized reconstruction routines.
2102          */
2103         switch (nbaddata) {
2104         case 1:
2105                 if (parity_valid[VDEV_RAIDZ_P]) {
2106                         vdev_raidz_reconstruct_p(rr, dt, 1);
2107                         return;
2108                 }
2109
2110                 ASSERT(rr->rr_firstdatacol > 1);
2111
2112                 if (parity_valid[VDEV_RAIDZ_Q]) {
2113                         vdev_raidz_reconstruct_q(rr, dt, 1);
2114                         return;
2115                 }
2116
2117                 ASSERT(rr->rr_firstdatacol > 2);
2118                 break;
2119
2120         case 2:
2121                 ASSERT(rr->rr_firstdatacol > 1);
2122
2123                 if (parity_valid[VDEV_RAIDZ_P] &&
2124                     parity_valid[VDEV_RAIDZ_Q]) {
2125                         vdev_raidz_reconstruct_pq(rr, dt, 2);
2126                         return;
2127                 }
2128
2129                 ASSERT(rr->rr_firstdatacol > 2);
2130
2131                 break;
2132         }
2133
2134         vdev_raidz_reconstruct_general(rr, tgts, ntgts);
2135 }
2136
2137 static int
2138 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
2139     uint64_t *logical_ashift, uint64_t *physical_ashift)
2140 {
2141         vdev_raidz_t *vdrz = vd->vdev_tsd;
2142         uint64_t nparity = vdrz->vd_nparity;
2143         int c;
2144         int lasterror = 0;
2145         int numerrors = 0;
2146
2147         ASSERT(nparity > 0);
2148
2149         if (nparity > VDEV_RAIDZ_MAXPARITY ||
2150             vd->vdev_children < nparity + 1) {
2151                 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
2152                 return (SET_ERROR(EINVAL));
2153         }
2154
2155         vdev_open_children(vd);
2156
2157         for (c = 0; c < vd->vdev_children; c++) {
2158                 vdev_t *cvd = vd->vdev_child[c];
2159
2160                 if (cvd->vdev_open_error != 0) {
2161                         lasterror = cvd->vdev_open_error;
2162                         numerrors++;
2163                         continue;
2164                 }
2165
2166                 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
2167                 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
2168                 *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
2169         }
2170         for (c = 0; c < vd->vdev_children; c++) {
2171                 vdev_t *cvd = vd->vdev_child[c];
2172
2173                 if (cvd->vdev_open_error != 0)
2174                         continue;
2175                 *physical_ashift = vdev_best_ashift(*logical_ashift,
2176                     *physical_ashift, cvd->vdev_physical_ashift);
2177         }
2178
2179         if (vd->vdev_rz_expanding) {
2180                 *asize *= vd->vdev_children - 1;
2181                 *max_asize *= vd->vdev_children - 1;
2182
2183                 vd->vdev_min_asize = *asize;
2184         } else {
2185                 *asize *= vd->vdev_children;
2186                 *max_asize *= vd->vdev_children;
2187         }
2188
2189         if (numerrors > nparity) {
2190                 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
2191                 return (lasterror);
2192         }
2193
2194         return (0);
2195 }
2196
2197 static void
2198 vdev_raidz_close(vdev_t *vd)
2199 {
2200         for (int c = 0; c < vd->vdev_children; c++) {
2201                 if (vd->vdev_child[c] != NULL)
2202                         vdev_close(vd->vdev_child[c]);
2203         }
2204 }
2205
2206 /*
2207  * Return the logical width to use, given the txg in which the allocation
2208  * happened.  Note that BP_GET_BIRTH() is usually the txg in which the
2209  * BP was allocated.  Remapped BP's (that were relocated due to device
2210  * removal, see remap_blkptr_cb()), will have a more recent physical birth
2211  * which reflects when the BP was relocated, but we can ignore these because
2212  * they can't be on RAIDZ (device removal doesn't support RAIDZ).
2213  */
2214 static uint64_t
2215 vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg)
2216 {
2217         reflow_node_t lookup = {
2218                 .re_txg = txg,
2219         };
2220         avl_index_t where;
2221
2222         uint64_t width;
2223         mutex_enter(&vdrz->vd_expand_lock);
2224         reflow_node_t *re = avl_find(&vdrz->vd_expand_txgs, &lookup, &where);
2225         if (re != NULL) {
2226                 width = re->re_logical_width;
2227         } else {
2228                 re = avl_nearest(&vdrz->vd_expand_txgs, where, AVL_BEFORE);
2229                 if (re != NULL)
2230                         width = re->re_logical_width;
2231                 else
2232                         width = vdrz->vd_original_width;
2233         }
2234         mutex_exit(&vdrz->vd_expand_lock);
2235         return (width);
2236 }
2237
2238 /*
2239  * Note: If the RAIDZ vdev has been expanded, older BP's may have allocated
2240  * more space due to the lower data-to-parity ratio.  In this case it's
2241  * important to pass in the correct txg.  Note that vdev_gang_header_asize()
2242  * relies on a constant asize for psize=SPA_GANGBLOCKSIZE=SPA_MINBLOCKSIZE,
2243  * regardless of txg.  This is assured because for a single data sector, we
2244  * allocate P+1 sectors regardless of width ("cols", which is at least P+1).
2245  */
2246 static uint64_t
2247 vdev_raidz_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
2248 {
2249         vdev_raidz_t *vdrz = vd->vdev_tsd;
2250         uint64_t asize;
2251         uint64_t ashift = vd->vdev_top->vdev_ashift;
2252         uint64_t cols = vdrz->vd_original_width;
2253         uint64_t nparity = vdrz->vd_nparity;
2254
2255         cols = vdev_raidz_get_logical_width(vdrz, txg);
2256
2257         asize = ((psize - 1) >> ashift) + 1;
2258         asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
2259         asize = roundup(asize, nparity + 1) << ashift;
2260
2261 #ifdef ZFS_DEBUG
2262         uint64_t asize_new = ((psize - 1) >> ashift) + 1;
2263         uint64_t ncols_new = vdrz->vd_physical_width;
2264         asize_new += nparity * ((asize_new + ncols_new - nparity - 1) /
2265             (ncols_new - nparity));
2266         asize_new = roundup(asize_new, nparity + 1) << ashift;
2267         VERIFY3U(asize_new, <=, asize);
2268 #endif
2269
2270         return (asize);
2271 }
2272
2273 /*
2274  * The allocatable space for a raidz vdev is N * sizeof(smallest child)
2275  * so each child must provide at least 1/Nth of its asize.
2276  */
2277 static uint64_t
2278 vdev_raidz_min_asize(vdev_t *vd)
2279 {
2280         return ((vd->vdev_min_asize + vd->vdev_children - 1) /
2281             vd->vdev_children);
2282 }
2283
2284 void
2285 vdev_raidz_child_done(zio_t *zio)
2286 {
2287         raidz_col_t *rc = zio->io_private;
2288
2289         ASSERT3P(rc->rc_abd, !=, NULL);
2290         rc->rc_error = zio->io_error;
2291         rc->rc_tried = 1;
2292         rc->rc_skipped = 0;
2293 }
2294
2295 static void
2296 vdev_raidz_shadow_child_done(zio_t *zio)
2297 {
2298         raidz_col_t *rc = zio->io_private;
2299
2300         rc->rc_shadow_error = zio->io_error;
2301 }
2302
2303 static void
2304 vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col)
2305 {
2306         (void) rm;
2307 #ifdef ZFS_DEBUG
2308         range_seg64_t logical_rs, physical_rs, remain_rs;
2309         logical_rs.rs_start = rr->rr_offset;
2310         logical_rs.rs_end = logical_rs.rs_start +
2311             vdev_raidz_asize(zio->io_vd, rr->rr_size,
2312             BP_GET_BIRTH(zio->io_bp));
2313
2314         raidz_col_t *rc = &rr->rr_col[col];
2315         vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
2316
2317         vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs);
2318         ASSERT(vdev_xlate_is_empty(&remain_rs));
2319         if (vdev_xlate_is_empty(&physical_rs)) {
2320                 /*
2321                  * If we are in the middle of expansion, the
2322                  * physical->logical mapping is changing so vdev_xlate()
2323                  * can't give us a reliable answer.
2324                  */
2325                 return;
2326         }
2327         ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
2328         ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
2329         /*
2330          * It would be nice to assert that rs_end is equal
2331          * to rc_offset + rc_size but there might be an
2332          * optional I/O at the end that is not accounted in
2333          * rc_size.
2334          */
2335         if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) {
2336                 ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
2337                     rc->rc_size + (1 << zio->io_vd->vdev_top->vdev_ashift));
2338         } else {
2339                 ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
2340         }
2341 #endif
2342 }
2343
2344 static void
2345 vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr)
2346 {
2347         vdev_t *vd = zio->io_vd;
2348         raidz_map_t *rm = zio->io_vsd;
2349
2350         vdev_raidz_generate_parity_row(rm, rr);
2351
2352         for (int c = 0; c < rr->rr_scols; c++) {
2353                 raidz_col_t *rc = &rr->rr_col[c];
2354                 vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2355
2356                 /* Verify physical to logical translation */
2357                 vdev_raidz_io_verify(zio, rm, rr, c);
2358
2359                 if (rc->rc_size == 0)
2360                         continue;
2361
2362                 ASSERT3U(rc->rc_offset + rc->rc_size, <,
2363                     cvd->vdev_psize - VDEV_LABEL_END_SIZE);
2364
2365                 ASSERT3P(rc->rc_abd, !=, NULL);
2366                 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2367                     rc->rc_offset, rc->rc_abd,
2368                     abd_get_size(rc->rc_abd), zio->io_type,
2369                     zio->io_priority, 0, vdev_raidz_child_done, rc));
2370
2371                 if (rc->rc_shadow_devidx != INT_MAX) {
2372                         vdev_t *cvd2 = vd->vdev_child[rc->rc_shadow_devidx];
2373
2374                         ASSERT3U(
2375                             rc->rc_shadow_offset + abd_get_size(rc->rc_abd), <,
2376                             cvd2->vdev_psize - VDEV_LABEL_END_SIZE);
2377
2378                         zio_nowait(zio_vdev_child_io(zio, NULL, cvd2,
2379                             rc->rc_shadow_offset, rc->rc_abd,
2380                             abd_get_size(rc->rc_abd),
2381                             zio->io_type, zio->io_priority, 0,
2382                             vdev_raidz_shadow_child_done, rc));
2383                 }
2384         }
2385 }
2386
2387 /*
2388  * Generate optional I/Os for skip sectors to improve aggregation contiguity.
2389  * This only works for vdev_raidz_map_alloc() (not _expanded()).
2390  */
2391 static void
2392 raidz_start_skip_writes(zio_t *zio)
2393 {
2394         vdev_t *vd = zio->io_vd;
2395         uint64_t ashift = vd->vdev_top->vdev_ashift;
2396         raidz_map_t *rm = zio->io_vsd;
2397         ASSERT3U(rm->rm_nrows, ==, 1);
2398         raidz_row_t *rr = rm->rm_row[0];
2399         for (int c = 0; c < rr->rr_scols; c++) {
2400                 raidz_col_t *rc = &rr->rr_col[c];
2401                 vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2402                 if (rc->rc_size != 0)
2403                         continue;
2404                 ASSERT3P(rc->rc_abd, ==, NULL);
2405
2406                 ASSERT3U(rc->rc_offset, <,
2407                     cvd->vdev_psize - VDEV_LABEL_END_SIZE);
2408
2409                 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset,
2410                     NULL, 1ULL << ashift, zio->io_type, zio->io_priority,
2411                     ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
2412         }
2413 }
2414
2415 static void
2416 vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity)
2417 {
2418         vdev_t *vd = zio->io_vd;
2419
2420         /*
2421          * Iterate over the columns in reverse order so that we hit the parity
2422          * last -- any errors along the way will force us to read the parity.
2423          */
2424         for (int c = rr->rr_cols - 1; c >= 0; c--) {
2425                 raidz_col_t *rc = &rr->rr_col[c];
2426                 if (rc->rc_size == 0)
2427                         continue;
2428                 vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2429                 if (!vdev_readable(cvd)) {
2430                         if (c >= rr->rr_firstdatacol)
2431                                 rr->rr_missingdata++;
2432                         else
2433                                 rr->rr_missingparity++;
2434                         rc->rc_error = SET_ERROR(ENXIO);
2435                         rc->rc_tried = 1;       /* don't even try */
2436                         rc->rc_skipped = 1;
2437                         continue;
2438                 }
2439                 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
2440                         if (c >= rr->rr_firstdatacol)
2441                                 rr->rr_missingdata++;
2442                         else
2443                                 rr->rr_missingparity++;
2444                         rc->rc_error = SET_ERROR(ESTALE);
2445                         rc->rc_skipped = 1;
2446                         continue;
2447                 }
2448                 if (forceparity ||
2449                     c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 ||
2450                     (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
2451                         zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2452                             rc->rc_offset, rc->rc_abd, rc->rc_size,
2453                             zio->io_type, zio->io_priority, 0,
2454                             vdev_raidz_child_done, rc));
2455                 }
2456         }
2457 }
2458
2459 static void
2460 vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm)
2461 {
2462         vdev_t *vd = zio->io_vd;
2463
2464         for (int i = 0; i < rm->rm_nphys_cols; i++) {
2465                 raidz_col_t *prc = &rm->rm_phys_col[i];
2466                 if (prc->rc_size == 0)
2467                         continue;
2468
2469                 ASSERT3U(prc->rc_devidx, ==, i);
2470                 vdev_t *cvd = vd->vdev_child[i];
2471                 if (!vdev_readable(cvd)) {
2472                         prc->rc_error = SET_ERROR(ENXIO);
2473                         prc->rc_tried = 1;      /* don't even try */
2474                         prc->rc_skipped = 1;
2475                         continue;
2476                 }
2477                 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
2478                         prc->rc_error = SET_ERROR(ESTALE);
2479                         prc->rc_skipped = 1;
2480                         continue;
2481                 }
2482                 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2483                     prc->rc_offset, prc->rc_abd, prc->rc_size,
2484                     zio->io_type, zio->io_priority, 0,
2485                     vdev_raidz_child_done, prc));
2486         }
2487 }
2488
2489 static void
2490 vdev_raidz_io_start_read(zio_t *zio, raidz_map_t *rm)
2491 {
2492         /*
2493          * If there are multiple rows, we will be hitting
2494          * all disks, so go ahead and read the parity so
2495          * that we are reading in decent size chunks.
2496          */
2497         boolean_t forceparity = rm->rm_nrows > 1;
2498
2499         if (rm->rm_phys_col) {
2500                 vdev_raidz_io_start_read_phys_cols(zio, rm);
2501         } else {
2502                 for (int i = 0; i < rm->rm_nrows; i++) {
2503                         raidz_row_t *rr = rm->rm_row[i];
2504                         vdev_raidz_io_start_read_row(zio, rr, forceparity);
2505                 }
2506         }
2507 }
2508
2509 /*
2510  * Start an IO operation on a RAIDZ VDev
2511  *
2512  * Outline:
2513  * - For write operations:
2514  *   1. Generate the parity data
2515  *   2. Create child zio write operations to each column's vdev, for both
2516  *      data and parity.
2517  *   3. If the column skips any sectors for padding, create optional dummy
2518  *      write zio children for those areas to improve aggregation continuity.
2519  * - For read operations:
2520  *   1. Create child zio read operations to each data column's vdev to read
2521  *      the range of data required for zio.
2522  *   2. If this is a scrub or resilver operation, or if any of the data
2523  *      vdevs have had errors, then create zio read operations to the parity
2524  *      columns' VDevs as well.
2525  */
2526 static void
2527 vdev_raidz_io_start(zio_t *zio)
2528 {
2529         vdev_t *vd = zio->io_vd;
2530         vdev_t *tvd = vd->vdev_top;
2531         vdev_raidz_t *vdrz = vd->vdev_tsd;
2532         raidz_map_t *rm;
2533
2534         uint64_t logical_width = vdev_raidz_get_logical_width(vdrz,
2535             BP_GET_BIRTH(zio->io_bp));
2536         if (logical_width != vdrz->vd_physical_width) {
2537                 zfs_locked_range_t *lr = NULL;
2538                 uint64_t synced_offset = UINT64_MAX;
2539                 uint64_t next_offset = UINT64_MAX;
2540                 boolean_t use_scratch = B_FALSE;
2541                 /*
2542                  * Note: when the expansion is completing, we set
2543                  * vre_state=DSS_FINISHED (in raidz_reflow_complete_sync())
2544                  * in a later txg than when we last update spa_ubsync's state
2545                  * (see the end of spa_raidz_expand_thread()).  Therefore we
2546                  * may see vre_state!=SCANNING before
2547                  * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE=DSS_FINISHED is reflected
2548                  * on disk, but the copying progress has been synced to disk
2549                  * (and reflected in spa_ubsync).  In this case it's fine to
2550                  * treat the expansion as completed, since if we crash there's
2551                  * no additional copying to do.
2552                  */
2553                 if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
2554                         ASSERT3P(vd->vdev_spa->spa_raidz_expand, ==,
2555                             &vdrz->vn_vre);
2556                         lr = zfs_rangelock_enter(&vdrz->vn_vre.vre_rangelock,
2557                             zio->io_offset, zio->io_size, RL_READER);
2558                         use_scratch =
2559                             (RRSS_GET_STATE(&vd->vdev_spa->spa_ubsync) ==
2560                             RRSS_SCRATCH_VALID);
2561                         synced_offset =
2562                             RRSS_GET_OFFSET(&vd->vdev_spa->spa_ubsync);
2563                         next_offset = vdrz->vn_vre.vre_offset;
2564                         /*
2565                          * If we haven't resumed expanding since importing the
2566                          * pool, vre_offset won't have been set yet.  In
2567                          * this case the next offset to be copied is the same
2568                          * as what was synced.
2569                          */
2570                         if (next_offset == UINT64_MAX) {
2571                                 next_offset = synced_offset;
2572                         }
2573                 }
2574                 if (use_scratch) {
2575                         zfs_dbgmsg("zio=%px %s io_offset=%llu offset_synced="
2576                             "%lld next_offset=%lld use_scratch=%u",
2577                             zio,
2578                             zio->io_type == ZIO_TYPE_WRITE ? "WRITE" : "READ",
2579                             (long long)zio->io_offset,
2580                             (long long)synced_offset,
2581                             (long long)next_offset,
2582                             use_scratch);
2583                 }
2584
2585                 rm = vdev_raidz_map_alloc_expanded(zio,
2586                     tvd->vdev_ashift, vdrz->vd_physical_width,
2587                     logical_width, vdrz->vd_nparity,
2588                     synced_offset, next_offset, use_scratch);
2589                 rm->rm_lr = lr;
2590         } else {
2591                 rm = vdev_raidz_map_alloc(zio,
2592                     tvd->vdev_ashift, logical_width, vdrz->vd_nparity);
2593         }
2594         rm->rm_original_width = vdrz->vd_original_width;
2595
2596         zio->io_vsd = rm;
2597         zio->io_vsd_ops = &vdev_raidz_vsd_ops;
2598         if (zio->io_type == ZIO_TYPE_WRITE) {
2599                 for (int i = 0; i < rm->rm_nrows; i++) {
2600                         vdev_raidz_io_start_write(zio, rm->rm_row[i]);
2601                 }
2602
2603                 if (logical_width == vdrz->vd_physical_width) {
2604                         raidz_start_skip_writes(zio);
2605                 }
2606         } else {
2607                 ASSERT(zio->io_type == ZIO_TYPE_READ);
2608                 vdev_raidz_io_start_read(zio, rm);
2609         }
2610
2611         zio_execute(zio);
2612 }
2613
2614 /*
2615  * Report a checksum error for a child of a RAID-Z device.
2616  */
2617 void
2618 vdev_raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
2619 {
2620         vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
2621
2622         if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE) &&
2623             zio->io_priority != ZIO_PRIORITY_REBUILD) {
2624                 zio_bad_cksum_t zbc;
2625                 raidz_map_t *rm = zio->io_vsd;
2626
2627                 zbc.zbc_has_cksum = 0;
2628                 zbc.zbc_injected = rm->rm_ecksuminjected;
2629
2630                 mutex_enter(&vd->vdev_stat_lock);
2631                 vd->vdev_stat.vs_checksum_errors++;
2632                 mutex_exit(&vd->vdev_stat_lock);
2633                 (void) zfs_ereport_post_checksum(zio->io_spa, vd,
2634                     &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size,
2635                     rc->rc_abd, bad_data, &zbc);
2636         }
2637 }
2638
2639 /*
2640  * We keep track of whether or not there were any injected errors, so that
2641  * any ereports we generate can note it.
2642  */
2643 static int
2644 raidz_checksum_verify(zio_t *zio)
2645 {
2646         zio_bad_cksum_t zbc = {0};
2647         raidz_map_t *rm = zio->io_vsd;
2648
2649         int ret = zio_checksum_error(zio, &zbc);
2650         /*
2651          * Any Direct I/O read that has a checksum error must be treated as
2652          * suspicious as the contents of the buffer could be getting
2653          * manipulated while the I/O is taking place. The checksum verify error
2654          * will be reported to the top-level RAIDZ VDEV.
2655          */
2656         if (zio->io_flags & ZIO_FLAG_DIO_READ && ret == ECKSUM) {
2657                 zio->io_error = ret;
2658                 zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;
2659                 zio_dio_chksum_verify_error_report(zio);
2660                 zio_checksum_verified(zio);
2661                 return (0);
2662         }
2663
2664         if (ret != 0 && zbc.zbc_injected != 0)
2665                 rm->rm_ecksuminjected = 1;
2666
2667         return (ret);
2668 }
2669
2670 /*
2671  * Generate the parity from the data columns. If we tried and were able to
2672  * read the parity without error, verify that the generated parity matches the
2673  * data we read. If it doesn't, we fire off a checksum error. Return the
2674  * number of such failures.
2675  */
2676 static int
2677 raidz_parity_verify(zio_t *zio, raidz_row_t *rr)
2678 {
2679         abd_t *orig[VDEV_RAIDZ_MAXPARITY];
2680         int c, ret = 0;
2681         raidz_map_t *rm = zio->io_vsd;
2682         raidz_col_t *rc;
2683
2684         blkptr_t *bp = zio->io_bp;
2685         enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
2686             (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
2687
2688         if (checksum == ZIO_CHECKSUM_NOPARITY)
2689                 return (ret);
2690
2691         for (c = 0; c < rr->rr_firstdatacol; c++) {
2692                 rc = &rr->rr_col[c];
2693                 if (!rc->rc_tried || rc->rc_error != 0)
2694                         continue;
2695
2696                 orig[c] = rc->rc_abd;
2697                 ASSERT3U(abd_get_size(rc->rc_abd), ==, rc->rc_size);
2698                 rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
2699         }
2700
2701         /*
2702          * Verify any empty sectors are zero filled to ensure the parity
2703          * is calculated correctly even if these non-data sectors are damaged.
2704          */
2705         if (rr->rr_nempty && rr->rr_abd_empty != NULL)
2706                 ret += vdev_draid_map_verify_empty(zio, rr);
2707
2708         /*
2709          * Regenerates parity even for !tried||rc_error!=0 columns.  This
2710          * isn't harmful but it does have the side effect of fixing stuff
2711          * we didn't realize was necessary (i.e. even if we return 0).
2712          */
2713         vdev_raidz_generate_parity_row(rm, rr);
2714
2715         for (c = 0; c < rr->rr_firstdatacol; c++) {
2716                 rc = &rr->rr_col[c];
2717
2718                 if (!rc->rc_tried || rc->rc_error != 0)
2719                         continue;
2720
2721                 if (abd_cmp(orig[c], rc->rc_abd) != 0) {
2722                         zfs_dbgmsg("found error on col=%u devidx=%u off %llx",
2723                             c, (int)rc->rc_devidx, (u_longlong_t)rc->rc_offset);
2724                         vdev_raidz_checksum_error(zio, rc, orig[c]);
2725                         rc->rc_error = SET_ERROR(ECKSUM);
2726                         ret++;
2727                 }
2728                 abd_free(orig[c]);
2729         }
2730
2731         return (ret);
2732 }
2733
2734 static int
2735 vdev_raidz_worst_error(raidz_row_t *rr)
2736 {
2737         int error = 0;
2738
2739         for (int c = 0; c < rr->rr_cols; c++) {
2740                 error = zio_worst_error(error, rr->rr_col[c].rc_error);
2741                 error = zio_worst_error(error, rr->rr_col[c].rc_shadow_error);
2742         }
2743
2744         return (error);
2745 }
2746
2747 static void
2748 vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
2749 {
2750         int unexpected_errors = 0;
2751         int parity_errors = 0;
2752         int parity_untried = 0;
2753         int data_errors = 0;
2754
2755         ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
2756
2757         for (int c = 0; c < rr->rr_cols; c++) {
2758                 raidz_col_t *rc = &rr->rr_col[c];
2759
2760                 if (rc->rc_error) {
2761                         if (c < rr->rr_firstdatacol)
2762                                 parity_errors++;
2763                         else
2764                                 data_errors++;
2765
2766                         if (!rc->rc_skipped)
2767                                 unexpected_errors++;
2768                 } else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
2769                         parity_untried++;
2770                 }
2771
2772                 if (rc->rc_force_repair)
2773                         unexpected_errors++;
2774         }
2775
2776         /*
2777          * If we read more parity disks than were used for
2778          * reconstruction, confirm that the other parity disks produced
2779          * correct data.
2780          *
2781          * Note that we also regenerate parity when resilvering so we
2782          * can write it out to failed devices later.
2783          */
2784         if (parity_errors + parity_untried <
2785             rr->rr_firstdatacol - data_errors ||
2786             (zio->io_flags & ZIO_FLAG_RESILVER)) {
2787                 int n = raidz_parity_verify(zio, rr);
2788                 unexpected_errors += n;
2789         }
2790
2791         if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2792             (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2793                 /*
2794                  * Use the good data we have in hand to repair damaged children.
2795                  */
2796                 for (int c = 0; c < rr->rr_cols; c++) {
2797                         raidz_col_t *rc = &rr->rr_col[c];
2798                         vdev_t *vd = zio->io_vd;
2799                         vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2800
2801                         if (!rc->rc_allow_repair) {
2802                                 continue;
2803                         } else if (!rc->rc_force_repair &&
2804                             (rc->rc_error == 0 || rc->rc_size == 0)) {
2805                                 continue;
2806                         }
2807                         /*
2808                          * We do not allow self healing for Direct I/O reads.
2809                          * See comment in vdev_raid_row_alloc().
2810                          */
2811                         ASSERT0(zio->io_flags & ZIO_FLAG_DIO_READ);
2812
2813                         zfs_dbgmsg("zio=%px repairing c=%u devidx=%u "
2814                             "offset=%llx",
2815                             zio, c, rc->rc_devidx, (long long)rc->rc_offset);
2816
2817                         zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2818                             rc->rc_offset, rc->rc_abd, rc->rc_size,
2819                             ZIO_TYPE_WRITE,
2820                             zio->io_priority == ZIO_PRIORITY_REBUILD ?
2821                             ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE,
2822                             ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2823                             ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2824                 }
2825         }
2826
2827         /*
2828          * Scrub or resilver i/o's: overwrite any shadow locations with the
2829          * good data.  This ensures that if we've already copied this sector,
2830          * it will be corrected if it was damaged.  This writes more than is
2831          * necessary, but since expansion is paused during scrub/resilver, at
2832          * most a single row will have a shadow location.
2833          */
2834         if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2835             (zio->io_flags & (ZIO_FLAG_RESILVER | ZIO_FLAG_SCRUB))) {
2836                 for (int c = 0; c < rr->rr_cols; c++) {
2837                         raidz_col_t *rc = &rr->rr_col[c];
2838                         vdev_t *vd = zio->io_vd;
2839
2840                         if (rc->rc_shadow_devidx == INT_MAX || rc->rc_size == 0)
2841                                 continue;
2842                         vdev_t *cvd = vd->vdev_child[rc->rc_shadow_devidx];
2843
2844                         /*
2845                          * Note: We don't want to update the repair stats
2846                          * because that would incorrectly indicate that there
2847                          * was bad data to repair, which we aren't sure about.
2848                          * By clearing the SCAN_THREAD flag, we prevent this
2849                          * from happening, despite having the REPAIR flag set.
2850                          * We need to set SELF_HEAL so that this i/o can't be
2851                          * bypassed by zio_vdev_io_start().
2852                          */
2853                         zio_t *cio = zio_vdev_child_io(zio, NULL, cvd,
2854                             rc->rc_shadow_offset, rc->rc_abd, rc->rc_size,
2855                             ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
2856                             ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL,
2857                             NULL, NULL);
2858                         cio->io_flags &= ~ZIO_FLAG_SCAN_THREAD;
2859                         zio_nowait(cio);
2860                 }
2861         }
2862 }
2863
2864 static void
2865 raidz_restore_orig_data(raidz_map_t *rm)
2866 {
2867         for (int i = 0; i < rm->rm_nrows; i++) {
2868                 raidz_row_t *rr = rm->rm_row[i];
2869                 for (int c = 0; c < rr->rr_cols; c++) {
2870                         raidz_col_t *rc = &rr->rr_col[c];
2871                         if (rc->rc_need_orig_restore) {
2872                                 abd_copy(rc->rc_abd,
2873                                     rc->rc_orig_data, rc->rc_size);
2874                                 rc->rc_need_orig_restore = B_FALSE;
2875                         }
2876                 }
2877         }
2878 }
2879
2880 /*
2881  * During raidz_reconstruct() for expanded VDEV, we need special consideration
2882  * failure simulations.  See note in raidz_reconstruct() on simulating failure
2883  * of a pre-expansion device.
2884  *
2885  * Treating logical child i as failed, return TRUE if the given column should
2886  * be treated as failed.  The idea of logical children allows us to imagine
2887  * that a disk silently failed before a RAIDZ expansion (reads from this disk
2888  * succeed but return the wrong data).  Since the expansion doesn't verify
2889  * checksums, the incorrect data will be moved to new locations spread among
2890  * the children (going diagonally across them).
2891  *
2892  * Higher "logical child failures" (values of `i`) indicate these
2893  * "pre-expansion failures".  The first physical_width values imagine that a
2894  * current child failed; the next physical_width-1 values imagine that a
2895  * child failed before the most recent expansion; the next physical_width-2
2896  * values imagine a child failed in the expansion before that, etc.
2897  */
2898 static boolean_t
2899 raidz_simulate_failure(int physical_width, int original_width, int ashift,
2900     int i, raidz_col_t *rc)
2901 {
2902         uint64_t sector_id =
2903             physical_width * (rc->rc_offset >> ashift) +
2904             rc->rc_devidx;
2905
2906         for (int w = physical_width; w >= original_width; w--) {
2907                 if (i < w) {
2908                         return (sector_id % w == i);
2909                 } else {
2910                         i -= w;
2911                 }
2912         }
2913         ASSERT(!"invalid logical child id");
2914         return (B_FALSE);
2915 }
2916
2917 /*
2918  * returns EINVAL if reconstruction of the block will not be possible
2919  * returns ECKSUM if this specific reconstruction failed
2920  * returns 0 on successful reconstruction
2921  */
2922 static int
2923 raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
2924 {
2925         raidz_map_t *rm = zio->io_vsd;
2926         int physical_width = zio->io_vd->vdev_children;
2927         int original_width = (rm->rm_original_width != 0) ?
2928             rm->rm_original_width : physical_width;
2929         int dbgmsg = zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT;
2930
2931         if (dbgmsg) {
2932                 zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px ltgts=%u,%u,%u "
2933                     "ntgts=%u", zio, ltgts[0], ltgts[1], ltgts[2], ntgts);
2934         }
2935
2936         /* Reconstruct each row */
2937         for (int r = 0; r < rm->rm_nrows; r++) {
2938                 raidz_row_t *rr = rm->rm_row[r];
2939                 int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */
2940                 int t = 0;
2941                 int dead = 0;
2942                 int dead_data = 0;
2943
2944                 if (dbgmsg)
2945                         zfs_dbgmsg("raidz_reconstruct_expanded(row=%u)", r);
2946
2947                 for (int c = 0; c < rr->rr_cols; c++) {
2948                         raidz_col_t *rc = &rr->rr_col[c];
2949                         ASSERT0(rc->rc_need_orig_restore);
2950                         if (rc->rc_error != 0) {
2951                                 dead++;
2952                                 if (c >= nparity)
2953                                         dead_data++;
2954                                 continue;
2955                         }
2956                         if (rc->rc_size == 0)
2957                                 continue;
2958                         for (int lt = 0; lt < ntgts; lt++) {
2959                                 if (raidz_simulate_failure(physical_width,
2960                                     original_width,
2961                                     zio->io_vd->vdev_top->vdev_ashift,
2962                                     ltgts[lt], rc)) {
2963                                         if (rc->rc_orig_data == NULL) {
2964                                                 rc->rc_orig_data =
2965                                                     abd_alloc_linear(
2966                                                     rc->rc_size, B_TRUE);
2967                                                 abd_copy(rc->rc_orig_data,
2968                                                     rc->rc_abd, rc->rc_size);
2969                                         }
2970                                         rc->rc_need_orig_restore = B_TRUE;
2971
2972                                         dead++;
2973                                         if (c >= nparity)
2974                                                 dead_data++;
2975                                         /*
2976                                          * Note: simulating failure of a
2977                                          * pre-expansion device can hit more
2978                                          * than one column, in which case we
2979                                          * might try to simulate more failures
2980                                          * than can be reconstructed, which is
2981                                          * also more than the size of my_tgts.
2982                                          * This check prevents accessing past
2983                                          * the end of my_tgts.  The "dead >
2984                                          * nparity" check below will fail this
2985                                          * reconstruction attempt.
2986                                          */
2987                                         if (t < VDEV_RAIDZ_MAXPARITY) {
2988                                                 my_tgts[t++] = c;
2989                                                 if (dbgmsg) {
2990                                                         zfs_dbgmsg("simulating "
2991                                                             "failure of col %u "
2992                                                             "devidx %u", c,
2993                                                             (int)rc->rc_devidx);
2994                                                 }
2995                                         }
2996                                         break;
2997                                 }
2998                         }
2999                 }
3000                 if (dead > nparity) {
3001                         /* reconstruction not possible */
3002                         if (dbgmsg) {
3003                                 zfs_dbgmsg("reconstruction not possible; "
3004                                     "too many failures");
3005                         }
3006                         raidz_restore_orig_data(rm);
3007                         return (EINVAL);
3008                 }
3009                 if (dead_data > 0)
3010                         vdev_raidz_reconstruct_row(rm, rr, my_tgts, t);
3011         }
3012
3013         /* Check for success */
3014         if (raidz_checksum_verify(zio) == 0) {
3015                 if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)
3016                         return (0);
3017
3018                 /* Reconstruction succeeded - report errors */
3019                 for (int i = 0; i < rm->rm_nrows; i++) {
3020                         raidz_row_t *rr = rm->rm_row[i];
3021
3022                         for (int c = 0; c < rr->rr_cols; c++) {
3023                                 raidz_col_t *rc = &rr->rr_col[c];
3024                                 if (rc->rc_need_orig_restore) {
3025                                         /*
3026                                          * Note: if this is a parity column,
3027                                          * we don't really know if it's wrong.
3028                                          * We need to let
3029                                          * vdev_raidz_io_done_verified() check
3030                                          * it, and if we set rc_error, it will
3031                                          * think that it is a "known" error
3032                                          * that doesn't need to be checked
3033                                          * or corrected.
3034                                          */
3035                                         if (rc->rc_error == 0 &&
3036                                             c >= rr->rr_firstdatacol) {
3037                                                 vdev_raidz_checksum_error(zio,
3038                                                     rc, rc->rc_orig_data);
3039                                                 rc->rc_error =
3040                                                     SET_ERROR(ECKSUM);
3041                                         }
3042                                         rc->rc_need_orig_restore = B_FALSE;
3043                                 }
3044                         }
3045
3046                         vdev_raidz_io_done_verified(zio, rr);
3047                 }
3048
3049                 zio_checksum_verified(zio);
3050
3051                 if (dbgmsg) {
3052                         zfs_dbgmsg("reconstruction successful "
3053                             "(checksum verified)");
3054                 }
3055                 return (0);
3056         }
3057
3058         /* Reconstruction failed - restore original data */
3059         raidz_restore_orig_data(rm);
3060         if (dbgmsg) {
3061                 zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px) checksum "
3062                     "failed", zio);
3063         }
3064         return (ECKSUM);
3065 }
3066
3067 /*
3068  * Iterate over all combinations of N bad vdevs and attempt a reconstruction.
3069  * Note that the algorithm below is non-optimal because it doesn't take into
3070  * account how reconstruction is actually performed. For example, with
3071  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
3072  * is targeted as invalid as if columns 1 and 4 are targeted since in both
3073  * cases we'd only use parity information in column 0.
3074  *
3075  * The order that we find the various possible combinations of failed
3076  * disks is dictated by these rules:
3077  * - Examine each "slot" (the "i" in tgts[i])
3078  *   - Try to increment this slot (tgts[i] += 1)
3079  *   - if we can't increment because it runs into the next slot,
3080  *     reset our slot to the minimum, and examine the next slot
3081  *
3082  *  For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose
3083  *  3 columns to reconstruct), we will generate the following sequence:
3084  *
3085  *  STATE        ACTION
3086  *  0 1 2        special case: skip since these are all parity
3087  *  0 1   3      first slot: reset to 0; middle slot: increment to 2
3088  *  0   2 3      first slot: increment to 1
3089  *    1 2 3      first: reset to 0; middle: reset to 1; last: increment to 4
3090  *  0 1     4    first: reset to 0; middle: increment to 2
3091  *  0   2   4    first: increment to 1
3092  *    1 2   4    first: reset to 0; middle: increment to 3
3093  *  0     3 4    first: increment to 1
3094  *    1   3 4    first: increment to 2
3095  *      2 3 4    first: reset to 0; middle: reset to 1; last: increment to 5
3096  *  0 1       5  first: reset to 0; middle: increment to 2
3097  *  0   2     5  first: increment to 1
3098  *    1 2     5  first: reset to 0; middle: increment to 3
3099  *  0     3   5  first: increment to 1
3100  *    1   3   5  first: increment to 2
3101  *      2 3   5  first: reset to 0; middle: increment to 4
3102  *  0       4 5  first: increment to 1
3103  *    1     4 5  first: increment to 2
3104  *      2   4 5  first: increment to 3
3105  *        3 4 5  done
3106  *
3107  * This strategy works for dRAID but is less efficient when there are a large
3108  * number of child vdevs and therefore permutations to check. Furthermore,
3109  * since the raidz_map_t rows likely do not overlap, reconstruction would be
3110  * possible as long as there are no more than nparity data errors per row.
3111  * These additional permutations are not currently checked but could be as
3112  * a future improvement.
3113  *
3114  * Returns 0 on success, ECKSUM on failure.
3115  */
3116 static int
3117 vdev_raidz_combrec(zio_t *zio)
3118 {
3119         int nparity = vdev_get_nparity(zio->io_vd);
3120         raidz_map_t *rm = zio->io_vsd;
3121         int physical_width = zio->io_vd->vdev_children;
3122         int original_width = (rm->rm_original_width != 0) ?
3123             rm->rm_original_width : physical_width;
3124
3125         for (int i = 0; i < rm->rm_nrows; i++) {
3126                 raidz_row_t *rr = rm->rm_row[i];
3127                 int total_errors = 0;
3128
3129                 for (int c = 0; c < rr->rr_cols; c++) {
3130                         if (rr->rr_col[c].rc_error)
3131                                 total_errors++;
3132                 }
3133
3134                 if (total_errors > nparity)
3135                         return (vdev_raidz_worst_error(rr));
3136         }
3137
3138         for (int num_failures = 1; num_failures <= nparity; num_failures++) {
3139                 int tstore[VDEV_RAIDZ_MAXPARITY + 2];
3140                 int *ltgts = &tstore[1]; /* value is logical child ID */
3141
3142
3143                 /*
3144                  * Determine number of logical children, n.  See comment
3145                  * above raidz_simulate_failure().
3146                  */
3147                 int n = 0;
3148                 for (int w = physical_width;
3149                     w >= original_width; w--) {
3150                         n += w;
3151                 }
3152
3153                 ASSERT3U(num_failures, <=, nparity);
3154                 ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY);
3155
3156                 /* Handle corner cases in combrec logic */
3157                 ltgts[-1] = -1;
3158                 for (int i = 0; i < num_failures; i++) {
3159                         ltgts[i] = i;
3160                 }
3161                 ltgts[num_failures] = n;
3162
3163                 for (;;) {
3164                         int err = raidz_reconstruct(zio, ltgts, num_failures,
3165                             nparity);
3166                         if (err == EINVAL) {
3167                                 /*
3168                                  * Reconstruction not possible with this #
3169                                  * failures; try more failures.
3170                                  */
3171                                 break;
3172                         } else if (err == 0)
3173                                 return (0);
3174
3175                         /* Compute next targets to try */
3176                         for (int t = 0; ; t++) {
3177                                 ASSERT3U(t, <, num_failures);
3178                                 ltgts[t]++;
3179                                 if (ltgts[t] == n) {
3180                                         /* try more failures */
3181                                         ASSERT3U(t, ==, num_failures - 1);
3182                                         if (zfs_flags &
3183                                             ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
3184                                                 zfs_dbgmsg("reconstruction "
3185                                                     "failed for num_failures="
3186                                                     "%u; tried all "
3187                                                     "combinations",
3188                                                     num_failures);
3189                                         }
3190                                         break;
3191                                 }
3192
3193                                 ASSERT3U(ltgts[t], <, n);
3194                                 ASSERT3U(ltgts[t], <=, ltgts[t + 1]);
3195
3196                                 /*
3197                                  * If that spot is available, we're done here.
3198                                  * Try the next combination.
3199                                  */
3200                                 if (ltgts[t] != ltgts[t + 1])
3201                                         break; // found next combination
3202
3203                                 /*
3204                                  * Otherwise, reset this tgt to the minimum,
3205                                  * and move on to the next tgt.
3206                                  */
3207                                 ltgts[t] = ltgts[t - 1] + 1;
3208                                 ASSERT3U(ltgts[t], ==, t);
3209                         }
3210
3211                         /* Increase the number of failures and keep trying. */
3212                         if (ltgts[num_failures - 1] == n)
3213                                 break;
3214                 }
3215         }
3216         if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
3217                 zfs_dbgmsg("reconstruction failed for all num_failures");
3218         return (ECKSUM);
3219 }
3220
3221 void
3222 vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt)
3223 {
3224         for (uint64_t row = 0; row < rm->rm_nrows; row++) {
3225                 raidz_row_t *rr = rm->rm_row[row];
3226                 vdev_raidz_reconstruct_row(rm, rr, t, nt);
3227         }
3228 }
3229
3230 /*
3231  * Complete a write IO operation on a RAIDZ VDev
3232  *
3233  * Outline:
3234  *   1. Check for errors on the child IOs.
3235  *   2. Return, setting an error code if too few child VDevs were written
3236  *      to reconstruct the data later.  Note that partial writes are
3237  *      considered successful if they can be reconstructed at all.
3238  */
3239 static void
3240 vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr)
3241 {
3242         int normal_errors = 0;
3243         int shadow_errors = 0;
3244
3245         ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
3246         ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
3247         ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
3248
3249         for (int c = 0; c < rr->rr_cols; c++) {
3250                 raidz_col_t *rc = &rr->rr_col[c];
3251
3252                 if (rc->rc_error != 0) {
3253                         ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
3254                         normal_errors++;
3255                 }
3256                 if (rc->rc_shadow_error != 0) {
3257                         ASSERT(rc->rc_shadow_error != ECKSUM);
3258                         shadow_errors++;
3259                 }
3260         }
3261
3262         /*
3263          * Treat partial writes as a success. If we couldn't write enough
3264          * columns to reconstruct the data, the I/O failed.  Otherwise, good
3265          * enough.  Note that in the case of a shadow write (during raidz
3266          * expansion), depending on if we crash, either the normal (old) or
3267          * shadow (new) location may become the "real" version of the block,
3268          * so both locations must have sufficient redundancy.
3269          *
3270          * Now that we support write reallocation, it would be better
3271          * to treat partial failure as real failure unless there are
3272          * no non-degraded top-level vdevs left, and not update DTLs
3273          * if we intend to reallocate.
3274          */
3275         if (normal_errors > rr->rr_firstdatacol ||
3276             shadow_errors > rr->rr_firstdatacol) {
3277                 zio->io_error = zio_worst_error(zio->io_error,
3278                     vdev_raidz_worst_error(rr));
3279         }
3280 }
3281
3282 static void
3283 vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm,
3284     raidz_row_t *rr)
3285 {
3286         int parity_errors = 0;
3287         int parity_untried = 0;
3288         int data_errors = 0;
3289         int total_errors = 0;
3290
3291         ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
3292         ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
3293
3294         for (int c = 0; c < rr->rr_cols; c++) {
3295                 raidz_col_t *rc = &rr->rr_col[c];
3296
3297                 /*
3298                  * If scrubbing and a replacing/sparing child vdev determined
3299                  * that not all of its children have an identical copy of the
3300                  * data, then clear the error so the column is treated like
3301                  * any other read and force a repair to correct the damage.
3302                  */
3303                 if (rc->rc_error == ECKSUM) {
3304                         ASSERT(zio->io_flags & ZIO_FLAG_SCRUB);
3305                         vdev_raidz_checksum_error(zio, rc, rc->rc_abd);
3306                         rc->rc_force_repair = 1;
3307                         rc->rc_error = 0;
3308                 }
3309
3310                 if (rc->rc_error) {
3311                         if (c < rr->rr_firstdatacol)
3312                                 parity_errors++;
3313                         else
3314                                 data_errors++;
3315
3316                         total_errors++;
3317                 } else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
3318                         parity_untried++;
3319                 }
3320         }
3321
3322         /*
3323          * If there were data errors and the number of errors we saw was
3324          * correctable -- less than or equal to the number of parity disks read
3325          * -- reconstruct based on the missing data.
3326          */
3327         if (data_errors != 0 &&
3328             total_errors <= rr->rr_firstdatacol - parity_untried) {
3329                 /*
3330                  * We either attempt to read all the parity columns or
3331                  * none of them. If we didn't try to read parity, we
3332                  * wouldn't be here in the correctable case. There must
3333                  * also have been fewer parity errors than parity
3334                  * columns or, again, we wouldn't be in this code path.
3335                  */
3336                 ASSERT(parity_untried == 0);
3337                 ASSERT(parity_errors < rr->rr_firstdatacol);
3338
3339                 /*
3340                  * Identify the data columns that reported an error.
3341                  */
3342                 int n = 0;
3343                 int tgts[VDEV_RAIDZ_MAXPARITY];
3344                 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
3345                         raidz_col_t *rc = &rr->rr_col[c];
3346                         if (rc->rc_error != 0) {
3347                                 ASSERT(n < VDEV_RAIDZ_MAXPARITY);
3348                                 tgts[n++] = c;
3349                         }
3350                 }
3351
3352                 ASSERT(rr->rr_firstdatacol >= n);
3353
3354                 vdev_raidz_reconstruct_row(rm, rr, tgts, n);
3355         }
3356 }
3357
3358 /*
3359  * Return the number of reads issued.
3360  */
3361 static int
3362 vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr)
3363 {
3364         vdev_t *vd = zio->io_vd;
3365         int nread = 0;
3366
3367         rr->rr_missingdata = 0;
3368         rr->rr_missingparity = 0;
3369
3370         /*
3371          * If this rows contains empty sectors which are not required
3372          * for a normal read then allocate an ABD for them now so they
3373          * may be read, verified, and any needed repairs performed.
3374          */
3375         if (rr->rr_nempty != 0 && rr->rr_abd_empty == NULL)
3376                 vdev_draid_map_alloc_empty(zio, rr);
3377
3378         for (int c = 0; c < rr->rr_cols; c++) {
3379                 raidz_col_t *rc = &rr->rr_col[c];
3380                 if (rc->rc_tried || rc->rc_size == 0)
3381                         continue;
3382
3383                 zio_nowait(zio_vdev_child_io(zio, NULL,
3384                     vd->vdev_child[rc->rc_devidx],
3385                     rc->rc_offset, rc->rc_abd, rc->rc_size,
3386                     zio->io_type, zio->io_priority, 0,
3387                     vdev_raidz_child_done, rc));
3388                 nread++;
3389         }
3390         return (nread);
3391 }
3392
3393 /*
3394  * We're here because either there were too many errors to even attempt
3395  * reconstruction (total_errors == rm_first_datacol), or vdev_*_combrec()
3396  * failed. In either case, there is enough bad data to prevent reconstruction.
3397  * Start checksum ereports for all children which haven't failed.
3398  */
3399 static void
3400 vdev_raidz_io_done_unrecoverable(zio_t *zio)
3401 {
3402         raidz_map_t *rm = zio->io_vsd;
3403
3404         for (int i = 0; i < rm->rm_nrows; i++) {
3405                 raidz_row_t *rr = rm->rm_row[i];
3406
3407                 for (int c = 0; c < rr->rr_cols; c++) {
3408                         raidz_col_t *rc = &rr->rr_col[c];
3409                         vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
3410
3411                         if (rc->rc_error != 0)
3412                                 continue;
3413
3414                         zio_bad_cksum_t zbc;
3415                         zbc.zbc_has_cksum = 0;
3416                         zbc.zbc_injected = rm->rm_ecksuminjected;
3417                         mutex_enter(&cvd->vdev_stat_lock);
3418                         cvd->vdev_stat.vs_checksum_errors++;
3419                         mutex_exit(&cvd->vdev_stat_lock);
3420                         (void) zfs_ereport_start_checksum(zio->io_spa,
3421                             cvd, &zio->io_bookmark, zio, rc->rc_offset,
3422                             rc->rc_size, &zbc);
3423                 }
3424         }
3425 }
3426
3427 void
3428 vdev_raidz_io_done(zio_t *zio)
3429 {
3430         raidz_map_t *rm = zio->io_vsd;
3431
3432         ASSERT(zio->io_bp != NULL);
3433         if (zio->io_type == ZIO_TYPE_WRITE) {
3434                 for (int i = 0; i < rm->rm_nrows; i++) {
3435                         vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]);
3436                 }
3437         } else {
3438                 if (rm->rm_phys_col) {
3439                         /*
3440                          * This is an aggregated read.  Copy the data and status
3441                          * from the aggregate abd's to the individual rows.
3442                          */
3443                         for (int i = 0; i < rm->rm_nrows; i++) {
3444                                 raidz_row_t *rr = rm->rm_row[i];
3445
3446                                 for (int c = 0; c < rr->rr_cols; c++) {
3447                                         raidz_col_t *rc = &rr->rr_col[c];
3448                                         if (rc->rc_tried || rc->rc_size == 0)
3449                                                 continue;
3450
3451                                         raidz_col_t *prc =
3452                                             &rm->rm_phys_col[rc->rc_devidx];
3453                                         rc->rc_error = prc->rc_error;
3454                                         rc->rc_tried = prc->rc_tried;
3455                                         rc->rc_skipped = prc->rc_skipped;
3456                                         if (c >= rr->rr_firstdatacol) {
3457                                                 /*
3458                                                  * Note: this is slightly faster
3459                                                  * than using abd_copy_off().
3460                                                  */
3461                                                 char *physbuf = abd_to_buf(
3462                                                     prc->rc_abd);
3463                                                 void *physloc = physbuf +
3464                                                     rc->rc_offset -
3465                                                     prc->rc_offset;
3466
3467                                                 abd_copy_from_buf(rc->rc_abd,
3468                                                     physloc, rc->rc_size);
3469                                         }
3470                                 }
3471                         }
3472                 }
3473
3474                 for (int i = 0; i < rm->rm_nrows; i++) {
3475                         raidz_row_t *rr = rm->rm_row[i];
3476                         vdev_raidz_io_done_reconstruct_known_missing(zio,
3477                             rm, rr);
3478                 }
3479
3480                 if (raidz_checksum_verify(zio) == 0) {
3481                         if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)
3482                                 goto done;
3483
3484                         for (int i = 0; i < rm->rm_nrows; i++) {
3485                                 raidz_row_t *rr = rm->rm_row[i];
3486                                 vdev_raidz_io_done_verified(zio, rr);
3487                         }
3488                         zio_checksum_verified(zio);
3489                 } else {
3490                         /*
3491                          * A sequential resilver has no checksum which makes
3492                          * combinatoral reconstruction impossible. This code
3493                          * path is unreachable since raidz_checksum_verify()
3494                          * has no checksum to verify and must succeed.
3495                          */
3496                         ASSERT3U(zio->io_priority, !=, ZIO_PRIORITY_REBUILD);
3497
3498                         /*
3499                          * This isn't a typical situation -- either we got a
3500                          * read error or a child silently returned bad data.
3501                          * Read every block so we can try again with as much
3502                          * data and parity as we can track down. If we've
3503                          * already been through once before, all children will
3504                          * be marked as tried so we'll proceed to combinatorial
3505                          * reconstruction.
3506                          */
3507                         int nread = 0;
3508                         for (int i = 0; i < rm->rm_nrows; i++) {
3509                                 nread += vdev_raidz_read_all(zio,
3510                                     rm->rm_row[i]);
3511                         }
3512                         if (nread != 0) {
3513                                 /*
3514                                  * Normally our stage is VDEV_IO_DONE, but if
3515                                  * we've already called redone(), it will have
3516                                  * changed to VDEV_IO_START, in which case we
3517                                  * don't want to call redone() again.
3518                                  */
3519                                 if (zio->io_stage != ZIO_STAGE_VDEV_IO_START)
3520                                         zio_vdev_io_redone(zio);
3521                                 return;
3522                         }
3523                         /*
3524                          * It would be too expensive to try every possible
3525                          * combination of failed sectors in every row, so
3526                          * instead we try every combination of failed current or
3527                          * past physical disk. This means that if the incorrect
3528                          * sectors were all on Nparity disks at any point in the
3529                          * past, we will find the correct data.  The only known
3530                          * case where this is less durable than a non-expanded
3531                          * RAIDZ, is if we have a silent failure during
3532                          * expansion.  In that case, one block could be
3533                          * partially in the old format and partially in the
3534                          * new format, so we'd lost some sectors from the old
3535                          * format and some from the new format.
3536                          *
3537                          * e.g. logical_width=4 physical_width=6
3538                          * the 15 (6+5+4) possible failed disks are:
3539                          * width=6 child=0
3540                          * width=6 child=1
3541                          * width=6 child=2
3542                          * width=6 child=3
3543                          * width=6 child=4
3544                          * width=6 child=5
3545                          * width=5 child=0
3546                          * width=5 child=1
3547                          * width=5 child=2
3548                          * width=5 child=3
3549                          * width=5 child=4
3550                          * width=4 child=0
3551                          * width=4 child=1
3552                          * width=4 child=2
3553                          * width=4 child=3
3554                          * And we will try every combination of Nparity of these
3555                          * failing.
3556                          *
3557                          * As a first pass, we can generate every combo,
3558                          * and try reconstructing, ignoring any known
3559                          * failures.  If any row has too many known + simulated
3560                          * failures, then we bail on reconstructing with this
3561                          * number of simulated failures.  As an improvement,
3562                          * we could detect the number of whole known failures
3563                          * (i.e. we have known failures on these disks for
3564                          * every row; the disks never succeeded), and
3565                          * subtract that from the max # failures to simulate.
3566                          * We could go even further like the current
3567                          * combrec code, but that doesn't seem like it
3568                          * gains us very much.  If we simulate a failure
3569                          * that is also a known failure, that's fine.
3570                          */
3571                         zio->io_error = vdev_raidz_combrec(zio);
3572                         if (zio->io_error == ECKSUM &&
3573                             !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
3574                                 vdev_raidz_io_done_unrecoverable(zio);
3575                         }
3576                 }
3577         }
3578 done:
3579         if (rm->rm_lr != NULL) {
3580                 zfs_rangelock_exit(rm->rm_lr);
3581                 rm->rm_lr = NULL;
3582         }
3583 }
3584
3585 static void
3586 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
3587 {
3588         vdev_raidz_t *vdrz = vd->vdev_tsd;
3589         if (faulted > vdrz->vd_nparity)
3590                 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
3591                     VDEV_AUX_NO_REPLICAS);
3592         else if (degraded + faulted != 0)
3593                 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
3594         else
3595                 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
3596 }
3597
3598 /*
3599  * Determine if any portion of the provided block resides on a child vdev
3600  * with a dirty DTL and therefore needs to be resilvered.  The function
3601  * assumes that at least one DTL is dirty which implies that full stripe
3602  * width blocks must be resilvered.
3603  */
3604 static boolean_t
3605 vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
3606     uint64_t phys_birth)
3607 {
3608         vdev_raidz_t *vdrz = vd->vdev_tsd;
3609
3610         /*
3611          * If we're in the middle of a RAIDZ expansion, this block may be in
3612          * the old and/or new location.  For simplicity, always resilver it.
3613          */
3614         if (vdrz->vn_vre.vre_state == DSS_SCANNING)
3615                 return (B_TRUE);
3616
3617         uint64_t dcols = vd->vdev_children;
3618         uint64_t nparity = vdrz->vd_nparity;
3619         uint64_t ashift = vd->vdev_top->vdev_ashift;
3620         /* The starting RAIDZ (parent) vdev sector of the block. */
3621         uint64_t b = DVA_GET_OFFSET(dva) >> ashift;
3622         /* The zio's size in units of the vdev's minimum sector size. */
3623         uint64_t s = ((psize - 1) >> ashift) + 1;
3624         /* The first column for this stripe. */
3625         uint64_t f = b % dcols;
3626
3627         /* Unreachable by sequential resilver. */
3628         ASSERT3U(phys_birth, !=, TXG_UNKNOWN);
3629
3630         if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
3631                 return (B_FALSE);
3632
3633         if (s + nparity >= dcols)
3634                 return (B_TRUE);
3635
3636         for (uint64_t c = 0; c < s + nparity; c++) {
3637                 uint64_t devidx = (f + c) % dcols;
3638                 vdev_t *cvd = vd->vdev_child[devidx];
3639
3640                 /*
3641                  * dsl_scan_need_resilver() already checked vd with
3642                  * vdev_dtl_contains(). So here just check cvd with
3643                  * vdev_dtl_empty(), cheaper and a good approximation.
3644                  */
3645                 if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
3646                         return (B_TRUE);
3647         }
3648
3649         return (B_FALSE);
3650 }
3651
3652 static void
3653 vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs,
3654     range_seg64_t *physical_rs, range_seg64_t *remain_rs)
3655 {
3656         (void) remain_rs;
3657
3658         vdev_t *raidvd = cvd->vdev_parent;
3659         ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);
3660
3661         vdev_raidz_t *vdrz = raidvd->vdev_tsd;
3662
3663         if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
3664                 /*
3665                  * We're in the middle of expansion, in which case the
3666                  * translation is in flux.  Any answer we give may be wrong
3667                  * by the time we return, so it isn't safe for the caller to
3668                  * act on it.  Therefore we say that this range isn't present
3669                  * on any children.  The only consumers of this are "zpool
3670                  * initialize" and trimming, both of which are "best effort"
3671                  * anyway.
3672                  */
3673                 physical_rs->rs_start = physical_rs->rs_end = 0;
3674                 remain_rs->rs_start = remain_rs->rs_end = 0;
3675                 return;
3676         }
3677
3678         uint64_t width = vdrz->vd_physical_width;
3679         uint64_t tgt_col = cvd->vdev_id;
3680         uint64_t ashift = raidvd->vdev_top->vdev_ashift;
3681
3682         /* make sure the offsets are block-aligned */
3683         ASSERT0(logical_rs->rs_start % (1 << ashift));
3684         ASSERT0(logical_rs->rs_end % (1 << ashift));
3685         uint64_t b_start = logical_rs->rs_start >> ashift;
3686         uint64_t b_end = logical_rs->rs_end >> ashift;
3687
3688         uint64_t start_row = 0;
3689         if (b_start > tgt_col) /* avoid underflow */
3690                 start_row = ((b_start - tgt_col - 1) / width) + 1;
3691
3692         uint64_t end_row = 0;
3693         if (b_end > tgt_col)
3694                 end_row = ((b_end - tgt_col - 1) / width) + 1;
3695
3696         physical_rs->rs_start = start_row << ashift;
3697         physical_rs->rs_end = end_row << ashift;
3698
3699         ASSERT3U(physical_rs->rs_start, <=, logical_rs->rs_start);
3700         ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=,
3701             logical_rs->rs_end - logical_rs->rs_start);
3702 }
3703
3704 static void
3705 raidz_reflow_sync(void *arg, dmu_tx_t *tx)
3706 {
3707         spa_t *spa = arg;
3708         int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
3709         vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
3710
3711         /*
3712          * Ensure there are no i/os to the range that is being committed.
3713          */
3714         uint64_t old_offset = RRSS_GET_OFFSET(&spa->spa_uberblock);
3715         ASSERT3U(vre->vre_offset_pertxg[txgoff], >=, old_offset);
3716
3717         mutex_enter(&vre->vre_lock);
3718         uint64_t new_offset =
3719             MIN(vre->vre_offset_pertxg[txgoff], vre->vre_failed_offset);
3720         /*
3721          * We should not have committed anything that failed.
3722          */
3723         VERIFY3U(vre->vre_failed_offset, >=, old_offset);
3724         mutex_exit(&vre->vre_lock);
3725
3726         zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock,
3727             old_offset, new_offset - old_offset,
3728             RL_WRITER);
3729
3730         /*
3731          * Update the uberblock that will be written when this txg completes.
3732          */
3733         RAIDZ_REFLOW_SET(&spa->spa_uberblock,
3734             RRSS_SCRATCH_INVALID_SYNCED_REFLOW, new_offset);
3735         vre->vre_offset_pertxg[txgoff] = 0;
3736         zfs_rangelock_exit(lr);
3737
3738         mutex_enter(&vre->vre_lock);
3739         vre->vre_bytes_copied += vre->vre_bytes_copied_pertxg[txgoff];
3740         vre->vre_bytes_copied_pertxg[txgoff] = 0;
3741         mutex_exit(&vre->vre_lock);
3742
3743         vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
3744         VERIFY0(zap_update(spa->spa_meta_objset,
3745             vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED,
3746             sizeof (vre->vre_bytes_copied), 1, &vre->vre_bytes_copied, tx));
3747 }
3748
3749 static void
3750 raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx)
3751 {
3752         spa_t *spa = arg;
3753         vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
3754         vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
3755         vdev_raidz_t *vdrz = raidvd->vdev_tsd;
3756
3757         for (int i = 0; i < TXG_SIZE; i++)
3758                 VERIFY0(vre->vre_offset_pertxg[i]);
3759
3760         reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP);
3761         re->re_txg = tx->tx_txg + TXG_CONCURRENT_STATES;
3762         re->re_logical_width = vdrz->vd_physical_width;
3763         mutex_enter(&vdrz->vd_expand_lock);
3764         avl_add(&vdrz->vd_expand_txgs, re);
3765         mutex_exit(&vdrz->vd_expand_lock);
3766
3767         vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
3768
3769         /*
3770          * Dirty the config so that the updated ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS
3771          * will get written (based on vd_expand_txgs).
3772          */
3773         vdev_config_dirty(vd);
3774
3775         /*
3776          * Before we change vre_state, the on-disk state must reflect that we
3777          * have completed all copying, so that vdev_raidz_io_start() can use
3778          * vre_state to determine if the reflow is in progress.  See also the
3779          * end of spa_raidz_expand_thread().
3780          */
3781         VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==,
3782             raidvd->vdev_ms_count << raidvd->vdev_ms_shift);
3783
3784         vre->vre_end_time = gethrestime_sec();
3785         vre->vre_state = DSS_FINISHED;
3786
3787         uint64_t state = vre->vre_state;
3788         VERIFY0(zap_update(spa->spa_meta_objset,
3789             vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
3790             sizeof (state), 1, &state, tx));
3791
3792         uint64_t end_time = vre->vre_end_time;
3793         VERIFY0(zap_update(spa->spa_meta_objset,
3794             vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME,
3795             sizeof (end_time), 1, &end_time, tx));
3796
3797         spa->spa_uberblock.ub_raidz_reflow_info = 0;
3798
3799         spa_history_log_internal(spa, "raidz vdev expansion completed",  tx,
3800             "%s vdev %llu new width %llu", spa_name(spa),
3801             (unsigned long long)vd->vdev_id,
3802             (unsigned long long)vd->vdev_children);
3803
3804         spa->spa_raidz_expand = NULL;
3805         raidvd->vdev_rz_expanding = B_FALSE;
3806
3807         spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
3808         spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
3809         spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);
3810
3811         spa_notify_waiters(spa);
3812
3813         /*
3814          * While we're in syncing context take the opportunity to
3815          * setup a scrub. All the data has been sucessfully copied
3816          * but we have not validated any checksums.
3817          */
3818         setup_sync_arg_t setup_sync_arg = {
3819                 .func = POOL_SCAN_SCRUB,
3820                 .txgstart = 0,
3821                 .txgend = 0,
3822         };
3823         if (zfs_scrub_after_expand &&
3824             dsl_scan_setup_check(&setup_sync_arg.func, tx) == 0) {
3825                 dsl_scan_setup_sync(&setup_sync_arg, tx);
3826         }
3827 }
3828
3829 /*
3830  * State of one copy batch.
3831  */
3832 typedef struct raidz_reflow_arg {
3833         vdev_raidz_expand_t *rra_vre;   /* Global expantion state. */
3834         zfs_locked_range_t *rra_lr;     /* Range lock of this batch. */
3835         uint64_t rra_txg;       /* TXG of this batch. */
3836         uint_t rra_ashift;      /* Ashift of the vdev. */
3837         uint32_t rra_tbd;       /* Number of in-flight ZIOs. */
3838         uint32_t rra_writes;    /* Number of write ZIOs. */
3839         zio_t *rra_zio[];       /* Write ZIO pointers. */
3840 } raidz_reflow_arg_t;
3841
3842 /*
3843  * Write of the new location on one child is done.  Once all of them are done
3844  * we can unlock and free everything.
3845  */
3846 static void
3847 raidz_reflow_write_done(zio_t *zio)
3848 {
3849         raidz_reflow_arg_t *rra = zio->io_private;
3850         vdev_raidz_expand_t *vre = rra->rra_vre;
3851
3852         abd_free(zio->io_abd);
3853
3854         mutex_enter(&vre->vre_lock);
3855         if (zio->io_error != 0) {
3856                 /* Force a reflow pause on errors */
3857                 vre->vre_failed_offset =
3858                     MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
3859         }
3860         ASSERT3U(vre->vre_outstanding_bytes, >=, zio->io_size);
3861         vre->vre_outstanding_bytes -= zio->io_size;
3862         if (rra->rra_lr->lr_offset + rra->rra_lr->lr_length <
3863             vre->vre_failed_offset) {
3864                 vre->vre_bytes_copied_pertxg[rra->rra_txg & TXG_MASK] +=
3865                     zio->io_size;
3866         }
3867         cv_signal(&vre->vre_cv);
3868         boolean_t done = (--rra->rra_tbd == 0);
3869         mutex_exit(&vre->vre_lock);
3870
3871         if (!done)
3872                 return;
3873         spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa);
3874         zfs_rangelock_exit(rra->rra_lr);
3875         kmem_free(rra, sizeof (*rra) + sizeof (zio_t *) * rra->rra_writes);
3876 }
3877
3878 /*
3879  * Read of the old location on one child is done.  Once all of them are done
3880  * writes should have all the data and we can issue them.
3881  */
3882 static void
3883 raidz_reflow_read_done(zio_t *zio)
3884 {
3885         raidz_reflow_arg_t *rra = zio->io_private;
3886         vdev_raidz_expand_t *vre = rra->rra_vre;
3887
3888         /* Reads of only one block use write ABDs.  For bigger free gangs. */
3889         if (zio->io_size > (1 << rra->rra_ashift))
3890                 abd_free(zio->io_abd);
3891
3892         /*
3893          * If the read failed, or if it was done on a vdev that is not fully
3894          * healthy (e.g. a child that has a resilver in progress), we may not
3895          * have the correct data.  Note that it's OK if the write proceeds.
3896          * It may write garbage but the location is otherwise unused and we
3897          * will retry later due to vre_failed_offset.
3898          */
3899         if (zio->io_error != 0 || !vdev_dtl_empty(zio->io_vd, DTL_MISSING)) {
3900                 zfs_dbgmsg("reflow read failed off=%llu size=%llu txg=%llu "
3901                     "err=%u partial_dtl_empty=%u missing_dtl_empty=%u",
3902                     (long long)rra->rra_lr->lr_offset,
3903                     (long long)rra->rra_lr->lr_length,
3904                     (long long)rra->rra_txg,
3905                     zio->io_error,
3906                     vdev_dtl_empty(zio->io_vd, DTL_PARTIAL),
3907                     vdev_dtl_empty(zio->io_vd, DTL_MISSING));
3908                 mutex_enter(&vre->vre_lock);
3909                 /* Force a reflow pause on errors */
3910                 vre->vre_failed_offset =
3911                     MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
3912                 mutex_exit(&vre->vre_lock);
3913         }
3914
3915         if (atomic_dec_32_nv(&rra->rra_tbd) > 0)
3916                 return;
3917         rra->rra_tbd = rra->rra_writes;
3918         for (uint64_t i = 0; i < rra->rra_writes; i++)
3919                 zio_nowait(rra->rra_zio[i]);
3920 }
3921
3922 static void
3923 raidz_reflow_record_progress(vdev_raidz_expand_t *vre, uint64_t offset,
3924     dmu_tx_t *tx)
3925 {
3926         int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
3927         spa_t *spa = dmu_tx_pool(tx)->dp_spa;
3928
3929         if (offset == 0)
3930                 return;
3931
3932         mutex_enter(&vre->vre_lock);
3933         ASSERT3U(vre->vre_offset, <=, offset);
3934         vre->vre_offset = offset;
3935         mutex_exit(&vre->vre_lock);
3936
3937         if (vre->vre_offset_pertxg[txgoff] == 0) {
3938                 dsl_sync_task_nowait(dmu_tx_pool(tx), raidz_reflow_sync,
3939                     spa, tx);
3940         }
3941         vre->vre_offset_pertxg[txgoff] = offset;
3942 }
3943
3944 static boolean_t
3945 vdev_raidz_expand_child_replacing(vdev_t *raidz_vd)
3946 {
3947         for (int i = 0; i < raidz_vd->vdev_children; i++) {
3948                 /* Quick check if a child is being replaced */
3949                 if (!raidz_vd->vdev_child[i]->vdev_ops->vdev_op_leaf)
3950                         return (B_TRUE);
3951         }
3952         return (B_FALSE);
3953 }
3954
3955 static boolean_t
3956 raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt,
3957     dmu_tx_t *tx)
3958 {
3959         spa_t *spa = vd->vdev_spa;
3960         uint_t ashift = vd->vdev_top->vdev_ashift;
3961
3962         range_seg_t *rs = range_tree_first(rt);
3963         if (rt == NULL)
3964                 return (B_FALSE);
3965         uint64_t offset = rs_get_start(rs, rt);
3966         ASSERT(IS_P2ALIGNED(offset, 1 << ashift));
3967         uint64_t size = rs_get_end(rs, rt) - offset;
3968         ASSERT3U(size, >=, 1 << ashift);
3969         ASSERT(IS_P2ALIGNED(size, 1 << ashift));
3970
3971         uint64_t blkid = offset >> ashift;
3972         uint_t old_children = vd->vdev_children - 1;
3973
3974         /*
3975          * We can only progress to the point that writes will not overlap
3976          * with blocks whose progress has not yet been recorded on disk.
3977          * Since partially-copied rows are still read from the old location,
3978          * we need to stop one row before the sector-wise overlap, to prevent
3979          * row-wise overlap.
3980          *
3981          * Note that even if we are skipping over a large unallocated region,
3982          * we can't move the on-disk progress to `offset`, because concurrent
3983          * writes/allocations could still use the currently-unallocated
3984          * region.
3985          */
3986         uint64_t ubsync_blkid =
3987             RRSS_GET_OFFSET(&spa->spa_ubsync) >> ashift;
3988         uint64_t next_overwrite_blkid = ubsync_blkid +
3989             ubsync_blkid / old_children - old_children;
3990         VERIFY3U(next_overwrite_blkid, >, ubsync_blkid);
3991         if (blkid >= next_overwrite_blkid) {
3992                 raidz_reflow_record_progress(vre,
3993                     next_overwrite_blkid << ashift, tx);
3994                 return (B_TRUE);
3995         }
3996
3997         size = MIN(size, raidz_expand_max_copy_bytes);
3998         size = MIN(size, (uint64_t)old_children *
3999             MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE));
4000         size = MAX(size, 1 << ashift);
4001         uint_t blocks = MIN(size >> ashift, next_overwrite_blkid - blkid);
4002         size = (uint64_t)blocks << ashift;
4003
4004         range_tree_remove(rt, offset, size);
4005
4006         uint_t reads = MIN(blocks, old_children);
4007         uint_t writes = MIN(blocks, vd->vdev_children);
4008         raidz_reflow_arg_t *rra = kmem_zalloc(sizeof (*rra) +
4009             sizeof (zio_t *) * writes, KM_SLEEP);
4010         rra->rra_vre = vre;
4011         rra->rra_lr = zfs_rangelock_enter(&vre->vre_rangelock,
4012             offset, size, RL_WRITER);
4013         rra->rra_txg = dmu_tx_get_txg(tx);
4014         rra->rra_ashift = ashift;
4015         rra->rra_tbd = reads;
4016         rra->rra_writes = writes;
4017
4018         raidz_reflow_record_progress(vre, offset + size, tx);
4019
4020         /*
4021          * SCL_STATE will be released when the read and write are done,
4022          * by raidz_reflow_write_done().
4023          */
4024         spa_config_enter(spa, SCL_STATE, spa, RW_READER);
4025
4026         /* check if a replacing vdev was added, if so treat it as an error */
4027         if (vdev_raidz_expand_child_replacing(vd)) {
4028                 zfs_dbgmsg("replacing vdev encountered, reflow paused at "
4029                     "offset=%llu txg=%llu",
4030                     (long long)rra->rra_lr->lr_offset,
4031                     (long long)rra->rra_txg);
4032
4033                 mutex_enter(&vre->vre_lock);
4034                 vre->vre_failed_offset =
4035                     MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
4036                 cv_signal(&vre->vre_cv);
4037                 mutex_exit(&vre->vre_lock);
4038
4039                 /* drop everything we acquired */
4040                 spa_config_exit(spa, SCL_STATE, spa);
4041                 zfs_rangelock_exit(rra->rra_lr);
4042                 kmem_free(rra, sizeof (*rra) + sizeof (zio_t *) * writes);
4043                 return (B_TRUE);
4044         }
4045
4046         mutex_enter(&vre->vre_lock);
4047         vre->vre_outstanding_bytes += size;
4048         mutex_exit(&vre->vre_lock);
4049
4050         /* Allocate ABD and ZIO for each child we write. */
4051         int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
4052         zio_t *pio = spa->spa_txg_zio[txgoff];
4053         uint_t b = blocks / vd->vdev_children;
4054         uint_t bb = blocks % vd->vdev_children;
4055         for (uint_t i = 0; i < writes; i++) {
4056                 uint_t n = b + (i < bb);
4057                 abd_t *abd = abd_alloc_for_io(n << ashift, B_FALSE);
4058                 rra->rra_zio[i] = zio_vdev_child_io(pio, NULL,
4059                     vd->vdev_child[(blkid + i) % vd->vdev_children],
4060                     ((blkid + i) / vd->vdev_children) << ashift,
4061                     abd, n << ashift, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
4062                     ZIO_FLAG_CANFAIL, raidz_reflow_write_done, rra);
4063         }
4064
4065         /*
4066          * Allocate and issue ZIO for each child we read.  For reads of only
4067          * one block we can use respective writer ABDs, since they will also
4068          * have only one block.  For bigger reads create gang ABDs and fill
4069          * them with respective blocks from writer ABDs.
4070          */
4071         b = blocks / old_children;
4072         bb = blocks % old_children;
4073         for (uint_t i = 0; i < reads; i++) {
4074                 uint_t n = b + (i < bb);
4075                 abd_t *abd;
4076                 if (n > 1) {
4077                         abd = abd_alloc_gang();
4078                         for (uint_t j = 0; j < n; j++) {
4079                                 uint_t b = j * old_children + i;
4080                                 abd_t *cabd = abd_get_offset_size(
4081                                     rra->rra_zio[b % vd->vdev_children]->io_abd,
4082                                     (b / vd->vdev_children) << ashift,
4083                                     1 << ashift);
4084                                 abd_gang_add(abd, cabd, B_TRUE);
4085                         }
4086                 } else {
4087                         abd = rra->rra_zio[i]->io_abd;
4088                 }
4089                 zio_nowait(zio_vdev_child_io(pio, NULL,
4090                     vd->vdev_child[(blkid + i) % old_children],
4091                     ((blkid + i) / old_children) << ashift, abd,
4092                     n << ashift, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
4093                     ZIO_FLAG_CANFAIL, raidz_reflow_read_done, rra));
4094         }
4095
4096         return (B_FALSE);
4097 }
4098
4099 /*
4100  * For testing (ztest specific)
4101  */
4102 static void
4103 raidz_expand_pause(uint_t pause_point)
4104 {
4105         while (raidz_expand_pause_point != 0 &&
4106             raidz_expand_pause_point <= pause_point)
4107                 delay(hz);
4108 }
4109
4110 static void
4111 raidz_scratch_child_done(zio_t *zio)
4112 {
4113         zio_t *pio = zio->io_private;
4114
4115         mutex_enter(&pio->io_lock);
4116         pio->io_error = zio_worst_error(pio->io_error, zio->io_error);
4117         mutex_exit(&pio->io_lock);
4118 }
4119
4120 /*
4121  * Reflow the beginning portion of the vdev into an intermediate scratch area
4122  * in memory and on disk. This operation must be persisted on disk before we
4123  * proceed to overwrite the beginning portion with the reflowed data.
4124  *
4125  * This multi-step task can fail to complete if disk errors are encountered
4126  * and we can return here after a pause (waiting for disk to become healthy).
4127  */
4128 static void
4129 raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx)
4130 {
4131         vdev_raidz_expand_t *vre = arg;
4132         spa_t *spa = dmu_tx_pool(tx)->dp_spa;
4133         zio_t *pio;
4134         int error;
4135
4136         spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
4137         vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4138         int ashift = raidvd->vdev_ashift;
4139         uint64_t write_size = P2ALIGN_TYPED(VDEV_BOOT_SIZE, 1 << ashift,
4140             uint64_t);
4141         uint64_t logical_size = write_size * raidvd->vdev_children;
4142         uint64_t read_size =
4143             P2ROUNDUP(DIV_ROUND_UP(logical_size, (raidvd->vdev_children - 1)),
4144             1 << ashift);
4145
4146         /*
4147          * The scratch space must be large enough to get us to the point
4148          * that one row does not overlap itself when moved.  This is checked
4149          * by vdev_raidz_attach_check().
4150          */
4151         VERIFY3U(write_size, >=, raidvd->vdev_children << ashift);
4152         VERIFY3U(write_size, <=, VDEV_BOOT_SIZE);
4153         VERIFY3U(write_size, <=, read_size);
4154
4155         zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock,
4156             0, logical_size, RL_WRITER);
4157
4158         abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *),
4159             KM_SLEEP);
4160         for (int i = 0; i < raidvd->vdev_children; i++) {
4161                 abds[i] = abd_alloc_linear(read_size, B_FALSE);
4162         }
4163
4164         raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_1);
4165
4166         /*
4167          * If we have already written the scratch area then we must read from
4168          * there, since new writes were redirected there while we were paused
4169          * or the original location may have been partially overwritten with
4170          * reflowed data.
4171          */
4172         if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) {
4173                 VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, logical_size);
4174                 /*
4175                  * Read from scratch space.
4176                  */
4177                 pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4178                 for (int i = 0; i < raidvd->vdev_children; i++) {
4179                         /*
4180                          * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE
4181                          * to the offset to calculate the physical offset to
4182                          * write to.  Passing in a negative offset makes us
4183                          * access the scratch area.
4184                          */
4185                         zio_nowait(zio_vdev_child_io(pio, NULL,
4186                             raidvd->vdev_child[i],
4187                             VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
4188                             write_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
4189                             ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio));
4190                 }
4191                 error = zio_wait(pio);
4192                 if (error != 0) {
4193                         zfs_dbgmsg("reflow: error %d reading scratch location",
4194                             error);
4195                         goto io_error_exit;
4196                 }
4197                 goto overwrite;
4198         }
4199
4200         /*
4201          * Read from original location.
4202          */
4203         pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4204         for (int i = 0; i < raidvd->vdev_children - 1; i++) {
4205                 ASSERT0(vdev_is_dead(raidvd->vdev_child[i]));
4206                 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4207                     0, abds[i], read_size, ZIO_TYPE_READ,
4208                     ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL,
4209                     raidz_scratch_child_done, pio));
4210         }
4211         error = zio_wait(pio);
4212         if (error != 0) {
4213                 zfs_dbgmsg("reflow: error %d reading original location", error);
4214 io_error_exit:
4215                 for (int i = 0; i < raidvd->vdev_children; i++)
4216                         abd_free(abds[i]);
4217                 kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
4218                 zfs_rangelock_exit(lr);
4219                 spa_config_exit(spa, SCL_STATE, FTAG);
4220                 return;
4221         }
4222
4223         raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_2);
4224
4225         /*
4226          * Reflow in memory.
4227          */
4228         uint64_t logical_sectors = logical_size >> ashift;
4229         for (int i = raidvd->vdev_children - 1; i < logical_sectors; i++) {
4230                 int oldchild = i % (raidvd->vdev_children - 1);
4231                 uint64_t oldoff = (i / (raidvd->vdev_children - 1)) << ashift;
4232
4233                 int newchild = i % raidvd->vdev_children;
4234                 uint64_t newoff = (i / raidvd->vdev_children) << ashift;
4235
4236                 /* a single sector should not be copying over itself */
4237                 ASSERT(!(newchild == oldchild && newoff == oldoff));
4238
4239                 abd_copy_off(abds[newchild], abds[oldchild],
4240                     newoff, oldoff, 1 << ashift);
4241         }
4242
4243         /*
4244          * Verify that we filled in everything we intended to (write_size on
4245          * each child).
4246          */
4247         VERIFY0(logical_sectors % raidvd->vdev_children);
4248         VERIFY3U((logical_sectors / raidvd->vdev_children) << ashift, ==,
4249             write_size);
4250
4251         /*
4252          * Write to scratch location (boot area).
4253          */
4254         pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4255         for (int i = 0; i < raidvd->vdev_children; i++) {
4256                 /*
4257                  * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to
4258                  * the offset to calculate the physical offset to write to.
4259                  * Passing in a negative offset lets us access the boot area.
4260                  */
4261                 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4262                     VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
4263                     write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
4264                     ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio));
4265         }
4266         error = zio_wait(pio);
4267         if (error != 0) {
4268                 zfs_dbgmsg("reflow: error %d writing scratch location", error);
4269                 goto io_error_exit;
4270         }
4271         pio = zio_root(spa, NULL, NULL, 0);
4272         zio_flush(pio, raidvd);
4273         zio_wait(pio);
4274
4275         zfs_dbgmsg("reflow: wrote %llu bytes (logical) to scratch area",
4276             (long long)logical_size);
4277
4278         raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_3);
4279
4280         /*
4281          * Update uberblock to indicate that scratch space is valid.  This is
4282          * needed because after this point, the real location may be
4283          * overwritten.  If we crash, we need to get the data from the
4284          * scratch space, rather than the real location.
4285          *
4286          * Note: ub_timestamp is bumped so that vdev_uberblock_compare()
4287          * will prefer this uberblock.
4288          */
4289         RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_VALID, logical_size);
4290         spa->spa_ubsync.ub_timestamp++;
4291         ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
4292             &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
4293         if (spa_multihost(spa))
4294                 mmp_update_uberblock(spa, &spa->spa_ubsync);
4295
4296         zfs_dbgmsg("reflow: uberblock updated "
4297             "(txg %llu, SCRATCH_VALID, size %llu, ts %llu)",
4298             (long long)spa->spa_ubsync.ub_txg,
4299             (long long)logical_size,
4300             (long long)spa->spa_ubsync.ub_timestamp);
4301
4302         raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_VALID);
4303
4304         /*
4305          * Overwrite with reflow'ed data.
4306          */
4307 overwrite:
4308         pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4309         for (int i = 0; i < raidvd->vdev_children; i++) {
4310                 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4311                     0, abds[i], write_size, ZIO_TYPE_WRITE,
4312                     ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL,
4313                     raidz_scratch_child_done, pio));
4314         }
4315         error = zio_wait(pio);
4316         if (error != 0) {
4317                 /*
4318                  * When we exit early here and drop the range lock, new
4319                  * writes will go into the scratch area so we'll need to
4320                  * read from there when we return after pausing.
4321                  */
4322                 zfs_dbgmsg("reflow: error %d writing real location", error);
4323                 /*
4324                  * Update the uberblock that is written when this txg completes.
4325                  */
4326                 RAIDZ_REFLOW_SET(&spa->spa_uberblock, RRSS_SCRATCH_VALID,
4327                     logical_size);
4328                 goto io_error_exit;
4329         }
4330         pio = zio_root(spa, NULL, NULL, 0);
4331         zio_flush(pio, raidvd);
4332         zio_wait(pio);
4333
4334         zfs_dbgmsg("reflow: overwrote %llu bytes (logical) to real location",
4335             (long long)logical_size);
4336         for (int i = 0; i < raidvd->vdev_children; i++)
4337                 abd_free(abds[i]);
4338         kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
4339
4340         raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_REFLOWED);
4341
4342         /*
4343          * Update uberblock to indicate that the initial part has been
4344          * reflow'ed.  This is needed because after this point (when we exit
4345          * the rangelock), we allow regular writes to this region, which will
4346          * be written to the new location only (because reflow_offset_next ==
4347          * reflow_offset_synced).  If we crashed and re-copied from the
4348          * scratch space, we would lose the regular writes.
4349          */
4350         RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_INVALID_SYNCED,
4351             logical_size);
4352         spa->spa_ubsync.ub_timestamp++;
4353         ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
4354             &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
4355         if (spa_multihost(spa))
4356                 mmp_update_uberblock(spa, &spa->spa_ubsync);
4357
4358         zfs_dbgmsg("reflow: uberblock updated "
4359             "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)",
4360             (long long)spa->spa_ubsync.ub_txg,
4361             (long long)logical_size,
4362             (long long)spa->spa_ubsync.ub_timestamp);
4363
4364         raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_1);
4365
4366         /*
4367          * Update progress.
4368          */
4369         vre->vre_offset = logical_size;
4370         zfs_rangelock_exit(lr);
4371         spa_config_exit(spa, SCL_STATE, FTAG);
4372
4373         int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
4374         vre->vre_offset_pertxg[txgoff] = vre->vre_offset;
4375         vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied;
4376         /*
4377          * Note - raidz_reflow_sync() will update the uberblock state to
4378          * RRSS_SCRATCH_INVALID_SYNCED_REFLOW
4379          */
4380         raidz_reflow_sync(spa, tx);
4381
4382         raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2);
4383 }
4384
4385 /*
4386  * We crashed in the middle of raidz_reflow_scratch_sync(); complete its work
4387  * here.  No other i/o can be in progress, so we don't need the vre_rangelock.
4388  */
4389 void
4390 vdev_raidz_reflow_copy_scratch(spa_t *spa)
4391 {
4392         vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4393         uint64_t logical_size = RRSS_GET_OFFSET(&spa->spa_uberblock);
4394         ASSERT3U(RRSS_GET_STATE(&spa->spa_uberblock), ==, RRSS_SCRATCH_VALID);
4395
4396         spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
4397         vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4398         ASSERT0(logical_size % raidvd->vdev_children);
4399         uint64_t write_size = logical_size / raidvd->vdev_children;
4400
4401         zio_t *pio;
4402
4403         /*
4404          * Read from scratch space.
4405          */
4406         abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *),
4407             KM_SLEEP);
4408         for (int i = 0; i < raidvd->vdev_children; i++) {
4409                 abds[i] = abd_alloc_linear(write_size, B_FALSE);
4410         }
4411
4412         pio = zio_root(spa, NULL, NULL, 0);
4413         for (int i = 0; i < raidvd->vdev_children; i++) {
4414                 /*
4415                  * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to
4416                  * the offset to calculate the physical offset to write to.
4417                  * Passing in a negative offset lets us access the boot area.
4418                  */
4419                 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4420                     VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
4421                     write_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, 0,
4422                     raidz_scratch_child_done, pio));
4423         }
4424         zio_wait(pio);
4425
4426         /*
4427          * Overwrite real location with reflow'ed data.
4428          */
4429         pio = zio_root(spa, NULL, NULL, 0);
4430         for (int i = 0; i < raidvd->vdev_children; i++) {
4431                 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4432                     0, abds[i], write_size, ZIO_TYPE_WRITE,
4433                     ZIO_PRIORITY_REMOVAL, 0,
4434                     raidz_scratch_child_done, pio));
4435         }
4436         zio_wait(pio);
4437         pio = zio_root(spa, NULL, NULL, 0);
4438         zio_flush(pio, raidvd);
4439         zio_wait(pio);
4440
4441         zfs_dbgmsg("reflow recovery: overwrote %llu bytes (logical) "
4442             "to real location", (long long)logical_size);
4443
4444         for (int i = 0; i < raidvd->vdev_children; i++)
4445                 abd_free(abds[i]);
4446         kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
4447
4448         /*
4449          * Update uberblock.
4450          */
4451         RAIDZ_REFLOW_SET(&spa->spa_ubsync,
4452             RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT, logical_size);
4453         spa->spa_ubsync.ub_timestamp++;
4454         VERIFY0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
4455             &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
4456         if (spa_multihost(spa))
4457                 mmp_update_uberblock(spa, &spa->spa_ubsync);
4458
4459         zfs_dbgmsg("reflow recovery: uberblock updated "
4460             "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)",
4461             (long long)spa->spa_ubsync.ub_txg,
4462             (long long)logical_size,
4463             (long long)spa->spa_ubsync.ub_timestamp);
4464
4465         dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool,
4466             spa_first_txg(spa));
4467         int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
4468         vre->vre_offset = logical_size;
4469         vre->vre_offset_pertxg[txgoff] = vre->vre_offset;
4470         vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied;
4471         /*
4472          * Note that raidz_reflow_sync() will update the uberblock once more
4473          */
4474         raidz_reflow_sync(spa, tx);
4475
4476         dmu_tx_commit(tx);
4477
4478         spa_config_exit(spa, SCL_STATE, FTAG);
4479 }
4480
4481 static boolean_t
4482 spa_raidz_expand_thread_check(void *arg, zthr_t *zthr)
4483 {
4484         (void) zthr;
4485         spa_t *spa = arg;
4486
4487         return (spa->spa_raidz_expand != NULL &&
4488             !spa->spa_raidz_expand->vre_waiting_for_resilver);
4489 }
4490
4491 /*
4492  * RAIDZ expansion background thread
4493  *
4494  * Can be called multiple times if the reflow is paused
4495  */
4496 static void
4497 spa_raidz_expand_thread(void *arg, zthr_t *zthr)
4498 {
4499         spa_t *spa = arg;
4500         vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4501
4502         if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID)
4503                 vre->vre_offset = 0;
4504         else
4505                 vre->vre_offset = RRSS_GET_OFFSET(&spa->spa_ubsync);
4506
4507         /* Reflow the begining portion using the scratch area */
4508         if (vre->vre_offset == 0) {
4509                 VERIFY0(dsl_sync_task(spa_name(spa),
4510                     NULL, raidz_reflow_scratch_sync,
4511                     vre, 0, ZFS_SPACE_CHECK_NONE));
4512
4513                 /* if we encountered errors then pause */
4514                 if (vre->vre_offset == 0) {
4515                         mutex_enter(&vre->vre_lock);
4516                         vre->vre_waiting_for_resilver = B_TRUE;
4517                         mutex_exit(&vre->vre_lock);
4518                         return;
4519                 }
4520         }
4521
4522         spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4523         vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4524
4525         uint64_t guid = raidvd->vdev_guid;
4526
4527         /* Iterate over all the remaining metaslabs */
4528         for (uint64_t i = vre->vre_offset >> raidvd->vdev_ms_shift;
4529             i < raidvd->vdev_ms_count &&
4530             !zthr_iscancelled(zthr) &&
4531             vre->vre_failed_offset == UINT64_MAX; i++) {
4532                 metaslab_t *msp = raidvd->vdev_ms[i];
4533
4534                 metaslab_disable(msp);
4535                 mutex_enter(&msp->ms_lock);
4536
4537                 /*
4538                  * The metaslab may be newly created (for the expanded
4539                  * space), in which case its trees won't exist yet,
4540                  * so we need to bail out early.
4541                  */
4542                 if (msp->ms_new) {
4543                         mutex_exit(&msp->ms_lock);
4544                         metaslab_enable(msp, B_FALSE, B_FALSE);
4545                         continue;
4546                 }
4547
4548                 VERIFY0(metaslab_load(msp));
4549
4550                 /*
4551                  * We want to copy everything except the free (allocatable)
4552                  * space.  Note that there may be a little bit more free
4553                  * space (e.g. in ms_defer), and it's fine to copy that too.
4554                  */
4555                 uint64_t shift, start;
4556                 range_seg_type_t type = metaslab_calculate_range_tree_type(
4557                     raidvd, msp, &start, &shift);
4558                 range_tree_t *rt = range_tree_create(NULL, type, NULL,
4559                     start, shift);
4560                 range_tree_add(rt, msp->ms_start, msp->ms_size);
4561                 range_tree_walk(msp->ms_allocatable, range_tree_remove, rt);
4562                 mutex_exit(&msp->ms_lock);
4563
4564                 /*
4565                  * Force the last sector of each metaslab to be copied.  This
4566                  * ensures that we advance the on-disk progress to the end of
4567                  * this metaslab while the metaslab is disabled.  Otherwise, we
4568                  * could move past this metaslab without advancing the on-disk
4569                  * progress, and then an allocation to this metaslab would not
4570                  * be copied.
4571                  */
4572                 int sectorsz = 1 << raidvd->vdev_ashift;
4573                 uint64_t ms_last_offset = msp->ms_start +
4574                     msp->ms_size - sectorsz;
4575                 if (!range_tree_contains(rt, ms_last_offset, sectorsz)) {
4576                         range_tree_add(rt, ms_last_offset, sectorsz);
4577                 }
4578
4579                 /*
4580                  * When we are resuming from a paused expansion (i.e.
4581                  * when importing a pool with a expansion in progress),
4582                  * discard any state that we have already processed.
4583                  */
4584                 if (vre->vre_offset > msp->ms_start) {
4585                         range_tree_clear(rt, msp->ms_start,
4586                             vre->vre_offset - msp->ms_start);
4587                 }
4588
4589                 while (!zthr_iscancelled(zthr) &&
4590                     !range_tree_is_empty(rt) &&
4591                     vre->vre_failed_offset == UINT64_MAX) {
4592
4593                         /*
4594                          * We need to periodically drop the config lock so that
4595                          * writers can get in.  Additionally, we can't wait
4596                          * for a txg to sync while holding a config lock
4597                          * (since a waiting writer could cause a 3-way deadlock
4598                          * with the sync thread, which also gets a config
4599                          * lock for reader).  So we can't hold the config lock
4600                          * while calling dmu_tx_assign().
4601                          */
4602                         spa_config_exit(spa, SCL_CONFIG, FTAG);
4603
4604                         /*
4605                          * If requested, pause the reflow when the amount
4606                          * specified by raidz_expand_max_reflow_bytes is reached
4607                          *
4608                          * This pause is only used during testing or debugging.
4609                          */
4610                         while (raidz_expand_max_reflow_bytes != 0 &&
4611                             raidz_expand_max_reflow_bytes <=
4612                             vre->vre_bytes_copied && !zthr_iscancelled(zthr)) {
4613                                 delay(hz);
4614                         }
4615
4616                         mutex_enter(&vre->vre_lock);
4617                         while (vre->vre_outstanding_bytes >
4618                             raidz_expand_max_copy_bytes) {
4619                                 cv_wait(&vre->vre_cv, &vre->vre_lock);
4620                         }
4621                         mutex_exit(&vre->vre_lock);
4622
4623                         dmu_tx_t *tx =
4624                             dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
4625
4626                         VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
4627                         uint64_t txg = dmu_tx_get_txg(tx);
4628
4629                         /*
4630                          * Reacquire the vdev_config lock.  Theoretically, the
4631                          * vdev_t that we're expanding may have changed.
4632                          */
4633                         spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4634                         raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4635
4636                         boolean_t needsync =
4637                             raidz_reflow_impl(raidvd, vre, rt, tx);
4638
4639                         dmu_tx_commit(tx);
4640
4641                         if (needsync) {
4642                                 spa_config_exit(spa, SCL_CONFIG, FTAG);
4643                                 txg_wait_synced(spa->spa_dsl_pool, txg);
4644                                 spa_config_enter(spa, SCL_CONFIG, FTAG,
4645                                     RW_READER);
4646                         }
4647                 }
4648
4649                 spa_config_exit(spa, SCL_CONFIG, FTAG);
4650
4651                 metaslab_enable(msp, B_FALSE, B_FALSE);
4652                 range_tree_vacate(rt, NULL, NULL);
4653                 range_tree_destroy(rt);
4654
4655                 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4656                 raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4657         }
4658
4659         spa_config_exit(spa, SCL_CONFIG, FTAG);
4660
4661         /*
4662          * The txg_wait_synced() here ensures that all reflow zio's have
4663          * completed, and vre_failed_offset has been set if necessary.  It
4664          * also ensures that the progress of the last raidz_reflow_sync() is
4665          * written to disk before raidz_reflow_complete_sync() changes the
4666          * in-memory vre_state.  vdev_raidz_io_start() uses vre_state to
4667          * determine if a reflow is in progress, in which case we may need to
4668          * write to both old and new locations.  Therefore we can only change
4669          * vre_state once this is not necessary, which is once the on-disk
4670          * progress (in spa_ubsync) has been set past any possible writes (to
4671          * the end of the last metaslab).
4672          */
4673         txg_wait_synced(spa->spa_dsl_pool, 0);
4674
4675         if (!zthr_iscancelled(zthr) &&
4676             vre->vre_offset == raidvd->vdev_ms_count << raidvd->vdev_ms_shift) {
4677                 /*
4678                  * We are not being canceled or paused, so the reflow must be
4679                  * complete. In that case also mark it as completed on disk.
4680                  */
4681                 ASSERT3U(vre->vre_failed_offset, ==, UINT64_MAX);
4682                 VERIFY0(dsl_sync_task(spa_name(spa), NULL,
4683                     raidz_reflow_complete_sync, spa,
4684                     0, ZFS_SPACE_CHECK_NONE));
4685                 (void) vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL);
4686         } else {
4687                 /*
4688                  * Wait for all copy zio's to complete and for all the
4689                  * raidz_reflow_sync() synctasks to be run.
4690                  */
4691                 spa_history_log_internal(spa, "reflow pause",
4692                     NULL, "offset=%llu failed_offset=%lld",
4693                     (long long)vre->vre_offset,
4694                     (long long)vre->vre_failed_offset);
4695                 mutex_enter(&vre->vre_lock);
4696                 if (vre->vre_failed_offset != UINT64_MAX) {
4697                         /*
4698                          * Reset progress so that we will retry everything
4699                          * after the point that something failed.
4700                          */
4701                         vre->vre_offset = vre->vre_failed_offset;
4702                         vre->vre_failed_offset = UINT64_MAX;
4703                         vre->vre_waiting_for_resilver = B_TRUE;
4704                 }
4705                 mutex_exit(&vre->vre_lock);
4706         }
4707 }
4708
4709 void
4710 spa_start_raidz_expansion_thread(spa_t *spa)
4711 {
4712         ASSERT3P(spa->spa_raidz_expand_zthr, ==, NULL);
4713         spa->spa_raidz_expand_zthr = zthr_create("raidz_expand",
4714             spa_raidz_expand_thread_check, spa_raidz_expand_thread,
4715             spa, defclsyspri);
4716 }
4717
4718 void
4719 raidz_dtl_reassessed(vdev_t *vd)
4720 {
4721         spa_t *spa = vd->vdev_spa;
4722         if (spa->spa_raidz_expand != NULL) {
4723                 vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4724                 /*
4725                  * we get called often from vdev_dtl_reassess() so make
4726                  * sure it's our vdev and any replacing is complete
4727                  */
4728                 if (vd->vdev_top->vdev_id == vre->vre_vdev_id &&
4729                     !vdev_raidz_expand_child_replacing(vd->vdev_top)) {
4730                         mutex_enter(&vre->vre_lock);
4731                         if (vre->vre_waiting_for_resilver) {
4732                                 vdev_dbgmsg(vd, "DTL reassessed, "
4733                                     "continuing raidz expansion");
4734                                 vre->vre_waiting_for_resilver = B_FALSE;
4735                                 zthr_wakeup(spa->spa_raidz_expand_zthr);
4736                         }
4737                         mutex_exit(&vre->vre_lock);
4738                 }
4739         }
4740 }
4741
4742 int
4743 vdev_raidz_attach_check(vdev_t *new_child)
4744 {
4745         vdev_t *raidvd = new_child->vdev_parent;
4746         uint64_t new_children = raidvd->vdev_children;
4747
4748         /*
4749          * We use the "boot" space as scratch space to handle overwriting the
4750          * initial part of the vdev.  If it is too small, then this expansion
4751          * is not allowed.  This would be very unusual (e.g. ashift > 13 and
4752          * >200 children).
4753          */
4754         if (new_children << raidvd->vdev_ashift > VDEV_BOOT_SIZE) {
4755                 return (EINVAL);
4756         }
4757         return (0);
4758 }
4759
4760 void
4761 vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx)
4762 {
4763         vdev_t *new_child = arg;
4764         spa_t *spa = new_child->vdev_spa;
4765         vdev_t *raidvd = new_child->vdev_parent;
4766         vdev_raidz_t *vdrz = raidvd->vdev_tsd;
4767         ASSERT3P(raidvd->vdev_ops, ==, &vdev_raidz_ops);
4768         ASSERT3P(raidvd->vdev_top, ==, raidvd);
4769         ASSERT3U(raidvd->vdev_children, >, vdrz->vd_original_width);
4770         ASSERT3U(raidvd->vdev_children, ==, vdrz->vd_physical_width + 1);
4771         ASSERT3P(raidvd->vdev_child[raidvd->vdev_children - 1], ==,
4772             new_child);
4773
4774         spa_feature_incr(spa, SPA_FEATURE_RAIDZ_EXPANSION, tx);
4775
4776         vdrz->vd_physical_width++;
4777
4778         VERIFY0(spa->spa_uberblock.ub_raidz_reflow_info);
4779         vdrz->vn_vre.vre_vdev_id = raidvd->vdev_id;
4780         vdrz->vn_vre.vre_offset = 0;
4781         vdrz->vn_vre.vre_failed_offset = UINT64_MAX;
4782         spa->spa_raidz_expand = &vdrz->vn_vre;
4783         zthr_wakeup(spa->spa_raidz_expand_zthr);
4784
4785         /*
4786          * Dirty the config so that ZPOOL_CONFIG_RAIDZ_EXPANDING will get
4787          * written to the config.
4788          */
4789         vdev_config_dirty(raidvd);
4790
4791         vdrz->vn_vre.vre_start_time = gethrestime_sec();
4792         vdrz->vn_vre.vre_end_time = 0;
4793         vdrz->vn_vre.vre_state = DSS_SCANNING;
4794         vdrz->vn_vre.vre_bytes_copied = 0;
4795
4796         uint64_t state = vdrz->vn_vre.vre_state;
4797         VERIFY0(zap_update(spa->spa_meta_objset,
4798             raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
4799             sizeof (state), 1, &state, tx));
4800
4801         uint64_t start_time = vdrz->vn_vre.vre_start_time;
4802         VERIFY0(zap_update(spa->spa_meta_objset,
4803             raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME,
4804             sizeof (start_time), 1, &start_time, tx));
4805
4806         (void) zap_remove(spa->spa_meta_objset,
4807             raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, tx);
4808         (void) zap_remove(spa->spa_meta_objset,
4809             raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, tx);
4810
4811         spa_history_log_internal(spa, "raidz vdev expansion started",  tx,
4812             "%s vdev %llu new width %llu", spa_name(spa),
4813             (unsigned long long)raidvd->vdev_id,
4814             (unsigned long long)raidvd->vdev_children);
4815 }
4816
4817 int
4818 vdev_raidz_load(vdev_t *vd)
4819 {
4820         vdev_raidz_t *vdrz = vd->vdev_tsd;
4821         int err;
4822
4823         uint64_t state = DSS_NONE;
4824         uint64_t start_time = 0;
4825         uint64_t end_time = 0;
4826         uint64_t bytes_copied = 0;
4827
4828         if (vd->vdev_top_zap != 0) {
4829                 err = zap_lookup(vd->vdev_spa->spa_meta_objset,
4830                     vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
4831                     sizeof (state), 1, &state);
4832                 if (err != 0 && err != ENOENT)
4833                         return (err);
4834
4835                 err = zap_lookup(vd->vdev_spa->spa_meta_objset,
4836                     vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME,
4837                     sizeof (start_time), 1, &start_time);
4838                 if (err != 0 && err != ENOENT)
4839                         return (err);
4840
4841                 err = zap_lookup(vd->vdev_spa->spa_meta_objset,
4842                     vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME,
4843                     sizeof (end_time), 1, &end_time);
4844                 if (err != 0 && err != ENOENT)
4845                         return (err);
4846
4847                 err = zap_lookup(vd->vdev_spa->spa_meta_objset,
4848                     vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED,
4849                     sizeof (bytes_copied), 1, &bytes_copied);
4850                 if (err != 0 && err != ENOENT)
4851                         return (err);
4852         }
4853
4854         /*
4855          * If we are in the middle of expansion, vre_state should have
4856          * already been set by vdev_raidz_init().
4857          */
4858         EQUIV(vdrz->vn_vre.vre_state == DSS_SCANNING, state == DSS_SCANNING);
4859         vdrz->vn_vre.vre_state = (dsl_scan_state_t)state;
4860         vdrz->vn_vre.vre_start_time = start_time;
4861         vdrz->vn_vre.vre_end_time = end_time;
4862         vdrz->vn_vre.vre_bytes_copied = bytes_copied;
4863
4864         return (0);
4865 }
4866
4867 int
4868 spa_raidz_expand_get_stats(spa_t *spa, pool_raidz_expand_stat_t *pres)
4869 {
4870         vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4871
4872         if (vre == NULL) {
4873                 /* no removal in progress; find most recent completed */
4874                 for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) {
4875                         vdev_t *vd = spa->spa_root_vdev->vdev_child[c];
4876                         if (vd->vdev_ops == &vdev_raidz_ops) {
4877                                 vdev_raidz_t *vdrz = vd->vdev_tsd;
4878
4879                                 if (vdrz->vn_vre.vre_end_time != 0 &&
4880                                     (vre == NULL ||
4881                                     vdrz->vn_vre.vre_end_time >
4882                                     vre->vre_end_time)) {
4883                                         vre = &vdrz->vn_vre;
4884                                 }
4885                         }
4886                 }
4887         }
4888
4889         if (vre == NULL) {
4890                 return (SET_ERROR(ENOENT));
4891         }
4892
4893         pres->pres_state = vre->vre_state;
4894         pres->pres_expanding_vdev = vre->vre_vdev_id;
4895
4896         vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
4897         pres->pres_to_reflow = vd->vdev_stat.vs_alloc;
4898
4899         mutex_enter(&vre->vre_lock);
4900         pres->pres_reflowed = vre->vre_bytes_copied;
4901         for (int i = 0; i < TXG_SIZE; i++)
4902                 pres->pres_reflowed += vre->vre_bytes_copied_pertxg[i];
4903         mutex_exit(&vre->vre_lock);
4904
4905         pres->pres_start_time = vre->vre_start_time;
4906         pres->pres_end_time = vre->vre_end_time;
4907         pres->pres_waiting_for_resilver = vre->vre_waiting_for_resilver;
4908
4909         return (0);
4910 }
4911
4912 /*
4913  * Initialize private RAIDZ specific fields from the nvlist.
4914  */
4915 static int
4916 vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd)
4917 {
4918         uint_t children;
4919         nvlist_t **child;
4920         int error = nvlist_lookup_nvlist_array(nv,
4921             ZPOOL_CONFIG_CHILDREN, &child, &children);
4922         if (error != 0)
4923                 return (SET_ERROR(EINVAL));
4924
4925         uint64_t nparity;
4926         if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) {
4927                 if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
4928                         return (SET_ERROR(EINVAL));
4929
4930                 /*
4931                  * Previous versions could only support 1 or 2 parity
4932                  * device.
4933                  */
4934                 if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2)
4935                         return (SET_ERROR(EINVAL));
4936                 else if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3)
4937                         return (SET_ERROR(EINVAL));
4938         } else {
4939                 /*
4940                  * We require the parity to be specified for SPAs that
4941                  * support multiple parity levels.
4942                  */
4943                 if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
4944                         return (SET_ERROR(EINVAL));
4945
4946                 /*
4947                  * Otherwise, we default to 1 parity device for RAID-Z.
4948                  */
4949                 nparity = 1;
4950         }
4951
4952         vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP);
4953         vdrz->vn_vre.vre_vdev_id = -1;
4954         vdrz->vn_vre.vre_offset = UINT64_MAX;
4955         vdrz->vn_vre.vre_failed_offset = UINT64_MAX;
4956         mutex_init(&vdrz->vn_vre.vre_lock, NULL, MUTEX_DEFAULT, NULL);
4957         cv_init(&vdrz->vn_vre.vre_cv, NULL, CV_DEFAULT, NULL);
4958         zfs_rangelock_init(&vdrz->vn_vre.vre_rangelock, NULL, NULL);
4959         mutex_init(&vdrz->vd_expand_lock, NULL, MUTEX_DEFAULT, NULL);
4960         avl_create(&vdrz->vd_expand_txgs, vdev_raidz_reflow_compare,
4961             sizeof (reflow_node_t), offsetof(reflow_node_t, re_link));
4962
4963         vdrz->vd_physical_width = children;
4964         vdrz->vd_nparity = nparity;
4965
4966         /* note, the ID does not exist when creating a pool */
4967         (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID,
4968             &vdrz->vn_vre.vre_vdev_id);
4969
4970         boolean_t reflow_in_progress =
4971             nvlist_exists(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING);
4972         if (reflow_in_progress) {
4973                 spa->spa_raidz_expand = &vdrz->vn_vre;
4974                 vdrz->vn_vre.vre_state = DSS_SCANNING;
4975         }
4976
4977         vdrz->vd_original_width = children;
4978         uint64_t *txgs;
4979         unsigned int txgs_size = 0;
4980         error = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS,
4981             &txgs, &txgs_size);
4982         if (error == 0) {
4983                 for (int i = 0; i < txgs_size; i++) {
4984                         reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP);
4985                         re->re_txg = txgs[txgs_size - i - 1];
4986                         re->re_logical_width = vdrz->vd_physical_width - i;
4987
4988                         if (reflow_in_progress)
4989                                 re->re_logical_width--;
4990
4991                         avl_add(&vdrz->vd_expand_txgs, re);
4992                 }
4993
4994                 vdrz->vd_original_width = vdrz->vd_physical_width - txgs_size;
4995         }
4996         if (reflow_in_progress) {
4997                 vdrz->vd_original_width--;
4998                 zfs_dbgmsg("reflow_in_progress, %u wide, %d prior expansions",
4999                     children, txgs_size);
5000         }
5001
5002         *tsd = vdrz;
5003
5004         return (0);
5005 }
5006
5007 static void
5008 vdev_raidz_fini(vdev_t *vd)
5009 {
5010         vdev_raidz_t *vdrz = vd->vdev_tsd;
5011         if (vd->vdev_spa->spa_raidz_expand == &vdrz->vn_vre)
5012                 vd->vdev_spa->spa_raidz_expand = NULL;
5013         reflow_node_t *re;
5014         void *cookie = NULL;
5015         avl_tree_t *tree = &vdrz->vd_expand_txgs;
5016         while ((re = avl_destroy_nodes(tree, &cookie)) != NULL)
5017                 kmem_free(re, sizeof (*re));
5018         avl_destroy(&vdrz->vd_expand_txgs);
5019         mutex_destroy(&vdrz->vd_expand_lock);
5020         mutex_destroy(&vdrz->vn_vre.vre_lock);
5021         cv_destroy(&vdrz->vn_vre.vre_cv);
5022         zfs_rangelock_fini(&vdrz->vn_vre.vre_rangelock);
5023         kmem_free(vdrz, sizeof (*vdrz));
5024 }
5025
5026 /*
5027  * Add RAIDZ specific fields to the config nvlist.
5028  */
5029 static void
5030 vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv)
5031 {
5032         ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops);
5033         vdev_raidz_t *vdrz = vd->vdev_tsd;
5034
5035         /*
5036          * Make sure someone hasn't managed to sneak a fancy new vdev
5037          * into a crufty old storage pool.
5038          */
5039         ASSERT(vdrz->vd_nparity == 1 ||
5040             (vdrz->vd_nparity <= 2 &&
5041             spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ2) ||
5042             (vdrz->vd_nparity <= 3 &&
5043             spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ3));
5044
5045         /*
5046          * Note that we'll add these even on storage pools where they
5047          * aren't strictly required -- older software will just ignore
5048          * it.
5049          */
5050         fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity);
5051
5052         if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
5053                 fnvlist_add_boolean(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING);
5054         }
5055
5056         mutex_enter(&vdrz->vd_expand_lock);
5057         if (!avl_is_empty(&vdrz->vd_expand_txgs)) {
5058                 uint64_t count = avl_numnodes(&vdrz->vd_expand_txgs);
5059                 uint64_t *txgs = kmem_alloc(sizeof (uint64_t) * count,
5060                     KM_SLEEP);
5061                 uint64_t i = 0;
5062
5063                 for (reflow_node_t *re = avl_first(&vdrz->vd_expand_txgs);
5064                     re != NULL; re = AVL_NEXT(&vdrz->vd_expand_txgs, re)) {
5065                         txgs[i++] = re->re_txg;
5066                 }
5067
5068                 fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS,
5069                     txgs, count);
5070
5071                 kmem_free(txgs, sizeof (uint64_t) * count);
5072         }
5073         mutex_exit(&vdrz->vd_expand_lock);
5074 }
5075
5076 static uint64_t
5077 vdev_raidz_nparity(vdev_t *vd)
5078 {
5079         vdev_raidz_t *vdrz = vd->vdev_tsd;
5080         return (vdrz->vd_nparity);
5081 }
5082
5083 static uint64_t
5084 vdev_raidz_ndisks(vdev_t *vd)
5085 {
5086         return (vd->vdev_children);
5087 }
5088
5089 vdev_ops_t vdev_raidz_ops = {
5090         .vdev_op_init = vdev_raidz_init,
5091         .vdev_op_fini = vdev_raidz_fini,
5092         .vdev_op_open = vdev_raidz_open,
5093         .vdev_op_close = vdev_raidz_close,
5094         .vdev_op_asize = vdev_raidz_asize,
5095         .vdev_op_min_asize = vdev_raidz_min_asize,
5096         .vdev_op_min_alloc = NULL,
5097         .vdev_op_io_start = vdev_raidz_io_start,
5098         .vdev_op_io_done = vdev_raidz_io_done,
5099         .vdev_op_state_change = vdev_raidz_state_change,
5100         .vdev_op_need_resilver = vdev_raidz_need_resilver,
5101         .vdev_op_hold = NULL,
5102         .vdev_op_rele = NULL,
5103         .vdev_op_remap = NULL,
5104         .vdev_op_xlate = vdev_raidz_xlate,
5105         .vdev_op_rebuild_asize = NULL,
5106         .vdev_op_metaslab_init = NULL,
5107         .vdev_op_config_generate = vdev_raidz_config_generate,
5108         .vdev_op_nparity = vdev_raidz_nparity,
5109         .vdev_op_ndisks = vdev_raidz_ndisks,
5110         .vdev_op_type = VDEV_TYPE_RAIDZ,        /* name of this vdev type */
5111         .vdev_op_leaf = B_FALSE                 /* not a leaf vdev */
5112 };
5113
5114 /* BEGIN CSTYLED */
5115 ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_reflow_bytes, ULONG, ZMOD_RW,
5116         "For testing, pause RAIDZ expansion after reflowing this many bytes");
5117 ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_copy_bytes, ULONG, ZMOD_RW,
5118         "Max amount of concurrent i/o for RAIDZ expansion");
5119 ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW,
5120         "For expanded RAIDZ, aggregate reads that have more rows than this");
5121 ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW,
5122         "For expanded RAIDZ, automatically start a pool scrub when expansion "
5123         "completes");
5124 /* END CSTYLED */