module/zfs/vdev_raidz.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or https://opensource.org/licenses/CDDL-1.0.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
  25  * Copyright (c) 2016 Gvozden Nešković. All rights reserved.
  26  */
  27
  28 #include <sys/zfs_context.h>
  29 #include <sys/spa.h>
  30 #include <sys/spa_impl.h>
  31 #include <sys/zap.h>
  32 #include <sys/vdev_impl.h>
  33 #include <sys/metaslab_impl.h>
  34 #include <sys/zio.h>
  35 #include <sys/zio_checksum.h>
  36 #include <sys/dmu_tx.h>
  37 #include <sys/abd.h>
  38 #include <sys/zfs_rlock.h>
  39 #include <sys/fs/zfs.h>
  40 #include <sys/fm/fs/zfs.h>
  41 #include <sys/vdev_raidz.h>
  42 #include <sys/vdev_raidz_impl.h>
  43 #include <sys/vdev_draid.h>
  44 #include <sys/uberblock_impl.h>
  45 #include <sys/dsl_scan.h>
  46
  47 #ifdef ZFS_DEBUG
  48 #include <sys/vdev.h>   /* For vdev_xlate() in vdev_raidz_io_verify() */
  49 #endif
  50
  51 /*
  52  * Virtual device vector for RAID-Z.
  53  *
  54  * This vdev supports single, double, and triple parity. For single parity,
  55  * we use a simple XOR of all the data columns. For double or triple parity,
  56  * we use a special case of Reed-Solomon coding. This extends the
  57  * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
  58  * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
  59  * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
  60  * former is also based. The latter is designed to provide higher performance
  61  * for writes.
  62  *
  63  * Note that the Plank paper claimed to support arbitrary N+M, but was then
  64  * amended six years later identifying a critical flaw that invalidates its
  65  * claims. Nevertheless, the technique can be adapted to work for up to
  66  * triple parity. For additional parity, the amendment "Note: Correction to
  67  * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
  68  * is viable, but the additional complexity means that write performance will
  69  * suffer.
  70  *
  71  * All of the methods above operate on a Galois field, defined over the
  72  * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
  73  * can be expressed with a single byte. Briefly, the operations on the
  74  * field are defined as follows:
  75  *
  76  *   o addition (+) is represented by a bitwise XOR
  77  *   o subtraction (-) is therefore identical to addition: A + B = A - B
  78  *   o multiplication of A by 2 is defined by the following bitwise expression:
  79  *
  80  *      (A * 2)_7 = A_6
  81  *      (A * 2)_6 = A_5
  82  *      (A * 2)_5 = A_4
  83  *      (A * 2)_4 = A_3 + A_7
  84  *      (A * 2)_3 = A_2 + A_7
  85  *      (A * 2)_2 = A_1 + A_7
  86  *      (A * 2)_1 = A_0
  87  *      (A * 2)_0 = A_7
  88  *
  89  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
  90  * As an aside, this multiplication is derived from the error correcting
  91  * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
  92  *
  93  * Observe that any number in the field (except for 0) can be expressed as a
  94  * power of 2 -- a generator for the field. We store a table of the powers of
  95  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
  96  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
  97  * than field addition). The inverse of a field element A (A^-1) is therefore
  98  * A ^ (255 - 1) = A^254.
  99  *
 100  * The up-to-three parity columns, P, Q, R over several data columns,
 101  * D_0, ... D_n-1, can be expressed by field operations:
 102  *
 103  *      P = D_0 + D_1 + ... + D_n-2 + D_n-1
 104  *      Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
 105  *        = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
 106  *      R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
 107  *        = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
 108  *
 109  * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
 110  * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
 111  * independent coefficients. (There are no additional coefficients that have
 112  * this property which is why the uncorrected Plank method breaks down.)
 113  *
 114  * See the reconstruction code below for how P, Q and R can be used
 115  * individually or in concert to recover missing data columns.
 116  */
 117
 118 #define VDEV_RAIDZ_P            0
 119 #define VDEV_RAIDZ_Q            1
 120 #define VDEV_RAIDZ_R            2
 121
 122 #define VDEV_RAIDZ_MUL_2(x)     (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
 123 #define VDEV_RAIDZ_MUL_4(x)     (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
 124
 125 /*
 126  * We provide a mechanism to perform the field multiplication operation on a
 127  * 64-bit value all at once rather than a byte at a time. This works by
 128  * creating a mask from the top bit in each byte and using that to
 129  * conditionally apply the XOR of 0x1d.
 130  */
 131 #define VDEV_RAIDZ_64MUL_2(x, mask) \
 132 { \
 133         (mask) = (x) & 0x8080808080808080ULL; \
 134         (mask) = ((mask) << 1) - ((mask) >> 7); \
 135         (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
 136             ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
 137 }
 138
 139 #define VDEV_RAIDZ_64MUL_4(x, mask) \
 140 { \
 141         VDEV_RAIDZ_64MUL_2((x), mask); \
 142         VDEV_RAIDZ_64MUL_2((x), mask); \
 143 }
 144
 145
 146 /*
 147  * Big Theory Statement for how a RAIDZ VDEV is expanded
 148  *
 149  * An existing RAIDZ VDEV can be expanded by attaching a new disk. Expansion
 150  * works with all three RAIDZ parity choices, including RAIDZ1, 2, or 3. VDEVs
 151  * that have been previously expanded can be expanded again.
 152  *
 153  * The RAIDZ VDEV must be healthy (must be able to write to all the drives in
 154  * the VDEV) when an expansion starts.  And the expansion will pause if any
 155  * disk in the VDEV fails, and resume once the VDEV is healthy again. All other
 156  * operations on the pool can continue while an expansion is in progress (e.g.
 157  * read/write, snapshot, zpool add, etc), except zpool checkpoint, zpool trim,
 158  * and zpool initialize, which can't be run during an expansion.  Following a
 159  * reboot or export/import, the expansion resumes where it left off.
 160  *
 161  * == Reflowing the Data ==
 162  *
 163  * The expansion involves reflowing (copying) the data from the current set
 164  * of disks to spread it across the new set which now has one more disk. This
 165  * reflow operation is similar to reflowing text when the column width of a
 166  * text editor window is expanded. The text doesn’t change but the location of
 167  * the text changes to accommodate the new width. An example reflow result for
 168  * a 4-wide RAIDZ1 to a 5-wide is shown below.
 169  *
 170  *                            Reflow End State
 171  *            Each letter indicates a parity group (logical stripe)
 172  *
 173  *         Before expansion                         After Expansion
 174  *     D1     D2     D3     D4               D1     D2     D3     D4     D5
 175  *  +------+------+------+------+         +------+------+------+------+------+
 176  *  |      |      |      |      |         |      |      |      |      |      |
 177  *  |  A   |  A   |  A   |  A   |         |  A   |  A   |  A   |  A   |  B   |
 178  *  |     1|     2|     3|     4|         |     1|     2|     3|     4|     5|
 179  *  +------+------+------+------+         +------+------+------+------+------+
 180  *  |      |      |      |      |         |      |      |      |      |      |
 181  *  |  B   |  B   |  C   |  C   |         |  B   |  C   |  C   |  C   |  C   |
 182  *  |     5|     6|     7|     8|         |     6|     7|     8|     9|    10|
 183  *  +------+------+------+------+         +------+------+------+------+------+
 184  *  |      |      |      |      |         |      |      |      |      |      |
 185  *  |  C   |  C   |  D   |  D   |         |  D   |  D   |  E   |  E   |  E   |
 186  *  |     9|    10|    11|    12|         |    11|    12|    13|    14|    15|
 187  *  +------+------+------+------+         +------+------+------+------+------+
 188  *  |      |      |      |      |         |      |      |      |      |      |
 189  *  |  E   |  E   |  E   |  E   |   -->   |  E   |  F   |  F   |  G   |  G   |
 190  *  |    13|    14|    15|    16|         |    16|    17|    18|    19|    20|
 191  *  +------+------+------+------+         +------+------+------+------+------+
 192  *  |      |      |      |      |         |      |      |      |      |      |
 193  *  |  F   |  F   |  G   |  G   |         |  G   |  G   |  H   |  H   |  H   |
 194  *  |    17|    18|    19|    20|         |    21|    22|    23|    24|    25|
 195  *  +------+------+------+------+         +------+------+------+------+------+
 196  *  |      |      |      |      |         |      |      |      |      |      |
 197  *  |  G   |  G   |  H   |  H   |         |  H   |  I   |  I   |  J   |  J   |
 198  *  |    21|    22|    23|    24|         |    26|    27|    28|    29|    30|
 199  *  +------+------+------+------+         +------+------+------+------+------+
 200  *  |      |      |      |      |         |      |      |      |      |      |
 201  *  |  H   |  H   |  I   |  I   |         |  J   |  J   |      |      |  K   |
 202  *  |    25|    26|    27|    28|         |    31|    32|    33|    34|    35|
 203  *  +------+------+------+------+         +------+------+------+------+------+
 204  *
 205  * This reflow approach has several advantages. There is no need to read or
 206  * modify the block pointers or recompute any block checksums.  The reflow
 207  * doesn’t need to know where the parity sectors reside. We can read and write
 208  * data sequentially and the copy can occur in a background thread in open
 209  * context. The design also allows for fast discovery of what data to copy.
 210  *
 211  * The VDEV metaslabs are processed, one at a time, to copy the block data to
 212  * have it flow across all the disks. The metaslab is disabled for allocations
 213  * during the copy. As an optimization, we only copy the allocated data which
 214  * can be determined by looking at the metaslab range tree. During the copy we
 215  * must maintain the redundancy guarantees of the RAIDZ VDEV (i.e., we still
 216  * need to be able to survive losing parity count disks).  This means we
 217  * cannot overwrite data during the reflow that would be needed if a disk is
 218  * lost.
 219  *
 220  * After the reflow completes, all newly-written blocks will have the new
 221  * layout, i.e., they will have the parity to data ratio implied by the new
 222  * number of disks in the RAIDZ group.  Even though the reflow copies all of
 223  * the allocated space (data and parity), it is only rearranged, not changed.
 224  *
 225  * This act of reflowing the data has a few implications about blocks
 226  * that were written before the reflow completes:
 227  *
 228  *  - Old blocks will still use the same amount of space (i.e., they will have
 229  *    the parity to data ratio implied by the old number of disks in the RAIDZ
 230  *    group).
 231  *  - Reading old blocks will be slightly slower than before the reflow, for
 232  *    two reasons. First, we will have to read from all disks in the RAIDZ
 233  *    VDEV, rather than being able to skip the children that contain only
 234  *    parity of this block (because the data of a single block is now spread
 235  *    out across all the disks).  Second, in most cases there will be an extra
 236  *    bcopy, needed to rearrange the data back to its original layout in memory.
 237  *
 238  * == Scratch Area ==
 239  *
 240  * As we copy the block data, we can only progress to the point that writes
 241  * will not overlap with blocks whose progress has not yet been recorded on
 242  * disk.  Since partially-copied rows are always read from the old location,
 243  * we need to stop one row before the sector-wise overlap, to prevent any
 244  * row-wise overlap. For example, in the diagram above, when we reflow sector
 245  * B6 it will overwrite the original location for B5.
 246  *
 247  * To get around this, a scratch space is used so that we can start copying
 248  * without risking data loss by overlapping the row. As an added benefit, it
 249  * improves performance at the beginning of the reflow, but that small perf
 250  * boost wouldn't be worth the complexity on its own.
 251  *
 252  * Ideally we want to copy at least 2 * (new_width)^2 so that we have a
 253  * separation of 2*(new_width+1) and a chunk size of new_width+2. With the max
 254  * RAIDZ width of 255 and 4K sectors this would be 2MB per disk. In practice
 255  * the widths will likely be single digits so we can get a substantial chunk
 256  * size using only a few MB of scratch per disk.
 257  *
 258  * The scratch area is persisted to disk which holds a large amount of reflowed
 259  * state. We can always read the partially written stripes when a disk fails or
 260  * the copy is interrupted (crash) during the initial copying phase and also
 261  * get past a small chunk size restriction.  At a minimum, the scratch space
 262  * must be large enough to get us to the point that one row does not overlap
 263  * itself when moved (i.e new_width^2).  But going larger is even better. We
 264  * use the 3.5 MiB reserved "boot" space that resides after the ZFS disk labels
 265  * as our scratch space to handle overwriting the initial part of the VDEV.
 266  *
 267  *      0     256K   512K                    4M
 268  *      +------+------+-----------------------+-----------------------------
 269  *      | VDEV | VDEV |   Boot Block (3.5M)   |  Allocatable space ...
 270  *      |  L0  |  L1  |       Reserved        |     (Metaslabs)
 271  *      +------+------+-----------------------+-------------------------------
 272  *                        Scratch Area
 273  *
 274  * == Reflow Progress Updates ==
 275  * After the initial scratch-based reflow, the expansion process works
 276  * similarly to device removal. We create a new open context thread which
 277  * reflows the data, and periodically kicks off sync tasks to update logical
 278  * state. In this case, state is the committed progress (offset of next data
 279  * to copy). We need to persist the completed offset on disk, so that if we
 280  * crash we know which format each VDEV offset is in.
 281  *
 282  * == Time Dependent Geometry ==
 283  *
 284  * In non-expanded RAIDZ, blocks are read from disk in a column by column
 285  * fashion. For a multi-row block, the second sector is in the first column
 286  * not in the second column. This allows us to issue full reads for each
 287  * column directly into the request buffer. The block data is thus laid out
 288  * sequentially in a column-by-column fashion.
 289  *
 290  * For example, in the before expansion diagram above, one logical block might
 291  * be sectors G19-H26. The parity is in G19,H23; and the data is in
 292  * G20,H24,G21,H25,G22,H26.
 293  *
 294  * After a block is reflowed, the sectors that were all in the original column
 295  * data can now reside in different columns. When reading from an expanded
 296  * VDEV, we need to know the logical stripe width for each block so we can
 297  * reconstitute the block’s data after the reads are completed. Likewise,
 298  * when we perform the combinatorial reconstruction we need to know the
 299  * original width so we can retry combinations from the past layouts.
 300  *
 301  * Time dependent geometry is what we call having blocks with different layouts
 302  * (stripe widths) in the same VDEV. This time-dependent geometry uses the
 303  * block’s birth time (+ the time expansion ended) to establish the correct
 304  * width for a given block. After an expansion completes, we record the time
 305  * for blocks written with a particular width (geometry).
 306  *
 307  * == On Disk Format Changes ==
 308  *
 309  * New pool feature flag, 'raidz_expansion' whose reference count is the number
 310  * of RAIDZ VDEVs that have been expanded.
 311  *
 312  * The blocks on expanded RAIDZ VDEV can have different logical stripe widths.
 313  *
 314  * The uberblock can point to arbitrary blocks, which might be on the
 315  * expanding RAIDZ, and might or might not have been expanded, so we need to
 316  * know which way a block is laid out before reading it. This info is the next
 317  * offset that needs to be reflowed and we persist that in the uberblock, in
 318  * the new ub_raidz_reflow_info field, as opposed to the MOS or the vdev label.
 319  * After the expansion is complete, we then use the raidz_expand_txgs array
 320  * (see below) to determine how to read a block and the ub_raidz_reflow_info
 321  * field is no longer required.
 322  *
 323  * The uberblock's ub_raidz_reflow_info field also holds the scratch space
 324  * state (i.e., active or not) which is also required before reading a block
 325  * during the initial phase of reflowing the data.
 326  *
 327  * The top-level RAIDZ VDEV has two new entries in the nvlist:
 328  *
 329  * 'raidz_expand_txgs' array: logical stripe widths by txg are recorded here
 330  *                            and used after the expansion is complete to
 331  *                            determine how to read a raidz block
 332  * 'raidz_expanding' boolean: present during reflow and removed after completion
 333  *                            used during a spa import to resume an unfinished
 334  *                            expansion
 335  *
 336  * And finally the VDEVs top zap adds the following informational entries:
 337  *   VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE
 338  *   VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME
 339  *   VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME
 340  *   VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED
 341  */
 342
 343 /*
 344  * For testing only: pause the raidz expansion after reflowing this amount.
 345  * (accessed by ZTS and ztest)
 346  */
 347 #ifdef  _KERNEL
 348 static
 349 #endif  /* _KERNEL */
 350 unsigned long raidz_expand_max_reflow_bytes = 0;
 351
 352 /*
 353  * For testing only: pause the raidz expansion at a certain point.
 354  */
 355 uint_t raidz_expand_pause_point = 0;
 356
 357 /*
 358  * Maximum amount of copy io's outstanding at once.
 359  */
 360 static unsigned long raidz_expand_max_copy_bytes = 10 * SPA_MAXBLOCKSIZE;
 361
 362 /*
 363  * Apply raidz map abds aggregation if the number of rows in the map is equal
 364  * or greater than the value below.
 365  */
 366 static unsigned long raidz_io_aggregate_rows = 4;
 367
 368 /*
 369  * Automatically start a pool scrub when a RAIDZ expansion completes in
 370  * order to verify the checksums of all blocks which have been copied
 371  * during the expansion.  Automatic scrubbing is enabled by default and
 372  * is strongly recommended.
 373  */
 374 static int zfs_scrub_after_expand = 1;
 375
 376 static void
 377 vdev_raidz_row_free(raidz_row_t *rr)
 378 {
 379         for (int c = 0; c < rr->rr_cols; c++) {
 380                 raidz_col_t *rc = &rr->rr_col[c];
 381
 382                 if (rc->rc_size != 0)
 383                         abd_free(rc->rc_abd);
 384                 if (rc->rc_orig_data != NULL)
 385                         abd_free(rc->rc_orig_data);
 386         }
 387
 388         if (rr->rr_abd_empty != NULL)
 389                 abd_free(rr->rr_abd_empty);
 390
 391         kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_scols]));
 392 }
 393
 394 void
 395 vdev_raidz_map_free(raidz_map_t *rm)
 396 {
 397         for (int i = 0; i < rm->rm_nrows; i++)
 398                 vdev_raidz_row_free(rm->rm_row[i]);
 399
 400         if (rm->rm_nphys_cols) {
 401                 for (int i = 0; i < rm->rm_nphys_cols; i++) {
 402                         if (rm->rm_phys_col[i].rc_abd != NULL)
 403                                 abd_free(rm->rm_phys_col[i].rc_abd);
 404                 }
 405
 406                 kmem_free(rm->rm_phys_col, sizeof (raidz_col_t) *
 407                     rm->rm_nphys_cols);
 408         }
 409
 410         ASSERT3P(rm->rm_lr, ==, NULL);
 411         kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows]));
 412 }
 413
 414 static void
 415 vdev_raidz_map_free_vsd(zio_t *zio)
 416 {
 417         raidz_map_t *rm = zio->io_vsd;
 418
 419         vdev_raidz_map_free(rm);
 420 }
 421
 422 static int
 423 vdev_raidz_reflow_compare(const void *x1, const void *x2)
 424 {
 425         const reflow_node_t *l = x1;
 426         const reflow_node_t *r = x2;
 427
 428         return (TREE_CMP(l->re_txg, r->re_txg));
 429 }
 430
 431 const zio_vsd_ops_t vdev_raidz_vsd_ops = {
 432         .vsd_free = vdev_raidz_map_free_vsd,
 433 };
 434
 435 raidz_row_t *
 436 vdev_raidz_row_alloc(int cols, zio_t *zio)
 437 {
 438         raidz_row_t *rr =
 439             kmem_zalloc(offsetof(raidz_row_t, rr_col[cols]), KM_SLEEP);
 440
 441         rr->rr_cols = cols;
 442         rr->rr_scols = cols;
 443
 444         for (int c = 0; c < cols; c++) {
 445                 raidz_col_t *rc = &rr->rr_col[c];
 446                 rc->rc_shadow_devidx = INT_MAX;
 447                 rc->rc_shadow_offset = UINT64_MAX;
 448                 /*
 449                  * We can not allow self healing to take place for Direct I/O
 450                  * reads. There is nothing that stops the buffer contents from
 451                  * being manipulated while the I/O is in flight. It is possible
 452                  * that the checksum could be verified on the buffer and then
 453                  * the contents of that buffer are manipulated afterwards. This
 454                  * could lead to bad data being written out during self
 455                  * healing.
 456                  */
 457                 if (!(zio->io_flags & ZIO_FLAG_DIO_READ))
 458                         rc->rc_allow_repair = 1;
 459         }
 460         return (rr);
 461 }
 462
 463 static void
 464 vdev_raidz_map_alloc_write(zio_t *zio, raidz_map_t *rm, uint64_t ashift)
 465 {
 466         int c;
 467         int nwrapped = 0;
 468         uint64_t off = 0;
 469         raidz_row_t *rr = rm->rm_row[0];
 470
 471         ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
 472         ASSERT3U(rm->rm_nrows, ==, 1);
 473
 474         /*
 475          * Pad any parity columns with additional space to account for skip
 476          * sectors.
 477          */
 478         if (rm->rm_skipstart < rr->rr_firstdatacol) {
 479                 ASSERT0(rm->rm_skipstart);
 480                 nwrapped = rm->rm_nskip;
 481         } else if (rr->rr_scols < (rm->rm_skipstart + rm->rm_nskip)) {
 482                 nwrapped =
 483                     (rm->rm_skipstart + rm->rm_nskip) % rr->rr_scols;
 484         }
 485
 486         /*
 487          * Optional single skip sectors (rc_size == 0) will be handled in
 488          * vdev_raidz_io_start_write().
 489          */
 490         int skipped = rr->rr_scols - rr->rr_cols;
 491
 492         /* Allocate buffers for the parity columns */
 493         for (c = 0; c < rr->rr_firstdatacol; c++) {
 494                 raidz_col_t *rc = &rr->rr_col[c];
 495
 496                 /*
 497                  * Parity columns will pad out a linear ABD to account for
 498                  * the skip sector. A linear ABD is used here because
 499                  * parity calculations use the ABD buffer directly to calculate
 500                  * parity. This avoids doing a memcpy back to the ABD after the
 501                  * parity has been calculated. By issuing the parity column
 502                  * with the skip sector we can reduce contention on the child
 503                  * VDEV queue locks (vq_lock).
 504                  */
 505                 if (c < nwrapped) {
 506                         rc->rc_abd = abd_alloc_linear(
 507                             rc->rc_size + (1ULL << ashift), B_FALSE);
 508                         abd_zero_off(rc->rc_abd, rc->rc_size, 1ULL << ashift);
 509                         skipped++;
 510                 } else {
 511                         rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
 512                 }
 513         }
 514
 515         for (off = 0; c < rr->rr_cols; c++) {
 516                 raidz_col_t *rc = &rr->rr_col[c];
 517                 abd_t *abd = abd_get_offset_struct(&rc->rc_abdstruct,
 518                     zio->io_abd, off, rc->rc_size);
 519
 520                 /*
 521                  * Generate I/O for skip sectors to improve aggregation
 522                  * continuity. We will use gang ABD's to reduce contention
 523                  * on the child VDEV queue locks (vq_lock) by issuing
 524                  * a single I/O that contains the data and skip sector.
 525                  *
 526                  * It is important to make sure that rc_size is not updated
 527                  * even though we are adding a skip sector to the ABD. When
 528                  * calculating the parity in vdev_raidz_generate_parity_row()
 529                  * the rc_size is used to iterate through the ABD's. We can
 530                  * not have zero'd out skip sectors used for calculating
 531                  * parity for raidz, because those same sectors are not used
 532                  * during reconstruction.
 533                  */
 534                 if (c >= rm->rm_skipstart && skipped < rm->rm_nskip) {
 535                         rc->rc_abd = abd_alloc_gang();
 536                         abd_gang_add(rc->rc_abd, abd, B_TRUE);
 537                         abd_gang_add(rc->rc_abd,
 538                             abd_get_zeros(1ULL << ashift), B_TRUE);
 539                         skipped++;
 540                 } else {
 541                         rc->rc_abd = abd;
 542                 }
 543                 off += rc->rc_size;
 544         }
 545
 546         ASSERT3U(off, ==, zio->io_size);
 547         ASSERT3S(skipped, ==, rm->rm_nskip);
 548 }
 549
 550 static void
 551 vdev_raidz_map_alloc_read(zio_t *zio, raidz_map_t *rm)
 552 {
 553         int c;
 554         raidz_row_t *rr = rm->rm_row[0];
 555
 556         ASSERT3U(rm->rm_nrows, ==, 1);
 557
 558         /* Allocate buffers for the parity columns */
 559         for (c = 0; c < rr->rr_firstdatacol; c++)
 560                 rr->rr_col[c].rc_abd =
 561                     abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE);
 562
 563         for (uint64_t off = 0; c < rr->rr_cols; c++) {
 564                 raidz_col_t *rc = &rr->rr_col[c];
 565                 rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
 566                     zio->io_abd, off, rc->rc_size);
 567                 off += rc->rc_size;
 568         }
 569 }
 570
 571 /*
 572  * Divides the IO evenly across all child vdevs; usually, dcols is
 573  * the number of children in the target vdev.
 574  *
 575  * Avoid inlining the function to keep vdev_raidz_io_start(), which
 576  * is this functions only caller, as small as possible on the stack.
 577  */
 578 noinline raidz_map_t *
 579 vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
 580     uint64_t nparity)
 581 {
 582         raidz_row_t *rr;
 583         /* The starting RAIDZ (parent) vdev sector of the block. */
 584         uint64_t b = zio->io_offset >> ashift;
 585         /* The zio's size in units of the vdev's minimum sector size. */
 586         uint64_t s = zio->io_size >> ashift;
 587         /* The first column for this stripe. */
 588         uint64_t f = b % dcols;
 589         /* The starting byte offset on each child vdev. */
 590         uint64_t o = (b / dcols) << ashift;
 591         uint64_t acols, scols;
 592
 593         raidz_map_t *rm =
 594             kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP);
 595         rm->rm_nrows = 1;
 596
 597         /*
 598          * "Quotient": The number of data sectors for this stripe on all but
 599          * the "big column" child vdevs that also contain "remainder" data.
 600          */
 601         uint64_t q = s / (dcols - nparity);
 602
 603         /*
 604          * "Remainder": The number of partial stripe data sectors in this I/O.
 605          * This will add a sector to some, but not all, child vdevs.
 606          */
 607         uint64_t r = s - q * (dcols - nparity);
 608
 609         /* The number of "big columns" - those which contain remainder data. */
 610         uint64_t bc = (r == 0 ? 0 : r + nparity);
 611
 612         /*
 613          * The total number of data and parity sectors associated with
 614          * this I/O.
 615          */
 616         uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));
 617
 618         /*
 619          * acols: The columns that will be accessed.
 620          * scols: The columns that will be accessed or skipped.
 621          */
 622         if (q == 0) {
 623                 /* Our I/O request doesn't span all child vdevs. */
 624                 acols = bc;
 625                 scols = MIN(dcols, roundup(bc, nparity + 1));
 626         } else {
 627                 acols = dcols;
 628                 scols = dcols;
 629         }
 630
 631         ASSERT3U(acols, <=, scols);
 632         rr = vdev_raidz_row_alloc(scols, zio);
 633         rm->rm_row[0] = rr;
 634         rr->rr_cols = acols;
 635         rr->rr_bigcols = bc;
 636         rr->rr_firstdatacol = nparity;
 637 #ifdef ZFS_DEBUG
 638         rr->rr_offset = zio->io_offset;
 639         rr->rr_size = zio->io_size;
 640 #endif
 641
 642         uint64_t asize = 0;
 643
 644         for (uint64_t c = 0; c < scols; c++) {
 645                 raidz_col_t *rc = &rr->rr_col[c];
 646                 uint64_t col = f + c;
 647                 uint64_t coff = o;
 648                 if (col >= dcols) {
 649                         col -= dcols;
 650                         coff += 1ULL << ashift;
 651                 }
 652                 rc->rc_devidx = col;
 653                 rc->rc_offset = coff;
 654
 655                 if (c >= acols)
 656                         rc->rc_size = 0;
 657                 else if (c < bc)
 658                         rc->rc_size = (q + 1) << ashift;
 659                 else
 660                         rc->rc_size = q << ashift;
 661
 662                 asize += rc->rc_size;
 663         }
 664
 665         ASSERT3U(asize, ==, tot << ashift);
 666         rm->rm_nskip = roundup(tot, nparity + 1) - tot;
 667         rm->rm_skipstart = bc;
 668
 669         /*
 670          * If all data stored spans all columns, there's a danger that parity
 671          * will always be on the same device and, since parity isn't read
 672          * during normal operation, that device's I/O bandwidth won't be
 673          * used effectively. We therefore switch the parity every 1MB.
 674          *
 675          * ... at least that was, ostensibly, the theory. As a practical
 676          * matter unless we juggle the parity between all devices evenly, we
 677          * won't see any benefit. Further, occasional writes that aren't a
 678          * multiple of the LCM of the number of children and the minimum
 679          * stripe width are sufficient to avoid pessimal behavior.
 680          * Unfortunately, this decision created an implicit on-disk format
 681          * requirement that we need to support for all eternity, but only
 682          * for single-parity RAID-Z.
 683          *
 684          * If we intend to skip a sector in the zeroth column for padding
 685          * we must make sure to note this swap. We will never intend to
 686          * skip the first column since at least one data and one parity
 687          * column must appear in each row.
 688          */
 689         ASSERT(rr->rr_cols >= 2);
 690         ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
 691
 692         if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
 693                 uint64_t devidx = rr->rr_col[0].rc_devidx;
 694                 o = rr->rr_col[0].rc_offset;
 695                 rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
 696                 rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
 697                 rr->rr_col[1].rc_devidx = devidx;
 698                 rr->rr_col[1].rc_offset = o;
 699                 if (rm->rm_skipstart == 0)
 700                         rm->rm_skipstart = 1;
 701         }
 702
 703         if (zio->io_type == ZIO_TYPE_WRITE) {
 704                 vdev_raidz_map_alloc_write(zio, rm, ashift);
 705         } else {
 706                 vdev_raidz_map_alloc_read(zio, rm);
 707         }
 708         /* init RAIDZ parity ops */
 709         rm->rm_ops = vdev_raidz_math_get_ops();
 710
 711         return (rm);
 712 }
 713
 714 /*
      * Allocate and fill in the raidz_map_t for a zio issued to a RAID-Z vdev
      * whose blocks are laid out in rows of logical_cols sectors while the vdev
      * has physical_cols children (RAIDZ expansion).  The map gets one
      * raidz_row_t per row, and each column of a row describes at most one
      * sector.
      *
 715  * Everything before reflow_offset_synced should have been moved to the new
 716  * location (read and write completed).  However, this may not yet be reflected
 717  * in the on-disk format (e.g. raidz_reflow_sync() has been called but the
 718  * uberblock has not yet been written). If reflow is not in progress,
 719  * reflow_offset_synced should be UINT64_MAX. For each row, if the row is
 720  * entirely before reflow_offset_synced, it will come from the new location.
 721  * Otherwise this row will come from the old location.  Therefore, rows that
 722  * straddle the reflow_offset_synced will come from the old location.
 723  *
 724  * For writes, reflow_offset_next is the next offset to copy.  If a sector has
 725  * been copied, but not yet reflected in the on-disk progress
 726  * (reflow_offset_synced), it will also be written to the new (already copied)
 727  * offset.
      *
      * Arguments:
      *   zio            supplies the data buffer (io_abd), io_offset and io_size.
      *   ashift         shift used to convert between bytes and sectors.
      *   physical_cols  number of children in the new layout; rows that are
      *                  still at their old location use physical_cols - 1.
      *   logical_cols   width of a row in sectors, parity included.
      *   nparity        number of parity columns at the start of each row.
      *   use_scratch    when set, rows entirely before reflow_offset_synced
      *                  have VDEV_BOOT_SIZE subtracted from their offsets and
      *                  shadow offsets (scratch space, see below).
      *
      * Returns the new map, allocated with KM_SLEEP.  rm_phys_col is only set
      * when the block has at least raidz_io_aggregate_rows rows and is
      * contiguous on every child, in which case the parity abd's point into
      * per-child aggregate buffers.
 728  */
 729 noinline raidz_map_t *
 730 vdev_raidz_map_alloc_expanded(zio_t *zio,
 731     uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols,
 732     uint64_t nparity, uint64_t reflow_offset_synced,
 733     uint64_t reflow_offset_next, boolean_t use_scratch)
 734 {
 735         abd_t *abd = zio->io_abd;
 736         uint64_t offset = zio->io_offset;
 737         uint64_t size = zio->io_size;
 738
 739         /* The zio's size in units of the vdev's minimum sector size. */
 740         uint64_t s = size >> ashift;
 741
 742         /*
 743          * "Quotient": The number of data sectors for this stripe on all but
 744          * the "big column" child vdevs that also contain "remainder" data.
 745          * AKA "full rows"
 746          */
 747         uint64_t q = s / (logical_cols - nparity);
 748
 749         /*
 750          * "Remainder": The number of partial stripe data sectors in this I/O.
 751          * This will add a sector to some, but not all, child vdevs.
 752          */
 753         uint64_t r = s - q * (logical_cols - nparity);
 754
 755         /* The number of "big columns" - those which contain remainder data. */
 756         uint64_t bc = (r == 0 ? 0 : r + nparity);
 757
 758         /*
 759          * The total number of data and parity sectors associated with
 760          * this I/O.
 761          */
 762         uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));
 763
 764         /* How many rows contain data (not skip) */
 765         uint64_t rows = howmany(tot, logical_cols);
             /*
              * Width of every row in this map: a full logical row, unless the
              * whole block has fewer sectors than that.
              */
 766         int cols = MIN(tot, logical_cols);
 767
             /* The map is sized to hold one row pointer per row. */
 768         raidz_map_t *rm =
 769             kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]),
 770             KM_SLEEP);
 771         rm->rm_nrows = rows;
 772         rm->rm_nskip = roundup(tot, nparity + 1) - tot;
 773         rm->rm_skipstart = bc;
             /* Running total of rc_size over all columns; checked after the loop. */
 774         uint64_t asize = 0;
 775
 776         for (uint64_t row = 0; row < rows; row++) {
 777                 boolean_t row_use_scratch = B_FALSE;
 778                 raidz_row_t *rr = vdev_raidz_row_alloc(cols, zio);
 779                 rm->rm_row[row] = rr;
 780
 781                 /* The starting RAIDZ (parent) vdev sector of the row. */
 782                 uint64_t b = (offset >> ashift) + row * logical_cols;
 783
 784                 /*
 785                  * If we are in the middle of a reflow, and the copying has
 786                  * not yet completed for any part of this row, then use the
 787                  * old location of this row.  Note that reflow_offset_synced
 788                  * reflects the i/o that's been completed, because it's
 789                  * updated by a synctask, after zio_wait(spa_txg_zio[]).
 790                  * This is sufficient for our check, even if that progress
 791                  * has not yet been recorded to disk (reflected in
 792                  * spa_ubsync).  Also note that we consider the last row to
 793                  * be "full width" (`cols`-wide rather than `bc`-wide) for
 794                  * this calculation. This causes a tiny bit of unnecessary
 795                  * double-writes but is safe and simpler to calculate.
 796                  */
 797                 int row_phys_cols = physical_cols;
 798                 if (b + cols > reflow_offset_synced >> ashift)
 799                         row_phys_cols--;
 800                 else if (use_scratch)
 801                         row_use_scratch = B_TRUE;
 802
 803                 /* starting child of this row */
 804                 uint64_t child_id = b % row_phys_cols;
 805                 /* The starting byte offset on each child vdev. */
 806                 uint64_t child_offset = (b / row_phys_cols) << ashift;
 807
 808                 /*
 809                  * Note, rr_cols is the entire width of the block, even
 810                  * if this row is shorter.  This is needed because parity
 811                  * generation (for Q and R) needs to know the entire width,
 812                  * because it treats the short row as though it was
 813                  * full-width (and the "phantom" sectors were zero-filled).
 814                  *
 815                  * Another approach to this would be to set cols shorter
 816                  * (to just the number of columns that we might do i/o to)
 817                  * and have another mechanism to tell the parity generation
 818                  * about the "entire width".  Reconstruction (at least
 819                  * vdev_raidz_reconstruct_general()) would also need to
 820                  * know about the "entire width".
 821                  */
 822                 rr->rr_firstdatacol = nparity;
 823 #ifdef ZFS_DEBUG
 824                 /*
 825                  * note: rr_size is PSIZE, not ASIZE
 826                  */
 827                 rr->rr_offset = b << ashift;
 828                 rr->rr_size = (rr->rr_cols - rr->rr_firstdatacol) << ashift;
 829 #endif
 830
                     /*
                      * Walk the row's columns across consecutive children,
                      * wrapping to child 0 and advancing the child offset by
                      * one sector when the last child is passed.
                      */
 831                 for (int c = 0; c < rr->rr_cols; c++, child_id++) {
 832                         if (child_id >= row_phys_cols) {
 833                                 child_id -= row_phys_cols;
 834                                 child_offset += 1ULL << ashift;
 835                         }
 836                         raidz_col_t *rc = &rr->rr_col[c];
 837                         rc->rc_devidx = child_id;
 838                         rc->rc_offset = child_offset;
 839
 840                         /*
 841                          * Get this from the scratch space if appropriate.
 842                          * This only happens if we crashed in the middle of
 843                          * raidz_reflow_scratch_sync() (while it's running,
 844                          * the rangelock prevents us from doing concurrent
 845                          * io), and even then only during zpool import or
 846                          * when the pool is imported readonly.
 847                          */
 848                         if (row_use_scratch)
 849                                 rc->rc_offset -= VDEV_BOOT_SIZE;
 850
                             /*
                              * Index of this column among the data columns.
                              * Only the data-column branch below uses it.
                              */
 851                         uint64_t dc = c - rr->rr_firstdatacol;
 852                         if (c < rr->rr_firstdatacol) {
 853                                 rc->rc_size = 1ULL << ashift;
 854
 855                                 /*
 856                                  * Parity sectors' rc_abd's are set below
 857                                  * after determining if this is an aggregation.
 858                                  */
 859                         } else if (row == rows - 1 && bc != 0 && c >= bc) {
 860                                 /*
 861                                  * Past the end of the block (even including
 862                                  * skip sectors).  This sector is part of the
 863                                  * map so that we have full rows for p/q parity
 864                                  * generation.
 865                                  */
 866                                 rc->rc_size = 0;
 867                                 rc->rc_abd = NULL;
 868                         } else {
 869                                 /* "data column" (col excluding parity) */
 870                                 uint64_t off;
 871
                                     /*
                                      * "off" is this sector's index within
                                      * the zio's buffer, which holds each
                                      * data column's sectors back to back:
                                      * the first r data columns have "rows"
                                      * sectors each, the rest "rows - 1"
                                      * (all have "rows" when r == 0).
                                      */
 872                                 if (c < bc || r == 0) {
 873                                         off = dc * rows + row;
 874                                 } else {
 875                                         off = r * rows +
 876                                             (dc - r) * (rows - 1) + row;
 877                                 }
 878                                 rc->rc_size = 1ULL << ashift;
 879                                 rc->rc_abd = abd_get_offset_struct(
 880                                     &rc->rc_abdstruct, abd, off << ashift,
 881                                     rc->rc_size);
 882                         }
 883
 884                         if (rc->rc_size == 0)
 885                                 continue;
 886
 887                         /*
 888                          * If any part of this row is in both old and new
 889                          * locations, the primary location is the old
 890                          * location. If this sector was already copied to the
 891                          * new location, we need to also write to the new,
 892                          * "shadow" location.
 893                          *
 894                          * Note, `row_phys_cols != physical_cols` indicates
 895                          * that the primary location is the old location.
 896                          * `b+c < reflow_offset_next` indicates that the copy
 897                          * to the new location has been initiated. We know
 898                          * that the copy has completed because we have the
 899                          * rangelock, which is held exclusively while the
 900                          * copy is in progress.
 901                          */
 902                         if (row_use_scratch ||
 903                             (row_phys_cols != physical_cols &&
 904                             b + c < reflow_offset_next >> ashift)) {
 905                                 rc->rc_shadow_devidx = (b + c) % physical_cols;
 906                                 rc->rc_shadow_offset =
 907                                     ((b + c) / physical_cols) << ashift;
 908                                 if (row_use_scratch)
 909                                         rc->rc_shadow_offset -= VDEV_BOOT_SIZE;
 910                         }
 911
 912                         asize += rc->rc_size;
 913                 }
 914
 915                 /*
 916                  * See comment in vdev_raidz_map_alloc()
                      *
                      * Columns 0 and 1 trade both their primary and their
                      * shadow locations.
 917                  */
 918                 if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 &&
 919                     (offset & (1ULL << 20))) {
 920                         ASSERT(rr->rr_cols >= 2);
 921                         ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
 922
 923                         int devidx0 = rr->rr_col[0].rc_devidx;
 924                         uint64_t offset0 = rr->rr_col[0].rc_offset;
 925                         int shadow_devidx0 = rr->rr_col[0].rc_shadow_devidx;
 926                         uint64_t shadow_offset0 =
 927                             rr->rr_col[0].rc_shadow_offset;
 928
 929                         rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
 930                         rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
 931                         rr->rr_col[0].rc_shadow_devidx =
 932                             rr->rr_col[1].rc_shadow_devidx;
 933                         rr->rr_col[0].rc_shadow_offset =
 934                             rr->rr_col[1].rc_shadow_offset;
 935
 936                         rr->rr_col[1].rc_devidx = devidx0;
 937                         rr->rr_col[1].rc_offset = offset0;
 938                         rr->rr_col[1].rc_shadow_devidx = shadow_devidx0;
 939                         rr->rr_col[1].rc_shadow_offset = shadow_offset0;
 940                 }
 941         }
 942         ASSERT3U(asize, ==, tot << ashift);
 943
 944         /*
 945          * Determine if the block is contiguous, in which case we can use
 946          * an aggregation.
 947          */
 948         if (rows >= raidz_io_aggregate_rows) {
 949                 rm->rm_nphys_cols = physical_cols;
 950                 rm->rm_phys_col =
 951                     kmem_zalloc(sizeof (raidz_col_t) * rm->rm_nphys_cols,
 952                     KM_SLEEP);
 953
 954                 /*
 955                  * Determine the aggregate io's offset and size, and check
 956                  * that the io is contiguous.
                      *
                      * The "break" below only leaves the column loop; the
                      * rm_phys_col != NULL test then ends the row loop.
 957                  */
 958                 for (int i = 0;
 959                     i < rm->rm_nrows && rm->rm_phys_col != NULL; i++) {
 960                         raidz_row_t *rr = rm->rm_row[i];
 961                         for (int c = 0; c < rr->rr_cols; c++) {
 962                                 raidz_col_t *rc = &rr->rr_col[c];
 963                                 raidz_col_t *prc =
 964                                     &rm->rm_phys_col[rc->rc_devidx];
 965
 966                                 if (rc->rc_size == 0)
 967                                         continue;
 968
 969                                 if (prc->rc_size == 0) {
 970                                         ASSERT0(prc->rc_offset);
 971                                         prc->rc_offset = rc->rc_offset;
 972                                 } else if (prc->rc_offset + prc->rc_size !=
 973                                     rc->rc_offset) {
 974                                         /*
 975                                          * This block is not contiguous and
 976                                          * therefore can't be aggregated.
 977                                          * This is expected to be rare, so
 978                                          * the cost of allocating and then
 979                                          * freeing rm_phys_col is not
 980                                          * significant.
 981                                          */
 982                                         kmem_free(rm->rm_phys_col,
 983                                             sizeof (raidz_col_t) *
 984                                             rm->rm_nphys_cols);
 985                                         rm->rm_phys_col = NULL;
 986                                         rm->rm_nphys_cols = 0;
 987                                         break;
 988                                 }
 989                                 prc->rc_size += rc->rc_size;
 990                         }
 991                 }
 992         }
 993         if (rm->rm_phys_col != NULL) {
 994                 /*
 995                  * Allocate aggregate ABD's.
                      * Children that this block does not touch (rc_size == 0)
                      * get no buffer.
 996                  */
 997                 for (int i = 0; i < rm->rm_nphys_cols; i++) {
 998                         raidz_col_t *prc = &rm->rm_phys_col[i];
 999
1000                         prc->rc_devidx = i;
1001
1002                         if (prc->rc_size == 0)
1003                                 continue;
1004
1005                         prc->rc_abd =
1006                             abd_alloc_linear(rm->rm_phys_col[i].rc_size,
1007                             B_FALSE);
1008                 }
1009
1010                 /*
1011                  * Point the parity abd's into the aggregate abd's.
                      * Each parity sector sits at its own offset relative to
                      * the start of its child's aggregate buffer.
1012                  */
1013                 for (int i = 0; i < rm->rm_nrows; i++) {
1014                         raidz_row_t *rr = rm->rm_row[i];
1015                         for (int c = 0; c < rr->rr_firstdatacol; c++) {
1016                                 raidz_col_t *rc = &rr->rr_col[c];
1017                                 raidz_col_t *prc =
1018                                     &rm->rm_phys_col[rc->rc_devidx];
1019                                 rc->rc_abd =
1020                                     abd_get_offset_struct(&rc->rc_abdstruct,
1021                                     prc->rc_abd,
1022                                     rc->rc_offset - prc->rc_offset,
1023                                     rc->rc_size);
1024                         }
1025                 }
1026         } else {
1027                 /*
1028                  * Allocate new abd's for the parity sectors.
1029                  */
1030                 for (int i = 0; i < rm->rm_nrows; i++) {
1031                         raidz_row_t *rr = rm->rm_row[i];
1032                         for (int c = 0; c < rr->rr_firstdatacol; c++) {
1033                                 raidz_col_t *rc = &rr->rr_col[c];
1034                                 rc->rc_abd =
1035                                     abd_alloc_linear(rc->rc_size,
1036                                     B_TRUE);
1037                         }
1038                 }
1039         }
1040         /* init RAIDZ parity ops */
1041         rm->rm_ops = vdev_raidz_math_get_ops();
1042
1043         return (rm);
1044 }
1045
/*
 * Write cursors into the P, Q and R parity buffers, handed to the parity
 * generation callbacks through their "private" argument.  A callback XORs
 * data into the words the cursors point at and advances the cursors it
 * uses; cursors of parity levels that are not being generated are NULL.
 * Keep the member order: the struct is filled in with positional
 * initializers.
 */
struct pqr_struct {
        uint64_t *p, *q, *r;
};
1051
1052 static int
1053 vdev_raidz_p_func(void *buf, size_t size, void *private)
1054 {
1055         struct pqr_struct *pqr = private;
1056         const uint64_t *src = buf;
1057         int cnt = size / sizeof (src[0]);
1058
1059         ASSERT(pqr->p && !pqr->q && !pqr->r);
1060
1061         for (int i = 0; i < cnt; i++, src++, pqr->p++)
1062                 *pqr->p ^= *src;
1063
1064         return (0);
1065 }
1066
1067 static int
1068 vdev_raidz_pq_func(void *buf, size_t size, void *private)
1069 {
1070         struct pqr_struct *pqr = private;
1071         const uint64_t *src = buf;
1072         uint64_t mask;
1073         int cnt = size / sizeof (src[0]);
1074
1075         ASSERT(pqr->p && pqr->q && !pqr->r);
1076
1077         for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
1078                 *pqr->p ^= *src;
1079                 VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
1080                 *pqr->q ^= *src;
1081         }
1082
1083         return (0);
1084 }
1085
1086 static int
1087 vdev_raidz_pqr_func(void *buf, size_t size, void *private)
1088 {
1089         struct pqr_struct *pqr = private;
1090         const uint64_t *src = buf;
1091         uint64_t mask;
1092         int cnt = size / sizeof (src[0]);
1093
1094         ASSERT(pqr->p && pqr->q && pqr->r);
1095
1096         for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
1097                 *pqr->p ^= *src;
1098                 VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
1099                 *pqr->q ^= *src;
1100                 VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
1101                 *pqr->r ^= *src;
1102         }
1103
1104         return (0);
1105 }
1106
1107 static void
1108 vdev_raidz_generate_parity_p(raidz_row_t *rr)
1109 {
1110         uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1111
1112         for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1113                 abd_t *src = rr->rr_col[c].rc_abd;
1114
1115                 if (c == rr->rr_firstdatacol) {
1116                         abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
1117                 } else {
1118                         struct pqr_struct pqr = { p, NULL, NULL };
1119                         (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
1120                             vdev_raidz_p_func, &pqr);
1121                 }
1122         }
1123 }
1124
1125 static void
1126 vdev_raidz_generate_parity_pq(raidz_row_t *rr)
1127 {
1128         uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1129         uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
1130         uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
1131         ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
1132             rr->rr_col[VDEV_RAIDZ_Q].rc_size);
1133
1134         for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1135                 abd_t *src = rr->rr_col[c].rc_abd;
1136
1137                 uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);
1138
1139                 if (c == rr->rr_firstdatacol) {
1140                         ASSERT(ccnt == pcnt || ccnt == 0);
1141                         abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
1142                         (void) memcpy(q, p, rr->rr_col[c].rc_size);
1143
1144                         for (uint64_t i = ccnt; i < pcnt; i++) {
1145                                 p[i] = 0;
1146                                 q[i] = 0;
1147                         }
1148                 } else {
1149                         struct pqr_struct pqr = { p, q, NULL };
1150
1151                         ASSERT(ccnt <= pcnt);
1152                         (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
1153                             vdev_raidz_pq_func, &pqr);
1154
1155                         /*
1156                          * Treat short columns as though they are full of 0s.
1157                          * Note that there's therefore nothing needed for P.
1158                          */
1159                         uint64_t mask;
1160                         for (uint64_t i = ccnt; i < pcnt; i++) {
1161                                 VDEV_RAIDZ_64MUL_2(q[i], mask);
1162                         }
1163                 }
1164         }
1165 }
1166
1167 static void
1168 vdev_raidz_generate_parity_pqr(raidz_row_t *rr)
1169 {
1170         uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1171         uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
1172         uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd);
1173         uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
1174         ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
1175             rr->rr_col[VDEV_RAIDZ_Q].rc_size);
1176         ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
1177             rr->rr_col[VDEV_RAIDZ_R].rc_size);
1178
1179         for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1180                 abd_t *src = rr->rr_col[c].rc_abd;
1181
1182                 uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);
1183
1184                 if (c == rr->rr_firstdatacol) {
1185                         ASSERT(ccnt == pcnt || ccnt == 0);
1186                         abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
1187                         (void) memcpy(q, p, rr->rr_col[c].rc_size);
1188                         (void) memcpy(r, p, rr->rr_col[c].rc_size);
1189
1190                         for (uint64_t i = ccnt; i < pcnt; i++) {
1191                                 p[i] = 0;
1192                                 q[i] = 0;
1193                                 r[i] = 0;
1194                         }
1195                 } else {
1196                         struct pqr_struct pqr = { p, q, r };
1197
1198                         ASSERT(ccnt <= pcnt);
1199                         (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
1200                             vdev_raidz_pqr_func, &pqr);
1201
1202                         /*
1203                          * Treat short columns as though they are full of 0s.
1204                          * Note that there's therefore nothing needed for P.
1205                          */
1206                         uint64_t mask;
1207                         for (uint64_t i = ccnt; i < pcnt; i++) {
1208                                 VDEV_RAIDZ_64MUL_2(q[i], mask);
1209                                 VDEV_RAIDZ_64MUL_4(r[i], mask);
1210                         }
1211                 }
1212         }
1213 }
1214
1215 /*
1216  * Generate RAID parity in the first virtual columns according to the number of
1217  * parity columns available.
1218  */
1219 void
1220 vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr)
1221 {
1222         if (rr->rr_cols == 0) {
1223                 /*
1224                  * We are handling this block one row at a time (because
1225                  * this block has a different logical vs physical width,
1226                  * due to RAIDZ expansion), and this is a pad-only row,
1227                  * which has no parity.
1228                  */
1229                 return;
1230         }
1231
1232         /* Generate using the new math implementation */
1233         if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL)
1234                 return;
1235
1236         switch (rr->rr_firstdatacol) {
1237         case 1:
1238                 vdev_raidz_generate_parity_p(rr);
1239                 break;
1240         case 2:
1241                 vdev_raidz_generate_parity_pq(rr);
1242                 break;
1243         case 3:
1244                 vdev_raidz_generate_parity_pqr(rr);
1245                 break;
1246         default:
1247                 cmn_err(CE_PANIC, "invalid RAID-Z configuration");
1248         }
1249 }
1250
1251 void
1252 vdev_raidz_generate_parity(raidz_map_t *rm)
1253 {
1254         for (int i = 0; i < rm->rm_nrows; i++) {
1255                 raidz_row_t *rr = rm->rm_row[i];
1256                 vdev_raidz_generate_parity_row(rm, rr);
1257         }
1258 }
1259
/*
 * abd_iterate_func2() callback used by vdev_raidz_reconstruct_p().
 *
 * XORs the whole 64-bit words of "sbuf" into "dbuf"; "size" is the chunk
 * length in bytes and any trailing partial word is left alone.  "private"
 * is unused.  Always returns 0.
 */
static int
vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
{
        uint64_t *out = dbuf;
        const uint64_t *in = sbuf;
        int nwords = size / sizeof (in[0]);

        (void) private;

        for (int left = nwords; left > 0; left--)
                *out++ ^= *in++;

        return (0);
}
1274
/*
 * abd_iterate_func2() callback used by vdev_raidz_reconstruct_q().
 *
 * For every whole 64-bit word of the chunk, runs VDEV_RAIDZ_64MUL_2() on
 * the destination word and then XORs the matching source word into it.
 * "size" is the chunk length in bytes; "private" is unused.  Always
 * returns 0.
 */
static int
vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
    void *private)
{
        uint64_t *acc = dbuf;
        uint64_t *data = sbuf;
        int nwords = size / sizeof (acc[0]);
        uint64_t mask;

        (void) private;

        for (int w = 0; w < nwords; w++) {
                VDEV_RAIDZ_64MUL_2(acc[w], mask);
                acc[w] ^= data[w];
        }

        return (0);
}
1292
/*
 * abd_iterate_func() callback used by vdev_raidz_reconstruct_q() for the
 * part of the destination that extends past the end of a source column.
 *
 * Does to each whole 64-bit word of "buf" what
 * vdev_raidz_reconst_q_pre_func() does to its destination, minus the XOR
 * with source data.  "size" is the chunk length in bytes; "private" is
 * unused.  Always returns 0.
 */
static int
vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
{
        uint64_t *acc = buf;
        int nwords = size / sizeof (acc[0]);
        uint64_t mask;

        (void) private;

        for (int w = 0; w < nwords; w++) {
                VDEV_RAIDZ_64MUL_2(acc[w], mask);
        }

        return (0);
}
1308
/*
 * State for vdev_raidz_reconst_q_post_func(): "q" is a read cursor into the
 * Q parity buffer that the callback advances, "exp" is the exponent handed
 * to vdev_raidz_exp2().  Keep the member order: the struct is filled in
 * with a positional initializer.
 */
struct reconst_q_struct {
        uint64_t *q;
        int exp;
};

/*
 * abd_iterate_func() callback that finishes vdev_raidz_reconstruct_q().
 *
 * For every whole 64-bit word of "buf": XOR in the next word of Q, then
 * replace each of the word's eight bytes with vdev_raidz_exp2(byte, exp).
 * The Q cursor in the reconst_q_struct passed via "private" advances one
 * word per word processed.  "size" is the chunk length in bytes.  Always
 * returns 0.
 */
static int
vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
{
        struct reconst_q_struct *rq = private;
        uint64_t *words = buf;
        int nwords = size / sizeof (words[0]);

        for (int w = 0; w < nwords; w++) {
                uint8_t *bytes = (uint8_t *)&words[w];

                words[w] ^= *rq->q;
                rq->q++;

                for (int k = 0; k < 8; k++)
                        bytes[k] = vdev_raidz_exp2(bytes[k], rq->exp);
        }

        return (0);
}
1333
/*
 * State for the vdev_raidz_reconst_pq_func() and
 * vdev_raidz_reconst_pq_tail_func() callbacks.  The four pointers are byte
 * cursors that the callbacks read through and advance in lock step; "aexp"
 * and "bexp" are the exponents passed to vdev_raidz_exp2() for the P and Q
 * terms respectively.  Member order is kept as declared originally.
 */
struct reconst_pq_struct {
        uint8_t *p, *q, *pxy, *qxy;
        int aexp, bexp;
};
1342
1343 static int
1344 vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
1345 {
1346         struct reconst_pq_struct *rpq = private;
1347         uint8_t *xd = xbuf;
1348         uint8_t *yd = ybuf;
1349
1350         for (int i = 0; i < size;
1351             i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
1352                 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
1353                     vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
1354                 *yd = *rpq->p ^ *rpq->pxy ^ *xd;
1355         }
1356
1357         return (0);
1358 }
1359
1360 static int
1361 vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
1362 {
1363         struct reconst_pq_struct *rpq = private;
1364         uint8_t *xd = xbuf;
1365
1366         for (int i = 0; i < size;
1367             i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
1368                 /* same operation as vdev_raidz_reconst_pq_func() on xd */
1369                 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
1370                     vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
1371         }
1372
1373         return (0);
1374 }
1375
1376 static void
1377 vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts)
1378 {
1379         int x = tgts[0];
1380         abd_t *dst, *src;
1381
1382         if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1383                 zfs_dbgmsg("reconstruct_p(rm=%px x=%u)", rr, x);
1384
1385         ASSERT3U(ntgts, ==, 1);
1386         ASSERT3U(x, >=, rr->rr_firstdatacol);
1387         ASSERT3U(x, <, rr->rr_cols);
1388
1389         ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size);
1390
1391         src = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
1392         dst = rr->rr_col[x].rc_abd;
1393
1394         abd_copy_from_buf(dst, abd_to_buf(src), rr->rr_col[x].rc_size);
1395
1396         for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1397                 uint64_t size = MIN(rr->rr_col[x].rc_size,
1398                     rr->rr_col[c].rc_size);
1399
1400                 src = rr->rr_col[c].rc_abd;
1401
1402                 if (c == x)
1403                         continue;
1404
1405                 (void) abd_iterate_func2(dst, src, 0, 0, size,
1406                     vdev_raidz_reconst_p_func, NULL);
1407         }
1408 }
1409
1410 static void
1411 vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts)
1412 {
1413         int x = tgts[0];
1414         int c, exp;
1415         abd_t *dst, *src;
1416
1417         if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1418                 zfs_dbgmsg("reconstruct_q(rm=%px x=%u)", rr, x);
1419
1420         ASSERT(ntgts == 1);
1421
1422         ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size);
1423
1424         for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1425                 uint64_t size = (c == x) ? 0 : MIN(rr->rr_col[x].rc_size,
1426                     rr->rr_col[c].rc_size);
1427
1428                 src = rr->rr_col[c].rc_abd;
1429                 dst = rr->rr_col[x].rc_abd;
1430
1431                 if (c == rr->rr_firstdatacol) {
1432                         abd_copy(dst, src, size);
1433                         if (rr->rr_col[x].rc_size > size) {
1434                                 abd_zero_off(dst, size,
1435                                     rr->rr_col[x].rc_size - size);
1436                         }
1437                 } else {
1438                         ASSERT3U(size, <=, rr->rr_col[x].rc_size);
1439                         (void) abd_iterate_func2(dst, src, 0, 0, size,
1440                             vdev_raidz_reconst_q_pre_func, NULL);
1441                         (void) abd_iterate_func(dst,
1442                             size, rr->rr_col[x].rc_size - size,
1443                             vdev_raidz_reconst_q_pre_tail_func, NULL);
1444                 }
1445         }
1446
1447         src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
1448         dst = rr->rr_col[x].rc_abd;
1449         exp = 255 - (rr->rr_cols - 1 - x);
1450
1451         struct reconst_q_struct rq = { abd_to_buf(src), exp };
1452         (void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size,
1453             vdev_raidz_reconst_q_post_func, &rq);
1454 }
1455
1456 static void
1457 vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts)
1458 {
1459         uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
1460         abd_t *pdata, *qdata;
1461         uint64_t xsize, ysize;
1462         int x = tgts[0];
1463         int y = tgts[1];
1464         abd_t *xd, *yd;
1465
1466         if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1467                 zfs_dbgmsg("reconstruct_pq(rm=%px x=%u y=%u)", rr, x, y);
1468
1469         ASSERT(ntgts == 2);
1470         ASSERT(x < y);
1471         ASSERT(x >= rr->rr_firstdatacol);
1472         ASSERT(y < rr->rr_cols);
1473
1474         ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size);
1475
1476         /*
1477          * Move the parity data aside -- we're going to compute parity as
1478          * though columns x and y were full of zeros -- Pxy and Qxy. We want to
1479          * reuse the parity generation mechanism without trashing the actual
1480          * parity so we make those columns appear to be full of zeros by
1481          * setting their lengths to zero.
1482          */
1483         pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
1484         qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
1485         xsize = rr->rr_col[x].rc_size;
1486         ysize = rr->rr_col[y].rc_size;
1487
1488         rr->rr_col[VDEV_RAIDZ_P].rc_abd =
1489             abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
1490         rr->rr_col[VDEV_RAIDZ_Q].rc_abd =
1491             abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
1492         rr->rr_col[x].rc_size = 0;
1493         rr->rr_col[y].rc_size = 0;
1494
1495         vdev_raidz_generate_parity_pq(rr);
1496
1497         rr->rr_col[x].rc_size = xsize;
1498         rr->rr_col[y].rc_size = ysize;
1499
1500         p = abd_to_buf(pdata);
1501         q = abd_to_buf(qdata);
1502         pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1503         qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
1504         xd = rr->rr_col[x].rc_abd;
1505         yd = rr->rr_col[y].rc_abd;
1506
1507         /*
1508          * We now have:
1509          *      Pxy = P + D_x + D_y
1510          *      Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
1511          *
1512          * We can then solve for D_x:
1513          *      D_x = A * (P + Pxy) + B * (Q + Qxy)
1514          * where
1515          *      A = 2^(x - y) * (2^(x - y) + 1)^-1
1516          *      B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
1517          *
1518          * With D_x in hand, we can easily solve for D_y:
1519          *      D_y = P + Pxy + D_x
1520          */
1521
1522         a = vdev_raidz_pow2[255 + x - y];
1523         b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)];
1524         tmp = 255 - vdev_raidz_log2[a ^ 1];
1525
1526         aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
1527         bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
1528
1529         ASSERT3U(xsize, >=, ysize);
1530         struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
1531
1532         (void) abd_iterate_func2(xd, yd, 0, 0, ysize,
1533             vdev_raidz_reconst_pq_func, &rpq);
1534         (void) abd_iterate_func(xd, ysize, xsize - ysize,
1535             vdev_raidz_reconst_pq_tail_func, &rpq);
1536
1537         abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1538         abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
1539
1540         /*
1541          * Restore the saved parity data.
1542          */
1543         rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata;
1544         rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata;
1545 }
1546
1547 /*
1548  * In the general case of reconstruction, we must solve the system of linear
1549  * equations defined by the coefficients used to generate parity as well as
1550  * the contents of the data and parity disks. This can be expressed with
1551  * vectors for the original data (D) and the actual data (d) and parity (p)
1552  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
1553  *
1554  *            __   __                     __     __
1555  *            |     |         __     __   |  p_0  |
1556  *            |  V  |         |  D_0  |   | p_m-1 |
1557  *            |     |    x    |   :   | = |  d_0  |
1558  *            |  I  |         | D_n-1 |   |   :   |
1559  *            |     |         ~~     ~~   | d_n-1 |
1560  *            ~~   ~~                     ~~     ~~
1561  *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen for simplicity, speedy
 * computation, and linear separability.
1566  *
1567  *      __               __               __     __
1568  *      |   1   ..  1 1 1 |               |  p_0  |
1569  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
1570  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
1571  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
1572  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
1573  *      |   :       : : : |   |   :   |   |  d_2  |
1574  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
1575  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
1576  *      |   0   ..  0 0 1 |               | d_n-1 |
1577  *      ~~               ~~               ~~     ~~
1578  *
1579  * Note that I, V, d, and p are known. To compute D, we must invert the
1580  * matrix and use the known data and parity values to reconstruct the unknown
1581  * data values. We begin by removing the rows in V|I and d|p that correspond
1582  * to failed or missing columns; we then make V|I square (n x n) and d|p
1583  * sized n by removing rows corresponding to unused parity from the bottom up
1584  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
1585  * using Gauss-Jordan elimination. In the example below we use m=3 parity
1586  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
1587  *           __                               __
1588  *           |  1   1   1   1   1   1   1   1  |
1589  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
1590  *           |  19 205 116  29  64  16  4   1  |      / /
1591  *           |  1   0   0   0   0   0   0   0  |     / /
1592  *           |  0   1   0   0   0   0   0   0  | <--' /
1593  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
1594  *           |  0   0   0   1   0   0   0   0  |
1595  *           |  0   0   0   0   1   0   0   0  |
1596  *           |  0   0   0   0   0   1   0   0  |
1597  *           |  0   0   0   0   0   0   1   0  |
1598  *           |  0   0   0   0   0   0   0   1  |
1599  *           ~~                               ~~
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           |  19 205 116  29  64  16  4   1  |
 *           |  1   0   0   0   0   0   0   0  |
 *  (V|I)' = |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
1613  *
1614  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1615  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1616  * matrix is not singular.
1617  * __                                                                 __
1618  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1619  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1620  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1621  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1622  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1623  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1624  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1625  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1626  * ~~                                                                 ~~
1627  * __                                                                 __
1628  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1629  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1630  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1631  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1632  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1633  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1634  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1635  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1636  * ~~                                                                 ~~
1637  * __                                                                 __
1638  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1639  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1640  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
1641  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1642  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1643  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1644  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1645  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1646  * ~~                                                                 ~~
1647  * __                                                                 __
1648  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1649  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1650  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
1651  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1652  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1653  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1654  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1655  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1656  * ~~                                                                 ~~
1657  * __                                                                 __
1658  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1659  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1660  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1661  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1662  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1663  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1664  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1665  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1666  * ~~                                                                 ~~
1667  * __                                                                 __
1668  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1669  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
1670  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1671  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1672  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1673  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1674  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1675  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1676  * ~~                                                                 ~~
1677  *                   __                               __
1678  *                   |  0   0   1   0   0   0   0   0  |
1679  *                   | 167 100  5   41 159 169 217 208 |
1680  *                   | 166 100  4   40 158 168 216 209 |
1681  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
1682  *                   |  0   0   0   0   1   0   0   0  |
1683  *                   |  0   0   0   0   0   1   0   0  |
1684  *                   |  0   0   0   0   0   0   1   0  |
1685  *                   |  0   0   0   0   0   0   0   1  |
1686  *                   ~~                               ~~
1687  *
1688  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1689  * of the missing data.
1690  *
1691  * As is apparent from the example above, the only non-trivial rows in the
1692  * inverse matrix correspond to the data disks that we're trying to
1693  * reconstruct. Indeed, those are the only rows we need as the others would
1694  * only be useful for reconstructing data known or assumed to be valid. For
1695  * that reason, we only build the coefficients in the rows that correspond to
1696  * targeted columns.
1697  */
1698
1699 static void
1700 vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map,
1701     uint8_t **rows)
1702 {
1703         int i, j;
1704         int pow;
1705
1706         ASSERT(n == rr->rr_cols - rr->rr_firstdatacol);
1707
1708         /*
1709          * Fill in the missing rows of interest.
1710          */
1711         for (i = 0; i < nmap; i++) {
1712                 ASSERT3S(0, <=, map[i]);
1713                 ASSERT3S(map[i], <=, 2);
1714
1715                 pow = map[i] * n;
1716                 if (pow > 255)
1717                         pow -= 255;
1718                 ASSERT(pow <= 255);
1719
1720                 for (j = 0; j < n; j++) {
1721                         pow -= map[i];
1722                         if (pow < 0)
1723                                 pow += 255;
1724                         rows[i][j] = vdev_raidz_pow2[pow];
1725                 }
1726         }
1727 }
1728
1729 static void
1730 vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing,
1731     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1732 {
1733         int i, j, ii, jj;
1734         uint8_t log;
1735
1736         /*
1737          * Assert that the first nmissing entries from the array of used
1738          * columns correspond to parity columns and that subsequent entries
1739          * correspond to data columns.
1740          */
1741         for (i = 0; i < nmissing; i++) {
1742                 ASSERT3S(used[i], <, rr->rr_firstdatacol);
1743         }
1744         for (; i < n; i++) {
1745                 ASSERT3S(used[i], >=, rr->rr_firstdatacol);
1746         }
1747
1748         /*
1749          * First initialize the storage where we'll compute the inverse rows.
1750          */
1751         for (i = 0; i < nmissing; i++) {
1752                 for (j = 0; j < n; j++) {
1753                         invrows[i][j] = (i == j) ? 1 : 0;
1754                 }
1755         }
1756
1757         /*
1758          * Subtract all trivial rows from the rows of consequence.
1759          */
1760         for (i = 0; i < nmissing; i++) {
1761                 for (j = nmissing; j < n; j++) {
1762                         ASSERT3U(used[j], >=, rr->rr_firstdatacol);
1763                         jj = used[j] - rr->rr_firstdatacol;
1764                         ASSERT3S(jj, <, n);
1765                         invrows[i][j] = rows[i][jj];
1766                         rows[i][jj] = 0;
1767                 }
1768         }
1769
1770         /*
1771          * For each of the rows of interest, we must normalize it and subtract
1772          * a multiple of it from the other rows.
1773          */
1774         for (i = 0; i < nmissing; i++) {
1775                 for (j = 0; j < missing[i]; j++) {
1776                         ASSERT0(rows[i][j]);
1777                 }
1778                 ASSERT3U(rows[i][missing[i]], !=, 0);
1779
1780                 /*
1781                  * Compute the inverse of the first element and multiply each
1782                  * element in the row by that value.
1783                  */
1784                 log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1785
1786                 for (j = 0; j < n; j++) {
1787                         rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1788                         invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1789                 }
1790
1791                 for (ii = 0; ii < nmissing; ii++) {
1792                         if (i == ii)
1793                                 continue;
1794
1795                         ASSERT3U(rows[ii][missing[i]], !=, 0);
1796
1797                         log = vdev_raidz_log2[rows[ii][missing[i]]];
1798
1799                         for (j = 0; j < n; j++) {
1800                                 rows[ii][j] ^=
1801                                     vdev_raidz_exp2(rows[i][j], log);
1802                                 invrows[ii][j] ^=
1803                                     vdev_raidz_exp2(invrows[i][j], log);
1804                         }
1805                 }
1806         }
1807
1808         /*
1809          * Verify that the data that is left in the rows are properly part of
1810          * an identity matrix.
1811          */
1812         for (i = 0; i < nmissing; i++) {
1813                 for (j = 0; j < n; j++) {
1814                         if (j == missing[i]) {
1815                                 ASSERT3U(rows[i][j], ==, 1);
1816                         } else {
1817                                 ASSERT0(rows[i][j]);
1818                         }
1819                 }
1820         }
1821 }
1822
1823 static void
1824 vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing,
1825     int *missing, uint8_t **invrows, const uint8_t *used)
1826 {
1827         int i, j, x, cc, c;
1828         uint8_t *src;
1829         uint64_t ccount;
1830         uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL };
1831         uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 };
1832         uint8_t log = 0;
1833         uint8_t val;
1834         int ll;
1835         uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1836         uint8_t *p, *pp;
1837         size_t psize;
1838
1839         psize = sizeof (invlog[0][0]) * n * nmissing;
1840         p = kmem_alloc(psize, KM_SLEEP);
1841
1842         for (pp = p, i = 0; i < nmissing; i++) {
1843                 invlog[i] = pp;
1844                 pp += n;
1845         }
1846
1847         for (i = 0; i < nmissing; i++) {
1848                 for (j = 0; j < n; j++) {
1849                         ASSERT3U(invrows[i][j], !=, 0);
1850                         invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1851                 }
1852         }
1853
1854         for (i = 0; i < n; i++) {
1855                 c = used[i];
1856                 ASSERT3U(c, <, rr->rr_cols);
1857
1858                 ccount = rr->rr_col[c].rc_size;
1859                 ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0);
1860                 if (ccount == 0)
1861                         continue;
1862                 src = abd_to_buf(rr->rr_col[c].rc_abd);
1863                 for (j = 0; j < nmissing; j++) {
1864                         cc = missing[j] + rr->rr_firstdatacol;
1865                         ASSERT3U(cc, >=, rr->rr_firstdatacol);
1866                         ASSERT3U(cc, <, rr->rr_cols);
1867                         ASSERT3U(cc, !=, c);
1868
1869                         dcount[j] = rr->rr_col[cc].rc_size;
1870                         if (dcount[j] != 0)
1871                                 dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd);
1872                 }
1873
1874                 for (x = 0; x < ccount; x++, src++) {
1875                         if (*src != 0)
1876                                 log = vdev_raidz_log2[*src];
1877
1878                         for (cc = 0; cc < nmissing; cc++) {
1879                                 if (x >= dcount[cc])
1880                                         continue;
1881
1882                                 if (*src == 0) {
1883                                         val = 0;
1884                                 } else {
1885                                         if ((ll = log + invlog[cc][i]) >= 255)
1886                                                 ll -= 255;
1887                                         val = vdev_raidz_pow2[ll];
1888                                 }
1889
1890                                 if (i == 0)
1891                                         dst[cc][x] = val;
1892                                 else
1893                                         dst[cc][x] ^= val;
1894                         }
1895                 }
1896         }
1897
1898         kmem_free(p, psize);
1899 }
1900
1901 static void
1902 vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts)
1903 {
1904         int i, c, t, tt;
1905         unsigned int n;
1906         unsigned int nmissing_rows;
1907         int missing_rows[VDEV_RAIDZ_MAXPARITY];
1908         int parity_map[VDEV_RAIDZ_MAXPARITY];
1909         uint8_t *p, *pp;
1910         size_t psize;
1911         uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1912         uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1913         uint8_t *used;
1914
1915         abd_t **bufs = NULL;
1916
1917         if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1918                 zfs_dbgmsg("reconstruct_general(rm=%px ntgts=%u)", rr, ntgts);
1919         /*
1920          * Matrix reconstruction can't use scatter ABDs yet, so we allocate
1921          * temporary linear ABDs if any non-linear ABDs are found.
1922          */
1923         for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) {
1924                 ASSERT(rr->rr_col[i].rc_abd != NULL);
1925                 if (!abd_is_linear(rr->rr_col[i].rc_abd)) {
1926                         bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *),
1927                             KM_PUSHPAGE);
1928
1929                         for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1930                                 raidz_col_t *col = &rr->rr_col[c];
1931
1932                                 bufs[c] = col->rc_abd;
1933                                 if (bufs[c] != NULL) {
1934                                         col->rc_abd = abd_alloc_linear(
1935                                             col->rc_size, B_TRUE);
1936                                         abd_copy(col->rc_abd, bufs[c],
1937                                             col->rc_size);
1938                                 }
1939                         }
1940
1941                         break;
1942                 }
1943         }
1944
1945         n = rr->rr_cols - rr->rr_firstdatacol;
1946
1947         /*
1948          * Figure out which data columns are missing.
1949          */
1950         nmissing_rows = 0;
1951         for (t = 0; t < ntgts; t++) {
1952                 if (tgts[t] >= rr->rr_firstdatacol) {
1953                         missing_rows[nmissing_rows++] =
1954                             tgts[t] - rr->rr_firstdatacol;
1955                 }
1956         }
1957
1958         /*
1959          * Figure out which parity columns to use to help generate the missing
1960          * data columns.
1961          */
1962         for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1963                 ASSERT(tt < ntgts);
1964                 ASSERT(c < rr->rr_firstdatacol);
1965
1966                 /*
1967                  * Skip any targeted parity columns.
1968                  */
1969                 if (c == tgts[tt]) {
1970                         tt++;
1971                         continue;
1972                 }
1973
1974                 parity_map[i] = c;
1975                 i++;
1976         }
1977
1978         psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1979             nmissing_rows * n + sizeof (used[0]) * n;
1980         p = kmem_alloc(psize, KM_SLEEP);
1981
1982         for (pp = p, i = 0; i < nmissing_rows; i++) {
1983                 rows[i] = pp;
1984                 pp += n;
1985                 invrows[i] = pp;
1986                 pp += n;
1987         }
1988         used = pp;
1989
1990         for (i = 0; i < nmissing_rows; i++) {
1991                 used[i] = parity_map[i];
1992         }
1993
1994         for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1995                 if (tt < nmissing_rows &&
1996                     c == missing_rows[tt] + rr->rr_firstdatacol) {
1997                         tt++;
1998                         continue;
1999                 }
2000
2001                 ASSERT3S(i, <, n);
2002                 used[i] = c;
2003                 i++;
2004         }
2005
2006         /*
2007          * Initialize the interesting rows of the matrix.
2008          */
2009         vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows);
2010
2011         /*
2012          * Invert the matrix.
2013          */
2014         vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows,
2015             invrows, used);
2016
2017         /*
2018          * Reconstruct the missing data using the generated matrix.
2019          */
2020         vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows,
2021             invrows, used);
2022
2023         kmem_free(p, psize);
2024
2025         /*
2026          * copy back from temporary linear abds and free them
2027          */
2028         if (bufs) {
2029                 for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
2030                         raidz_col_t *col = &rr->rr_col[c];
2031
2032                         if (bufs[c] != NULL) {
2033                                 abd_copy(bufs[c], col->rc_abd, col->rc_size);
2034                                 abd_free(col->rc_abd);
2035                         }
2036                         col->rc_abd = bufs[c];
2037                 }
2038                 kmem_free(bufs, rr->rr_cols * sizeof (abd_t *));
2039         }
2040 }
2041
2042 static void
2043 vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr,
2044     const int *t, int nt)
2045 {
2046         int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
2047         int ntgts;
2048         int i, c, ret;
2049         int nbadparity, nbaddata;
2050         int parity_valid[VDEV_RAIDZ_MAXPARITY];
2051
2052         if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
2053                 zfs_dbgmsg("reconstruct(rm=%px nt=%u cols=%u md=%u mp=%u)",
2054                     rr, nt, (int)rr->rr_cols, (int)rr->rr_missingdata,
2055                     (int)rr->rr_missingparity);
2056         }
2057
2058         nbadparity = rr->rr_firstdatacol;
2059         nbaddata = rr->rr_cols - nbadparity;
2060         ntgts = 0;
2061         for (i = 0, c = 0; c < rr->rr_cols; c++) {
2062                 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
2063                         zfs_dbgmsg("reconstruct(rm=%px col=%u devid=%u "
2064                             "offset=%llx error=%u)",
2065                             rr, c, (int)rr->rr_col[c].rc_devidx,
2066                             (long long)rr->rr_col[c].rc_offset,
2067                             (int)rr->rr_col[c].rc_error);
2068                 }
2069                 if (c < rr->rr_firstdatacol)
2070                         parity_valid[c] = B_FALSE;
2071
2072                 if (i < nt && c == t[i]) {
2073                         tgts[ntgts++] = c;
2074                         i++;
2075                 } else if (rr->rr_col[c].rc_error != 0) {
2076                         tgts[ntgts++] = c;
2077                 } else if (c >= rr->rr_firstdatacol) {
2078                         nbaddata--;
2079                 } else {
2080                         parity_valid[c] = B_TRUE;
2081                         nbadparity--;
2082                 }
2083         }
2084
2085         ASSERT(ntgts >= nt);
2086         ASSERT(nbaddata >= 0);
2087         ASSERT(nbaddata + nbadparity == ntgts);
2088
2089         dt = &tgts[nbadparity];
2090
2091         /* Reconstruct using the new math implementation */
2092         ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata);
2093         if (ret != RAIDZ_ORIGINAL_IMPL)
2094                 return;
2095
2096         /*
2097          * See if we can use any of our optimized reconstruction routines.
2098          */
2099         switch (nbaddata) {
2100         case 1:
2101                 if (parity_valid[VDEV_RAIDZ_P]) {
2102                         vdev_raidz_reconstruct_p(rr, dt, 1);
2103                         return;
2104                 }
2105
2106                 ASSERT(rr->rr_firstdatacol > 1);
2107
2108                 if (parity_valid[VDEV_RAIDZ_Q]) {
2109                         vdev_raidz_reconstruct_q(rr, dt, 1);
2110                         return;
2111                 }
2112
2113                 ASSERT(rr->rr_firstdatacol > 2);
2114                 break;
2115
2116         case 2:
2117                 ASSERT(rr->rr_firstdatacol > 1);
2118
2119                 if (parity_valid[VDEV_RAIDZ_P] &&
2120                     parity_valid[VDEV_RAIDZ_Q]) {
2121                         vdev_raidz_reconstruct_pq(rr, dt, 2);
2122                         return;
2123                 }
2124
2125                 ASSERT(rr->rr_firstdatacol > 2);
2126
2127                 break;
2128         }
2129
2130         vdev_raidz_reconstruct_general(rr, tgts, ntgts);
2131 }
2132
2133 static int
2134 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
2135     uint64_t *logical_ashift, uint64_t *physical_ashift)
2136 {
2137         vdev_raidz_t *vdrz = vd->vdev_tsd;
2138         uint64_t nparity = vdrz->vd_nparity;
2139         int c;
2140         int lasterror = 0;
2141         int numerrors = 0;
2142
2143         ASSERT(nparity > 0);
2144
2145         if (nparity > VDEV_RAIDZ_MAXPARITY ||
2146             vd->vdev_children < nparity + 1) {
2147                 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
2148                 return (SET_ERROR(EINVAL));
2149         }
2150
2151         vdev_open_children(vd);
2152
2153         for (c = 0; c < vd->vdev_children; c++) {
2154                 vdev_t *cvd = vd->vdev_child[c];
2155
2156                 if (cvd->vdev_open_error != 0) {
2157                         lasterror = cvd->vdev_open_error;
2158                         numerrors++;
2159                         continue;
2160                 }
2161
2162                 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
2163                 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
2164                 *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
2165         }
2166         for (c = 0; c < vd->vdev_children; c++) {
2167                 vdev_t *cvd = vd->vdev_child[c];
2168
2169                 if (cvd->vdev_open_error != 0)
2170                         continue;
2171                 *physical_ashift = vdev_best_ashift(*logical_ashift,
2172                     *physical_ashift, cvd->vdev_physical_ashift);
2173         }
2174
2175         if (vd->vdev_rz_expanding) {
2176                 *asize *= vd->vdev_children - 1;
2177                 *max_asize *= vd->vdev_children - 1;
2178
2179                 vd->vdev_min_asize = *asize;
2180         } else {
2181                 *asize *= vd->vdev_children;
2182                 *max_asize *= vd->vdev_children;
2183         }
2184
2185         if (numerrors > nparity) {
2186                 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
2187                 return (lasterror);
2188         }
2189
2190         return (0);
2191 }
2192
2193 static void
2194 vdev_raidz_close(vdev_t *vd)
2195 {
2196         for (int c = 0; c < vd->vdev_children; c++) {
2197                 if (vd->vdev_child[c] != NULL)
2198                         vdev_close(vd->vdev_child[c]);
2199         }
2200 }
2201
2202 /*
2203  * Return the logical width to use, given the txg in which the allocation
2204  * happened.  Note that BP_GET_BIRTH() is usually the txg in which the
2205  * BP was allocated.  Remapped BP's (that were relocated due to device
2206  * removal, see remap_blkptr_cb()), will have a more recent physical birth
2207  * which reflects when the BP was relocated, but we can ignore these because
2208  * they can't be on RAIDZ (device removal doesn't support RAIDZ).
2209  */
2210 static uint64_t
2211 vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg)
2212 {
2213         reflow_node_t lookup = {
2214                 .re_txg = txg,
2215         };
2216         avl_index_t where;
2217
2218         uint64_t width;
2219         mutex_enter(&vdrz->vd_expand_lock);
2220         reflow_node_t *re = avl_find(&vdrz->vd_expand_txgs, &lookup, &where);
2221         if (re != NULL) {
2222                 width = re->re_logical_width;
2223         } else {
2224                 re = avl_nearest(&vdrz->vd_expand_txgs, where, AVL_BEFORE);
2225                 if (re != NULL)
2226                         width = re->re_logical_width;
2227                 else
2228                         width = vdrz->vd_original_width;
2229         }
2230         mutex_exit(&vdrz->vd_expand_lock);
2231         return (width);
2232 }
2233
2234 /*
2235  * Note: If the RAIDZ vdev has been expanded, older BP's may have allocated
2236  * more space due to the lower data-to-parity ratio.  In this case it's
2237  * important to pass in the correct txg.  Note that vdev_gang_header_asize()
2238  * relies on a constant asize for psize=SPA_GANGBLOCKSIZE=SPA_MINBLOCKSIZE,
2239  * regardless of txg.  This is assured because for a single data sector, we
2240  * allocate P+1 sectors regardless of width ("cols", which is at least P+1).
2241  */
2242 static uint64_t
2243 vdev_raidz_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
2244 {
2245         vdev_raidz_t *vdrz = vd->vdev_tsd;
2246         uint64_t asize;
2247         uint64_t ashift = vd->vdev_top->vdev_ashift;
2248         uint64_t cols = vdrz->vd_original_width;
2249         uint64_t nparity = vdrz->vd_nparity;
2250
2251         cols = vdev_raidz_get_logical_width(vdrz, txg);
2252
2253         asize = ((psize - 1) >> ashift) + 1;
2254         asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
2255         asize = roundup(asize, nparity + 1) << ashift;
2256
2257 #ifdef ZFS_DEBUG
2258         uint64_t asize_new = ((psize - 1) >> ashift) + 1;
2259         uint64_t ncols_new = vdrz->vd_physical_width;
2260         asize_new += nparity * ((asize_new + ncols_new - nparity - 1) /
2261             (ncols_new - nparity));
2262         asize_new = roundup(asize_new, nparity + 1) << ashift;
2263         VERIFY3U(asize_new, <=, asize);
2264 #endif
2265
2266         return (asize);
2267 }
2268
2269 /*
2270  * The allocatable space for a raidz vdev is N * sizeof(smallest child)
2271  * so each child must provide at least 1/Nth of its asize.
2272  */
2273 static uint64_t
2274 vdev_raidz_min_asize(vdev_t *vd)
2275 {
2276         return ((vd->vdev_min_asize + vd->vdev_children - 1) /
2277             vd->vdev_children);
2278 }
2279
2280 void
2281 vdev_raidz_child_done(zio_t *zio)
2282 {
2283         raidz_col_t *rc = zio->io_private;
2284
2285         ASSERT3P(rc->rc_abd, !=, NULL);
2286         rc->rc_error = zio->io_error;
2287         rc->rc_tried = 1;
2288         rc->rc_skipped = 0;
2289 }
2290
2291 static void
2292 vdev_raidz_shadow_child_done(zio_t *zio)
2293 {
2294         raidz_col_t *rc = zio->io_private;
2295
2296         rc->rc_shadow_error = zio->io_error;
2297 }
2298
2299 static void
2300 vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col)
2301 {
2302         (void) rm;
2303 #ifdef ZFS_DEBUG
2304         range_seg64_t logical_rs, physical_rs, remain_rs;
2305         logical_rs.rs_start = rr->rr_offset;
2306         logical_rs.rs_end = logical_rs.rs_start +
2307             vdev_raidz_asize(zio->io_vd, rr->rr_size,
2308             BP_GET_BIRTH(zio->io_bp));
2309
2310         raidz_col_t *rc = &rr->rr_col[col];
2311         vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
2312
2313         vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs);
2314         ASSERT(vdev_xlate_is_empty(&remain_rs));
2315         if (vdev_xlate_is_empty(&physical_rs)) {
2316                 /*
2317                  * If we are in the middle of expansion, the
2318                  * physical->logical mapping is changing so vdev_xlate()
2319                  * can't give us a reliable answer.
2320                  */
2321                 return;
2322         }
2323         ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
2324         ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
2325         /*
2326          * It would be nice to assert that rs_end is equal
2327          * to rc_offset + rc_size but there might be an
2328          * optional I/O at the end that is not accounted in
2329          * rc_size.
2330          */
2331         if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) {
2332                 ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
2333                     rc->rc_size + (1 << zio->io_vd->vdev_top->vdev_ashift));
2334         } else {
2335                 ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
2336         }
2337 #endif
2338 }
2339
2340 static void
2341 vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr)
2342 {
2343         vdev_t *vd = zio->io_vd;
2344         raidz_map_t *rm = zio->io_vsd;
2345
2346         vdev_raidz_generate_parity_row(rm, rr);
2347
2348         for (int c = 0; c < rr->rr_scols; c++) {
2349                 raidz_col_t *rc = &rr->rr_col[c];
2350                 vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2351
2352                 /* Verify physical to logical translation */
2353                 vdev_raidz_io_verify(zio, rm, rr, c);
2354
2355                 if (rc->rc_size == 0)
2356                         continue;
2357
2358                 ASSERT3U(rc->rc_offset + rc->rc_size, <,
2359                     cvd->vdev_psize - VDEV_LABEL_END_SIZE);
2360
2361                 ASSERT3P(rc->rc_abd, !=, NULL);
2362                 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2363                     rc->rc_offset, rc->rc_abd,
2364                     abd_get_size(rc->rc_abd), zio->io_type,
2365                     zio->io_priority, 0, vdev_raidz_child_done, rc));
2366
2367                 if (rc->rc_shadow_devidx != INT_MAX) {
2368                         vdev_t *cvd2 = vd->vdev_child[rc->rc_shadow_devidx];
2369
2370                         ASSERT3U(
2371                             rc->rc_shadow_offset + abd_get_size(rc->rc_abd), <,
2372                             cvd2->vdev_psize - VDEV_LABEL_END_SIZE);
2373
2374                         zio_nowait(zio_vdev_child_io(zio, NULL, cvd2,
2375                             rc->rc_shadow_offset, rc->rc_abd,
2376                             abd_get_size(rc->rc_abd),
2377                             zio->io_type, zio->io_priority, 0,
2378                             vdev_raidz_shadow_child_done, rc));
2379                 }
2380         }
2381 }
2382
2383 /*
2384  * Generate optional I/Os for skip sectors to improve aggregation contiguity.
2385  * This only works for vdev_raidz_map_alloc() (not _expanded()).
2386  */
2387 static void
2388 raidz_start_skip_writes(zio_t *zio)
2389 {
2390         vdev_t *vd = zio->io_vd;
2391         uint64_t ashift = vd->vdev_top->vdev_ashift;
2392         raidz_map_t *rm = zio->io_vsd;
2393         ASSERT3U(rm->rm_nrows, ==, 1);
2394         raidz_row_t *rr = rm->rm_row[0];
2395         for (int c = 0; c < rr->rr_scols; c++) {
2396                 raidz_col_t *rc = &rr->rr_col[c];
2397                 vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2398                 if (rc->rc_size != 0)
2399                         continue;
2400                 ASSERT3P(rc->rc_abd, ==, NULL);
2401
2402                 ASSERT3U(rc->rc_offset, <,
2403                     cvd->vdev_psize - VDEV_LABEL_END_SIZE);
2404
2405                 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset,
2406                     NULL, 1ULL << ashift, zio->io_type, zio->io_priority,
2407                     ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
2408         }
2409 }
2410
2411 static void
2412 vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity)
2413 {
2414         vdev_t *vd = zio->io_vd;
2415
2416         /*
2417          * Iterate over the columns in reverse order so that we hit the parity
2418          * last -- any errors along the way will force us to read the parity.
2419          */
2420         for (int c = rr->rr_cols - 1; c >= 0; c--) {
2421                 raidz_col_t *rc = &rr->rr_col[c];
2422                 if (rc->rc_size == 0)
2423                         continue;
2424                 vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2425                 if (!vdev_readable(cvd)) {
2426                         if (c >= rr->rr_firstdatacol)
2427                                 rr->rr_missingdata++;
2428                         else
2429                                 rr->rr_missingparity++;
2430                         rc->rc_error = SET_ERROR(ENXIO);
2431                         rc->rc_tried = 1;       /* don't even try */
2432                         rc->rc_skipped = 1;
2433                         continue;
2434                 }
2435                 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
2436                         if (c >= rr->rr_firstdatacol)
2437                                 rr->rr_missingdata++;
2438                         else
2439                                 rr->rr_missingparity++;
2440                         rc->rc_error = SET_ERROR(ESTALE);
2441                         rc->rc_skipped = 1;
2442                         continue;
2443                 }
2444                 if (forceparity ||
2445                     c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 ||
2446                     (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
2447                         zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2448                             rc->rc_offset, rc->rc_abd, rc->rc_size,
2449                             zio->io_type, zio->io_priority, 0,
2450                             vdev_raidz_child_done, rc));
2451                 }
2452         }
2453 }
2454
2455 static void
2456 vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm)
2457 {
2458         vdev_t *vd = zio->io_vd;
2459
2460         for (int i = 0; i < rm->rm_nphys_cols; i++) {
2461                 raidz_col_t *prc = &rm->rm_phys_col[i];
2462                 if (prc->rc_size == 0)
2463                         continue;
2464
2465                 ASSERT3U(prc->rc_devidx, ==, i);
2466                 vdev_t *cvd = vd->vdev_child[i];
2467                 if (!vdev_readable(cvd)) {
2468                         prc->rc_error = SET_ERROR(ENXIO);
2469                         prc->rc_tried = 1;      /* don't even try */
2470                         prc->rc_skipped = 1;
2471                         continue;
2472                 }
2473                 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
2474                         prc->rc_error = SET_ERROR(ESTALE);
2475                         prc->rc_skipped = 1;
2476                         continue;
2477                 }
2478                 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2479                     prc->rc_offset, prc->rc_abd, prc->rc_size,
2480                     zio->io_type, zio->io_priority, 0,
2481                     vdev_raidz_child_done, prc));
2482         }
2483 }
2484
2485 static void
2486 vdev_raidz_io_start_read(zio_t *zio, raidz_map_t *rm)
2487 {
2488         /*
2489          * If there are multiple rows, we will be hitting
2490          * all disks, so go ahead and read the parity so
2491          * that we are reading in decent size chunks.
2492          */
2493         boolean_t forceparity = rm->rm_nrows > 1;
2494
2495         if (rm->rm_phys_col) {
2496                 vdev_raidz_io_start_read_phys_cols(zio, rm);
2497         } else {
2498                 for (int i = 0; i < rm->rm_nrows; i++) {
2499                         raidz_row_t *rr = rm->rm_row[i];
2500                         vdev_raidz_io_start_read_row(zio, rr, forceparity);
2501                 }
2502         }
2503 }
2504
2505 /*
2506  * Start an IO operation on a RAIDZ VDev
2507  *
2508  * Outline:
2509  * - For write operations:
2510  *   1. Generate the parity data
2511  *   2. Create child zio write operations to each column's vdev, for both
2512  *      data and parity.
2513  *   3. If the column skips any sectors for padding, create optional dummy
2514  *      write zio children for those areas to improve aggregation continuity.
2515  * - For read operations:
2516  *   1. Create child zio read operations to each data column's vdev to read
2517  *      the range of data required for zio.
2518  *   2. If this is a scrub or resilver operation, or if any of the data
2519  *      vdevs have had errors, then create zio read operations to the parity
2520  *      columns' VDevs as well.
2521  */
2522 static void
2523 vdev_raidz_io_start(zio_t *zio)
2524 {
2525         vdev_t *vd = zio->io_vd;
2526         vdev_t *tvd = vd->vdev_top;
2527         vdev_raidz_t *vdrz = vd->vdev_tsd;
2528         raidz_map_t *rm;
2529
2530         uint64_t logical_width = vdev_raidz_get_logical_width(vdrz,
2531             BP_GET_BIRTH(zio->io_bp));
2532         if (logical_width != vdrz->vd_physical_width) {
2533                 zfs_locked_range_t *lr = NULL;
2534                 uint64_t synced_offset = UINT64_MAX;
2535                 uint64_t next_offset = UINT64_MAX;
2536                 boolean_t use_scratch = B_FALSE;
2537                 /*
2538                  * Note: when the expansion is completing, we set
2539                  * vre_state=DSS_FINISHED (in raidz_reflow_complete_sync())
2540                  * in a later txg than when we last update spa_ubsync's state
2541                  * (see the end of spa_raidz_expand_thread()).  Therefore we
2542                  * may see vre_state!=SCANNING before
2543                  * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE=DSS_FINISHED is reflected
2544                  * on disk, but the copying progress has been synced to disk
2545                  * (and reflected in spa_ubsync).  In this case it's fine to
2546                  * treat the expansion as completed, since if we crash there's
2547                  * no additional copying to do.
2548                  */
2549                 if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
2550                         ASSERT3P(vd->vdev_spa->spa_raidz_expand, ==,
2551                             &vdrz->vn_vre);
2552                         lr = zfs_rangelock_enter(&vdrz->vn_vre.vre_rangelock,
2553                             zio->io_offset, zio->io_size, RL_READER);
2554                         use_scratch =
2555                             (RRSS_GET_STATE(&vd->vdev_spa->spa_ubsync) ==
2556                             RRSS_SCRATCH_VALID);
2557                         synced_offset =
2558                             RRSS_GET_OFFSET(&vd->vdev_spa->spa_ubsync);
2559                         next_offset = vdrz->vn_vre.vre_offset;
2560                         /*
2561                          * If we haven't resumed expanding since importing the
2562                          * pool, vre_offset won't have been set yet.  In
2563                          * this case the next offset to be copied is the same
2564                          * as what was synced.
2565                          */
2566                         if (next_offset == UINT64_MAX) {
2567                                 next_offset = synced_offset;
2568                         }
2569                 }
2570                 if (use_scratch) {
2571                         zfs_dbgmsg("zio=%px %s io_offset=%llu offset_synced="
2572                             "%lld next_offset=%lld use_scratch=%u",
2573                             zio,
2574                             zio->io_type == ZIO_TYPE_WRITE ? "WRITE" : "READ",
2575                             (long long)zio->io_offset,
2576                             (long long)synced_offset,
2577                             (long long)next_offset,
2578                             use_scratch);
2579                 }
2580
2581                 rm = vdev_raidz_map_alloc_expanded(zio,
2582                     tvd->vdev_ashift, vdrz->vd_physical_width,
2583                     logical_width, vdrz->vd_nparity,
2584                     synced_offset, next_offset, use_scratch);
2585                 rm->rm_lr = lr;
2586         } else {
2587                 rm = vdev_raidz_map_alloc(zio,
2588                     tvd->vdev_ashift, logical_width, vdrz->vd_nparity);
2589         }
2590         rm->rm_original_width = vdrz->vd_original_width;
2591
2592         zio->io_vsd = rm;
2593         zio->io_vsd_ops = &vdev_raidz_vsd_ops;
2594         if (zio->io_type == ZIO_TYPE_WRITE) {
2595                 for (int i = 0; i < rm->rm_nrows; i++) {
2596                         vdev_raidz_io_start_write(zio, rm->rm_row[i]);
2597                 }
2598
2599                 if (logical_width == vdrz->vd_physical_width) {
2600                         raidz_start_skip_writes(zio);
2601                 }
2602         } else {
2603                 ASSERT(zio->io_type == ZIO_TYPE_READ);
2604                 vdev_raidz_io_start_read(zio, rm);
2605         }
2606
2607         zio_execute(zio);
2608 }
2609
2610 /*
2611  * Report a checksum error for a child of a RAID-Z device.
2612  */
2613 void
2614 vdev_raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
2615 {
2616         vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
2617
2618         if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE) &&
2619             zio->io_priority != ZIO_PRIORITY_REBUILD) {
2620                 zio_bad_cksum_t zbc;
2621                 raidz_map_t *rm = zio->io_vsd;
2622
2623                 zbc.zbc_has_cksum = 0;
2624                 zbc.zbc_injected = rm->rm_ecksuminjected;
2625
2626                 mutex_enter(&vd->vdev_stat_lock);
2627                 vd->vdev_stat.vs_checksum_errors++;
2628                 mutex_exit(&vd->vdev_stat_lock);
2629                 (void) zfs_ereport_post_checksum(zio->io_spa, vd,
2630                     &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size,
2631                     rc->rc_abd, bad_data, &zbc);
2632         }
2633 }
2634
2635 /*
2636  * We keep track of whether or not there were any injected errors, so that
2637  * any ereports we generate can note it.
2638  */
2639 static int
2640 raidz_checksum_verify(zio_t *zio)
2641 {
2642         zio_bad_cksum_t zbc = {0};
2643         raidz_map_t *rm = zio->io_vsd;
2644
2645         int ret = zio_checksum_error(zio, &zbc);
2646         /*
2647          * Any Direct I/O read that has a checksum error must be treated as
2648          * suspicious as the contents of the buffer could be getting
2649          * manipulated while the I/O is taking place. The checksum verify error
2650          * will be reported to the top-level RAIDZ VDEV.
2651          */
2652         if (zio->io_flags & ZIO_FLAG_DIO_READ && ret == ECKSUM) {
2653                 zio->io_error = ret;
2654                 zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;
2655                 zio_dio_chksum_verify_error_report(zio);
2656                 zio_checksum_verified(zio);
2657                 return (0);
2658         }
2659
2660         if (ret != 0 && zbc.zbc_injected != 0)
2661                 rm->rm_ecksuminjected = 1;
2662
2663         return (ret);
2664 }
2665
2666 /*
2667  * Generate the parity from the data columns. If we tried and were able to
2668  * read the parity without error, verify that the generated parity matches the
2669  * data we read. If it doesn't, we fire off a checksum error. Return the
2670  * number of such failures.
2671  */
2672 static int
2673 raidz_parity_verify(zio_t *zio, raidz_row_t *rr)
2674 {
2675         abd_t *orig[VDEV_RAIDZ_MAXPARITY];
2676         int c, ret = 0;
2677         raidz_map_t *rm = zio->io_vsd;
2678         raidz_col_t *rc;
2679
2680         blkptr_t *bp = zio->io_bp;
2681         enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
2682             (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
2683
2684         if (checksum == ZIO_CHECKSUM_NOPARITY)
2685                 return (ret);
2686
2687         for (c = 0; c < rr->rr_firstdatacol; c++) {
2688                 rc = &rr->rr_col[c];
2689                 if (!rc->rc_tried || rc->rc_error != 0)
2690                         continue;
2691
2692                 orig[c] = rc->rc_abd;
2693                 ASSERT3U(abd_get_size(rc->rc_abd), ==, rc->rc_size);
2694                 rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
2695         }
2696
2697         /*
2698          * Verify any empty sectors are zero filled to ensure the parity
2699          * is calculated correctly even if these non-data sectors are damaged.
2700          */
2701         if (rr->rr_nempty && rr->rr_abd_empty != NULL)
2702                 ret += vdev_draid_map_verify_empty(zio, rr);
2703
2704         /*
2705          * Regenerates parity even for !tried||rc_error!=0 columns.  This
2706          * isn't harmful but it does have the side effect of fixing stuff
2707          * we didn't realize was necessary (i.e. even if we return 0).
2708          */
2709         vdev_raidz_generate_parity_row(rm, rr);
2710
2711         for (c = 0; c < rr->rr_firstdatacol; c++) {
2712                 rc = &rr->rr_col[c];
2713
2714                 if (!rc->rc_tried || rc->rc_error != 0)
2715                         continue;
2716
2717                 if (abd_cmp(orig[c], rc->rc_abd) != 0) {
2718                         zfs_dbgmsg("found error on col=%u devidx=%u off %llx",
2719                             c, (int)rc->rc_devidx, (u_longlong_t)rc->rc_offset);
2720                         vdev_raidz_checksum_error(zio, rc, orig[c]);
2721                         rc->rc_error = SET_ERROR(ECKSUM);
2722                         ret++;
2723                 }
2724                 abd_free(orig[c]);
2725         }
2726
2727         return (ret);
2728 }
2729
2730 static int
2731 vdev_raidz_worst_error(raidz_row_t *rr)
2732 {
2733         int error = 0;
2734
2735         for (int c = 0; c < rr->rr_cols; c++) {
2736                 error = zio_worst_error(error, rr->rr_col[c].rc_error);
2737                 error = zio_worst_error(error, rr->rr_col[c].rc_shadow_error);
2738         }
2739
2740         return (error);
2741 }
2742
2743 static void
2744 vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
2745 {
2746         int unexpected_errors = 0;
2747         int parity_errors = 0;
2748         int parity_untried = 0;
2749         int data_errors = 0;
2750
2751         ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
2752
2753         for (int c = 0; c < rr->rr_cols; c++) {
2754                 raidz_col_t *rc = &rr->rr_col[c];
2755
2756                 if (rc->rc_error) {
2757                         if (c < rr->rr_firstdatacol)
2758                                 parity_errors++;
2759                         else
2760                                 data_errors++;
2761
2762                         if (!rc->rc_skipped)
2763                                 unexpected_errors++;
2764                 } else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
2765                         parity_untried++;
2766                 }
2767
2768                 if (rc->rc_force_repair)
2769                         unexpected_errors++;
2770         }
2771
2772         /*
2773          * If we read more parity disks than were used for
2774          * reconstruction, confirm that the other parity disks produced
2775          * correct data.
2776          *
2777          * Note that we also regenerate parity when resilvering so we
2778          * can write it out to failed devices later.
2779          */
2780         if (parity_errors + parity_untried <
2781             rr->rr_firstdatacol - data_errors ||
2782             (zio->io_flags & ZIO_FLAG_RESILVER)) {
2783                 int n = raidz_parity_verify(zio, rr);
2784                 unexpected_errors += n;
2785         }
2786
2787         if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2788             (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2789                 /*
2790                  * Use the good data we have in hand to repair damaged children.
2791                  */
2792                 for (int c = 0; c < rr->rr_cols; c++) {
2793                         raidz_col_t *rc = &rr->rr_col[c];
2794                         vdev_t *vd = zio->io_vd;
2795                         vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2796
2797                         if (!rc->rc_allow_repair) {
2798                                 continue;
2799                         } else if (!rc->rc_force_repair &&
2800                             (rc->rc_error == 0 || rc->rc_size == 0)) {
2801                                 continue;
2802                         }
2803                         /*
2804                          * We do not allow self healing for Direct I/O reads.
2805                          * See comment in vdev_raid_row_alloc().
2806                          */
2807                         ASSERT0(zio->io_flags & ZIO_FLAG_DIO_READ);
2808
2809                         zfs_dbgmsg("zio=%px repairing c=%u devidx=%u "
2810                             "offset=%llx",
2811                             zio, c, rc->rc_devidx, (long long)rc->rc_offset);
2812
2813                         zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2814                             rc->rc_offset, rc->rc_abd, rc->rc_size,
2815                             ZIO_TYPE_WRITE,
2816                             zio->io_priority == ZIO_PRIORITY_REBUILD ?
2817                             ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE,
2818                             ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2819                             ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2820                 }
2821         }
2822
2823         /*
2824          * Scrub or resilver i/o's: overwrite any shadow locations with the
2825          * good data.  This ensures that if we've already copied this sector,
2826          * it will be corrected if it was damaged.  This writes more than is
2827          * necessary, but since expansion is paused during scrub/resilver, at
2828          * most a single row will have a shadow location.
2829          */
2830         if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2831             (zio->io_flags & (ZIO_FLAG_RESILVER | ZIO_FLAG_SCRUB))) {
2832                 for (int c = 0; c < rr->rr_cols; c++) {
2833                         raidz_col_t *rc = &rr->rr_col[c];
2834                         vdev_t *vd = zio->io_vd;
2835
2836                         if (rc->rc_shadow_devidx == INT_MAX || rc->rc_size == 0)
2837                                 continue;
2838                         vdev_t *cvd = vd->vdev_child[rc->rc_shadow_devidx];
2839
2840                         /*
2841                          * Note: We don't want to update the repair stats
2842                          * because that would incorrectly indicate that there
2843                          * was bad data to repair, which we aren't sure about.
2844                          * By clearing the SCAN_THREAD flag, we prevent this
2845                          * from happening, despite having the REPAIR flag set.
2846                          * We need to set SELF_HEAL so that this i/o can't be
2847                          * bypassed by zio_vdev_io_start().
2848                          */
2849                         zio_t *cio = zio_vdev_child_io(zio, NULL, cvd,
2850                             rc->rc_shadow_offset, rc->rc_abd, rc->rc_size,
2851                             ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
2852                             ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL,
2853                             NULL, NULL);
2854                         cio->io_flags &= ~ZIO_FLAG_SCAN_THREAD;
2855                         zio_nowait(cio);
2856                 }
2857         }
2858 }
2859
2860 static void
2861 raidz_restore_orig_data(raidz_map_t *rm)
2862 {
2863         for (int i = 0; i < rm->rm_nrows; i++) {
2864                 raidz_row_t *rr = rm->rm_row[i];
2865                 for (int c = 0; c < rr->rr_cols; c++) {
2866                         raidz_col_t *rc = &rr->rr_col[c];
2867                         if (rc->rc_need_orig_restore) {
2868                                 abd_copy(rc->rc_abd,
2869                                     rc->rc_orig_data, rc->rc_size);
2870                                 rc->rc_need_orig_restore = B_FALSE;
2871                         }
2872                 }
2873         }
2874 }
2875
2876 /*
2877  * During raidz_reconstruct() for expanded VDEV, we need special consideration
2878  * failure simulations.  See note in raidz_reconstruct() on simulating failure
2879  * of a pre-expansion device.
2880  *
2881  * Treating logical child i as failed, return TRUE if the given column should
2882  * be treated as failed.  The idea of logical children allows us to imagine
2883  * that a disk silently failed before a RAIDZ expansion (reads from this disk
2884  * succeed but return the wrong data).  Since the expansion doesn't verify
2885  * checksums, the incorrect data will be moved to new locations spread among
2886  * the children (going diagonally across them).
2887  *
2888  * Higher "logical child failures" (values of `i`) indicate these
2889  * "pre-expansion failures".  The first physical_width values imagine that a
2890  * current child failed; the next physical_width-1 values imagine that a
2891  * child failed before the most recent expansion; the next physical_width-2
2892  * values imagine a child failed in the expansion before that, etc.
2893  */
2894 static boolean_t
2895 raidz_simulate_failure(int physical_width, int original_width, int ashift,
2896     int i, raidz_col_t *rc)
2897 {
2898         uint64_t sector_id =
2899             physical_width * (rc->rc_offset >> ashift) +
2900             rc->rc_devidx;
2901
2902         for (int w = physical_width; w >= original_width; w--) {
2903                 if (i < w) {
2904                         return (sector_id % w == i);
2905                 } else {
2906                         i -= w;
2907                 }
2908         }
2909         ASSERT(!"invalid logical child id");
2910         return (B_FALSE);
2911 }
2912
2913 /*
2914  * returns EINVAL if reconstruction of the block will not be possible
2915  * returns ECKSUM if this specific reconstruction failed
2916  * returns 0 on successful reconstruction
2917  */
2918 static int
2919 raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
2920 {
2921         raidz_map_t *rm = zio->io_vsd;
2922         int physical_width = zio->io_vd->vdev_children;
2923         int original_width = (rm->rm_original_width != 0) ?
2924             rm->rm_original_width : physical_width;
2925         int dbgmsg = zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT;
2926
2927         if (dbgmsg) {
2928                 zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px ltgts=%u,%u,%u "
2929                     "ntgts=%u", zio, ltgts[0], ltgts[1], ltgts[2], ntgts);
2930         }
2931
2932         /* Reconstruct each row */
2933         for (int r = 0; r < rm->rm_nrows; r++) {
2934                 raidz_row_t *rr = rm->rm_row[r];
2935                 int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */
2936                 int t = 0;
2937                 int dead = 0;
2938                 int dead_data = 0;
2939
2940                 if (dbgmsg)
2941                         zfs_dbgmsg("raidz_reconstruct_expanded(row=%u)", r);
2942
2943                 for (int c = 0; c < rr->rr_cols; c++) {
2944                         raidz_col_t *rc = &rr->rr_col[c];
2945                         ASSERT0(rc->rc_need_orig_restore);
2946                         if (rc->rc_error != 0) {
2947                                 dead++;
2948                                 if (c >= nparity)
2949                                         dead_data++;
2950                                 continue;
2951                         }
2952                         if (rc->rc_size == 0)
2953                                 continue;
2954                         for (int lt = 0; lt < ntgts; lt++) {
2955                                 if (raidz_simulate_failure(physical_width,
2956                                     original_width,
2957                                     zio->io_vd->vdev_top->vdev_ashift,
2958                                     ltgts[lt], rc)) {
2959                                         if (rc->rc_orig_data == NULL) {
2960                                                 rc->rc_orig_data =
2961                                                     abd_alloc_linear(
2962                                                     rc->rc_size, B_TRUE);
2963                                                 abd_copy(rc->rc_orig_data,
2964                                                     rc->rc_abd, rc->rc_size);
2965                                         }
2966                                         rc->rc_need_orig_restore = B_TRUE;
2967
2968                                         dead++;
2969                                         if (c >= nparity)
2970                                                 dead_data++;
2971                                         /*
2972                                          * Note: simulating failure of a
2973                                          * pre-expansion device can hit more
2974                                          * than one column, in which case we
2975                                          * might try to simulate more failures
2976                                          * than can be reconstructed, which is
2977                                          * also more than the size of my_tgts.
2978                                          * This check prevents accessing past
2979                                          * the end of my_tgts.  The "dead >
2980                                          * nparity" check below will fail this
2981                                          * reconstruction attempt.
2982                                          */
2983                                         if (t < VDEV_RAIDZ_MAXPARITY) {
2984                                                 my_tgts[t++] = c;
2985                                                 if (dbgmsg) {
2986                                                         zfs_dbgmsg("simulating "
2987                                                             "failure of col %u "
2988                                                             "devidx %u", c,
2989                                                             (int)rc->rc_devidx);
2990                                                 }
2991                                         }
2992                                         break;
2993                                 }
2994                         }
2995                 }
2996                 if (dead > nparity) {
2997                         /* reconstruction not possible */
2998                         if (dbgmsg) {
2999                                 zfs_dbgmsg("reconstruction not possible; "
3000                                     "too many failures");
3001                         }
3002                         raidz_restore_orig_data(rm);
3003                         return (EINVAL);
3004                 }
3005                 if (dead_data > 0)
3006                         vdev_raidz_reconstruct_row(rm, rr, my_tgts, t);
3007         }
3008
3009         /* Check for success */
3010         if (raidz_checksum_verify(zio) == 0) {
3011                 if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)
3012                         return (0);
3013
3014                 /* Reconstruction succeeded - report errors */
3015                 for (int i = 0; i < rm->rm_nrows; i++) {
3016                         raidz_row_t *rr = rm->rm_row[i];
3017
3018                         for (int c = 0; c < rr->rr_cols; c++) {
3019                                 raidz_col_t *rc = &rr->rr_col[c];
3020                                 if (rc->rc_need_orig_restore) {
3021                                         /*
3022                                          * Note: if this is a parity column,
3023                                          * we don't really know if it's wrong.
3024                                          * We need to let
3025                                          * vdev_raidz_io_done_verified() check
3026                                          * it, and if we set rc_error, it will
3027                                          * think that it is a "known" error
3028                                          * that doesn't need to be checked
3029                                          * or corrected.
3030                                          */
3031                                         if (rc->rc_error == 0 &&
3032                                             c >= rr->rr_firstdatacol) {
3033                                                 vdev_raidz_checksum_error(zio,
3034                                                     rc, rc->rc_orig_data);
3035                                                 rc->rc_error =
3036                                                     SET_ERROR(ECKSUM);
3037                                         }
3038                                         rc->rc_need_orig_restore = B_FALSE;
3039                                 }
3040                         }
3041
3042                         vdev_raidz_io_done_verified(zio, rr);
3043                 }
3044
3045                 zio_checksum_verified(zio);
3046
3047                 if (dbgmsg) {
3048                         zfs_dbgmsg("reconstruction successful "
3049                             "(checksum verified)");
3050                 }
3051                 return (0);
3052         }
3053
3054         /* Reconstruction failed - restore original data */
3055         raidz_restore_orig_data(rm);
3056         if (dbgmsg) {
3057                 zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px) checksum "
3058                     "failed", zio);
3059         }
3060         return (ECKSUM);
3061 }
3062
3063 /*
3064  * Iterate over all combinations of N bad vdevs and attempt a reconstruction.
3065  * Note that the algorithm below is non-optimal because it doesn't take into
3066  * account how reconstruction is actually performed. For example, with
3067  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
3068  * is targeted as invalid as if columns 1 and 4 are targeted since in both
3069  * cases we'd only use parity information in column 0.
3070  *
3071  * The order that we find the various possible combinations of failed
3072  * disks is dictated by these rules:
3073  * - Examine each "slot" (the "i" in tgts[i])
3074  *   - Try to increment this slot (tgts[i] += 1)
3075  *   - if we can't increment because it runs into the next slot,
3076  *     reset our slot to the minimum, and examine the next slot
3077  *
3078  *  For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose
3079  *  3 columns to reconstruct), we will generate the following sequence:
3080  *
3081  *  STATE        ACTION
3082  *  0 1 2        special case: skip since these are all parity
3083  *  0 1   3      first slot: reset to 0; middle slot: increment to 2
3084  *  0   2 3      first slot: increment to 1
3085  *    1 2 3      first: reset to 0; middle: reset to 1; last: increment to 4
3086  *  0 1     4    first: reset to 0; middle: increment to 2
3087  *  0   2   4    first: increment to 1
3088  *    1 2   4    first: reset to 0; middle: increment to 3
3089  *  0     3 4    first: increment to 1
3090  *    1   3 4    first: increment to 2
3091  *      2 3 4    first: reset to 0; middle: reset to 1; last: increment to 5
3092  *  0 1       5  first: reset to 0; middle: increment to 2
3093  *  0   2     5  first: increment to 1
3094  *    1 2     5  first: reset to 0; middle: increment to 3
3095  *  0     3   5  first: increment to 1
3096  *    1   3   5  first: increment to 2
3097  *      2 3   5  first: reset to 0; middle: increment to 4
3098  *  0       4 5  first: increment to 1
3099  *    1     4 5  first: increment to 2
3100  *      2   4 5  first: increment to 3
3101  *        3 4 5  done
3102  *
3103  * This strategy works for dRAID but is less efficient when there are a large
3104  * number of child vdevs and therefore permutations to check. Furthermore,
3105  * since the raidz_map_t rows likely do not overlap, reconstruction would be
3106  * possible as long as there are no more than nparity data errors per row.
3107  * These additional permutations are not currently checked but could be as
3108  * a future improvement.
3109  *
3110  * Returns 0 on success; otherwise ECKSUM or a row's vdev_raidz_worst_error().
3111  */
3112 static int
3113 vdev_raidz_combrec(zio_t *zio)
3114 {
3115         int nparity = vdev_get_nparity(zio->io_vd);
3116         raidz_map_t *rm = zio->io_vsd;
3117         int physical_width = zio->io_vd->vdev_children;
3118         int original_width = (rm->rm_original_width != 0) ?
3119             rm->rm_original_width : physical_width;
3120
3121         for (int i = 0; i < rm->rm_nrows; i++) {
3122                 raidz_row_t *rr = rm->rm_row[i];
3123                 int total_errors = 0;
3124
3125                 for (int c = 0; c < rr->rr_cols; c++) {
3126                         if (rr->rr_col[c].rc_error)
3127                                 total_errors++;
3128                 }
3129
3130                 if (total_errors > nparity)
3131                         return (vdev_raidz_worst_error(rr));
3132         }
3133
3134         for (int num_failures = 1; num_failures <= nparity; num_failures++) {
3135                 int tstore[VDEV_RAIDZ_MAXPARITY + 2];
3136                 int *ltgts = &tstore[1]; /* value is logical child ID */
3137
3138
3139                 /*
3140                  * Determine number of logical children, n.  See comment
3141                  * above raidz_simulate_failure().
3142                  */
3143                 int n = 0;
3144                 for (int w = physical_width;
3145                     w >= original_width; w--) {
3146                         n += w;
3147                 }
3148
3149                 ASSERT3U(num_failures, <=, nparity);
3150                 ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY);
3151
3152                 /* Handle corner cases in combrec logic */
3153                 ltgts[-1] = -1;
3154                 for (int i = 0; i < num_failures; i++) {
3155                         ltgts[i] = i;
3156                 }
3157                 ltgts[num_failures] = n;
3158
3159                 for (;;) {
3160                         int err = raidz_reconstruct(zio, ltgts, num_failures,
3161                             nparity);
3162                         if (err == EINVAL) {
3163                                 /*
3164                                  * Reconstruction not possible with this #
3165                                  * failures; try more failures.
3166                                  */
3167                                 break;
3168                         } else if (err == 0)
3169                                 return (0);
3170
3171                         /* Compute next targets to try */
3172                         for (int t = 0; ; t++) {
3173                                 ASSERT3U(t, <, num_failures);
3174                                 ltgts[t]++;
3175                                 if (ltgts[t] == n) {
3176                                         /* try more failures */
3177                                         ASSERT3U(t, ==, num_failures - 1);
3178                                         if (zfs_flags &
3179                                             ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
3180                                                 zfs_dbgmsg("reconstruction "
3181                                                     "failed for num_failures="
3182                                                     "%u; tried all "
3183                                                     "combinations",
3184                                                     num_failures);
3185                                         }
3186                                         break;
3187                                 }
3188
3189                                 ASSERT3U(ltgts[t], <, n);
3190                                 ASSERT3U(ltgts[t], <=, ltgts[t + 1]);
3191
3192                                 /*
3193                                  * If that spot is available, we're done here.
3194                                  * Try the next combination.
3195                                  */
3196                                 if (ltgts[t] != ltgts[t + 1])
3197                                         break; // found next combination
3198
3199                                 /*
3200                                  * Otherwise, reset this tgt to the minimum,
3201                                  * and move on to the next tgt.
3202                                  */
3203                                 ltgts[t] = ltgts[t - 1] + 1;
3204                                 ASSERT3U(ltgts[t], ==, t);
3205                         }
3206
3207                         /* Increase the number of failures and keep trying. */
3208                         if (ltgts[num_failures - 1] == n)
3209                                 break;
3210                 }
3211         }
3212         if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
3213                 zfs_dbgmsg("reconstruction failed for all num_failures");
3214         return (ECKSUM);
3215 }
3216
3217 void
3218 vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt)
3219 {
3220         for (uint64_t row = 0; row < rm->rm_nrows; row++) {
3221                 raidz_row_t *rr = rm->rm_row[row];
3222                 vdev_raidz_reconstruct_row(rm, rr, t, nt);
3223         }
3224 }
3225
3226 /*
3227  * Complete a write IO operation on a RAIDZ VDev
3228  *
3229  * Outline:
3230  *   1. Check for errors on the child IOs.
3231  *   2. Return, setting an error code if too few child VDevs were written
3232  *      to reconstruct the data later.  Note that partial writes are
3233  *      considered successful if they can be reconstructed at all.
3234  */
3235 static void
3236 vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr)
3237 {
3238         int normal_errors = 0;
3239         int shadow_errors = 0;
3240
3241         ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
3242         ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
3243         ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
3244
3245         for (int c = 0; c < rr->rr_cols; c++) {
3246                 raidz_col_t *rc = &rr->rr_col[c];
3247
3248                 if (rc->rc_error != 0) {
3249                         ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
3250                         normal_errors++;
3251                 }
3252                 if (rc->rc_shadow_error != 0) {
3253                         ASSERT(rc->rc_shadow_error != ECKSUM);
3254                         shadow_errors++;
3255                 }
3256         }
3257
3258         /*
3259          * Treat partial writes as a success. If we couldn't write enough
3260          * columns to reconstruct the data, the I/O failed.  Otherwise, good
3261          * enough.  Note that in the case of a shadow write (during raidz
3262          * expansion), depending on if we crash, either the normal (old) or
3263          * shadow (new) location may become the "real" version of the block,
3264          * so both locations must have sufficient redundancy.
3265          *
3266          * Now that we support write reallocation, it would be better
3267          * to treat partial failure as real failure unless there are
3268          * no non-degraded top-level vdevs left, and not update DTLs
3269          * if we intend to reallocate.
3270          */
3271         if (normal_errors > rr->rr_firstdatacol ||
3272             shadow_errors > rr->rr_firstdatacol) {
3273                 zio->io_error = zio_worst_error(zio->io_error,
3274                     vdev_raidz_worst_error(rr));
3275         }
3276 }
3277
3278 static void
3279 vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm,
3280     raidz_row_t *rr)
3281 {
3282         int parity_errors = 0;
3283         int parity_untried = 0;
3284         int data_errors = 0;
3285         int total_errors = 0;
3286
3287         ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
3288         ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
3289
3290         for (int c = 0; c < rr->rr_cols; c++) {
3291                 raidz_col_t *rc = &rr->rr_col[c];
3292
3293                 /*
3294                  * If scrubbing and a replacing/sparing child vdev determined
3295                  * that not all of its children have an identical copy of the
3296                  * data, then clear the error so the column is treated like
3297                  * any other read and force a repair to correct the damage.
3298                  */
3299                 if (rc->rc_error == ECKSUM) {
3300                         ASSERT(zio->io_flags & ZIO_FLAG_SCRUB);
3301                         vdev_raidz_checksum_error(zio, rc, rc->rc_abd);
3302                         rc->rc_force_repair = 1;
3303                         rc->rc_error = 0;
3304                 }
3305
3306                 if (rc->rc_error) {
3307                         if (c < rr->rr_firstdatacol)
3308                                 parity_errors++;
3309                         else
3310                                 data_errors++;
3311
3312                         total_errors++;
3313                 } else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
3314                         parity_untried++;
3315                 }
3316         }
3317
3318         /*
3319          * If there were data errors and the number of errors we saw was
3320          * correctable -- less than or equal to the number of parity disks read
3321          * -- reconstruct based on the missing data.
3322          */
3323         if (data_errors != 0 &&
3324             total_errors <= rr->rr_firstdatacol - parity_untried) {
3325                 /*
3326                  * We either attempt to read all the parity columns or
3327                  * none of them. If we didn't try to read parity, we
3328                  * wouldn't be here in the correctable case. There must
3329                  * also have been fewer parity errors than parity
3330                  * columns or, again, we wouldn't be in this code path.
3331                  */
3332                 ASSERT(parity_untried == 0);
3333                 ASSERT(parity_errors < rr->rr_firstdatacol);
3334
3335                 /*
3336                  * Identify the data columns that reported an error.
3337                  */
3338                 int n = 0;
3339                 int tgts[VDEV_RAIDZ_MAXPARITY];
3340                 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
3341                         raidz_col_t *rc = &rr->rr_col[c];
3342                         if (rc->rc_error != 0) {
3343                                 ASSERT(n < VDEV_RAIDZ_MAXPARITY);
3344                                 tgts[n++] = c;
3345                         }
3346                 }
3347
3348                 ASSERT(rr->rr_firstdatacol >= n);
3349
3350                 vdev_raidz_reconstruct_row(rm, rr, tgts, n);
3351         }
3352 }
3353
3354 /*
3355  * Return the number of reads issued.
3356  */
3357 static int
3358 vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr)
3359 {
3360         vdev_t *vd = zio->io_vd;
3361         int nread = 0;
3362
3363         rr->rr_missingdata = 0;
3364         rr->rr_missingparity = 0;
3365
3366         /*
3367          * If this rows contains empty sectors which are not required
3368          * for a normal read then allocate an ABD for them now so they
3369          * may be read, verified, and any needed repairs performed.
3370          */
3371         if (rr->rr_nempty != 0 && rr->rr_abd_empty == NULL)
3372                 vdev_draid_map_alloc_empty(zio, rr);
3373
3374         for (int c = 0; c < rr->rr_cols; c++) {
3375                 raidz_col_t *rc = &rr->rr_col[c];
3376                 if (rc->rc_tried || rc->rc_size == 0)
3377                         continue;
3378
3379                 zio_nowait(zio_vdev_child_io(zio, NULL,
3380                     vd->vdev_child[rc->rc_devidx],
3381                     rc->rc_offset, rc->rc_abd, rc->rc_size,
3382                     zio->io_type, zio->io_priority, 0,
3383                     vdev_raidz_child_done, rc));
3384                 nread++;
3385         }
3386         return (nread);
3387 }
3388
3389 /*
3390  * We're here because either there were too many errors to even attempt
3391  * reconstruction (total_errors == rm_first_datacol), or vdev_*_combrec()
3392  * failed. In either case, there is enough bad data to prevent reconstruction.
3393  * Start checksum ereports for all children which haven't failed.
3394  */
3395 static void
3396 vdev_raidz_io_done_unrecoverable(zio_t *zio)
3397 {
3398         raidz_map_t *rm = zio->io_vsd;
3399
3400         for (int i = 0; i < rm->rm_nrows; i++) {
3401                 raidz_row_t *rr = rm->rm_row[i];
3402
3403                 for (int c = 0; c < rr->rr_cols; c++) {
3404                         raidz_col_t *rc = &rr->rr_col[c];
3405                         vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
3406
3407                         if (rc->rc_error != 0)
3408                                 continue;
3409
3410                         zio_bad_cksum_t zbc;
3411                         zbc.zbc_has_cksum = 0;
3412                         zbc.zbc_injected = rm->rm_ecksuminjected;
3413                         mutex_enter(&cvd->vdev_stat_lock);
3414                         cvd->vdev_stat.vs_checksum_errors++;
3415                         mutex_exit(&cvd->vdev_stat_lock);
3416                         (void) zfs_ereport_start_checksum(zio->io_spa,
3417                             cvd, &zio->io_bookmark, zio, rc->rc_offset,
3418                             rc->rc_size, &zbc);
3419                 }
3420         }
3421 }
3422
3423 void
3424 vdev_raidz_io_done(zio_t *zio)
3425 {
3426         raidz_map_t *rm = zio->io_vsd;
3427
3428         ASSERT(zio->io_bp != NULL);
3429         if (zio->io_type == ZIO_TYPE_WRITE) {
3430                 for (int i = 0; i < rm->rm_nrows; i++) {
3431                         vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]);
3432                 }
3433         } else {
3434                 if (rm->rm_phys_col) {
3435                         /*
3436                          * This is an aggregated read.  Copy the data and status
3437                          * from the aggregate abd's to the individual rows.
3438                          */
3439                         for (int i = 0; i < rm->rm_nrows; i++) {
3440                                 raidz_row_t *rr = rm->rm_row[i];
3441
3442                                 for (int c = 0; c < rr->rr_cols; c++) {
3443                                         raidz_col_t *rc = &rr->rr_col[c];
3444                                         if (rc->rc_tried || rc->rc_size == 0)
3445                                                 continue;
3446
3447                                         raidz_col_t *prc =
3448                                             &rm->rm_phys_col[rc->rc_devidx];
3449                                         rc->rc_error = prc->rc_error;
3450                                         rc->rc_tried = prc->rc_tried;
3451                                         rc->rc_skipped = prc->rc_skipped;
3452                                         if (c >= rr->rr_firstdatacol) {
3453                                                 /*
3454                                                  * Note: this is slightly faster
3455                                                  * than using abd_copy_off().
3456                                                  */
3457                                                 char *physbuf = abd_to_buf(
3458                                                     prc->rc_abd);
3459                                                 void *physloc = physbuf +
3460                                                     rc->rc_offset -
3461                                                     prc->rc_offset;
3462
3463                                                 abd_copy_from_buf(rc->rc_abd,
3464                                                     physloc, rc->rc_size);
3465                                         }
3466                                 }
3467                         }
3468                 }
3469
3470                 for (int i = 0; i < rm->rm_nrows; i++) {
3471                         raidz_row_t *rr = rm->rm_row[i];
3472                         vdev_raidz_io_done_reconstruct_known_missing(zio,
3473                             rm, rr);
3474                 }
3475
3476                 if (raidz_checksum_verify(zio) == 0) {
3477                         if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)
3478                                 goto done;
3479
3480                         for (int i = 0; i < rm->rm_nrows; i++) {
3481                                 raidz_row_t *rr = rm->rm_row[i];
3482                                 vdev_raidz_io_done_verified(zio, rr);
3483                         }
3484                         zio_checksum_verified(zio);
3485                 } else {
3486                         /*
3487                          * A sequential resilver has no checksum which makes
3488                          * combinatoral reconstruction impossible. This code
3489                          * path is unreachable since raidz_checksum_verify()
3490                          * has no checksum to verify and must succeed.
3491                          */
3492                         ASSERT3U(zio->io_priority, !=, ZIO_PRIORITY_REBUILD);
3493
3494                         /*
3495                          * This isn't a typical situation -- either we got a
3496                          * read error or a child silently returned bad data.
3497                          * Read every block so we can try again with as much
3498                          * data and parity as we can track down. If we've
3499                          * already been through once before, all children will
3500                          * be marked as tried so we'll proceed to combinatorial
3501                          * reconstruction.
3502                          */
3503                         int nread = 0;
3504                         for (int i = 0; i < rm->rm_nrows; i++) {
3505                                 nread += vdev_raidz_read_all(zio,
3506                                     rm->rm_row[i]);
3507                         }
3508                         if (nread != 0) {
3509                                 /*
3510                                  * Normally our stage is VDEV_IO_DONE, but if
3511                                  * we've already called redone(), it will have
3512                                  * changed to VDEV_IO_START, in which case we
3513                                  * don't want to call redone() again.
3514                                  */
3515                                 if (zio->io_stage != ZIO_STAGE_VDEV_IO_START)
3516                                         zio_vdev_io_redone(zio);
3517                                 return;
3518                         }
3519                         /*
3520                          * It would be too expensive to try every possible
3521                          * combination of failed sectors in every row, so
3522                          * instead we try every combination of failed current or
3523                          * past physical disk. This means that if the incorrect
3524                          * sectors were all on Nparity disks at any point in the
3525                          * past, we will find the correct data.  The only known
3526                          * case where this is less durable than a non-expanded
3527                          * RAIDZ, is if we have a silent failure during
3528                          * expansion.  In that case, one block could be
3529                          * partially in the old format and partially in the
3530                          * new format, so we'd lost some sectors from the old
3531                          * format and some from the new format.
3532                          *
3533                          * e.g. logical_width=4 physical_width=6
3534                          * the 15 (6+5+4) possible failed disks are:
3535                          * width=6 child=0
3536                          * width=6 child=1
3537                          * width=6 child=2
3538                          * width=6 child=3
3539                          * width=6 child=4
3540                          * width=6 child=5
3541                          * width=5 child=0
3542                          * width=5 child=1
3543                          * width=5 child=2
3544                          * width=5 child=3
3545                          * width=5 child=4
3546                          * width=4 child=0
3547                          * width=4 child=1
3548                          * width=4 child=2
3549                          * width=4 child=3
3550                          * And we will try every combination of Nparity of these
3551                          * failing.
3552                          *
3553                          * As a first pass, we can generate every combo,
3554                          * and try reconstructing, ignoring any known
3555                          * failures.  If any row has too many known + simulated
3556                          * failures, then we bail on reconstructing with this
3557                          * number of simulated failures.  As an improvement,
3558                          * we could detect the number of whole known failures
3559                          * (i.e. we have known failures on these disks for
3560                          * every row; the disks never succeeded), and
3561                          * subtract that from the max # failures to simulate.
3562                          * We could go even further like the current
3563                          * combrec code, but that doesn't seem like it
3564                          * gains us very much.  If we simulate a failure
3565                          * that is also a known failure, that's fine.
3566                          */
3567                         zio->io_error = vdev_raidz_combrec(zio);
3568                         if (zio->io_error == ECKSUM &&
3569                             !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
3570                                 vdev_raidz_io_done_unrecoverable(zio);
3571                         }
3572                 }
3573         }
3574 done:
3575         if (rm->rm_lr != NULL) {
3576                 zfs_rangelock_exit(rm->rm_lr);
3577                 rm->rm_lr = NULL;
3578         }
3579 }
3580
3581 static void
3582 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
3583 {
3584         vdev_raidz_t *vdrz = vd->vdev_tsd;
3585         if (faulted > vdrz->vd_nparity)
3586                 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
3587                     VDEV_AUX_NO_REPLICAS);
3588         else if (degraded + faulted != 0)
3589                 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
3590         else
3591                 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
3592 }
3593
3594 /*
3595  * Determine if any portion of the provided block resides on a child vdev
3596  * with a dirty DTL and therefore needs to be resilvered.  The function
3597  * assumes that at least one DTL is dirty which implies that full stripe
3598  * width blocks must be resilvered.
3599  */
3600 static boolean_t
3601 vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
3602     uint64_t phys_birth)
3603 {
3604         vdev_raidz_t *vdrz = vd->vdev_tsd;
3605
3606         /*
3607          * If we're in the middle of a RAIDZ expansion, this block may be in
3608          * the old and/or new location.  For simplicity, always resilver it.
3609          */
3610         if (vdrz->vn_vre.vre_state == DSS_SCANNING)
3611                 return (B_TRUE);
3612
3613         uint64_t dcols = vd->vdev_children;
3614         uint64_t nparity = vdrz->vd_nparity;
3615         uint64_t ashift = vd->vdev_top->vdev_ashift;
3616         /* The starting RAIDZ (parent) vdev sector of the block. */
3617         uint64_t b = DVA_GET_OFFSET(dva) >> ashift;
3618         /* The zio's size in units of the vdev's minimum sector size. */
3619         uint64_t s = ((psize - 1) >> ashift) + 1;
3620         /* The first column for this stripe. */
3621         uint64_t f = b % dcols;
3622
3623         /* Unreachable by sequential resilver. */
3624         ASSERT3U(phys_birth, !=, TXG_UNKNOWN);
3625
3626         if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
3627                 return (B_FALSE);
3628
3629         if (s + nparity >= dcols)
3630                 return (B_TRUE);
3631
3632         for (uint64_t c = 0; c < s + nparity; c++) {
3633                 uint64_t devidx = (f + c) % dcols;
3634                 vdev_t *cvd = vd->vdev_child[devidx];
3635
3636                 /*
3637                  * dsl_scan_need_resilver() already checked vd with
3638                  * vdev_dtl_contains(). So here just check cvd with
3639                  * vdev_dtl_empty(), cheaper and a good approximation.
3640                  */
3641                 if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
3642                         return (B_TRUE);
3643         }
3644
3645         return (B_FALSE);
3646 }
3647
3648 static void
3649 vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs,
3650     range_seg64_t *physical_rs, range_seg64_t *remain_rs)
3651 {
3652         (void) remain_rs;
3653
3654         vdev_t *raidvd = cvd->vdev_parent;
3655         ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);
3656
3657         vdev_raidz_t *vdrz = raidvd->vdev_tsd;
3658
3659         if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
3660                 /*
3661                  * We're in the middle of expansion, in which case the
3662                  * translation is in flux.  Any answer we give may be wrong
3663                  * by the time we return, so it isn't safe for the caller to
3664                  * act on it.  Therefore we say that this range isn't present
3665                  * on any children.  The only consumers of this are "zpool
3666                  * initialize" and trimming, both of which are "best effort"
3667                  * anyway.
3668                  */
3669                 physical_rs->rs_start = physical_rs->rs_end = 0;
3670                 remain_rs->rs_start = remain_rs->rs_end = 0;
3671                 return;
3672         }
3673
3674         uint64_t width = vdrz->vd_physical_width;
3675         uint64_t tgt_col = cvd->vdev_id;
3676         uint64_t ashift = raidvd->vdev_top->vdev_ashift;
3677
3678         /* make sure the offsets are block-aligned */
3679         ASSERT0(logical_rs->rs_start % (1 << ashift));
3680         ASSERT0(logical_rs->rs_end % (1 << ashift));
3681         uint64_t b_start = logical_rs->rs_start >> ashift;
3682         uint64_t b_end = logical_rs->rs_end >> ashift;
3683
3684         uint64_t start_row = 0;
3685         if (b_start > tgt_col) /* avoid underflow */
3686                 start_row = ((b_start - tgt_col - 1) / width) + 1;
3687
3688         uint64_t end_row = 0;
3689         if (b_end > tgt_col)
3690                 end_row = ((b_end - tgt_col - 1) / width) + 1;
3691
3692         physical_rs->rs_start = start_row << ashift;
3693         physical_rs->rs_end = end_row << ashift;
3694
3695         ASSERT3U(physical_rs->rs_start, <=, logical_rs->rs_start);
3696         ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=,
3697             logical_rs->rs_end - logical_rs->rs_start);
3698 }
3699
3700 static void
3701 raidz_reflow_sync(void *arg, dmu_tx_t *tx)
3702 {
3703         spa_t *spa = arg;
3704         int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
3705         vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
3706
3707         /*
3708          * Ensure there are no i/os to the range that is being committed.
3709          */
3710         uint64_t old_offset = RRSS_GET_OFFSET(&spa->spa_uberblock);
3711         ASSERT3U(vre->vre_offset_pertxg[txgoff], >=, old_offset);
3712
3713         mutex_enter(&vre->vre_lock);
3714         uint64_t new_offset =
3715             MIN(vre->vre_offset_pertxg[txgoff], vre->vre_failed_offset);
3716         /*
3717          * We should not have committed anything that failed.
3718          */
3719         VERIFY3U(vre->vre_failed_offset, >=, old_offset);
3720         mutex_exit(&vre->vre_lock);
3721
3722         zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock,
3723             old_offset, new_offset - old_offset,
3724             RL_WRITER);
3725
3726         /*
3727          * Update the uberblock that will be written when this txg completes.
3728          */
3729         RAIDZ_REFLOW_SET(&spa->spa_uberblock,
3730             RRSS_SCRATCH_INVALID_SYNCED_REFLOW, new_offset);
3731         vre->vre_offset_pertxg[txgoff] = 0;
3732         zfs_rangelock_exit(lr);
3733
3734         mutex_enter(&vre->vre_lock);
3735         vre->vre_bytes_copied += vre->vre_bytes_copied_pertxg[txgoff];
3736         vre->vre_bytes_copied_pertxg[txgoff] = 0;
3737         mutex_exit(&vre->vre_lock);
3738
3739         vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
3740         VERIFY0(zap_update(spa->spa_meta_objset,
3741             vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED,
3742             sizeof (vre->vre_bytes_copied), 1, &vre->vre_bytes_copied, tx));
3743 }
3744
3745 static void
3746 raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx)
3747 {
3748         spa_t *spa = arg;
3749         vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
3750         vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
3751         vdev_raidz_t *vdrz = raidvd->vdev_tsd;
3752
3753         for (int i = 0; i < TXG_SIZE; i++)
3754                 VERIFY0(vre->vre_offset_pertxg[i]);
3755
3756         reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP);
3757         re->re_txg = tx->tx_txg + TXG_CONCURRENT_STATES;
3758         re->re_logical_width = vdrz->vd_physical_width;
3759         mutex_enter(&vdrz->vd_expand_lock);
3760         avl_add(&vdrz->vd_expand_txgs, re);
3761         mutex_exit(&vdrz->vd_expand_lock);
3762
3763         vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
3764
3765         /*
3766          * Dirty the config so that the updated ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS
3767          * will get written (based on vd_expand_txgs).
3768          */
3769         vdev_config_dirty(vd);
3770
3771         /*
3772          * Before we change vre_state, the on-disk state must reflect that we
3773          * have completed all copying, so that vdev_raidz_io_start() can use
3774          * vre_state to determine if the reflow is in progress.  See also the
3775          * end of spa_raidz_expand_thread().
3776          */
3777         VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==,
3778             raidvd->vdev_ms_count << raidvd->vdev_ms_shift);
3779
3780         vre->vre_end_time = gethrestime_sec();
3781         vre->vre_state = DSS_FINISHED;
3782
3783         uint64_t state = vre->vre_state;
3784         VERIFY0(zap_update(spa->spa_meta_objset,
3785             vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
3786             sizeof (state), 1, &state, tx));
3787
3788         uint64_t end_time = vre->vre_end_time;
3789         VERIFY0(zap_update(spa->spa_meta_objset,
3790             vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME,
3791             sizeof (end_time), 1, &end_time, tx));
3792
3793         spa->spa_uberblock.ub_raidz_reflow_info = 0;
3794
3795         spa_history_log_internal(spa, "raidz vdev expansion completed",  tx,
3796             "%s vdev %llu new width %llu", spa_name(spa),
3797             (unsigned long long)vd->vdev_id,
3798             (unsigned long long)vd->vdev_children);
3799
3800         spa->spa_raidz_expand = NULL;
3801         raidvd->vdev_rz_expanding = B_FALSE;
3802
3803         spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
3804         spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
3805         spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);
3806
3807         spa_notify_waiters(spa);
3808
3809         /*
3810          * While we're in syncing context take the opportunity to
3811          * setup a scrub. All the data has been sucessfully copied
3812          * but we have not validated any checksums.
3813          */
3814         pool_scan_func_t func = POOL_SCAN_SCRUB;
3815         if (zfs_scrub_after_expand && dsl_scan_setup_check(&func, tx) == 0)
3816                 dsl_scan_setup_sync(&func, tx);
3817 }
3818
3819 /*
3820  * Struct for one copy zio.
3821  */
3822 typedef struct raidz_reflow_arg {
3823         vdev_raidz_expand_t *rra_vre;
3824         zfs_locked_range_t *rra_lr;
3825         uint64_t rra_txg;
3826 } raidz_reflow_arg_t;
3827
3828 /*
3829  * The write of the new location is done.
3830  */
3831 static void
3832 raidz_reflow_write_done(zio_t *zio)
3833 {
3834         raidz_reflow_arg_t *rra = zio->io_private;
3835         vdev_raidz_expand_t *vre = rra->rra_vre;
3836
3837         abd_free(zio->io_abd);
3838
3839         mutex_enter(&vre->vre_lock);
3840         if (zio->io_error != 0) {
3841                 /* Force a reflow pause on errors */
3842                 vre->vre_failed_offset =
3843                     MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
3844         }
3845         ASSERT3U(vre->vre_outstanding_bytes, >=, zio->io_size);
3846         vre->vre_outstanding_bytes -= zio->io_size;
3847         if (rra->rra_lr->lr_offset + rra->rra_lr->lr_length <
3848             vre->vre_failed_offset) {
3849                 vre->vre_bytes_copied_pertxg[rra->rra_txg & TXG_MASK] +=
3850                     zio->io_size;
3851         }
3852         cv_signal(&vre->vre_cv);
3853         mutex_exit(&vre->vre_lock);
3854
3855         zfs_rangelock_exit(rra->rra_lr);
3856
3857         kmem_free(rra, sizeof (*rra));
3858         spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa);
3859 }
3860
3861 /*
3862  * The read of the old location is done.  The parent zio is the write to
3863  * the new location.  Allow it to start.
3864  */
3865 static void
3866 raidz_reflow_read_done(zio_t *zio)
3867 {
3868         raidz_reflow_arg_t *rra = zio->io_private;
3869         vdev_raidz_expand_t *vre = rra->rra_vre;
3870
3871         /*
3872          * If the read failed, or if it was done on a vdev that is not fully
3873          * healthy (e.g. a child that has a resilver in progress), we may not
3874          * have the correct data.  Note that it's OK if the write proceeds.
3875          * It may write garbage but the location is otherwise unused and we
3876          * will retry later due to vre_failed_offset.
3877          */
3878         if (zio->io_error != 0 || !vdev_dtl_empty(zio->io_vd, DTL_MISSING)) {
3879                 zfs_dbgmsg("reflow read failed off=%llu size=%llu txg=%llu "
3880                     "err=%u partial_dtl_empty=%u missing_dtl_empty=%u",
3881                     (long long)rra->rra_lr->lr_offset,
3882                     (long long)rra->rra_lr->lr_length,
3883                     (long long)rra->rra_txg,
3884                     zio->io_error,
3885                     vdev_dtl_empty(zio->io_vd, DTL_PARTIAL),
3886                     vdev_dtl_empty(zio->io_vd, DTL_MISSING));
3887                 mutex_enter(&vre->vre_lock);
3888                 /* Force a reflow pause on errors */
3889                 vre->vre_failed_offset =
3890                     MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
3891                 mutex_exit(&vre->vre_lock);
3892         }
3893
3894         zio_nowait(zio_unique_parent(zio));
3895 }
3896
3897 static void
3898 raidz_reflow_record_progress(vdev_raidz_expand_t *vre, uint64_t offset,
3899     dmu_tx_t *tx)
3900 {
3901         int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
3902         spa_t *spa = dmu_tx_pool(tx)->dp_spa;
3903
3904         if (offset == 0)
3905                 return;
3906
3907         mutex_enter(&vre->vre_lock);
3908         ASSERT3U(vre->vre_offset, <=, offset);
3909         vre->vre_offset = offset;
3910         mutex_exit(&vre->vre_lock);
3911
3912         if (vre->vre_offset_pertxg[txgoff] == 0) {
3913                 dsl_sync_task_nowait(dmu_tx_pool(tx), raidz_reflow_sync,
3914                     spa, tx);
3915         }
3916         vre->vre_offset_pertxg[txgoff] = offset;
3917 }
3918
3919 static boolean_t
3920 vdev_raidz_expand_child_replacing(vdev_t *raidz_vd)
3921 {
3922         for (int i = 0; i < raidz_vd->vdev_children; i++) {
3923                 /* Quick check if a child is being replaced */
3924                 if (!raidz_vd->vdev_child[i]->vdev_ops->vdev_op_leaf)
3925                         return (B_TRUE);
3926         }
3927         return (B_FALSE);
3928 }
3929
3930 static boolean_t
3931 raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt,
3932     dmu_tx_t *tx)
3933 {
3934         spa_t *spa = vd->vdev_spa;
3935         int ashift = vd->vdev_top->vdev_ashift;
3936         uint64_t offset, size;
3937
3938         if (!range_tree_find_in(rt, 0, vd->vdev_top->vdev_asize,
3939             &offset, &size)) {
3940                 return (B_FALSE);
3941         }
3942         ASSERT(IS_P2ALIGNED(offset, 1 << ashift));
3943         ASSERT3U(size, >=, 1 << ashift);
3944         uint64_t length = 1 << ashift;
3945         int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
3946
3947         uint64_t blkid = offset >> ashift;
3948
3949         int old_children = vd->vdev_children - 1;
3950
3951         /*
3952          * We can only progress to the point that writes will not overlap
3953          * with blocks whose progress has not yet been recorded on disk.
3954          * Since partially-copied rows are still read from the old location,
3955          * we need to stop one row before the sector-wise overlap, to prevent
3956          * row-wise overlap.
3957          *
3958          * Note that even if we are skipping over a large unallocated region,
3959          * we can't move the on-disk progress to `offset`, because concurrent
3960          * writes/allocations could still use the currently-unallocated
3961          * region.
3962          */
3963         uint64_t ubsync_blkid =
3964             RRSS_GET_OFFSET(&spa->spa_ubsync) >> ashift;
3965         uint64_t next_overwrite_blkid = ubsync_blkid +
3966             ubsync_blkid / old_children - old_children;
3967         VERIFY3U(next_overwrite_blkid, >, ubsync_blkid);
3968
3969         if (blkid >= next_overwrite_blkid) {
3970                 raidz_reflow_record_progress(vre,
3971                     next_overwrite_blkid << ashift, tx);
3972                 return (B_TRUE);
3973         }
3974
3975         range_tree_remove(rt, offset, length);
3976
3977         raidz_reflow_arg_t *rra = kmem_zalloc(sizeof (*rra), KM_SLEEP);
3978         rra->rra_vre = vre;
3979         rra->rra_lr = zfs_rangelock_enter(&vre->vre_rangelock,
3980             offset, length, RL_WRITER);
3981         rra->rra_txg = dmu_tx_get_txg(tx);
3982
3983         raidz_reflow_record_progress(vre, offset + length, tx);
3984
3985         mutex_enter(&vre->vre_lock);
3986         vre->vre_outstanding_bytes += length;
3987         mutex_exit(&vre->vre_lock);
3988
3989         /*
3990          * SCL_STATE will be released when the read and write are done,
3991          * by raidz_reflow_write_done().
3992          */
3993         spa_config_enter(spa, SCL_STATE, spa, RW_READER);
3994
3995         /* check if a replacing vdev was added, if so treat it as an error */
3996         if (vdev_raidz_expand_child_replacing(vd)) {
3997                 zfs_dbgmsg("replacing vdev encountered, reflow paused at "
3998                     "offset=%llu txg=%llu",
3999                     (long long)rra->rra_lr->lr_offset,
4000                     (long long)rra->rra_txg);
4001
4002                 mutex_enter(&vre->vre_lock);
4003                 vre->vre_failed_offset =
4004                     MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
4005                 cv_signal(&vre->vre_cv);
4006                 mutex_exit(&vre->vre_lock);
4007
4008                 /* drop everything we acquired */
4009                 zfs_rangelock_exit(rra->rra_lr);
4010                 kmem_free(rra, sizeof (*rra));
4011                 spa_config_exit(spa, SCL_STATE, spa);
4012                 return (B_TRUE);
4013         }
4014
4015         zio_t *pio = spa->spa_txg_zio[txgoff];
4016         abd_t *abd = abd_alloc_for_io(length, B_FALSE);
4017         zio_t *write_zio = zio_vdev_child_io(pio, NULL,
4018             vd->vdev_child[blkid % vd->vdev_children],
4019             (blkid / vd->vdev_children) << ashift,
4020             abd, length,
4021             ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
4022             ZIO_FLAG_CANFAIL,
4023             raidz_reflow_write_done, rra);
4024
4025         zio_nowait(zio_vdev_child_io(write_zio, NULL,
4026             vd->vdev_child[blkid % old_children],
4027             (blkid / old_children) << ashift,
4028             abd, length,
4029             ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
4030             ZIO_FLAG_CANFAIL,
4031             raidz_reflow_read_done, rra));
4032
4033         return (B_FALSE);
4034 }
4035
4036 /*
4037  * For testing (ztest specific)
4038  */
4039 static void
4040 raidz_expand_pause(uint_t pause_point)
4041 {
4042         while (raidz_expand_pause_point != 0 &&
4043             raidz_expand_pause_point <= pause_point)
4044                 delay(hz);
4045 }
4046
4047 static void
4048 raidz_scratch_child_done(zio_t *zio)
4049 {
4050         zio_t *pio = zio->io_private;
4051
4052         mutex_enter(&pio->io_lock);
4053         pio->io_error = zio_worst_error(pio->io_error, zio->io_error);
4054         mutex_exit(&pio->io_lock);
4055 }
4056
4057 /*
4058  * Reflow the beginning portion of the vdev into an intermediate scratch area
4059  * in memory and on disk. This operation must be persisted on disk before we
4060  * proceed to overwrite the beginning portion with the reflowed data.
4061  *
4062  * This multi-step task can fail to complete if disk errors are encountered
4063  * and we can return here after a pause (waiting for disk to become healthy).
4064  */
4065 static void
4066 raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx)
4067 {
4068         vdev_raidz_expand_t *vre = arg;
4069         spa_t *spa = dmu_tx_pool(tx)->dp_spa;
4070         zio_t *pio;
4071         int error;
4072
4073         spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
4074         vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4075         int ashift = raidvd->vdev_ashift;
4076         uint64_t write_size = P2ALIGN_TYPED(VDEV_BOOT_SIZE, 1 << ashift,
4077             uint64_t);
4078         uint64_t logical_size = write_size * raidvd->vdev_children;
4079         uint64_t read_size =
4080             P2ROUNDUP(DIV_ROUND_UP(logical_size, (raidvd->vdev_children - 1)),
4081             1 << ashift);
4082
4083         /*
4084          * The scratch space must be large enough to get us to the point
4085          * that one row does not overlap itself when moved.  This is checked
4086          * by vdev_raidz_attach_check().
4087          */
4088         VERIFY3U(write_size, >=, raidvd->vdev_children << ashift);
4089         VERIFY3U(write_size, <=, VDEV_BOOT_SIZE);
4090         VERIFY3U(write_size, <=, read_size);
4091
4092         zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock,
4093             0, logical_size, RL_WRITER);
4094
4095         abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *),
4096             KM_SLEEP);
4097         for (int i = 0; i < raidvd->vdev_children; i++) {
4098                 abds[i] = abd_alloc_linear(read_size, B_FALSE);
4099         }
4100
4101         raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_1);
4102
4103         /*
4104          * If we have already written the scratch area then we must read from
4105          * there, since new writes were redirected there while we were paused
4106          * or the original location may have been partially overwritten with
4107          * reflowed data.
4108          */
4109         if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) {
4110                 VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, logical_size);
4111                 /*
4112                  * Read from scratch space.
4113                  */
4114                 pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4115                 for (int i = 0; i < raidvd->vdev_children; i++) {
4116                         /*
4117                          * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE
4118                          * to the offset to calculate the physical offset to
4119                          * write to.  Passing in a negative offset makes us
4120                          * access the scratch area.
4121                          */
4122                         zio_nowait(zio_vdev_child_io(pio, NULL,
4123                             raidvd->vdev_child[i],
4124                             VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
4125                             write_size, ZIO_TYPE_READ, ZIO_PRIORITY_ASYNC_READ,
4126                             ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio));
4127                 }
4128                 error = zio_wait(pio);
4129                 if (error != 0) {
4130                         zfs_dbgmsg("reflow: error %d reading scratch location",
4131                             error);
4132                         goto io_error_exit;
4133                 }
4134                 goto overwrite;
4135         }
4136
4137         /*
4138          * Read from original location.
4139          */
4140         pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4141         for (int i = 0; i < raidvd->vdev_children - 1; i++) {
4142                 ASSERT0(vdev_is_dead(raidvd->vdev_child[i]));
4143                 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4144                     0, abds[i], read_size, ZIO_TYPE_READ,
4145                     ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
4146                     raidz_scratch_child_done, pio));
4147         }
4148         error = zio_wait(pio);
4149         if (error != 0) {
4150                 zfs_dbgmsg("reflow: error %d reading original location", error);
4151 io_error_exit:
4152                 for (int i = 0; i < raidvd->vdev_children; i++)
4153                         abd_free(abds[i]);
4154                 kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
4155                 zfs_rangelock_exit(lr);
4156                 spa_config_exit(spa, SCL_STATE, FTAG);
4157                 return;
4158         }
4159
4160         raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_2);
4161
4162         /*
4163          * Reflow in memory.
4164          */
4165         uint64_t logical_sectors = logical_size >> ashift;
4166         for (int i = raidvd->vdev_children - 1; i < logical_sectors; i++) {
4167                 int oldchild = i % (raidvd->vdev_children - 1);
4168                 uint64_t oldoff = (i / (raidvd->vdev_children - 1)) << ashift;
4169
4170                 int newchild = i % raidvd->vdev_children;
4171                 uint64_t newoff = (i / raidvd->vdev_children) << ashift;
4172
4173                 /* a single sector should not be copying over itself */
4174                 ASSERT(!(newchild == oldchild && newoff == oldoff));
4175
4176                 abd_copy_off(abds[newchild], abds[oldchild],
4177                     newoff, oldoff, 1 << ashift);
4178         }
4179
4180         /*
4181          * Verify that we filled in everything we intended to (write_size on
4182          * each child).
4183          */
4184         VERIFY0(logical_sectors % raidvd->vdev_children);
4185         VERIFY3U((logical_sectors / raidvd->vdev_children) << ashift, ==,
4186             write_size);
4187
4188         /*
4189          * Write to scratch location (boot area).
4190          */
4191         pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4192         for (int i = 0; i < raidvd->vdev_children; i++) {
4193                 /*
4194                  * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to
4195                  * the offset to calculate the physical offset to write to.
4196                  * Passing in a negative offset lets us access the boot area.
4197                  */
4198                 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4199                     VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
4200                     write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
4201                     ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio));
4202         }
4203         error = zio_wait(pio);
4204         if (error != 0) {
4205                 zfs_dbgmsg("reflow: error %d writing scratch location", error);
4206                 goto io_error_exit;
4207         }
4208         pio = zio_root(spa, NULL, NULL, 0);
4209         zio_flush(pio, raidvd);
4210         zio_wait(pio);
4211
4212         zfs_dbgmsg("reflow: wrote %llu bytes (logical) to scratch area",
4213             (long long)logical_size);
4214
4215         raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_3);
4216
4217         /*
4218          * Update uberblock to indicate that scratch space is valid.  This is
4219          * needed because after this point, the real location may be
4220          * overwritten.  If we crash, we need to get the data from the
4221          * scratch space, rather than the real location.
4222          *
4223          * Note: ub_timestamp is bumped so that vdev_uberblock_compare()
4224          * will prefer this uberblock.
4225          */
4226         RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_VALID, logical_size);
4227         spa->spa_ubsync.ub_timestamp++;
4228         ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
4229             &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
4230         if (spa_multihost(spa))
4231                 mmp_update_uberblock(spa, &spa->spa_ubsync);
4232
4233         zfs_dbgmsg("reflow: uberblock updated "
4234             "(txg %llu, SCRATCH_VALID, size %llu, ts %llu)",
4235             (long long)spa->spa_ubsync.ub_txg,
4236             (long long)logical_size,
4237             (long long)spa->spa_ubsync.ub_timestamp);
4238
4239         raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_VALID);
4240
4241         /*
4242          * Overwrite with reflow'ed data.
4243          */
4244 overwrite:
4245         pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4246         for (int i = 0; i < raidvd->vdev_children; i++) {
4247                 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4248                     0, abds[i], write_size, ZIO_TYPE_WRITE,
4249                     ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL,
4250                     raidz_scratch_child_done, pio));
4251         }
4252         error = zio_wait(pio);
4253         if (error != 0) {
4254                 /*
4255                  * When we exit early here and drop the range lock, new
4256                  * writes will go into the scratch area so we'll need to
4257                  * read from there when we return after pausing.
4258                  */
4259                 zfs_dbgmsg("reflow: error %d writing real location", error);
4260                 /*
4261                  * Update the uberblock that is written when this txg completes.
4262                  */
4263                 RAIDZ_REFLOW_SET(&spa->spa_uberblock, RRSS_SCRATCH_VALID,
4264                     logical_size);
4265                 goto io_error_exit;
4266         }
4267         pio = zio_root(spa, NULL, NULL, 0);
4268         zio_flush(pio, raidvd);
4269         zio_wait(pio);
4270
4271         zfs_dbgmsg("reflow: overwrote %llu bytes (logical) to real location",
4272             (long long)logical_size);
4273         for (int i = 0; i < raidvd->vdev_children; i++)
4274                 abd_free(abds[i]);
4275         kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
4276
4277         raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_REFLOWED);
4278
4279         /*
4280          * Update uberblock to indicate that the initial part has been
4281          * reflow'ed.  This is needed because after this point (when we exit
4282          * the rangelock), we allow regular writes to this region, which will
4283          * be written to the new location only (because reflow_offset_next ==
4284          * reflow_offset_synced).  If we crashed and re-copied from the
4285          * scratch space, we would lose the regular writes.
4286          */
4287         RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_INVALID_SYNCED,
4288             logical_size);
4289         spa->spa_ubsync.ub_timestamp++;
4290         ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
4291             &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
4292         if (spa_multihost(spa))
4293                 mmp_update_uberblock(spa, &spa->spa_ubsync);
4294
4295         zfs_dbgmsg("reflow: uberblock updated "
4296             "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)",
4297             (long long)spa->spa_ubsync.ub_txg,
4298             (long long)logical_size,
4299             (long long)spa->spa_ubsync.ub_timestamp);
4300
4301         raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_1);
4302
4303         /*
4304          * Update progress.
4305          */
4306         vre->vre_offset = logical_size;
4307         zfs_rangelock_exit(lr);
4308         spa_config_exit(spa, SCL_STATE, FTAG);
4309
4310         int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
4311         vre->vre_offset_pertxg[txgoff] = vre->vre_offset;
4312         vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied;
4313         /*
4314          * Note - raidz_reflow_sync() will update the uberblock state to
4315          * RRSS_SCRATCH_INVALID_SYNCED_REFLOW
4316          */
4317         raidz_reflow_sync(spa, tx);
4318
4319         raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2);
4320 }
4321
/*
 * We crashed in the middle of raidz_reflow_scratch_sync(); complete its work
 * here.  No other i/o can be in progress, so we don't need the vre_rangelock.
 *
 * The uberblock is expected to be in state RRSS_SCRATCH_VALID, with its
 * offset giving the logical size of the region that was saved in the scratch
 * area.  This function reads that region back from each child's scratch
 * area, writes it to the start of each child, flushes, updates the uberblock
 * to RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT, and finally records the progress
 * through raidz_reflow_sync().
 *
 * NOTE(review): unlike the overwrite in raidz_reflow_scratch_sync(), the
 * zios here are not issued with ZIO_FLAG_CANFAIL and the zio_wait() results
 * are not examined -- presumably failures are dealt with by the zio layer;
 * confirm.
 */
void
vdev_raidz_reflow_copy_scratch(spa_t *spa)
{
        vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
        /* The amount to recover was recorded in the uberblock's reflow info. */
        uint64_t logical_size = RRSS_GET_OFFSET(&spa->spa_uberblock);
        ASSERT3U(RRSS_GET_STATE(&spa->spa_uberblock), ==, RRSS_SCRATCH_VALID);

        spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
        vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
        /* The logical region is split evenly across all children. */
        ASSERT0(logical_size % raidvd->vdev_children);
        uint64_t write_size = logical_size / raidvd->vdev_children;

        zio_t *pio;

        /*
         * Read from scratch space.  One linear buffer of write_size bytes
         * is allocated per child; the same buffers are reused for the
         * write below.
         */
        abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *),
            KM_SLEEP);
        for (int i = 0; i < raidvd->vdev_children; i++) {
                abds[i] = abd_alloc_linear(write_size, B_FALSE);
        }

        pio = zio_root(spa, NULL, NULL, 0);
        for (int i = 0; i < raidvd->vdev_children; i++) {
                /*
                 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to
                 * the offset to calculate the physical offset of the i/o.
                 * Passing in a negative offset lets us access the boot area.
                 */
                zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
                    VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
                    write_size, ZIO_TYPE_READ,
                    ZIO_PRIORITY_ASYNC_READ, 0,
                    raidz_scratch_child_done, pio));
        }
        zio_wait(pio);

        /*
         * Overwrite real location with reflow'ed data.  Each child's buffer
         * goes to offset 0 of that same child.
         */
        pio = zio_root(spa, NULL, NULL, 0);
        for (int i = 0; i < raidvd->vdev_children; i++) {
                zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
                    0, abds[i], write_size, ZIO_TYPE_WRITE,
                    ZIO_PRIORITY_ASYNC_WRITE, 0,
                    raidz_scratch_child_done, pio));
        }
        zio_wait(pio);
        /* Flush the raidz vdev before the uberblock is changed below. */
        pio = zio_root(spa, NULL, NULL, 0);
        zio_flush(pio, raidvd);
        zio_wait(pio);

        zfs_dbgmsg("reflow recovery: overwrote %llu bytes (logical) "
            "to real location", (long long)logical_size);

        for (int i = 0; i < raidvd->vdev_children; i++)
                abd_free(abds[i]);
        kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));

        /*
         * Update uberblock.  The timestamp is bumped and the uberblock is
         * written out right away (outside of the normal txg sync), with the
         * reflow state changed to RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT.
         */
        RAIDZ_REFLOW_SET(&spa->spa_ubsync,
            RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT, logical_size);
        spa->spa_ubsync.ub_timestamp++;
        VERIFY0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
            &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
        if (spa_multihost(spa))
                mmp_update_uberblock(spa, &spa->spa_ubsync);

        zfs_dbgmsg("reflow recovery: uberblock updated "
            "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)",
            (long long)spa->spa_ubsync.ub_txg,
            (long long)logical_size,
            (long long)spa->spa_ubsync.ub_timestamp);

        /*
         * Record the recovered progress in the per-txg slots of the pool's
         * first txg and let raidz_reflow_sync() process it.
         */
        dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool,
            spa_first_txg(spa));
        int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
        vre->vre_offset = logical_size;
        vre->vre_offset_pertxg[txgoff] = vre->vre_offset;
        vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied;
        /*
         * Note that raidz_reflow_sync() will update the uberblock once more
         */
        raidz_reflow_sync(spa, tx);

        dmu_tx_commit(tx);

        spa_config_exit(spa, SCL_STATE, FTAG);
}
4418
4419 static boolean_t
4420 spa_raidz_expand_thread_check(void *arg, zthr_t *zthr)
4421 {
4422         (void) zthr;
4423         spa_t *spa = arg;
4424
4425         return (spa->spa_raidz_expand != NULL &&
4426             !spa->spa_raidz_expand->vre_waiting_for_resilver);
4427 }
4428
/*
 * RAIDZ expansion background thread
 *
 * Can be called multiple times if the reflow is paused
 *
 * arg is the spa_t being expanded; zthr is only used to poll for
 * cancellation.  Each invocation:
 *   1. picks the starting offset from the synced uberblock,
 *   2. if nothing has been reflowed yet, reflows the initial portion through
 *      the scratch area (raidz_reflow_scratch_sync()),
 *   3. walks the remaining metaslabs, copying everything that is not
 *      allocatable space via raidz_reflow_impl(),
 *   4. either marks the expansion complete or records where to resume.
 */
static void
spa_raidz_expand_thread(void *arg, zthr_t *zthr)
{
        spa_t *spa = arg;
        vdev_raidz_expand_t *vre = spa->spa_raidz_expand;

        /*
         * If the scratch area is still marked valid, start over from
         * offset 0; otherwise resume from the offset recorded in the synced
         * uberblock.
         */
        if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID)
                vre->vre_offset = 0;
        else
                vre->vre_offset = RRSS_GET_OFFSET(&spa->spa_ubsync);

        /* Reflow the beginning portion using the scratch area */
        if (vre->vre_offset == 0) {
                VERIFY0(dsl_sync_task(spa_name(spa),
                    NULL, raidz_reflow_scratch_sync,
                    vre, 0, ZFS_SPACE_CHECK_NONE));

                /*
                 * if we encountered errors then pause; the thread stays
                 * idle (see spa_raidz_expand_thread_check()) until
                 * raidz_dtl_reassessed() clears vre_waiting_for_resilver.
                 */
                if (vre->vre_offset == 0) {
                        mutex_enter(&vre->vre_lock);
                        vre->vre_waiting_for_resilver = B_TRUE;
                        mutex_exit(&vre->vre_lock);
                        return;
                }
        }

        spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
        vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);

        /* Saved now; used for vdev_online() once the reflow completes. */
        uint64_t guid = raidvd->vdev_guid;

        /*
         * Iterate over all the remaining metaslabs.  SCL_CONFIG is held (as
         * reader) at the top and bottom of every iteration, but is dropped
         * and reacquired inside the loop body.
         */
        for (uint64_t i = vre->vre_offset >> raidvd->vdev_ms_shift;
            i < raidvd->vdev_ms_count &&
            !zthr_iscancelled(zthr) &&
            vre->vre_failed_offset == UINT64_MAX; i++) {
                metaslab_t *msp = raidvd->vdev_ms[i];

                metaslab_disable(msp);
                mutex_enter(&msp->ms_lock);

                /*
                 * The metaslab may be newly created (for the expanded
                 * space), in which case its trees won't exist yet,
                 * so we need to bail out early.
                 */
                if (msp->ms_new) {
                        mutex_exit(&msp->ms_lock);
                        metaslab_enable(msp, B_FALSE, B_FALSE);
                        continue;
                }

                VERIFY0(metaslab_load(msp));

                /*
                 * We want to copy everything except the free (allocatable)
                 * space.  Note that there may be a little bit more free
                 * space (e.g. in ms_defer), and it's fine to copy that too.
                 */
                range_tree_t *rt = range_tree_create(NULL, RANGE_SEG64,
                    NULL, 0, 0);
                range_tree_add(rt, msp->ms_start, msp->ms_size);
                range_tree_walk(msp->ms_allocatable, range_tree_remove, rt);
                mutex_exit(&msp->ms_lock);

                /*
                 * Force the last sector of each metaslab to be copied.  This
                 * ensures that we advance the on-disk progress to the end of
                 * this metaslab while the metaslab is disabled.  Otherwise, we
                 * could move past this metaslab without advancing the on-disk
                 * progress, and then an allocation to this metaslab would not
                 * be copied.
                 */
                int sectorsz = 1 << raidvd->vdev_ashift;
                uint64_t ms_last_offset = msp->ms_start +
                    msp->ms_size - sectorsz;
                if (!range_tree_contains(rt, ms_last_offset, sectorsz)) {
                        range_tree_add(rt, ms_last_offset, sectorsz);
                }

                /*
                 * When we are resuming from a paused expansion (i.e.
                 * when importing a pool with a expansion in progress),
                 * discard any state that we have already processed.
                 */
                range_tree_clear(rt, 0, vre->vre_offset);

                /*
                 * Copy this metaslab's ranges, one tx at a time, until the
                 * tree is drained, we are cancelled, or a copy has failed.
                 */
                while (!zthr_iscancelled(zthr) &&
                    !range_tree_is_empty(rt) &&
                    vre->vre_failed_offset == UINT64_MAX) {

                        /*
                         * We need to periodically drop the config lock so that
                         * writers can get in.  Additionally, we can't wait
                         * for a txg to sync while holding a config lock
                         * (since a waiting writer could cause a 3-way deadlock
                         * with the sync thread, which also gets a config
                         * lock for reader).  So we can't hold the config lock
                         * while calling dmu_tx_assign().
                         */
                        spa_config_exit(spa, SCL_CONFIG, FTAG);

                        /*
                         * If requested, pause the reflow when the amount
                         * specified by raidz_expand_max_reflow_bytes is reached
                         *
                         * This pause is only used during testing or debugging.
                         */
                        while (raidz_expand_max_reflow_bytes != 0 &&
                            raidz_expand_max_reflow_bytes <=
                            vre->vre_bytes_copied && !zthr_iscancelled(zthr)) {
                                delay(hz);
                        }

                        /*
                         * Throttle: wait on vre_cv while the bytes already
                         * in flight exceed raidz_expand_max_copy_bytes.
                         */
                        mutex_enter(&vre->vre_lock);
                        while (vre->vre_outstanding_bytes >
                            raidz_expand_max_copy_bytes) {
                                cv_wait(&vre->vre_cv, &vre->vre_lock);
                        }
                        mutex_exit(&vre->vre_lock);

                        dmu_tx_t *tx =
                            dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);

                        VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
                        uint64_t txg = dmu_tx_get_txg(tx);

                        /*
                         * Reacquire the vdev_config lock.  Theoretically, the
                         * vdev_t that we're expanding may have changed.
                         */
                        spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
                        raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);

                        boolean_t needsync =
                            raidz_reflow_impl(raidvd, vre, rt, tx);

                        dmu_tx_commit(tx);

                        /*
                         * raidz_reflow_impl() asked us to wait for this txg
                         * to sync; as explained above, that must be done
                         * without the config lock held.
                         */
                        if (needsync) {
                                spa_config_exit(spa, SCL_CONFIG, FTAG);
                                txg_wait_synced(spa->spa_dsl_pool, txg);
                                spa_config_enter(spa, SCL_CONFIG, FTAG,
                                    RW_READER);
                        }
                }

                /*
                 * Done with this metaslab (or stopping early): re-enable it
                 * and free the work tree with the config lock dropped, then
                 * retake the lock and look the vdev up again for the next
                 * iteration.
                 */
                spa_config_exit(spa, SCL_CONFIG, FTAG);

                metaslab_enable(msp, B_FALSE, B_FALSE);
                range_tree_vacate(rt, NULL, NULL);
                range_tree_destroy(rt);

                spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
                raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
        }

        spa_config_exit(spa, SCL_CONFIG, FTAG);

        /*
         * The txg_wait_synced() here ensures that all reflow zio's have
         * completed, and vre_failed_offset has been set if necessary.  It
         * also ensures that the progress of the last raidz_reflow_sync() is
         * written to disk before raidz_reflow_complete_sync() changes the
         * in-memory vre_state.  vdev_raidz_io_start() uses vre_state to
         * determine if a reflow is in progress, in which case we may need to
         * write to both old and new locations.  Therefore we can only change
         * vre_state once this is not necessary, which is once the on-disk
         * progress (in spa_ubsync) has been set past any possible writes (to
         * the end of the last metaslab).
         */
        txg_wait_synced(spa->spa_dsl_pool, 0);

        /*
         * NOTE(review): raidvd is dereferenced below after SCL_CONFIG has
         * been dropped -- confirm the vdev cannot go away at this point.
         */
        if (!zthr_iscancelled(zthr) &&
            vre->vre_offset == raidvd->vdev_ms_count << raidvd->vdev_ms_shift) {
                /*
                 * We are not being canceled or paused, so the reflow must be
                 * complete. In that case also mark it as completed on disk.
                 */
                ASSERT3U(vre->vre_failed_offset, ==, UINT64_MAX);
                VERIFY0(dsl_sync_task(spa_name(spa), NULL,
                    raidz_reflow_complete_sync, spa,
                    0, ZFS_SPACE_CHECK_NONE));
                (void) vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL);
        } else {
                /*
                 * We were cancelled, or a copy failed.  The
                 * txg_wait_synced() above has already waited for all copy
                 * zio's to complete and for all the raidz_reflow_sync()
                 * synctasks to be run, so log the pause and, on failure,
                 * set up the retry.
                 */
                spa_history_log_internal(spa, "reflow pause",
                    NULL, "offset=%llu failed_offset=%lld",
                    (long long)vre->vre_offset,
                    (long long)vre->vre_failed_offset);
                mutex_enter(&vre->vre_lock);
                if (vre->vre_failed_offset != UINT64_MAX) {
                        /*
                         * Reset progress so that we will retry everything
                         * after the point that something failed.
                         */
                        vre->vre_offset = vre->vre_failed_offset;
                        vre->vre_failed_offset = UINT64_MAX;
                        vre->vre_waiting_for_resilver = B_TRUE;
                }
                mutex_exit(&vre->vre_lock);
        }
}
4640
4641 void
4642 spa_start_raidz_expansion_thread(spa_t *spa)
4643 {
4644         ASSERT3P(spa->spa_raidz_expand_zthr, ==, NULL);
4645         spa->spa_raidz_expand_zthr = zthr_create("raidz_expand",
4646             spa_raidz_expand_thread_check, spa_raidz_expand_thread,
4647             spa, defclsyspri);
4648 }
4649
4650 void
4651 raidz_dtl_reassessed(vdev_t *vd)
4652 {
4653         spa_t *spa = vd->vdev_spa;
4654         if (spa->spa_raidz_expand != NULL) {
4655                 vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4656                 /*
4657                  * we get called often from vdev_dtl_reassess() so make
4658                  * sure it's our vdev and any replacing is complete
4659                  */
4660                 if (vd->vdev_top->vdev_id == vre->vre_vdev_id &&
4661                     !vdev_raidz_expand_child_replacing(vd->vdev_top)) {
4662                         mutex_enter(&vre->vre_lock);
4663                         if (vre->vre_waiting_for_resilver) {
4664                                 vdev_dbgmsg(vd, "DTL reassessed, "
4665                                     "continuing raidz expansion");
4666                                 vre->vre_waiting_for_resilver = B_FALSE;
4667                                 zthr_wakeup(spa->spa_raidz_expand_zthr);
4668                         }
4669                         mutex_exit(&vre->vre_lock);
4670                 }
4671         }
4672 }
4673
4674 int
4675 vdev_raidz_attach_check(vdev_t *new_child)
4676 {
4677         vdev_t *raidvd = new_child->vdev_parent;
4678         uint64_t new_children = raidvd->vdev_children;
4679
4680         /*
4681          * We use the "boot" space as scratch space to handle overwriting the
4682          * initial part of the vdev.  If it is too small, then this expansion
4683          * is not allowed.  This would be very unusual (e.g. ashift > 13 and
4684          * >200 children).
4685          */
4686         if (new_children << raidvd->vdev_ashift > VDEV_BOOT_SIZE) {
4687                 return (EINVAL);
4688         }
4689         return (0);
4690 }
4691
4692 void
4693 vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx)
4694 {
4695         vdev_t *new_child = arg;
4696         spa_t *spa = new_child->vdev_spa;
4697         vdev_t *raidvd = new_child->vdev_parent;
4698         vdev_raidz_t *vdrz = raidvd->vdev_tsd;
4699         ASSERT3P(raidvd->vdev_ops, ==, &vdev_raidz_ops);
4700         ASSERT3P(raidvd->vdev_top, ==, raidvd);
4701         ASSERT3U(raidvd->vdev_children, >, vdrz->vd_original_width);
4702         ASSERT3U(raidvd->vdev_children, ==, vdrz->vd_physical_width + 1);
4703         ASSERT3P(raidvd->vdev_child[raidvd->vdev_children - 1], ==,
4704             new_child);
4705
4706         spa_feature_incr(spa, SPA_FEATURE_RAIDZ_EXPANSION, tx);
4707
4708         vdrz->vd_physical_width++;
4709
4710         VERIFY0(spa->spa_uberblock.ub_raidz_reflow_info);
4711         vdrz->vn_vre.vre_vdev_id = raidvd->vdev_id;
4712         vdrz->vn_vre.vre_offset = 0;
4713         vdrz->vn_vre.vre_failed_offset = UINT64_MAX;
4714         spa->spa_raidz_expand = &vdrz->vn_vre;
4715         zthr_wakeup(spa->spa_raidz_expand_zthr);
4716
4717         /*
4718          * Dirty the config so that ZPOOL_CONFIG_RAIDZ_EXPANDING will get
4719          * written to the config.
4720          */
4721         vdev_config_dirty(raidvd);
4722
4723         vdrz->vn_vre.vre_start_time = gethrestime_sec();
4724         vdrz->vn_vre.vre_end_time = 0;
4725         vdrz->vn_vre.vre_state = DSS_SCANNING;
4726         vdrz->vn_vre.vre_bytes_copied = 0;
4727
4728         uint64_t state = vdrz->vn_vre.vre_state;
4729         VERIFY0(zap_update(spa->spa_meta_objset,
4730             raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
4731             sizeof (state), 1, &state, tx));
4732
4733         uint64_t start_time = vdrz->vn_vre.vre_start_time;
4734         VERIFY0(zap_update(spa->spa_meta_objset,
4735             raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME,
4736             sizeof (start_time), 1, &start_time, tx));
4737
4738         (void) zap_remove(spa->spa_meta_objset,
4739             raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, tx);
4740         (void) zap_remove(spa->spa_meta_objset,
4741             raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, tx);
4742
4743         spa_history_log_internal(spa, "raidz vdev expansion started",  tx,
4744             "%s vdev %llu new width %llu", spa_name(spa),
4745             (unsigned long long)raidvd->vdev_id,
4746             (unsigned long long)raidvd->vdev_children);
4747 }
4748
4749 int
4750 vdev_raidz_load(vdev_t *vd)
4751 {
4752         vdev_raidz_t *vdrz = vd->vdev_tsd;
4753         int err;
4754
4755         uint64_t state = DSS_NONE;
4756         uint64_t start_time = 0;
4757         uint64_t end_time = 0;
4758         uint64_t bytes_copied = 0;
4759
4760         if (vd->vdev_top_zap != 0) {
4761                 err = zap_lookup(vd->vdev_spa->spa_meta_objset,
4762                     vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
4763                     sizeof (state), 1, &state);
4764                 if (err != 0 && err != ENOENT)
4765                         return (err);
4766
4767                 err = zap_lookup(vd->vdev_spa->spa_meta_objset,
4768                     vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME,
4769                     sizeof (start_time), 1, &start_time);
4770                 if (err != 0 && err != ENOENT)
4771                         return (err);
4772
4773                 err = zap_lookup(vd->vdev_spa->spa_meta_objset,
4774                     vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME,
4775                     sizeof (end_time), 1, &end_time);
4776                 if (err != 0 && err != ENOENT)
4777                         return (err);
4778
4779                 err = zap_lookup(vd->vdev_spa->spa_meta_objset,
4780                     vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED,
4781                     sizeof (bytes_copied), 1, &bytes_copied);
4782                 if (err != 0 && err != ENOENT)
4783                         return (err);
4784         }
4785
4786         /*
4787          * If we are in the middle of expansion, vre_state should have
4788          * already been set by vdev_raidz_init().
4789          */
4790         EQUIV(vdrz->vn_vre.vre_state == DSS_SCANNING, state == DSS_SCANNING);
4791         vdrz->vn_vre.vre_state = (dsl_scan_state_t)state;
4792         vdrz->vn_vre.vre_start_time = start_time;
4793         vdrz->vn_vre.vre_end_time = end_time;
4794         vdrz->vn_vre.vre_bytes_copied = bytes_copied;
4795
4796         return (0);
4797 }
4798
4799 int
4800 spa_raidz_expand_get_stats(spa_t *spa, pool_raidz_expand_stat_t *pres)
4801 {
4802         vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4803
4804         if (vre == NULL) {
4805                 /* no removal in progress; find most recent completed */
4806                 for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) {
4807                         vdev_t *vd = spa->spa_root_vdev->vdev_child[c];
4808                         if (vd->vdev_ops == &vdev_raidz_ops) {
4809                                 vdev_raidz_t *vdrz = vd->vdev_tsd;
4810
4811                                 if (vdrz->vn_vre.vre_end_time != 0 &&
4812                                     (vre == NULL ||
4813                                     vdrz->vn_vre.vre_end_time >
4814                                     vre->vre_end_time)) {
4815                                         vre = &vdrz->vn_vre;
4816                                 }
4817                         }
4818                 }
4819         }
4820
4821         if (vre == NULL) {
4822                 return (SET_ERROR(ENOENT));
4823         }
4824
4825         pres->pres_state = vre->vre_state;
4826         pres->pres_expanding_vdev = vre->vre_vdev_id;
4827
4828         vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
4829         pres->pres_to_reflow = vd->vdev_stat.vs_alloc;
4830
4831         mutex_enter(&vre->vre_lock);
4832         pres->pres_reflowed = vre->vre_bytes_copied;
4833         for (int i = 0; i < TXG_SIZE; i++)
4834                 pres->pres_reflowed += vre->vre_bytes_copied_pertxg[i];
4835         mutex_exit(&vre->vre_lock);
4836
4837         pres->pres_start_time = vre->vre_start_time;
4838         pres->pres_end_time = vre->vre_end_time;
4839         pres->pres_waiting_for_resilver = vre->vre_waiting_for_resilver;
4840
4841         return (0);
4842 }
4843
4844 /*
4845  * Initialize private RAIDZ specific fields from the nvlist.
4846  */
4847 static int
4848 vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd)
4849 {
4850         uint_t children;
4851         nvlist_t **child;
4852         int error = nvlist_lookup_nvlist_array(nv,
4853             ZPOOL_CONFIG_CHILDREN, &child, &children);
4854         if (error != 0)
4855                 return (SET_ERROR(EINVAL));
4856
4857         uint64_t nparity;
4858         if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) {
4859                 if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
4860                         return (SET_ERROR(EINVAL));
4861
4862                 /*
4863                  * Previous versions could only support 1 or 2 parity
4864                  * device.
4865                  */
4866                 if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2)
4867                         return (SET_ERROR(EINVAL));
4868                 else if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3)
4869                         return (SET_ERROR(EINVAL));
4870         } else {
4871                 /*
4872                  * We require the parity to be specified for SPAs that
4873                  * support multiple parity levels.
4874                  */
4875                 if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
4876                         return (SET_ERROR(EINVAL));
4877
4878                 /*
4879                  * Otherwise, we default to 1 parity device for RAID-Z.
4880                  */
4881                 nparity = 1;
4882         }
4883
4884         vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP);
4885         vdrz->vn_vre.vre_vdev_id = -1;
4886         vdrz->vn_vre.vre_offset = UINT64_MAX;
4887         vdrz->vn_vre.vre_failed_offset = UINT64_MAX;
4888         mutex_init(&vdrz->vn_vre.vre_lock, NULL, MUTEX_DEFAULT, NULL);
4889         cv_init(&vdrz->vn_vre.vre_cv, NULL, CV_DEFAULT, NULL);
4890         zfs_rangelock_init(&vdrz->vn_vre.vre_rangelock, NULL, NULL);
4891         mutex_init(&vdrz->vd_expand_lock, NULL, MUTEX_DEFAULT, NULL);
4892         avl_create(&vdrz->vd_expand_txgs, vdev_raidz_reflow_compare,
4893             sizeof (reflow_node_t), offsetof(reflow_node_t, re_link));
4894
4895         vdrz->vd_physical_width = children;
4896         vdrz->vd_nparity = nparity;
4897
4898         /* note, the ID does not exist when creating a pool */
4899         (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID,
4900             &vdrz->vn_vre.vre_vdev_id);
4901
4902         boolean_t reflow_in_progress =
4903             nvlist_exists(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING);
4904         if (reflow_in_progress) {
4905                 spa->spa_raidz_expand = &vdrz->vn_vre;
4906                 vdrz->vn_vre.vre_state = DSS_SCANNING;
4907         }
4908
4909         vdrz->vd_original_width = children;
4910         uint64_t *txgs;
4911         unsigned int txgs_size = 0;
4912         error = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS,
4913             &txgs, &txgs_size);
4914         if (error == 0) {
4915                 for (int i = 0; i < txgs_size; i++) {
4916                         reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP);
4917                         re->re_txg = txgs[txgs_size - i - 1];
4918                         re->re_logical_width = vdrz->vd_physical_width - i;
4919
4920                         if (reflow_in_progress)
4921                                 re->re_logical_width--;
4922
4923                         avl_add(&vdrz->vd_expand_txgs, re);
4924                 }
4925
4926                 vdrz->vd_original_width = vdrz->vd_physical_width - txgs_size;
4927         }
4928         if (reflow_in_progress) {
4929                 vdrz->vd_original_width--;
4930                 zfs_dbgmsg("reflow_in_progress, %u wide, %d prior expansions",
4931                     children, txgs_size);
4932         }
4933
4934         *tsd = vdrz;
4935
4936         return (0);
4937 }
4938
4939 static void
4940 vdev_raidz_fini(vdev_t *vd)
4941 {
4942         vdev_raidz_t *vdrz = vd->vdev_tsd;
4943         if (vd->vdev_spa->spa_raidz_expand == &vdrz->vn_vre)
4944                 vd->vdev_spa->spa_raidz_expand = NULL;
4945         reflow_node_t *re;
4946         void *cookie = NULL;
4947         avl_tree_t *tree = &vdrz->vd_expand_txgs;
4948         while ((re = avl_destroy_nodes(tree, &cookie)) != NULL)
4949                 kmem_free(re, sizeof (*re));
4950         avl_destroy(&vdrz->vd_expand_txgs);
4951         mutex_destroy(&vdrz->vd_expand_lock);
4952         mutex_destroy(&vdrz->vn_vre.vre_lock);
4953         cv_destroy(&vdrz->vn_vre.vre_cv);
4954         zfs_rangelock_fini(&vdrz->vn_vre.vre_rangelock);
4955         kmem_free(vdrz, sizeof (*vdrz));
4956 }
4957
4958 /*
4959  * Add RAIDZ specific fields to the config nvlist.
4960  */
4961 static void
4962 vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv)
4963 {
4964         ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops);
4965         vdev_raidz_t *vdrz = vd->vdev_tsd;
4966
4967         /*
4968          * Make sure someone hasn't managed to sneak a fancy new vdev
4969          * into a crufty old storage pool.
4970          */
4971         ASSERT(vdrz->vd_nparity == 1 ||
4972             (vdrz->vd_nparity <= 2 &&
4973             spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ2) ||
4974             (vdrz->vd_nparity <= 3 &&
4975             spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ3));
4976
4977         /*
4978          * Note that we'll add these even on storage pools where they
4979          * aren't strictly required -- older software will just ignore
4980          * it.
4981          */
4982         fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity);
4983
4984         if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
4985                 fnvlist_add_boolean(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING);
4986         }
4987
4988         mutex_enter(&vdrz->vd_expand_lock);
4989         if (!avl_is_empty(&vdrz->vd_expand_txgs)) {
4990                 uint64_t count = avl_numnodes(&vdrz->vd_expand_txgs);
4991                 uint64_t *txgs = kmem_alloc(sizeof (uint64_t) * count,
4992                     KM_SLEEP);
4993                 uint64_t i = 0;
4994
4995                 for (reflow_node_t *re = avl_first(&vdrz->vd_expand_txgs);
4996                     re != NULL; re = AVL_NEXT(&vdrz->vd_expand_txgs, re)) {
4997                         txgs[i++] = re->re_txg;
4998                 }
4999
5000                 fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS,
5001                     txgs, count);
5002
5003                 kmem_free(txgs, sizeof (uint64_t) * count);
5004         }
5005         mutex_exit(&vdrz->vd_expand_lock);
5006 }
5007
5008 static uint64_t
5009 vdev_raidz_nparity(vdev_t *vd)
5010 {
5011         vdev_raidz_t *vdrz = vd->vdev_tsd;
5012         return (vdrz->vd_nparity);
5013 }
5014
5015 static uint64_t
5016 vdev_raidz_ndisks(vdev_t *vd)
5017 {
5018         return (vd->vdev_children);
5019 }
5020
5021 vdev_ops_t vdev_raidz_ops = {
5022         .vdev_op_init = vdev_raidz_init,
5023         .vdev_op_fini = vdev_raidz_fini,
5024         .vdev_op_open = vdev_raidz_open,
5025         .vdev_op_close = vdev_raidz_close,
5026         .vdev_op_asize = vdev_raidz_asize,
5027         .vdev_op_min_asize = vdev_raidz_min_asize,
5028         .vdev_op_min_alloc = NULL,
5029         .vdev_op_io_start = vdev_raidz_io_start,
5030         .vdev_op_io_done = vdev_raidz_io_done,
5031         .vdev_op_state_change = vdev_raidz_state_change,
5032         .vdev_op_need_resilver = vdev_raidz_need_resilver,
5033         .vdev_op_hold = NULL,
5034         .vdev_op_rele = NULL,
5035         .vdev_op_remap = NULL,
5036         .vdev_op_xlate = vdev_raidz_xlate,
5037         .vdev_op_rebuild_asize = NULL,
5038         .vdev_op_metaslab_init = NULL,
5039         .vdev_op_config_generate = vdev_raidz_config_generate,
5040         .vdev_op_nparity = vdev_raidz_nparity,
5041         .vdev_op_ndisks = vdev_raidz_ndisks,
5042         .vdev_op_type = VDEV_TYPE_RAIDZ,        /* name of this vdev type */
5043         .vdev_op_leaf = B_FALSE                 /* not a leaf vdev */
5044 };
5045
5046 /* BEGIN CSTYLED */
5047 ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_reflow_bytes, ULONG, ZMOD_RW,
5048         "For testing, pause RAIDZ expansion after reflowing this many bytes");
5049 ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_copy_bytes, ULONG, ZMOD_RW,
5050         "Max amount of concurrent i/o for RAIDZ expansion");
5051 ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW,
5052         "For expanded RAIDZ, aggregate reads that have more rows than this");
5053 ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW,
5054         "For expanded RAIDZ, automatically start a pool scrub when expansion "
5055         "completes");
5056 /* END CSTYLED */