module/zfs/vdev_raidz.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
  25  * Copyright (c) 2016 Gvozden Nešković. All rights reserved.
  26  */
  27
  28 #include <sys/zfs_context.h>
  29 #include <sys/spa.h>
  30 #include <sys/vdev_impl.h>
  31 #include <sys/zio.h>
  32 #include <sys/zio_checksum.h>
  33 #include <sys/abd.h>
  34 #include <sys/fs/zfs.h>
  35 #include <sys/fm/fs/zfs.h>
  36 #include <sys/vdev_raidz.h>
  37 #include <sys/vdev_raidz_impl.h>
  38 #include <sys/vdev_draid.h>
  39
  40 #ifdef ZFS_DEBUG
  41 #include <sys/vdev.h>   /* For vdev_xlate() in vdev_raidz_io_verify() */
  42 #endif
  43
  44 /*
  45  * Virtual device vector for RAID-Z.
  46  *
  47  * This vdev supports single, double, and triple parity. For single parity,
  48  * we use a simple XOR of all the data columns. For double or triple parity,
  49  * we use a special case of Reed-Solomon coding. This extends the
  50  * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
  51  * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
  52  * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
  53  * former is also based. The latter is designed to provide higher performance
  54  * for writes.
  55  *
  56  * Note that the Plank paper claimed to support arbitrary N+M, but was then
  57  * amended six years later identifying a critical flaw that invalidates its
  58  * claims. Nevertheless, the technique can be adapted to work for up to
  59  * triple parity. For additional parity, the amendment "Note: Correction to
  60  * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
  61  * is viable, but the additional complexity means that write performance will
  62  * suffer.
  63  *
 * All of the methods above operate on a Galois field with 2^N elements. In
 * our case we choose N=8, giving GF(2^8), so that all elements can be
 * expressed with a single byte. Briefly, the operations on the field are
 * defined as follows:
  68  *
  69  *   o addition (+) is represented by a bitwise XOR
  70  *   o subtraction (-) is therefore identical to addition: A + B = A - B
  71  *   o multiplication of A by 2 is defined by the following bitwise expression:
  72  *
  73  *      (A * 2)_7 = A_6
  74  *      (A * 2)_6 = A_5
  75  *      (A * 2)_5 = A_4
  76  *      (A * 2)_4 = A_3 + A_7
  77  *      (A * 2)_3 = A_2 + A_7
  78  *      (A * 2)_2 = A_1 + A_7
  79  *      (A * 2)_1 = A_0
  80  *      (A * 2)_0 = A_7
  81  *
  82  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
  83  * As an aside, this multiplication is derived from the error correcting
  84  * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
  85  *
  86  * Observe that any number in the field (except for 0) can be expressed as a
  87  * power of 2 -- a generator for the field. We store a table of the powers of
  88  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
  89  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
  90  * than field addition). The inverse of a field element A (A^-1) is therefore
  91  * A ^ (255 - 1) = A^254.
  92  *
  93  * The up-to-three parity columns, P, Q, R over several data columns,
  94  * D_0, ... D_n-1, can be expressed by field operations:
  95  *
  96  *      P = D_0 + D_1 + ... + D_n-2 + D_n-1
  97  *      Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
  98  *        = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
  99  *      R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
 100  *        = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
 101  *
 102  * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
 103  * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
 104  * independent coefficients. (There are no additional coefficients that have
 105  * this property which is why the uncorrected Plank method breaks down.)
 106  *
 * See the reconstruction code below for how P, Q and R can be used
 * individually or in concert to recover missing data columns.
 109  */
 110
 111 #define VDEV_RAIDZ_P            0
 112 #define VDEV_RAIDZ_Q            1
 113 #define VDEV_RAIDZ_R            2
 114
 115 #define VDEV_RAIDZ_MUL_2(x)     (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
 116 #define VDEV_RAIDZ_MUL_4(x)     (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
 117
 118 /*
 119  * We provide a mechanism to perform the field multiplication operation on a
 120  * 64-bit value all at once rather than a byte at a time. This works by
 121  * creating a mask from the top bit in each byte and using that to
 122  * conditionally apply the XOR of 0x1d.
 123  */
 124 #define VDEV_RAIDZ_64MUL_2(x, mask) \
 125 { \
 126         (mask) = (x) & 0x8080808080808080ULL; \
 127         (mask) = ((mask) << 1) - ((mask) >> 7); \
 128         (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
 129             ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
 130 }
 131
 132 #define VDEV_RAIDZ_64MUL_4(x, mask) \
 133 { \
 134         VDEV_RAIDZ_64MUL_2((x), mask); \
 135         VDEV_RAIDZ_64MUL_2((x), mask); \
 136 }
 137
 138 static void
 139 vdev_raidz_row_free(raidz_row_t *rr)
 140 {
 141         for (int c = 0; c < rr->rr_cols; c++) {
 142                 raidz_col_t *rc = &rr->rr_col[c];
 143
 144                 if (rc->rc_size != 0)
 145                         abd_free(rc->rc_abd);
 146                 if (rc->rc_orig_data != NULL)
 147                         abd_free(rc->rc_orig_data);
 148         }
 149
 150         if (rr->rr_abd_empty != NULL)
 151                 abd_free(rr->rr_abd_empty);
 152
 153         kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_scols]));
 154 }
 155
 156 void
 157 vdev_raidz_map_free(raidz_map_t *rm)
 158 {
 159         for (int i = 0; i < rm->rm_nrows; i++)
 160                 vdev_raidz_row_free(rm->rm_row[i]);
 161
 162         kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows]));
 163 }
 164
 165 static void
 166 vdev_raidz_map_free_vsd(zio_t *zio)
 167 {
 168         raidz_map_t *rm = zio->io_vsd;
 169
 170         vdev_raidz_map_free(rm);
 171 }
 172
 173 const zio_vsd_ops_t vdev_raidz_vsd_ops = {
 174         .vsd_free = vdev_raidz_map_free_vsd,
 175 };
 176
 177 /*
 178  * Divides the IO evenly across all child vdevs; usually, dcols is
 179  * the number of children in the target vdev.
 180  *
 181  * Avoid inlining the function to keep vdev_raidz_io_start(), which
 182  * is this functions only caller, as small as possible on the stack.
 183  */
 184 noinline raidz_map_t *
 185 vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
 186     uint64_t nparity)
 187 {
 188         raidz_row_t *rr;
 189         /* The starting RAIDZ (parent) vdev sector of the block. */
 190         uint64_t b = zio->io_offset >> ashift;
 191         /* The zio's size in units of the vdev's minimum sector size. */
 192         uint64_t s = zio->io_size >> ashift;
 193         /* The first column for this stripe. */
 194         uint64_t f = b % dcols;
 195         /* The starting byte offset on each child vdev. */
 196         uint64_t o = (b / dcols) << ashift;
 197         uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
 198
 199         raidz_map_t *rm =
 200             kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP);
 201         rm->rm_nrows = 1;
 202
 203         /*
 204          * "Quotient": The number of data sectors for this stripe on all but
 205          * the "big column" child vdevs that also contain "remainder" data.
 206          */
 207         q = s / (dcols - nparity);
 208
 209         /*
 210          * "Remainder": The number of partial stripe data sectors in this I/O.
 211          * This will add a sector to some, but not all, child vdevs.
 212          */
 213         r = s - q * (dcols - nparity);
 214
 215         /* The number of "big columns" - those which contain remainder data. */
 216         bc = (r == 0 ? 0 : r + nparity);
 217
 218         /*
 219          * The total number of data and parity sectors associated with
 220          * this I/O.
 221          */
 222         tot = s + nparity * (q + (r == 0 ? 0 : 1));
 223
 224         /*
 225          * acols: The columns that will be accessed.
 226          * scols: The columns that will be accessed or skipped.
 227          */
 228         if (q == 0) {
 229                 /* Our I/O request doesn't span all child vdevs. */
 230                 acols = bc;
 231                 scols = MIN(dcols, roundup(bc, nparity + 1));
 232         } else {
 233                 acols = dcols;
 234                 scols = dcols;
 235         }
 236
 237         ASSERT3U(acols, <=, scols);
 238
 239         rr = kmem_alloc(offsetof(raidz_row_t, rr_col[scols]), KM_SLEEP);
 240         rm->rm_row[0] = rr;
 241
 242         rr->rr_cols = acols;
 243         rr->rr_scols = scols;
 244         rr->rr_bigcols = bc;
 245         rr->rr_missingdata = 0;
 246         rr->rr_missingparity = 0;
 247         rr->rr_firstdatacol = nparity;
 248         rr->rr_abd_empty = NULL;
 249         rr->rr_nempty = 0;
 250 #ifdef ZFS_DEBUG
 251         rr->rr_offset = zio->io_offset;
 252         rr->rr_size = zio->io_size;
 253 #endif
 254
 255         asize = 0;
 256
 257         for (c = 0; c < scols; c++) {
 258                 raidz_col_t *rc = &rr->rr_col[c];
 259                 col = f + c;
 260                 coff = o;
 261                 if (col >= dcols) {
 262                         col -= dcols;
 263                         coff += 1ULL << ashift;
 264                 }
 265                 rc->rc_devidx = col;
 266                 rc->rc_offset = coff;
 267                 rc->rc_abd = NULL;
 268                 rc->rc_orig_data = NULL;
 269                 rc->rc_error = 0;
 270                 rc->rc_tried = 0;
 271                 rc->rc_skipped = 0;
 272                 rc->rc_force_repair = 0;
 273                 rc->rc_allow_repair = 1;
 274                 rc->rc_need_orig_restore = B_FALSE;
 275
 276                 if (c >= acols)
 277                         rc->rc_size = 0;
 278                 else if (c < bc)
 279                         rc->rc_size = (q + 1) << ashift;
 280                 else
 281                         rc->rc_size = q << ashift;
 282
 283                 asize += rc->rc_size;
 284         }
 285
 286         ASSERT3U(asize, ==, tot << ashift);
 287         rm->rm_nskip = roundup(tot, nparity + 1) - tot;
 288         rm->rm_skipstart = bc;
 289
 290         for (c = 0; c < rr->rr_firstdatacol; c++)
 291                 rr->rr_col[c].rc_abd =
 292                     abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE);
 293
 294         for (uint64_t off = 0; c < acols; c++) {
 295                 raidz_col_t *rc = &rr->rr_col[c];
 296                 rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
 297                     zio->io_abd, off, rc->rc_size);
 298                 off += rc->rc_size;
 299         }
 300
 301         /*
 302          * If all data stored spans all columns, there's a danger that parity
 303          * will always be on the same device and, since parity isn't read
 304          * during normal operation, that device's I/O bandwidth won't be
 305          * used effectively. We therefore switch the parity every 1MB.
 306          *
 307          * ... at least that was, ostensibly, the theory. As a practical
 308          * matter unless we juggle the parity between all devices evenly, we
 309          * won't see any benefit. Further, occasional writes that aren't a
 310          * multiple of the LCM of the number of children and the minimum
 311          * stripe width are sufficient to avoid pessimal behavior.
 312          * Unfortunately, this decision created an implicit on-disk format
 313          * requirement that we need to support for all eternity, but only
 314          * for single-parity RAID-Z.
 315          *
 316          * If we intend to skip a sector in the zeroth column for padding
 317          * we must make sure to note this swap. We will never intend to
 318          * skip the first column since at least one data and one parity
 319          * column must appear in each row.
 320          */
 321         ASSERT(rr->rr_cols >= 2);
 322         ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
 323
 324         if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
 325                 devidx = rr->rr_col[0].rc_devidx;
 326                 o = rr->rr_col[0].rc_offset;
 327                 rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
 328                 rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
 329                 rr->rr_col[1].rc_devidx = devidx;
 330                 rr->rr_col[1].rc_offset = o;
 331
 332                 if (rm->rm_skipstart == 0)
 333                         rm->rm_skipstart = 1;
 334         }
 335
 336         /* init RAIDZ parity ops */
 337         rm->rm_ops = vdev_raidz_math_get_ops();
 338
 339         return (rm);
 340 }
 341
 342 struct pqr_struct {
 343         uint64_t *p;
 344         uint64_t *q;
 345         uint64_t *r;
 346 };
 347
 348 static int
 349 vdev_raidz_p_func(void *buf, size_t size, void *private)
 350 {
 351         struct pqr_struct *pqr = private;
 352         const uint64_t *src = buf;
 353         int i, cnt = size / sizeof (src[0]);
 354
 355         ASSERT(pqr->p && !pqr->q && !pqr->r);
 356
 357         for (i = 0; i < cnt; i++, src++, pqr->p++)
 358                 *pqr->p ^= *src;
 359
 360         return (0);
 361 }
 362
 363 static int
 364 vdev_raidz_pq_func(void *buf, size_t size, void *private)
 365 {
 366         struct pqr_struct *pqr = private;
 367         const uint64_t *src = buf;
 368         uint64_t mask;
 369         int i, cnt = size / sizeof (src[0]);
 370
 371         ASSERT(pqr->p && pqr->q && !pqr->r);
 372
 373         for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
 374                 *pqr->p ^= *src;
 375                 VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
 376                 *pqr->q ^= *src;
 377         }
 378
 379         return (0);
 380 }
 381
 382 static int
 383 vdev_raidz_pqr_func(void *buf, size_t size, void *private)
 384 {
 385         struct pqr_struct *pqr = private;
 386         const uint64_t *src = buf;
 387         uint64_t mask;
 388         int i, cnt = size / sizeof (src[0]);
 389
 390         ASSERT(pqr->p && pqr->q && pqr->r);
 391
 392         for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
 393                 *pqr->p ^= *src;
 394                 VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
 395                 *pqr->q ^= *src;
 396                 VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
 397                 *pqr->r ^= *src;
 398         }
 399
 400         return (0);
 401 }
 402
 403 static void
 404 vdev_raidz_generate_parity_p(raidz_row_t *rr)
 405 {
 406         uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
 407
 408         for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
 409                 abd_t *src = rr->rr_col[c].rc_abd;
 410
 411                 if (c == rr->rr_firstdatacol) {
 412                         abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
 413                 } else {
 414                         struct pqr_struct pqr = { p, NULL, NULL };
 415                         (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
 416                             vdev_raidz_p_func, &pqr);
 417                 }
 418         }
 419 }
 420
 421 static void
 422 vdev_raidz_generate_parity_pq(raidz_row_t *rr)
 423 {
 424         uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
 425         uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
 426         uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
 427         ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
 428             rr->rr_col[VDEV_RAIDZ_Q].rc_size);
 429
 430         for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
 431                 abd_t *src = rr->rr_col[c].rc_abd;
 432
 433                 uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);
 434
 435                 if (c == rr->rr_firstdatacol) {
 436                         ASSERT(ccnt == pcnt || ccnt == 0);
 437                         abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
 438                         (void) memcpy(q, p, rr->rr_col[c].rc_size);
 439
 440                         for (uint64_t i = ccnt; i < pcnt; i++) {
 441                                 p[i] = 0;
 442                                 q[i] = 0;
 443                         }
 444                 } else {
 445                         struct pqr_struct pqr = { p, q, NULL };
 446
 447                         ASSERT(ccnt <= pcnt);
 448                         (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
 449                             vdev_raidz_pq_func, &pqr);
 450
 451                         /*
 452                          * Treat short columns as though they are full of 0s.
 453                          * Note that there's therefore nothing needed for P.
 454                          */
 455                         uint64_t mask;
 456                         for (uint64_t i = ccnt; i < pcnt; i++) {
 457                                 VDEV_RAIDZ_64MUL_2(q[i], mask);
 458                         }
 459                 }
 460         }
 461 }
 462
 463 static void
 464 vdev_raidz_generate_parity_pqr(raidz_row_t *rr)
 465 {
 466         uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
 467         uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
 468         uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd);
 469         uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
 470         ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
 471             rr->rr_col[VDEV_RAIDZ_Q].rc_size);
 472         ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
 473             rr->rr_col[VDEV_RAIDZ_R].rc_size);
 474
 475         for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
 476                 abd_t *src = rr->rr_col[c].rc_abd;
 477
 478                 uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);
 479
 480                 if (c == rr->rr_firstdatacol) {
 481                         ASSERT(ccnt == pcnt || ccnt == 0);
 482                         abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
 483                         (void) memcpy(q, p, rr->rr_col[c].rc_size);
 484                         (void) memcpy(r, p, rr->rr_col[c].rc_size);
 485
 486                         for (uint64_t i = ccnt; i < pcnt; i++) {
 487                                 p[i] = 0;
 488                                 q[i] = 0;
 489                                 r[i] = 0;
 490                         }
 491                 } else {
 492                         struct pqr_struct pqr = { p, q, r };
 493
 494                         ASSERT(ccnt <= pcnt);
 495                         (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
 496                             vdev_raidz_pqr_func, &pqr);
 497
 498                         /*
 499                          * Treat short columns as though they are full of 0s.
 500                          * Note that there's therefore nothing needed for P.
 501                          */
 502                         uint64_t mask;
 503                         for (uint64_t i = ccnt; i < pcnt; i++) {
 504                                 VDEV_RAIDZ_64MUL_2(q[i], mask);
 505                                 VDEV_RAIDZ_64MUL_4(r[i], mask);
 506                         }
 507                 }
 508         }
 509 }
 510
 511 /*
 512  * Generate RAID parity in the first virtual columns according to the number of
 513  * parity columns available.
 514  */
 515 void
 516 vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr)
 517 {
 518         ASSERT3U(rr->rr_cols, !=, 0);
 519
 520         /* Generate using the new math implementation */
 521         if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL)
 522                 return;
 523
 524         switch (rr->rr_firstdatacol) {
 525         case 1:
 526                 vdev_raidz_generate_parity_p(rr);
 527                 break;
 528         case 2:
 529                 vdev_raidz_generate_parity_pq(rr);
 530                 break;
 531         case 3:
 532                 vdev_raidz_generate_parity_pqr(rr);
 533                 break;
 534         default:
 535                 cmn_err(CE_PANIC, "invalid RAID-Z configuration");
 536         }
 537 }
 538
 539 void
 540 vdev_raidz_generate_parity(raidz_map_t *rm)
 541 {
 542         for (int i = 0; i < rm->rm_nrows; i++) {
 543                 raidz_row_t *rr = rm->rm_row[i];
 544                 vdev_raidz_generate_parity_row(rm, rr);
 545         }
 546 }
 547
 548 /* ARGSUSED */
 549 static int
 550 vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
 551 {
 552         uint64_t *dst = dbuf;
 553         uint64_t *src = sbuf;
 554         int cnt = size / sizeof (src[0]);
 555
 556         for (int i = 0; i < cnt; i++) {
 557                 dst[i] ^= src[i];
 558         }
 559
 560         return (0);
 561 }
 562
 563 /* ARGSUSED */
 564 static int
 565 vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
 566     void *private)
 567 {
 568         uint64_t *dst = dbuf;
 569         uint64_t *src = sbuf;
 570         uint64_t mask;
 571         int cnt = size / sizeof (dst[0]);
 572
 573         for (int i = 0; i < cnt; i++, dst++, src++) {
 574                 VDEV_RAIDZ_64MUL_2(*dst, mask);
 575                 *dst ^= *src;
 576         }
 577
 578         return (0);
 579 }
 580
 581 /* ARGSUSED */
 582 static int
 583 vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
 584 {
 585         uint64_t *dst = buf;
 586         uint64_t mask;
 587         int cnt = size / sizeof (dst[0]);
 588
 589         for (int i = 0; i < cnt; i++, dst++) {
 590                 /* same operation as vdev_raidz_reconst_q_pre_func() on dst */
 591                 VDEV_RAIDZ_64MUL_2(*dst, mask);
 592         }
 593
 594         return (0);
 595 }
 596
 597 struct reconst_q_struct {
 598         uint64_t *q;
 599         int exp;
 600 };
 601
 602 static int
 603 vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
 604 {
 605         struct reconst_q_struct *rq = private;
 606         uint64_t *dst = buf;
 607         int cnt = size / sizeof (dst[0]);
 608
 609         for (int i = 0; i < cnt; i++, dst++, rq->q++) {
 610                 int j;
 611                 uint8_t *b;
 612
 613                 *dst ^= *rq->q;
 614                 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
 615                         *b = vdev_raidz_exp2(*b, rq->exp);
 616                 }
 617         }
 618
 619         return (0);
 620 }
 621
 622 struct reconst_pq_struct {
 623         uint8_t *p;
 624         uint8_t *q;
 625         uint8_t *pxy;
 626         uint8_t *qxy;
 627         int aexp;
 628         int bexp;
 629 };
 630
 631 static int
 632 vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
 633 {
 634         struct reconst_pq_struct *rpq = private;
 635         uint8_t *xd = xbuf;
 636         uint8_t *yd = ybuf;
 637
 638         for (int i = 0; i < size;
 639             i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
 640                 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
 641                     vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
 642                 *yd = *rpq->p ^ *rpq->pxy ^ *xd;
 643         }
 644
 645         return (0);
 646 }
 647
 648 static int
 649 vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
 650 {
 651         struct reconst_pq_struct *rpq = private;
 652         uint8_t *xd = xbuf;
 653
 654         for (int i = 0; i < size;
 655             i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
 656                 /* same operation as vdev_raidz_reconst_pq_func() on xd */
 657                 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
 658                     vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
 659         }
 660
 661         return (0);
 662 }
 663
 664 static void
 665 vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts)
 666 {
 667         int x = tgts[0];
 668         abd_t *dst, *src;
 669
 670         ASSERT3U(ntgts, ==, 1);
 671         ASSERT3U(x, >=, rr->rr_firstdatacol);
 672         ASSERT3U(x, <, rr->rr_cols);
 673
 674         ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size);
 675
 676         src = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
 677         dst = rr->rr_col[x].rc_abd;
 678
 679         abd_copy_from_buf(dst, abd_to_buf(src), rr->rr_col[x].rc_size);
 680
 681         for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
 682                 uint64_t size = MIN(rr->rr_col[x].rc_size,
 683                     rr->rr_col[c].rc_size);
 684
 685                 src = rr->rr_col[c].rc_abd;
 686
 687                 if (c == x)
 688                         continue;
 689
 690                 (void) abd_iterate_func2(dst, src, 0, 0, size,
 691                     vdev_raidz_reconst_p_func, NULL);
 692         }
 693 }
 694
 695 static void
 696 vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts)
 697 {
 698         int x = tgts[0];
 699         int c, exp;
 700         abd_t *dst, *src;
 701
 702         ASSERT(ntgts == 1);
 703
 704         ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size);
 705
 706         for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
 707                 uint64_t size = (c == x) ? 0 : MIN(rr->rr_col[x].rc_size,
 708                     rr->rr_col[c].rc_size);
 709
 710                 src = rr->rr_col[c].rc_abd;
 711                 dst = rr->rr_col[x].rc_abd;
 712
 713                 if (c == rr->rr_firstdatacol) {
 714                         abd_copy(dst, src, size);
 715                         if (rr->rr_col[x].rc_size > size) {
 716                                 abd_zero_off(dst, size,
 717                                     rr->rr_col[x].rc_size - size);
 718                         }
 719                 } else {
 720                         ASSERT3U(size, <=, rr->rr_col[x].rc_size);
 721                         (void) abd_iterate_func2(dst, src, 0, 0, size,
 722                             vdev_raidz_reconst_q_pre_func, NULL);
 723                         (void) abd_iterate_func(dst,
 724                             size, rr->rr_col[x].rc_size - size,
 725                             vdev_raidz_reconst_q_pre_tail_func, NULL);
 726                 }
 727         }
 728
 729         src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
 730         dst = rr->rr_col[x].rc_abd;
 731         exp = 255 - (rr->rr_cols - 1 - x);
 732
 733         struct reconst_q_struct rq = { abd_to_buf(src), exp };
 734         (void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size,
 735             vdev_raidz_reconst_q_post_func, &rq);
 736 }
 737
 738 static void
 739 vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts)
 740 {
 741         uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
 742         abd_t *pdata, *qdata;
 743         uint64_t xsize, ysize;
 744         int x = tgts[0];
 745         int y = tgts[1];
 746         abd_t *xd, *yd;
 747
 748         ASSERT(ntgts == 2);
 749         ASSERT(x < y);
 750         ASSERT(x >= rr->rr_firstdatacol);
 751         ASSERT(y < rr->rr_cols);
 752
 753         ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size);
 754
 755         /*
 756          * Move the parity data aside -- we're going to compute parity as
 757          * though columns x and y were full of zeros -- Pxy and Qxy. We want to
 758          * reuse the parity generation mechanism without trashing the actual
 759          * parity so we make those columns appear to be full of zeros by
 760          * setting their lengths to zero.
 761          */
 762         pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
 763         qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
 764         xsize = rr->rr_col[x].rc_size;
 765         ysize = rr->rr_col[y].rc_size;
 766
 767         rr->rr_col[VDEV_RAIDZ_P].rc_abd =
 768             abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
 769         rr->rr_col[VDEV_RAIDZ_Q].rc_abd =
 770             abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
 771         rr->rr_col[x].rc_size = 0;
 772         rr->rr_col[y].rc_size = 0;
 773
 774         vdev_raidz_generate_parity_pq(rr);
 775
 776         rr->rr_col[x].rc_size = xsize;
 777         rr->rr_col[y].rc_size = ysize;
 778
 779         p = abd_to_buf(pdata);
 780         q = abd_to_buf(qdata);
 781         pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
 782         qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
 783         xd = rr->rr_col[x].rc_abd;
 784         yd = rr->rr_col[y].rc_abd;
 785
 786         /*
 787          * We now have:
 788          *      Pxy = P + D_x + D_y
 789          *      Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
 790          *
 791          * We can then solve for D_x:
 792          *      D_x = A * (P + Pxy) + B * (Q + Qxy)
 793          * where
 794          *      A = 2^(x - y) * (2^(x - y) + 1)^-1
 795          *      B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
 796          *
 797          * With D_x in hand, we can easily solve for D_y:
 798          *      D_y = P + Pxy + D_x
 799          */
 800
 801         a = vdev_raidz_pow2[255 + x - y];
 802         b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)];
 803         tmp = 255 - vdev_raidz_log2[a ^ 1];
 804
 805         aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
 806         bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
 807
 808         ASSERT3U(xsize, >=, ysize);
 809         struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
 810
 811         (void) abd_iterate_func2(xd, yd, 0, 0, ysize,
 812             vdev_raidz_reconst_pq_func, &rpq);
 813         (void) abd_iterate_func(xd, ysize, xsize - ysize,
 814             vdev_raidz_reconst_pq_tail_func, &rpq);
 815
 816         abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
 817         abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
 818
 819         /*
 820          * Restore the saved parity data.
 821          */
 822         rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata;
 823         rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata;
 824 }
 825
 826 /* BEGIN CSTYLED */
 827 /*
 828  * In the general case of reconstruction, we must solve the system of linear
 829  * equations defined by the coefficients used to generate parity as well as
 830  * the contents of the data and parity disks. This can be expressed with
 831  * vectors for the original data (D) and the actual data (d) and parity (p)
 832  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 833  *
 834  *            __   __                     __     __
 835  *            |     |         __     __   |  p_0  |
 836  *            |  V  |         |  D_0  |   | p_m-1 |
 837  *            |     |    x    |   :   | = |  d_0  |
 838  *            |  I  |         | D_n-1 |   |   :   |
 839  *            |     |         ~~     ~~   | d_n-1 |
 840  *            ~~   ~~                     ~~     ~~
 841  *
 842  * I is simply a square identity matrix of size n, and V is a vandermonde
 843  * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen for simplicity and speedy
 * computation, as well as for linear separability.
 846  *
 847  *      __               __               __     __
 848  *      |   1   ..  1 1 1 |               |  p_0  |
 849  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
 850  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
 851  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
 852  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
 853  *      |   :       : : : |   |   :   |   |  d_2  |
 854  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
 855  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
 856  *      |   0   ..  0 0 1 |               | d_n-1 |
 857  *      ~~               ~~               ~~     ~~
 858  *
 859  * Note that I, V, d, and p are known. To compute D, we must invert the
 860  * matrix and use the known data and parity values to reconstruct the unknown
 861  * data values. We begin by removing the rows in V|I and d|p that correspond
 862  * to failed or missing columns; we then make V|I square (n x n) and d|p
 863  * sized n by removing rows corresponding to unused parity from the bottom up
 864  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 865  * using Gauss-Jordan elimination. In the example below we use m=3 parity
 866  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 867  *           __                               __
 868  *           |  1   1   1   1   1   1   1   1  |
 869  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
 870  *           |  19 205 116  29  64  16  4   1  |      / /
 871  *           |  1   0   0   0   0   0   0   0  |     / /
 872  *           |  0   1   0   0   0   0   0   0  | <--' /
 873  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
 874  *           |  0   0   0   1   0   0   0   0  |
 875  *           |  0   0   0   0   1   0   0   0  |
 876  *           |  0   0   0   0   0   1   0   0  |
 877  *           |  0   0   0   0   0   0   1   0  |
 878  *           |  0   0   0   0   0   0   0   1  |
 879  *           ~~                               ~~
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           |  19 205 116  29  64  16  4   1  |
 *           |  1   0   0   0   0   0   0   0  |
 *  (V|I)' = |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 893  *
 894  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 895  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 896  * matrix is not singular.
 897  * __                                                                 __
 898  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 899  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
 900  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 901  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 902  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 903  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 904  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 905  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 906  * ~~                                                                 ~~
 907  * __                                                                 __
 908  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 909  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 910  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
 911  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 912  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 913  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 914  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 915  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 916  * ~~                                                                 ~~
 917  * __                                                                 __
 918  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 919  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 920  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
 921  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 922  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 923  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 924  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 925  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 926  * ~~                                                                 ~~
 927  * __                                                                 __
 928  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 929  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 930  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
 931  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 932  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 933  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 934  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 935  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 936  * ~~                                                                 ~~
 937  * __                                                                 __
 938  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 939  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 940  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
 941  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 942  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 943  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 944  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 945  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 946  * ~~                                                                 ~~
 947  * __                                                                 __
 948  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 949  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
 950  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
 951  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 952  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 953  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 954  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 955  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 956  * ~~                                                                 ~~
 957  *                   __                               __
 958  *                   |  0   0   1   0   0   0   0   0  |
 959  *                   | 167 100  5   41 159 169 217 208 |
 960  *                   | 166 100  4   40 158 168 216 209 |
 961  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
 962  *                   |  0   0   0   0   1   0   0   0  |
 963  *                   |  0   0   0   0   0   1   0   0  |
 964  *                   |  0   0   0   0   0   0   1   0  |
 965  *                   |  0   0   0   0   0   0   0   1  |
 966  *                   ~~                               ~~
 967  *
 968  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 969  * of the missing data.
 970  *
 971  * As is apparent from the example above, the only non-trivial rows in the
 972  * inverse matrix correspond to the data disks that we're trying to
 973  * reconstruct. Indeed, those are the only rows we need as the others would
 974  * only be useful for reconstructing data known or assumed to be valid. For
 975  * that reason, we only build the coefficients in the rows that correspond to
 976  * targeted columns.
 977  */
 978 /* END CSTYLED */
 979
 980 static void
 981 vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map,
 982     uint8_t **rows)
 983 {
 984         int i, j;
 985         int pow;
 986
 987         ASSERT(n == rr->rr_cols - rr->rr_firstdatacol);
 988
 989         /*
 990          * Fill in the missing rows of interest.
 991          */
 992         for (i = 0; i < nmap; i++) {
 993                 ASSERT3S(0, <=, map[i]);
 994                 ASSERT3S(map[i], <=, 2);
 995
 996                 pow = map[i] * n;
 997                 if (pow > 255)
 998                         pow -= 255;
 999                 ASSERT(pow <= 255);
1000
1001                 for (j = 0; j < n; j++) {
1002                         pow -= map[i];
1003                         if (pow < 0)
1004                                 pow += 255;
1005                         rows[i][j] = vdev_raidz_pow2[pow];
1006                 }
1007         }
1008 }
1009
1010 static void
1011 vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing,
1012     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1013 {
1014         int i, j, ii, jj;
1015         uint8_t log;
1016
1017         /*
1018          * Assert that the first nmissing entries from the array of used
1019          * columns correspond to parity columns and that subsequent entries
1020          * correspond to data columns.
1021          */
1022         for (i = 0; i < nmissing; i++) {
1023                 ASSERT3S(used[i], <, rr->rr_firstdatacol);
1024         }
1025         for (; i < n; i++) {
1026                 ASSERT3S(used[i], >=, rr->rr_firstdatacol);
1027         }
1028
1029         /*
1030          * First initialize the storage where we'll compute the inverse rows.
1031          */
1032         for (i = 0; i < nmissing; i++) {
1033                 for (j = 0; j < n; j++) {
1034                         invrows[i][j] = (i == j) ? 1 : 0;
1035                 }
1036         }
1037
1038         /*
1039          * Subtract all trivial rows from the rows of consequence.
1040          */
1041         for (i = 0; i < nmissing; i++) {
1042                 for (j = nmissing; j < n; j++) {
1043                         ASSERT3U(used[j], >=, rr->rr_firstdatacol);
1044                         jj = used[j] - rr->rr_firstdatacol;
1045                         ASSERT3S(jj, <, n);
1046                         invrows[i][j] = rows[i][jj];
1047                         rows[i][jj] = 0;
1048                 }
1049         }
1050
1051         /*
1052          * For each of the rows of interest, we must normalize it and subtract
1053          * a multiple of it from the other rows.
1054          */
1055         for (i = 0; i < nmissing; i++) {
1056                 for (j = 0; j < missing[i]; j++) {
1057                         ASSERT0(rows[i][j]);
1058                 }
1059                 ASSERT3U(rows[i][missing[i]], !=, 0);
1060
1061                 /*
1062                  * Compute the inverse of the first element and multiply each
1063                  * element in the row by that value.
1064                  */
1065                 log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1066
1067                 for (j = 0; j < n; j++) {
1068                         rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1069                         invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1070                 }
1071
1072                 for (ii = 0; ii < nmissing; ii++) {
1073                         if (i == ii)
1074                                 continue;
1075
1076                         ASSERT3U(rows[ii][missing[i]], !=, 0);
1077
1078                         log = vdev_raidz_log2[rows[ii][missing[i]]];
1079
1080                         for (j = 0; j < n; j++) {
1081                                 rows[ii][j] ^=
1082                                     vdev_raidz_exp2(rows[i][j], log);
1083                                 invrows[ii][j] ^=
1084                                     vdev_raidz_exp2(invrows[i][j], log);
1085                         }
1086                 }
1087         }
1088
1089         /*
1090          * Verify that the data that is left in the rows are properly part of
1091          * an identity matrix.
1092          */
1093         for (i = 0; i < nmissing; i++) {
1094                 for (j = 0; j < n; j++) {
1095                         if (j == missing[i]) {
1096                                 ASSERT3U(rows[i][j], ==, 1);
1097                         } else {
1098                                 ASSERT0(rows[i][j]);
1099                         }
1100                 }
1101         }
1102 }
1103
1104 static void
1105 vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing,
1106     int *missing, uint8_t **invrows, const uint8_t *used)
1107 {
1108         int i, j, x, cc, c;
1109         uint8_t *src;
1110         uint64_t ccount;
1111         uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL };
1112         uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 };
1113         uint8_t log = 0;
1114         uint8_t val;
1115         int ll;
1116         uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1117         uint8_t *p, *pp;
1118         size_t psize;
1119
1120         psize = sizeof (invlog[0][0]) * n * nmissing;
1121         p = kmem_alloc(psize, KM_SLEEP);
1122
1123         for (pp = p, i = 0; i < nmissing; i++) {
1124                 invlog[i] = pp;
1125                 pp += n;
1126         }
1127
1128         for (i = 0; i < nmissing; i++) {
1129                 for (j = 0; j < n; j++) {
1130                         ASSERT3U(invrows[i][j], !=, 0);
1131                         invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1132                 }
1133         }
1134
1135         for (i = 0; i < n; i++) {
1136                 c = used[i];
1137                 ASSERT3U(c, <, rr->rr_cols);
1138
1139                 ccount = rr->rr_col[c].rc_size;
1140                 ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0);
1141                 if (ccount == 0)
1142                         continue;
1143                 src = abd_to_buf(rr->rr_col[c].rc_abd);
1144                 for (j = 0; j < nmissing; j++) {
1145                         cc = missing[j] + rr->rr_firstdatacol;
1146                         ASSERT3U(cc, >=, rr->rr_firstdatacol);
1147                         ASSERT3U(cc, <, rr->rr_cols);
1148                         ASSERT3U(cc, !=, c);
1149
1150                         dcount[j] = rr->rr_col[cc].rc_size;
1151                         if (dcount[j] != 0)
1152                                 dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd);
1153                 }
1154
1155                 for (x = 0; x < ccount; x++, src++) {
1156                         if (*src != 0)
1157                                 log = vdev_raidz_log2[*src];
1158
1159                         for (cc = 0; cc < nmissing; cc++) {
1160                                 if (x >= dcount[cc])
1161                                         continue;
1162
1163                                 if (*src == 0) {
1164                                         val = 0;
1165                                 } else {
1166                                         if ((ll = log + invlog[cc][i]) >= 255)
1167                                                 ll -= 255;
1168                                         val = vdev_raidz_pow2[ll];
1169                                 }
1170
1171                                 if (i == 0)
1172                                         dst[cc][x] = val;
1173                                 else
1174                                         dst[cc][x] ^= val;
1175                         }
1176                 }
1177         }
1178
1179         kmem_free(p, psize);
1180 }
1181
1182 static void
1183 vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts)
1184 {
1185         int n, i, c, t, tt;
1186         int nmissing_rows;
1187         int missing_rows[VDEV_RAIDZ_MAXPARITY];
1188         int parity_map[VDEV_RAIDZ_MAXPARITY];
1189         uint8_t *p, *pp;
1190         size_t psize;
1191         uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1192         uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1193         uint8_t *used;
1194
1195         abd_t **bufs = NULL;
1196
1197         /*
1198          * Matrix reconstruction can't use scatter ABDs yet, so we allocate
1199          * temporary linear ABDs if any non-linear ABDs are found.
1200          */
1201         for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) {
1202                 if (!abd_is_linear(rr->rr_col[i].rc_abd)) {
1203                         bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *),
1204                             KM_PUSHPAGE);
1205
1206                         for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1207                                 raidz_col_t *col = &rr->rr_col[c];
1208
1209                                 bufs[c] = col->rc_abd;
1210                                 if (bufs[c] != NULL) {
1211                                         col->rc_abd = abd_alloc_linear(
1212                                             col->rc_size, B_TRUE);
1213                                         abd_copy(col->rc_abd, bufs[c],
1214                                             col->rc_size);
1215                                 }
1216                         }
1217
1218                         break;
1219                 }
1220         }
1221
1222         n = rr->rr_cols - rr->rr_firstdatacol;
1223
1224         /*
1225          * Figure out which data columns are missing.
1226          */
1227         nmissing_rows = 0;
1228         for (t = 0; t < ntgts; t++) {
1229                 if (tgts[t] >= rr->rr_firstdatacol) {
1230                         missing_rows[nmissing_rows++] =
1231                             tgts[t] - rr->rr_firstdatacol;
1232                 }
1233         }
1234
1235         /*
1236          * Figure out which parity columns to use to help generate the missing
1237          * data columns.
1238          */
1239         for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1240                 ASSERT(tt < ntgts);
1241                 ASSERT(c < rr->rr_firstdatacol);
1242
1243                 /*
1244                  * Skip any targeted parity columns.
1245                  */
1246                 if (c == tgts[tt]) {
1247                         tt++;
1248                         continue;
1249                 }
1250
1251                 parity_map[i] = c;
1252                 i++;
1253         }
1254
1255         psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1256             nmissing_rows * n + sizeof (used[0]) * n;
1257         p = kmem_alloc(psize, KM_SLEEP);
1258
1259         for (pp = p, i = 0; i < nmissing_rows; i++) {
1260                 rows[i] = pp;
1261                 pp += n;
1262                 invrows[i] = pp;
1263                 pp += n;
1264         }
1265         used = pp;
1266
1267         for (i = 0; i < nmissing_rows; i++) {
1268                 used[i] = parity_map[i];
1269         }
1270
1271         for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1272                 if (tt < nmissing_rows &&
1273                     c == missing_rows[tt] + rr->rr_firstdatacol) {
1274                         tt++;
1275                         continue;
1276                 }
1277
1278                 ASSERT3S(i, <, n);
1279                 used[i] = c;
1280                 i++;
1281         }
1282
1283         /*
1284          * Initialize the interesting rows of the matrix.
1285          */
1286         vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows);
1287
1288         /*
1289          * Invert the matrix.
1290          */
1291         vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows,
1292             invrows, used);
1293
1294         /*
1295          * Reconstruct the missing data using the generated matrix.
1296          */
1297         vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows,
1298             invrows, used);
1299
1300         kmem_free(p, psize);
1301
1302         /*
1303          * copy back from temporary linear abds and free them
1304          */
1305         if (bufs) {
1306                 for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1307                         raidz_col_t *col = &rr->rr_col[c];
1308
1309                         if (bufs[c] != NULL) {
1310                                 abd_copy(bufs[c], col->rc_abd, col->rc_size);
1311                                 abd_free(col->rc_abd);
1312                         }
1313                         col->rc_abd = bufs[c];
1314                 }
1315                 kmem_free(bufs, rr->rr_cols * sizeof (abd_t *));
1316         }
1317 }
1318
1319 static void
1320 vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr,
1321     const int *t, int nt)
1322 {
1323         int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
1324         int ntgts;
1325         int i, c, ret;
1326         int nbadparity, nbaddata;
1327         int parity_valid[VDEV_RAIDZ_MAXPARITY];
1328
1329         nbadparity = rr->rr_firstdatacol;
1330         nbaddata = rr->rr_cols - nbadparity;
1331         ntgts = 0;
1332         for (i = 0, c = 0; c < rr->rr_cols; c++) {
1333                 if (c < rr->rr_firstdatacol)
1334                         parity_valid[c] = B_FALSE;
1335
1336                 if (i < nt && c == t[i]) {
1337                         tgts[ntgts++] = c;
1338                         i++;
1339                 } else if (rr->rr_col[c].rc_error != 0) {
1340                         tgts[ntgts++] = c;
1341                 } else if (c >= rr->rr_firstdatacol) {
1342                         nbaddata--;
1343                 } else {
1344                         parity_valid[c] = B_TRUE;
1345                         nbadparity--;
1346                 }
1347         }
1348
1349         ASSERT(ntgts >= nt);
1350         ASSERT(nbaddata >= 0);
1351         ASSERT(nbaddata + nbadparity == ntgts);
1352
1353         dt = &tgts[nbadparity];
1354
1355         /* Reconstruct using the new math implementation */
1356         ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata);
1357         if (ret != RAIDZ_ORIGINAL_IMPL)
1358                 return;
1359
1360         /*
1361          * See if we can use any of our optimized reconstruction routines.
1362          */
1363         switch (nbaddata) {
1364         case 1:
1365                 if (parity_valid[VDEV_RAIDZ_P]) {
1366                         vdev_raidz_reconstruct_p(rr, dt, 1);
1367                         return;
1368                 }
1369
1370                 ASSERT(rr->rr_firstdatacol > 1);
1371
1372                 if (parity_valid[VDEV_RAIDZ_Q]) {
1373                         vdev_raidz_reconstruct_q(rr, dt, 1);
1374                         return;
1375                 }
1376
1377                 ASSERT(rr->rr_firstdatacol > 2);
1378                 break;
1379
1380         case 2:
1381                 ASSERT(rr->rr_firstdatacol > 1);
1382
1383                 if (parity_valid[VDEV_RAIDZ_P] &&
1384                     parity_valid[VDEV_RAIDZ_Q]) {
1385                         vdev_raidz_reconstruct_pq(rr, dt, 2);
1386                         return;
1387                 }
1388
1389                 ASSERT(rr->rr_firstdatacol > 2);
1390
1391                 break;
1392         }
1393
1394         vdev_raidz_reconstruct_general(rr, tgts, ntgts);
1395 }
1396
1397 static int
1398 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1399     uint64_t *logical_ashift, uint64_t *physical_ashift)
1400 {
1401         vdev_raidz_t *vdrz = vd->vdev_tsd;
1402         uint64_t nparity = vdrz->vd_nparity;
1403         int c;
1404         int lasterror = 0;
1405         int numerrors = 0;
1406
1407         ASSERT(nparity > 0);
1408
1409         if (nparity > VDEV_RAIDZ_MAXPARITY ||
1410             vd->vdev_children < nparity + 1) {
1411                 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1412                 return (SET_ERROR(EINVAL));
1413         }
1414
1415         vdev_open_children(vd);
1416
1417         for (c = 0; c < vd->vdev_children; c++) {
1418                 vdev_t *cvd = vd->vdev_child[c];
1419
1420                 if (cvd->vdev_open_error != 0) {
1421                         lasterror = cvd->vdev_open_error;
1422                         numerrors++;
1423                         continue;
1424                 }
1425
1426                 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1427                 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1428                 *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
1429                 *physical_ashift = MAX(*physical_ashift,
1430                     cvd->vdev_physical_ashift);
1431         }
1432
1433         *asize *= vd->vdev_children;
1434         *max_asize *= vd->vdev_children;
1435
1436         if (numerrors > nparity) {
1437                 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1438                 return (lasterror);
1439         }
1440
1441         return (0);
1442 }
1443
1444 static void
1445 vdev_raidz_close(vdev_t *vd)
1446 {
1447         for (int c = 0; c < vd->vdev_children; c++) {
1448                 if (vd->vdev_child[c] != NULL)
1449                         vdev_close(vd->vdev_child[c]);
1450         }
1451 }
1452
1453 static uint64_t
1454 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1455 {
1456         vdev_raidz_t *vdrz = vd->vdev_tsd;
1457         uint64_t asize;
1458         uint64_t ashift = vd->vdev_top->vdev_ashift;
1459         uint64_t cols = vdrz->vd_logical_width;
1460         uint64_t nparity = vdrz->vd_nparity;
1461
1462         asize = ((psize - 1) >> ashift) + 1;
1463         asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1464         asize = roundup(asize, nparity + 1) << ashift;
1465
1466         return (asize);
1467 }
1468
1469 /*
1470  * The allocatable space for a raidz vdev is N * sizeof(smallest child)
1471  * so each child must provide at least 1/Nth of its asize.
1472  */
1473 static uint64_t
1474 vdev_raidz_min_asize(vdev_t *vd)
1475 {
1476         return ((vd->vdev_min_asize + vd->vdev_children - 1) /
1477             vd->vdev_children);
1478 }
1479
1480 void
1481 vdev_raidz_child_done(zio_t *zio)
1482 {
1483         raidz_col_t *rc = zio->io_private;
1484
1485         rc->rc_error = zio->io_error;
1486         rc->rc_tried = 1;
1487         rc->rc_skipped = 0;
1488 }
1489
1490 static void
1491 vdev_raidz_io_verify(vdev_t *vd, raidz_row_t *rr, int col)
1492 {
1493 #ifdef ZFS_DEBUG
1494         vdev_t *tvd = vd->vdev_top;
1495
1496         range_seg64_t logical_rs, physical_rs, remain_rs;
1497         logical_rs.rs_start = rr->rr_offset;
1498         logical_rs.rs_end = logical_rs.rs_start +
1499             vdev_raidz_asize(vd, rr->rr_size);
1500
1501         raidz_col_t *rc = &rr->rr_col[col];
1502         vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
1503
1504         vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs);
1505         ASSERT(vdev_xlate_is_empty(&remain_rs));
1506         ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
1507         ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
1508         /*
1509          * It would be nice to assert that rs_end is equal
1510          * to rc_offset + rc_size but there might be an
1511          * optional I/O at the end that is not accounted in
1512          * rc_size.
1513          */
1514         if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) {
1515                 ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
1516                     rc->rc_size + (1 << tvd->vdev_ashift));
1517         } else {
1518                 ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
1519         }
1520 #endif
1521 }
1522
1523 static void
1524 vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr, uint64_t ashift)
1525 {
1526         vdev_t *vd = zio->io_vd;
1527         raidz_map_t *rm = zio->io_vsd;
1528         int c, i;
1529
1530         vdev_raidz_generate_parity_row(rm, rr);
1531
1532         for (int c = 0; c < rr->rr_cols; c++) {
1533                 raidz_col_t *rc = &rr->rr_col[c];
1534                 if (rc->rc_size == 0)
1535                         continue;
1536
1537                 /* Verify physical to logical translation */
1538                 vdev_raidz_io_verify(vd, rr, c);
1539
1540                 zio_nowait(zio_vdev_child_io(zio, NULL,
1541                     vd->vdev_child[rc->rc_devidx], rc->rc_offset,
1542                     rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority,
1543                     0, vdev_raidz_child_done, rc));
1544         }
1545
1546         /*
1547          * Generate optional I/Os for skip sectors to improve aggregation
1548          * contiguity.
1549          */
1550         for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
1551                 ASSERT(c <= rr->rr_scols);
1552                 if (c == rr->rr_scols)
1553                         c = 0;
1554
1555                 raidz_col_t *rc = &rr->rr_col[c];
1556                 vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
1557
1558                 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1559                     rc->rc_offset + rc->rc_size, NULL, 1ULL << ashift,
1560                     zio->io_type, zio->io_priority,
1561                     ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
1562         }
1563 }
1564
1565 static void
1566 vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr)
1567 {
1568         vdev_t *vd = zio->io_vd;
1569
1570         /*
1571          * Iterate over the columns in reverse order so that we hit the parity
1572          * last -- any errors along the way will force us to read the parity.
1573          */
1574         for (int c = rr->rr_cols - 1; c >= 0; c--) {
1575                 raidz_col_t *rc = &rr->rr_col[c];
1576                 if (rc->rc_size == 0)
1577                         continue;
1578                 vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
1579                 if (!vdev_readable(cvd)) {
1580                         if (c >= rr->rr_firstdatacol)
1581                                 rr->rr_missingdata++;
1582                         else
1583                                 rr->rr_missingparity++;
1584                         rc->rc_error = SET_ERROR(ENXIO);
1585                         rc->rc_tried = 1;       /* don't even try */
1586                         rc->rc_skipped = 1;
1587                         continue;
1588                 }
1589                 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1590                         if (c >= rr->rr_firstdatacol)
1591                                 rr->rr_missingdata++;
1592                         else
1593                                 rr->rr_missingparity++;
1594                         rc->rc_error = SET_ERROR(ESTALE);
1595                         rc->rc_skipped = 1;
1596                         continue;
1597                 }
1598                 if (c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 ||
1599                     (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
1600                         zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1601                             rc->rc_offset, rc->rc_abd, rc->rc_size,
1602                             zio->io_type, zio->io_priority, 0,
1603                             vdev_raidz_child_done, rc));
1604                 }
1605         }
1606 }
1607
1608 /*
1609  * Start an IO operation on a RAIDZ VDev
1610  *
1611  * Outline:
1612  * - For write operations:
1613  *   1. Generate the parity data
1614  *   2. Create child zio write operations to each column's vdev, for both
1615  *      data and parity.
1616  *   3. If the column skips any sectors for padding, create optional dummy
1617  *      write zio children for those areas to improve aggregation continuity.
1618  * - For read operations:
1619  *   1. Create child zio read operations to each data column's vdev to read
1620  *      the range of data required for zio.
1621  *   2. If this is a scrub or resilver operation, or if any of the data
1622  *      vdevs have had errors, then create zio read operations to the parity
1623  *      columns' VDevs as well.
1624  */
1625 static void
1626 vdev_raidz_io_start(zio_t *zio)
1627 {
1628         vdev_t *vd = zio->io_vd;
1629         vdev_t *tvd = vd->vdev_top;
1630         vdev_raidz_t *vdrz = vd->vdev_tsd;
1631
1632         raidz_map_t *rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift,
1633             vdrz->vd_logical_width, vdrz->vd_nparity);
1634         zio->io_vsd = rm;
1635         zio->io_vsd_ops = &vdev_raidz_vsd_ops;
1636
1637         /*
1638          * Until raidz expansion is implemented all maps for a raidz vdev
1639          * contain a single row.
1640          */
1641         ASSERT3U(rm->rm_nrows, ==, 1);
1642         raidz_row_t *rr = rm->rm_row[0];
1643
1644         if (zio->io_type == ZIO_TYPE_WRITE) {
1645                 vdev_raidz_io_start_write(zio, rr, tvd->vdev_ashift);
1646         } else {
1647                 ASSERT(zio->io_type == ZIO_TYPE_READ);
1648                 vdev_raidz_io_start_read(zio, rr);
1649         }
1650
1651         zio_execute(zio);
1652 }
1653
1654 /*
1655  * Report a checksum error for a child of a RAID-Z device.
1656  */
1657 static void
1658 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
1659 {
1660         vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
1661
1662         if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE) &&
1663             zio->io_priority != ZIO_PRIORITY_REBUILD) {
1664                 zio_bad_cksum_t zbc;
1665                 raidz_map_t *rm = zio->io_vsd;
1666
1667                 zbc.zbc_has_cksum = 0;
1668                 zbc.zbc_injected = rm->rm_ecksuminjected;
1669
1670                 (void) zfs_ereport_post_checksum(zio->io_spa, vd,
1671                     &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size,
1672                     rc->rc_abd, bad_data, &zbc);
1673                 mutex_enter(&vd->vdev_stat_lock);
1674                 vd->vdev_stat.vs_checksum_errors++;
1675                 mutex_exit(&vd->vdev_stat_lock);
1676         }
1677 }
1678
1679 /*
1680  * We keep track of whether or not there were any injected errors, so that
1681  * any ereports we generate can note it.
1682  */
1683 static int
1684 raidz_checksum_verify(zio_t *zio)
1685 {
1686         zio_bad_cksum_t zbc;
1687         raidz_map_t *rm = zio->io_vsd;
1688
1689         bzero(&zbc, sizeof (zio_bad_cksum_t));
1690
1691         int ret = zio_checksum_error(zio, &zbc);
1692         if (ret != 0 && zbc.zbc_injected != 0)
1693                 rm->rm_ecksuminjected = 1;
1694
1695         return (ret);
1696 }
1697
1698 /*
1699  * Generate the parity from the data columns. If we tried and were able to
1700  * read the parity without error, verify that the generated parity matches the
1701  * data we read. If it doesn't, we fire off a checksum error. Return the
1702  * number of such failures.
1703  */
1704 static int
1705 raidz_parity_verify(zio_t *zio, raidz_row_t *rr)
1706 {
1707         abd_t *orig[VDEV_RAIDZ_MAXPARITY];
1708         int c, ret = 0;
1709         raidz_map_t *rm = zio->io_vsd;
1710         raidz_col_t *rc;
1711
1712         blkptr_t *bp = zio->io_bp;
1713         enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
1714             (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
1715
1716         if (checksum == ZIO_CHECKSUM_NOPARITY)
1717                 return (ret);
1718
1719         for (c = 0; c < rr->rr_firstdatacol; c++) {
1720                 rc = &rr->rr_col[c];
1721                 if (!rc->rc_tried || rc->rc_error != 0)
1722                         continue;
1723
1724                 orig[c] = abd_alloc_sametype(rc->rc_abd, rc->rc_size);
1725                 abd_copy(orig[c], rc->rc_abd, rc->rc_size);
1726         }
1727
1728         /*
1729          * Regenerates parity even for !tried||rc_error!=0 columns.  This
1730          * isn't harmful but it does have the side effect of fixing stuff
1731          * we didn't realize was necessary (i.e. even if we return 0).
1732          */
1733         vdev_raidz_generate_parity_row(rm, rr);
1734
1735         for (c = 0; c < rr->rr_firstdatacol; c++) {
1736                 rc = &rr->rr_col[c];
1737
1738                 if (!rc->rc_tried || rc->rc_error != 0)
1739                         continue;
1740
1741                 if (abd_cmp(orig[c], rc->rc_abd) != 0) {
1742                         raidz_checksum_error(zio, rc, orig[c]);
1743                         rc->rc_error = SET_ERROR(ECKSUM);
1744                         ret++;
1745                 }
1746                 abd_free(orig[c]);
1747         }
1748
1749         return (ret);
1750 }
1751
1752 static int
1753 vdev_raidz_worst_error(raidz_row_t *rr)
1754 {
1755         int error = 0;
1756
1757         for (int c = 0; c < rr->rr_cols; c++)
1758                 error = zio_worst_error(error, rr->rr_col[c].rc_error);
1759
1760         return (error);
1761 }
1762
1763 static void
1764 vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
1765 {
1766         int unexpected_errors = 0;
1767         int parity_errors = 0;
1768         int parity_untried = 0;
1769         int data_errors = 0;
1770
1771         ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
1772
1773         for (int c = 0; c < rr->rr_cols; c++) {
1774                 raidz_col_t *rc = &rr->rr_col[c];
1775
1776                 if (rc->rc_error) {
1777                         if (c < rr->rr_firstdatacol)
1778                                 parity_errors++;
1779                         else
1780                                 data_errors++;
1781
1782                         if (!rc->rc_skipped)
1783                                 unexpected_errors++;
1784                 } else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
1785                         parity_untried++;
1786                 }
1787         }
1788
1789         /*
1790          * If we read more parity disks than were used for
1791          * reconstruction, confirm that the other parity disks produced
1792          * correct data.
1793          *
1794          * Note that we also regenerate parity when resilvering so we
1795          * can write it out to failed devices later.
1796          */
1797         if (parity_errors + parity_untried <
1798             rr->rr_firstdatacol - data_errors ||
1799             (zio->io_flags & ZIO_FLAG_RESILVER)) {
1800                 int n = raidz_parity_verify(zio, rr);
1801                 unexpected_errors += n;
1802                 ASSERT3U(parity_errors + n, <=, rr->rr_firstdatacol);
1803         }
1804
1805         if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
1806             (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) {
1807                 /*
1808                  * Use the good data we have in hand to repair damaged children.
1809                  */
1810                 for (int c = 0; c < rr->rr_cols; c++) {
1811                         raidz_col_t *rc = &rr->rr_col[c];
1812                         vdev_t *vd = zio->io_vd;
1813                         vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
1814
1815                         if (!rc->rc_allow_repair) {
1816                                 continue;
1817                         } else if (!rc->rc_force_repair &&
1818                             (rc->rc_error == 0 || rc->rc_size == 0)) {
1819                                 continue;
1820                         }
1821
1822                         zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1823                             rc->rc_offset, rc->rc_abd, rc->rc_size,
1824                             ZIO_TYPE_WRITE,
1825                             zio->io_priority == ZIO_PRIORITY_REBUILD ?
1826                             ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE,
1827                             ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
1828                             ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
1829                 }
1830         }
1831 }
1832
1833 static void
1834 raidz_restore_orig_data(raidz_map_t *rm)
1835 {
1836         for (int i = 0; i < rm->rm_nrows; i++) {
1837                 raidz_row_t *rr = rm->rm_row[i];
1838                 for (int c = 0; c < rr->rr_cols; c++) {
1839                         raidz_col_t *rc = &rr->rr_col[c];
1840                         if (rc->rc_need_orig_restore) {
1841                                 abd_copy(rc->rc_abd,
1842                                     rc->rc_orig_data, rc->rc_size);
1843                                 rc->rc_need_orig_restore = B_FALSE;
1844                         }
1845                 }
1846         }
1847 }
1848
1849 /*
1850  * returns EINVAL if reconstruction of the block will not be possible
1851  * returns ECKSUM if this specific reconstruction failed
1852  * returns 0 on successful reconstruction
1853  */
1854 static int
1855 raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
1856 {
1857         raidz_map_t *rm = zio->io_vsd;
1858
1859         /* Reconstruct each row */
1860         for (int r = 0; r < rm->rm_nrows; r++) {
1861                 raidz_row_t *rr = rm->rm_row[r];
1862                 int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */
1863                 int t = 0;
1864                 int dead = 0;
1865                 int dead_data = 0;
1866
1867                 for (int c = 0; c < rr->rr_cols; c++) {
1868                         raidz_col_t *rc = &rr->rr_col[c];
1869                         ASSERT0(rc->rc_need_orig_restore);
1870                         if (rc->rc_error != 0) {
1871                                 dead++;
1872                                 if (c >= nparity)
1873                                         dead_data++;
1874                                 continue;
1875                         }
1876                         if (rc->rc_size == 0)
1877                                 continue;
1878                         for (int lt = 0; lt < ntgts; lt++) {
1879                                 if (rc->rc_devidx == ltgts[lt]) {
1880                                         if (rc->rc_orig_data == NULL) {
1881                                                 rc->rc_orig_data =
1882                                                     abd_alloc_linear(
1883                                                     rc->rc_size, B_TRUE);
1884                                                 abd_copy(rc->rc_orig_data,
1885                                                     rc->rc_abd, rc->rc_size);
1886                                         }
1887                                         rc->rc_need_orig_restore = B_TRUE;
1888
1889                                         dead++;
1890                                         if (c >= nparity)
1891                                                 dead_data++;
1892                                         my_tgts[t++] = c;
1893                                         break;
1894                                 }
1895                         }
1896                 }
1897                 if (dead > nparity) {
1898                         /* reconstruction not possible */
1899                         raidz_restore_orig_data(rm);
1900                         return (EINVAL);
1901                 }
1902                 if (dead_data > 0)
1903                         vdev_raidz_reconstruct_row(rm, rr, my_tgts, t);
1904         }
1905
1906         /* Check for success */
1907         if (raidz_checksum_verify(zio) == 0) {
1908
1909                 /* Reconstruction succeeded - report errors */
1910                 for (int i = 0; i < rm->rm_nrows; i++) {
1911                         raidz_row_t *rr = rm->rm_row[i];
1912
1913                         for (int c = 0; c < rr->rr_cols; c++) {
1914                                 raidz_col_t *rc = &rr->rr_col[c];
1915                                 if (rc->rc_need_orig_restore) {
1916                                         /*
1917                                          * Note: if this is a parity column,
1918                                          * we don't really know if it's wrong.
1919                                          * We need to let
1920                                          * vdev_raidz_io_done_verified() check
1921                                          * it, and if we set rc_error, it will
1922                                          * think that it is a "known" error
1923                                          * that doesn't need to be checked
1924                                          * or corrected.
1925                                          */
1926                                         if (rc->rc_error == 0 &&
1927                                             c >= rr->rr_firstdatacol) {
1928                                                 raidz_checksum_error(zio,
1929                                                     rc, rc->rc_orig_data);
1930                                                 rc->rc_error =
1931                                                     SET_ERROR(ECKSUM);
1932                                         }
1933                                         rc->rc_need_orig_restore = B_FALSE;
1934                                 }
1935                         }
1936
1937                         vdev_raidz_io_done_verified(zio, rr);
1938                 }
1939
1940                 zio_checksum_verified(zio);
1941
1942                 return (0);
1943         }
1944
1945         /* Reconstruction failed - restore original data */
1946         raidz_restore_orig_data(rm);
1947         return (ECKSUM);
1948 }
1949
1950 /*
1951  * Iterate over all combinations of N bad vdevs and attempt a reconstruction.
1952  * Note that the algorithm below is non-optimal because it doesn't take into
1953  * account how reconstruction is actually performed. For example, with
1954  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1955  * is targeted as invalid as if columns 1 and 4 are targeted since in both
1956  * cases we'd only use parity information in column 0.
1957  *
1958  * The order that we find the various possible combinations of failed
1959  * disks is dictated by these rules:
1960  * - Examine each "slot" (the "i" in tgts[i])
1961  *   - Try to increment this slot (tgts[i] = tgts[i] + 1)
1962  *   - if we can't increment because it runs into the next slot,
1963  *     reset our slot to the minimum, and examine the next slot
1964  *
1965  *  For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose
1966  *  3 columns to reconstruct), we will generate the following sequence:
1967  *
1968  *  STATE        ACTION
1969  *  0 1 2        special case: skip since these are all parity
1970  *  0 1   3      first slot: reset to 0; middle slot: increment to 2
1971  *  0   2 3      first slot: increment to 1
1972  *    1 2 3      first: reset to 0; middle: reset to 1; last: increment to 4
1973  *  0 1     4    first: reset to 0; middle: increment to 2
1974  *  0   2   4    first: increment to 1
1975  *    1 2   4    first: reset to 0; middle: increment to 3
1976  *  0     3 4    first: increment to 1
1977  *    1   3 4    first: increment to 2
1978  *      2 3 4    first: reset to 0; middle: reset to 1; last: increment to 5
1979  *  0 1       5  first: reset to 0; middle: increment to 2
1980  *  0   2     5  first: increment to 1
1981  *    1 2     5  first: reset to 0; middle: increment to 3
1982  *  0     3   5  first: increment to 1
1983  *    1   3   5  first: increment to 2
1984  *      2 3   5  first: reset to 0; middle: increment to 4
1985  *  0       4 5  first: increment to 1
1986  *    1     4 5  first: increment to 2
1987  *      2   4 5  first: increment to 3
1988  *        3 4 5  done
1989  *
1990  * This strategy works for dRAID but is less efficient when there are a large
1991  * number of child vdevs and therefore permutations to check. Furthermore,
1992  * since the raidz_map_t rows likely do not overlap reconstruction would be
1993  * possible as long as there are no more than nparity data errors per row.
1994  * These additional permutations are not currently checked but could be as
1995  * a future improvement.
1996  */
1997 static int
1998 vdev_raidz_combrec(zio_t *zio)
1999 {
2000         int nparity = vdev_get_nparity(zio->io_vd);
2001         raidz_map_t *rm = zio->io_vsd;
2002
2003         /* Check if there's enough data to attempt reconstrution. */
2004         for (int i = 0; i < rm->rm_nrows; i++) {
2005                 raidz_row_t *rr = rm->rm_row[i];
2006                 int total_errors = 0;
2007
2008                 for (int c = 0; c < rr->rr_cols; c++) {
2009                         if (rr->rr_col[c].rc_error)
2010                                 total_errors++;
2011                 }
2012
2013                 if (total_errors > nparity)
2014                         return (vdev_raidz_worst_error(rr));
2015         }
2016
2017         for (int num_failures = 1; num_failures <= nparity; num_failures++) {
2018                 int tstore[VDEV_RAIDZ_MAXPARITY + 2];
2019                 int *ltgts = &tstore[1]; /* value is logical child ID */
2020
2021                 /* Determine number of logical children, n */
2022                 int n = zio->io_vd->vdev_children;
2023
2024                 ASSERT3U(num_failures, <=, nparity);
2025                 ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY);
2026
2027                 /* Handle corner cases in combrec logic */
2028                 ltgts[-1] = -1;
2029                 for (int i = 0; i < num_failures; i++) {
2030                         ltgts[i] = i;
2031                 }
2032                 ltgts[num_failures] = n;
2033
2034                 for (;;) {
2035                         int err = raidz_reconstruct(zio, ltgts, num_failures,
2036                             nparity);
2037                         if (err == EINVAL) {
2038                                 /*
2039                                  * Reconstruction not possible with this #
2040                                  * failures; try more failures.
2041                                  */
2042                                 break;
2043                         } else if (err == 0)
2044                                 return (0);
2045
2046                         /* Compute next targets to try */
2047                         for (int t = 0; ; t++) {
2048                                 ASSERT3U(t, <, num_failures);
2049                                 ltgts[t]++;
2050                                 if (ltgts[t] == n) {
2051                                         /* try more failures */
2052                                         ASSERT3U(t, ==, num_failures - 1);
2053                                         break;
2054                                 }
2055
2056                                 ASSERT3U(ltgts[t], <, n);
2057                                 ASSERT3U(ltgts[t], <=, ltgts[t + 1]);
2058
2059                                 /*
2060                                  * If that spot is available, we're done here.
2061                                  * Try the next combination.
2062                                  */
2063                                 if (ltgts[t] != ltgts[t + 1])
2064                                         break;
2065
2066                                 /*
2067                                  * Otherwise, reset this tgt to the minimum,
2068                                  * and move on to the next tgt.
2069                                  */
2070                                 ltgts[t] = ltgts[t - 1] + 1;
2071                                 ASSERT3U(ltgts[t], ==, t);
2072                         }
2073
2074                         /* Increase the number of failures and keep trying. */
2075                         if (ltgts[num_failures - 1] == n)
2076                                 break;
2077                 }
2078         }
2079
2080         return (ECKSUM);
2081 }
2082
2083 void
2084 vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt)
2085 {
2086         for (uint64_t row = 0; row < rm->rm_nrows; row++) {
2087                 raidz_row_t *rr = rm->rm_row[row];
2088                 vdev_raidz_reconstruct_row(rm, rr, t, nt);
2089         }
2090 }
2091
2092 /*
2093  * Complete a write IO operation on a RAIDZ VDev
2094  *
2095  * Outline:
2096  *   1. Check for errors on the child IOs.
2097  *   2. Return, setting an error code if too few child VDevs were written
2098  *      to reconstruct the data later.  Note that partial writes are
2099  *      considered successful if they can be reconstructed at all.
2100  */
2101 static void
2102 vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr)
2103 {
2104         int total_errors = 0;
2105
2106         ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
2107         ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
2108         ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
2109
2110         for (int c = 0; c < rr->rr_cols; c++) {
2111                 raidz_col_t *rc = &rr->rr_col[c];
2112
2113                 if (rc->rc_error) {
2114                         ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
2115
2116                         total_errors++;
2117                 }
2118         }
2119
2120         /*
2121          * Treat partial writes as a success. If we couldn't write enough
2122          * columns to reconstruct the data, the I/O failed.  Otherwise,
2123          * good enough.
2124          *
2125          * Now that we support write reallocation, it would be better
2126          * to treat partial failure as real failure unless there are
2127          * no non-degraded top-level vdevs left, and not update DTLs
2128          * if we intend to reallocate.
2129          */
2130         if (total_errors > rr->rr_firstdatacol) {
2131                 zio->io_error = zio_worst_error(zio->io_error,
2132                     vdev_raidz_worst_error(rr));
2133         }
2134 }
2135
2136 static void
2137 vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm,
2138     raidz_row_t *rr)
2139 {
2140         int parity_errors = 0;
2141         int parity_untried = 0;
2142         int data_errors = 0;
2143         int total_errors = 0;
2144
2145         ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
2146         ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
2147         ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
2148
2149         for (int c = 0; c < rr->rr_cols; c++) {
2150                 raidz_col_t *rc = &rr->rr_col[c];
2151
2152                 if (rc->rc_error) {
2153                         ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
2154
2155                         if (c < rr->rr_firstdatacol)
2156                                 parity_errors++;
2157                         else
2158                                 data_errors++;
2159
2160                         total_errors++;
2161                 } else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
2162                         parity_untried++;
2163                 }
2164         }
2165
2166         /*
2167          * If there were data errors and the number of errors we saw was
2168          * correctable -- less than or equal to the number of parity disks read
2169          * -- reconstruct based on the missing data.
2170          */
2171         if (data_errors != 0 &&
2172             total_errors <= rr->rr_firstdatacol - parity_untried) {
2173                 /*
2174                  * We either attempt to read all the parity columns or
2175                  * none of them. If we didn't try to read parity, we
2176                  * wouldn't be here in the correctable case. There must
2177                  * also have been fewer parity errors than parity
2178                  * columns or, again, we wouldn't be in this code path.
2179                  */
2180                 ASSERT(parity_untried == 0);
2181                 ASSERT(parity_errors < rr->rr_firstdatacol);
2182
2183                 /*
2184                  * Identify the data columns that reported an error.
2185                  */
2186                 int n = 0;
2187                 int tgts[VDEV_RAIDZ_MAXPARITY];
2188                 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
2189                         raidz_col_t *rc = &rr->rr_col[c];
2190                         if (rc->rc_error != 0) {
2191                                 ASSERT(n < VDEV_RAIDZ_MAXPARITY);
2192                                 tgts[n++] = c;
2193                         }
2194                 }
2195
2196                 ASSERT(rr->rr_firstdatacol >= n);
2197
2198                 vdev_raidz_reconstruct_row(rm, rr, tgts, n);
2199         }
2200 }
2201
2202 /*
2203  * Return the number of reads issued.
2204  */
2205 static int
2206 vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr)
2207 {
2208         vdev_t *vd = zio->io_vd;
2209         int nread = 0;
2210
2211         rr->rr_missingdata = 0;
2212         rr->rr_missingparity = 0;
2213
2214         /*
2215          * If this rows contains empty sectors which are not required
2216          * for a normal read then allocate an ABD for them now so they
2217          * may be read, verified, and any needed repairs performed.
2218          */
2219         if (rr->rr_nempty && rr->rr_abd_empty == NULL)
2220                 vdev_draid_map_alloc_empty(zio, rr);
2221
2222         for (int c = 0; c < rr->rr_cols; c++) {
2223                 raidz_col_t *rc = &rr->rr_col[c];
2224                 if (rc->rc_tried || rc->rc_size == 0)
2225                         continue;
2226
2227                 zio_nowait(zio_vdev_child_io(zio, NULL,
2228                     vd->vdev_child[rc->rc_devidx],
2229                     rc->rc_offset, rc->rc_abd, rc->rc_size,
2230                     zio->io_type, zio->io_priority, 0,
2231                     vdev_raidz_child_done, rc));
2232                 nread++;
2233         }
2234         return (nread);
2235 }
2236
2237 /*
2238  * We're here because either there were too many errors to even attempt
2239  * reconstruction (total_errors == rm_first_datacol), or vdev_*_combrec()
2240  * failed. In either case, there is enough bad data to prevent reconstruction.
2241  * Start checksum ereports for all children which haven't failed.
2242  */
2243 static void
2244 vdev_raidz_io_done_unrecoverable(zio_t *zio)
2245 {
2246         raidz_map_t *rm = zio->io_vsd;
2247
2248         for (int i = 0; i < rm->rm_nrows; i++) {
2249                 raidz_row_t *rr = rm->rm_row[i];
2250
2251                 for (int c = 0; c < rr->rr_cols; c++) {
2252                         raidz_col_t *rc = &rr->rr_col[c];
2253                         vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
2254
2255                         if (rc->rc_error != 0)
2256                                 continue;
2257
2258                         zio_bad_cksum_t zbc;
2259                         zbc.zbc_has_cksum = 0;
2260                         zbc.zbc_injected = rm->rm_ecksuminjected;
2261
2262                         (void) zfs_ereport_start_checksum(zio->io_spa,
2263                             cvd, &zio->io_bookmark, zio, rc->rc_offset,
2264                             rc->rc_size, &zbc);
2265                         mutex_enter(&cvd->vdev_stat_lock);
2266                         cvd->vdev_stat.vs_checksum_errors++;
2267                         mutex_exit(&cvd->vdev_stat_lock);
2268                 }
2269         }
2270 }
2271
2272 void
2273 vdev_raidz_io_done(zio_t *zio)
2274 {
2275         raidz_map_t *rm = zio->io_vsd;
2276
2277         if (zio->io_type == ZIO_TYPE_WRITE) {
2278                 for (int i = 0; i < rm->rm_nrows; i++) {
2279                         vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]);
2280                 }
2281         } else {
2282                 for (int i = 0; i < rm->rm_nrows; i++) {
2283                         raidz_row_t *rr = rm->rm_row[i];
2284                         vdev_raidz_io_done_reconstruct_known_missing(zio,
2285                             rm, rr);
2286                 }
2287
2288                 if (raidz_checksum_verify(zio) == 0) {
2289                         for (int i = 0; i < rm->rm_nrows; i++) {
2290                                 raidz_row_t *rr = rm->rm_row[i];
2291                                 vdev_raidz_io_done_verified(zio, rr);
2292                         }
2293                         zio_checksum_verified(zio);
2294                 } else {
2295                         /*
2296                          * A sequential resilver has no checksum which makes
2297                          * combinatoral reconstruction impossible. This code
2298                          * path is unreachable since raidz_checksum_verify()
2299                          * has no checksum to verify and must succeed.
2300                          */
2301                         ASSERT3U(zio->io_priority, !=, ZIO_PRIORITY_REBUILD);
2302
2303                         /*
2304                          * This isn't a typical situation -- either we got a
2305                          * read error or a child silently returned bad data.
2306                          * Read every block so we can try again with as much
2307                          * data and parity as we can track down. If we've
2308                          * already been through once before, all children will
2309                          * be marked as tried so we'll proceed to combinatorial
2310                          * reconstruction.
2311                          */
2312                         int nread = 0;
2313                         for (int i = 0; i < rm->rm_nrows; i++) {
2314                                 nread += vdev_raidz_read_all(zio,
2315                                     rm->rm_row[i]);
2316                         }
2317                         if (nread != 0) {
2318                                 /*
2319                                  * Normally our stage is VDEV_IO_DONE, but if
2320                                  * we've already called redone(), it will have
2321                                  * changed to VDEV_IO_START, in which case we
2322                                  * don't want to call redone() again.
2323                                  */
2324                                 if (zio->io_stage != ZIO_STAGE_VDEV_IO_START)
2325                                         zio_vdev_io_redone(zio);
2326                                 return;
2327                         }
2328
2329                         zio->io_error = vdev_raidz_combrec(zio);
2330                         if (zio->io_error == ECKSUM &&
2331                             !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2332                                 vdev_raidz_io_done_unrecoverable(zio);
2333                         }
2334                 }
2335         }
2336 }
2337
2338 static void
2339 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2340 {
2341         vdev_raidz_t *vdrz = vd->vdev_tsd;
2342         if (faulted > vdrz->vd_nparity)
2343                 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2344                     VDEV_AUX_NO_REPLICAS);
2345         else if (degraded + faulted != 0)
2346                 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2347         else
2348                 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2349 }
2350
2351 /*
2352  * Determine if any portion of the provided block resides on a child vdev
2353  * with a dirty DTL and therefore needs to be resilvered.  The function
2354  * assumes that at least one DTL is dirty which implies that full stripe
2355  * width blocks must be resilvered.
2356  */
2357 static boolean_t
2358 vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
2359     uint64_t phys_birth)
2360 {
2361         vdev_raidz_t *vdrz = vd->vdev_tsd;
2362         uint64_t dcols = vd->vdev_children;
2363         uint64_t nparity = vdrz->vd_nparity;
2364         uint64_t ashift = vd->vdev_top->vdev_ashift;
2365         /* The starting RAIDZ (parent) vdev sector of the block. */
2366         uint64_t b = DVA_GET_OFFSET(dva) >> ashift;
2367         /* The zio's size in units of the vdev's minimum sector size. */
2368         uint64_t s = ((psize - 1) >> ashift) + 1;
2369         /* The first column for this stripe. */
2370         uint64_t f = b % dcols;
2371
2372         /* Unreachable by sequential resilver. */
2373         ASSERT3U(phys_birth, !=, TXG_UNKNOWN);
2374
2375         if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
2376                 return (B_FALSE);
2377
2378         if (s + nparity >= dcols)
2379                 return (B_TRUE);
2380
2381         for (uint64_t c = 0; c < s + nparity; c++) {
2382                 uint64_t devidx = (f + c) % dcols;
2383                 vdev_t *cvd = vd->vdev_child[devidx];
2384
2385                 /*
2386                  * dsl_scan_need_resilver() already checked vd with
2387                  * vdev_dtl_contains(). So here just check cvd with
2388                  * vdev_dtl_empty(), cheaper and a good approximation.
2389                  */
2390                 if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
2391                         return (B_TRUE);
2392         }
2393
2394         return (B_FALSE);
2395 }
2396
2397 static void
2398 vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs,
2399     range_seg64_t *physical_rs, range_seg64_t *remain_rs)
2400 {
2401         vdev_t *raidvd = cvd->vdev_parent;
2402         ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);
2403
2404         uint64_t width = raidvd->vdev_children;
2405         uint64_t tgt_col = cvd->vdev_id;
2406         uint64_t ashift = raidvd->vdev_top->vdev_ashift;
2407
2408         /* make sure the offsets are block-aligned */
2409         ASSERT0(logical_rs->rs_start % (1 << ashift));
2410         ASSERT0(logical_rs->rs_end % (1 << ashift));
2411         uint64_t b_start = logical_rs->rs_start >> ashift;
2412         uint64_t b_end = logical_rs->rs_end >> ashift;
2413
2414         uint64_t start_row = 0;
2415         if (b_start > tgt_col) /* avoid underflow */
2416                 start_row = ((b_start - tgt_col - 1) / width) + 1;
2417
2418         uint64_t end_row = 0;
2419         if (b_end > tgt_col)
2420                 end_row = ((b_end - tgt_col - 1) / width) + 1;
2421
2422         physical_rs->rs_start = start_row << ashift;
2423         physical_rs->rs_end = end_row << ashift;
2424
2425         ASSERT3U(physical_rs->rs_start, <=, logical_rs->rs_start);
2426         ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=,
2427             logical_rs->rs_end - logical_rs->rs_start);
2428 }
2429
2430 /*
2431  * Initialize private RAIDZ specific fields from the nvlist.
2432  */
2433 static int
2434 vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd)
2435 {
2436         vdev_raidz_t *vdrz;
2437         uint64_t nparity;
2438
2439         uint_t children;
2440         nvlist_t **child;
2441         int error = nvlist_lookup_nvlist_array(nv,
2442             ZPOOL_CONFIG_CHILDREN, &child, &children);
2443         if (error != 0)
2444                 return (SET_ERROR(EINVAL));
2445
2446         if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) {
2447                 if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
2448                         return (SET_ERROR(EINVAL));
2449
2450                 /*
2451                  * Previous versions could only support 1 or 2 parity
2452                  * device.
2453                  */
2454                 if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2)
2455                         return (SET_ERROR(EINVAL));
2456                 else if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3)
2457                         return (SET_ERROR(EINVAL));
2458         } else {
2459                 /*
2460                  * We require the parity to be specified for SPAs that
2461                  * support multiple parity levels.
2462                  */
2463                 if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
2464                         return (SET_ERROR(EINVAL));
2465
2466                 /*
2467                  * Otherwise, we default to 1 parity device for RAID-Z.
2468                  */
2469                 nparity = 1;
2470         }
2471
2472         vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP);
2473         vdrz->vd_logical_width = children;
2474         vdrz->vd_nparity = nparity;
2475
2476         *tsd = vdrz;
2477
2478         return (0);
2479 }
2480
2481 static void
2482 vdev_raidz_fini(vdev_t *vd)
2483 {
2484         kmem_free(vd->vdev_tsd, sizeof (vdev_raidz_t));
2485 }
2486
2487 /*
2488  * Add RAIDZ specific fields to the config nvlist.
2489  */
2490 static void
2491 vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv)
2492 {
2493         ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops);
2494         vdev_raidz_t *vdrz = vd->vdev_tsd;
2495
2496         /*
2497          * Make sure someone hasn't managed to sneak a fancy new vdev
2498          * into a crufty old storage pool.
2499          */
2500         ASSERT(vdrz->vd_nparity == 1 ||
2501             (vdrz->vd_nparity <= 2 &&
2502             spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ2) ||
2503             (vdrz->vd_nparity <= 3 &&
2504             spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ3));
2505
2506         /*
2507          * Note that we'll add these even on storage pools where they
2508          * aren't strictly required -- older software will just ignore
2509          * it.
2510          */
2511         fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity);
2512 }
2513
2514 static uint64_t
2515 vdev_raidz_nparity(vdev_t *vd)
2516 {
2517         vdev_raidz_t *vdrz = vd->vdev_tsd;
2518         return (vdrz->vd_nparity);
2519 }
2520
2521 static uint64_t
2522 vdev_raidz_ndisks(vdev_t *vd)
2523 {
2524         return (vd->vdev_children);
2525 }
2526
2527 vdev_ops_t vdev_raidz_ops = {
2528         .vdev_op_init = vdev_raidz_init,
2529         .vdev_op_fini = vdev_raidz_fini,
2530         .vdev_op_open = vdev_raidz_open,
2531         .vdev_op_close = vdev_raidz_close,
2532         .vdev_op_asize = vdev_raidz_asize,
2533         .vdev_op_min_asize = vdev_raidz_min_asize,
2534         .vdev_op_min_alloc = NULL,
2535         .vdev_op_io_start = vdev_raidz_io_start,
2536         .vdev_op_io_done = vdev_raidz_io_done,
2537         .vdev_op_state_change = vdev_raidz_state_change,
2538         .vdev_op_need_resilver = vdev_raidz_need_resilver,
2539         .vdev_op_hold = NULL,
2540         .vdev_op_rele = NULL,
2541         .vdev_op_remap = NULL,
2542         .vdev_op_xlate = vdev_raidz_xlate,
2543         .vdev_op_rebuild_asize = NULL,
2544         .vdev_op_metaslab_init = NULL,
2545         .vdev_op_config_generate = vdev_raidz_config_generate,
2546         .vdev_op_nparity = vdev_raidz_nparity,
2547         .vdev_op_ndisks = vdev_raidz_ndisks,
2548         .vdev_op_type = VDEV_TYPE_RAIDZ,        /* name of this vdev type */
2549         .vdev_op_leaf = B_FALSE                 /* not a leaf vdev */
2550 };