Optimize RAIDZ expansion
[zfs.git] / module / zfs / vdev_raidz.c
blob04539554957722a504e7b17caf634bb28edb84f9
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
25 * Copyright (c) 2016 Gvozden Nešković. All rights reserved.
28 #include <sys/zfs_context.h>
29 #include <sys/spa.h>
30 #include <sys/spa_impl.h>
31 #include <sys/zap.h>
32 #include <sys/vdev_impl.h>
33 #include <sys/metaslab_impl.h>
34 #include <sys/zio.h>
35 #include <sys/zio_checksum.h>
36 #include <sys/dmu_tx.h>
37 #include <sys/abd.h>
38 #include <sys/zfs_rlock.h>
39 #include <sys/fs/zfs.h>
40 #include <sys/fm/fs/zfs.h>
41 #include <sys/vdev_raidz.h>
42 #include <sys/vdev_raidz_impl.h>
43 #include <sys/vdev_draid.h>
44 #include <sys/uberblock_impl.h>
45 #include <sys/dsl_scan.h>
47 #ifdef ZFS_DEBUG
48 #include <sys/vdev.h> /* For vdev_xlate() in vdev_raidz_io_verify() */
49 #endif
52 * Virtual device vector for RAID-Z.
54 * This vdev supports single, double, and triple parity. For single parity,
55 * we use a simple XOR of all the data columns. For double or triple parity,
56 * we use a special case of Reed-Solomon coding. This extends the
57 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
58 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
59 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
60 * former is also based. The latter is designed to provide higher performance
61 * for writes.
63 * Note that the Plank paper claimed to support arbitrary N+M, but was then
64 * amended six years later identifying a critical flaw that invalidates its
65 * claims. Nevertheless, the technique can be adapted to work for up to
66 * triple parity. For additional parity, the amendment "Note: Correction to
67 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
68 * is viable, but the additional complexity means that write performance will
69 * suffer.
71 * All of the methods above operate on a Galois field, defined over the
72 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
73 * can be expressed with a single byte. Briefly, the operations on the
74 * field are defined as follows:
76 * o addition (+) is represented by a bitwise XOR
77 * o subtraction (-) is therefore identical to addition: A + B = A - B
78 * o multiplication of A by 2 is defined by the following bitwise expression:
80 * (A * 2)_7 = A_6
81 * (A * 2)_6 = A_5
82 * (A * 2)_5 = A_4
83 * (A * 2)_4 = A_3 + A_7
84 * (A * 2)_3 = A_2 + A_7
85 * (A * 2)_2 = A_1 + A_7
86 * (A * 2)_1 = A_0
87 * (A * 2)_0 = A_7
89 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
90 * As an aside, this multiplication is derived from the error correcting
91 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
93 * Observe that any number in the field (except for 0) can be expressed as a
94 * power of 2 -- a generator for the field. We store a table of the powers of
95 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
96 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
97 * than field addition). The inverse of a field element A (A^-1) is therefore
98 * A ^ (255 - 1) = A^254.
100 * The up-to-three parity columns, P, Q, R over several data columns,
101 * D_0, ... D_n-1, can be expressed by field operations:
103 * P = D_0 + D_1 + ... + D_n-2 + D_n-1
104 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
105 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
106 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
107 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
109 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
110 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
111 * independent coefficients. (There are no additional coefficients that have
112 * this property which is why the uncorrected Plank method breaks down.)
114 * See the reconstruction code below for how P, Q and R can be used individually
115 * or in concert to recover missing data columns.
/* Indices of the P, Q and R parity columns. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/*
 * Multiply a single field element by 2 (respectively 4): shift left by one
 * and, when bit 7 of the input was set, XOR in 0x1d.  The expression is not
 * masked to 8 bits, so the caller must truncate the result to a byte (for
 * example by storing it in a uint8_t).  Note that the argument is evaluated
 * more than once, so it must not have side effects.
 */
#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 *
 * Both macros update (x) in place and use (mask) as caller-supplied scratch
 * storage; both must be modifiable 64-bit lvalues.  After the first two
 * statements each byte of (mask) is 0xff where the corresponding byte of
 * (x) had its top bit set and 0x00 otherwise.  The shifted value is ANDed
 * with 0xfe in every byte so that a bit shifted out of one byte does not
 * leak into its neighbor.
 *
 * The macros expand to a brace-enclosed compound statement (not
 * do { } while (0)), so take care when using them as the sole body of an
 * if that has an else branch.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
147 * Big Theory Statement for how a RAIDZ VDEV is expanded
149 * An existing RAIDZ VDEV can be expanded by attaching a new disk. Expansion
150 * works with all three RAIDZ parity choices, including RAIDZ1, 2, or 3. VDEVs
151 * that have been previously expanded can be expanded again.
153 * The RAIDZ VDEV must be healthy (must be able to write to all the drives in
154 * the VDEV) when an expansion starts. And the expansion will pause if any
155 * disk in the VDEV fails, and resume once the VDEV is healthy again. All other
156 * operations on the pool can continue while an expansion is in progress (e.g.
157 * read/write, snapshot, zpool add, etc). Except zpool checkpoint, zpool trim,
158 * and zpool initialize which can't be run during an expansion. Following a
159 * reboot or export/import, the expansion resumes where it left off.
161 * == Reflowing the Data ==
163 * The expansion involves reflowing (copying) the data from the current set
164 * of disks to spread it across the new set which now has one more disk. This
165 * reflow operation is similar to reflowing text when the column width of a
166 * text editor window is expanded. The text doesn’t change but the location of
167 * the text changes to accommodate the new width. An example reflow result for
168 * a 4-wide RAIDZ1 to a 5-wide is shown below.
170 * Reflow End State
171 * Each letter indicates a parity group (logical stripe)
173 * Before expansion After Expansion
174 * D1 D2 D3 D4 D1 D2 D3 D4 D5
175 * +------+------+------+------+ +------+------+------+------+------+
176 * | | | | | | | | | | |
177 * | A | A | A | A | | A | A | A | A | B |
178 * | 1| 2| 3| 4| | 1| 2| 3| 4| 5|
179 * +------+------+------+------+ +------+------+------+------+------+
180 * | | | | | | | | | | |
181 * | B | B | C | C | | B | C | C | C | C |
182 * | 5| 6| 7| 8| | 6| 7| 8| 9| 10|
183 * +------+------+------+------+ +------+------+------+------+------+
184 * | | | | | | | | | | |
185 * | C | C | D | D | | D | D | E | E | E |
186 * | 9| 10| 11| 12| | 11| 12| 13| 14| 15|
187 * +------+------+------+------+ +------+------+------+------+------+
188 * | | | | | | | | | | |
189 * | E | E | E | E | --> | E | F | F | G | G |
190 * | 13| 14| 15| 16| | 16| 17| 18|p 19| 20|
191 * +------+------+------+------+ +------+------+------+------+------+
192 * | | | | | | | | | | |
193 * | F | F | G | G | | G | G | H | H | H |
194 * | 17| 18| 19| 20| | 21| 22| 23| 24| 25|
195 * +------+------+------+------+ +------+------+------+------+------+
196 * | | | | | | | | | | |
197 * | G | G | H | H | | H | I | I | J | J |
198 * | 21| 22| 23| 24| | 26| 27| 28| 29| 30|
199 * +------+------+------+------+ +------+------+------+------+------+
200 * | | | | | | | | | | |
201 * | H | H | I | I | | J | J | | | K |
202 * | 25| 26| 27| 28| | 31| 32| 33| 34| 35|
203 * +------+------+------+------+ +------+------+------+------+------+
205 * This reflow approach has several advantages. There is no need to read or
206 * modify the block pointers or recompute any block checksums. The reflow
207 * doesn’t need to know where the parity sectors reside. We can read and write
208 * data sequentially and the copy can occur in a background thread in open
209 * context. The design also allows for fast discovery of what data to copy.
211 * The VDEV metaslabs are processed, one at a time, to copy the block data to
212 * have it flow across all the disks. The metaslab is disabled for allocations
213 * during the copy. As an optimization, we only copy the allocated data which
214 * can be determined by looking at the metaslab range tree. During the copy we
215 * must maintain the redundancy guarantees of the RAIDZ VDEV (i.e., we still
216 * need to be able to survive losing parity count disks). This means we
217 * cannot overwrite data during the reflow that would be needed if a disk is
218 * lost.
220 * After the reflow completes, all newly-written blocks will have the new
221 * layout, i.e., they will have the parity to data ratio implied by the new
222 * number of disks in the RAIDZ group. Even though the reflow copies all of
223 * the allocated space (data and parity), it is only rearranged, not changed.
225 * This act of reflowing the data has a few implications about blocks
226 * that were written before the reflow completes:
228 * - Old blocks will still use the same amount of space (i.e., they will have
229 * the parity to data ratio implied by the old number of disks in the RAIDZ
230 * group).
231 * - Reading old blocks will be slightly slower than before the reflow, for
232 * two reasons. First, we will have to read from all disks in the RAIDZ
233 * VDEV, rather than being able to skip the children that contain only
234 * parity of this block (because the data of a single block is now spread
235 * out across all the disks). Second, in most cases there will be an extra
236 * bcopy, needed to rearrange the data back to its original layout in memory.
238 * == Scratch Area ==
240 * As we copy the block data, we can only progress to the point that writes
241 * will not overlap with blocks whose progress has not yet been recorded on
242 * disk. Since partially-copied rows are always read from the old location,
243 * we need to stop one row before the sector-wise overlap, to prevent any
244 * row-wise overlap. For example, in the diagram above, when we reflow sector
245 * B6 it will overwrite the original location for B5.
247 * To get around this, a scratch space is used so that we can start copying
248 * without risking data loss by overlapping the row. As an added benefit, it
249 * improves performance at the beginning of the reflow, but that small perf
250 * boost wouldn't be worth the complexity on its own.
252 * Ideally we want to copy at least 2 * (new_width)^2 so that we have a
253 * separation of 2*(new_width+1) and a chunk size of new_width+2. With the max
254 * RAIDZ width of 255 and 4K sectors this would be 2MB per disk. In practice
255 * the widths will likely be single digits so we can get a substantial chunk
256 * size using only a few MB of scratch per disk.
258 * The scratch area is persisted to disk which holds a large amount of reflowed
259 * state. We can always read the partially written stripes when a disk fails or
260 * the copy is interrupted (crash) during the initial copying phase and also
261 * get past a small chunk size restriction. At a minimum, the scratch space
262 * must be large enough to get us to the point that one row does not overlap
263 * itself when moved (i.e new_width^2). But going larger is even better. We
264 * use the 3.5 MiB reserved "boot" space that resides after the ZFS disk labels
265 * as our scratch space to handle overwriting the initial part of the VDEV.
267 * 0 256K 512K 4M
268 * +------+------+-----------------------+-----------------------------
269 * | VDEV | VDEV | Boot Block (3.5M) | Allocatable space ...
270 * | L0 | L1 | Reserved | (Metaslabs)
271 * +------+------+-----------------------+-------------------------------
272 * Scratch Area
274 * == Reflow Progress Updates ==
275 * After the initial scratch-based reflow, the expansion process works
276 * similarly to device removal. We create a new open context thread which
277 * reflows the data, and periodically kicks off sync tasks to update logical
278 * state. In this case, state is the committed progress (offset of next data
279 * to copy). We need to persist the completed offset on disk, so that if we
280 * crash we know which format each VDEV offset is in.
282 * == Time Dependent Geometry ==
284 * In non-expanded RAIDZ, blocks are read from disk in a column by column
285 * fashion. For a multi-row block, the second sector is in the first column
286 * not in the second column. This allows us to issue full reads for each
287 * column directly into the request buffer. The block data is thus laid out
288 * sequentially in a column-by-column fashion.
290 * For example, in the before expansion diagram above, one logical block might
291 * be sectors G19-H26. The parity is in G19,H23; and the data is in
292 * G20,H24,G21,H25,G22,H26.
294 * After a block is reflowed, the sectors that were all in the original column
295 * data can now reside in different columns. When reading from an expanded
296 * VDEV, we need to know the logical stripe width for each block so we can
297 * reconstitute the block’s data after the reads are completed. Likewise,
298 * when we perform the combinatorial reconstruction we need to know the
299 * original width so we can retry combinations from the past layouts.
301 * Time dependent geometry is what we call having blocks with different layouts
302 * (stripe widths) in the same VDEV. This time-dependent geometry uses the
303 * block’s birth time (+ the time expansion ended) to establish the correct
304 * width for a given block. After an expansion completes, we record the time
305 * for blocks written with a particular width (geometry).
307 * == On Disk Format Changes ==
309 * New pool feature flag, 'raidz_expansion' whose reference count is the number
310 * of RAIDZ VDEVs that have been expanded.
312 * The blocks on expanded RAIDZ VDEV can have different logical stripe widths.
314 * Since the uberblock can point to arbitrary blocks, which might be on the
315 * expanding RAIDZ, and might or might not have been expanded, we need to know
316 * which way a block is laid out before reading it. This info is the next
317 * offset that needs to be reflowed and we persist that in the uberblock, in
318 * the new ub_raidz_reflow_info field, as opposed to the MOS or the vdev label.
319 * After the expansion is complete, we then use the raidz_expand_txgs array
320 * (see below) to determine how to read a block and the ub_raidz_reflow_info
321 * field is no longer required.
323 * The uberblock's ub_raidz_reflow_info field also holds the scratch space
324 * state (i.e., active or not) which is also required before reading a block
325 * during the initial phase of reflowing the data.
327 * The top-level RAIDZ VDEV has two new entries in the nvlist:
329 * 'raidz_expand_txgs' array: logical stripe widths by txg are recorded here
330 * and used after the expansion is complete to
331 * determine how to read a raidz block
332 * 'raidz_expanding' boolean: present during reflow and removed after completion
333 * used during a spa import to resume an unfinished
334 * expansion
336 * And finally the VDEVs top zap adds the following informational entries:
337 * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE
338 * VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME
339 * VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME
340 * VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED
344 * For testing only: pause the raidz expansion after reflowing this amount.
345 * (accessed by ZTS and ztest)
347 #ifdef _KERNEL
348 static
349 #endif /* _KERNEL */
350 unsigned long raidz_expand_max_reflow_bytes = 0;
353 * For testing only: pause the raidz expansion at a certain point.
355 uint_t raidz_expand_pause_point = 0;
358 * Maximum amount of copy io's outstanding at once.
360 #ifdef _ILP32
361 static unsigned long raidz_expand_max_copy_bytes = SPA_MAXBLOCKSIZE;
362 #else
363 static unsigned long raidz_expand_max_copy_bytes = 10 * SPA_MAXBLOCKSIZE;
364 #endif
367 * Apply raidz map abds aggregation if the number of rows in the map is equal
368 * or greater than the value below.
370 static unsigned long raidz_io_aggregate_rows = 4;
373 * Automatically start a pool scrub when a RAIDZ expansion completes in
374 * order to verify the checksums of all blocks which have been copied
375 * during the expansion. Automatic scrubbing is enabled by default and
376 * is strongly recommended.
378 static int zfs_scrub_after_expand = 1;
380 static void
381 vdev_raidz_row_free(raidz_row_t *rr)
383 for (int c = 0; c < rr->rr_cols; c++) {
384 raidz_col_t *rc = &rr->rr_col[c];
386 if (rc->rc_size != 0)
387 abd_free(rc->rc_abd);
388 if (rc->rc_orig_data != NULL)
389 abd_free(rc->rc_orig_data);
392 if (rr->rr_abd_empty != NULL)
393 abd_free(rr->rr_abd_empty);
395 kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_scols]));
398 void
399 vdev_raidz_map_free(raidz_map_t *rm)
401 for (int i = 0; i < rm->rm_nrows; i++)
402 vdev_raidz_row_free(rm->rm_row[i]);
404 if (rm->rm_nphys_cols) {
405 for (int i = 0; i < rm->rm_nphys_cols; i++) {
406 if (rm->rm_phys_col[i].rc_abd != NULL)
407 abd_free(rm->rm_phys_col[i].rc_abd);
410 kmem_free(rm->rm_phys_col, sizeof (raidz_col_t) *
411 rm->rm_nphys_cols);
414 ASSERT3P(rm->rm_lr, ==, NULL);
415 kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows]));
418 static void
419 vdev_raidz_map_free_vsd(zio_t *zio)
421 raidz_map_t *rm = zio->io_vsd;
423 vdev_raidz_map_free(rm);
426 static int
427 vdev_raidz_reflow_compare(const void *x1, const void *x2)
429 const reflow_node_t *l = x1;
430 const reflow_node_t *r = x2;
432 return (TREE_CMP(l->re_txg, r->re_txg));
435 const zio_vsd_ops_t vdev_raidz_vsd_ops = {
436 .vsd_free = vdev_raidz_map_free_vsd,
439 raidz_row_t *
440 vdev_raidz_row_alloc(int cols, zio_t *zio)
442 raidz_row_t *rr =
443 kmem_zalloc(offsetof(raidz_row_t, rr_col[cols]), KM_SLEEP);
445 rr->rr_cols = cols;
446 rr->rr_scols = cols;
448 for (int c = 0; c < cols; c++) {
449 raidz_col_t *rc = &rr->rr_col[c];
450 rc->rc_shadow_devidx = INT_MAX;
451 rc->rc_shadow_offset = UINT64_MAX;
453 * We can not allow self healing to take place for Direct I/O
454 * reads. There is nothing that stops the buffer contents from
455 * being manipulated while the I/O is in flight. It is possible
456 * that the checksum could be verified on the buffer and then
457 * the contents of that buffer are manipulated afterwards. This
458 * could lead to bad data being written out during self
459 * healing.
461 if (!(zio->io_flags & ZIO_FLAG_DIO_READ))
462 rc->rc_allow_repair = 1;
464 return (rr);
467 static void
468 vdev_raidz_map_alloc_write(zio_t *zio, raidz_map_t *rm, uint64_t ashift)
470 int c;
471 int nwrapped = 0;
472 uint64_t off = 0;
473 raidz_row_t *rr = rm->rm_row[0];
475 ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
476 ASSERT3U(rm->rm_nrows, ==, 1);
479 * Pad any parity columns with additional space to account for skip
480 * sectors.
482 if (rm->rm_skipstart < rr->rr_firstdatacol) {
483 ASSERT0(rm->rm_skipstart);
484 nwrapped = rm->rm_nskip;
485 } else if (rr->rr_scols < (rm->rm_skipstart + rm->rm_nskip)) {
486 nwrapped =
487 (rm->rm_skipstart + rm->rm_nskip) % rr->rr_scols;
491 * Optional single skip sectors (rc_size == 0) will be handled in
492 * vdev_raidz_io_start_write().
494 int skipped = rr->rr_scols - rr->rr_cols;
496 /* Allocate buffers for the parity columns */
497 for (c = 0; c < rr->rr_firstdatacol; c++) {
498 raidz_col_t *rc = &rr->rr_col[c];
501 * Parity columns will pad out a linear ABD to account for
502 * the skip sector. A linear ABD is used here because
503 * parity calculations use the ABD buffer directly to calculate
504 * parity. This avoids doing a memcpy back to the ABD after the
505 * parity has been calculated. By issuing the parity column
506 * with the skip sector we can reduce contention on the child
507 * VDEV queue locks (vq_lock).
509 if (c < nwrapped) {
510 rc->rc_abd = abd_alloc_linear(
511 rc->rc_size + (1ULL << ashift), B_FALSE);
512 abd_zero_off(rc->rc_abd, rc->rc_size, 1ULL << ashift);
513 skipped++;
514 } else {
515 rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
519 for (off = 0; c < rr->rr_cols; c++) {
520 raidz_col_t *rc = &rr->rr_col[c];
521 abd_t *abd = abd_get_offset_struct(&rc->rc_abdstruct,
522 zio->io_abd, off, rc->rc_size);
525 * Generate I/O for skip sectors to improve aggregation
526 * continuity. We will use gang ABD's to reduce contention
527 * on the child VDEV queue locks (vq_lock) by issuing
528 * a single I/O that contains the data and skip sector.
530 * It is important to make sure that rc_size is not updated
531 * even though we are adding a skip sector to the ABD. When
532 * calculating the parity in vdev_raidz_generate_parity_row()
533 * the rc_size is used to iterate through the ABD's. We can
534 * not have zero'd out skip sectors used for calculating
535 * parity for raidz, because those same sectors are not used
536 * during reconstruction.
538 if (c >= rm->rm_skipstart && skipped < rm->rm_nskip) {
539 rc->rc_abd = abd_alloc_gang();
540 abd_gang_add(rc->rc_abd, abd, B_TRUE);
541 abd_gang_add(rc->rc_abd,
542 abd_get_zeros(1ULL << ashift), B_TRUE);
543 skipped++;
544 } else {
545 rc->rc_abd = abd;
547 off += rc->rc_size;
550 ASSERT3U(off, ==, zio->io_size);
551 ASSERT3S(skipped, ==, rm->rm_nskip);
554 static void
555 vdev_raidz_map_alloc_read(zio_t *zio, raidz_map_t *rm)
557 int c;
558 raidz_row_t *rr = rm->rm_row[0];
560 ASSERT3U(rm->rm_nrows, ==, 1);
562 /* Allocate buffers for the parity columns */
563 for (c = 0; c < rr->rr_firstdatacol; c++)
564 rr->rr_col[c].rc_abd =
565 abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE);
567 for (uint64_t off = 0; c < rr->rr_cols; c++) {
568 raidz_col_t *rc = &rr->rr_col[c];
569 rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
570 zio->io_abd, off, rc->rc_size);
571 off += rc->rc_size;
576 * Divides the IO evenly across all child vdevs; usually, dcols is
577 * the number of children in the target vdev.
579 * Avoid inlining the function to keep vdev_raidz_io_start(), which
580 * is this functions only caller, as small as possible on the stack.
582 noinline raidz_map_t *
583 vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
584 uint64_t nparity)
586 raidz_row_t *rr;
587 /* The starting RAIDZ (parent) vdev sector of the block. */
588 uint64_t b = zio->io_offset >> ashift;
589 /* The zio's size in units of the vdev's minimum sector size. */
590 uint64_t s = zio->io_size >> ashift;
591 /* The first column for this stripe. */
592 uint64_t f = b % dcols;
593 /* The starting byte offset on each child vdev. */
594 uint64_t o = (b / dcols) << ashift;
595 uint64_t acols, scols;
597 raidz_map_t *rm =
598 kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP);
599 rm->rm_nrows = 1;
602 * "Quotient": The number of data sectors for this stripe on all but
603 * the "big column" child vdevs that also contain "remainder" data.
605 uint64_t q = s / (dcols - nparity);
608 * "Remainder": The number of partial stripe data sectors in this I/O.
609 * This will add a sector to some, but not all, child vdevs.
611 uint64_t r = s - q * (dcols - nparity);
613 /* The number of "big columns" - those which contain remainder data. */
614 uint64_t bc = (r == 0 ? 0 : r + nparity);
617 * The total number of data and parity sectors associated with
618 * this I/O.
620 uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));
623 * acols: The columns that will be accessed.
624 * scols: The columns that will be accessed or skipped.
626 if (q == 0) {
627 /* Our I/O request doesn't span all child vdevs. */
628 acols = bc;
629 scols = MIN(dcols, roundup(bc, nparity + 1));
630 } else {
631 acols = dcols;
632 scols = dcols;
635 ASSERT3U(acols, <=, scols);
636 rr = vdev_raidz_row_alloc(scols, zio);
637 rm->rm_row[0] = rr;
638 rr->rr_cols = acols;
639 rr->rr_bigcols = bc;
640 rr->rr_firstdatacol = nparity;
641 #ifdef ZFS_DEBUG
642 rr->rr_offset = zio->io_offset;
643 rr->rr_size = zio->io_size;
644 #endif
646 uint64_t asize = 0;
648 for (uint64_t c = 0; c < scols; c++) {
649 raidz_col_t *rc = &rr->rr_col[c];
650 uint64_t col = f + c;
651 uint64_t coff = o;
652 if (col >= dcols) {
653 col -= dcols;
654 coff += 1ULL << ashift;
656 rc->rc_devidx = col;
657 rc->rc_offset = coff;
659 if (c >= acols)
660 rc->rc_size = 0;
661 else if (c < bc)
662 rc->rc_size = (q + 1) << ashift;
663 else
664 rc->rc_size = q << ashift;
666 asize += rc->rc_size;
669 ASSERT3U(asize, ==, tot << ashift);
670 rm->rm_nskip = roundup(tot, nparity + 1) - tot;
671 rm->rm_skipstart = bc;
674 * If all data stored spans all columns, there's a danger that parity
675 * will always be on the same device and, since parity isn't read
676 * during normal operation, that device's I/O bandwidth won't be
677 * used effectively. We therefore switch the parity every 1MB.
679 * ... at least that was, ostensibly, the theory. As a practical
680 * matter unless we juggle the parity between all devices evenly, we
681 * won't see any benefit. Further, occasional writes that aren't a
682 * multiple of the LCM of the number of children and the minimum
683 * stripe width are sufficient to avoid pessimal behavior.
684 * Unfortunately, this decision created an implicit on-disk format
685 * requirement that we need to support for all eternity, but only
686 * for single-parity RAID-Z.
688 * If we intend to skip a sector in the zeroth column for padding
689 * we must make sure to note this swap. We will never intend to
690 * skip the first column since at least one data and one parity
691 * column must appear in each row.
693 ASSERT(rr->rr_cols >= 2);
694 ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
696 if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
697 uint64_t devidx = rr->rr_col[0].rc_devidx;
698 o = rr->rr_col[0].rc_offset;
699 rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
700 rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
701 rr->rr_col[1].rc_devidx = devidx;
702 rr->rr_col[1].rc_offset = o;
703 if (rm->rm_skipstart == 0)
704 rm->rm_skipstart = 1;
707 if (zio->io_type == ZIO_TYPE_WRITE) {
708 vdev_raidz_map_alloc_write(zio, rm, ashift);
709 } else {
710 vdev_raidz_map_alloc_read(zio, rm);
712 /* init RAIDZ parity ops */
713 rm->rm_ops = vdev_raidz_math_get_ops();
715 return (rm);
719 * Everything before reflow_offset_synced should have been moved to the new
720 * location (read and write completed). However, this may not yet be reflected
721 * in the on-disk format (e.g. raidz_reflow_sync() has been called but the
722 * uberblock has not yet been written). If reflow is not in progress,
723 * reflow_offset_synced should be UINT64_MAX. For each row, if the row is
724 * entirely before reflow_offset_synced, it will come from the new location.
725 * Otherwise this row will come from the old location. Therefore, rows that
726 * straddle the reflow_offset_synced will come from the old location.
728 * For writes, reflow_offset_next is the next offset to copy. If a sector has
729 * been copied, but not yet reflected in the on-disk progress
730 * (reflow_offset_synced), it will also be written to the new (already copied)
731 * offset.
733 noinline raidz_map_t *
734 vdev_raidz_map_alloc_expanded(zio_t *zio,
735 uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols,
736 uint64_t nparity, uint64_t reflow_offset_synced,
737 uint64_t reflow_offset_next, boolean_t use_scratch)
739 abd_t *abd = zio->io_abd;
740 uint64_t offset = zio->io_offset;
741 uint64_t size = zio->io_size;
743 /* The zio's size in units of the vdev's minimum sector size. */
744 uint64_t s = size >> ashift;
747 * "Quotient": The number of data sectors for this stripe on all but
748 * the "big column" child vdevs that also contain "remainder" data.
749 * AKA "full rows"
751 uint64_t q = s / (logical_cols - nparity);
754 * "Remainder": The number of partial stripe data sectors in this I/O.
755 * This will add a sector to some, but not all, child vdevs.
757 uint64_t r = s - q * (logical_cols - nparity);
759 /* The number of "big columns" - those which contain remainder data. */
760 uint64_t bc = (r == 0 ? 0 : r + nparity);
763 * The total number of data and parity sectors associated with
764 * this I/O.
766 uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));
768 /* How many rows contain data (not skip) */
769 uint64_t rows = howmany(tot, logical_cols);
770 int cols = MIN(tot, logical_cols);
772 raidz_map_t *rm =
773 kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]),
774 KM_SLEEP);
775 rm->rm_nrows = rows;
776 rm->rm_nskip = roundup(tot, nparity + 1) - tot;
777 rm->rm_skipstart = bc;
778 uint64_t asize = 0;
780 for (uint64_t row = 0; row < rows; row++) {
781 boolean_t row_use_scratch = B_FALSE;
782 raidz_row_t *rr = vdev_raidz_row_alloc(cols, zio);
783 rm->rm_row[row] = rr;
785 /* The starting RAIDZ (parent) vdev sector of the row. */
786 uint64_t b = (offset >> ashift) + row * logical_cols;
789 * If we are in the middle of a reflow, and the copying has
790 * not yet completed for any part of this row, then use the
791 * old location of this row. Note that reflow_offset_synced
792 * reflects the i/o that's been completed, because it's
793 * updated by a synctask, after zio_wait(spa_txg_zio[]).
794 * This is sufficient for our check, even if that progress
795 * has not yet been recorded to disk (reflected in
796 * spa_ubsync). Also note that we consider the last row to
797 * be "full width" (`cols`-wide rather than `bc`-wide) for
798 * this calculation. This causes a tiny bit of unnecessary
799 * double-writes but is safe and simpler to calculate.
801 int row_phys_cols = physical_cols;
802 if (b + cols > reflow_offset_synced >> ashift)
803 row_phys_cols--;
804 else if (use_scratch)
805 row_use_scratch = B_TRUE;
807 /* starting child of this row */
808 uint64_t child_id = b % row_phys_cols;
809 /* The starting byte offset on each child vdev. */
810 uint64_t child_offset = (b / row_phys_cols) << ashift;
813 * Note, rr_cols is the entire width of the block, even
814 * if this row is shorter. This is needed because parity
815 * generation (for Q and R) needs to know the entire width,
816 * because it treats the short row as though it was
817 * full-width (and the "phantom" sectors were zero-filled).
819 * Another approach to this would be to set cols shorter
820 * (to just the number of columns that we might do i/o to)
821 * and have another mechanism to tell the parity generation
822 * about the "entire width". Reconstruction (at least
823 * vdev_raidz_reconstruct_general()) would also need to
824 * know about the "entire width".
826 rr->rr_firstdatacol = nparity;
827 #ifdef ZFS_DEBUG
829 * note: rr_size is PSIZE, not ASIZE
831 rr->rr_offset = b << ashift;
832 rr->rr_size = (rr->rr_cols - rr->rr_firstdatacol) << ashift;
833 #endif
835 for (int c = 0; c < rr->rr_cols; c++, child_id++) {
836 if (child_id >= row_phys_cols) {
837 child_id -= row_phys_cols;
838 child_offset += 1ULL << ashift;
840 raidz_col_t *rc = &rr->rr_col[c];
841 rc->rc_devidx = child_id;
842 rc->rc_offset = child_offset;
845 * Get this from the scratch space if appropriate.
846 * This only happens if we crashed in the middle of
847 * raidz_reflow_scratch_sync() (while it's running,
848 * the rangelock prevents us from doing concurrent
849 * io), and even then only during zpool import or
850 * when the pool is imported readonly.
852 if (row_use_scratch)
853 rc->rc_offset -= VDEV_BOOT_SIZE;
855 uint64_t dc = c - rr->rr_firstdatacol;
856 if (c < rr->rr_firstdatacol) {
857 rc->rc_size = 1ULL << ashift;
860 * Parity sectors' rc_abd's are set below
861 * after determining if this is an aggregation.
863 } else if (row == rows - 1 && bc != 0 && c >= bc) {
865 * Past the end of the block (even including
866 * skip sectors). This sector is part of the
867 * map so that we have full rows for p/q parity
868 * generation.
870 rc->rc_size = 0;
871 rc->rc_abd = NULL;
872 } else {
873 /* "data column" (col excluding parity) */
874 uint64_t off;
876 if (c < bc || r == 0) {
877 off = dc * rows + row;
878 } else {
879 off = r * rows +
880 (dc - r) * (rows - 1) + row;
882 rc->rc_size = 1ULL << ashift;
883 rc->rc_abd = abd_get_offset_struct(
884 &rc->rc_abdstruct, abd, off << ashift,
885 rc->rc_size);
888 if (rc->rc_size == 0)
889 continue;
892 * If any part of this row is in both old and new
893 * locations, the primary location is the old
894 * location. If this sector was already copied to the
895 * new location, we need to also write to the new,
896 * "shadow" location.
898 * Note, `row_phys_cols != physical_cols` indicates
899 * that the primary location is the old location.
900 * `b+c < reflow_offset_next` indicates that the copy
901 * to the new location has been initiated. We know
902 * that the copy has completed because we have the
903 * rangelock, which is held exclusively while the
904 * copy is in progress.
906 if (row_use_scratch ||
907 (row_phys_cols != physical_cols &&
908 b + c < reflow_offset_next >> ashift)) {
909 rc->rc_shadow_devidx = (b + c) % physical_cols;
910 rc->rc_shadow_offset =
911 ((b + c) / physical_cols) << ashift;
912 if (row_use_scratch)
913 rc->rc_shadow_offset -= VDEV_BOOT_SIZE;
916 asize += rc->rc_size;
920 * See comment in vdev_raidz_map_alloc()
922 if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 &&
923 (offset & (1ULL << 20))) {
924 ASSERT(rr->rr_cols >= 2);
925 ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
927 int devidx0 = rr->rr_col[0].rc_devidx;
928 uint64_t offset0 = rr->rr_col[0].rc_offset;
929 int shadow_devidx0 = rr->rr_col[0].rc_shadow_devidx;
930 uint64_t shadow_offset0 =
931 rr->rr_col[0].rc_shadow_offset;
933 rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
934 rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
935 rr->rr_col[0].rc_shadow_devidx =
936 rr->rr_col[1].rc_shadow_devidx;
937 rr->rr_col[0].rc_shadow_offset =
938 rr->rr_col[1].rc_shadow_offset;
940 rr->rr_col[1].rc_devidx = devidx0;
941 rr->rr_col[1].rc_offset = offset0;
942 rr->rr_col[1].rc_shadow_devidx = shadow_devidx0;
943 rr->rr_col[1].rc_shadow_offset = shadow_offset0;
946 ASSERT3U(asize, ==, tot << ashift);
949 * Determine if the block is contiguous, in which case we can use
950 * an aggregation.
952 if (rows >= raidz_io_aggregate_rows) {
953 rm->rm_nphys_cols = physical_cols;
954 rm->rm_phys_col =
955 kmem_zalloc(sizeof (raidz_col_t) * rm->rm_nphys_cols,
956 KM_SLEEP);
959 * Determine the aggregate io's offset and size, and check
960 * that the io is contiguous.
962 for (int i = 0;
963 i < rm->rm_nrows && rm->rm_phys_col != NULL; i++) {
964 raidz_row_t *rr = rm->rm_row[i];
965 for (int c = 0; c < rr->rr_cols; c++) {
966 raidz_col_t *rc = &rr->rr_col[c];
967 raidz_col_t *prc =
968 &rm->rm_phys_col[rc->rc_devidx];
970 if (rc->rc_size == 0)
971 continue;
973 if (prc->rc_size == 0) {
974 ASSERT0(prc->rc_offset);
975 prc->rc_offset = rc->rc_offset;
976 } else if (prc->rc_offset + prc->rc_size !=
977 rc->rc_offset) {
979 * This block is not contiguous and
980 * therefore can't be aggregated.
981 * This is expected to be rare, so
982 * the cost of allocating and then
983 * freeing rm_phys_col is not
984 * significant.
986 kmem_free(rm->rm_phys_col,
987 sizeof (raidz_col_t) *
988 rm->rm_nphys_cols);
989 rm->rm_phys_col = NULL;
990 rm->rm_nphys_cols = 0;
991 break;
993 prc->rc_size += rc->rc_size;
997 if (rm->rm_phys_col != NULL) {
999 * Allocate aggregate ABD's.
1001 for (int i = 0; i < rm->rm_nphys_cols; i++) {
1002 raidz_col_t *prc = &rm->rm_phys_col[i];
1004 prc->rc_devidx = i;
1006 if (prc->rc_size == 0)
1007 continue;
1009 prc->rc_abd =
1010 abd_alloc_linear(rm->rm_phys_col[i].rc_size,
1011 B_FALSE);
1015 * Point the parity abd's into the aggregate abd's.
1017 for (int i = 0; i < rm->rm_nrows; i++) {
1018 raidz_row_t *rr = rm->rm_row[i];
1019 for (int c = 0; c < rr->rr_firstdatacol; c++) {
1020 raidz_col_t *rc = &rr->rr_col[c];
1021 raidz_col_t *prc =
1022 &rm->rm_phys_col[rc->rc_devidx];
1023 rc->rc_abd =
1024 abd_get_offset_struct(&rc->rc_abdstruct,
1025 prc->rc_abd,
1026 rc->rc_offset - prc->rc_offset,
1027 rc->rc_size);
1030 } else {
1032 * Allocate new abd's for the parity sectors.
1034 for (int i = 0; i < rm->rm_nrows; i++) {
1035 raidz_row_t *rr = rm->rm_row[i];
1036 for (int c = 0; c < rr->rr_firstdatacol; c++) {
1037 raidz_col_t *rc = &rr->rr_col[c];
1038 rc->rc_abd =
1039 abd_alloc_linear(rc->rc_size,
1040 B_TRUE);
1044 /* init RAIDZ parity ops */
1045 rm->rm_ops = vdev_raidz_math_get_ops();
1047 return (rm);
/*
 * Cursor state shared with the parity accumulation callbacks
 * (vdev_raidz_p_func(), vdev_raidz_pq_func(), vdev_raidz_pqr_func()).
 * Each member points at the next 64-bit word of one parity buffer; the
 * callbacks advance the members as they consume data.  Members for parity
 * that is not being generated are left NULL.
 *
 * The member order matters: callers use positional initializers.
 */
struct pqr_struct {
	uint64_t *p;	/* next word of P parity */
	uint64_t *q;	/* next word of Q parity, or NULL */
	uint64_t *r;	/* next word of R parity, or NULL */
};
1056 static int
1057 vdev_raidz_p_func(void *buf, size_t size, void *private)
1059 struct pqr_struct *pqr = private;
1060 const uint64_t *src = buf;
1061 int cnt = size / sizeof (src[0]);
1063 ASSERT(pqr->p && !pqr->q && !pqr->r);
1065 for (int i = 0; i < cnt; i++, src++, pqr->p++)
1066 *pqr->p ^= *src;
1068 return (0);
1071 static int
1072 vdev_raidz_pq_func(void *buf, size_t size, void *private)
1074 struct pqr_struct *pqr = private;
1075 const uint64_t *src = buf;
1076 uint64_t mask;
1077 int cnt = size / sizeof (src[0]);
1079 ASSERT(pqr->p && pqr->q && !pqr->r);
1081 for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
1082 *pqr->p ^= *src;
1083 VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
1084 *pqr->q ^= *src;
1087 return (0);
1090 static int
1091 vdev_raidz_pqr_func(void *buf, size_t size, void *private)
1093 struct pqr_struct *pqr = private;
1094 const uint64_t *src = buf;
1095 uint64_t mask;
1096 int cnt = size / sizeof (src[0]);
1098 ASSERT(pqr->p && pqr->q && pqr->r);
1100 for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
1101 *pqr->p ^= *src;
1102 VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
1103 *pqr->q ^= *src;
1104 VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
1105 *pqr->r ^= *src;
1108 return (0);
1111 static void
1112 vdev_raidz_generate_parity_p(raidz_row_t *rr)
1114 uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1116 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1117 abd_t *src = rr->rr_col[c].rc_abd;
1119 if (c == rr->rr_firstdatacol) {
1120 abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
1121 } else {
1122 struct pqr_struct pqr = { p, NULL, NULL };
1123 (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
1124 vdev_raidz_p_func, &pqr);
1129 static void
1130 vdev_raidz_generate_parity_pq(raidz_row_t *rr)
1132 uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1133 uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
1134 uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
1135 ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
1136 rr->rr_col[VDEV_RAIDZ_Q].rc_size);
1138 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1139 abd_t *src = rr->rr_col[c].rc_abd;
1141 uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);
1143 if (c == rr->rr_firstdatacol) {
1144 ASSERT(ccnt == pcnt || ccnt == 0);
1145 abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
1146 (void) memcpy(q, p, rr->rr_col[c].rc_size);
1148 for (uint64_t i = ccnt; i < pcnt; i++) {
1149 p[i] = 0;
1150 q[i] = 0;
1152 } else {
1153 struct pqr_struct pqr = { p, q, NULL };
1155 ASSERT(ccnt <= pcnt);
1156 (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
1157 vdev_raidz_pq_func, &pqr);
1160 * Treat short columns as though they are full of 0s.
1161 * Note that there's therefore nothing needed for P.
1163 uint64_t mask;
1164 for (uint64_t i = ccnt; i < pcnt; i++) {
1165 VDEV_RAIDZ_64MUL_2(q[i], mask);
1171 static void
1172 vdev_raidz_generate_parity_pqr(raidz_row_t *rr)
1174 uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1175 uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
1176 uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd);
1177 uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
1178 ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
1179 rr->rr_col[VDEV_RAIDZ_Q].rc_size);
1180 ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
1181 rr->rr_col[VDEV_RAIDZ_R].rc_size);
1183 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1184 abd_t *src = rr->rr_col[c].rc_abd;
1186 uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);
1188 if (c == rr->rr_firstdatacol) {
1189 ASSERT(ccnt == pcnt || ccnt == 0);
1190 abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
1191 (void) memcpy(q, p, rr->rr_col[c].rc_size);
1192 (void) memcpy(r, p, rr->rr_col[c].rc_size);
1194 for (uint64_t i = ccnt; i < pcnt; i++) {
1195 p[i] = 0;
1196 q[i] = 0;
1197 r[i] = 0;
1199 } else {
1200 struct pqr_struct pqr = { p, q, r };
1202 ASSERT(ccnt <= pcnt);
1203 (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
1204 vdev_raidz_pqr_func, &pqr);
1207 * Treat short columns as though they are full of 0s.
1208 * Note that there's therefore nothing needed for P.
1210 uint64_t mask;
1211 for (uint64_t i = ccnt; i < pcnt; i++) {
1212 VDEV_RAIDZ_64MUL_2(q[i], mask);
1213 VDEV_RAIDZ_64MUL_4(r[i], mask);
1220 * Generate RAID parity in the first virtual columns according to the number of
1221 * parity columns available.
1223 void
1224 vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr)
1226 if (rr->rr_cols == 0) {
1228 * We are handling this block one row at a time (because
1229 * this block has a different logical vs physical width,
1230 * due to RAIDZ expansion), and this is a pad-only row,
1231 * which has no parity.
1233 return;
1236 /* Generate using the new math implementation */
1237 if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL)
1238 return;
1240 switch (rr->rr_firstdatacol) {
1241 case 1:
1242 vdev_raidz_generate_parity_p(rr);
1243 break;
1244 case 2:
1245 vdev_raidz_generate_parity_pq(rr);
1246 break;
1247 case 3:
1248 vdev_raidz_generate_parity_pqr(rr);
1249 break;
1250 default:
1251 cmn_err(CE_PANIC, "invalid RAID-Z configuration");
1255 void
1256 vdev_raidz_generate_parity(raidz_map_t *rm)
1258 for (int i = 0; i < rm->rm_nrows; i++) {
1259 raidz_row_t *rr = rm->rm_row[i];
1260 vdev_raidz_generate_parity_row(rm, rr);
/*
 * abd_iterate_func2() callback for P-based reconstruction: XORs each
 * 64-bit word of `sbuf` into the matching word of `dbuf`.  `size` is in
 * bytes; a trailing partial word is ignored.  `private` is unused.
 * Always returns 0.
 */
static int
vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
{
	(void) private;
	uint64_t *out = dbuf;
	uint64_t *in = sbuf;
	int nwords = size / sizeof (in[0]);

	while (nwords-- > 0)
		*out++ ^= *in++;

	return (0);
}
/*
 * abd_iterate_func2() callback for the first stage of Q-based
 * reconstruction: each 64-bit destination word is passed through
 * VDEV_RAIDZ_64MUL_2() and then XORed with the matching source word.
 * `private` is unused.  Always returns 0.
 */
static int
vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
    void *private)
{
	(void) private;
	uint64_t *out = dbuf;
	uint64_t *in = sbuf;
	uint64_t mask;
	int nwords = size / sizeof (out[0]);

	while (nwords-- > 0) {
		VDEV_RAIDZ_64MUL_2(*out, mask);
		*out ^= *in;
		out++;
		in++;
	}

	return (0);
}
/*
 * abd_iterate_func() callback for the part of the destination that extends
 * past the end of a shorter source column.  It applies the same
 * VDEV_RAIDZ_64MUL_2() step as vdev_raidz_reconst_q_pre_func(), without the
 * XOR since there is no source data.  `private` is unused.
 * Always returns 0.
 */
static int
vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
{
	(void) private;
	uint64_t *out = buf;
	uint64_t mask;
	int nwords = size / sizeof (out[0]);

	while (nwords-- > 0) {
		VDEV_RAIDZ_64MUL_2(*out, mask);
		out++;
	}

	return (0);
}
/*
 * State for vdev_raidz_reconst_q_post_func().
 *
 * The member order matters: the caller uses a positional initializer.
 */
struct reconst_q_struct {
	uint64_t *q;	/* next word of Q parity to consume */
	int exp;	/* exponent handed to vdev_raidz_exp2() */
};
1318 static int
1319 vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
1321 struct reconst_q_struct *rq = private;
1322 uint64_t *dst = buf;
1323 int cnt = size / sizeof (dst[0]);
1325 for (int i = 0; i < cnt; i++, dst++, rq->q++) {
1326 int j;
1327 uint8_t *b;
1329 *dst ^= *rq->q;
1330 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
1331 *b = vdev_raidz_exp2(*b, rq->exp);
1335 return (0);
/*
 * State for vdev_raidz_reconst_pq_func() and
 * vdev_raidz_reconst_pq_tail_func().  The four byte cursors are advanced
 * in lock step by those callbacks.
 *
 * The member order matters: the caller uses a positional initializer.
 */
struct reconst_pq_struct {
	uint8_t *p;	/* saved (on-disk) P parity */
	uint8_t *q;	/* saved (on-disk) Q parity */
	uint8_t *pxy;	/* P computed with columns x and y zeroed */
	uint8_t *qxy;	/* Q computed with columns x and y zeroed */
	int aexp;	/* exponent applied to (P ^ Pxy) */
	int bexp;	/* exponent applied to (Q ^ Qxy) */
};
1347 static int
1348 vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
1350 struct reconst_pq_struct *rpq = private;
1351 uint8_t *xd = xbuf;
1352 uint8_t *yd = ybuf;
1354 for (int i = 0; i < size;
1355 i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
1356 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
1357 vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
1358 *yd = *rpq->p ^ *rpq->pxy ^ *xd;
1361 return (0);
1364 static int
1365 vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
1367 struct reconst_pq_struct *rpq = private;
1368 uint8_t *xd = xbuf;
1370 for (int i = 0; i < size;
1371 i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
1372 /* same operation as vdev_raidz_reconst_pq_func() on xd */
1373 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
1374 vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
1377 return (0);
1380 static void
1381 vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts)
1383 int x = tgts[0];
1384 abd_t *dst, *src;
1386 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1387 zfs_dbgmsg("reconstruct_p(rm=%px x=%u)", rr, x);
1389 ASSERT3U(ntgts, ==, 1);
1390 ASSERT3U(x, >=, rr->rr_firstdatacol);
1391 ASSERT3U(x, <, rr->rr_cols);
1393 ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size);
1395 src = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
1396 dst = rr->rr_col[x].rc_abd;
1398 abd_copy_from_buf(dst, abd_to_buf(src), rr->rr_col[x].rc_size);
1400 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1401 uint64_t size = MIN(rr->rr_col[x].rc_size,
1402 rr->rr_col[c].rc_size);
1404 src = rr->rr_col[c].rc_abd;
1406 if (c == x)
1407 continue;
1409 (void) abd_iterate_func2(dst, src, 0, 0, size,
1410 vdev_raidz_reconst_p_func, NULL);
1414 static void
1415 vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts)
1417 int x = tgts[0];
1418 int c, exp;
1419 abd_t *dst, *src;
1421 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1422 zfs_dbgmsg("reconstruct_q(rm=%px x=%u)", rr, x);
1424 ASSERT(ntgts == 1);
1426 ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size);
1428 for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1429 uint64_t size = (c == x) ? 0 : MIN(rr->rr_col[x].rc_size,
1430 rr->rr_col[c].rc_size);
1432 src = rr->rr_col[c].rc_abd;
1433 dst = rr->rr_col[x].rc_abd;
1435 if (c == rr->rr_firstdatacol) {
1436 abd_copy(dst, src, size);
1437 if (rr->rr_col[x].rc_size > size) {
1438 abd_zero_off(dst, size,
1439 rr->rr_col[x].rc_size - size);
1441 } else {
1442 ASSERT3U(size, <=, rr->rr_col[x].rc_size);
1443 (void) abd_iterate_func2(dst, src, 0, 0, size,
1444 vdev_raidz_reconst_q_pre_func, NULL);
1445 (void) abd_iterate_func(dst,
1446 size, rr->rr_col[x].rc_size - size,
1447 vdev_raidz_reconst_q_pre_tail_func, NULL);
1451 src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
1452 dst = rr->rr_col[x].rc_abd;
1453 exp = 255 - (rr->rr_cols - 1 - x);
1455 struct reconst_q_struct rq = { abd_to_buf(src), exp };
1456 (void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size,
1457 vdev_raidz_reconst_q_post_func, &rq);
1460 static void
1461 vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts)
1463 uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
1464 abd_t *pdata, *qdata;
1465 uint64_t xsize, ysize;
1466 int x = tgts[0];
1467 int y = tgts[1];
1468 abd_t *xd, *yd;
1470 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1471 zfs_dbgmsg("reconstruct_pq(rm=%px x=%u y=%u)", rr, x, y);
1473 ASSERT(ntgts == 2);
1474 ASSERT(x < y);
1475 ASSERT(x >= rr->rr_firstdatacol);
1476 ASSERT(y < rr->rr_cols);
1478 ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size);
1481 * Move the parity data aside -- we're going to compute parity as
1482 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
1483 * reuse the parity generation mechanism without trashing the actual
1484 * parity so we make those columns appear to be full of zeros by
1485 * setting their lengths to zero.
1487 pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
1488 qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
1489 xsize = rr->rr_col[x].rc_size;
1490 ysize = rr->rr_col[y].rc_size;
1492 rr->rr_col[VDEV_RAIDZ_P].rc_abd =
1493 abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
1494 rr->rr_col[VDEV_RAIDZ_Q].rc_abd =
1495 abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
1496 rr->rr_col[x].rc_size = 0;
1497 rr->rr_col[y].rc_size = 0;
1499 vdev_raidz_generate_parity_pq(rr);
1501 rr->rr_col[x].rc_size = xsize;
1502 rr->rr_col[y].rc_size = ysize;
1504 p = abd_to_buf(pdata);
1505 q = abd_to_buf(qdata);
1506 pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1507 qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
1508 xd = rr->rr_col[x].rc_abd;
1509 yd = rr->rr_col[y].rc_abd;
1512 * We now have:
1513 * Pxy = P + D_x + D_y
1514 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
1516 * We can then solve for D_x:
1517 * D_x = A * (P + Pxy) + B * (Q + Qxy)
1518 * where
1519 * A = 2^(x - y) * (2^(x - y) + 1)^-1
1520 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
1522 * With D_x in hand, we can easily solve for D_y:
1523 * D_y = P + Pxy + D_x
1526 a = vdev_raidz_pow2[255 + x - y];
1527 b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)];
1528 tmp = 255 - vdev_raidz_log2[a ^ 1];
1530 aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
1531 bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
1533 ASSERT3U(xsize, >=, ysize);
1534 struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
1536 (void) abd_iterate_func2(xd, yd, 0, 0, ysize,
1537 vdev_raidz_reconst_pq_func, &rpq);
1538 (void) abd_iterate_func(xd, ysize, xsize - ysize,
1539 vdev_raidz_reconst_pq_tail_func, &rpq);
1541 abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1542 abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
1545 * Restore the saved parity data.
1547 rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata;
1548 rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata;
/*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *
 *            __   __                     __     __
 *            |     |         __     __   |  p_0  |
 *            |  V  |         |  D_0  |   | p_m-1 |
 *            |     |    x    |   :   | = |  d_0  |
 *            |  I  |         | D_n-1 |   |   :   |
 *            |     |         ~~     ~~   | d_n-1 |
 *            ~~   ~~                     ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
 * computation as well as linear separability.
 *
 *      __               __               __     __
 *      |   1   ..  1 1 1 |               |  p_0  |
 *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
 *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
 *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
 *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
 *      |   :       : : : |   |   :   |   |  d_2  |
 *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
 *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
 *      |   0   ..  0 0 1 |               | d_n-1 |
 *      ~~               ~~               ~~     ~~
 *
 * Note that I, V, d, and p are known. To compute D, we must invert the
 * matrix and use the known data and parity values to reconstruct the unknown
 * data values. We begin by removing the rows in V|I and d|p that correspond
 * to failed or missing columns; we then make V|I square (n x n) and d|p
 * sized n by removing rows corresponding to unused parity from the bottom up
 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 * using Gauss-Jordan elimination. In the example below we use m=3 parity
 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 *           __                               __
 *           |   1   1   1   1   1   1   1   1 |
 *           | 128  64  32  16   8   4   2   1 | <-----+-+-- missing disks
 *           |  19 205 116  29  64  16   4   1 |      / /
 *           |   1   0   0   0   0   0   0   0 |     / /
 *           |   0   1   0   0   0   0   0   0 | <--' /
 *  (V|I)  = |   0   0   1   0   0   0   0   0 | <---'
 *           |   0   0   0   1   0   0   0   0 |
 *           |   0   0   0   0   1   0   0   0 |
 *           |   0   0   0   0   0   1   0   0 |
 *           |   0   0   0   0   0   0   1   0 |
 *           |   0   0   0   0   0   0   0   1 |
 *           ~~                               ~~
 *           __                               __
 *           |   1   1   1   1   1   1   1   1 |
 *           | 128  64  32  16   8   4   2   1 |
 *           |  19 205 116  29  64  16   4   1 |
 *           |   1   0   0   0   0   0   0   0 |
 *           |   0   1   0   0   0   0   0   0 |
 *  (V|I)' = |   0   0   1   0   0   0   0   0 |
 *           |   0   0   0   1   0   0   0   0 |
 *           |   0   0   0   0   1   0   0   0 |
 *           |   0   0   0   0   0   1   0   0 |
 *           |   0   0   0   0   0   0   1   0 |
 *           |   0   0   0   0   0   0   0   1 |
 *           ~~                               ~~
 *
 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 * matrix is not singular.
 * __                                                                 __
 * |   1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0 |
 * |  19 205 116  29  64  16   4   1     0   1   0   0   0   0   0   0 |
 * |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 * |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 * |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 * |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 * |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 * |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 * ~~                                                                 ~~
 * __                                                                 __
 * |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 * |   1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0 |
 * |  19 205 116  29  64  16   4   1     0   1   0   0   0   0   0   0 |
 * |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 * |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 * |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 * |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 * |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 * ~~                                                                 ~~
 * __                                                                 __
 * |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 * |   0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1 |
 * |   0 205 116   0   0   0   0   0     0   1  19  29  64  16   4   1 |
 * |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 * |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 * |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 * |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 * |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 * ~~                                                                 ~~
 * __                                                                 __
 * |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 * |   0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1 |
 * |   0   0 185   0   0   0   0   0   205   1 222 208 141 221 201 204 |
 * |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 * |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 * |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 * |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 * |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 * ~~                                                                 ~~
 * __                                                                 __
 * |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 * |   0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1 |
 * |   0   0   1   0   0   0   0   0   166 100   4  40 158 168 216 209 |
 * |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 * |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 * |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 * |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 * |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 * ~~                                                                 ~~
 * __                                                                 __
 * |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 * |   0   1   0   0   0   0   0   0   167 100   5  41 159 169 217 208 |
 * |   0   0   1   0   0   0   0   0   166 100   4  40 158 168 216 209 |
 * |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 * |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 * |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 * |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 * |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 * ~~                                                                 ~~
 *               __                               __
 *               |   0   0   1   0   0   0   0   0 |
 *               | 167 100   5  41 159 169 217 208 |
 *               | 166 100   4  40 158 168 216 209 |
 *  (V|I)'^-1 =  |   0   0   0   1   0   0   0   0 |
 *               |   0   0   0   0   1   0   0   0 |
 *               |   0   0   0   0   0   1   0   0 |
 *               |   0   0   0   0   0   0   1   0 |
 *               |   0   0   0   0   0   0   0   1 |
 *               ~~                               ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */

1703 static void
1704 vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map,
1705 uint8_t **rows)
1707 int i, j;
1708 int pow;
1710 ASSERT(n == rr->rr_cols - rr->rr_firstdatacol);
1713 * Fill in the missing rows of interest.
1715 for (i = 0; i < nmap; i++) {
1716 ASSERT3S(0, <=, map[i]);
1717 ASSERT3S(map[i], <=, 2);
1719 pow = map[i] * n;
1720 if (pow > 255)
1721 pow -= 255;
1722 ASSERT(pow <= 255);
1724 for (j = 0; j < n; j++) {
1725 pow -= map[i];
1726 if (pow < 0)
1727 pow += 255;
1728 rows[i][j] = vdev_raidz_pow2[pow];
1733 static void
1734 vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing,
1735 uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1737 int i, j, ii, jj;
1738 uint8_t log;
1741 * Assert that the first nmissing entries from the array of used
1742 * columns correspond to parity columns and that subsequent entries
1743 * correspond to data columns.
1745 for (i = 0; i < nmissing; i++) {
1746 ASSERT3S(used[i], <, rr->rr_firstdatacol);
1748 for (; i < n; i++) {
1749 ASSERT3S(used[i], >=, rr->rr_firstdatacol);
1753 * First initialize the storage where we'll compute the inverse rows.
1755 for (i = 0; i < nmissing; i++) {
1756 for (j = 0; j < n; j++) {
1757 invrows[i][j] = (i == j) ? 1 : 0;
1762 * Subtract all trivial rows from the rows of consequence.
1764 for (i = 0; i < nmissing; i++) {
1765 for (j = nmissing; j < n; j++) {
1766 ASSERT3U(used[j], >=, rr->rr_firstdatacol);
1767 jj = used[j] - rr->rr_firstdatacol;
1768 ASSERT3S(jj, <, n);
1769 invrows[i][j] = rows[i][jj];
1770 rows[i][jj] = 0;
1775 * For each of the rows of interest, we must normalize it and subtract
1776 * a multiple of it from the other rows.
1778 for (i = 0; i < nmissing; i++) {
1779 for (j = 0; j < missing[i]; j++) {
1780 ASSERT0(rows[i][j]);
1782 ASSERT3U(rows[i][missing[i]], !=, 0);
1785 * Compute the inverse of the first element and multiply each
1786 * element in the row by that value.
1788 log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1790 for (j = 0; j < n; j++) {
1791 rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1792 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1795 for (ii = 0; ii < nmissing; ii++) {
1796 if (i == ii)
1797 continue;
1799 ASSERT3U(rows[ii][missing[i]], !=, 0);
1801 log = vdev_raidz_log2[rows[ii][missing[i]]];
1803 for (j = 0; j < n; j++) {
1804 rows[ii][j] ^=
1805 vdev_raidz_exp2(rows[i][j], log);
1806 invrows[ii][j] ^=
1807 vdev_raidz_exp2(invrows[i][j], log);
1813 * Verify that the data that is left in the rows are properly part of
1814 * an identity matrix.
1816 for (i = 0; i < nmissing; i++) {
1817 for (j = 0; j < n; j++) {
1818 if (j == missing[i]) {
1819 ASSERT3U(rows[i][j], ==, 1);
1820 } else {
1821 ASSERT0(rows[i][j]);
1827 static void
1828 vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing,
1829 int *missing, uint8_t **invrows, const uint8_t *used)
1831 int i, j, x, cc, c;
1832 uint8_t *src;
1833 uint64_t ccount;
1834 uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL };
1835 uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 };
1836 uint8_t log = 0;
1837 uint8_t val;
1838 int ll;
1839 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1840 uint8_t *p, *pp;
1841 size_t psize;
1843 psize = sizeof (invlog[0][0]) * n * nmissing;
1844 p = kmem_alloc(psize, KM_SLEEP);
1846 for (pp = p, i = 0; i < nmissing; i++) {
1847 invlog[i] = pp;
1848 pp += n;
1851 for (i = 0; i < nmissing; i++) {
1852 for (j = 0; j < n; j++) {
1853 ASSERT3U(invrows[i][j], !=, 0);
1854 invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1858 for (i = 0; i < n; i++) {
1859 c = used[i];
1860 ASSERT3U(c, <, rr->rr_cols);
1862 ccount = rr->rr_col[c].rc_size;
1863 ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0);
1864 if (ccount == 0)
1865 continue;
1866 src = abd_to_buf(rr->rr_col[c].rc_abd);
1867 for (j = 0; j < nmissing; j++) {
1868 cc = missing[j] + rr->rr_firstdatacol;
1869 ASSERT3U(cc, >=, rr->rr_firstdatacol);
1870 ASSERT3U(cc, <, rr->rr_cols);
1871 ASSERT3U(cc, !=, c);
1873 dcount[j] = rr->rr_col[cc].rc_size;
1874 if (dcount[j] != 0)
1875 dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd);
1878 for (x = 0; x < ccount; x++, src++) {
1879 if (*src != 0)
1880 log = vdev_raidz_log2[*src];
1882 for (cc = 0; cc < nmissing; cc++) {
1883 if (x >= dcount[cc])
1884 continue;
1886 if (*src == 0) {
1887 val = 0;
1888 } else {
1889 if ((ll = log + invlog[cc][i]) >= 255)
1890 ll -= 255;
1891 val = vdev_raidz_pow2[ll];
1894 if (i == 0)
1895 dst[cc][x] = val;
1896 else
1897 dst[cc][x] ^= val;
1902 kmem_free(p, psize);
1905 static void
1906 vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts)
1908 int i, c, t, tt;
1909 unsigned int n;
1910 unsigned int nmissing_rows;
1911 int missing_rows[VDEV_RAIDZ_MAXPARITY];
1912 int parity_map[VDEV_RAIDZ_MAXPARITY];
1913 uint8_t *p, *pp;
1914 size_t psize;
1915 uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1916 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1917 uint8_t *used;
1919 abd_t **bufs = NULL;
1921 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1922 zfs_dbgmsg("reconstruct_general(rm=%px ntgts=%u)", rr, ntgts);
1924 * Matrix reconstruction can't use scatter ABDs yet, so we allocate
1925 * temporary linear ABDs if any non-linear ABDs are found.
1927 for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) {
1928 ASSERT(rr->rr_col[i].rc_abd != NULL);
1929 if (!abd_is_linear(rr->rr_col[i].rc_abd)) {
1930 bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *),
1931 KM_PUSHPAGE);
1933 for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1934 raidz_col_t *col = &rr->rr_col[c];
1936 bufs[c] = col->rc_abd;
1937 if (bufs[c] != NULL) {
1938 col->rc_abd = abd_alloc_linear(
1939 col->rc_size, B_TRUE);
1940 abd_copy(col->rc_abd, bufs[c],
1941 col->rc_size);
1945 break;
1949 n = rr->rr_cols - rr->rr_firstdatacol;
1952 * Figure out which data columns are missing.
1954 nmissing_rows = 0;
1955 for (t = 0; t < ntgts; t++) {
1956 if (tgts[t] >= rr->rr_firstdatacol) {
1957 missing_rows[nmissing_rows++] =
1958 tgts[t] - rr->rr_firstdatacol;
1963 * Figure out which parity columns to use to help generate the missing
1964 * data columns.
1966 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1967 ASSERT(tt < ntgts);
1968 ASSERT(c < rr->rr_firstdatacol);
1971 * Skip any targeted parity columns.
1973 if (c == tgts[tt]) {
1974 tt++;
1975 continue;
1978 parity_map[i] = c;
1979 i++;
1982 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1983 nmissing_rows * n + sizeof (used[0]) * n;
1984 p = kmem_alloc(psize, KM_SLEEP);
1986 for (pp = p, i = 0; i < nmissing_rows; i++) {
1987 rows[i] = pp;
1988 pp += n;
1989 invrows[i] = pp;
1990 pp += n;
1992 used = pp;
1994 for (i = 0; i < nmissing_rows; i++) {
1995 used[i] = parity_map[i];
1998 for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1999 if (tt < nmissing_rows &&
2000 c == missing_rows[tt] + rr->rr_firstdatacol) {
2001 tt++;
2002 continue;
2005 ASSERT3S(i, <, n);
2006 used[i] = c;
2007 i++;
2011 * Initialize the interesting rows of the matrix.
2013 vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows);
2016 * Invert the matrix.
2018 vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows,
2019 invrows, used);
2022 * Reconstruct the missing data using the generated matrix.
2024 vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows,
2025 invrows, used);
2027 kmem_free(p, psize);
2030 * copy back from temporary linear abds and free them
2032 if (bufs) {
2033 for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
2034 raidz_col_t *col = &rr->rr_col[c];
2036 if (bufs[c] != NULL) {
2037 abd_copy(bufs[c], col->rc_abd, col->rc_size);
2038 abd_free(col->rc_abd);
2040 col->rc_abd = bufs[c];
2042 kmem_free(bufs, rr->rr_cols * sizeof (abd_t *));
2046 static void
2047 vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr,
2048 const int *t, int nt)
2050 int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
2051 int ntgts;
2052 int i, c, ret;
2053 int nbadparity, nbaddata;
2054 int parity_valid[VDEV_RAIDZ_MAXPARITY];
2056 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
2057 zfs_dbgmsg("reconstruct(rm=%px nt=%u cols=%u md=%u mp=%u)",
2058 rr, nt, (int)rr->rr_cols, (int)rr->rr_missingdata,
2059 (int)rr->rr_missingparity);
2062 nbadparity = rr->rr_firstdatacol;
2063 nbaddata = rr->rr_cols - nbadparity;
2064 ntgts = 0;
2065 for (i = 0, c = 0; c < rr->rr_cols; c++) {
2066 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
2067 zfs_dbgmsg("reconstruct(rm=%px col=%u devid=%u "
2068 "offset=%llx error=%u)",
2069 rr, c, (int)rr->rr_col[c].rc_devidx,
2070 (long long)rr->rr_col[c].rc_offset,
2071 (int)rr->rr_col[c].rc_error);
2073 if (c < rr->rr_firstdatacol)
2074 parity_valid[c] = B_FALSE;
2076 if (i < nt && c == t[i]) {
2077 tgts[ntgts++] = c;
2078 i++;
2079 } else if (rr->rr_col[c].rc_error != 0) {
2080 tgts[ntgts++] = c;
2081 } else if (c >= rr->rr_firstdatacol) {
2082 nbaddata--;
2083 } else {
2084 parity_valid[c] = B_TRUE;
2085 nbadparity--;
2089 ASSERT(ntgts >= nt);
2090 ASSERT(nbaddata >= 0);
2091 ASSERT(nbaddata + nbadparity == ntgts);
2093 dt = &tgts[nbadparity];
2095 /* Reconstruct using the new math implementation */
2096 ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata);
2097 if (ret != RAIDZ_ORIGINAL_IMPL)
2098 return;
2101 * See if we can use any of our optimized reconstruction routines.
2103 switch (nbaddata) {
2104 case 1:
2105 if (parity_valid[VDEV_RAIDZ_P]) {
2106 vdev_raidz_reconstruct_p(rr, dt, 1);
2107 return;
2110 ASSERT(rr->rr_firstdatacol > 1);
2112 if (parity_valid[VDEV_RAIDZ_Q]) {
2113 vdev_raidz_reconstruct_q(rr, dt, 1);
2114 return;
2117 ASSERT(rr->rr_firstdatacol > 2);
2118 break;
2120 case 2:
2121 ASSERT(rr->rr_firstdatacol > 1);
2123 if (parity_valid[VDEV_RAIDZ_P] &&
2124 parity_valid[VDEV_RAIDZ_Q]) {
2125 vdev_raidz_reconstruct_pq(rr, dt, 2);
2126 return;
2129 ASSERT(rr->rr_firstdatacol > 2);
2131 break;
2134 vdev_raidz_reconstruct_general(rr, tgts, ntgts);
2137 static int
2138 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
2139 uint64_t *logical_ashift, uint64_t *physical_ashift)
2141 vdev_raidz_t *vdrz = vd->vdev_tsd;
2142 uint64_t nparity = vdrz->vd_nparity;
2143 int c;
2144 int lasterror = 0;
2145 int numerrors = 0;
2147 ASSERT(nparity > 0);
2149 if (nparity > VDEV_RAIDZ_MAXPARITY ||
2150 vd->vdev_children < nparity + 1) {
2151 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
2152 return (SET_ERROR(EINVAL));
2155 vdev_open_children(vd);
2157 for (c = 0; c < vd->vdev_children; c++) {
2158 vdev_t *cvd = vd->vdev_child[c];
2160 if (cvd->vdev_open_error != 0) {
2161 lasterror = cvd->vdev_open_error;
2162 numerrors++;
2163 continue;
2166 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
2167 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
2168 *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
2170 for (c = 0; c < vd->vdev_children; c++) {
2171 vdev_t *cvd = vd->vdev_child[c];
2173 if (cvd->vdev_open_error != 0)
2174 continue;
2175 *physical_ashift = vdev_best_ashift(*logical_ashift,
2176 *physical_ashift, cvd->vdev_physical_ashift);
2179 if (vd->vdev_rz_expanding) {
2180 *asize *= vd->vdev_children - 1;
2181 *max_asize *= vd->vdev_children - 1;
2183 vd->vdev_min_asize = *asize;
2184 } else {
2185 *asize *= vd->vdev_children;
2186 *max_asize *= vd->vdev_children;
2189 if (numerrors > nparity) {
2190 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
2191 return (lasterror);
2194 return (0);
2197 static void
2198 vdev_raidz_close(vdev_t *vd)
2200 for (int c = 0; c < vd->vdev_children; c++) {
2201 if (vd->vdev_child[c] != NULL)
2202 vdev_close(vd->vdev_child[c]);
2207 * Return the logical width to use, given the txg in which the allocation
2208 * happened. Note that BP_GET_BIRTH() is usually the txg in which the
2209 * BP was allocated. Remapped BP's (that were relocated due to device
2210 * removal, see remap_blkptr_cb()), will have a more recent physical birth
2211 * which reflects when the BP was relocated, but we can ignore these because
2212 * they can't be on RAIDZ (device removal doesn't support RAIDZ).
2214 static uint64_t
2215 vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg)
2217 reflow_node_t lookup = {
2218 .re_txg = txg,
2220 avl_index_t where;
2222 uint64_t width;
2223 mutex_enter(&vdrz->vd_expand_lock);
2224 reflow_node_t *re = avl_find(&vdrz->vd_expand_txgs, &lookup, &where);
2225 if (re != NULL) {
2226 width = re->re_logical_width;
2227 } else {
2228 re = avl_nearest(&vdrz->vd_expand_txgs, where, AVL_BEFORE);
2229 if (re != NULL)
2230 width = re->re_logical_width;
2231 else
2232 width = vdrz->vd_original_width;
2234 mutex_exit(&vdrz->vd_expand_lock);
2235 return (width);
2239 * Note: If the RAIDZ vdev has been expanded, older BP's may have allocated
2240 * more space due to the lower data-to-parity ratio. In this case it's
2241 * important to pass in the correct txg. Note that vdev_gang_header_asize()
2242 * relies on a constant asize for psize=SPA_GANGBLOCKSIZE=SPA_MINBLOCKSIZE,
2243 * regardless of txg. This is assured because for a single data sector, we
2244 * allocate P+1 sectors regardless of width ("cols", which is at least P+1).
2246 static uint64_t
2247 vdev_raidz_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
2249 vdev_raidz_t *vdrz = vd->vdev_tsd;
2250 uint64_t asize;
2251 uint64_t ashift = vd->vdev_top->vdev_ashift;
2252 uint64_t cols = vdrz->vd_original_width;
2253 uint64_t nparity = vdrz->vd_nparity;
2255 cols = vdev_raidz_get_logical_width(vdrz, txg);
2257 asize = ((psize - 1) >> ashift) + 1;
2258 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
2259 asize = roundup(asize, nparity + 1) << ashift;
2261 #ifdef ZFS_DEBUG
2262 uint64_t asize_new = ((psize - 1) >> ashift) + 1;
2263 uint64_t ncols_new = vdrz->vd_physical_width;
2264 asize_new += nparity * ((asize_new + ncols_new - nparity - 1) /
2265 (ncols_new - nparity));
2266 asize_new = roundup(asize_new, nparity + 1) << ashift;
2267 VERIFY3U(asize_new, <=, asize);
2268 #endif
2270 return (asize);
2274 * The allocatable space for a raidz vdev is N * sizeof(smallest child)
2275 * so each child must provide at least 1/Nth of its asize.
2277 static uint64_t
2278 vdev_raidz_min_asize(vdev_t *vd)
2280 return ((vd->vdev_min_asize + vd->vdev_children - 1) /
2281 vd->vdev_children);
2284 void
2285 vdev_raidz_child_done(zio_t *zio)
2287 raidz_col_t *rc = zio->io_private;
2289 ASSERT3P(rc->rc_abd, !=, NULL);
2290 rc->rc_error = zio->io_error;
2291 rc->rc_tried = 1;
2292 rc->rc_skipped = 0;
2295 static void
2296 vdev_raidz_shadow_child_done(zio_t *zio)
2298 raidz_col_t *rc = zio->io_private;
2300 rc->rc_shadow_error = zio->io_error;
2303 static void
2304 vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col)
2306 (void) rm;
2307 #ifdef ZFS_DEBUG
2308 range_seg64_t logical_rs, physical_rs, remain_rs;
2309 logical_rs.rs_start = rr->rr_offset;
2310 logical_rs.rs_end = logical_rs.rs_start +
2311 vdev_raidz_asize(zio->io_vd, rr->rr_size,
2312 BP_GET_BIRTH(zio->io_bp));
2314 raidz_col_t *rc = &rr->rr_col[col];
2315 vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
2317 vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs);
2318 ASSERT(vdev_xlate_is_empty(&remain_rs));
2319 if (vdev_xlate_is_empty(&physical_rs)) {
2321 * If we are in the middle of expansion, the
2322 * physical->logical mapping is changing so vdev_xlate()
2323 * can't give us a reliable answer.
2325 return;
2327 ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
2328 ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
2330 * It would be nice to assert that rs_end is equal
2331 * to rc_offset + rc_size but there might be an
2332 * optional I/O at the end that is not accounted in
2333 * rc_size.
2335 if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) {
2336 ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
2337 rc->rc_size + (1 << zio->io_vd->vdev_top->vdev_ashift));
2338 } else {
2339 ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
2341 #endif
2344 static void
2345 vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr)
2347 vdev_t *vd = zio->io_vd;
2348 raidz_map_t *rm = zio->io_vsd;
2350 vdev_raidz_generate_parity_row(rm, rr);
2352 for (int c = 0; c < rr->rr_scols; c++) {
2353 raidz_col_t *rc = &rr->rr_col[c];
2354 vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2356 /* Verify physical to logical translation */
2357 vdev_raidz_io_verify(zio, rm, rr, c);
2359 if (rc->rc_size == 0)
2360 continue;
2362 ASSERT3U(rc->rc_offset + rc->rc_size, <,
2363 cvd->vdev_psize - VDEV_LABEL_END_SIZE);
2365 ASSERT3P(rc->rc_abd, !=, NULL);
2366 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2367 rc->rc_offset, rc->rc_abd,
2368 abd_get_size(rc->rc_abd), zio->io_type,
2369 zio->io_priority, 0, vdev_raidz_child_done, rc));
2371 if (rc->rc_shadow_devidx != INT_MAX) {
2372 vdev_t *cvd2 = vd->vdev_child[rc->rc_shadow_devidx];
2374 ASSERT3U(
2375 rc->rc_shadow_offset + abd_get_size(rc->rc_abd), <,
2376 cvd2->vdev_psize - VDEV_LABEL_END_SIZE);
2378 zio_nowait(zio_vdev_child_io(zio, NULL, cvd2,
2379 rc->rc_shadow_offset, rc->rc_abd,
2380 abd_get_size(rc->rc_abd),
2381 zio->io_type, zio->io_priority, 0,
2382 vdev_raidz_shadow_child_done, rc));
2388 * Generate optional I/Os for skip sectors to improve aggregation contiguity.
2389 * This only works for vdev_raidz_map_alloc() (not _expanded()).
2391 static void
2392 raidz_start_skip_writes(zio_t *zio)
2394 vdev_t *vd = zio->io_vd;
2395 uint64_t ashift = vd->vdev_top->vdev_ashift;
2396 raidz_map_t *rm = zio->io_vsd;
2397 ASSERT3U(rm->rm_nrows, ==, 1);
2398 raidz_row_t *rr = rm->rm_row[0];
2399 for (int c = 0; c < rr->rr_scols; c++) {
2400 raidz_col_t *rc = &rr->rr_col[c];
2401 vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2402 if (rc->rc_size != 0)
2403 continue;
2404 ASSERT3P(rc->rc_abd, ==, NULL);
2406 ASSERT3U(rc->rc_offset, <,
2407 cvd->vdev_psize - VDEV_LABEL_END_SIZE);
2409 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset,
2410 NULL, 1ULL << ashift, zio->io_type, zio->io_priority,
2411 ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
2415 static void
2416 vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity)
2418 vdev_t *vd = zio->io_vd;
2421 * Iterate over the columns in reverse order so that we hit the parity
2422 * last -- any errors along the way will force us to read the parity.
2424 for (int c = rr->rr_cols - 1; c >= 0; c--) {
2425 raidz_col_t *rc = &rr->rr_col[c];
2426 if (rc->rc_size == 0)
2427 continue;
2428 vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2429 if (!vdev_readable(cvd)) {
2430 if (c >= rr->rr_firstdatacol)
2431 rr->rr_missingdata++;
2432 else
2433 rr->rr_missingparity++;
2434 rc->rc_error = SET_ERROR(ENXIO);
2435 rc->rc_tried = 1; /* don't even try */
2436 rc->rc_skipped = 1;
2437 continue;
2439 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
2440 if (c >= rr->rr_firstdatacol)
2441 rr->rr_missingdata++;
2442 else
2443 rr->rr_missingparity++;
2444 rc->rc_error = SET_ERROR(ESTALE);
2445 rc->rc_skipped = 1;
2446 continue;
2448 if (forceparity ||
2449 c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 ||
2450 (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
2451 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2452 rc->rc_offset, rc->rc_abd, rc->rc_size,
2453 zio->io_type, zio->io_priority, 0,
2454 vdev_raidz_child_done, rc));
2459 static void
2460 vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm)
2462 vdev_t *vd = zio->io_vd;
2464 for (int i = 0; i < rm->rm_nphys_cols; i++) {
2465 raidz_col_t *prc = &rm->rm_phys_col[i];
2466 if (prc->rc_size == 0)
2467 continue;
2469 ASSERT3U(prc->rc_devidx, ==, i);
2470 vdev_t *cvd = vd->vdev_child[i];
2471 if (!vdev_readable(cvd)) {
2472 prc->rc_error = SET_ERROR(ENXIO);
2473 prc->rc_tried = 1; /* don't even try */
2474 prc->rc_skipped = 1;
2475 continue;
2477 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
2478 prc->rc_error = SET_ERROR(ESTALE);
2479 prc->rc_skipped = 1;
2480 continue;
2482 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2483 prc->rc_offset, prc->rc_abd, prc->rc_size,
2484 zio->io_type, zio->io_priority, 0,
2485 vdev_raidz_child_done, prc));
2489 static void
2490 vdev_raidz_io_start_read(zio_t *zio, raidz_map_t *rm)
2493 * If there are multiple rows, we will be hitting
2494 * all disks, so go ahead and read the parity so
2495 * that we are reading in decent size chunks.
2497 boolean_t forceparity = rm->rm_nrows > 1;
2499 if (rm->rm_phys_col) {
2500 vdev_raidz_io_start_read_phys_cols(zio, rm);
2501 } else {
2502 for (int i = 0; i < rm->rm_nrows; i++) {
2503 raidz_row_t *rr = rm->rm_row[i];
2504 vdev_raidz_io_start_read_row(zio, rr, forceparity);
2510 * Start an IO operation on a RAIDZ VDev
2512 * Outline:
2513 * - For write operations:
2514 * 1. Generate the parity data
2515 * 2. Create child zio write operations to each column's vdev, for both
2516 * data and parity.
2517 * 3. If the column skips any sectors for padding, create optional dummy
2518 * write zio children for those areas to improve aggregation continuity.
2519 * - For read operations:
2520 * 1. Create child zio read operations to each data column's vdev to read
2521 * the range of data required for zio.
2522 * 2. If this is a scrub or resilver operation, or if any of the data
2523 * vdevs have had errors, then create zio read operations to the parity
2524 * columns' VDevs as well.
2526 static void
2527 vdev_raidz_io_start(zio_t *zio)
2529 vdev_t *vd = zio->io_vd;
2530 vdev_t *tvd = vd->vdev_top;
2531 vdev_raidz_t *vdrz = vd->vdev_tsd;
2532 raidz_map_t *rm;
2534 uint64_t logical_width = vdev_raidz_get_logical_width(vdrz,
2535 BP_GET_BIRTH(zio->io_bp));
2536 if (logical_width != vdrz->vd_physical_width) {
2537 zfs_locked_range_t *lr = NULL;
2538 uint64_t synced_offset = UINT64_MAX;
2539 uint64_t next_offset = UINT64_MAX;
2540 boolean_t use_scratch = B_FALSE;
2542 * Note: when the expansion is completing, we set
2543 * vre_state=DSS_FINISHED (in raidz_reflow_complete_sync())
2544 * in a later txg than when we last update spa_ubsync's state
2545 * (see the end of spa_raidz_expand_thread()). Therefore we
2546 * may see vre_state!=SCANNING before
2547 * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE=DSS_FINISHED is reflected
2548 * on disk, but the copying progress has been synced to disk
2549 * (and reflected in spa_ubsync). In this case it's fine to
2550 * treat the expansion as completed, since if we crash there's
2551 * no additional copying to do.
2553 if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
2554 ASSERT3P(vd->vdev_spa->spa_raidz_expand, ==,
2555 &vdrz->vn_vre);
2556 lr = zfs_rangelock_enter(&vdrz->vn_vre.vre_rangelock,
2557 zio->io_offset, zio->io_size, RL_READER);
2558 use_scratch =
2559 (RRSS_GET_STATE(&vd->vdev_spa->spa_ubsync) ==
2560 RRSS_SCRATCH_VALID);
2561 synced_offset =
2562 RRSS_GET_OFFSET(&vd->vdev_spa->spa_ubsync);
2563 next_offset = vdrz->vn_vre.vre_offset;
2565 * If we haven't resumed expanding since importing the
2566 * pool, vre_offset won't have been set yet. In
2567 * this case the next offset to be copied is the same
2568 * as what was synced.
2570 if (next_offset == UINT64_MAX) {
2571 next_offset = synced_offset;
2574 if (use_scratch) {
2575 zfs_dbgmsg("zio=%px %s io_offset=%llu offset_synced="
2576 "%lld next_offset=%lld use_scratch=%u",
2577 zio,
2578 zio->io_type == ZIO_TYPE_WRITE ? "WRITE" : "READ",
2579 (long long)zio->io_offset,
2580 (long long)synced_offset,
2581 (long long)next_offset,
2582 use_scratch);
2585 rm = vdev_raidz_map_alloc_expanded(zio,
2586 tvd->vdev_ashift, vdrz->vd_physical_width,
2587 logical_width, vdrz->vd_nparity,
2588 synced_offset, next_offset, use_scratch);
2589 rm->rm_lr = lr;
2590 } else {
2591 rm = vdev_raidz_map_alloc(zio,
2592 tvd->vdev_ashift, logical_width, vdrz->vd_nparity);
2594 rm->rm_original_width = vdrz->vd_original_width;
2596 zio->io_vsd = rm;
2597 zio->io_vsd_ops = &vdev_raidz_vsd_ops;
2598 if (zio->io_type == ZIO_TYPE_WRITE) {
2599 for (int i = 0; i < rm->rm_nrows; i++) {
2600 vdev_raidz_io_start_write(zio, rm->rm_row[i]);
2603 if (logical_width == vdrz->vd_physical_width) {
2604 raidz_start_skip_writes(zio);
2606 } else {
2607 ASSERT(zio->io_type == ZIO_TYPE_READ);
2608 vdev_raidz_io_start_read(zio, rm);
2611 zio_execute(zio);
2615 * Report a checksum error for a child of a RAID-Z device.
2617 void
2618 vdev_raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
2620 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
2622 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE) &&
2623 zio->io_priority != ZIO_PRIORITY_REBUILD) {
2624 zio_bad_cksum_t zbc;
2625 raidz_map_t *rm = zio->io_vsd;
2627 zbc.zbc_has_cksum = 0;
2628 zbc.zbc_injected = rm->rm_ecksuminjected;
2630 mutex_enter(&vd->vdev_stat_lock);
2631 vd->vdev_stat.vs_checksum_errors++;
2632 mutex_exit(&vd->vdev_stat_lock);
2633 (void) zfs_ereport_post_checksum(zio->io_spa, vd,
2634 &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size,
2635 rc->rc_abd, bad_data, &zbc);
2640 * We keep track of whether or not there were any injected errors, so that
2641 * any ereports we generate can note it.
2643 static int
2644 raidz_checksum_verify(zio_t *zio)
2646 zio_bad_cksum_t zbc = {0};
2647 raidz_map_t *rm = zio->io_vsd;
2649 int ret = zio_checksum_error(zio, &zbc);
2651 * Any Direct I/O read that has a checksum error must be treated as
2652 * suspicious as the contents of the buffer could be getting
2653 * manipulated while the I/O is taking place. The checksum verify error
2654 * will be reported to the top-level RAIDZ VDEV.
2656 if (zio->io_flags & ZIO_FLAG_DIO_READ && ret == ECKSUM) {
2657 zio->io_error = ret;
2658 zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;
2659 zio_dio_chksum_verify_error_report(zio);
2660 zio_checksum_verified(zio);
2661 return (0);
2664 if (ret != 0 && zbc.zbc_injected != 0)
2665 rm->rm_ecksuminjected = 1;
2667 return (ret);
2671 * Generate the parity from the data columns. If we tried and were able to
2672 * read the parity without error, verify that the generated parity matches the
2673 * data we read. If it doesn't, we fire off a checksum error. Return the
2674 * number of such failures.
2676 static int
2677 raidz_parity_verify(zio_t *zio, raidz_row_t *rr)
2679 abd_t *orig[VDEV_RAIDZ_MAXPARITY];
2680 int c, ret = 0;
2681 raidz_map_t *rm = zio->io_vsd;
2682 raidz_col_t *rc;
2684 blkptr_t *bp = zio->io_bp;
2685 enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
2686 (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
2688 if (checksum == ZIO_CHECKSUM_NOPARITY)
2689 return (ret);
2691 for (c = 0; c < rr->rr_firstdatacol; c++) {
2692 rc = &rr->rr_col[c];
2693 if (!rc->rc_tried || rc->rc_error != 0)
2694 continue;
2696 orig[c] = rc->rc_abd;
2697 ASSERT3U(abd_get_size(rc->rc_abd), ==, rc->rc_size);
2698 rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
2702 * Verify any empty sectors are zero filled to ensure the parity
2703 * is calculated correctly even if these non-data sectors are damaged.
2705 if (rr->rr_nempty && rr->rr_abd_empty != NULL)
2706 ret += vdev_draid_map_verify_empty(zio, rr);
2709 * Regenerates parity even for !tried||rc_error!=0 columns. This
2710 * isn't harmful but it does have the side effect of fixing stuff
2711 * we didn't realize was necessary (i.e. even if we return 0).
2713 vdev_raidz_generate_parity_row(rm, rr);
2715 for (c = 0; c < rr->rr_firstdatacol; c++) {
2716 rc = &rr->rr_col[c];
2718 if (!rc->rc_tried || rc->rc_error != 0)
2719 continue;
2721 if (abd_cmp(orig[c], rc->rc_abd) != 0) {
2722 zfs_dbgmsg("found error on col=%u devidx=%u off %llx",
2723 c, (int)rc->rc_devidx, (u_longlong_t)rc->rc_offset);
2724 vdev_raidz_checksum_error(zio, rc, orig[c]);
2725 rc->rc_error = SET_ERROR(ECKSUM);
2726 ret++;
2728 abd_free(orig[c]);
2731 return (ret);
2734 static int
2735 vdev_raidz_worst_error(raidz_row_t *rr)
2737 int error = 0;
2739 for (int c = 0; c < rr->rr_cols; c++) {
2740 error = zio_worst_error(error, rr->rr_col[c].rc_error);
2741 error = zio_worst_error(error, rr->rr_col[c].rc_shadow_error);
2744 return (error);
2747 static void
2748 vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
2750 int unexpected_errors = 0;
2751 int parity_errors = 0;
2752 int parity_untried = 0;
2753 int data_errors = 0;
2755 ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
2757 for (int c = 0; c < rr->rr_cols; c++) {
2758 raidz_col_t *rc = &rr->rr_col[c];
2760 if (rc->rc_error) {
2761 if (c < rr->rr_firstdatacol)
2762 parity_errors++;
2763 else
2764 data_errors++;
2766 if (!rc->rc_skipped)
2767 unexpected_errors++;
2768 } else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
2769 parity_untried++;
2772 if (rc->rc_force_repair)
2773 unexpected_errors++;
2777 * If we read more parity disks than were used for
2778 * reconstruction, confirm that the other parity disks produced
2779 * correct data.
2781 * Note that we also regenerate parity when resilvering so we
2782 * can write it out to failed devices later.
2784 if (parity_errors + parity_untried <
2785 rr->rr_firstdatacol - data_errors ||
2786 (zio->io_flags & ZIO_FLAG_RESILVER)) {
2787 int n = raidz_parity_verify(zio, rr);
2788 unexpected_errors += n;
2791 if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2792 (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2794 * Use the good data we have in hand to repair damaged children.
2796 for (int c = 0; c < rr->rr_cols; c++) {
2797 raidz_col_t *rc = &rr->rr_col[c];
2798 vdev_t *vd = zio->io_vd;
2799 vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2801 if (!rc->rc_allow_repair) {
2802 continue;
2803 } else if (!rc->rc_force_repair &&
2804 (rc->rc_error == 0 || rc->rc_size == 0)) {
2805 continue;
2808 * We do not allow self healing for Direct I/O reads.
2809 * See comment in vdev_raid_row_alloc().
2811 ASSERT0(zio->io_flags & ZIO_FLAG_DIO_READ);
2813 zfs_dbgmsg("zio=%px repairing c=%u devidx=%u "
2814 "offset=%llx",
2815 zio, c, rc->rc_devidx, (long long)rc->rc_offset);
2817 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2818 rc->rc_offset, rc->rc_abd, rc->rc_size,
2819 ZIO_TYPE_WRITE,
2820 zio->io_priority == ZIO_PRIORITY_REBUILD ?
2821 ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE,
2822 ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2823 ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2828 * Scrub or resilver i/o's: overwrite any shadow locations with the
2829 * good data. This ensures that if we've already copied this sector,
2830 * it will be corrected if it was damaged. This writes more than is
2831 * necessary, but since expansion is paused during scrub/resilver, at
2832 * most a single row will have a shadow location.
2834 if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2835 (zio->io_flags & (ZIO_FLAG_RESILVER | ZIO_FLAG_SCRUB))) {
2836 for (int c = 0; c < rr->rr_cols; c++) {
2837 raidz_col_t *rc = &rr->rr_col[c];
2838 vdev_t *vd = zio->io_vd;
2840 if (rc->rc_shadow_devidx == INT_MAX || rc->rc_size == 0)
2841 continue;
2842 vdev_t *cvd = vd->vdev_child[rc->rc_shadow_devidx];
2845 * Note: We don't want to update the repair stats
2846 * because that would incorrectly indicate that there
2847 * was bad data to repair, which we aren't sure about.
2848 * By clearing the SCAN_THREAD flag, we prevent this
2849 * from happening, despite having the REPAIR flag set.
2850 * We need to set SELF_HEAL so that this i/o can't be
2851 * bypassed by zio_vdev_io_start().
2853 zio_t *cio = zio_vdev_child_io(zio, NULL, cvd,
2854 rc->rc_shadow_offset, rc->rc_abd, rc->rc_size,
2855 ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
2856 ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL,
2857 NULL, NULL);
2858 cio->io_flags &= ~ZIO_FLAG_SCAN_THREAD;
2859 zio_nowait(cio);
2864 static void
2865 raidz_restore_orig_data(raidz_map_t *rm)
2867 for (int i = 0; i < rm->rm_nrows; i++) {
2868 raidz_row_t *rr = rm->rm_row[i];
2869 for (int c = 0; c < rr->rr_cols; c++) {
2870 raidz_col_t *rc = &rr->rr_col[c];
2871 if (rc->rc_need_orig_restore) {
2872 abd_copy(rc->rc_abd,
2873 rc->rc_orig_data, rc->rc_size);
2874 rc->rc_need_orig_restore = B_FALSE;
2881 * During raidz_reconstruct() for expanded VDEV, we need special consideration
2882 * failure simulations. See note in raidz_reconstruct() on simulating failure
2883 * of a pre-expansion device.
2885 * Treating logical child i as failed, return TRUE if the given column should
2886 * be treated as failed. The idea of logical children allows us to imagine
2887 * that a disk silently failed before a RAIDZ expansion (reads from this disk
2888 * succeed but return the wrong data). Since the expansion doesn't verify
2889 * checksums, the incorrect data will be moved to new locations spread among
2890 * the children (going diagonally across them).
2892 * Higher "logical child failures" (values of `i`) indicate these
2893 * "pre-expansion failures". The first physical_width values imagine that a
2894 * current child failed; the next physical_width-1 values imagine that a
2895 * child failed before the most recent expansion; the next physical_width-2
2896 * values imagine a child failed in the expansion before that, etc.
2898 static boolean_t
2899 raidz_simulate_failure(int physical_width, int original_width, int ashift,
2900 int i, raidz_col_t *rc)
2902 uint64_t sector_id =
2903 physical_width * (rc->rc_offset >> ashift) +
2904 rc->rc_devidx;
2906 for (int w = physical_width; w >= original_width; w--) {
2907 if (i < w) {
2908 return (sector_id % w == i);
2909 } else {
2910 i -= w;
2913 ASSERT(!"invalid logical child id");
2914 return (B_FALSE);
2918 * returns EINVAL if reconstruction of the block will not be possible
2919 * returns ECKSUM if this specific reconstruction failed
2920 * returns 0 on successful reconstruction
2922 static int
2923 raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
2925 raidz_map_t *rm = zio->io_vsd;
2926 int physical_width = zio->io_vd->vdev_children;
2927 int original_width = (rm->rm_original_width != 0) ?
2928 rm->rm_original_width : physical_width;
2929 int dbgmsg = zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT;
2931 if (dbgmsg) {
2932 zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px ltgts=%u,%u,%u "
2933 "ntgts=%u", zio, ltgts[0], ltgts[1], ltgts[2], ntgts);
2936 /* Reconstruct each row */
2937 for (int r = 0; r < rm->rm_nrows; r++) {
2938 raidz_row_t *rr = rm->rm_row[r];
2939 int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */
2940 int t = 0;
2941 int dead = 0;
2942 int dead_data = 0;
2944 if (dbgmsg)
2945 zfs_dbgmsg("raidz_reconstruct_expanded(row=%u)", r);
2947 for (int c = 0; c < rr->rr_cols; c++) {
2948 raidz_col_t *rc = &rr->rr_col[c];
2949 ASSERT0(rc->rc_need_orig_restore);
2950 if (rc->rc_error != 0) {
2951 dead++;
2952 if (c >= nparity)
2953 dead_data++;
2954 continue;
2956 if (rc->rc_size == 0)
2957 continue;
2958 for (int lt = 0; lt < ntgts; lt++) {
2959 if (raidz_simulate_failure(physical_width,
2960 original_width,
2961 zio->io_vd->vdev_top->vdev_ashift,
2962 ltgts[lt], rc)) {
2963 if (rc->rc_orig_data == NULL) {
2964 rc->rc_orig_data =
2965 abd_alloc_linear(
2966 rc->rc_size, B_TRUE);
2967 abd_copy(rc->rc_orig_data,
2968 rc->rc_abd, rc->rc_size);
2970 rc->rc_need_orig_restore = B_TRUE;
2972 dead++;
2973 if (c >= nparity)
2974 dead_data++;
2976 * Note: simulating failure of a
2977 * pre-expansion device can hit more
2978 * than one column, in which case we
2979 * might try to simulate more failures
2980 * than can be reconstructed, which is
2981 * also more than the size of my_tgts.
2982 * This check prevents accessing past
2983 * the end of my_tgts. The "dead >
2984 * nparity" check below will fail this
2985 * reconstruction attempt.
2987 if (t < VDEV_RAIDZ_MAXPARITY) {
2988 my_tgts[t++] = c;
2989 if (dbgmsg) {
2990 zfs_dbgmsg("simulating "
2991 "failure of col %u "
2992 "devidx %u", c,
2993 (int)rc->rc_devidx);
2996 break;
3000 if (dead > nparity) {
3001 /* reconstruction not possible */
3002 if (dbgmsg) {
3003 zfs_dbgmsg("reconstruction not possible; "
3004 "too many failures");
3006 raidz_restore_orig_data(rm);
3007 return (EINVAL);
3009 if (dead_data > 0)
3010 vdev_raidz_reconstruct_row(rm, rr, my_tgts, t);
3013 /* Check for success */
3014 if (raidz_checksum_verify(zio) == 0) {
3015 if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)
3016 return (0);
3018 /* Reconstruction succeeded - report errors */
3019 for (int i = 0; i < rm->rm_nrows; i++) {
3020 raidz_row_t *rr = rm->rm_row[i];
3022 for (int c = 0; c < rr->rr_cols; c++) {
3023 raidz_col_t *rc = &rr->rr_col[c];
3024 if (rc->rc_need_orig_restore) {
3026 * Note: if this is a parity column,
3027 * we don't really know if it's wrong.
3028 * We need to let
3029 * vdev_raidz_io_done_verified() check
3030 * it, and if we set rc_error, it will
3031 * think that it is a "known" error
3032 * that doesn't need to be checked
3033 * or corrected.
3035 if (rc->rc_error == 0 &&
3036 c >= rr->rr_firstdatacol) {
3037 vdev_raidz_checksum_error(zio,
3038 rc, rc->rc_orig_data);
3039 rc->rc_error =
3040 SET_ERROR(ECKSUM);
3042 rc->rc_need_orig_restore = B_FALSE;
3046 vdev_raidz_io_done_verified(zio, rr);
3049 zio_checksum_verified(zio);
3051 if (dbgmsg) {
3052 zfs_dbgmsg("reconstruction successful "
3053 "(checksum verified)");
3055 return (0);
3058 /* Reconstruction failed - restore original data */
3059 raidz_restore_orig_data(rm);
3060 if (dbgmsg) {
3061 zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px) checksum "
3062 "failed", zio);
3064 return (ECKSUM);
3068 * Iterate over all combinations of N bad vdevs and attempt a reconstruction.
3069 * Note that the algorithm below is non-optimal because it doesn't take into
3070 * account how reconstruction is actually performed. For example, with
3071 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
3072 * is targeted as invalid as if columns 1 and 4 are targeted since in both
3073 * cases we'd only use parity information in column 0.
3075 * The order that we find the various possible combinations of failed
3076 * disks is dictated by these rules:
3077 * - Examine each "slot" (the "i" in tgts[i])
3078 * - Try to increment this slot (tgts[i] += 1)
3079 * - if we can't increment because it runs into the next slot,
3080 * reset our slot to the minimum, and examine the next slot
3082 * For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose
3083 * 3 columns to reconstruct), we will generate the following sequence:
3085 * STATE ACTION
3086 * 0 1 2 special case: skip since these are all parity
3087 * 0 1 3 first slot: reset to 0; middle slot: increment to 2
3088 * 0 2 3 first slot: increment to 1
3089 * 1 2 3 first: reset to 0; middle: reset to 1; last: increment to 4
3090 * 0 1 4 first: reset to 0; middle: increment to 2
3091 * 0 2 4 first: increment to 1
3092 * 1 2 4 first: reset to 0; middle: increment to 3
3093 * 0 3 4 first: increment to 1
3094 * 1 3 4 first: increment to 2
3095 * 2 3 4 first: reset to 0; middle: reset to 1; last: increment to 5
3096 * 0 1 5 first: reset to 0; middle: increment to 2
3097 * 0 2 5 first: increment to 1
3098 * 1 2 5 first: reset to 0; middle: increment to 3
3099 * 0 3 5 first: increment to 1
3100 * 1 3 5 first: increment to 2
3101 * 2 3 5 first: reset to 0; middle: increment to 4
3102 * 0 4 5 first: increment to 1
3103 * 1 4 5 first: increment to 2
3104 * 2 4 5 first: increment to 3
3105 * 3 4 5 done
3107 * This strategy works for dRAID but is less efficient when there are a large
3108 * number of child vdevs and therefore permutations to check. Furthermore,
3109 * since the raidz_map_t rows likely do not overlap, reconstruction would be
3110 * possible as long as there are no more than nparity data errors per row.
3111 * These additional permutations are not currently checked but could be as
3112 * a future improvement.
3114 * Returns 0 on success, ECKSUM on failure.
3116 static int
3117 vdev_raidz_combrec(zio_t *zio)
3119 int nparity = vdev_get_nparity(zio->io_vd);
3120 raidz_map_t *rm = zio->io_vsd;
3121 int physical_width = zio->io_vd->vdev_children;
3122 int original_width = (rm->rm_original_width != 0) ?
3123 rm->rm_original_width : physical_width;
3125 for (int i = 0; i < rm->rm_nrows; i++) {
3126 raidz_row_t *rr = rm->rm_row[i];
3127 int total_errors = 0;
3129 for (int c = 0; c < rr->rr_cols; c++) {
3130 if (rr->rr_col[c].rc_error)
3131 total_errors++;
3134 if (total_errors > nparity)
3135 return (vdev_raidz_worst_error(rr));
3138 for (int num_failures = 1; num_failures <= nparity; num_failures++) {
3139 int tstore[VDEV_RAIDZ_MAXPARITY + 2];
3140 int *ltgts = &tstore[1]; /* value is logical child ID */
3144 * Determine number of logical children, n. See comment
3145 * above raidz_simulate_failure().
3147 int n = 0;
3148 for (int w = physical_width;
3149 w >= original_width; w--) {
3150 n += w;
3153 ASSERT3U(num_failures, <=, nparity);
3154 ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY);
3156 /* Handle corner cases in combrec logic */
3157 ltgts[-1] = -1;
3158 for (int i = 0; i < num_failures; i++) {
3159 ltgts[i] = i;
3161 ltgts[num_failures] = n;
3163 for (;;) {
3164 int err = raidz_reconstruct(zio, ltgts, num_failures,
3165 nparity);
3166 if (err == EINVAL) {
3168 * Reconstruction not possible with this #
3169 * failures; try more failures.
3171 break;
3172 } else if (err == 0)
3173 return (0);
3175 /* Compute next targets to try */
3176 for (int t = 0; ; t++) {
3177 ASSERT3U(t, <, num_failures);
3178 ltgts[t]++;
3179 if (ltgts[t] == n) {
3180 /* try more failures */
3181 ASSERT3U(t, ==, num_failures - 1);
3182 if (zfs_flags &
3183 ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
3184 zfs_dbgmsg("reconstruction "
3185 "failed for num_failures="
3186 "%u; tried all "
3187 "combinations",
3188 num_failures);
3190 break;
3193 ASSERT3U(ltgts[t], <, n);
3194 ASSERT3U(ltgts[t], <=, ltgts[t + 1]);
3197 * If that spot is available, we're done here.
3198 * Try the next combination.
3200 if (ltgts[t] != ltgts[t + 1])
3201 break; // found next combination
3204 * Otherwise, reset this tgt to the minimum,
3205 * and move on to the next tgt.
3207 ltgts[t] = ltgts[t - 1] + 1;
3208 ASSERT3U(ltgts[t], ==, t);
3211 /* Increase the number of failures and keep trying. */
3212 if (ltgts[num_failures - 1] == n)
3213 break;
3216 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
3217 zfs_dbgmsg("reconstruction failed for all num_failures");
3218 return (ECKSUM);
3221 void
3222 vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt)
3224 for (uint64_t row = 0; row < rm->rm_nrows; row++) {
3225 raidz_row_t *rr = rm->rm_row[row];
3226 vdev_raidz_reconstruct_row(rm, rr, t, nt);
3231 * Complete a write IO operation on a RAIDZ VDev
3233 * Outline:
3234 * 1. Check for errors on the child IOs.
3235 * 2. Return, setting an error code if too few child VDevs were written
3236 * to reconstruct the data later. Note that partial writes are
3237 * considered successful if they can be reconstructed at all.
3239 static void
3240 vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr)
3242 int normal_errors = 0;
3243 int shadow_errors = 0;
3245 ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
3246 ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
3247 ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
3249 for (int c = 0; c < rr->rr_cols; c++) {
3250 raidz_col_t *rc = &rr->rr_col[c];
3252 if (rc->rc_error != 0) {
3253 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
3254 normal_errors++;
3256 if (rc->rc_shadow_error != 0) {
3257 ASSERT(rc->rc_shadow_error != ECKSUM);
3258 shadow_errors++;
3263 * Treat partial writes as a success. If we couldn't write enough
3264 * columns to reconstruct the data, the I/O failed. Otherwise, good
3265 * enough. Note that in the case of a shadow write (during raidz
3266 * expansion), depending on if we crash, either the normal (old) or
3267 * shadow (new) location may become the "real" version of the block,
3268 * so both locations must have sufficient redundancy.
3270 * Now that we support write reallocation, it would be better
3271 * to treat partial failure as real failure unless there are
3272 * no non-degraded top-level vdevs left, and not update DTLs
3273 * if we intend to reallocate.
3275 if (normal_errors > rr->rr_firstdatacol ||
3276 shadow_errors > rr->rr_firstdatacol) {
3277 zio->io_error = zio_worst_error(zio->io_error,
3278 vdev_raidz_worst_error(rr));
3282 static void
3283 vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm,
3284 raidz_row_t *rr)
3286 int parity_errors = 0;
3287 int parity_untried = 0;
3288 int data_errors = 0;
3289 int total_errors = 0;
3291 ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
3292 ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
3294 for (int c = 0; c < rr->rr_cols; c++) {
3295 raidz_col_t *rc = &rr->rr_col[c];
3298 * If scrubbing and a replacing/sparing child vdev determined
3299 * that not all of its children have an identical copy of the
3300 * data, then clear the error so the column is treated like
3301 * any other read and force a repair to correct the damage.
3303 if (rc->rc_error == ECKSUM) {
3304 ASSERT(zio->io_flags & ZIO_FLAG_SCRUB);
3305 vdev_raidz_checksum_error(zio, rc, rc->rc_abd);
3306 rc->rc_force_repair = 1;
3307 rc->rc_error = 0;
3310 if (rc->rc_error) {
3311 if (c < rr->rr_firstdatacol)
3312 parity_errors++;
3313 else
3314 data_errors++;
3316 total_errors++;
3317 } else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
3318 parity_untried++;
3323 * If there were data errors and the number of errors we saw was
3324 * correctable -- less than or equal to the number of parity disks read
3325 * -- reconstruct based on the missing data.
3327 if (data_errors != 0 &&
3328 total_errors <= rr->rr_firstdatacol - parity_untried) {
3330 * We either attempt to read all the parity columns or
3331 * none of them. If we didn't try to read parity, we
3332 * wouldn't be here in the correctable case. There must
3333 * also have been fewer parity errors than parity
3334 * columns or, again, we wouldn't be in this code path.
3336 ASSERT(parity_untried == 0);
3337 ASSERT(parity_errors < rr->rr_firstdatacol);
3340 * Identify the data columns that reported an error.
3342 int n = 0;
3343 int tgts[VDEV_RAIDZ_MAXPARITY];
3344 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
3345 raidz_col_t *rc = &rr->rr_col[c];
3346 if (rc->rc_error != 0) {
3347 ASSERT(n < VDEV_RAIDZ_MAXPARITY);
3348 tgts[n++] = c;
3352 ASSERT(rr->rr_firstdatacol >= n);
3354 vdev_raidz_reconstruct_row(rm, rr, tgts, n);
3359 * Return the number of reads issued.
3361 static int
3362 vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr)
3364 vdev_t *vd = zio->io_vd;
3365 int nread = 0;
3367 rr->rr_missingdata = 0;
3368 rr->rr_missingparity = 0;
3371 * If this rows contains empty sectors which are not required
3372 * for a normal read then allocate an ABD for them now so they
3373 * may be read, verified, and any needed repairs performed.
3375 if (rr->rr_nempty != 0 && rr->rr_abd_empty == NULL)
3376 vdev_draid_map_alloc_empty(zio, rr);
3378 for (int c = 0; c < rr->rr_cols; c++) {
3379 raidz_col_t *rc = &rr->rr_col[c];
3380 if (rc->rc_tried || rc->rc_size == 0)
3381 continue;
3383 zio_nowait(zio_vdev_child_io(zio, NULL,
3384 vd->vdev_child[rc->rc_devidx],
3385 rc->rc_offset, rc->rc_abd, rc->rc_size,
3386 zio->io_type, zio->io_priority, 0,
3387 vdev_raidz_child_done, rc));
3388 nread++;
3390 return (nread);
3394 * We're here because either there were too many errors to even attempt
3395 * reconstruction (total_errors == rm_first_datacol), or vdev_*_combrec()
3396 * failed. In either case, there is enough bad data to prevent reconstruction.
3397 * Start checksum ereports for all children which haven't failed.
3399 static void
3400 vdev_raidz_io_done_unrecoverable(zio_t *zio)
3402 raidz_map_t *rm = zio->io_vsd;
3404 for (int i = 0; i < rm->rm_nrows; i++) {
3405 raidz_row_t *rr = rm->rm_row[i];
3407 for (int c = 0; c < rr->rr_cols; c++) {
3408 raidz_col_t *rc = &rr->rr_col[c];
3409 vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
3411 if (rc->rc_error != 0)
3412 continue;
3414 zio_bad_cksum_t zbc;
3415 zbc.zbc_has_cksum = 0;
3416 zbc.zbc_injected = rm->rm_ecksuminjected;
3417 mutex_enter(&cvd->vdev_stat_lock);
3418 cvd->vdev_stat.vs_checksum_errors++;
3419 mutex_exit(&cvd->vdev_stat_lock);
3420 (void) zfs_ereport_start_checksum(zio->io_spa,
3421 cvd, &zio->io_bookmark, zio, rc->rc_offset,
3422 rc->rc_size, &zbc);
3427 void
3428 vdev_raidz_io_done(zio_t *zio)
3430 raidz_map_t *rm = zio->io_vsd;
3432 ASSERT(zio->io_bp != NULL);
3433 if (zio->io_type == ZIO_TYPE_WRITE) {
3434 for (int i = 0; i < rm->rm_nrows; i++) {
3435 vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]);
3437 } else {
3438 if (rm->rm_phys_col) {
3440 * This is an aggregated read. Copy the data and status
3441 * from the aggregate abd's to the individual rows.
3443 for (int i = 0; i < rm->rm_nrows; i++) {
3444 raidz_row_t *rr = rm->rm_row[i];
3446 for (int c = 0; c < rr->rr_cols; c++) {
3447 raidz_col_t *rc = &rr->rr_col[c];
3448 if (rc->rc_tried || rc->rc_size == 0)
3449 continue;
3451 raidz_col_t *prc =
3452 &rm->rm_phys_col[rc->rc_devidx];
3453 rc->rc_error = prc->rc_error;
3454 rc->rc_tried = prc->rc_tried;
3455 rc->rc_skipped = prc->rc_skipped;
3456 if (c >= rr->rr_firstdatacol) {
3458 * Note: this is slightly faster
3459 * than using abd_copy_off().
3461 char *physbuf = abd_to_buf(
3462 prc->rc_abd);
3463 void *physloc = physbuf +
3464 rc->rc_offset -
3465 prc->rc_offset;
3467 abd_copy_from_buf(rc->rc_abd,
3468 physloc, rc->rc_size);
3474 for (int i = 0; i < rm->rm_nrows; i++) {
3475 raidz_row_t *rr = rm->rm_row[i];
3476 vdev_raidz_io_done_reconstruct_known_missing(zio,
3477 rm, rr);
3480 if (raidz_checksum_verify(zio) == 0) {
3481 if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)
3482 goto done;
3484 for (int i = 0; i < rm->rm_nrows; i++) {
3485 raidz_row_t *rr = rm->rm_row[i];
3486 vdev_raidz_io_done_verified(zio, rr);
3488 zio_checksum_verified(zio);
3489 } else {
3491 * A sequential resilver has no checksum which makes
3492 * combinatoral reconstruction impossible. This code
3493 * path is unreachable since raidz_checksum_verify()
3494 * has no checksum to verify and must succeed.
3496 ASSERT3U(zio->io_priority, !=, ZIO_PRIORITY_REBUILD);
3499 * This isn't a typical situation -- either we got a
3500 * read error or a child silently returned bad data.
3501 * Read every block so we can try again with as much
3502 * data and parity as we can track down. If we've
3503 * already been through once before, all children will
3504 * be marked as tried so we'll proceed to combinatorial
3505 * reconstruction.
3507 int nread = 0;
3508 for (int i = 0; i < rm->rm_nrows; i++) {
3509 nread += vdev_raidz_read_all(zio,
3510 rm->rm_row[i]);
3512 if (nread != 0) {
3514 * Normally our stage is VDEV_IO_DONE, but if
3515 * we've already called redone(), it will have
3516 * changed to VDEV_IO_START, in which case we
3517 * don't want to call redone() again.
3519 if (zio->io_stage != ZIO_STAGE_VDEV_IO_START)
3520 zio_vdev_io_redone(zio);
3521 return;
3524 * It would be too expensive to try every possible
3525 * combination of failed sectors in every row, so
3526 * instead we try every combination of failed current or
3527 * past physical disk. This means that if the incorrect
3528 * sectors were all on Nparity disks at any point in the
3529 * past, we will find the correct data. The only known
3530 * case where this is less durable than a non-expanded
3531 * RAIDZ, is if we have a silent failure during
3532 * expansion. In that case, one block could be
3533 * partially in the old format and partially in the
3534 * new format, so we'd lost some sectors from the old
3535 * format and some from the new format.
3537 * e.g. logical_width=4 physical_width=6
3538 * the 15 (6+5+4) possible failed disks are:
3539 * width=6 child=0
3540 * width=6 child=1
3541 * width=6 child=2
3542 * width=6 child=3
3543 * width=6 child=4
3544 * width=6 child=5
3545 * width=5 child=0
3546 * width=5 child=1
3547 * width=5 child=2
3548 * width=5 child=3
3549 * width=5 child=4
3550 * width=4 child=0
3551 * width=4 child=1
3552 * width=4 child=2
3553 * width=4 child=3
3554 * And we will try every combination of Nparity of these
3555 * failing.
3557 * As a first pass, we can generate every combo,
3558 * and try reconstructing, ignoring any known
3559 * failures. If any row has too many known + simulated
3560 * failures, then we bail on reconstructing with this
3561 * number of simulated failures. As an improvement,
3562 * we could detect the number of whole known failures
3563 * (i.e. we have known failures on these disks for
3564 * every row; the disks never succeeded), and
3565 * subtract that from the max # failures to simulate.
3566 * We could go even further like the current
3567 * combrec code, but that doesn't seem like it
3568 * gains us very much. If we simulate a failure
3569 * that is also a known failure, that's fine.
3571 zio->io_error = vdev_raidz_combrec(zio);
3572 if (zio->io_error == ECKSUM &&
3573 !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
3574 vdev_raidz_io_done_unrecoverable(zio);
3578 done:
3579 if (rm->rm_lr != NULL) {
3580 zfs_rangelock_exit(rm->rm_lr);
3581 rm->rm_lr = NULL;
3585 static void
3586 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
3588 vdev_raidz_t *vdrz = vd->vdev_tsd;
3589 if (faulted > vdrz->vd_nparity)
3590 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
3591 VDEV_AUX_NO_REPLICAS);
3592 else if (degraded + faulted != 0)
3593 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
3594 else
3595 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
3599 * Determine if any portion of the provided block resides on a child vdev
3600 * with a dirty DTL and therefore needs to be resilvered. The function
3601 * assumes that at least one DTL is dirty which implies that full stripe
3602 * width blocks must be resilvered.
3604 static boolean_t
3605 vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
3606 uint64_t phys_birth)
3608 vdev_raidz_t *vdrz = vd->vdev_tsd;
3611 * If we're in the middle of a RAIDZ expansion, this block may be in
3612 * the old and/or new location. For simplicity, always resilver it.
3614 if (vdrz->vn_vre.vre_state == DSS_SCANNING)
3615 return (B_TRUE);
3617 uint64_t dcols = vd->vdev_children;
3618 uint64_t nparity = vdrz->vd_nparity;
3619 uint64_t ashift = vd->vdev_top->vdev_ashift;
3620 /* The starting RAIDZ (parent) vdev sector of the block. */
3621 uint64_t b = DVA_GET_OFFSET(dva) >> ashift;
3622 /* The zio's size in units of the vdev's minimum sector size. */
3623 uint64_t s = ((psize - 1) >> ashift) + 1;
3624 /* The first column for this stripe. */
3625 uint64_t f = b % dcols;
3627 /* Unreachable by sequential resilver. */
3628 ASSERT3U(phys_birth, !=, TXG_UNKNOWN);
3630 if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
3631 return (B_FALSE);
3633 if (s + nparity >= dcols)
3634 return (B_TRUE);
3636 for (uint64_t c = 0; c < s + nparity; c++) {
3637 uint64_t devidx = (f + c) % dcols;
3638 vdev_t *cvd = vd->vdev_child[devidx];
3641 * dsl_scan_need_resilver() already checked vd with
3642 * vdev_dtl_contains(). So here just check cvd with
3643 * vdev_dtl_empty(), cheaper and a good approximation.
3645 if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
3646 return (B_TRUE);
3649 return (B_FALSE);
3652 static void
3653 vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs,
3654 range_seg64_t *physical_rs, range_seg64_t *remain_rs)
3656 (void) remain_rs;
3658 vdev_t *raidvd = cvd->vdev_parent;
3659 ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);
3661 vdev_raidz_t *vdrz = raidvd->vdev_tsd;
3663 if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
3665 * We're in the middle of expansion, in which case the
3666 * translation is in flux. Any answer we give may be wrong
3667 * by the time we return, so it isn't safe for the caller to
3668 * act on it. Therefore we say that this range isn't present
3669 * on any children. The only consumers of this are "zpool
3670 * initialize" and trimming, both of which are "best effort"
3671 * anyway.
3673 physical_rs->rs_start = physical_rs->rs_end = 0;
3674 remain_rs->rs_start = remain_rs->rs_end = 0;
3675 return;
3678 uint64_t width = vdrz->vd_physical_width;
3679 uint64_t tgt_col = cvd->vdev_id;
3680 uint64_t ashift = raidvd->vdev_top->vdev_ashift;
3682 /* make sure the offsets are block-aligned */
3683 ASSERT0(logical_rs->rs_start % (1 << ashift));
3684 ASSERT0(logical_rs->rs_end % (1 << ashift));
3685 uint64_t b_start = logical_rs->rs_start >> ashift;
3686 uint64_t b_end = logical_rs->rs_end >> ashift;
3688 uint64_t start_row = 0;
3689 if (b_start > tgt_col) /* avoid underflow */
3690 start_row = ((b_start - tgt_col - 1) / width) + 1;
3692 uint64_t end_row = 0;
3693 if (b_end > tgt_col)
3694 end_row = ((b_end - tgt_col - 1) / width) + 1;
3696 physical_rs->rs_start = start_row << ashift;
3697 physical_rs->rs_end = end_row << ashift;
3699 ASSERT3U(physical_rs->rs_start, <=, logical_rs->rs_start);
3700 ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=,
3701 logical_rs->rs_end - logical_rs->rs_start);
3704 static void
3705 raidz_reflow_sync(void *arg, dmu_tx_t *tx)
3707 spa_t *spa = arg;
3708 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
3709 vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
3712 * Ensure there are no i/os to the range that is being committed.
3714 uint64_t old_offset = RRSS_GET_OFFSET(&spa->spa_uberblock);
3715 ASSERT3U(vre->vre_offset_pertxg[txgoff], >=, old_offset);
3717 mutex_enter(&vre->vre_lock);
3718 uint64_t new_offset =
3719 MIN(vre->vre_offset_pertxg[txgoff], vre->vre_failed_offset);
3721 * We should not have committed anything that failed.
3723 VERIFY3U(vre->vre_failed_offset, >=, old_offset);
3724 mutex_exit(&vre->vre_lock);
3726 zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock,
3727 old_offset, new_offset - old_offset,
3728 RL_WRITER);
3731 * Update the uberblock that will be written when this txg completes.
3733 RAIDZ_REFLOW_SET(&spa->spa_uberblock,
3734 RRSS_SCRATCH_INVALID_SYNCED_REFLOW, new_offset);
3735 vre->vre_offset_pertxg[txgoff] = 0;
3736 zfs_rangelock_exit(lr);
3738 mutex_enter(&vre->vre_lock);
3739 vre->vre_bytes_copied += vre->vre_bytes_copied_pertxg[txgoff];
3740 vre->vre_bytes_copied_pertxg[txgoff] = 0;
3741 mutex_exit(&vre->vre_lock);
3743 vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
3744 VERIFY0(zap_update(spa->spa_meta_objset,
3745 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED,
3746 sizeof (vre->vre_bytes_copied), 1, &vre->vre_bytes_copied, tx));
3749 static void
3750 raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx)
3752 spa_t *spa = arg;
3753 vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
3754 vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
3755 vdev_raidz_t *vdrz = raidvd->vdev_tsd;
3757 for (int i = 0; i < TXG_SIZE; i++)
3758 VERIFY0(vre->vre_offset_pertxg[i]);
3760 reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP);
3761 re->re_txg = tx->tx_txg + TXG_CONCURRENT_STATES;
3762 re->re_logical_width = vdrz->vd_physical_width;
3763 mutex_enter(&vdrz->vd_expand_lock);
3764 avl_add(&vdrz->vd_expand_txgs, re);
3765 mutex_exit(&vdrz->vd_expand_lock);
3767 vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
3770 * Dirty the config so that the updated ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS
3771 * will get written (based on vd_expand_txgs).
3773 vdev_config_dirty(vd);
3776 * Before we change vre_state, the on-disk state must reflect that we
3777 * have completed all copying, so that vdev_raidz_io_start() can use
3778 * vre_state to determine if the reflow is in progress. See also the
3779 * end of spa_raidz_expand_thread().
3781 VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==,
3782 raidvd->vdev_ms_count << raidvd->vdev_ms_shift);
3784 vre->vre_end_time = gethrestime_sec();
3785 vre->vre_state = DSS_FINISHED;
3787 uint64_t state = vre->vre_state;
3788 VERIFY0(zap_update(spa->spa_meta_objset,
3789 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
3790 sizeof (state), 1, &state, tx));
3792 uint64_t end_time = vre->vre_end_time;
3793 VERIFY0(zap_update(spa->spa_meta_objset,
3794 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME,
3795 sizeof (end_time), 1, &end_time, tx));
3797 spa->spa_uberblock.ub_raidz_reflow_info = 0;
3799 spa_history_log_internal(spa, "raidz vdev expansion completed", tx,
3800 "%s vdev %llu new width %llu", spa_name(spa),
3801 (unsigned long long)vd->vdev_id,
3802 (unsigned long long)vd->vdev_children);
3804 spa->spa_raidz_expand = NULL;
3805 raidvd->vdev_rz_expanding = B_FALSE;
3807 spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
3808 spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
3809 spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);
3811 spa_notify_waiters(spa);
3814 * While we're in syncing context take the opportunity to
3815 * setup a scrub. All the data has been sucessfully copied
3816 * but we have not validated any checksums.
3818 setup_sync_arg_t setup_sync_arg = {
3819 .func = POOL_SCAN_SCRUB,
3820 .txgstart = 0,
3821 .txgend = 0,
3823 if (zfs_scrub_after_expand &&
3824 dsl_scan_setup_check(&setup_sync_arg.func, tx) == 0) {
3825 dsl_scan_setup_sync(&setup_sync_arg, tx);
3830 * State of one copy batch.
3832 typedef struct raidz_reflow_arg {
3833 vdev_raidz_expand_t *rra_vre; /* Global expantion state. */
3834 zfs_locked_range_t *rra_lr; /* Range lock of this batch. */
3835 uint64_t rra_txg; /* TXG of this batch. */
3836 uint_t rra_ashift; /* Ashift of the vdev. */
3837 uint32_t rra_tbd; /* Number of in-flight ZIOs. */
3838 uint32_t rra_writes; /* Number of write ZIOs. */
3839 zio_t *rra_zio[]; /* Write ZIO pointers. */
3840 } raidz_reflow_arg_t;
3843 * Write of the new location on one child is done. Once all of them are done
3844 * we can unlock and free everything.
3846 static void
3847 raidz_reflow_write_done(zio_t *zio)
3849 raidz_reflow_arg_t *rra = zio->io_private;
3850 vdev_raidz_expand_t *vre = rra->rra_vre;
3852 abd_free(zio->io_abd);
3854 mutex_enter(&vre->vre_lock);
3855 if (zio->io_error != 0) {
3856 /* Force a reflow pause on errors */
3857 vre->vre_failed_offset =
3858 MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
3860 ASSERT3U(vre->vre_outstanding_bytes, >=, zio->io_size);
3861 vre->vre_outstanding_bytes -= zio->io_size;
3862 if (rra->rra_lr->lr_offset + rra->rra_lr->lr_length <
3863 vre->vre_failed_offset) {
3864 vre->vre_bytes_copied_pertxg[rra->rra_txg & TXG_MASK] +=
3865 zio->io_size;
3867 cv_signal(&vre->vre_cv);
3868 boolean_t done = (--rra->rra_tbd == 0);
3869 mutex_exit(&vre->vre_lock);
3871 if (!done)
3872 return;
3873 spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa);
3874 zfs_rangelock_exit(rra->rra_lr);
3875 kmem_free(rra, sizeof (*rra) + sizeof (zio_t *) * rra->rra_writes);
3879 * Read of the old location on one child is done. Once all of them are done
3880 * writes should have all the data and we can issue them.
3882 static void
3883 raidz_reflow_read_done(zio_t *zio)
3885 raidz_reflow_arg_t *rra = zio->io_private;
3886 vdev_raidz_expand_t *vre = rra->rra_vre;
3888 /* Reads of only one block use write ABDs. For bigger free gangs. */
3889 if (zio->io_size > (1 << rra->rra_ashift))
3890 abd_free(zio->io_abd);
3893 * If the read failed, or if it was done on a vdev that is not fully
3894 * healthy (e.g. a child that has a resilver in progress), we may not
3895 * have the correct data. Note that it's OK if the write proceeds.
3896 * It may write garbage but the location is otherwise unused and we
3897 * will retry later due to vre_failed_offset.
3899 if (zio->io_error != 0 || !vdev_dtl_empty(zio->io_vd, DTL_MISSING)) {
3900 zfs_dbgmsg("reflow read failed off=%llu size=%llu txg=%llu "
3901 "err=%u partial_dtl_empty=%u missing_dtl_empty=%u",
3902 (long long)rra->rra_lr->lr_offset,
3903 (long long)rra->rra_lr->lr_length,
3904 (long long)rra->rra_txg,
3905 zio->io_error,
3906 vdev_dtl_empty(zio->io_vd, DTL_PARTIAL),
3907 vdev_dtl_empty(zio->io_vd, DTL_MISSING));
3908 mutex_enter(&vre->vre_lock);
3909 /* Force a reflow pause on errors */
3910 vre->vre_failed_offset =
3911 MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
3912 mutex_exit(&vre->vre_lock);
3915 if (atomic_dec_32_nv(&rra->rra_tbd) > 0)
3916 return;
3917 rra->rra_tbd = rra->rra_writes;
3918 for (uint64_t i = 0; i < rra->rra_writes; i++)
3919 zio_nowait(rra->rra_zio[i]);
3922 static void
3923 raidz_reflow_record_progress(vdev_raidz_expand_t *vre, uint64_t offset,
3924 dmu_tx_t *tx)
3926 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
3927 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
3929 if (offset == 0)
3930 return;
3932 mutex_enter(&vre->vre_lock);
3933 ASSERT3U(vre->vre_offset, <=, offset);
3934 vre->vre_offset = offset;
3935 mutex_exit(&vre->vre_lock);
3937 if (vre->vre_offset_pertxg[txgoff] == 0) {
3938 dsl_sync_task_nowait(dmu_tx_pool(tx), raidz_reflow_sync,
3939 spa, tx);
3941 vre->vre_offset_pertxg[txgoff] = offset;
3944 static boolean_t
3945 vdev_raidz_expand_child_replacing(vdev_t *raidz_vd)
3947 for (int i = 0; i < raidz_vd->vdev_children; i++) {
3948 /* Quick check if a child is being replaced */
3949 if (!raidz_vd->vdev_child[i]->vdev_ops->vdev_op_leaf)
3950 return (B_TRUE);
3952 return (B_FALSE);
3955 static boolean_t
3956 raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt,
3957 dmu_tx_t *tx)
3959 spa_t *spa = vd->vdev_spa;
3960 uint_t ashift = vd->vdev_top->vdev_ashift;
3962 range_seg_t *rs = range_tree_first(rt);
3963 if (rt == NULL)
3964 return (B_FALSE);
3965 uint64_t offset = rs_get_start(rs, rt);
3966 ASSERT(IS_P2ALIGNED(offset, 1 << ashift));
3967 uint64_t size = rs_get_end(rs, rt) - offset;
3968 ASSERT3U(size, >=, 1 << ashift);
3969 ASSERT(IS_P2ALIGNED(size, 1 << ashift));
3971 uint64_t blkid = offset >> ashift;
3972 uint_t old_children = vd->vdev_children - 1;
3975 * We can only progress to the point that writes will not overlap
3976 * with blocks whose progress has not yet been recorded on disk.
3977 * Since partially-copied rows are still read from the old location,
3978 * we need to stop one row before the sector-wise overlap, to prevent
3979 * row-wise overlap.
3981 * Note that even if we are skipping over a large unallocated region,
3982 * we can't move the on-disk progress to `offset`, because concurrent
3983 * writes/allocations could still use the currently-unallocated
3984 * region.
3986 uint64_t ubsync_blkid =
3987 RRSS_GET_OFFSET(&spa->spa_ubsync) >> ashift;
3988 uint64_t next_overwrite_blkid = ubsync_blkid +
3989 ubsync_blkid / old_children - old_children;
3990 VERIFY3U(next_overwrite_blkid, >, ubsync_blkid);
3991 if (blkid >= next_overwrite_blkid) {
3992 raidz_reflow_record_progress(vre,
3993 next_overwrite_blkid << ashift, tx);
3994 return (B_TRUE);
3997 size = MIN(size, raidz_expand_max_copy_bytes);
3998 size = MIN(size, (uint64_t)old_children *
3999 MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE));
4000 size = MAX(size, 1 << ashift);
4001 uint_t blocks = MIN(size >> ashift, next_overwrite_blkid - blkid);
4002 size = (uint64_t)blocks << ashift;
4004 range_tree_remove(rt, offset, size);
4006 uint_t reads = MIN(blocks, old_children);
4007 uint_t writes = MIN(blocks, vd->vdev_children);
4008 raidz_reflow_arg_t *rra = kmem_zalloc(sizeof (*rra) +
4009 sizeof (zio_t *) * writes, KM_SLEEP);
4010 rra->rra_vre = vre;
4011 rra->rra_lr = zfs_rangelock_enter(&vre->vre_rangelock,
4012 offset, size, RL_WRITER);
4013 rra->rra_txg = dmu_tx_get_txg(tx);
4014 rra->rra_ashift = ashift;
4015 rra->rra_tbd = reads;
4016 rra->rra_writes = writes;
4018 raidz_reflow_record_progress(vre, offset + size, tx);
4021 * SCL_STATE will be released when the read and write are done,
4022 * by raidz_reflow_write_done().
4024 spa_config_enter(spa, SCL_STATE, spa, RW_READER);
4026 /* check if a replacing vdev was added, if so treat it as an error */
4027 if (vdev_raidz_expand_child_replacing(vd)) {
4028 zfs_dbgmsg("replacing vdev encountered, reflow paused at "
4029 "offset=%llu txg=%llu",
4030 (long long)rra->rra_lr->lr_offset,
4031 (long long)rra->rra_txg);
4033 mutex_enter(&vre->vre_lock);
4034 vre->vre_failed_offset =
4035 MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
4036 cv_signal(&vre->vre_cv);
4037 mutex_exit(&vre->vre_lock);
4039 /* drop everything we acquired */
4040 spa_config_exit(spa, SCL_STATE, spa);
4041 zfs_rangelock_exit(rra->rra_lr);
4042 kmem_free(rra, sizeof (*rra) + sizeof (zio_t *) * writes);
4043 return (B_TRUE);
4046 mutex_enter(&vre->vre_lock);
4047 vre->vre_outstanding_bytes += size;
4048 mutex_exit(&vre->vre_lock);
4050 /* Allocate ABD and ZIO for each child we write. */
4051 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
4052 zio_t *pio = spa->spa_txg_zio[txgoff];
4053 uint_t b = blocks / vd->vdev_children;
4054 uint_t bb = blocks % vd->vdev_children;
4055 for (uint_t i = 0; i < writes; i++) {
4056 uint_t n = b + (i < bb);
4057 abd_t *abd = abd_alloc_for_io(n << ashift, B_FALSE);
4058 rra->rra_zio[i] = zio_vdev_child_io(pio, NULL,
4059 vd->vdev_child[(blkid + i) % vd->vdev_children],
4060 ((blkid + i) / vd->vdev_children) << ashift,
4061 abd, n << ashift, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
4062 ZIO_FLAG_CANFAIL, raidz_reflow_write_done, rra);
4066 * Allocate and issue ZIO for each child we read. For reads of only
4067 * one block we can use respective writer ABDs, since they will also
4068 * have only one block. For bigger reads create gang ABDs and fill
4069 * them with respective blocks from writer ABDs.
4071 b = blocks / old_children;
4072 bb = blocks % old_children;
4073 for (uint_t i = 0; i < reads; i++) {
4074 uint_t n = b + (i < bb);
4075 abd_t *abd;
4076 if (n > 1) {
4077 abd = abd_alloc_gang();
4078 for (uint_t j = 0; j < n; j++) {
4079 uint_t b = j * old_children + i;
4080 abd_t *cabd = abd_get_offset_size(
4081 rra->rra_zio[b % vd->vdev_children]->io_abd,
4082 (b / vd->vdev_children) << ashift,
4083 1 << ashift);
4084 abd_gang_add(abd, cabd, B_TRUE);
4086 } else {
4087 abd = rra->rra_zio[i]->io_abd;
4089 zio_nowait(zio_vdev_child_io(pio, NULL,
4090 vd->vdev_child[(blkid + i) % old_children],
4091 ((blkid + i) / old_children) << ashift, abd,
4092 n << ashift, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
4093 ZIO_FLAG_CANFAIL, raidz_reflow_read_done, rra));
4096 return (B_FALSE);
4100 * For testing (ztest specific)
4102 static void
4103 raidz_expand_pause(uint_t pause_point)
4105 while (raidz_expand_pause_point != 0 &&
4106 raidz_expand_pause_point <= pause_point)
4107 delay(hz);
4110 static void
4111 raidz_scratch_child_done(zio_t *zio)
4113 zio_t *pio = zio->io_private;
4115 mutex_enter(&pio->io_lock);
4116 pio->io_error = zio_worst_error(pio->io_error, zio->io_error);
4117 mutex_exit(&pio->io_lock);
4121 * Reflow the beginning portion of the vdev into an intermediate scratch area
4122 * in memory and on disk. This operation must be persisted on disk before we
4123 * proceed to overwrite the beginning portion with the reflowed data.
4125 * This multi-step task can fail to complete if disk errors are encountered
4126 * and we can return here after a pause (waiting for disk to become healthy).
4128 static void
4129 raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx)
4131 vdev_raidz_expand_t *vre = arg;
4132 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
4133 zio_t *pio;
4134 int error;
4136 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
4137 vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4138 int ashift = raidvd->vdev_ashift;
4139 uint64_t write_size = P2ALIGN_TYPED(VDEV_BOOT_SIZE, 1 << ashift,
4140 uint64_t);
4141 uint64_t logical_size = write_size * raidvd->vdev_children;
4142 uint64_t read_size =
4143 P2ROUNDUP(DIV_ROUND_UP(logical_size, (raidvd->vdev_children - 1)),
4144 1 << ashift);
4147 * The scratch space must be large enough to get us to the point
4148 * that one row does not overlap itself when moved. This is checked
4149 * by vdev_raidz_attach_check().
4151 VERIFY3U(write_size, >=, raidvd->vdev_children << ashift);
4152 VERIFY3U(write_size, <=, VDEV_BOOT_SIZE);
4153 VERIFY3U(write_size, <=, read_size);
4155 zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock,
4156 0, logical_size, RL_WRITER);
4158 abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *),
4159 KM_SLEEP);
4160 for (int i = 0; i < raidvd->vdev_children; i++) {
4161 abds[i] = abd_alloc_linear(read_size, B_FALSE);
4164 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_1);
4167 * If we have already written the scratch area then we must read from
4168 * there, since new writes were redirected there while we were paused
4169 * or the original location may have been partially overwritten with
4170 * reflowed data.
4172 if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) {
4173 VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, logical_size);
4175 * Read from scratch space.
4177 pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4178 for (int i = 0; i < raidvd->vdev_children; i++) {
4180 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE
4181 * to the offset to calculate the physical offset to
4182 * write to. Passing in a negative offset makes us
4183 * access the scratch area.
4185 zio_nowait(zio_vdev_child_io(pio, NULL,
4186 raidvd->vdev_child[i],
4187 VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
4188 write_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
4189 ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio));
4191 error = zio_wait(pio);
4192 if (error != 0) {
4193 zfs_dbgmsg("reflow: error %d reading scratch location",
4194 error);
4195 goto io_error_exit;
4197 goto overwrite;
4201 * Read from original location.
4203 pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4204 for (int i = 0; i < raidvd->vdev_children - 1; i++) {
4205 ASSERT0(vdev_is_dead(raidvd->vdev_child[i]));
4206 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4207 0, abds[i], read_size, ZIO_TYPE_READ,
4208 ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL,
4209 raidz_scratch_child_done, pio));
4211 error = zio_wait(pio);
4212 if (error != 0) {
4213 zfs_dbgmsg("reflow: error %d reading original location", error);
4214 io_error_exit:
4215 for (int i = 0; i < raidvd->vdev_children; i++)
4216 abd_free(abds[i]);
4217 kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
4218 zfs_rangelock_exit(lr);
4219 spa_config_exit(spa, SCL_STATE, FTAG);
4220 return;
4223 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_2);
4226 * Reflow in memory.
4228 uint64_t logical_sectors = logical_size >> ashift;
4229 for (int i = raidvd->vdev_children - 1; i < logical_sectors; i++) {
4230 int oldchild = i % (raidvd->vdev_children - 1);
4231 uint64_t oldoff = (i / (raidvd->vdev_children - 1)) << ashift;
4233 int newchild = i % raidvd->vdev_children;
4234 uint64_t newoff = (i / raidvd->vdev_children) << ashift;
4236 /* a single sector should not be copying over itself */
4237 ASSERT(!(newchild == oldchild && newoff == oldoff));
4239 abd_copy_off(abds[newchild], abds[oldchild],
4240 newoff, oldoff, 1 << ashift);
4244 * Verify that we filled in everything we intended to (write_size on
4245 * each child).
4247 VERIFY0(logical_sectors % raidvd->vdev_children);
4248 VERIFY3U((logical_sectors / raidvd->vdev_children) << ashift, ==,
4249 write_size);
4252 * Write to scratch location (boot area).
4254 pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4255 for (int i = 0; i < raidvd->vdev_children; i++) {
4257 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to
4258 * the offset to calculate the physical offset to write to.
4259 * Passing in a negative offset lets us access the boot area.
4261 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4262 VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
4263 write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
4264 ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio));
4266 error = zio_wait(pio);
4267 if (error != 0) {
4268 zfs_dbgmsg("reflow: error %d writing scratch location", error);
4269 goto io_error_exit;
4271 pio = zio_root(spa, NULL, NULL, 0);
4272 zio_flush(pio, raidvd);
4273 zio_wait(pio);
4275 zfs_dbgmsg("reflow: wrote %llu bytes (logical) to scratch area",
4276 (long long)logical_size);
4278 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_3);
4281 * Update uberblock to indicate that scratch space is valid. This is
4282 * needed because after this point, the real location may be
4283 * overwritten. If we crash, we need to get the data from the
4284 * scratch space, rather than the real location.
4286 * Note: ub_timestamp is bumped so that vdev_uberblock_compare()
4287 * will prefer this uberblock.
4289 RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_VALID, logical_size);
4290 spa->spa_ubsync.ub_timestamp++;
4291 ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
4292 &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
4293 if (spa_multihost(spa))
4294 mmp_update_uberblock(spa, &spa->spa_ubsync);
4296 zfs_dbgmsg("reflow: uberblock updated "
4297 "(txg %llu, SCRATCH_VALID, size %llu, ts %llu)",
4298 (long long)spa->spa_ubsync.ub_txg,
4299 (long long)logical_size,
4300 (long long)spa->spa_ubsync.ub_timestamp);
4302 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_VALID);
4305 * Overwrite with reflow'ed data.
4307 overwrite:
4308 pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4309 for (int i = 0; i < raidvd->vdev_children; i++) {
4310 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4311 0, abds[i], write_size, ZIO_TYPE_WRITE,
4312 ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL,
4313 raidz_scratch_child_done, pio));
4315 error = zio_wait(pio);
4316 if (error != 0) {
4318 * When we exit early here and drop the range lock, new
4319 * writes will go into the scratch area so we'll need to
4320 * read from there when we return after pausing.
4322 zfs_dbgmsg("reflow: error %d writing real location", error);
4324 * Update the uberblock that is written when this txg completes.
4326 RAIDZ_REFLOW_SET(&spa->spa_uberblock, RRSS_SCRATCH_VALID,
4327 logical_size);
4328 goto io_error_exit;
4330 pio = zio_root(spa, NULL, NULL, 0);
4331 zio_flush(pio, raidvd);
4332 zio_wait(pio);
4334 zfs_dbgmsg("reflow: overwrote %llu bytes (logical) to real location",
4335 (long long)logical_size);
4336 for (int i = 0; i < raidvd->vdev_children; i++)
4337 abd_free(abds[i]);
4338 kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
4340 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_REFLOWED);
4343 * Update uberblock to indicate that the initial part has been
4344 * reflow'ed. This is needed because after this point (when we exit
4345 * the rangelock), we allow regular writes to this region, which will
4346 * be written to the new location only (because reflow_offset_next ==
4347 * reflow_offset_synced). If we crashed and re-copied from the
4348 * scratch space, we would lose the regular writes.
4350 RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_INVALID_SYNCED,
4351 logical_size);
4352 spa->spa_ubsync.ub_timestamp++;
4353 ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
4354 &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
4355 if (spa_multihost(spa))
4356 mmp_update_uberblock(spa, &spa->spa_ubsync);
4358 zfs_dbgmsg("reflow: uberblock updated "
4359 "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)",
4360 (long long)spa->spa_ubsync.ub_txg,
4361 (long long)logical_size,
4362 (long long)spa->spa_ubsync.ub_timestamp);
4364 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_1);
4367 * Update progress.
4369 vre->vre_offset = logical_size;
4370 zfs_rangelock_exit(lr);
4371 spa_config_exit(spa, SCL_STATE, FTAG);
4373 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
4374 vre->vre_offset_pertxg[txgoff] = vre->vre_offset;
4375 vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied;
4377 * Note - raidz_reflow_sync() will update the uberblock state to
4378 * RRSS_SCRATCH_INVALID_SYNCED_REFLOW
4380 raidz_reflow_sync(spa, tx);
4382 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2);
4386 * We crashed in the middle of raidz_reflow_scratch_sync(); complete its work
4387 * here. No other i/o can be in progress, so we don't need the vre_rangelock.
4389 void
4390 vdev_raidz_reflow_copy_scratch(spa_t *spa)
4392 vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4393 uint64_t logical_size = RRSS_GET_OFFSET(&spa->spa_uberblock);
4394 ASSERT3U(RRSS_GET_STATE(&spa->spa_uberblock), ==, RRSS_SCRATCH_VALID);
4396 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
4397 vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4398 ASSERT0(logical_size % raidvd->vdev_children);
4399 uint64_t write_size = logical_size / raidvd->vdev_children;
4401 zio_t *pio;
4404 * Read from scratch space.
4406 abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *),
4407 KM_SLEEP);
4408 for (int i = 0; i < raidvd->vdev_children; i++) {
4409 abds[i] = abd_alloc_linear(write_size, B_FALSE);
4412 pio = zio_root(spa, NULL, NULL, 0);
4413 for (int i = 0; i < raidvd->vdev_children; i++) {
4415 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to
4416 * the offset to calculate the physical offset to write to.
4417 * Passing in a negative offset lets us access the boot area.
4419 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4420 VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
4421 write_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, 0,
4422 raidz_scratch_child_done, pio));
4424 zio_wait(pio);
4427 * Overwrite real location with reflow'ed data.
4429 pio = zio_root(spa, NULL, NULL, 0);
4430 for (int i = 0; i < raidvd->vdev_children; i++) {
4431 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4432 0, abds[i], write_size, ZIO_TYPE_WRITE,
4433 ZIO_PRIORITY_REMOVAL, 0,
4434 raidz_scratch_child_done, pio));
4436 zio_wait(pio);
4437 pio = zio_root(spa, NULL, NULL, 0);
4438 zio_flush(pio, raidvd);
4439 zio_wait(pio);
4441 zfs_dbgmsg("reflow recovery: overwrote %llu bytes (logical) "
4442 "to real location", (long long)logical_size);
4444 for (int i = 0; i < raidvd->vdev_children; i++)
4445 abd_free(abds[i]);
4446 kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
4449 * Update uberblock.
4451 RAIDZ_REFLOW_SET(&spa->spa_ubsync,
4452 RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT, logical_size);
4453 spa->spa_ubsync.ub_timestamp++;
4454 VERIFY0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
4455 &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
4456 if (spa_multihost(spa))
4457 mmp_update_uberblock(spa, &spa->spa_ubsync);
4459 zfs_dbgmsg("reflow recovery: uberblock updated "
4460 "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)",
4461 (long long)spa->spa_ubsync.ub_txg,
4462 (long long)logical_size,
4463 (long long)spa->spa_ubsync.ub_timestamp);
4465 dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool,
4466 spa_first_txg(spa));
4467 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
4468 vre->vre_offset = logical_size;
4469 vre->vre_offset_pertxg[txgoff] = vre->vre_offset;
4470 vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied;
4472 * Note that raidz_reflow_sync() will update the uberblock once more
4474 raidz_reflow_sync(spa, tx);
4476 dmu_tx_commit(tx);
4478 spa_config_exit(spa, SCL_STATE, FTAG);
4481 static boolean_t
4482 spa_raidz_expand_thread_check(void *arg, zthr_t *zthr)
4484 (void) zthr;
4485 spa_t *spa = arg;
4487 return (spa->spa_raidz_expand != NULL &&
4488 !spa->spa_raidz_expand->vre_waiting_for_resilver);
4492 * RAIDZ expansion background thread
4494 * Can be called multiple times if the reflow is paused
4496 static void
4497 spa_raidz_expand_thread(void *arg, zthr_t *zthr)
4499 spa_t *spa = arg;
4500 vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4502 if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID)
4503 vre->vre_offset = 0;
4504 else
4505 vre->vre_offset = RRSS_GET_OFFSET(&spa->spa_ubsync);
4507 /* Reflow the begining portion using the scratch area */
4508 if (vre->vre_offset == 0) {
4509 VERIFY0(dsl_sync_task(spa_name(spa),
4510 NULL, raidz_reflow_scratch_sync,
4511 vre, 0, ZFS_SPACE_CHECK_NONE));
4513 /* if we encountered errors then pause */
4514 if (vre->vre_offset == 0) {
4515 mutex_enter(&vre->vre_lock);
4516 vre->vre_waiting_for_resilver = B_TRUE;
4517 mutex_exit(&vre->vre_lock);
4518 return;
4522 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4523 vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4525 uint64_t guid = raidvd->vdev_guid;
4527 /* Iterate over all the remaining metaslabs */
4528 for (uint64_t i = vre->vre_offset >> raidvd->vdev_ms_shift;
4529 i < raidvd->vdev_ms_count &&
4530 !zthr_iscancelled(zthr) &&
4531 vre->vre_failed_offset == UINT64_MAX; i++) {
4532 metaslab_t *msp = raidvd->vdev_ms[i];
4534 metaslab_disable(msp);
4535 mutex_enter(&msp->ms_lock);
4538 * The metaslab may be newly created (for the expanded
4539 * space), in which case its trees won't exist yet,
4540 * so we need to bail out early.
4542 if (msp->ms_new) {
4543 mutex_exit(&msp->ms_lock);
4544 metaslab_enable(msp, B_FALSE, B_FALSE);
4545 continue;
4548 VERIFY0(metaslab_load(msp));
4551 * We want to copy everything except the free (allocatable)
4552 * space. Note that there may be a little bit more free
4553 * space (e.g. in ms_defer), and it's fine to copy that too.
4555 uint64_t shift, start;
4556 range_seg_type_t type = metaslab_calculate_range_tree_type(
4557 raidvd, msp, &start, &shift);
4558 range_tree_t *rt = range_tree_create(NULL, type, NULL,
4559 start, shift);
4560 range_tree_add(rt, msp->ms_start, msp->ms_size);
4561 range_tree_walk(msp->ms_allocatable, range_tree_remove, rt);
4562 mutex_exit(&msp->ms_lock);
4565 * Force the last sector of each metaslab to be copied. This
4566 * ensures that we advance the on-disk progress to the end of
4567 * this metaslab while the metaslab is disabled. Otherwise, we
4568 * could move past this metaslab without advancing the on-disk
4569 * progress, and then an allocation to this metaslab would not
4570 * be copied.
4572 int sectorsz = 1 << raidvd->vdev_ashift;
4573 uint64_t ms_last_offset = msp->ms_start +
4574 msp->ms_size - sectorsz;
4575 if (!range_tree_contains(rt, ms_last_offset, sectorsz)) {
4576 range_tree_add(rt, ms_last_offset, sectorsz);
4580 * When we are resuming from a paused expansion (i.e.
4581 * when importing a pool with a expansion in progress),
4582 * discard any state that we have already processed.
4584 if (vre->vre_offset > msp->ms_start) {
4585 range_tree_clear(rt, msp->ms_start,
4586 vre->vre_offset - msp->ms_start);
4589 while (!zthr_iscancelled(zthr) &&
4590 !range_tree_is_empty(rt) &&
4591 vre->vre_failed_offset == UINT64_MAX) {
4594 * We need to periodically drop the config lock so that
4595 * writers can get in. Additionally, we can't wait
4596 * for a txg to sync while holding a config lock
4597 * (since a waiting writer could cause a 3-way deadlock
4598 * with the sync thread, which also gets a config
4599 * lock for reader). So we can't hold the config lock
4600 * while calling dmu_tx_assign().
4602 spa_config_exit(spa, SCL_CONFIG, FTAG);
4605 * If requested, pause the reflow when the amount
4606 * specified by raidz_expand_max_reflow_bytes is reached
4608 * This pause is only used during testing or debugging.
4610 while (raidz_expand_max_reflow_bytes != 0 &&
4611 raidz_expand_max_reflow_bytes <=
4612 vre->vre_bytes_copied && !zthr_iscancelled(zthr)) {
4613 delay(hz);
4616 mutex_enter(&vre->vre_lock);
4617 while (vre->vre_outstanding_bytes >
4618 raidz_expand_max_copy_bytes) {
4619 cv_wait(&vre->vre_cv, &vre->vre_lock);
4621 mutex_exit(&vre->vre_lock);
4623 dmu_tx_t *tx =
4624 dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
4626 VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
4627 uint64_t txg = dmu_tx_get_txg(tx);
4630 * Reacquire the vdev_config lock. Theoretically, the
4631 * vdev_t that we're expanding may have changed.
4633 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4634 raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4636 boolean_t needsync =
4637 raidz_reflow_impl(raidvd, vre, rt, tx);
4639 dmu_tx_commit(tx);
4641 if (needsync) {
4642 spa_config_exit(spa, SCL_CONFIG, FTAG);
4643 txg_wait_synced(spa->spa_dsl_pool, txg);
4644 spa_config_enter(spa, SCL_CONFIG, FTAG,
4645 RW_READER);
4649 spa_config_exit(spa, SCL_CONFIG, FTAG);
4651 metaslab_enable(msp, B_FALSE, B_FALSE);
4652 range_tree_vacate(rt, NULL, NULL);
4653 range_tree_destroy(rt);
4655 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4656 raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4659 spa_config_exit(spa, SCL_CONFIG, FTAG);
4662 * The txg_wait_synced() here ensures that all reflow zio's have
4663 * completed, and vre_failed_offset has been set if necessary. It
4664 * also ensures that the progress of the last raidz_reflow_sync() is
4665 * written to disk before raidz_reflow_complete_sync() changes the
4666 * in-memory vre_state. vdev_raidz_io_start() uses vre_state to
4667 * determine if a reflow is in progress, in which case we may need to
4668 * write to both old and new locations. Therefore we can only change
4669 * vre_state once this is not necessary, which is once the on-disk
4670 * progress (in spa_ubsync) has been set past any possible writes (to
4671 * the end of the last metaslab).
4673 txg_wait_synced(spa->spa_dsl_pool, 0);
4675 if (!zthr_iscancelled(zthr) &&
4676 vre->vre_offset == raidvd->vdev_ms_count << raidvd->vdev_ms_shift) {
4678 * We are not being canceled or paused, so the reflow must be
4679 * complete. In that case also mark it as completed on disk.
4681 ASSERT3U(vre->vre_failed_offset, ==, UINT64_MAX);
4682 VERIFY0(dsl_sync_task(spa_name(spa), NULL,
4683 raidz_reflow_complete_sync, spa,
4684 0, ZFS_SPACE_CHECK_NONE));
4685 (void) vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL);
4686 } else {
4688 * Wait for all copy zio's to complete and for all the
4689 * raidz_reflow_sync() synctasks to be run.
4691 spa_history_log_internal(spa, "reflow pause",
4692 NULL, "offset=%llu failed_offset=%lld",
4693 (long long)vre->vre_offset,
4694 (long long)vre->vre_failed_offset);
4695 mutex_enter(&vre->vre_lock);
4696 if (vre->vre_failed_offset != UINT64_MAX) {
4698 * Reset progress so that we will retry everything
4699 * after the point that something failed.
4701 vre->vre_offset = vre->vre_failed_offset;
4702 vre->vre_failed_offset = UINT64_MAX;
4703 vre->vre_waiting_for_resilver = B_TRUE;
4705 mutex_exit(&vre->vre_lock);
4709 void
4710 spa_start_raidz_expansion_thread(spa_t *spa)
4712 ASSERT3P(spa->spa_raidz_expand_zthr, ==, NULL);
4713 spa->spa_raidz_expand_zthr = zthr_create("raidz_expand",
4714 spa_raidz_expand_thread_check, spa_raidz_expand_thread,
4715 spa, defclsyspri);
4718 void
4719 raidz_dtl_reassessed(vdev_t *vd)
4721 spa_t *spa = vd->vdev_spa;
4722 if (spa->spa_raidz_expand != NULL) {
4723 vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4725 * we get called often from vdev_dtl_reassess() so make
4726 * sure it's our vdev and any replacing is complete
4728 if (vd->vdev_top->vdev_id == vre->vre_vdev_id &&
4729 !vdev_raidz_expand_child_replacing(vd->vdev_top)) {
4730 mutex_enter(&vre->vre_lock);
4731 if (vre->vre_waiting_for_resilver) {
4732 vdev_dbgmsg(vd, "DTL reassessed, "
4733 "continuing raidz expansion");
4734 vre->vre_waiting_for_resilver = B_FALSE;
4735 zthr_wakeup(spa->spa_raidz_expand_zthr);
4737 mutex_exit(&vre->vre_lock);
4743 vdev_raidz_attach_check(vdev_t *new_child)
4745 vdev_t *raidvd = new_child->vdev_parent;
4746 uint64_t new_children = raidvd->vdev_children;
4749 * We use the "boot" space as scratch space to handle overwriting the
4750 * initial part of the vdev. If it is too small, then this expansion
4751 * is not allowed. This would be very unusual (e.g. ashift > 13 and
4752 * >200 children).
4754 if (new_children << raidvd->vdev_ashift > VDEV_BOOT_SIZE) {
4755 return (EINVAL);
4757 return (0);
4760 void
4761 vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx)
4763 vdev_t *new_child = arg;
4764 spa_t *spa = new_child->vdev_spa;
4765 vdev_t *raidvd = new_child->vdev_parent;
4766 vdev_raidz_t *vdrz = raidvd->vdev_tsd;
4767 ASSERT3P(raidvd->vdev_ops, ==, &vdev_raidz_ops);
4768 ASSERT3P(raidvd->vdev_top, ==, raidvd);
4769 ASSERT3U(raidvd->vdev_children, >, vdrz->vd_original_width);
4770 ASSERT3U(raidvd->vdev_children, ==, vdrz->vd_physical_width + 1);
4771 ASSERT3P(raidvd->vdev_child[raidvd->vdev_children - 1], ==,
4772 new_child);
4774 spa_feature_incr(spa, SPA_FEATURE_RAIDZ_EXPANSION, tx);
4776 vdrz->vd_physical_width++;
4778 VERIFY0(spa->spa_uberblock.ub_raidz_reflow_info);
4779 vdrz->vn_vre.vre_vdev_id = raidvd->vdev_id;
4780 vdrz->vn_vre.vre_offset = 0;
4781 vdrz->vn_vre.vre_failed_offset = UINT64_MAX;
4782 spa->spa_raidz_expand = &vdrz->vn_vre;
4783 zthr_wakeup(spa->spa_raidz_expand_zthr);
4786 * Dirty the config so that ZPOOL_CONFIG_RAIDZ_EXPANDING will get
4787 * written to the config.
4789 vdev_config_dirty(raidvd);
4791 vdrz->vn_vre.vre_start_time = gethrestime_sec();
4792 vdrz->vn_vre.vre_end_time = 0;
4793 vdrz->vn_vre.vre_state = DSS_SCANNING;
4794 vdrz->vn_vre.vre_bytes_copied = 0;
4796 uint64_t state = vdrz->vn_vre.vre_state;
4797 VERIFY0(zap_update(spa->spa_meta_objset,
4798 raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
4799 sizeof (state), 1, &state, tx));
4801 uint64_t start_time = vdrz->vn_vre.vre_start_time;
4802 VERIFY0(zap_update(spa->spa_meta_objset,
4803 raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME,
4804 sizeof (start_time), 1, &start_time, tx));
4806 (void) zap_remove(spa->spa_meta_objset,
4807 raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, tx);
4808 (void) zap_remove(spa->spa_meta_objset,
4809 raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, tx);
4811 spa_history_log_internal(spa, "raidz vdev expansion started", tx,
4812 "%s vdev %llu new width %llu", spa_name(spa),
4813 (unsigned long long)raidvd->vdev_id,
4814 (unsigned long long)raidvd->vdev_children);
4818 vdev_raidz_load(vdev_t *vd)
4820 vdev_raidz_t *vdrz = vd->vdev_tsd;
4821 int err;
4823 uint64_t state = DSS_NONE;
4824 uint64_t start_time = 0;
4825 uint64_t end_time = 0;
4826 uint64_t bytes_copied = 0;
4828 if (vd->vdev_top_zap != 0) {
4829 err = zap_lookup(vd->vdev_spa->spa_meta_objset,
4830 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
4831 sizeof (state), 1, &state);
4832 if (err != 0 && err != ENOENT)
4833 return (err);
4835 err = zap_lookup(vd->vdev_spa->spa_meta_objset,
4836 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME,
4837 sizeof (start_time), 1, &start_time);
4838 if (err != 0 && err != ENOENT)
4839 return (err);
4841 err = zap_lookup(vd->vdev_spa->spa_meta_objset,
4842 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME,
4843 sizeof (end_time), 1, &end_time);
4844 if (err != 0 && err != ENOENT)
4845 return (err);
4847 err = zap_lookup(vd->vdev_spa->spa_meta_objset,
4848 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED,
4849 sizeof (bytes_copied), 1, &bytes_copied);
4850 if (err != 0 && err != ENOENT)
4851 return (err);
4855 * If we are in the middle of expansion, vre_state should have
4856 * already been set by vdev_raidz_init().
4858 EQUIV(vdrz->vn_vre.vre_state == DSS_SCANNING, state == DSS_SCANNING);
4859 vdrz->vn_vre.vre_state = (dsl_scan_state_t)state;
4860 vdrz->vn_vre.vre_start_time = start_time;
4861 vdrz->vn_vre.vre_end_time = end_time;
4862 vdrz->vn_vre.vre_bytes_copied = bytes_copied;
4864 return (0);
4868 spa_raidz_expand_get_stats(spa_t *spa, pool_raidz_expand_stat_t *pres)
4870 vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4872 if (vre == NULL) {
4873 /* no removal in progress; find most recent completed */
4874 for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) {
4875 vdev_t *vd = spa->spa_root_vdev->vdev_child[c];
4876 if (vd->vdev_ops == &vdev_raidz_ops) {
4877 vdev_raidz_t *vdrz = vd->vdev_tsd;
4879 if (vdrz->vn_vre.vre_end_time != 0 &&
4880 (vre == NULL ||
4881 vdrz->vn_vre.vre_end_time >
4882 vre->vre_end_time)) {
4883 vre = &vdrz->vn_vre;
4889 if (vre == NULL) {
4890 return (SET_ERROR(ENOENT));
4893 pres->pres_state = vre->vre_state;
4894 pres->pres_expanding_vdev = vre->vre_vdev_id;
4896 vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
4897 pres->pres_to_reflow = vd->vdev_stat.vs_alloc;
4899 mutex_enter(&vre->vre_lock);
4900 pres->pres_reflowed = vre->vre_bytes_copied;
4901 for (int i = 0; i < TXG_SIZE; i++)
4902 pres->pres_reflowed += vre->vre_bytes_copied_pertxg[i];
4903 mutex_exit(&vre->vre_lock);
4905 pres->pres_start_time = vre->vre_start_time;
4906 pres->pres_end_time = vre->vre_end_time;
4907 pres->pres_waiting_for_resilver = vre->vre_waiting_for_resilver;
4909 return (0);
4913 * Initialize private RAIDZ specific fields from the nvlist.
4915 static int
4916 vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd)
4918 uint_t children;
4919 nvlist_t **child;
4920 int error = nvlist_lookup_nvlist_array(nv,
4921 ZPOOL_CONFIG_CHILDREN, &child, &children);
4922 if (error != 0)
4923 return (SET_ERROR(EINVAL));
4925 uint64_t nparity;
4926 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) {
4927 if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
4928 return (SET_ERROR(EINVAL));
4931 * Previous versions could only support 1 or 2 parity
4932 * device.
4934 if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2)
4935 return (SET_ERROR(EINVAL));
4936 else if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3)
4937 return (SET_ERROR(EINVAL));
4938 } else {
4940 * We require the parity to be specified for SPAs that
4941 * support multiple parity levels.
4943 if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
4944 return (SET_ERROR(EINVAL));
4947 * Otherwise, we default to 1 parity device for RAID-Z.
4949 nparity = 1;
4952 vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP);
4953 vdrz->vn_vre.vre_vdev_id = -1;
4954 vdrz->vn_vre.vre_offset = UINT64_MAX;
4955 vdrz->vn_vre.vre_failed_offset = UINT64_MAX;
4956 mutex_init(&vdrz->vn_vre.vre_lock, NULL, MUTEX_DEFAULT, NULL);
4957 cv_init(&vdrz->vn_vre.vre_cv, NULL, CV_DEFAULT, NULL);
4958 zfs_rangelock_init(&vdrz->vn_vre.vre_rangelock, NULL, NULL);
4959 mutex_init(&vdrz->vd_expand_lock, NULL, MUTEX_DEFAULT, NULL);
4960 avl_create(&vdrz->vd_expand_txgs, vdev_raidz_reflow_compare,
4961 sizeof (reflow_node_t), offsetof(reflow_node_t, re_link));
4963 vdrz->vd_physical_width = children;
4964 vdrz->vd_nparity = nparity;
4966 /* note, the ID does not exist when creating a pool */
4967 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID,
4968 &vdrz->vn_vre.vre_vdev_id);
4970 boolean_t reflow_in_progress =
4971 nvlist_exists(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING);
4972 if (reflow_in_progress) {
4973 spa->spa_raidz_expand = &vdrz->vn_vre;
4974 vdrz->vn_vre.vre_state = DSS_SCANNING;
4977 vdrz->vd_original_width = children;
4978 uint64_t *txgs;
4979 unsigned int txgs_size = 0;
4980 error = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS,
4981 &txgs, &txgs_size);
4982 if (error == 0) {
4983 for (int i = 0; i < txgs_size; i++) {
4984 reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP);
4985 re->re_txg = txgs[txgs_size - i - 1];
4986 re->re_logical_width = vdrz->vd_physical_width - i;
4988 if (reflow_in_progress)
4989 re->re_logical_width--;
4991 avl_add(&vdrz->vd_expand_txgs, re);
4994 vdrz->vd_original_width = vdrz->vd_physical_width - txgs_size;
4996 if (reflow_in_progress) {
4997 vdrz->vd_original_width--;
4998 zfs_dbgmsg("reflow_in_progress, %u wide, %d prior expansions",
4999 children, txgs_size);
5002 *tsd = vdrz;
5004 return (0);
5007 static void
5008 vdev_raidz_fini(vdev_t *vd)
5010 vdev_raidz_t *vdrz = vd->vdev_tsd;
5011 if (vd->vdev_spa->spa_raidz_expand == &vdrz->vn_vre)
5012 vd->vdev_spa->spa_raidz_expand = NULL;
5013 reflow_node_t *re;
5014 void *cookie = NULL;
5015 avl_tree_t *tree = &vdrz->vd_expand_txgs;
5016 while ((re = avl_destroy_nodes(tree, &cookie)) != NULL)
5017 kmem_free(re, sizeof (*re));
5018 avl_destroy(&vdrz->vd_expand_txgs);
5019 mutex_destroy(&vdrz->vd_expand_lock);
5020 mutex_destroy(&vdrz->vn_vre.vre_lock);
5021 cv_destroy(&vdrz->vn_vre.vre_cv);
5022 zfs_rangelock_fini(&vdrz->vn_vre.vre_rangelock);
5023 kmem_free(vdrz, sizeof (*vdrz));
5027 * Add RAIDZ specific fields to the config nvlist.
5029 static void
5030 vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv)
5032 ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops);
5033 vdev_raidz_t *vdrz = vd->vdev_tsd;
5036 * Make sure someone hasn't managed to sneak a fancy new vdev
5037 * into a crufty old storage pool.
5039 ASSERT(vdrz->vd_nparity == 1 ||
5040 (vdrz->vd_nparity <= 2 &&
5041 spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ2) ||
5042 (vdrz->vd_nparity <= 3 &&
5043 spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ3));
5046 * Note that we'll add these even on storage pools where they
5047 * aren't strictly required -- older software will just ignore
5048 * it.
5050 fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity);
5052 if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
5053 fnvlist_add_boolean(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING);
5056 mutex_enter(&vdrz->vd_expand_lock);
5057 if (!avl_is_empty(&vdrz->vd_expand_txgs)) {
5058 uint64_t count = avl_numnodes(&vdrz->vd_expand_txgs);
5059 uint64_t *txgs = kmem_alloc(sizeof (uint64_t) * count,
5060 KM_SLEEP);
5061 uint64_t i = 0;
5063 for (reflow_node_t *re = avl_first(&vdrz->vd_expand_txgs);
5064 re != NULL; re = AVL_NEXT(&vdrz->vd_expand_txgs, re)) {
5065 txgs[i++] = re->re_txg;
5068 fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS,
5069 txgs, count);
5071 kmem_free(txgs, sizeof (uint64_t) * count);
5073 mutex_exit(&vdrz->vd_expand_lock);
5076 static uint64_t
5077 vdev_raidz_nparity(vdev_t *vd)
5079 vdev_raidz_t *vdrz = vd->vdev_tsd;
5080 return (vdrz->vd_nparity);
5083 static uint64_t
5084 vdev_raidz_ndisks(vdev_t *vd)
5086 return (vd->vdev_children);
5089 vdev_ops_t vdev_raidz_ops = {
5090 .vdev_op_init = vdev_raidz_init,
5091 .vdev_op_fini = vdev_raidz_fini,
5092 .vdev_op_open = vdev_raidz_open,
5093 .vdev_op_close = vdev_raidz_close,
5094 .vdev_op_asize = vdev_raidz_asize,
5095 .vdev_op_min_asize = vdev_raidz_min_asize,
5096 .vdev_op_min_alloc = NULL,
5097 .vdev_op_io_start = vdev_raidz_io_start,
5098 .vdev_op_io_done = vdev_raidz_io_done,
5099 .vdev_op_state_change = vdev_raidz_state_change,
5100 .vdev_op_need_resilver = vdev_raidz_need_resilver,
5101 .vdev_op_hold = NULL,
5102 .vdev_op_rele = NULL,
5103 .vdev_op_remap = NULL,
5104 .vdev_op_xlate = vdev_raidz_xlate,
5105 .vdev_op_rebuild_asize = NULL,
5106 .vdev_op_metaslab_init = NULL,
5107 .vdev_op_config_generate = vdev_raidz_config_generate,
5108 .vdev_op_nparity = vdev_raidz_nparity,
5109 .vdev_op_ndisks = vdev_raidz_ndisks,
5110 .vdev_op_type = VDEV_TYPE_RAIDZ, /* name of this vdev type */
5111 .vdev_op_leaf = B_FALSE /* not a leaf vdev */
5114 /* BEGIN CSTYLED */
5115 ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_reflow_bytes, ULONG, ZMOD_RW,
5116 "For testing, pause RAIDZ expansion after reflowing this many bytes");
5117 ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_copy_bytes, ULONG, ZMOD_RW,
5118 "Max amount of concurrent i/o for RAIDZ expansion");
5119 ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW,
5120 "For expanded RAIDZ, aggregate reads that have more rows than this");
5121 ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW,
5122 "For expanded RAIDZ, automatically start a pool scrub when expansion "
5123 "completes");
5124 /* END CSTYLED */