Tag 2.2.0-rc4
[zfs.git] / cmd / raidz_test / raidz_test.c
blob195026d3a7ab916c49967932386ba32dceb24473
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
23 * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
26 #include <sys/zfs_context.h>
27 #include <sys/time.h>
28 #include <sys/wait.h>
29 #include <sys/zio.h>
30 #include <umem.h>
31 #include <sys/vdev_raidz.h>
32 #include <sys/vdev_raidz_impl.h>
33 #include <assert.h>
34 #include <stdio.h>
35 #include "raidz_test.h"
37 static int *rand_data;
38 raidz_test_opts_t rto_opts;
40 static char pid_s[16];
42 static void sig_handler(int signo)
44 int old_errno = errno;
45 struct sigaction action;
47 * Restore default action and re-raise signal so SIGSEGV and
48 * SIGABRT can trigger a core dump.
50 action.sa_handler = SIG_DFL;
51 sigemptyset(&action.sa_mask);
52 action.sa_flags = 0;
53 (void) sigaction(signo, &action, NULL);
55 if (rto_opts.rto_gdb) {
56 pid_t pid = fork();
57 if (pid == 0) {
58 execlp("gdb", "gdb", "-ex", "set pagination 0",
59 "-p", pid_s, NULL);
60 _exit(-1);
61 } else if (pid > 0)
62 while (waitpid(pid, NULL, 0) == -1 && errno == EINTR)
66 raise(signo);
67 errno = old_errno;
70 static void print_opts(raidz_test_opts_t *opts, boolean_t force)
72 const char *verbose;
73 switch (opts->rto_v) {
74 case D_ALL:
75 verbose = "no";
76 break;
77 case D_INFO:
78 verbose = "info";
79 break;
80 case D_DEBUG:
81 default:
82 verbose = "debug";
83 break;
86 if (force || opts->rto_v >= D_INFO) {
87 (void) fprintf(stdout, DBLSEP "Running with options:\n"
88 " (-a) zio ashift : %zu\n"
89 " (-o) zio offset : 1 << %zu\n"
90 " (-e) expanded map : %s\n"
91 " (-r) reflow offset : %llx\n"
92 " (-d) number of raidz data columns : %zu\n"
93 " (-s) size of DATA : 1 << %zu\n"
94 " (-S) sweep parameters : %s \n"
95 " (-v) verbose : %s \n\n",
96 opts->rto_ashift, /* -a */
97 ilog2(opts->rto_offset), /* -o */
98 opts->rto_expand ? "yes" : "no", /* -e */
99 (u_longlong_t)opts->rto_expand_offset, /* -r */
100 opts->rto_dcols, /* -d */
101 ilog2(opts->rto_dsize), /* -s */
102 opts->rto_sweep ? "yes" : "no", /* -S */
103 verbose); /* -v */
107 static void usage(boolean_t requested)
109 const raidz_test_opts_t *o = &rto_opts_defaults;
111 FILE *fp = requested ? stdout : stderr;
113 (void) fprintf(fp, "Usage:\n"
114 "\t[-a zio ashift (default: %zu)]\n"
115 "\t[-o zio offset, exponent radix 2 (default: %zu)]\n"
116 "\t[-d number of raidz data columns (default: %zu)]\n"
117 "\t[-s zio size, exponent radix 2 (default: %zu)]\n"
118 "\t[-S parameter sweep (default: %s)]\n"
119 "\t[-t timeout for parameter sweep test]\n"
120 "\t[-B benchmark all raidz implementations]\n"
121 "\t[-e use expanded raidz map (default: %s)]\n"
122 "\t[-r expanded raidz map reflow offset (default: %llx)]\n"
123 "\t[-v increase verbosity (default: %d)]\n"
124 "\t[-h (print help)]\n"
125 "\t[-T test the test, see if failure would be detected]\n"
126 "\t[-D debug (attach gdb on SIGSEGV)]\n"
128 o->rto_ashift, /* -a */
129 ilog2(o->rto_offset), /* -o */
130 o->rto_dcols, /* -d */
131 ilog2(o->rto_dsize), /* -s */
132 rto_opts.rto_sweep ? "yes" : "no", /* -S */
133 rto_opts.rto_expand ? "yes" : "no", /* -e */
134 (u_longlong_t)o->rto_expand_offset, /* -r */
135 o->rto_v); /* -v */
137 exit(requested ? 0 : 1);
140 static void process_options(int argc, char **argv)
142 size_t value;
143 int opt;
144 raidz_test_opts_t *o = &rto_opts;
146 memcpy(o, &rto_opts_defaults, sizeof (*o));
148 while ((opt = getopt(argc, argv, "TDBSvha:er:o:d:s:t:")) != -1) {
149 switch (opt) {
150 case 'a':
151 value = strtoull(optarg, NULL, 0);
152 o->rto_ashift = MIN(13, MAX(9, value));
153 break;
154 case 'e':
155 o->rto_expand = 1;
156 break;
157 case 'r':
158 o->rto_expand_offset = strtoull(optarg, NULL, 0);
159 break;
160 case 'o':
161 value = strtoull(optarg, NULL, 0);
162 o->rto_offset = ((1ULL << MIN(12, value)) >> 9) << 9;
163 break;
164 case 'd':
165 value = strtoull(optarg, NULL, 0);
166 o->rto_dcols = MIN(255, MAX(1, value));
167 break;
168 case 's':
169 value = strtoull(optarg, NULL, 0);
170 o->rto_dsize = 1ULL << MIN(SPA_MAXBLOCKSHIFT,
171 MAX(SPA_MINBLOCKSHIFT, value));
172 break;
173 case 't':
174 value = strtoull(optarg, NULL, 0);
175 o->rto_sweep_timeout = value;
176 break;
177 case 'v':
178 o->rto_v++;
179 break;
180 case 'S':
181 o->rto_sweep = 1;
182 break;
183 case 'B':
184 o->rto_benchmark = 1;
185 break;
186 case 'D':
187 o->rto_gdb = 1;
188 break;
189 case 'T':
190 o->rto_sanity = 1;
191 break;
192 case 'h':
193 usage(B_TRUE);
194 break;
195 case '?':
196 default:
197 usage(B_FALSE);
198 break;
/*
 * Column accessors for a raidz row pointer 'rr'.
 *
 * DATA_COL / DATA_COL_SIZE: 'i' is relative to the first data column
 * (rr_firstdatacol).
 * CODE_COL / CODE_COL_SIZE: 'i' indexes rr_col directly; used for the
 * leading (parity) columns.
 *
 * Every use of a macro argument is parenthesised so that expressions such
 * as "rows + 1" can be passed safely.
 */
#define	DATA_COL(rr, i) \
	((rr)->rr_col[(rr)->rr_firstdatacol + (i)].rc_abd)
#define	DATA_COL_SIZE(rr, i) \
	((rr)->rr_col[(rr)->rr_firstdatacol + (i)].rc_size)

#define	CODE_COL(rr, i) ((rr)->rr_col[(i)].rc_abd)
#define	CODE_COL_SIZE(rr, i) ((rr)->rr_col[(i)].rc_size)
209 static int
210 cmp_code(raidz_test_opts_t *opts, const raidz_map_t *rm, const int parity)
212 int r, i, ret = 0;
214 VERIFY(parity >= 1 && parity <= 3);
216 for (r = 0; r < rm->rm_nrows; r++) {
217 raidz_row_t * const rr = rm->rm_row[r];
218 raidz_row_t * const rrg = opts->rm_golden->rm_row[r];
219 for (i = 0; i < parity; i++) {
220 if (CODE_COL_SIZE(rrg, i) == 0) {
221 VERIFY0(CODE_COL_SIZE(rr, i));
222 continue;
225 if (abd_cmp(CODE_COL(rr, i),
226 CODE_COL(rrg, i)) != 0) {
227 ret++;
228 LOG_OPT(D_DEBUG, opts,
229 "\nParity block [%d] different!\n", i);
233 return (ret);
236 static int
237 cmp_data(raidz_test_opts_t *opts, raidz_map_t *rm)
239 int r, i, dcols, ret = 0;
241 for (r = 0; r < rm->rm_nrows; r++) {
242 raidz_row_t *rr = rm->rm_row[r];
243 raidz_row_t *rrg = opts->rm_golden->rm_row[r];
244 dcols = opts->rm_golden->rm_row[0]->rr_cols -
245 raidz_parity(opts->rm_golden);
246 for (i = 0; i < dcols; i++) {
247 if (DATA_COL_SIZE(rrg, i) == 0) {
248 VERIFY0(DATA_COL_SIZE(rr, i));
249 continue;
252 if (abd_cmp(DATA_COL(rrg, i),
253 DATA_COL(rr, i)) != 0) {
254 ret++;
256 LOG_OPT(D_DEBUG, opts,
257 "\nData block [%d] different!\n", i);
261 return (ret);
264 static int
265 init_rand(void *data, size_t size, void *private)
267 (void) private;
268 memcpy(data, rand_data, size);
269 return (0);
272 static void
273 corrupt_colums(raidz_map_t *rm, const int *tgts, const int cnt)
275 for (int r = 0; r < rm->rm_nrows; r++) {
276 raidz_row_t *rr = rm->rm_row[r];
277 for (int i = 0; i < cnt; i++) {
278 raidz_col_t *col = &rr->rr_col[tgts[i]];
279 abd_iterate_func(col->rc_abd, 0, col->rc_size,
280 init_rand, NULL);
285 void
286 init_zio_abd(zio_t *zio)
288 abd_iterate_func(zio->io_abd, 0, zio->io_size, init_rand, NULL);
291 static void
292 fini_raidz_map(zio_t **zio, raidz_map_t **rm)
294 vdev_raidz_map_free(*rm);
295 raidz_free((*zio)->io_abd, (*zio)->io_size);
296 umem_free(*zio, sizeof (zio_t));
298 *zio = NULL;
299 *rm = NULL;
302 static int
303 init_raidz_golden_map(raidz_test_opts_t *opts, const int parity)
305 int err = 0;
306 zio_t *zio_test;
307 raidz_map_t *rm_test;
308 const size_t total_ncols = opts->rto_dcols + parity;
310 if (opts->rm_golden) {
311 fini_raidz_map(&opts->zio_golden, &opts->rm_golden);
314 opts->zio_golden = umem_zalloc(sizeof (zio_t), UMEM_NOFAIL);
315 zio_test = umem_zalloc(sizeof (zio_t), UMEM_NOFAIL);
317 opts->zio_golden->io_offset = zio_test->io_offset = opts->rto_offset;
318 opts->zio_golden->io_size = zio_test->io_size = opts->rto_dsize;
320 opts->zio_golden->io_abd = raidz_alloc(opts->rto_dsize);
321 zio_test->io_abd = raidz_alloc(opts->rto_dsize);
323 init_zio_abd(opts->zio_golden);
324 init_zio_abd(zio_test);
326 VERIFY0(vdev_raidz_impl_set("original"));
328 if (opts->rto_expand) {
329 opts->rm_golden =
330 vdev_raidz_map_alloc_expanded(opts->zio_golden->io_abd,
331 opts->zio_golden->io_size, opts->zio_golden->io_offset,
332 opts->rto_ashift, total_ncols+1, total_ncols,
333 parity, opts->rto_expand_offset);
334 rm_test = vdev_raidz_map_alloc_expanded(zio_test->io_abd,
335 zio_test->io_size, zio_test->io_offset,
336 opts->rto_ashift, total_ncols+1, total_ncols,
337 parity, opts->rto_expand_offset);
338 } else {
339 opts->rm_golden = vdev_raidz_map_alloc(opts->zio_golden,
340 opts->rto_ashift, total_ncols, parity);
341 rm_test = vdev_raidz_map_alloc(zio_test,
342 opts->rto_ashift, total_ncols, parity);
345 VERIFY(opts->zio_golden);
346 VERIFY(opts->rm_golden);
348 vdev_raidz_generate_parity(opts->rm_golden);
349 vdev_raidz_generate_parity(rm_test);
351 /* sanity check */
352 err |= cmp_data(opts, rm_test);
353 err |= cmp_code(opts, rm_test, parity);
355 if (err)
356 ERR("initializing the golden copy ... [FAIL]!\n");
358 /* tear down raidz_map of test zio */
359 fini_raidz_map(&zio_test, &rm_test);
361 return (err);
365 * If reflow is not in progress, reflow_offset should be UINT64_MAX.
366 * For each row, if the row is entirely before reflow_offset, it will
367 * come from the new location. Otherwise this row will come from the
368 * old location. Therefore, rows that straddle the reflow_offset will
369 * come from the old location.
371 * NOTE: Until raidz expansion is implemented this function is only
372 * needed by raidz_test.c to the multi-row raid_map_t functionality.
374 raidz_map_t *
375 vdev_raidz_map_alloc_expanded(abd_t *abd, uint64_t size, uint64_t offset,
376 uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols,
377 uint64_t nparity, uint64_t reflow_offset)
379 /* The zio's size in units of the vdev's minimum sector size. */
380 uint64_t s = size >> ashift;
381 uint64_t q, r, bc, devidx, asize = 0, tot;
384 * "Quotient": The number of data sectors for this stripe on all but
385 * the "big column" child vdevs that also contain "remainder" data.
386 * AKA "full rows"
388 q = s / (logical_cols - nparity);
391 * "Remainder": The number of partial stripe data sectors in this I/O.
392 * This will add a sector to some, but not all, child vdevs.
394 r = s - q * (logical_cols - nparity);
396 /* The number of "big columns" - those which contain remainder data. */
397 bc = (r == 0 ? 0 : r + nparity);
400 * The total number of data and parity sectors associated with
401 * this I/O.
403 tot = s + nparity * (q + (r == 0 ? 0 : 1));
405 /* How many rows contain data (not skip) */
406 uint64_t rows = howmany(tot, logical_cols);
407 int cols = MIN(tot, logical_cols);
409 raidz_map_t *rm = kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]),
410 KM_SLEEP);
411 rm->rm_nrows = rows;
413 for (uint64_t row = 0; row < rows; row++) {
414 raidz_row_t *rr = kmem_alloc(offsetof(raidz_row_t,
415 rr_col[cols]), KM_SLEEP);
416 rm->rm_row[row] = rr;
418 /* The starting RAIDZ (parent) vdev sector of the row. */
419 uint64_t b = (offset >> ashift) + row * logical_cols;
422 * If we are in the middle of a reflow, and any part of this
423 * row has not been copied, then use the old location of
424 * this row.
426 int row_phys_cols = physical_cols;
427 if (b + (logical_cols - nparity) > reflow_offset >> ashift)
428 row_phys_cols--;
430 /* starting child of this row */
431 uint64_t child_id = b % row_phys_cols;
432 /* The starting byte offset on each child vdev. */
433 uint64_t child_offset = (b / row_phys_cols) << ashift;
436 * We set cols to the entire width of the block, even
437 * if this row is shorter. This is needed because parity
438 * generation (for Q and R) needs to know the entire width,
439 * because it treats the short row as though it was
440 * full-width (and the "phantom" sectors were zero-filled).
442 * Another approach to this would be to set cols shorter
443 * (to just the number of columns that we might do i/o to)
444 * and have another mechanism to tell the parity generation
445 * about the "entire width". Reconstruction (at least
446 * vdev_raidz_reconstruct_general()) would also need to
447 * know about the "entire width".
449 rr->rr_cols = cols;
450 rr->rr_bigcols = bc;
451 rr->rr_missingdata = 0;
452 rr->rr_missingparity = 0;
453 rr->rr_firstdatacol = nparity;
454 rr->rr_abd_empty = NULL;
455 rr->rr_nempty = 0;
457 for (int c = 0; c < rr->rr_cols; c++, child_id++) {
458 if (child_id >= row_phys_cols) {
459 child_id -= row_phys_cols;
460 child_offset += 1ULL << ashift;
462 rr->rr_col[c].rc_devidx = child_id;
463 rr->rr_col[c].rc_offset = child_offset;
464 rr->rr_col[c].rc_orig_data = NULL;
465 rr->rr_col[c].rc_error = 0;
466 rr->rr_col[c].rc_tried = 0;
467 rr->rr_col[c].rc_skipped = 0;
468 rr->rr_col[c].rc_need_orig_restore = B_FALSE;
470 uint64_t dc = c - rr->rr_firstdatacol;
471 if (c < rr->rr_firstdatacol) {
472 rr->rr_col[c].rc_size = 1ULL << ashift;
473 rr->rr_col[c].rc_abd =
474 abd_alloc_linear(rr->rr_col[c].rc_size,
475 B_TRUE);
476 } else if (row == rows - 1 && bc != 0 && c >= bc) {
478 * Past the end, this for parity generation.
480 rr->rr_col[c].rc_size = 0;
481 rr->rr_col[c].rc_abd = NULL;
482 } else {
484 * "data column" (col excluding parity)
485 * Add an ASCII art diagram here
487 uint64_t off;
489 if (c < bc || r == 0) {
490 off = dc * rows + row;
491 } else {
492 off = r * rows +
493 (dc - r) * (rows - 1) + row;
495 rr->rr_col[c].rc_size = 1ULL << ashift;
496 rr->rr_col[c].rc_abd = abd_get_offset_struct(
497 &rr->rr_col[c].rc_abdstruct,
498 abd, off << ashift, 1 << ashift);
501 asize += rr->rr_col[c].rc_size;
504 * If all data stored spans all columns, there's a danger that
505 * parity will always be on the same device and, since parity
506 * isn't read during normal operation, that that device's I/O
507 * bandwidth won't be used effectively. We therefore switch
508 * the parity every 1MB.
510 * ...at least that was, ostensibly, the theory. As a practical
511 * matter unless we juggle the parity between all devices
512 * evenly, we won't see any benefit. Further, occasional writes
513 * that aren't a multiple of the LCM of the number of children
514 * and the minimum stripe width are sufficient to avoid pessimal
515 * behavior. Unfortunately, this decision created an implicit
516 * on-disk format requirement that we need to support for all
517 * eternity, but only for single-parity RAID-Z.
519 * If we intend to skip a sector in the zeroth column for
520 * padding we must make sure to note this swap. We will never
521 * intend to skip the first column since at least one data and
522 * one parity column must appear in each row.
524 if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 &&
525 (offset & (1ULL << 20))) {
526 ASSERT(rr->rr_cols >= 2);
527 ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
528 devidx = rr->rr_col[0].rc_devidx;
529 uint64_t o = rr->rr_col[0].rc_offset;
530 rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
531 rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
532 rr->rr_col[1].rc_devidx = devidx;
533 rr->rr_col[1].rc_offset = o;
537 ASSERT3U(asize, ==, tot << ashift);
539 /* init RAIDZ parity ops */
540 rm->rm_ops = vdev_raidz_math_get_ops();
542 return (rm);
545 static raidz_map_t *
546 init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity)
548 raidz_map_t *rm = NULL;
549 const size_t alloc_dsize = opts->rto_dsize;
550 const size_t total_ncols = opts->rto_dcols + parity;
551 const int ccols[] = { 0, 1, 2 };
553 VERIFY(zio);
554 VERIFY(parity <= 3 && parity >= 1);
556 *zio = umem_zalloc(sizeof (zio_t), UMEM_NOFAIL);
558 (*zio)->io_offset = 0;
559 (*zio)->io_size = alloc_dsize;
560 (*zio)->io_abd = raidz_alloc(alloc_dsize);
561 init_zio_abd(*zio);
563 if (opts->rto_expand) {
564 rm = vdev_raidz_map_alloc_expanded((*zio)->io_abd,
565 (*zio)->io_size, (*zio)->io_offset,
566 opts->rto_ashift, total_ncols+1, total_ncols,
567 parity, opts->rto_expand_offset);
568 } else {
569 rm = vdev_raidz_map_alloc(*zio, opts->rto_ashift,
570 total_ncols, parity);
572 VERIFY(rm);
574 /* Make sure code columns are destroyed */
575 corrupt_colums(rm, ccols, parity);
577 return (rm);
580 static int
581 run_gen_check(raidz_test_opts_t *opts)
583 char **impl_name;
584 int fn, err = 0;
585 zio_t *zio_test;
586 raidz_map_t *rm_test;
588 err = init_raidz_golden_map(opts, PARITY_PQR);
589 if (0 != err)
590 return (err);
592 LOG(D_INFO, DBLSEP);
593 LOG(D_INFO, "Testing parity generation...\n");
595 for (impl_name = (char **)raidz_impl_names+1; *impl_name != NULL;
596 impl_name++) {
598 LOG(D_INFO, SEP);
599 LOG(D_INFO, "\tTesting [%s] implementation...", *impl_name);
601 if (0 != vdev_raidz_impl_set(*impl_name)) {
602 LOG(D_INFO, "[SKIP]\n");
603 continue;
604 } else {
605 LOG(D_INFO, "[SUPPORTED]\n");
608 for (fn = 0; fn < RAIDZ_GEN_NUM; fn++) {
610 /* Check if should stop */
611 if (rto_opts.rto_should_stop)
612 return (err);
614 /* create suitable raidz_map */
615 rm_test = init_raidz_map(opts, &zio_test, fn+1);
616 VERIFY(rm_test);
618 LOG(D_INFO, "\t\tTesting method [%s] ...",
619 raidz_gen_name[fn]);
621 if (!opts->rto_sanity)
622 vdev_raidz_generate_parity(rm_test);
624 if (cmp_code(opts, rm_test, fn+1) != 0) {
625 LOG(D_INFO, "[FAIL]\n");
626 err++;
627 } else
628 LOG(D_INFO, "[PASS]\n");
630 fini_raidz_map(&zio_test, &rm_test);
634 fini_raidz_map(&opts->zio_golden, &opts->rm_golden);
636 return (err);
639 static int
640 run_rec_check_impl(raidz_test_opts_t *opts, raidz_map_t *rm, const int fn)
642 int x0, x1, x2;
643 int tgtidx[3];
644 int err = 0;
645 static const int rec_tgts[7][3] = {
646 {1, 2, 3}, /* rec_p: bad QR & D[0] */
647 {0, 2, 3}, /* rec_q: bad PR & D[0] */
648 {0, 1, 3}, /* rec_r: bad PQ & D[0] */
649 {2, 3, 4}, /* rec_pq: bad R & D[0][1] */
650 {1, 3, 4}, /* rec_pr: bad Q & D[0][1] */
651 {0, 3, 4}, /* rec_qr: bad P & D[0][1] */
652 {3, 4, 5} /* rec_pqr: bad & D[0][1][2] */
655 memcpy(tgtidx, rec_tgts[fn], sizeof (tgtidx));
657 if (fn < RAIDZ_REC_PQ) {
658 /* can reconstruct 1 failed data disk */
659 for (x0 = 0; x0 < opts->rto_dcols; x0++) {
660 if (x0 >= rm->rm_row[0]->rr_cols - raidz_parity(rm))
661 continue;
663 /* Check if should stop */
664 if (rto_opts.rto_should_stop)
665 return (err);
667 LOG(D_DEBUG, "[%d] ", x0);
669 tgtidx[2] = x0 + raidz_parity(rm);
671 corrupt_colums(rm, tgtidx+2, 1);
673 if (!opts->rto_sanity)
674 vdev_raidz_reconstruct(rm, tgtidx, 3);
676 if (cmp_data(opts, rm) != 0) {
677 err++;
678 LOG(D_DEBUG, "\nREC D[%d]... [FAIL]\n", x0);
682 } else if (fn < RAIDZ_REC_PQR) {
683 /* can reconstruct 2 failed data disk */
684 for (x0 = 0; x0 < opts->rto_dcols; x0++) {
685 if (x0 >= rm->rm_row[0]->rr_cols - raidz_parity(rm))
686 continue;
687 for (x1 = x0 + 1; x1 < opts->rto_dcols; x1++) {
688 if (x1 >= rm->rm_row[0]->rr_cols -
689 raidz_parity(rm))
690 continue;
692 /* Check if should stop */
693 if (rto_opts.rto_should_stop)
694 return (err);
696 LOG(D_DEBUG, "[%d %d] ", x0, x1);
698 tgtidx[1] = x0 + raidz_parity(rm);
699 tgtidx[2] = x1 + raidz_parity(rm);
701 corrupt_colums(rm, tgtidx+1, 2);
703 if (!opts->rto_sanity)
704 vdev_raidz_reconstruct(rm, tgtidx, 3);
706 if (cmp_data(opts, rm) != 0) {
707 err++;
708 LOG(D_DEBUG, "\nREC D[%d %d]... "
709 "[FAIL]\n", x0, x1);
713 } else {
714 /* can reconstruct 3 failed data disk */
715 for (x0 = 0; x0 < opts->rto_dcols; x0++) {
716 if (x0 >= rm->rm_row[0]->rr_cols - raidz_parity(rm))
717 continue;
718 for (x1 = x0 + 1; x1 < opts->rto_dcols; x1++) {
719 if (x1 >= rm->rm_row[0]->rr_cols -
720 raidz_parity(rm))
721 continue;
722 for (x2 = x1 + 1; x2 < opts->rto_dcols; x2++) {
723 if (x2 >= rm->rm_row[0]->rr_cols -
724 raidz_parity(rm))
725 continue;
727 /* Check if should stop */
728 if (rto_opts.rto_should_stop)
729 return (err);
731 LOG(D_DEBUG, "[%d %d %d]", x0, x1, x2);
733 tgtidx[0] = x0 + raidz_parity(rm);
734 tgtidx[1] = x1 + raidz_parity(rm);
735 tgtidx[2] = x2 + raidz_parity(rm);
737 corrupt_colums(rm, tgtidx, 3);
739 if (!opts->rto_sanity)
740 vdev_raidz_reconstruct(rm,
741 tgtidx, 3);
743 if (cmp_data(opts, rm) != 0) {
744 err++;
745 LOG(D_DEBUG,
746 "\nREC D[%d %d %d]... "
747 "[FAIL]\n", x0, x1, x2);
753 return (err);
756 static int
757 run_rec_check(raidz_test_opts_t *opts)
759 char **impl_name;
760 unsigned fn, err = 0;
761 zio_t *zio_test;
762 raidz_map_t *rm_test;
764 err = init_raidz_golden_map(opts, PARITY_PQR);
765 if (0 != err)
766 return (err);
768 LOG(D_INFO, DBLSEP);
769 LOG(D_INFO, "Testing data reconstruction...\n");
771 for (impl_name = (char **)raidz_impl_names+1; *impl_name != NULL;
772 impl_name++) {
774 LOG(D_INFO, SEP);
775 LOG(D_INFO, "\tTesting [%s] implementation...", *impl_name);
777 if (vdev_raidz_impl_set(*impl_name) != 0) {
778 LOG(D_INFO, "[SKIP]\n");
779 continue;
780 } else
781 LOG(D_INFO, "[SUPPORTED]\n");
784 /* create suitable raidz_map */
785 rm_test = init_raidz_map(opts, &zio_test, PARITY_PQR);
786 /* generate parity */
787 vdev_raidz_generate_parity(rm_test);
789 for (fn = 0; fn < RAIDZ_REC_NUM; fn++) {
791 LOG(D_INFO, "\t\tTesting method [%s] ...",
792 raidz_rec_name[fn]);
794 if (run_rec_check_impl(opts, rm_test, fn) != 0) {
795 LOG(D_INFO, "[FAIL]\n");
796 err++;
798 } else
799 LOG(D_INFO, "[PASS]\n");
802 /* tear down test raidz_map */
803 fini_raidz_map(&zio_test, &rm_test);
806 fini_raidz_map(&opts->zio_golden, &opts->rm_golden);
808 return (err);
811 static int
812 run_test(raidz_test_opts_t *opts)
814 int err = 0;
816 if (opts == NULL)
817 opts = &rto_opts;
819 print_opts(opts, B_FALSE);
821 err |= run_gen_check(opts);
822 err |= run_rec_check(opts);
824 return (err);
827 #define SWEEP_RUNNING 0
828 #define SWEEP_FINISHED 1
829 #define SWEEP_ERROR 2
830 #define SWEEP_TIMEOUT 3
832 static int sweep_state = 0;
833 static raidz_test_opts_t failed_opts;
835 static kmutex_t sem_mtx;
836 static kcondvar_t sem_cv;
837 static int max_free_slots;
838 static int free_slots;
840 static __attribute__((noreturn)) void
841 sweep_thread(void *arg)
843 int err = 0;
844 raidz_test_opts_t *opts = (raidz_test_opts_t *)arg;
845 VERIFY(opts != NULL);
847 err = run_test(opts);
849 if (rto_opts.rto_sanity) {
850 /* 25% chance that a sweep test fails */
851 if (rand() < (RAND_MAX/4))
852 err = 1;
855 if (0 != err) {
856 mutex_enter(&sem_mtx);
857 memcpy(&failed_opts, opts, sizeof (raidz_test_opts_t));
858 sweep_state = SWEEP_ERROR;
859 mutex_exit(&sem_mtx);
862 umem_free(opts, sizeof (raidz_test_opts_t));
864 /* signal the next thread */
865 mutex_enter(&sem_mtx);
866 free_slots++;
867 cv_signal(&sem_cv);
868 mutex_exit(&sem_mtx);
870 thread_exit();
873 static int
874 run_sweep(void)
876 static const size_t dcols_v[] = { 1, 2, 3, 4, 5, 6, 7, 8, 12, 15, 16 };
877 static const size_t ashift_v[] = { 9, 12, 14 };
878 static const size_t size_v[] = { 1 << 9, 21 * (1 << 9), 13 * (1 << 12),
879 1 << 17, (1 << 20) - (1 << 12), SPA_MAXBLOCKSIZE };
881 (void) setvbuf(stdout, NULL, _IONBF, 0);
883 ulong_t total_comb = ARRAY_SIZE(size_v) * ARRAY_SIZE(ashift_v) *
884 ARRAY_SIZE(dcols_v);
885 ulong_t tried_comb = 0;
886 hrtime_t time_diff, start_time = gethrtime();
887 raidz_test_opts_t *opts;
888 int a, d, s;
890 max_free_slots = free_slots = MAX(2, boot_ncpus);
892 mutex_init(&sem_mtx, NULL, MUTEX_DEFAULT, NULL);
893 cv_init(&sem_cv, NULL, CV_DEFAULT, NULL);
895 for (s = 0; s < ARRAY_SIZE(size_v); s++)
896 for (a = 0; a < ARRAY_SIZE(ashift_v); a++)
897 for (d = 0; d < ARRAY_SIZE(dcols_v); d++) {
899 if (size_v[s] < (1 << ashift_v[a])) {
900 total_comb--;
901 continue;
904 if (++tried_comb % 20 == 0)
905 LOG(D_ALL, "%lu/%lu... ", tried_comb, total_comb);
907 /* wait for signal to start new thread */
908 mutex_enter(&sem_mtx);
909 while (cv_timedwait_sig(&sem_cv, &sem_mtx,
910 ddi_get_lbolt() + hz)) {
912 /* check if should stop the test (timeout) */
913 time_diff = (gethrtime() - start_time) / NANOSEC;
914 if (rto_opts.rto_sweep_timeout > 0 &&
915 time_diff >= rto_opts.rto_sweep_timeout) {
916 sweep_state = SWEEP_TIMEOUT;
917 rto_opts.rto_should_stop = B_TRUE;
918 mutex_exit(&sem_mtx);
919 goto exit;
922 /* check if should stop the test (error) */
923 if (sweep_state != SWEEP_RUNNING) {
924 mutex_exit(&sem_mtx);
925 goto exit;
928 /* exit loop if a slot is available */
929 if (free_slots > 0) {
930 break;
934 free_slots--;
935 mutex_exit(&sem_mtx);
937 opts = umem_zalloc(sizeof (raidz_test_opts_t), UMEM_NOFAIL);
938 opts->rto_ashift = ashift_v[a];
939 opts->rto_dcols = dcols_v[d];
940 opts->rto_offset = (1ULL << ashift_v[a]) * rand();
941 opts->rto_dsize = size_v[s];
942 opts->rto_expand = rto_opts.rto_expand;
943 opts->rto_expand_offset = rto_opts.rto_expand_offset;
944 opts->rto_v = 0; /* be quiet */
946 VERIFY3P(thread_create(NULL, 0, sweep_thread, (void *) opts,
947 0, NULL, TS_RUN, defclsyspri), !=, NULL);
950 exit:
951 LOG(D_ALL, "\nWaiting for test threads to finish...\n");
952 mutex_enter(&sem_mtx);
953 VERIFY(free_slots <= max_free_slots);
954 while (free_slots < max_free_slots) {
955 (void) cv_wait(&sem_cv, &sem_mtx);
957 mutex_exit(&sem_mtx);
959 if (sweep_state == SWEEP_ERROR) {
960 ERR("Sweep test failed! Failed option: \n");
961 print_opts(&failed_opts, B_TRUE);
962 } else {
963 if (sweep_state == SWEEP_TIMEOUT)
964 LOG(D_ALL, "Test timeout (%lus). Stopping...\n",
965 (ulong_t)rto_opts.rto_sweep_timeout);
967 LOG(D_ALL, "Sweep test succeeded on %lu raidz maps!\n",
968 (ulong_t)tried_comb);
971 mutex_destroy(&sem_mtx);
973 return (sweep_state == SWEEP_ERROR ? SWEEP_ERROR : 0);
978 main(int argc, char **argv)
980 size_t i;
981 struct sigaction action;
982 int err = 0;
984 /* init gdb pid string early */
985 (void) sprintf(pid_s, "%d", getpid());
987 action.sa_handler = sig_handler;
988 sigemptyset(&action.sa_mask);
989 action.sa_flags = 0;
991 if (sigaction(SIGSEGV, &action, NULL) < 0) {
992 ERR("raidz_test: cannot catch SIGSEGV: %s.\n", strerror(errno));
993 exit(EXIT_FAILURE);
996 (void) setvbuf(stdout, NULL, _IOLBF, 0);
998 dprintf_setup(&argc, argv);
1000 process_options(argc, argv);
1002 kernel_init(SPA_MODE_READ);
1004 /* setup random data because rand() is not reentrant */
1005 rand_data = (int *)umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
1006 srand((unsigned)time(NULL) * getpid());
1007 for (i = 0; i < SPA_MAXBLOCKSIZE / sizeof (int); i++)
1008 rand_data[i] = rand();
1010 mprotect(rand_data, SPA_MAXBLOCKSIZE, PROT_READ);
1012 if (rto_opts.rto_benchmark) {
1013 run_raidz_benchmark();
1014 } else if (rto_opts.rto_sweep) {
1015 err = run_sweep();
1016 } else {
1017 err = run_test(NULL);
1020 umem_free(rand_data, SPA_MAXBLOCKSIZE);
1021 kernel_fini();
1023 return (err);