4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
26 #include <sys/zfs_context.h>
27 #include <sys/types.h>
29 #include <sys/debug.h>
30 #include <sys/zfs_debug.h>
31 #include <sys/vdev_raidz.h>
32 #include <sys/vdev_raidz_impl.h>
34 /* Opaque implementation with NULL methods to represent original methods */
35 static const raidz_impl_ops_t vdev_raidz_original_impl
= {
37 .is_supported
= raidz_will_scalar_work
,
40 /* RAIDZ parity op that contain the fastest methods */
41 static raidz_impl_ops_t vdev_raidz_fastest_impl
= {
45 /* All compiled in implementations */
46 static const raidz_impl_ops_t
*const raidz_all_maths
[] = {
47 &vdev_raidz_original_impl
,
48 &vdev_raidz_scalar_impl
,
49 #if defined(__x86_64) && defined(HAVE_SSE2) /* only x86_64 for now */
50 &vdev_raidz_sse2_impl
,
52 #if defined(__x86_64) && defined(HAVE_SSSE3) /* only x86_64 for now */
53 &vdev_raidz_ssse3_impl
,
55 #if defined(__x86_64) && defined(HAVE_AVX2) /* only x86_64 for now */
56 &vdev_raidz_avx2_impl
,
58 #if defined(__x86_64) && defined(HAVE_AVX512F) /* only x86_64 for now */
59 &vdev_raidz_avx512f_impl
,
61 #if defined(__x86_64) && defined(HAVE_AVX512BW) /* only x86_64 for now */
62 &vdev_raidz_avx512bw_impl
,
64 #if defined(__aarch64__) && !defined(__FreeBSD__)
65 &vdev_raidz_aarch64_neon_impl
,
66 &vdev_raidz_aarch64_neonx2_impl
,
68 #if defined(__powerpc__) && defined(__altivec__)
69 &vdev_raidz_powerpc_altivec_impl
,
73 /* Indicate that benchmark has been completed */
74 static boolean_t raidz_math_initialized
= B_FALSE
;
76 /* Select raidz implementation */
77 #define IMPL_FASTEST (UINT32_MAX)
78 #define IMPL_CYCLE (UINT32_MAX - 1)
79 #define IMPL_ORIGINAL (0)
80 #define IMPL_SCALAR (1)
82 #define RAIDZ_IMPL_READ(i) (*(volatile uint32_t *) &(i))
84 static uint32_t zfs_vdev_raidz_impl
= IMPL_SCALAR
;
85 static uint32_t user_sel_impl
= IMPL_FASTEST
;
87 /* Hold all supported implementations */
88 static size_t raidz_supp_impl_cnt
= 0;
89 static raidz_impl_ops_t
*raidz_supp_impl
[ARRAY_SIZE(raidz_all_maths
)];
93 * kstats values for supported implementations
94 * Values represent per disk throughput of 8 disk+parity raidz vdev [B/s]
96 static raidz_impl_kstat_t raidz_impl_kstats
[ARRAY_SIZE(raidz_all_maths
) + 1];
98 /* kstat for benchmarked implementations */
99 static kstat_t
*raidz_math_kstat
= NULL
;
103 * Returns the RAIDZ operations for raidz_map() parity calculations. When
104 * a SIMD implementation is not allowed in the current context, then fallback
105 * to the fastest generic implementation.
107 const raidz_impl_ops_t
*
108 vdev_raidz_math_get_ops(void)
111 return (&vdev_raidz_scalar_impl
);
113 raidz_impl_ops_t
*ops
= NULL
;
114 const uint32_t impl
= RAIDZ_IMPL_READ(zfs_vdev_raidz_impl
);
118 ASSERT(raidz_math_initialized
);
119 ops
= &vdev_raidz_fastest_impl
;
122 /* Cycle through all supported implementations */
123 ASSERT(raidz_math_initialized
);
124 ASSERT3U(raidz_supp_impl_cnt
, >, 0);
125 static size_t cycle_impl_idx
= 0;
126 size_t idx
= (++cycle_impl_idx
) % raidz_supp_impl_cnt
;
127 ops
= raidz_supp_impl
[idx
];
130 ops
= (raidz_impl_ops_t
*)&vdev_raidz_original_impl
;
133 ops
= (raidz_impl_ops_t
*)&vdev_raidz_scalar_impl
;
136 ASSERT3U(impl
, <, raidz_supp_impl_cnt
);
137 ASSERT3U(raidz_supp_impl_cnt
, >, 0);
138 if (impl
< ARRAY_SIZE(raidz_all_maths
))
139 ops
= raidz_supp_impl
[impl
];
143 ASSERT3P(ops
, !=, NULL
);
149 * Select parity generation method for raidz_map
152 vdev_raidz_math_generate(raidz_map_t
*rm
, raidz_row_t
*rr
)
154 raidz_gen_f gen_parity
= NULL
;
156 switch (raidz_parity(rm
)) {
158 gen_parity
= rm
->rm_ops
->gen
[RAIDZ_GEN_P
];
161 gen_parity
= rm
->rm_ops
->gen
[RAIDZ_GEN_PQ
];
164 gen_parity
= rm
->rm_ops
->gen
[RAIDZ_GEN_PQR
];
168 cmn_err(CE_PANIC
, "invalid RAID-Z configuration %llu",
169 (u_longlong_t
)raidz_parity(rm
));
173 /* if method is NULL execute the original implementation */
174 if (gen_parity
== NULL
)
175 return (RAIDZ_ORIGINAL_IMPL
);
183 reconstruct_fun_p_sel(raidz_map_t
*rm
, const int *parity_valid
,
186 if (nbaddata
== 1 && parity_valid
[CODE_P
]) {
187 return (rm
->rm_ops
->rec
[RAIDZ_REC_P
]);
189 return ((raidz_rec_f
) NULL
);
193 reconstruct_fun_pq_sel(raidz_map_t
*rm
, const int *parity_valid
,
197 if (parity_valid
[CODE_P
]) {
198 return (rm
->rm_ops
->rec
[RAIDZ_REC_P
]);
199 } else if (parity_valid
[CODE_Q
]) {
200 return (rm
->rm_ops
->rec
[RAIDZ_REC_Q
]);
202 } else if (nbaddata
== 2 &&
203 parity_valid
[CODE_P
] && parity_valid
[CODE_Q
]) {
204 return (rm
->rm_ops
->rec
[RAIDZ_REC_PQ
]);
206 return ((raidz_rec_f
) NULL
);
210 reconstruct_fun_pqr_sel(raidz_map_t
*rm
, const int *parity_valid
,
214 if (parity_valid
[CODE_P
]) {
215 return (rm
->rm_ops
->rec
[RAIDZ_REC_P
]);
216 } else if (parity_valid
[CODE_Q
]) {
217 return (rm
->rm_ops
->rec
[RAIDZ_REC_Q
]);
218 } else if (parity_valid
[CODE_R
]) {
219 return (rm
->rm_ops
->rec
[RAIDZ_REC_R
]);
221 } else if (nbaddata
== 2) {
222 if (parity_valid
[CODE_P
] && parity_valid
[CODE_Q
]) {
223 return (rm
->rm_ops
->rec
[RAIDZ_REC_PQ
]);
224 } else if (parity_valid
[CODE_P
] && parity_valid
[CODE_R
]) {
225 return (rm
->rm_ops
->rec
[RAIDZ_REC_PR
]);
226 } else if (parity_valid
[CODE_Q
] && parity_valid
[CODE_R
]) {
227 return (rm
->rm_ops
->rec
[RAIDZ_REC_QR
]);
229 } else if (nbaddata
== 3 &&
230 parity_valid
[CODE_P
] && parity_valid
[CODE_Q
] &&
231 parity_valid
[CODE_R
]) {
232 return (rm
->rm_ops
->rec
[RAIDZ_REC_PQR
]);
234 return ((raidz_rec_f
) NULL
);
238 * Select data reconstruction method for raidz_map
239 * @parity_valid - Parity validity flag
240 * @dt - Failed data index array
241 * @nbaddata - Number of failed data columns
244 vdev_raidz_math_reconstruct(raidz_map_t
*rm
, raidz_row_t
*rr
,
245 const int *parity_valid
, const int *dt
, const int nbaddata
)
247 raidz_rec_f rec_fn
= NULL
;
249 switch (raidz_parity(rm
)) {
251 rec_fn
= reconstruct_fun_p_sel(rm
, parity_valid
, nbaddata
);
254 rec_fn
= reconstruct_fun_pq_sel(rm
, parity_valid
, nbaddata
);
257 rec_fn
= reconstruct_fun_pqr_sel(rm
, parity_valid
, nbaddata
);
260 cmn_err(CE_PANIC
, "invalid RAID-Z configuration %llu",
261 (u_longlong_t
)raidz_parity(rm
));
266 return (RAIDZ_ORIGINAL_IMPL
);
268 return (rec_fn(rr
, dt
));
/* Column labels for the parity generation methods, in method-index order */
const char *const raidz_gen_name[] = {
	"gen_p", "gen_pq", "gen_pqr"
};

/* Column labels for the data reconstruction methods, in method-index order */
const char *const raidz_rec_name[] = {
	"rec_p", "rec_q", "rec_r",
	"rec_pq", "rec_pr", "rec_qr", "rec_pqr"
};
281 #define RAIDZ_KSTAT_LINE_LEN (17 + 10*12 + 1)
284 raidz_math_kstat_headers(char *buf
, size_t size
)
286 ASSERT3U(size
, >=, RAIDZ_KSTAT_LINE_LEN
);
288 ssize_t off
= kmem_scnprintf(buf
, size
, "%-17s", "implementation");
290 for (int i
= 0; i
< ARRAY_SIZE(raidz_gen_name
); i
++)
291 off
+= kmem_scnprintf(buf
+ off
, size
- off
, "%-16s",
294 for (int i
= 0; i
< ARRAY_SIZE(raidz_rec_name
); i
++)
295 off
+= kmem_scnprintf(buf
+ off
, size
- off
, "%-16s",
298 (void) kmem_scnprintf(buf
+ off
, size
- off
, "\n");
304 raidz_math_kstat_data(char *buf
, size_t size
, void *data
)
306 raidz_impl_kstat_t
*fstat
= &raidz_impl_kstats
[raidz_supp_impl_cnt
];
307 raidz_impl_kstat_t
*cstat
= (raidz_impl_kstat_t
*)data
;
311 ASSERT3U(size
, >=, RAIDZ_KSTAT_LINE_LEN
);
313 if (cstat
== fstat
) {
314 off
+= kmem_scnprintf(buf
+ off
, size
- off
, "%-17s",
317 for (i
= 0; i
< ARRAY_SIZE(raidz_gen_name
); i
++) {
318 int id
= fstat
->gen
[i
];
319 off
+= kmem_scnprintf(buf
+ off
, size
- off
, "%-16s",
320 raidz_supp_impl
[id
]->name
);
322 for (i
= 0; i
< ARRAY_SIZE(raidz_rec_name
); i
++) {
323 int id
= fstat
->rec
[i
];
324 off
+= kmem_scnprintf(buf
+ off
, size
- off
, "%-16s",
325 raidz_supp_impl
[id
]->name
);
328 ptrdiff_t id
= cstat
- raidz_impl_kstats
;
330 off
+= kmem_scnprintf(buf
+ off
, size
- off
, "%-17s",
331 raidz_supp_impl
[id
]->name
);
333 for (i
= 0; i
< ARRAY_SIZE(raidz_gen_name
); i
++)
334 off
+= kmem_scnprintf(buf
+ off
, size
- off
, "%-16llu",
335 (u_longlong_t
)cstat
->gen
[i
]);
337 for (i
= 0; i
< ARRAY_SIZE(raidz_rec_name
); i
++)
338 off
+= kmem_scnprintf(buf
+ off
, size
- off
, "%-16llu",
339 (u_longlong_t
)cstat
->rec
[i
]);
342 (void) kmem_scnprintf(buf
+ off
, size
- off
, "\n");
348 raidz_math_kstat_addr(kstat_t
*ksp
, loff_t n
)
350 if (n
<= raidz_supp_impl_cnt
)
351 ksp
->ks_private
= (void *) (raidz_impl_kstats
+ n
);
353 ksp
->ks_private
= NULL
;
355 return (ksp
->ks_private
);
358 #define BENCH_D_COLS (8ULL)
359 #define BENCH_COLS (BENCH_D_COLS + PARITY_PQR)
360 #define BENCH_ZIO_SIZE (1ULL << SPA_OLD_MAXBLOCKSHIFT) /* 128 kiB */
361 #define BENCH_NS MSEC2NSEC(1) /* 1ms */
363 typedef void (*benchmark_fn
)(raidz_map_t
*rm
, const int fn
);
366 benchmark_gen_impl(raidz_map_t
*rm
, const int fn
)
369 vdev_raidz_generate_parity(rm
);
373 benchmark_rec_impl(raidz_map_t
*rm
, const int fn
)
375 static const int rec_tgt
[7][3] = {
376 {1, 2, 3}, /* rec_p: bad QR & D[0] */
377 {0, 2, 3}, /* rec_q: bad PR & D[0] */
378 {0, 1, 3}, /* rec_r: bad PQ & D[0] */
379 {2, 3, 4}, /* rec_pq: bad R & D[0][1] */
380 {1, 3, 4}, /* rec_pr: bad Q & D[0][1] */
381 {0, 3, 4}, /* rec_qr: bad P & D[0][1] */
382 {3, 4, 5} /* rec_pqr: bad & D[0][1][2] */
385 vdev_raidz_reconstruct(rm
, rec_tgt
[fn
], 3);
389 * Benchmarking of all supported implementations (raidz_supp_impl_cnt)
390 * is performed by setting the rm_ops pointer and calling the top level
391 * generate/reconstruct methods of bench_rm.
394 benchmark_raidz_impl(raidz_map_t
*bench_rm
, const int fn
, benchmark_fn bench_fn
)
396 uint64_t run_cnt
, speed
, best_speed
= 0;
397 hrtime_t t_start
, t_diff
;
398 raidz_impl_ops_t
*curr_impl
;
399 raidz_impl_kstat_t
*fstat
= &raidz_impl_kstats
[raidz_supp_impl_cnt
];
402 for (impl
= 0; impl
< raidz_supp_impl_cnt
; impl
++) {
403 /* set an implementation to benchmark */
404 curr_impl
= raidz_supp_impl
[impl
];
405 bench_rm
->rm_ops
= curr_impl
;
408 t_start
= gethrtime();
411 for (i
= 0; i
< 5; i
++, run_cnt
++)
412 bench_fn(bench_rm
, fn
);
414 t_diff
= gethrtime() - t_start
;
415 } while (t_diff
< BENCH_NS
);
417 speed
= run_cnt
* BENCH_ZIO_SIZE
* NANOSEC
;
418 speed
/= (t_diff
* BENCH_COLS
);
420 if (bench_fn
== benchmark_gen_impl
)
421 raidz_impl_kstats
[impl
].gen
[fn
] = speed
;
423 raidz_impl_kstats
[impl
].rec
[fn
] = speed
;
425 /* Update fastest implementation method */
426 if (speed
> best_speed
) {
429 if (bench_fn
== benchmark_gen_impl
) {
430 fstat
->gen
[fn
] = impl
;
431 vdev_raidz_fastest_impl
.gen
[fn
] =
434 fstat
->rec
[fn
] = impl
;
435 vdev_raidz_fastest_impl
.rec
[fn
] =
444 * Initialize and benchmark all supported implementations.
447 benchmark_raidz(void)
449 raidz_impl_ops_t
*curr_impl
;
452 /* Move supported impl into raidz_supp_impl */
453 for (i
= 0, c
= 0; i
< ARRAY_SIZE(raidz_all_maths
); i
++) {
454 curr_impl
= (raidz_impl_ops_t
*)raidz_all_maths
[i
];
459 if (curr_impl
->is_supported())
460 raidz_supp_impl
[c
++] = (raidz_impl_ops_t
*)curr_impl
;
462 membar_producer(); /* complete raidz_supp_impl[] init */
463 raidz_supp_impl_cnt
= c
; /* number of supported impl */
467 zio_t
*bench_zio
= NULL
;
468 raidz_map_t
*bench_rm
= NULL
;
469 uint64_t bench_parity
;
471 /* Fake a zio and run the benchmark on a warmed up buffer */
472 bench_zio
= kmem_zalloc(sizeof (zio_t
), KM_SLEEP
);
473 bench_zio
->io_offset
= 0;
474 bench_zio
->io_size
= BENCH_ZIO_SIZE
; /* only data columns */
475 bench_zio
->io_abd
= abd_alloc_linear(BENCH_ZIO_SIZE
, B_TRUE
);
476 memset(abd_to_buf(bench_zio
->io_abd
), 0xAA, BENCH_ZIO_SIZE
);
478 /* Benchmark parity generation methods */
479 for (int fn
= 0; fn
< RAIDZ_GEN_NUM
; fn
++) {
480 bench_parity
= fn
+ 1;
481 /* New raidz_map is needed for each generate_p/q/r */
482 bench_rm
= vdev_raidz_map_alloc(bench_zio
, SPA_MINBLOCKSHIFT
,
483 BENCH_D_COLS
+ bench_parity
, bench_parity
);
485 benchmark_raidz_impl(bench_rm
, fn
, benchmark_gen_impl
);
487 vdev_raidz_map_free(bench_rm
);
490 /* Benchmark data reconstruction methods */
491 bench_rm
= vdev_raidz_map_alloc(bench_zio
, SPA_MINBLOCKSHIFT
,
492 BENCH_COLS
, PARITY_PQR
);
494 /* Ensure that fake parity blocks are initialized */
495 for (c
= 0; c
< bench_rm
->rm_row
[0]->rr_firstdatacol
; c
++) {
496 pabd
= bench_rm
->rm_row
[0]->rr_col
[c
].rc_abd
;
497 memset(abd_to_buf(pabd
), 0xAA, abd_get_size(pabd
));
500 for (int fn
= 0; fn
< RAIDZ_REC_NUM
; fn
++)
501 benchmark_raidz_impl(bench_rm
, fn
, benchmark_rec_impl
);
503 vdev_raidz_map_free(bench_rm
);
505 /* cleanup the bench zio */
506 abd_free(bench_zio
->io_abd
);
507 kmem_free(bench_zio
, sizeof (zio_t
));
510 * Skip the benchmark in user space to avoid impacting libzpool
511 * consumers (zdb, zhack, zinject, ztest). The last implementation
512 * is assumed to be the fastest and used by default.
514 memcpy(&vdev_raidz_fastest_impl
,
515 raidz_supp_impl
[raidz_supp_impl_cnt
- 1],
516 sizeof (vdev_raidz_fastest_impl
));
517 strcpy(vdev_raidz_fastest_impl
.name
, "fastest");
522 vdev_raidz_math_init(void)
524 /* Determine the fastest available implementation. */
528 /* Install kstats for all implementations */
529 raidz_math_kstat
= kstat_create("zfs", 0, "vdev_raidz_bench", "misc",
530 KSTAT_TYPE_RAW
, 0, KSTAT_FLAG_VIRTUAL
);
531 if (raidz_math_kstat
!= NULL
) {
532 raidz_math_kstat
->ks_data
= NULL
;
533 raidz_math_kstat
->ks_ndata
= UINT32_MAX
;
534 kstat_set_raw_ops(raidz_math_kstat
,
535 raidz_math_kstat_headers
,
536 raidz_math_kstat_data
,
537 raidz_math_kstat_addr
);
538 kstat_install(raidz_math_kstat
);
542 /* Finish initialization */
543 atomic_swap_32(&zfs_vdev_raidz_impl
, user_sel_impl
);
544 raidz_math_initialized
= B_TRUE
;
548 vdev_raidz_math_fini(void)
550 raidz_impl_ops_t
const *curr_impl
;
553 if (raidz_math_kstat
!= NULL
) {
554 kstat_delete(raidz_math_kstat
);
555 raidz_math_kstat
= NULL
;
559 for (int i
= 0; i
< ARRAY_SIZE(raidz_all_maths
); i
++) {
560 curr_impl
= raidz_all_maths
[i
];
566 static const struct {
569 } math_impl_opts
[] = {
570 { "cycle", IMPL_CYCLE
},
571 { "fastest", IMPL_FASTEST
},
572 { "original", IMPL_ORIGINAL
},
573 { "scalar", IMPL_SCALAR
}
577 * Function sets desired raidz implementation.
579 * If we are called before init(), user preference will be saved in
580 * user_sel_impl, and applied in later init() call. This occurs when module
581 * parameter is specified on module load. Otherwise, directly update
582 * zfs_vdev_raidz_impl.
584 * @val Name of raidz implementation to use
588 vdev_raidz_impl_set(const char *val
)
591 char req_name
[RAIDZ_IMPL_NAME_MAX
];
592 uint32_t impl
= RAIDZ_IMPL_READ(user_sel_impl
);
596 i
= strnlen(val
, RAIDZ_IMPL_NAME_MAX
);
597 if (i
== 0 || i
== RAIDZ_IMPL_NAME_MAX
)
600 strlcpy(req_name
, val
, RAIDZ_IMPL_NAME_MAX
);
601 while (i
> 0 && !!isspace(req_name
[i
-1]))
605 /* Check mandatory options */
606 for (i
= 0; i
< ARRAY_SIZE(math_impl_opts
); i
++) {
607 if (strcmp(req_name
, math_impl_opts
[i
].name
) == 0) {
608 impl
= math_impl_opts
[i
].sel
;
614 /* check all supported impl if init() was already called */
615 if (err
!= 0 && raidz_math_initialized
) {
616 /* check all supported implementations */
617 for (i
= 0; i
< raidz_supp_impl_cnt
; i
++) {
618 if (strcmp(req_name
, raidz_supp_impl
[i
]->name
) == 0) {
627 if (raidz_math_initialized
)
628 atomic_swap_32(&zfs_vdev_raidz_impl
, impl
);
630 atomic_swap_32(&user_sel_impl
, impl
);
636 #if defined(_KERNEL) && defined(__linux__)
639 zfs_vdev_raidz_impl_set(const char *val
, zfs_kernel_param_t
*kp
)
641 return (vdev_raidz_impl_set(val
));
645 zfs_vdev_raidz_impl_get(char *buffer
, zfs_kernel_param_t
*kp
)
649 const uint32_t impl
= RAIDZ_IMPL_READ(zfs_vdev_raidz_impl
);
651 ASSERT(raidz_math_initialized
);
653 /* list mandatory options */
654 for (i
= 0; i
< ARRAY_SIZE(math_impl_opts
) - 2; i
++) {
655 fmt
= (impl
== math_impl_opts
[i
].sel
) ? "[%s] " : "%s ";
656 cnt
+= kmem_scnprintf(buffer
+ cnt
, PAGE_SIZE
- cnt
, fmt
,
657 math_impl_opts
[i
].name
);
660 /* list all supported implementations */
661 for (i
= 0; i
< raidz_supp_impl_cnt
; i
++) {
662 fmt
= (i
== impl
) ? "[%s] " : "%s ";
663 cnt
+= kmem_scnprintf(buffer
+ cnt
, PAGE_SIZE
- cnt
, fmt
,
664 raidz_supp_impl
[i
]->name
);
670 module_param_call(zfs_vdev_raidz_impl
, zfs_vdev_raidz_impl_set
,
671 zfs_vdev_raidz_impl_get
, NULL
, 0644);
672 MODULE_PARM_DESC(zfs_vdev_raidz_impl
, "Select raidz implementation.");