4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 2021-2022 Tino Reichardt <milky-zfs@mcmilk.de>
26 #include <sys/zio_checksum.h>
27 #include <sys/zfs_context.h>
28 #include <sys/zfs_chksum.h>
29 #include <sys/zfs_impl.h>
31 #include <sys/blake3.h>
34 /* limit benchmarking to max 256KiB, when EdonR is slower than this (MiB/s): */
35 #define LIMIT_PERF_MBS 300
/*
 * Tail of the per-implementation benchmark record (referenced below as
 * chksum_stat_t): the salt that chksum_benchit() zeroes and hands to init,
 * the checksum function timed by chksum_run(), and the template init/free
 * hooks assigned in chksum_benchmark().
 * NOTE(review): the opening of the structure and its earlier members
 * (name, impl, bs1k ... bs16m are used further down) are not visible in
 * this excerpt - confirm against the full file.
 */
48 zio_cksum_salt_t salt
;
49 zio_checksum_t
*(func
);
50 zio_checksum_tmpl_init_t
*(init
);
51 zio_checksum_tmpl_free_t
*(free
);
/*
 * File-scope benchmark state: the table of result entries, the number of
 * entries in that table, and the handle returned by kstat_create().
 * chksum_kstat_addr() indexes the table, bounded by chksum_stat_cnt.
 */
54 static chksum_stat_t
*chksum_stat_data
= 0;
55 static int chksum_stat_cnt
= 0;
56 static kstat_t
*chksum_kstat
= NULL
;
59 * Sample output on i3-1005G1 System:
61 * implementation 1k 4k 16k 64k 256k 1m 4m 16m
62 * edonr-generic 1278 1625 1769 1776 1783 1778 1771 1767
63 * skein-generic 548 594 613 623 621 623 621 486
64 * sha256-generic 255 270 281 278 279 281 283 283
65 * sha256-x64 288 310 316 317 318 317 317 316
66 * sha256-ssse3 304 342 351 355 356 357 356 356
67 * sha256-avx 311 348 359 362 362 363 363 362
68 * sha256-avx2 330 378 389 395 395 395 395 395
69 * sha256-shani 908 1127 1212 1230 1233 1234 1223 1230
70 * sha512-generic 359 409 431 427 429 430 428 423
71 * sha512-x64 420 473 490 496 497 497 496 495
72 * sha512-avx 406 522 546 560 560 560 556 560
73 * sha512-avx2 464 568 601 606 609 610 607 608
74 * blake3-generic 330 327 324 323 324 320 323 322
75 * blake3-sse2 424 1366 1449 1468 1458 1453 1395 1408
76 * blake3-sse41 453 1554 1658 1703 1689 1669 1622 1630
77 * blake3-avx2 452 2013 3225 3351 3356 3261 3076 3101
78 * blake3-avx512 498 2869 5269 5926 5872 5643 5014 5005
/*
 * Write the column header line of the benchmark table into buf: a
 * left-aligned, 23 character wide "implementation" column followed by eight
 * right-aligned, 8 character wide columns (1k, 4k, 16k, 64k, 256k, 1m, 4m,
 * 16m), terminated by a newline.
 *
 * buf  - output buffer; each piece is written at buf + off
 * size - size of buf; calls after the first are limited to size - off
 *
 * NOTE(review): the return type, the declaration of off and the return
 * statement are not visible in this excerpt. The first call passes size
 * unreduced, which is only equivalent to size - off if off is still 0 at
 * that point - confirm.
 */
81 chksum_kstat_headers(char *buf
, size_t size
)
85 off
+= kmem_scnprintf(buf
+ off
, size
, "%-23s", "implementation");
86 off
+= kmem_scnprintf(buf
+ off
, size
- off
, "%8s", "1k");
87 off
+= kmem_scnprintf(buf
+ off
, size
- off
, "%8s", "4k");
88 off
+= kmem_scnprintf(buf
+ off
, size
- off
, "%8s", "16k");
89 off
+= kmem_scnprintf(buf
+ off
, size
- off
, "%8s", "64k");
90 off
+= kmem_scnprintf(buf
+ off
, size
- off
, "%8s", "256k");
91 off
+= kmem_scnprintf(buf
+ off
, size
- off
, "%8s", "1m");
92 off
+= kmem_scnprintf(buf
+ off
, size
- off
, "%8s", "4m");
/* last column ends the line; the written length is not needed any more */
93 (void) kmem_scnprintf(buf
+ off
, size
- off
, "%8s\n", "16m");
/*
 * Format one row of the benchmark table into buf for the entry passed in
 * data: a "name-impl" label padded to 23 characters, followed by the eight
 * values bs1k ... bs16m, each printed as an 8 character wide unsigned
 * number, terminated by a newline. The column widths match those used by
 * chksum_kstat_headers().
 *
 * buf  - output buffer; each piece is written at buf + off
 * size - size of buf; every call is limited to size - off
 * data - entry to print, cast to chksum_stat_t *
 *
 * NOTE(review): the return type, the declarations of cs, off and b, and
 * the return statement are not visible in this excerpt.
 */
99 chksum_kstat_data(char *buf
, size_t size
, void *data
)
105 cs
= (chksum_stat_t
*)data
;
/* build the combined label in the scratch buffer b, limited to 23 bytes */
106 kmem_scnprintf(b
, 23, "%s-%s", cs
->name
, cs
->impl
);
107 off
+= kmem_scnprintf(buf
+ off
, size
- off
, "%-23s", b
);
/* one column per block size, values as stored by chksum_run() */
108 off
+= kmem_scnprintf(buf
+ off
, size
- off
, "%8llu",
109 (u_longlong_t
)cs
->bs1k
);
110 off
+= kmem_scnprintf(buf
+ off
, size
- off
, "%8llu",
111 (u_longlong_t
)cs
->bs4k
);
112 off
+= kmem_scnprintf(buf
+ off
, size
- off
, "%8llu",
113 (u_longlong_t
)cs
->bs16k
);
114 off
+= kmem_scnprintf(buf
+ off
, size
- off
, "%8llu",
115 (u_longlong_t
)cs
->bs64k
);
116 off
+= kmem_scnprintf(buf
+ off
, size
- off
, "%8llu",
117 (u_longlong_t
)cs
->bs256k
);
118 off
+= kmem_scnprintf(buf
+ off
, size
- off
, "%8llu",
119 (u_longlong_t
)cs
->bs1m
);
120 off
+= kmem_scnprintf(buf
+ off
, size
- off
, "%8llu",
121 (u_longlong_t
)cs
->bs4m
);
/* last column ends the line; the written length is not needed any more */
122 (void) kmem_scnprintf(buf
+ off
, size
- off
, "%8llu\n",
123 (u_longlong_t
)cs
->bs16m
);
/*
 * Select the benchmark entry for row n: store its address in
 * ksp->ks_private and return that same value.
 *
 * ksp - kstat whose ks_private field is updated
 * n   - row index; rows below chksum_stat_cnt map to chksum_stat_data + n
 *
 * NOTE(review): the return type is not visible in this excerpt.
 */
129 chksum_kstat_addr(kstat_t
*ksp
, loff_t n
)
131 if (n
< chksum_stat_cnt
)
132 ksp
->ks_private
= (void *)(chksum_stat_data
+ n
);
/*
 * NOTE(review): presumably the else branch for an out-of-range n; the else
 * keyword itself is not visible in this excerpt - confirm.
 */
134 ksp
->ks_private
= NULL
;
136 return (ksp
->ks_private
);
/*
 * Time cs->func on the buffer abd and store the measured throughput, in
 * MiB/s, in *result.
 *
 * cs    - entry whose func is called
 * abd   - buffer handed to func
 * ctx   - context pointer handed through to func
 * round - NOTE(review): presumably selects one of the eight size/loops
 *         pairs below; the selecting code is not visible in this excerpt
 *
 * NOTE(review): the return type, the last parameter (result), the
 * declarations of start and zcp, the start of the timing loop and the
 * initial reading of start are not visible in this excerpt.
 */
140 chksum_run(chksum_stat_t
*cs
, abd_t
*abd
, void *ctx
, int round
,
144 uint64_t run_bw
, run_time_ns
, run_count
= 0, size
= 0;
145 uint32_t l
, loops
= 0;
/*
 * Eight size/loops pairs: 1KiB x128, 4KiB x64, 16KiB x32, 64KiB x16,
 * 256KiB x8, 1MiB x4, 4MiB x1 and 16MiB x1. Larger sizes get fewer calls
 * per batch.
 */
150 size
= 1<<10; loops
= 128; break;
152 size
= 1<<12; loops
= 64; break;
154 size
= 1<<14; loops
= 32; break;
156 size
= 1<<16; loops
= 16; break;
158 size
= 1<<18; loops
= 8; break;
160 size
= 1<<20; loops
= 4; break;
162 size
= 1<<22; loops
= 1; break;
164 size
= 1<<24; loops
= 1; break;
/* one batch: loops calls of func, each counted in run_count */
170 for (l
= 0; l
< loops
; l
++, run_count
++)
171 cs
->func(abd
, size
, ctx
, &zcp
);
/* keep running batches until at least one millisecond has elapsed */
173 run_time_ns
= gethrtime() - start
;
174 } while (run_time_ns
< MSEC2NSEC(1));
/* total bytes processed, scaled by NANOSEC and divided by elapsed ns */
177 run_bw
= size
* run_count
* NANOSEC
;
178 run_bw
/= run_time_ns
; /* B/s */
179 *result
= run_bw
/1024/1024; /* MiB/s */
/*
 * Values of chksum_stat_limit in chksum_benchit(): LIMIT_NEEDED makes it
 * skip the benchmarks >= 1MiB, LIMIT_NOLIMIT lets all of them run.
 * NOTE(review): LIMIT_INIT, the initial value used there, is defined
 * outside the lines visible in this excerpt.
 */
183 #define LIMIT_NEEDED 1
184 #define LIMIT_NOLIMIT 2
/*
 * Benchmark one implementation entry: zero its salt, obtain a context from
 * cs->init(), and fill cs->bs1k ... cs->bs16m through chksum_run().
 * Rounds 1 to 6 use a buffer from abd_alloc_linear(1<<20), rounds 7 and 8
 * a buffer from abd_alloc(1<<24).
 *
 * cs - entry to benchmark; its bs* members receive the results
 *
 * NOTE(review): the return type, the declarations of abd and ctx, any
 * guard around the init call, the freeing of the buffers and the code
 * that follows the final comment are not visible in this excerpt.
 */
187 chksum_benchit(chksum_stat_t
*cs
)
191 void *salt
= &cs
->salt
.zcs_bytes
;
/*
 * static: the value survives between calls, so the slow CPU decision below
 * is taken only while it still equals LIMIT_INIT
 */
192 static int chksum_stat_limit
= LIMIT_INIT
;
194 memset(salt
, 0, sizeof (cs
->salt
.zcs_bytes
));
196 ctx
= cs
->init(&cs
->salt
);
198 /* allocate test memory via abd linear interface */
199 abd
= abd_alloc_linear(1<<20, B_FALSE
);
200 chksum_run(cs
, abd
, ctx
, 1, &cs
->bs1k
);
201 chksum_run(cs
, abd
, ctx
, 2, &cs
->bs4k
);
202 chksum_run(cs
, abd
, ctx
, 3, &cs
->bs16k
);
203 chksum_run(cs
, abd
, ctx
, 4, &cs
->bs64k
);
204 chksum_run(cs
, abd
, ctx
, 5, &cs
->bs256k
);
206 /* check if we ran on a slow cpu */
207 if (chksum_stat_limit
== LIMIT_INIT
) {
/* the 1KiB result of this entry is compared against LIMIT_PERF_MBS */
208 if (cs
->bs1k
< LIMIT_PERF_MBS
) {
209 chksum_stat_limit
= LIMIT_NEEDED
;
/* NOTE(review): presumably the else branch; the else is not visible here */
211 chksum_stat_limit
= LIMIT_NOLIMIT
;
215 /* skip benchmarks >= 1MiB when the CPU is too slow */
216 if (chksum_stat_limit
== LIMIT_NEEDED
)
/* NOTE(review): the statement guarded by this if is not visible here */
219 chksum_run(cs
, abd
, ctx
, 6, &cs
->bs1m
);
222 /* allocate test memory via abd non linear interface */
223 abd
= abd_alloc(1<<24, B_FALSE
);
224 chksum_run(cs
, abd
, ctx
, 7, &cs
->bs4m
);
225 chksum_run(cs
, abd
, ctx
, 8, &cs
->bs16m
);
230 /* free up temp memory */
236 * Initialize and benchmark all supported implementations.
/*
 * Fill chksum_stat_data with one entry per benchmarked implementation:
 * edonr and skein (both labelled "generic"), followed by every sha256,
 * sha512 and blake3 implementation reported by the matching zfs_impl_t ops.
 * For each of these three families the currently selected id is saved with
 * getid() before the loop and handed back to setid() afterwards; inside the
 * loop an entry whose 256KiB result exceeds max is passed to set_fastest().
 *
 * NOTE(review): several statements are not visible in this excerpt (the
 * return type, the declarations of cs and max, the base value that the
 * getcnt() results are added to, the name assignments for edonr and skein,
 * the calls that run the benchmark, the selection of each id inside the
 * loops and the updates of max) - confirm against the full file.
 */
239 chksum_benchmark(void)
242 /* we need the benchmark only for the kernel module */
248 uint32_t id
, cbid
= 0, id_save
;
249 const zfs_impl_t
*blake3
= zfs_impl_get_ops("blake3");
250 const zfs_impl_t
*sha256
= zfs_impl_get_ops("sha256");
251 const zfs_impl_t
*sha512
= zfs_impl_get_ops("sha512");
253 /* count implementations */
255 chksum_stat_cnt
+= sha256
->getcnt();
256 chksum_stat_cnt
+= sha512
->getcnt();
257 chksum_stat_cnt
+= blake3
->getcnt();
/* zeroed table with chksum_stat_cnt entries; cbid is the next free slot */
258 chksum_stat_data
= kmem_zalloc(
259 sizeof (chksum_stat_t
) * chksum_stat_cnt
, KM_SLEEP
);
261 /* edonr - needs to be the first one here (slow CPU check) */
262 cs
= &chksum_stat_data
[cbid
++];
265 cs
->init
= abd_checksum_edonr_tmpl_init
;
266 cs
->func
= abd_checksum_edonr_native
;
267 cs
->free
= abd_checksum_edonr_tmpl_free
;
269 cs
->impl
= "generic";
/* skein takes the next slot */
273 cs
= &chksum_stat_data
[cbid
++];
274 cs
->init
= abd_checksum_skein_tmpl_init
;
275 cs
->func
= abd_checksum_skein_native
;
276 cs
->free
= abd_checksum_skein_tmpl_free
;
278 cs
->impl
= "generic";
/* sha256: remember the selected id, then walk ids 0 .. getcnt() - 1 */
282 id_save
= sha256
->getid();
283 for (max
= 0, id
= 0; id
< sha256
->getcnt(); id
++) {
285 cs
= &chksum_stat_data
[cbid
++];
287 cs
->func
= abd_checksum_sha256
;
289 cs
->name
= sha256
->name
;
290 cs
->impl
= sha256
->getname();
/* a 256KiB result above max makes this id the fastest one */
292 if (cs
->bs256k
> max
) {
294 sha256
->set_fastest(id
);
/* hand the id saved before the loop back to setid() */
297 sha256
->setid(id_save
);
/* sha512: same pattern as for sha256 */
300 id_save
= sha512
->getid();
301 for (max
= 0, id
= 0; id
< sha512
->getcnt(); id
++) {
303 cs
= &chksum_stat_data
[cbid
++];
305 cs
->func
= abd_checksum_sha512_native
;
307 cs
->name
= sha512
->name
;
308 cs
->impl
= sha512
->getname();
310 if (cs
->bs256k
> max
) {
312 sha512
->set_fastest(id
);
315 sha512
->setid(id_save
);
/* blake3: same pattern, with template init and free hooks set as well */
318 id_save
= blake3
->getid();
319 for (max
= 0, id
= 0; id
< blake3
->getcnt(); id
++) {
321 cs
= &chksum_stat_data
[cbid
++];
322 cs
->init
= abd_checksum_blake3_tmpl_init
;
323 cs
->func
= abd_checksum_blake3_native
;
324 cs
->free
= abd_checksum_blake3_tmpl_free
;
325 cs
->name
= blake3
->name
;
326 cs
->impl
= blake3
->getname();
328 if (cs
->bs256k
> max
) {
330 blake3
->set_fastest(id
);
333 blake3
->setid(id_save
);
/*
 * NOTE(review): the header of the enclosing function is not visible in this
 * excerpt. The visible statements call blake3_per_cpu_ctx_init() and then
 * create and install the raw kstat through which the benchmark table is
 * exposed; the statement that follows the "Benchmark supported
 * implementations" comment is not visible either.
 */
340 blake3_per_cpu_ctx_init();
343 /* Benchmark supported implementations */
346 /* Install kstats for all implementations */
347 chksum_kstat
= kstat_create("zfs", 0, "chksum_bench", "misc",
348 KSTAT_TYPE_RAW
, 0, KSTAT_FLAG_VIRTUAL
);
/* kstat_create() may return NULL; only a valid handle is configured */
350 if (chksum_kstat
!= NULL
) {
351 chksum_kstat
->ks_data
= NULL
;
352 chksum_kstat
->ks_ndata
= UINT32_MAX
;
/*
 * register the raw ops callbacks, starting with the header formatter;
 * NOTE(review): the remaining arguments are not visible in this excerpt
 */
353 kstat_set_raw_ops(chksum_kstat
,
354 chksum_kstat_headers
,
357 kstat_install(chksum_kstat
);
/*
 * NOTE(review): the header of the enclosing function is not visible in this
 * excerpt. The visible statements undo the setup: delete the kstat if one
 * was created, free the benchmark table if it has entries, clear the table
 * pointer and call blake3_per_cpu_ctx_fini().
 */
364 if (chksum_kstat
!= NULL
) {
365 kstat_delete(chksum_kstat
);
/* the free size mirrors the kmem_zalloc() size used for the table */
369 if (chksum_stat_cnt
) {
370 kmem_free(chksum_stat_data
,
371 sizeof (chksum_stat_t
) * chksum_stat_cnt
);
373 chksum_stat_data
= 0;
377 blake3_per_cpu_ctx_fini();