4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 2021-2022 Tino Reichardt <milky-zfs@mcmilk.de>
26 #include <sys/types.h>
28 #include <sys/zio_checksum.h>
29 #include <sys/zfs_context.h>
30 #include <sys/zfs_chksum.h>
32 #include <sys/blake3.h>
34 /* limit benchmarking to max 256KiB, when EdonR is slower than this (MiB/s): */
35 #define LIMIT_PERF_MBS 300
48 zio_cksum_salt_t salt
;
49 zio_checksum_t
*(func
);
50 zio_checksum_tmpl_init_t
*(init
);
51 zio_checksum_tmpl_free_t
*(free
);
54 static chksum_stat_t
*chksum_stat_data
= 0;
55 static int chksum_stat_cnt
= 0;
56 static kstat_t
*chksum_kstat
= NULL
;
59 * i3-1005G1 test output:
61 * implementation 1k 4k 16k 64k 256k 1m 4m
62 * fletcher-4 5421 15001 26468 32555 34720 32801 18847
63 * edonr-generic 1196 1602 1761 1749 1762 1759 1751
64 * skein-generic 546 591 608 615 619 612 616
65 * sha256-generic 246 270 274 274 277 275 276
66 * sha256-avx 262 296 304 307 307 307 306
67 * sha256-sha-ni 769 1072 1172 1220 1219 1232 1228
68 * sha256-openssl 240 300 316 314 304 285 276
69 * sha512-generic 333 374 385 392 391 393 392
70 * sha512-openssl 353 441 467 476 472 467 426
71 * sha512-avx 362 444 473 475 479 476 478
72 * sha512-avx2 394 500 530 538 543 545 542
73 * blake3-generic 308 313 313 313 312 313 312
74 * blake3-sse2 402 1289 1423 1446 1432 1458 1413
75 * blake3-sse41 427 1470 1625 1704 1679 1607 1629
76 * blake3-avx2 428 1920 3095 3343 3356 3318 3204
77 * blake3-avx512 473 2687 4905 5836 5844 5643 5374
/*
 * kstat raw-ops "headers" callback: write the column header line.
 *
 * The line is "implementation" left-justified in 23 columns, followed by
 * one right-justified 8-column label per benchmarked block size
 * (1k .. 16m) and a trailing newline.
 *
 * buf	- output buffer
 * size	- capacity of buf in bytes, including the terminating NUL
 *
 * Returns 0 on success.  Returns ENOMEM when the line does not fit; every
 * write is bounded by the space that is really left, so buf is never
 * overrun.  NOTE(review): the kstat layer is expected to treat ENOMEM as
 * "retry with a larger buffer" - confirm for each supported platform.
 */
static int
chksum_kstat_headers(char *buf, size_t size)
{
	static const char *const cols[] = {
		"1k", "4k", "16k", "64k", "256k", "1m", "4m", "16m"
	};
	const size_t ncols = sizeof (cols) / sizeof (cols[0]);
	size_t off, i;
	int n;

	n = snprintf(buf, size, "%-23s", "implementation");
	if (n < 0 || (size_t)n >= size)
		return (ENOMEM);
	off = (size_t)n;

	for (i = 0; i < ncols; i++) {
		/* off < size holds here, so size - off cannot wrap */
		n = snprintf(buf + off, size - off, "%8s", cols[i]);
		if (n < 0 || (size_t)n >= size - off)
			return (ENOMEM);
		off += (size_t)n;
	}

	n = snprintf(buf + off, size - off, "\n");
	if (n < 0 || (size_t)n >= size - off)
		return (ENOMEM);

	return (0);
}
98 chksum_kstat_data(char *buf
, size_t size
, void *data
)
104 cs
= (chksum_stat_t
*)data
;
105 snprintf(b
, 23, "%s-%s", cs
->name
, cs
->impl
);
106 off
+= snprintf(buf
+ off
, size
- off
, "%-23s", b
);
107 off
+= snprintf(buf
+ off
, size
- off
, "%8llu",
108 (u_longlong_t
)cs
->bs1k
);
109 off
+= snprintf(buf
+ off
, size
- off
, "%8llu",
110 (u_longlong_t
)cs
->bs4k
);
111 off
+= snprintf(buf
+ off
, size
- off
, "%8llu",
112 (u_longlong_t
)cs
->bs16k
);
113 off
+= snprintf(buf
+ off
, size
- off
, "%8llu",
114 (u_longlong_t
)cs
->bs64k
);
115 off
+= snprintf(buf
+ off
, size
- off
, "%8llu",
116 (u_longlong_t
)cs
->bs256k
);
117 off
+= snprintf(buf
+ off
, size
- off
, "%8llu",
118 (u_longlong_t
)cs
->bs1m
);
119 off
+= snprintf(buf
+ off
, size
- off
, "%8llu",
120 (u_longlong_t
)cs
->bs4m
);
121 (void) snprintf(buf
+ off
, size
- off
, "%8llu\n",
122 (u_longlong_t
)cs
->bs16m
);
128 chksum_kstat_addr(kstat_t
*ksp
, loff_t n
)
130 if (n
< chksum_stat_cnt
)
131 ksp
->ks_private
= (void *)(chksum_stat_data
+ n
);
133 ksp
->ks_private
= NULL
;
135 return (ksp
->ks_private
);
139 chksum_run(chksum_stat_t
*cs
, abd_t
*abd
, void *ctx
, int round
,
143 uint64_t run_bw
, run_time_ns
, run_count
= 0, size
= 0;
144 uint32_t l
, loops
= 0;
149 size
= 1<<10; loops
= 128; break;
151 size
= 1<<12; loops
= 64; break;
153 size
= 1<<14; loops
= 32; break;
155 size
= 1<<16; loops
= 16; break;
157 size
= 1<<18; loops
= 8; break;
159 size
= 1<<20; loops
= 4; break;
161 size
= 1<<22; loops
= 1; break;
163 size
= 1<<24; loops
= 1; break;
169 for (l
= 0; l
< loops
; l
++, run_count
++)
170 cs
->func(abd
, size
, ctx
, &zcp
);
172 run_time_ns
= gethrtime() - start
;
173 } while (run_time_ns
< MSEC2NSEC(1));
176 run_bw
= size
* run_count
* NANOSEC
;
177 run_bw
/= run_time_ns
; /* B/s */
178 *result
= run_bw
/1024/1024; /* MiB/s */
182 #define LIMIT_NEEDED 1
183 #define LIMIT_NOLIMIT 2
186 chksum_benchit(chksum_stat_t
*cs
)
190 void *salt
= &cs
->salt
.zcs_bytes
;
191 static int chksum_stat_limit
= LIMIT_INIT
;
193 memset(salt
, 0, sizeof (cs
->salt
.zcs_bytes
));
195 ctx
= cs
->init(&cs
->salt
);
197 /* allocate test memory via abd linear interface */
198 abd
= abd_alloc_linear(1<<20, B_FALSE
);
199 chksum_run(cs
, abd
, ctx
, 1, &cs
->bs1k
);
200 chksum_run(cs
, abd
, ctx
, 2, &cs
->bs4k
);
201 chksum_run(cs
, abd
, ctx
, 3, &cs
->bs16k
);
202 chksum_run(cs
, abd
, ctx
, 4, &cs
->bs64k
);
203 chksum_run(cs
, abd
, ctx
, 5, &cs
->bs256k
);
205 /* check if we ran on a slow cpu */
206 if (chksum_stat_limit
== LIMIT_INIT
) {
207 if (cs
->bs1k
< LIMIT_PERF_MBS
) {
208 chksum_stat_limit
= LIMIT_NEEDED
;
210 chksum_stat_limit
= LIMIT_NOLIMIT
;
214 /* skip benchmarks >= 1MiB when the CPU is too slow */
215 if (chksum_stat_limit
== LIMIT_NEEDED
)
218 chksum_run(cs
, abd
, ctx
, 6, &cs
->bs1m
);
221 /* allocate test memory via abd non linear interface */
222 abd
= abd_alloc(1<<24, B_FALSE
);
223 chksum_run(cs
, abd
, ctx
, 7, &cs
->bs4m
);
224 chksum_run(cs
, abd
, ctx
, 8, &cs
->bs16m
);
229 /* free up temp memory */
235 * Initialize and benchmark all supported implementations.
238 chksum_benchmark(void)
242 /* we need the benchmark only for the kernel module */
249 uint32_t id
, id_save
;
251 /* space for the benchmark times */
253 chksum_stat_cnt
+= blake3_impl_getcnt();
254 chksum_stat_data
= (chksum_stat_t
*)kmem_zalloc(
255 sizeof (chksum_stat_t
) * chksum_stat_cnt
, KM_SLEEP
);
257 /* edonr - needs to be the first one here (slow CPU check) */
258 cs
= &chksum_stat_data
[cbid
++];
259 cs
->init
= abd_checksum_edonr_tmpl_init
;
260 cs
->func
= abd_checksum_edonr_native
;
261 cs
->free
= abd_checksum_edonr_tmpl_free
;
263 cs
->impl
= "generic";
267 cs
= &chksum_stat_data
[cbid
++];
268 cs
->init
= abd_checksum_skein_tmpl_init
;
269 cs
->func
= abd_checksum_skein_native
;
270 cs
->free
= abd_checksum_skein_tmpl_free
;
272 cs
->impl
= "generic";
276 cs
= &chksum_stat_data
[cbid
++];
278 cs
->func
= abd_checksum_SHA256
;
281 cs
->impl
= "generic";
285 cs
= &chksum_stat_data
[cbid
++];
287 cs
->func
= abd_checksum_SHA512_native
;
290 cs
->impl
= "generic";
294 id_save
= blake3_impl_getid();
295 for (id
= 0; id
< blake3_impl_getcnt(); id
++) {
296 blake3_impl_setid(id
);
297 cs
= &chksum_stat_data
[cbid
++];
298 cs
->init
= abd_checksum_blake3_tmpl_init
;
299 cs
->func
= abd_checksum_blake3_native
;
300 cs
->free
= abd_checksum_blake3_tmpl_free
;
302 cs
->impl
= blake3_impl_getname();
304 if (cs
->bs256k
> max
) {
306 blake3_impl_set_fastest(id
);
310 /* restore initial value */
311 blake3_impl_setid(id_save
);
318 blake3_per_cpu_ctx_init();
321 /* Benchmark supported implementations */
324 /* Install kstats for all implementations */
325 chksum_kstat
= kstat_create("zfs", 0, "chksum_bench", "misc",
326 KSTAT_TYPE_RAW
, 0, KSTAT_FLAG_VIRTUAL
);
328 if (chksum_kstat
!= NULL
) {
329 chksum_kstat
->ks_data
= NULL
;
330 chksum_kstat
->ks_ndata
= UINT32_MAX
;
331 kstat_set_raw_ops(chksum_kstat
,
332 chksum_kstat_headers
,
335 kstat_install(chksum_kstat
);
342 if (chksum_kstat
!= NULL
) {
343 kstat_delete(chksum_kstat
);
347 if (chksum_stat_cnt
) {
348 kmem_free(chksum_stat_data
,
349 sizeof (chksum_stat_t
) * chksum_stat_cnt
);
351 chksum_stat_data
= 0;
355 blake3_per_cpu_ctx_fini();