/* module/zstd/zfs_zstd.c */
/*
 * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html)
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 2016-2018, Klara Inc.
 * Copyright (c) 2016-2018, Allan Jude
 * Copyright (c) 2018-2020, Sebastian Gottschall
 * Copyright (c) 2019-2020, Michael Niewöhner
 * Copyright (c) 2020, The FreeBSD Foundation [1]
 *
 * [1] Portions of this software were developed by Allan Jude
 * under sponsorship from the FreeBSD Foundation.
 */
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/zfs_context.h>
#include <sys/zio_compress.h>
#include <sys/spa.h>
#include <sys/zstd/zstd.h>

#define	ZSTD_STATIC_LINKING_ONLY
#include "lib/zstd.h"
#include "lib/common/zstd_errors.h"
static uint_t zstd_earlyabort_pass = 1;
static int zstd_cutoff_level = ZIO_ZSTD_LEVEL_3;
static unsigned int zstd_abort_size = (128 * 1024);

static kstat_t *zstd_ksp = NULL;
typedef struct zstd_stats {
	kstat_named_t	zstd_stat_alloc_fail;
	kstat_named_t	zstd_stat_alloc_fallback;
	kstat_named_t	zstd_stat_com_alloc_fail;
	kstat_named_t	zstd_stat_dec_alloc_fail;
	kstat_named_t	zstd_stat_com_inval;
	kstat_named_t	zstd_stat_dec_inval;
	kstat_named_t	zstd_stat_dec_header_inval;
	kstat_named_t	zstd_stat_com_fail;
	kstat_named_t	zstd_stat_dec_fail;
	/*
	 * LZ4 first-pass early abort verdict
	 */
	kstat_named_t	zstd_stat_lz4pass_allowed;
	kstat_named_t	zstd_stat_lz4pass_rejected;
	/*
	 * zstd-1 second-pass early abort verdict
	 */
	kstat_named_t	zstd_stat_zstdpass_allowed;
	kstat_named_t	zstd_stat_zstdpass_rejected;
	/*
	 * We excluded this from early abort for some reason
	 */
	kstat_named_t	zstd_stat_passignored;
	kstat_named_t	zstd_stat_passignored_size;
	kstat_named_t	zstd_stat_buffers;
	kstat_named_t	zstd_stat_size;
} zstd_stats_t;
static zstd_stats_t zstd_stats = {
	{ "alloc_fail",			KSTAT_DATA_UINT64 },
	{ "alloc_fallback",		KSTAT_DATA_UINT64 },
	{ "compress_alloc_fail",	KSTAT_DATA_UINT64 },
	{ "decompress_alloc_fail",	KSTAT_DATA_UINT64 },
	{ "compress_level_invalid",	KSTAT_DATA_UINT64 },
	{ "decompress_level_invalid",	KSTAT_DATA_UINT64 },
	{ "decompress_header_invalid",	KSTAT_DATA_UINT64 },
	{ "compress_failed",		KSTAT_DATA_UINT64 },
	{ "decompress_failed",		KSTAT_DATA_UINT64 },
	{ "lz4pass_allowed",		KSTAT_DATA_UINT64 },
	{ "lz4pass_rejected",		KSTAT_DATA_UINT64 },
	{ "zstdpass_allowed",		KSTAT_DATA_UINT64 },
	{ "zstdpass_rejected",		KSTAT_DATA_UINT64 },
	{ "passignored",		KSTAT_DATA_UINT64 },
	{ "passignored_size",		KSTAT_DATA_UINT64 },
	{ "buffers",			KSTAT_DATA_UINT64 },
	{ "size",			KSTAT_DATA_UINT64 },
};
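/*
 * Note: on Linux these counters typically surface under
 * /proc/spl/kstat/zfs/zstd; on FreeBSD under the kstat.zfs.misc.zstd
 * sysctl tree. Writing to the kstat resets the failure and verdict
 * counters (see kstat_zstd_update below); the buffers and size gauges
 * are left intact.
 */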
#ifdef _KERNEL
static int
kstat_zstd_update(kstat_t *ksp, int rw)
{
	ASSERT(ksp != NULL);

	if (rw == KSTAT_WRITE && ksp == zstd_ksp) {
		ZSTDSTAT_ZERO(zstd_stat_alloc_fail);
		ZSTDSTAT_ZERO(zstd_stat_alloc_fallback);
		ZSTDSTAT_ZERO(zstd_stat_com_alloc_fail);
		ZSTDSTAT_ZERO(zstd_stat_dec_alloc_fail);
		ZSTDSTAT_ZERO(zstd_stat_com_inval);
		ZSTDSTAT_ZERO(zstd_stat_dec_inval);
		ZSTDSTAT_ZERO(zstd_stat_dec_header_inval);
		ZSTDSTAT_ZERO(zstd_stat_com_fail);
		ZSTDSTAT_ZERO(zstd_stat_dec_fail);
		ZSTDSTAT_ZERO(zstd_stat_lz4pass_allowed);
		ZSTDSTAT_ZERO(zstd_stat_lz4pass_rejected);
		ZSTDSTAT_ZERO(zstd_stat_zstdpass_allowed);
		ZSTDSTAT_ZERO(zstd_stat_zstdpass_rejected);
		ZSTDSTAT_ZERO(zstd_stat_passignored);
		ZSTDSTAT_ZERO(zstd_stat_passignored_size);
	}

	return (0);
}
#endif
/* Enums describing the allocator type specified by kmem_type in zstd_kmem */
enum zstd_kmem_type {
	ZSTD_KMEM_UNKNOWN = 0,
	/* Allocation type using kmem_vmalloc */
	ZSTD_KMEM_DEFAULT,
	/* Pool based allocation using mempool_alloc */
	ZSTD_KMEM_POOL,
	/* Reserved fallback memory for decompression only */
	ZSTD_KMEM_DCTX,
	ZSTD_KMEM_COUNT,
};
/* Structure for pooled memory objects */
struct zstd_pool {
	void *mem;
	size_t size;
	kmutex_t barrier;
	hrtime_t timeout;
};

/* Global structure for handling memory allocations */
struct zstd_kmem {
	enum zstd_kmem_type kmem_type;
	size_t kmem_size;
	struct zstd_pool *pool;
};

/* Fallback memory structure used for decompression only if memory runs out */
struct zstd_fallback_mem {
	size_t mem_size;
	void *mem;
	kmutex_t barrier;
};

struct zstd_levelmap {
	int16_t zstd_level;
	enum zio_zstd_levels level;
};
/*
 * ZSTD memory handlers
 *
 * For decompression we use a different handler which also provides fallback
 * memory allocation in case memory runs out.
 *
 * The handlers were split up to keep the implementation as simple as possible.
 */
static void *zstd_alloc(void *opaque, size_t size);
static void *zstd_dctx_alloc(void *opaque, size_t size);
static void zstd_free(void *opaque, void *ptr);

/* Compression memory handler */
static const ZSTD_customMem zstd_malloc = {
	zstd_alloc,
	zstd_free,
	NULL,
};

/* Decompression memory handler */
static const ZSTD_customMem zstd_dctx_malloc = {
	zstd_dctx_alloc,
	zstd_free,
	NULL,
};
/* Level map for converting ZFS internal levels to ZSTD levels and vice versa */
static struct zstd_levelmap zstd_levels[] = {
	{ZIO_ZSTD_LEVEL_1, ZIO_ZSTD_LEVEL_1},
	{ZIO_ZSTD_LEVEL_2, ZIO_ZSTD_LEVEL_2},
	{ZIO_ZSTD_LEVEL_3, ZIO_ZSTD_LEVEL_3},
	{ZIO_ZSTD_LEVEL_4, ZIO_ZSTD_LEVEL_4},
	{ZIO_ZSTD_LEVEL_5, ZIO_ZSTD_LEVEL_5},
	{ZIO_ZSTD_LEVEL_6, ZIO_ZSTD_LEVEL_6},
	{ZIO_ZSTD_LEVEL_7, ZIO_ZSTD_LEVEL_7},
	{ZIO_ZSTD_LEVEL_8, ZIO_ZSTD_LEVEL_8},
	{ZIO_ZSTD_LEVEL_9, ZIO_ZSTD_LEVEL_9},
	{ZIO_ZSTD_LEVEL_10, ZIO_ZSTD_LEVEL_10},
	{ZIO_ZSTD_LEVEL_11, ZIO_ZSTD_LEVEL_11},
	{ZIO_ZSTD_LEVEL_12, ZIO_ZSTD_LEVEL_12},
	{ZIO_ZSTD_LEVEL_13, ZIO_ZSTD_LEVEL_13},
	{ZIO_ZSTD_LEVEL_14, ZIO_ZSTD_LEVEL_14},
	{ZIO_ZSTD_LEVEL_15, ZIO_ZSTD_LEVEL_15},
	{ZIO_ZSTD_LEVEL_16, ZIO_ZSTD_LEVEL_16},
	{ZIO_ZSTD_LEVEL_17, ZIO_ZSTD_LEVEL_17},
	{ZIO_ZSTD_LEVEL_18, ZIO_ZSTD_LEVEL_18},
	{ZIO_ZSTD_LEVEL_19, ZIO_ZSTD_LEVEL_19},
	{-1, ZIO_ZSTD_LEVEL_FAST_1},
	{-2, ZIO_ZSTD_LEVEL_FAST_2},
	{-3, ZIO_ZSTD_LEVEL_FAST_3},
	{-4, ZIO_ZSTD_LEVEL_FAST_4},
	{-5, ZIO_ZSTD_LEVEL_FAST_5},
	{-6, ZIO_ZSTD_LEVEL_FAST_6},
	{-7, ZIO_ZSTD_LEVEL_FAST_7},
	{-8, ZIO_ZSTD_LEVEL_FAST_8},
	{-9, ZIO_ZSTD_LEVEL_FAST_9},
	{-10, ZIO_ZSTD_LEVEL_FAST_10},
	{-20, ZIO_ZSTD_LEVEL_FAST_20},
	{-30, ZIO_ZSTD_LEVEL_FAST_30},
	{-40, ZIO_ZSTD_LEVEL_FAST_40},
	{-50, ZIO_ZSTD_LEVEL_FAST_50},
	{-60, ZIO_ZSTD_LEVEL_FAST_60},
	{-70, ZIO_ZSTD_LEVEL_FAST_70},
	{-80, ZIO_ZSTD_LEVEL_FAST_80},
	{-90, ZIO_ZSTD_LEVEL_FAST_90},
	{-100, ZIO_ZSTD_LEVEL_FAST_100},
	{-500, ZIO_ZSTD_LEVEL_FAST_500},
	{-1000, ZIO_ZSTD_LEVEL_FAST_1000},
};
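/*
 * Note: zstd_enum_to_level() below indexes this table with plain offset
 * arithmetic - entries 0..18 must hold the 19 positive levels, with the
 * "fast" (negative) levels immediately after them - so keep the ordering
 * intact when editing it.
 */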
/*
 * This variable represents the maximum count of the pool based on the number
 * of CPUs plus some buffer. We default to cpu count * 4, see zstd_init.
 */
static int pool_count = 16;
#define	ZSTD_POOL_MAX		pool_count
#define	ZSTD_POOL_TIMEOUT	60 * 2
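/* ZSTD_POOL_TIMEOUT is in seconds; it is compared against gethrestime_sec() */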
static struct zstd_fallback_mem zstd_dctx_fallback;
static struct zstd_pool *zstd_mempool_cctx;
static struct zstd_pool *zstd_mempool_dctx;
/*
 * The library zstd code expects these if ADDRESS_SANITIZER gets defined,
 * and while ASAN does this, KASAN defines that and does not. So to avoid
 * changing the external code, we do this.
 */
#if defined(ZFS_ASAN_ENABLED)
#define	ADDRESS_SANITIZER	1
#endif
#if defined(_KERNEL) && defined(ADDRESS_SANITIZER)
void __asan_unpoison_memory_region(void const volatile *addr, size_t size);
void __asan_poison_memory_region(void const volatile *addr, size_t size);
void __asan_unpoison_memory_region(void const volatile *addr, size_t size) {};
void __asan_poison_memory_region(void const volatile *addr, size_t size) {};
#endif
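/*
 * The empty bodies above turn the library's poison/unpoison hooks into
 * no-ops; under KASAN there is no userland ASAN runtime to supply them.
 */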
static void
zstd_mempool_reap(struct zstd_pool *zstd_mempool)
{
	struct zstd_pool *pool;

	if (!zstd_mempool || !ZSTDSTAT(zstd_stat_buffers)) {
		return;
	}

	/* free obsolete slots */
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		pool = &zstd_mempool[i];
		if (pool->mem && mutex_tryenter(&pool->barrier)) {
			/* Free memory if unused object older than 2 minutes */
			if (pool->mem && gethrestime_sec() > pool->timeout) {
				vmem_free(pool->mem, pool->size);
				ZSTDSTAT_SUB(zstd_stat_buffers, 1);
				ZSTDSTAT_SUB(zstd_stat_size, pool->size);
				pool->mem = NULL;
				pool->size = 0;
				pool->timeout = 0;
			}
			mutex_exit(&pool->barrier);
		}
	}
}
/*
 * Try to get a cached allocated buffer from memory pool or allocate a new one
 * if necessary. If an object is older than 2 minutes and does not fit the
 * requested size, it will be released and a new cached entry will be allocated.
 * If other pooled objects are detected without being used for 2 minutes, they
 * will be released, too.
 *
 * The concept is that high frequency memory allocations of bigger objects are
 * expensive. So if a lot of work is going on, allocations will be kept for a
 * while and can be reused in that time frame.
 *
 * The scheduled release will be updated every time an object is reused.
 */
static void *
zstd_mempool_alloc(struct zstd_pool *zstd_mempool, size_t size)
{
	struct zstd_pool *pool;
	struct zstd_kmem *mem = NULL;

	if (!zstd_mempool) {
		return (NULL);
	}

	/* Seek for preallocated memory slot and free obsolete slots */
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		pool = &zstd_mempool[i];
		/*
		 * This lock is simply a marker for a pool object being in use.
		 * If it's already held, it will be skipped.
		 *
		 * We need to create it before checking it to avoid race
		 * conditions caused by running in a threaded context.
		 *
		 * The lock is later released by zstd_mempool_free.
		 */
		if (mutex_tryenter(&pool->barrier)) {
			/*
			 * Check if the object fits the requested size; if so,
			 * take it and update the timestamp.
			 */
			if (pool->mem && size <= pool->size) {
				pool->timeout = gethrestime_sec() +
				    ZSTD_POOL_TIMEOUT;
				mem = pool->mem;
				return (mem);
			}
			mutex_exit(&pool->barrier);
		}
	}

	/*
	 * If no preallocated slot was found, try to fill in a new one.
	 *
	 * We run a similar algorithm twice here to avoid pool fragmentation.
	 * The first one may generate holes in the list if objects get released.
	 * We always make sure that these holes get filled instead of adding new
	 * allocations constantly at the end.
	 */
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		pool = &zstd_mempool[i];
		if (mutex_tryenter(&pool->barrier)) {
			/* Object is free, try to allocate new one */
			if (!pool->mem) {
				mem = vmem_alloc(size, KM_SLEEP);
				if (mem) {
					ZSTDSTAT_ADD(zstd_stat_buffers, 1);
					ZSTDSTAT_ADD(zstd_stat_size, size);
					pool->mem = mem;
					pool->size = size;
					/* Keep track for later release */
					mem->pool = pool;
					mem->kmem_type = ZSTD_KMEM_POOL;
					mem->kmem_size = size;
				}
			}

			if (size <= pool->size) {
				/* Update timestamp */
				pool->timeout = gethrestime_sec() +
				    ZSTD_POOL_TIMEOUT;

				return (pool->mem);
			}

			mutex_exit(&pool->barrier);
		}
	}

	/*
	 * If the pool is full or the allocation failed, try lazy allocation
	 * instead.
	 */
	if (!mem) {
		mem = vmem_alloc(size, KM_NOSLEEP);
		if (mem) {
			mem->pool = NULL;
			mem->kmem_type = ZSTD_KMEM_DEFAULT;
			mem->kmem_size = size;
		}
	}

	return (mem);
}
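/*
 * Design note: each slot is claimed with mutex_tryenter(), so contended or
 * in-use slots are skipped rather than waited on. If every slot is busy,
 * the allocation falls through to a plain KM_NOSLEEP vmem_alloc tagged
 * ZSTD_KMEM_DEFAULT, which zstd_free() later returns straight to the
 * kernel instead of back to the pool.
 */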
/* Mark object as released by releasing the barrier mutex */
static void
zstd_mempool_free(struct zstd_kmem *z)
{
	mutex_exit(&z->pool->barrier);
}
/* Convert ZFS internal enum to ZSTD level */
static int
zstd_enum_to_level(enum zio_zstd_levels level, int16_t *zstd_level)
{
	if (level > 0 && level <= ZIO_ZSTD_LEVEL_19) {
		*zstd_level = zstd_levels[level - 1].zstd_level;
		return (0);
	}
	if (level >= ZIO_ZSTD_LEVEL_FAST_1 &&
	    level <= ZIO_ZSTD_LEVEL_FAST_1000) {
		*zstd_level = zstd_levels[level - ZIO_ZSTD_LEVEL_FAST_1
		    + ZIO_ZSTD_LEVEL_19].zstd_level;
		return (0);
	}

	/* Invalid/unknown zfs compression enum - this should never happen. */
	return (1);
}
/* Compress block using zstd */
static size_t
zfs_zstd_compress_impl(void *s_start, void *d_start, size_t s_len, size_t d_len,
    int level)
{
	size_t c_len;
	int16_t zstd_level;
	zfs_zstdhdr_t *hdr;
	ZSTD_CCtx *cctx;

	hdr = (zfs_zstdhdr_t *)d_start;

	/* Skip compression if the specified level is invalid */
	if (zstd_enum_to_level(level, &zstd_level)) {
		ZSTDSTAT_BUMP(zstd_stat_com_inval);
		return (s_len);
	}

	ASSERT3U(d_len, >=, sizeof (*hdr));
	ASSERT3U(d_len, <=, s_len);
	ASSERT3U(zstd_level, !=, 0);

	cctx = ZSTD_createCCtx_advanced(zstd_malloc);

	/*
	 * Out of kernel memory, gently fall through - this will disable
	 * compression in zio_compress_data
	 */
	if (!cctx) {
		ZSTDSTAT_BUMP(zstd_stat_com_alloc_fail);
		return (s_len);
	}

	/* Set the compression level */
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, zstd_level);

	/* Use the "magicless" zstd header which saves us 4 header bytes */
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_format, ZSTD_f_zstd1_magicless);

	/*
	 * Disable redundant checksum calculation and content size storage since
	 * this is already done by ZFS itself.
	 */
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 0);
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, 0);

	c_len = ZSTD_compress2(cctx,
	    hdr->data,
	    d_len - sizeof (*hdr),
	    s_start, s_len);

	ZSTD_freeCCtx(cctx);

	/* Error in the compression routine, disable compression. */
	if (ZSTD_isError(c_len)) {
		/*
		 * If we are aborting the compression because the savings are
		 * too small, that is not a failure. Everything else is a
		 * failure, so increment the compression failure counter.
		 */
		int err = ZSTD_getErrorCode(c_len);
		if (err != ZSTD_error_dstSize_tooSmall) {
			ZSTDSTAT_BUMP(zstd_stat_com_fail);
			dprintf("Error: %s", ZSTD_getErrorString(err));
		}
		return (s_len);
	}

	/*
	 * Encode the compressed buffer size at the start. We'll need this in
	 * decompression to counter the effects of padding which might be added
	 * to the compressed buffer and which, if unhandled, would confuse the
	 * hell out of our decompression function.
	 */
	hdr->c_len = BE_32(c_len);

	/*
	 * Check version for overflow.
	 * The limit of 24 bits must not be exceeded. This allows a maximum
	 * version 1677.72.15, which we never expect to be reached.
	 */
	ASSERT3U(ZSTD_VERSION_NUMBER, <=, 0xFFFFFF);

	/*
	 * Encode the compression level as well. We may need to know the
	 * original compression level if compressed_arc is disabled, to match
	 * the compression settings to write this block to the L2ARC.
	 *
	 * Encode the actual level, so if the enum changes in the future, we
	 * will be compatible.
	 *
	 * The upper 24 bits store the ZSTD version to be able to provide
	 * future compatibility, since new versions might enhance the
	 * compression algorithm in a way where the compressed data will
	 * change.
	 *
	 * As soon as such incompatibility occurs, handling code needs to be
	 * added, differentiating between the versions.
	 */
	zfs_set_hdrversion(hdr, ZSTD_VERSION_NUMBER);
	zfs_set_hdrlevel(hdr, level);
	hdr->raw_version_level = BE_32(hdr->raw_version_level);

	return (c_len + sizeof (*hdr));
}
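/*
 * Rough sketch of the resulting on-disk block layout (the exact field
 * packing is defined by zfs_zstdhdr_t in sys/zstd/zstd.h; both header
 * words are stored big-endian):
 *
 *	+------------------+--------------------+----------------+
 *	| c_len (32 bits)  | version (24 bits)  | level (8 bits) |
 *	+------------------+--------------------+----------------+
 *	| magicless zstd frame (c_len bytes) ...                 |
 *	+---------------------------------------------------------+
 */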
static size_t
zfs_zstd_compress_buf(void *s_start, void *d_start, size_t s_len, size_t d_len,
    int level)
{
	int16_t zstd_level;
	if (zstd_enum_to_level(level, &zstd_level)) {
		ZSTDSTAT_BUMP(zstd_stat_com_inval);
		return (s_len);
	}
	/*
	 * A zstd early abort heuristic.
	 *
	 * - Zeroth, if this is <= zstd-3, or < zstd_abort_size (currently
	 *   128k), don't try any of this, just go.
	 *   (because experimentally that was a reasonable cutoff for a perf win
	 *   with tiny ratio change)
	 * - First, we try LZ4 compression, and if it doesn't early abort, we
	 *   jump directly to whatever compression level we intended to try.
	 * - Second, we try zstd-1 - if that errors out (usually, but not
	 *   exclusively, if it would overflow), we give up early.
	 *
	 *   If it works, instead we go on and compress anyway.
	 *
	 * Why two passes? LZ4 alone gets you a lot of the way, but on highly
	 * compressible data, it was losing up to 8.5% of the compressed
	 * savings versus no early abort, and all the zstd-fast levels are
	 * worse indications on their own than LZ4, and don't improve the LZ4
	 * pass noticeably if stacked like this.
	 */
	size_t actual_abort_size = zstd_abort_size;
	if (zstd_earlyabort_pass > 0 && zstd_level >= zstd_cutoff_level &&
	    s_len >= actual_abort_size) {
		int pass_len = 1;
		abd_t sabd, dabd;
		abd_get_from_buf_struct(&sabd, s_start, s_len);
		abd_get_from_buf_struct(&dabd, d_start, d_len);
		pass_len = zfs_lz4_compress(&sabd, &dabd, s_len, d_len, 0);
		abd_free(&dabd);
		abd_free(&sabd);
		if (pass_len < d_len) {
			ZSTDSTAT_BUMP(zstd_stat_lz4pass_allowed);
			goto keep_trying;
		}
		ZSTDSTAT_BUMP(zstd_stat_lz4pass_rejected);

		pass_len = zfs_zstd_compress_impl(s_start, d_start, s_len,
		    d_len, ZIO_ZSTD_LEVEL_1);
		if (pass_len == s_len || pass_len <= 0 || pass_len > d_len) {
			ZSTDSTAT_BUMP(zstd_stat_zstdpass_rejected);
			return (s_len);
		}
		ZSTDSTAT_BUMP(zstd_stat_zstdpass_allowed);
	} else {
		ZSTDSTAT_BUMP(zstd_stat_passignored);
		if (s_len < actual_abort_size) {
			ZSTDSTAT_BUMP(zstd_stat_passignored_size);
		}
	}
keep_trying:
	return (zfs_zstd_compress_impl(s_start, d_start, s_len, d_len, level));
}
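/*
 * The heuristic above is governed by three knobs declared at the top of
 * this file: zstd_earlyabort_pass (enables the passes), zstd_cutoff_level
 * (minimum requested level before they run) and zstd_abort_size (minimum
 * input size before they run).
 */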
/* Decompress block using zstd and return its stored level */
static int
zfs_zstd_decompress_level_buf(void *s_start, void *d_start, size_t s_len,
    size_t d_len, uint8_t *level)
{
	ZSTD_DCtx *dctx;
	size_t result;
	int16_t zstd_level;
	uint32_t c_len;
	const zfs_zstdhdr_t *hdr;
	zfs_zstdhdr_t hdr_copy;

	hdr = (const zfs_zstdhdr_t *)s_start;
	c_len = BE_32(hdr->c_len);

	/*
	 * Make a copy instead of directly converting the header, since we must
	 * not modify the original data that may be used again later.
	 */
	hdr_copy.raw_version_level = BE_32(hdr->raw_version_level);
	uint8_t curlevel = zfs_get_hdrlevel(&hdr_copy);

	/*
	 * NOTE: We ignore the ZSTD version for now. As soon as any
	 * incompatibility occurs, it has to be handled accordingly.
	 * The version can be accessed via `hdr_copy.version`.
	 */

	/*
	 * Convert and check the level.
	 * An invalid level is a strong indicator for data corruption! In such
	 * a case return an error so the upper layers can try to fix it.
	 */
	if (zstd_enum_to_level(curlevel, &zstd_level)) {
		ZSTDSTAT_BUMP(zstd_stat_dec_inval);
		return (1);
	}

	ASSERT3U(d_len, >=, s_len);
	ASSERT3U(curlevel, !=, ZIO_COMPLEVEL_INHERIT);

	/* Invalid compressed buffer size encoded at start */
	if (c_len + sizeof (*hdr) > s_len) {
		ZSTDSTAT_BUMP(zstd_stat_dec_header_inval);
		return (1);
	}

	dctx = ZSTD_createDCtx_advanced(zstd_dctx_malloc);
	if (!dctx) {
		ZSTDSTAT_BUMP(zstd_stat_dec_alloc_fail);
		return (1);
	}

	/* Set header type to "magicless" */
	ZSTD_DCtx_setParameter(dctx, ZSTD_d_format, ZSTD_f_zstd1_magicless);

	/* Decompress the data and release the context */
	result = ZSTD_decompressDCtx(dctx, d_start, d_len, hdr->data, c_len);
	ZSTD_freeDCtx(dctx);

	/*
	 * Returns 0 on success (decompression function returned non-negative)
	 * and non-zero on failure (decompression function returned negative).
	 */
	if (ZSTD_isError(result)) {
		ZSTDSTAT_BUMP(zstd_stat_dec_fail);
		return (1);
	}

	if (level) {
		*level = curlevel;
	}

	return (0);
}
/* Decompress datablock using zstd */
static int
zfs_zstd_decompress_buf(void *s_start, void *d_start, size_t s_len,
    size_t d_len, int level __maybe_unused)
{
	return (zfs_zstd_decompress_level_buf(s_start, d_start, s_len, d_len,
	    NULL));
}
ZFS_COMPRESS_WRAP_DECL(zfs_zstd_compress)
ZFS_DECOMPRESS_WRAP_DECL(zfs_zstd_decompress)
ZFS_DECOMPRESS_LEVEL_WRAP_DECL(zfs_zstd_decompress_level)
/* Allocator for zstd compression context using mempool_allocator */
static void *
zstd_alloc(void *opaque __maybe_unused, size_t size)
{
	size_t nbytes = sizeof (struct zstd_kmem) + size;
	struct zstd_kmem *z = NULL;

	z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_cctx, nbytes);

	if (!z) {
		ZSTDSTAT_BUMP(zstd_stat_alloc_fail);
		return (NULL);
	}

	return ((void*)z + (sizeof (struct zstd_kmem)));
}
/*
 * Allocator for zstd decompression context using mempool_allocator with
 * fallback to reserved memory if allocation fails
 */
static void *
zstd_dctx_alloc(void *opaque __maybe_unused, size_t size)
{
	size_t nbytes = sizeof (struct zstd_kmem) + size;
	struct zstd_kmem *z = NULL;
	enum zstd_kmem_type type = ZSTD_KMEM_DEFAULT;

	z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_dctx, nbytes);
	if (!z) {
		/* Try harder, decompression shall not fail */
		z = vmem_alloc(nbytes, KM_SLEEP);
		if (z) {
			z->pool = NULL;
		}
		ZSTDSTAT_BUMP(zstd_stat_alloc_fail);
	} else {
		return ((void*)z + (sizeof (struct zstd_kmem)));
	}

	/* Fallback if everything fails */
	if (!z) {
		/*
		 * Barrier since we can only handle it in a single thread. All
		 * other following threads need to wait here until decompression
		 * is completed. zstd_free will release this barrier later.
		 */
		mutex_enter(&zstd_dctx_fallback.barrier);

		z = zstd_dctx_fallback.mem;
		type = ZSTD_KMEM_DCTX;
		ZSTDSTAT_BUMP(zstd_stat_alloc_fallback);
	}

	/* Allocation should always be successful */
	if (!z) {
		return (NULL);
	}

	z->kmem_type = type;
	z->kmem_size = nbytes;

	return ((void*)z + (sizeof (struct zstd_kmem)));
}
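/*
 * Both allocators above prepend a struct zstd_kmem to every allocation and
 * hand zstd the address just past it; zstd_free() below steps back by
 * sizeof (struct zstd_kmem) to recover the bookkeeping header.
 */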
/* Free allocated memory by its specific type */
static void
zstd_free(void *opaque __maybe_unused, void *ptr)
{
	struct zstd_kmem *z = (ptr - sizeof (struct zstd_kmem));
	enum zstd_kmem_type type;

	ASSERT3U(z->kmem_type, <, ZSTD_KMEM_COUNT);
	ASSERT3U(z->kmem_type, >, ZSTD_KMEM_UNKNOWN);

	type = z->kmem_type;
	switch (type) {
	case ZSTD_KMEM_DEFAULT:
		vmem_free(z, z->kmem_size);
		break;
	case ZSTD_KMEM_POOL:
		zstd_mempool_free(z);
		break;
	case ZSTD_KMEM_DCTX:
		mutex_exit(&zstd_dctx_fallback.barrier);
		break;
	default:
		break;
	}
}
/* Allocate fallback memory to ensure safe decompression */
static void __init
create_fallback_mem(struct zstd_fallback_mem *mem, size_t size)
{
	mem->mem_size = size;
	mem->mem = vmem_zalloc(mem->mem_size, KM_SLEEP);
	mutex_init(&mem->barrier, NULL, MUTEX_DEFAULT, NULL);
}
/* Initialize memory pool barrier mutexes */
static void __init
zstd_mempool_init(void)
{
	zstd_mempool_cctx =
	    kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);
	zstd_mempool_dctx =
	    kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);

	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		mutex_init(&zstd_mempool_cctx[i].barrier, NULL,
		    MUTEX_DEFAULT, NULL);
		mutex_init(&zstd_mempool_dctx[i].barrier, NULL,
		    MUTEX_DEFAULT, NULL);
	}
}
/* Initialize zstd-related memory handling */
static int __init
zstd_meminit(void)
{
	zstd_mempool_init();

	/*
	 * Estimate the size of the fallback decompression context.
	 * The expected size on x64 with current ZSTD should be about 160 KB.
	 */
	create_fallback_mem(&zstd_dctx_fallback,
	    P2ROUNDUP(ZSTD_estimateDCtxSize() + sizeof (struct zstd_kmem),
	    PAGESIZE));

	return (0);
}
/* Release object from pool and free memory */
static void
release_pool(struct zstd_pool *pool)
{
	mutex_destroy(&pool->barrier);
	vmem_free(pool->mem, pool->size);
	pool->mem = NULL;
	pool->size = 0;
}
/* Release memory pool objects */
static void
zstd_mempool_deinit(void)
{
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		release_pool(&zstd_mempool_cctx[i]);
		release_pool(&zstd_mempool_dctx[i]);
	}

	kmem_free(zstd_mempool_dctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
	kmem_free(zstd_mempool_cctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
	zstd_mempool_dctx = NULL;
	zstd_mempool_cctx = NULL;
}
/* Release unused memory from the pools */
void
zfs_zstd_cache_reap_now(void)
{
	/*
	 * Short-circuit if there are no buffers to begin with.
	 */
	if (ZSTDSTAT(zstd_stat_buffers) == 0)
		return;

	/*
	 * Reap both pools, releasing cached objects that have sat unused
	 * past their timeout.
	 */
	zstd_mempool_reap(zstd_mempool_cctx);
	zstd_mempool_reap(zstd_mempool_dctx);
}
extern int __init
zstd_init(void)
{
	/* Set pool size by using maximum sane thread count * 4 */
	pool_count = (boot_ncpus * 4);
	zstd_meminit();

	/* Initialize kstat */
	zstd_ksp = kstat_create("zfs", 0, "zstd", "misc",
	    KSTAT_TYPE_NAMED, sizeof (zstd_stats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);
	if (zstd_ksp != NULL) {
		zstd_ksp->ks_data = &zstd_stats;
		kstat_install(zstd_ksp);
#ifdef _KERNEL
		zstd_ksp->ks_update = kstat_zstd_update;
#endif
	}

	return (0);
}
extern void
zstd_fini(void)
{
	/* Deinitialize kstat */
	if (zstd_ksp != NULL) {
		kstat_delete(zstd_ksp);
		zstd_ksp = NULL;
	}

	/* Release fallback memory */
	vmem_free(zstd_dctx_fallback.mem, zstd_dctx_fallback.mem_size);
	mutex_destroy(&zstd_dctx_fallback.barrier);

	/* Deinit memory pool */
	zstd_mempool_deinit();
}
#if defined(_KERNEL)
#ifdef __FreeBSD__
module_init(zstd_init);
module_exit(zstd_fini);
#endif

ZFS_MODULE_PARAM(zfs, zstd_, earlyabort_pass, UINT, ZMOD_RW,
	"Enable early abort attempts when using zstd");
ZFS_MODULE_PARAM(zfs, zstd_, abort_size, UINT, ZMOD_RW,
	"Minimal size of block to attempt early abort");
#endif
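/*
 * Example (Linux): the parameters above surface under
 * /sys/module/zfs/parameters/, so e.g.
 *
 *	echo 0 > /sys/module/zfs/parameters/zstd_earlyabort_pass
 *
 * disables the early-abort heuristic at runtime.
 */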