/*
 * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html)
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 *    contributors may be used to endorse or promote products derived from this
 *    software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 2016-2018, Klara Inc.
 * Copyright (c) 2016-2018, Allan Jude
 * Copyright (c) 2018-2020, Sebastian Gottschall
 * Copyright (c) 2019-2020, Michael Niewöhner
 * Copyright (c) 2020, The FreeBSD Foundation [1]
 *
 * [1] Portions of this software were developed by Allan Jude
 *     under sponsorship from the FreeBSD Foundation.
 */
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/zfs_context.h>
#include <sys/zio_compress.h>
#include <sys/spa.h>
#include <sys/zstd/zstd.h>

#define	ZSTD_STATIC_LINKING_ONLY
#include "lib/zstd.h"
#include "lib/common/zstd_errors.h"
static uint_t zstd_earlyabort_pass = 1;
static int zstd_cutoff_level = ZIO_ZSTD_LEVEL_3;
static unsigned int zstd_abort_size = (128 * 1024);
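/*
 * zstd_earlyabort_pass and zstd_abort_size are the early-abort tunables
 * exposed as module parameters at the bottom of this file; zstd_cutoff_level
 * is the level threshold used by the early-abort heuristic in
 * zfs_zstd_compress_buf.
 */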
static kstat_t *zstd_ksp = NULL;
typedef struct zstd_stats {
	kstat_named_t	zstd_stat_alloc_fail;
	kstat_named_t	zstd_stat_alloc_fallback;
	kstat_named_t	zstd_stat_com_alloc_fail;
	kstat_named_t	zstd_stat_dec_alloc_fail;
	kstat_named_t	zstd_stat_com_inval;
	kstat_named_t	zstd_stat_dec_inval;
	kstat_named_t	zstd_stat_dec_header_inval;
	kstat_named_t	zstd_stat_com_fail;
	kstat_named_t	zstd_stat_dec_fail;
	/*
	 * LZ4 first-pass early abort verdict
	 */
	kstat_named_t	zstd_stat_lz4pass_allowed;
	kstat_named_t	zstd_stat_lz4pass_rejected;
	/*
	 * zstd-1 second-pass early abort verdict
	 */
	kstat_named_t	zstd_stat_zstdpass_allowed;
	kstat_named_t	zstd_stat_zstdpass_rejected;
	/*
	 * We excluded this from early abort for some reason
	 */
	kstat_named_t	zstd_stat_passignored;
	kstat_named_t	zstd_stat_passignored_size;
	kstat_named_t	zstd_stat_buffers;
	kstat_named_t	zstd_stat_size;
} zstd_stats_t;
static zstd_stats_t zstd_stats = {
	{ "alloc_fail",			KSTAT_DATA_UINT64 },
	{ "alloc_fallback",		KSTAT_DATA_UINT64 },
	{ "compress_alloc_fail",	KSTAT_DATA_UINT64 },
	{ "decompress_alloc_fail",	KSTAT_DATA_UINT64 },
	{ "compress_level_invalid",	KSTAT_DATA_UINT64 },
	{ "decompress_level_invalid",	KSTAT_DATA_UINT64 },
	{ "decompress_header_invalid",	KSTAT_DATA_UINT64 },
	{ "compress_failed",		KSTAT_DATA_UINT64 },
	{ "decompress_failed",		KSTAT_DATA_UINT64 },
	{ "lz4pass_allowed",		KSTAT_DATA_UINT64 },
	{ "lz4pass_rejected",		KSTAT_DATA_UINT64 },
	{ "zstdpass_allowed",		KSTAT_DATA_UINT64 },
	{ "zstdpass_rejected",		KSTAT_DATA_UINT64 },
	{ "passignored",		KSTAT_DATA_UINT64 },
	{ "passignored_size",		KSTAT_DATA_UINT64 },
	{ "buffers",			KSTAT_DATA_UINT64 },
	{ "size",			KSTAT_DATA_UINT64 },
};
#ifdef _KERNEL
static int
kstat_zstd_update(kstat_t *ksp, int rw)
{
	ASSERT(ksp != NULL);

	if (rw == KSTAT_WRITE && ksp == zstd_ksp) {
		ZSTDSTAT_ZERO(zstd_stat_alloc_fail);
		ZSTDSTAT_ZERO(zstd_stat_alloc_fallback);
		ZSTDSTAT_ZERO(zstd_stat_com_alloc_fail);
		ZSTDSTAT_ZERO(zstd_stat_dec_alloc_fail);
		ZSTDSTAT_ZERO(zstd_stat_com_inval);
		ZSTDSTAT_ZERO(zstd_stat_dec_inval);
		ZSTDSTAT_ZERO(zstd_stat_dec_header_inval);
		ZSTDSTAT_ZERO(zstd_stat_com_fail);
		ZSTDSTAT_ZERO(zstd_stat_dec_fail);
		ZSTDSTAT_ZERO(zstd_stat_lz4pass_allowed);
		ZSTDSTAT_ZERO(zstd_stat_lz4pass_rejected);
		ZSTDSTAT_ZERO(zstd_stat_zstdpass_allowed);
		ZSTDSTAT_ZERO(zstd_stat_zstdpass_rejected);
		ZSTDSTAT_ZERO(zstd_stat_passignored);
		ZSTDSTAT_ZERO(zstd_stat_passignored_size);
	}

	return (0);
}
#endif
/* Enums describing the allocator type specified by kmem_type in zstd_kmem */
enum zstd_kmem_type {
	ZSTD_KMEM_UNKNOWN = 0,
	/* Allocation type using kmem_vmalloc */
	ZSTD_KMEM_DEFAULT,
	/* Pool based allocation using mempool_alloc */
	ZSTD_KMEM_POOL,
	/* Reserved fallback memory for decompression only */
	ZSTD_KMEM_DCTX,
	ZSTD_KMEM_COUNT,
};

/* Structure for pooled memory objects */
struct zstd_pool {
	void *mem;
	size_t size;
	kmutex_t barrier;
	hrtime_t timeout;
};

/* Global structure for handling memory allocations */
struct zstd_kmem {
	enum zstd_kmem_type kmem_type;
	size_t kmem_size;
	struct zstd_pool *pool;
};

/* Fallback memory structure used for decompression only if memory runs out */
struct zstd_fallback_mem {
	size_t mem_size;
	void *mem;
	kmutex_t barrier;
};

struct zstd_levelmap {
	int16_t zstd_level;
	enum zio_zstd_levels level;
};
/*
 * ZSTD memory handlers
 *
 * For decompression we use a different handler which also provides fallback
 * memory allocation in case memory runs out.
 *
 * The handlers were split up to keep the implementation as simple as possible.
 */
static void *zstd_alloc(void *opaque, size_t size);
static void *zstd_dctx_alloc(void *opaque, size_t size);
static void zstd_free(void *opaque, void *ptr);

/* Compression memory handler */
static const ZSTD_customMem zstd_malloc = {
	zstd_alloc,
	zstd_free,
	NULL,
};

/* Decompression memory handler */
static const ZSTD_customMem zstd_dctx_malloc = {
	zstd_dctx_alloc,
	zstd_free,
	NULL,
};
/* Level map for converting ZFS internal levels to ZSTD levels and vice versa */
static struct zstd_levelmap zstd_levels[] = {
	{ZIO_ZSTD_LEVEL_1, ZIO_ZSTD_LEVEL_1},
	{ZIO_ZSTD_LEVEL_2, ZIO_ZSTD_LEVEL_2},
	{ZIO_ZSTD_LEVEL_3, ZIO_ZSTD_LEVEL_3},
	{ZIO_ZSTD_LEVEL_4, ZIO_ZSTD_LEVEL_4},
	{ZIO_ZSTD_LEVEL_5, ZIO_ZSTD_LEVEL_5},
	{ZIO_ZSTD_LEVEL_6, ZIO_ZSTD_LEVEL_6},
	{ZIO_ZSTD_LEVEL_7, ZIO_ZSTD_LEVEL_7},
	{ZIO_ZSTD_LEVEL_8, ZIO_ZSTD_LEVEL_8},
	{ZIO_ZSTD_LEVEL_9, ZIO_ZSTD_LEVEL_9},
	{ZIO_ZSTD_LEVEL_10, ZIO_ZSTD_LEVEL_10},
	{ZIO_ZSTD_LEVEL_11, ZIO_ZSTD_LEVEL_11},
	{ZIO_ZSTD_LEVEL_12, ZIO_ZSTD_LEVEL_12},
	{ZIO_ZSTD_LEVEL_13, ZIO_ZSTD_LEVEL_13},
	{ZIO_ZSTD_LEVEL_14, ZIO_ZSTD_LEVEL_14},
	{ZIO_ZSTD_LEVEL_15, ZIO_ZSTD_LEVEL_15},
	{ZIO_ZSTD_LEVEL_16, ZIO_ZSTD_LEVEL_16},
	{ZIO_ZSTD_LEVEL_17, ZIO_ZSTD_LEVEL_17},
	{ZIO_ZSTD_LEVEL_18, ZIO_ZSTD_LEVEL_18},
	{ZIO_ZSTD_LEVEL_19, ZIO_ZSTD_LEVEL_19},
	{-1, ZIO_ZSTD_LEVEL_FAST_1},
	{-2, ZIO_ZSTD_LEVEL_FAST_2},
	{-3, ZIO_ZSTD_LEVEL_FAST_3},
	{-4, ZIO_ZSTD_LEVEL_FAST_4},
	{-5, ZIO_ZSTD_LEVEL_FAST_5},
	{-6, ZIO_ZSTD_LEVEL_FAST_6},
	{-7, ZIO_ZSTD_LEVEL_FAST_7},
	{-8, ZIO_ZSTD_LEVEL_FAST_8},
	{-9, ZIO_ZSTD_LEVEL_FAST_9},
	{-10, ZIO_ZSTD_LEVEL_FAST_10},
	{-20, ZIO_ZSTD_LEVEL_FAST_20},
	{-30, ZIO_ZSTD_LEVEL_FAST_30},
	{-40, ZIO_ZSTD_LEVEL_FAST_40},
	{-50, ZIO_ZSTD_LEVEL_FAST_50},
	{-60, ZIO_ZSTD_LEVEL_FAST_60},
	{-70, ZIO_ZSTD_LEVEL_FAST_70},
	{-80, ZIO_ZSTD_LEVEL_FAST_80},
	{-90, ZIO_ZSTD_LEVEL_FAST_90},
	{-100, ZIO_ZSTD_LEVEL_FAST_100},
	{-500, ZIO_ZSTD_LEVEL_FAST_500},
	{-1000, ZIO_ZSTD_LEVEL_FAST_1000},
};
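/*
 * zstd_enum_to_level() depends on this ordering: the positive levels 1-19
 * come first and are indexed as (level - 1); the "fast" levels follow and
 * are indexed as (level - ZIO_ZSTD_LEVEL_FAST_1 + ZIO_ZSTD_LEVEL_19).
 */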
/*
 * This variable represents the maximum count of pool slots, based on the
 * number of CPUs plus some buffer. We default to cpu count * 4; see zstd_init.
 */
static int pool_count = 16;

#define	ZSTD_POOL_MAX		pool_count
#define	ZSTD_POOL_TIMEOUT	60 * 2
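/*
 * ZSTD_POOL_TIMEOUT is in seconds (60 * 2 = 120, the "2 minutes" mentioned
 * in the pool comments below); zstd_mempool_alloc() adds it to
 * gethrestime_sec() to stamp each slot's expiry time.
 */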
static struct zstd_fallback_mem zstd_dctx_fallback;
static struct zstd_pool *zstd_mempool_cctx;
static struct zstd_pool *zstd_mempool_dctx;
/*
 * The library zstd code expects these if ADDRESS_SANITIZER gets defined,
 * and while ASAN does this, KASAN defines that and does not. So to avoid
 * changing the external code, we do this.
 */
#if defined(ZFS_ASAN_ENABLED)
#define	ADDRESS_SANITIZER	1
#endif
#if defined(_KERNEL) && defined(ADDRESS_SANITIZER)
void __asan_unpoison_memory_region(void const volatile *addr, size_t size);
void __asan_poison_memory_region(void const volatile *addr, size_t size);
void __asan_unpoison_memory_region(void const volatile *addr, size_t size) {};
void __asan_poison_memory_region(void const volatile *addr, size_t size) {};
#endif
static void
zstd_mempool_reap(struct zstd_pool *zstd_mempool)
{
	struct zstd_pool *pool;

	if (!zstd_mempool || !ZSTDSTAT(zstd_stat_buffers)) {
		return;
	}

	/* free obsolete slots */
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		pool = &zstd_mempool[i];
		if (pool->mem && mutex_tryenter(&pool->barrier)) {
			/* Free memory if unused object older than 2 minutes */
			if (pool->mem && gethrestime_sec() > pool->timeout) {
				vmem_free(pool->mem, pool->size);
				ZSTDSTAT_SUB(zstd_stat_buffers, 1);
				ZSTDSTAT_SUB(zstd_stat_size, pool->size);
				pool->mem = NULL;
				pool->size = 0;
				pool->timeout = 0;
			}
			mutex_exit(&pool->barrier);
		}
	}
}
/*
 * Try to get a cached allocated buffer from memory pool or allocate a new one
 * if necessary. If an object is older than 2 minutes and does not fit the
 * requested size, it will be released and a new cached entry will be
 * allocated. If other pooled objects are detected without being used for
 * 2 minutes, they will be released, too.
 *
 * The concept is that high frequency memory allocations of bigger objects are
 * expensive. So if a lot of work is going on, allocations will be kept for a
 * while and can be reused in that time frame.
 *
 * The scheduled release will be updated every time an object is reused.
 */
static void *
zstd_mempool_alloc(struct zstd_pool *zstd_mempool, size_t size)
{
	struct zstd_pool *pool;
	struct zstd_kmem *mem = NULL;

	if (!zstd_mempool) {
		return (NULL);
	}

	/* Seek for preallocated memory slot and free obsolete slots */
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		pool = &zstd_mempool[i];
		/*
		 * This lock is simply a marker for a pool object being in use.
		 * If it's already held, it will be skipped.
		 *
		 * We need to create it before checking it to avoid race
		 * conditions caused by running in a threaded context.
		 *
		 * The lock is later released by zstd_mempool_free.
		 */
		if (mutex_tryenter(&pool->barrier)) {
			/*
			 * Check if the object fits the requested size; if so,
			 * take it and update the timestamp.
			 */
			if (pool->mem && size <= pool->size) {
				pool->timeout = gethrestime_sec() +
				    ZSTD_POOL_TIMEOUT;
				mem = pool->mem;
				return (mem);
			}
			mutex_exit(&pool->barrier);
		}
	}

	/*
	 * If no preallocated slot was found, try to fill in a new one.
	 *
	 * We run a similar algorithm twice here to avoid pool fragmentation.
	 * The first one may generate holes in the list if objects get released.
	 * We always make sure that these holes get filled instead of adding new
	 * allocations constantly at the end.
	 */
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		pool = &zstd_mempool[i];
		if (mutex_tryenter(&pool->barrier)) {
			/* Object is free, try to allocate new one */
			if (!pool->mem) {
				mem = vmem_alloc(size, KM_SLEEP);
				if (mem) {
					ZSTDSTAT_ADD(zstd_stat_buffers, 1);
					ZSTDSTAT_ADD(zstd_stat_size, size);
					pool->mem = mem;
					pool->size = size;
					/* Keep track for later release */
					mem->pool = pool;
					mem->kmem_type = ZSTD_KMEM_POOL;
					mem->kmem_size = size;
				}
			}

			if (size <= pool->size) {
				/* Update timestamp */
				pool->timeout = gethrestime_sec() +
				    ZSTD_POOL_TIMEOUT;

				return (pool->mem);
			}

			mutex_exit(&pool->barrier);
		}
	}

	/*
	 * If the pool is full or the allocation failed, try lazy allocation
	 * instead.
	 */
	if (!mem) {
		mem = vmem_alloc(size, KM_NOSLEEP);
		if (mem) {
			mem->pool = NULL;
			mem->kmem_type = ZSTD_KMEM_DEFAULT;
			mem->kmem_size = size;
		}
	}

	return (mem);
}

/* Mark object as released by releasing the barrier mutex */
static void
zstd_mempool_free(struct zstd_kmem *z)
{
	mutex_exit(&z->pool->barrier);
}
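/*
 * Lifecycle note: a slot handed out by zstd_mempool_alloc() keeps its barrier
 * mutex held for as long as the buffer is in use; zstd_mempool_free() is what
 * drops that mutex and makes the slot (and its cached memory) reusable.
 */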
/* Convert ZFS internal enum to ZSTD level */
static int
zstd_enum_to_level(enum zio_zstd_levels level, int16_t *zstd_level)
{
	if (level > 0 && level <= ZIO_ZSTD_LEVEL_19) {
		*zstd_level = zstd_levels[level - 1].zstd_level;
		return (0);
	}
	if (level >= ZIO_ZSTD_LEVEL_FAST_1 &&
	    level <= ZIO_ZSTD_LEVEL_FAST_1000) {
		*zstd_level = zstd_levels[level - ZIO_ZSTD_LEVEL_FAST_1
		    + ZIO_ZSTD_LEVEL_19].zstd_level;
		return (0);
	}

	/* Invalid/unknown zfs compression enum - this should never happen. */
	return (1);
}
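/*
 * For example, ZIO_ZSTD_LEVEL_5 resolves to its matching positive zstd level
 * through the first half of zstd_levels[], while ZIO_ZSTD_LEVEL_FAST_10
 * resolves to the negative zstd level -10 through the second half of the
 * table.
 */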
/* Compress block using zstd */
static size_t
zfs_zstd_compress_impl(void *s_start, void *d_start, size_t s_len, size_t d_len,
    int level)
{
	size_t c_len;
	int16_t zstd_level;
	zfs_zstdhdr_t *hdr;
	ZSTD_CCtx *cctx;

	hdr = (zfs_zstdhdr_t *)d_start;

	/* Skip compression if the specified level is invalid */
	if (zstd_enum_to_level(level, &zstd_level)) {
		ZSTDSTAT_BUMP(zstd_stat_com_inval);
		return (s_len);
	}

	ASSERT3U(d_len, >=, sizeof (*hdr));
	ASSERT3U(d_len, <=, s_len);
	ASSERT3U(zstd_level, !=, 0);

	cctx = ZSTD_createCCtx_advanced(zstd_malloc);

	/*
	 * Out of kernel memory, gently fall through - this will disable
	 * compression in zio_compress_data
	 */
	if (!cctx) {
		ZSTDSTAT_BUMP(zstd_stat_com_alloc_fail);
		return (s_len);
	}

	/* Set the compression level */
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, zstd_level);

	/* Use the "magicless" zstd header which saves us 4 header bytes */
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_format, ZSTD_f_zstd1_magicless);

	/*
	 * Disable redundant checksum calculation and content size storage
	 * since this is already done by ZFS itself.
	 */
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 0);
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, 0);

	c_len = ZSTD_compress2(cctx,
	    hdr->data,
	    d_len - sizeof (*hdr),
	    s_start, s_len);

	ZSTD_freeCCtx(cctx);

	/* Error in the compression routine, disable compression. */
	if (ZSTD_isError(c_len)) {
		/*
		 * If we are aborting the compression because the savings are
		 * too small, that is not a failure. Everything else is a
		 * failure, so increment the compression failure counter.
		 */
		int err = ZSTD_getErrorCode(c_len);
		if (err != ZSTD_error_dstSize_tooSmall) {
			ZSTDSTAT_BUMP(zstd_stat_com_fail);
			dprintf("Error: %s", ZSTD_getErrorString(err));
		}
		return (s_len);
	}

	/*
	 * Encode the compressed buffer size at the start. We'll need this in
	 * decompression to counter the effects of padding which might be added
	 * to the compressed buffer and which, if unhandled, would confuse the
	 * hell out of our decompression function.
	 */
	hdr->c_len = BE_32(c_len);

	/*
	 * Check version for overflow.
	 * The limit of 24 bits must not be exceeded. This allows a maximum
	 * version 1677.72.15, which we don't expect will ever be reached.
	 */
	ASSERT3U(ZSTD_VERSION_NUMBER, <=, 0xFFFFFF);

	/*
	 * Encode the compression level as well. We may need to know the
	 * original compression level if compressed_arc is disabled, to match
	 * the compression settings to write this block to the L2ARC.
	 *
	 * Encode the actual level, so if the enum changes in the future, we
	 * will be compatible.
	 *
	 * The upper 24 bits store the ZSTD version to be able to provide
	 * future compatibility, since new versions might enhance the
	 * compression algorithm in a way where the compressed data will
	 * change.
	 *
	 * As soon as such an incompatibility occurs, handling code needs to be
	 * added, differentiating between the versions.
	 */
	zfs_set_hdrversion(hdr, ZSTD_VERSION_NUMBER);
	zfs_set_hdrlevel(hdr, level);
	hdr->raw_version_level = BE_32(hdr->raw_version_level);

	return (c_len + sizeof (*hdr));
}
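/*
 * The buffer produced above is a small header followed by the raw frame:
 * a 32-bit big-endian compressed length (c_len), then a 32-bit big-endian
 * word combining the zstd library version (upper 24 bits) and the
 * zio_zstd_levels value (remaining 8 bits), then c_len bytes of magicless
 * zstd data starting at hdr->data.
 */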
static size_t
zfs_zstd_compress_buf(void *s_start, void *d_start, size_t s_len, size_t d_len,
    int level)
{
	int16_t zstd_level;
	if (zstd_enum_to_level(level, &zstd_level)) {
		ZSTDSTAT_BUMP(zstd_stat_com_inval);
		return (s_len);
	}
	/*
	 * A zstd early abort heuristic.
	 *
	 * - Zeroth, if this is <= zstd-3, or < zstd_abort_size (currently
	 *   128k), don't try any of this, just go.
	 *   (because experimentally that was a reasonable cutoff for a perf win
	 *   with tiny ratio change)
	 * - First, we try LZ4 compression, and if it doesn't early abort, we
	 *   jump directly to whatever compression level we intended to try.
	 * - Second, we try zstd-1 - if that errors out (usually, but not
	 *   exclusively, if it would overflow), we give up early.
	 *
	 * If it works, instead we go on and compress anyway.
	 *
	 * Why two passes? LZ4 alone gets you a lot of the way, but on highly
	 * compressible data, it was losing up to 8.5% of the compressed
	 * savings versus no early abort, and all the zstd-fast levels are
	 * worse indications on their own than LZ4, and don't improve the LZ4
	 * pass noticeably if stacked like this.
	 */
	size_t actual_abort_size = zstd_abort_size;
	if (zstd_earlyabort_pass > 0 && zstd_level >= zstd_cutoff_level &&
	    s_len >= actual_abort_size) {
		int pass_len = 1;
		abd_t sabd, dabd;
		abd_get_from_buf_struct(&sabd, s_start, s_len);
		abd_get_from_buf_struct(&dabd, d_start, d_len);
		pass_len = zfs_lz4_compress(&sabd, &dabd, s_len, d_len, 0);
		abd_free(&dabd);
		abd_free(&sabd);
		if (pass_len < d_len) {
			ZSTDSTAT_BUMP(zstd_stat_lz4pass_allowed);
			goto keep_trying;
		}
		ZSTDSTAT_BUMP(zstd_stat_lz4pass_rejected);

		pass_len = zfs_zstd_compress_impl(s_start, d_start, s_len,
		    d_len, ZIO_ZSTD_LEVEL_1);
		if (pass_len == s_len || pass_len <= 0 || pass_len > d_len) {
			ZSTDSTAT_BUMP(zstd_stat_zstdpass_rejected);
			return (s_len);
		}
		ZSTDSTAT_BUMP(zstd_stat_zstdpass_allowed);
	} else {
		ZSTDSTAT_BUMP(zstd_stat_passignored);
		if (s_len < actual_abort_size) {
			ZSTDSTAT_BUMP(zstd_stat_passignored_size);
		}
	}
keep_trying:
	return (zfs_zstd_compress_impl(s_start, d_start, s_len, d_len, level));
}
/* Decompress block using zstd and return its stored level */
static int
zfs_zstd_decompress_level_buf(void *s_start, void *d_start, size_t s_len,
    size_t d_len, uint8_t *level)
{
	ZSTD_DCtx *dctx;
	size_t result;
	int16_t zstd_level;
	uint32_t c_len;
	const zfs_zstdhdr_t *hdr;
	zfs_zstdhdr_t hdr_copy;

	hdr = (const zfs_zstdhdr_t *)s_start;
	c_len = BE_32(hdr->c_len);

	/*
	 * Make a copy instead of directly converting the header, since we must
	 * not modify the original data that may be used again later.
	 */
	hdr_copy.raw_version_level = BE_32(hdr->raw_version_level);
	uint8_t curlevel = zfs_get_hdrlevel(&hdr_copy);

	/*
	 * NOTE: We ignore the ZSTD version for now. As soon as any
	 * incompatibility occurs, it has to be handled accordingly.
	 * The version can be accessed via `hdr_copy.version`.
	 */

	/*
	 * Convert and check the level.
	 * An invalid level is a strong indicator for data corruption! In such
	 * a case, return an error so the upper layers can try to fix it.
	 */
	if (zstd_enum_to_level(curlevel, &zstd_level)) {
		ZSTDSTAT_BUMP(zstd_stat_dec_inval);
		return (1);
	}

	ASSERT3U(d_len, >=, s_len);
	ASSERT3U(curlevel, !=, ZIO_COMPLEVEL_INHERIT);

	/* Invalid compressed buffer size encoded at start */
	if (c_len + sizeof (*hdr) > s_len) {
		ZSTDSTAT_BUMP(zstd_stat_dec_header_inval);
		return (1);
	}

	dctx = ZSTD_createDCtx_advanced(zstd_dctx_malloc);
	if (!dctx) {
		ZSTDSTAT_BUMP(zstd_stat_dec_alloc_fail);
		return (1);
	}

	/* Set header type to "magicless" */
	ZSTD_DCtx_setParameter(dctx, ZSTD_d_format, ZSTD_f_zstd1_magicless);

	/* Decompress the data and release the context */
	result = ZSTD_decompressDCtx(dctx, d_start, d_len, hdr->data, c_len);
	ZSTD_freeDCtx(dctx);

	/*
	 * Returns 0 on success (decompression function returned non-negative)
	 * and non-zero on failure (decompression function returned negative).
	 */
	if (ZSTD_isError(result)) {
		ZSTDSTAT_BUMP(zstd_stat_dec_fail);
		return (1);
	}

	if (level) {
		*level = curlevel;
	}

	return (0);
}
/* Decompress datablock using zstd */
static int
zfs_zstd_decompress_buf(void *s_start, void *d_start, size_t s_len,
    size_t d_len, int level __maybe_unused)
{
	return (zfs_zstd_decompress_level_buf(s_start, d_start, s_len, d_len,
	    NULL));
}

ZFS_COMPRESS_WRAP_DECL(zfs_zstd_compress)
ZFS_DECOMPRESS_WRAP_DECL(zfs_zstd_decompress)
ZFS_DECOMPRESS_LEVEL_WRAP_DECL(zfs_zstd_decompress_level)
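/*
 * Both allocators below reserve room for a struct zstd_kmem bookkeeping
 * header in front of the buffer handed to zstd and return the address just
 * past it; zstd_free() recovers the header again by subtracting
 * sizeof (struct zstd_kmem) from the pointer it is given.
 */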
/* Allocator for zstd compression context using mempool_allocator */
static void *
zstd_alloc(void *opaque __maybe_unused, size_t size)
{
	size_t nbytes = sizeof (struct zstd_kmem) + size;
	struct zstd_kmem *z = NULL;

	z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_cctx, nbytes);

	if (!z) {
		ZSTDSTAT_BUMP(zstd_stat_alloc_fail);
		return (NULL);
	}

	return ((void*)z + (sizeof (struct zstd_kmem)));
}
/*
 * Allocator for zstd decompression context using mempool_allocator with
 * fallback to reserved memory if allocation fails
 */
static void *
zstd_dctx_alloc(void *opaque __maybe_unused, size_t size)
{
	size_t nbytes = sizeof (struct zstd_kmem) + size;
	struct zstd_kmem *z = NULL;
	enum zstd_kmem_type type = ZSTD_KMEM_DEFAULT;

	z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_dctx, nbytes);
	if (!z) {
		/* Try harder, decompression shall not fail */
		z = vmem_alloc(nbytes, KM_SLEEP);
		if (z) {
			z->pool = NULL;
		}
		ZSTDSTAT_BUMP(zstd_stat_alloc_fail);
	} else {
		return ((void*)z + (sizeof (struct zstd_kmem)));
	}

	/* Fallback if everything fails */
	if (!z) {
		/*
		 * Barrier since we can only handle it in a single thread. All
		 * other threads need to wait here until decompression is
		 * completed. zstd_free will release this barrier later.
		 */
		mutex_enter(&zstd_dctx_fallback.barrier);

		z = zstd_dctx_fallback.mem;
		type = ZSTD_KMEM_DCTX;
		ZSTDSTAT_BUMP(zstd_stat_alloc_fallback);
	}

	/* Allocation should always be successful */
	if (!z) {
		return (NULL);
	}

	z->kmem_type = type;
	z->kmem_size = nbytes;

	return ((void*)z + (sizeof (struct zstd_kmem)));
}
/* Free allocated memory by its specific type */
static void
zstd_free(void *opaque __maybe_unused, void *ptr)
{
	struct zstd_kmem *z = (ptr - sizeof (struct zstd_kmem));
	enum zstd_kmem_type type;

	ASSERT3U(z->kmem_type, <, ZSTD_KMEM_COUNT);
	ASSERT3U(z->kmem_type, >, ZSTD_KMEM_UNKNOWN);

	type = z->kmem_type;
	switch (type) {
	case ZSTD_KMEM_DEFAULT:
		vmem_free(z, z->kmem_size);
		break;
	case ZSTD_KMEM_POOL:
		zstd_mempool_free(z);
		break;
	case ZSTD_KMEM_DCTX:
		mutex_exit(&zstd_dctx_fallback.barrier);
		break;
	default:
		break;
	}
}
/* Allocate fallback memory to ensure safe decompression */
static void __init
create_fallback_mem(struct zstd_fallback_mem *mem, size_t size)
{
	mem->mem_size = size;
	mem->mem = vmem_zalloc(mem->mem_size, KM_SLEEP);
	mutex_init(&mem->barrier, NULL, MUTEX_DEFAULT, NULL);
}
/* Initialize memory pool barrier mutexes */
static void __init
zstd_mempool_init(void)
{
	zstd_mempool_cctx =
	    kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);
	zstd_mempool_dctx =
	    kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);

	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		mutex_init(&zstd_mempool_cctx[i].barrier, NULL,
		    MUTEX_DEFAULT, NULL);
		mutex_init(&zstd_mempool_dctx[i].barrier, NULL,
		    MUTEX_DEFAULT, NULL);
	}
}
/* Initialize zstd-related memory handling */
static int __init
zstd_meminit(void)
{
	zstd_mempool_init();

	/*
	 * Estimate the size of the fallback decompression context.
	 * The expected size on x64 with current ZSTD should be about 160 KB.
	 */
	create_fallback_mem(&zstd_dctx_fallback,
	    P2ROUNDUP(ZSTD_estimateDCtxSize() + sizeof (struct zstd_kmem),
	    PAGESIZE));

	return (0);
}
/* Release object from pool and free memory */
static void __exit
release_pool(struct zstd_pool *pool)
{
	mutex_destroy(&pool->barrier);
	vmem_free(pool->mem, pool->size);
	pool->mem = NULL;
	pool->size = 0;
}
/* Release memory pool objects */
static void __exit
zstd_mempool_deinit(void)
{
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		release_pool(&zstd_mempool_cctx[i]);
		release_pool(&zstd_mempool_dctx[i]);
	}

	kmem_free(zstd_mempool_dctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
	kmem_free(zstd_mempool_cctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
	zstd_mempool_dctx = NULL;
	zstd_mempool_cctx = NULL;
}
/* Release unused memory from the pools */
void
zfs_zstd_cache_reap_now(void)
{
	/*
	 * Short-circuit if there are no buffers to begin with.
	 */
	if (ZSTDSTAT(zstd_stat_buffers) == 0)
		return;

	/*
	 * Release old, unused objects from both memory pools.
	 */
	zstd_mempool_reap(zstd_mempool_cctx);
	zstd_mempool_reap(zstd_mempool_dctx);
}
extern int __init
zstd_init(void)
{
	/* Set pool size by using maximum sane thread count * 4 */
	pool_count = (boot_ncpus * 4);
	zstd_meminit();

	/* Initialize kstat */
	zstd_ksp = kstat_create("zfs", 0, "zstd", "misc",
	    KSTAT_TYPE_NAMED, sizeof (zstd_stats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);
	if (zstd_ksp != NULL) {
		zstd_ksp->ks_data = &zstd_stats;
		kstat_install(zstd_ksp);
#ifdef _KERNEL
		zstd_ksp->ks_update = kstat_zstd_update;
#endif
	}

	return (0);
}
extern void __exit
zstd_fini(void)
{
	/* Deinitialize kstat */
	if (zstd_ksp != NULL) {
		kstat_delete(zstd_ksp);
		zstd_ksp = NULL;
	}

	/* Release fallback memory */
	vmem_free(zstd_dctx_fallback.mem, zstd_dctx_fallback.mem_size);
	mutex_destroy(&zstd_dctx_fallback.barrier);

	/* Deinit memory pool */
	zstd_mempool_deinit();
}
#if defined(_KERNEL)
module_init(zstd_init);
module_exit(zstd_fini);
#endif

ZFS_MODULE_PARAM(zfs, zstd_, earlyabort_pass, UINT, ZMOD_RW,
	"Enable early abort attempts when using zstd");
ZFS_MODULE_PARAM(zfs, zstd_, abort_size, UINT, ZMOD_RW,
	"Minimal size of block to attempt early abort");
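/*
 * These surface as the zstd_earlyabort_pass and zstd_abort_size tunables;
 * on Linux builds they typically appear under /sys/module/zfs/parameters/.
 */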