1 // SPDX-License-Identifier: GPL-2.0
3 * Copyright (c) 2016-present, Facebook, Inc.
9 #include <linux/bitmap.h>
10 #include <linux/err.h>
11 #include <linux/init.h>
12 #include <linux/kernel.h>
14 #include <linux/sched/mm.h>
15 #include <linux/pagemap.h>
16 #include <linux/refcount.h>
17 #include <linux/sched.h>
18 #include <linux/slab.h>
19 #include <linux/zstd.h>
22 #include "btrfs_inode.h"
23 #include "compression.h"
26 #define ZSTD_BTRFS_MAX_WINDOWLOG 17
27 #define ZSTD_BTRFS_MAX_INPUT (1 << ZSTD_BTRFS_MAX_WINDOWLOG)
28 #define ZSTD_BTRFS_DEFAULT_LEVEL 3
29 #define ZSTD_BTRFS_MAX_LEVEL 15
30 /* 307s to avoid pathologically clashing with transaction commit */
31 #define ZSTD_BTRFS_RECLAIM_JIFFIES (307 * HZ)
33 static zstd_parameters
zstd_get_btrfs_parameters(unsigned int level
,
36 zstd_parameters params
= zstd_get_params(level
, src_len
);
38 if (params
.cParams
.windowLog
> ZSTD_BTRFS_MAX_WINDOWLOG
)
39 params
.cParams
.windowLog
= ZSTD_BTRFS_MAX_WINDOWLOG
;
40 WARN_ON(src_len
> ZSTD_BTRFS_MAX_INPUT
);
49 unsigned int req_level
;
50 unsigned long last_used
; /* jiffies */
51 struct list_head list
;
52 struct list_head lru_list
;
53 zstd_in_buffer in_buf
;
54 zstd_out_buffer out_buf
;
58 * Zstd Workspace Management
60 * Zstd workspaces have different memory requirements depending on the level.
61 * The zstd workspaces are managed by having individual lists for each level
62 * and a global lru. Forward progress is maintained by protecting a max level
65 * Getting a workspace is done by using the bitmap to identify the levels that
66 * have available workspaces and scans up. This lets us recycle higher level
67 * workspaces because of the monotonic memory guarantee. A workspace's
68 * last_used is only updated if it is being used by the corresponding memory
69 * level. Putting a workspace involves adding it back to the appropriate places
70 * and adding it back to the lru if necessary.
72 * A timer is used to reclaim workspaces if they have not been used for
73 * ZSTD_BTRFS_RECLAIM_JIFFIES. This helps keep only active workspaces around.
74 * The upper bound is provided by the workqueue limit which is 2 (percpu limit).
77 struct zstd_workspace_manager
{
78 const struct btrfs_compress_op
*ops
;
80 struct list_head lru_list
;
81 struct list_head idle_ws
[ZSTD_BTRFS_MAX_LEVEL
];
82 unsigned long active_map
;
83 wait_queue_head_t wait
;
84 struct timer_list timer
;
87 static struct zstd_workspace_manager wsm
;
89 static size_t zstd_ws_mem_sizes
[ZSTD_BTRFS_MAX_LEVEL
];
91 static inline struct workspace
*list_to_workspace(struct list_head
*list
)
93 return container_of(list
, struct workspace
, list
);
96 void zstd_free_workspace(struct list_head
*ws
);
97 struct list_head
*zstd_alloc_workspace(unsigned int level
);
100 * Timer callback to free unused workspaces.
104 * This scans the lru_list and attempts to reclaim any workspace that hasn't
105 * been used for ZSTD_BTRFS_RECLAIM_JIFFIES.
107 * The context is softirq and does not need the _bh locking primitives.
109 static void zstd_reclaim_timer_fn(struct timer_list
*timer
)
111 unsigned long reclaim_threshold
= jiffies
- ZSTD_BTRFS_RECLAIM_JIFFIES
;
112 struct list_head
*pos
, *next
;
114 spin_lock(&wsm
.lock
);
116 if (list_empty(&wsm
.lru_list
)) {
117 spin_unlock(&wsm
.lock
);
121 list_for_each_prev_safe(pos
, next
, &wsm
.lru_list
) {
122 struct workspace
*victim
= container_of(pos
, struct workspace
,
126 if (time_after(victim
->last_used
, reclaim_threshold
))
129 /* workspace is in use */
130 if (victim
->req_level
)
133 level
= victim
->level
;
134 list_del(&victim
->lru_list
);
135 list_del(&victim
->list
);
136 zstd_free_workspace(&victim
->list
);
138 if (list_empty(&wsm
.idle_ws
[level
- 1]))
139 clear_bit(level
- 1, &wsm
.active_map
);
143 if (!list_empty(&wsm
.lru_list
))
144 mod_timer(&wsm
.timer
, jiffies
+ ZSTD_BTRFS_RECLAIM_JIFFIES
);
146 spin_unlock(&wsm
.lock
);
150 * Calculate monotonic memory bounds.
152 * It is possible based on the level configurations that a higher level
153 * workspace uses less memory than a lower level workspace. In order to reuse
154 * workspaces, this must be made a monotonic relationship. This precomputes
155 * the required memory for each level and enforces the monotonicity between
156 * level and memory required.
158 static void zstd_calc_ws_mem_sizes(void)
163 for (level
= 1; level
<= ZSTD_BTRFS_MAX_LEVEL
; level
++) {
164 zstd_parameters params
=
165 zstd_get_btrfs_parameters(level
, ZSTD_BTRFS_MAX_INPUT
);
168 zstd_cstream_workspace_bound(¶ms
.cParams
),
169 zstd_dstream_workspace_bound(ZSTD_BTRFS_MAX_INPUT
));
171 max_size
= max_t(size_t, max_size
, level_size
);
172 zstd_ws_mem_sizes
[level
- 1] = max_size
;
176 void zstd_init_workspace_manager(void)
178 struct list_head
*ws
;
181 zstd_calc_ws_mem_sizes();
183 wsm
.ops
= &btrfs_zstd_compress
;
184 spin_lock_init(&wsm
.lock
);
185 init_waitqueue_head(&wsm
.wait
);
186 timer_setup(&wsm
.timer
, zstd_reclaim_timer_fn
, 0);
188 INIT_LIST_HEAD(&wsm
.lru_list
);
189 for (i
= 0; i
< ZSTD_BTRFS_MAX_LEVEL
; i
++)
190 INIT_LIST_HEAD(&wsm
.idle_ws
[i
]);
192 ws
= zstd_alloc_workspace(ZSTD_BTRFS_MAX_LEVEL
);
195 "BTRFS: cannot preallocate zstd compression workspace\n");
197 set_bit(ZSTD_BTRFS_MAX_LEVEL
- 1, &wsm
.active_map
);
198 list_add(ws
, &wsm
.idle_ws
[ZSTD_BTRFS_MAX_LEVEL
- 1]);
202 void zstd_cleanup_workspace_manager(void)
204 struct workspace
*workspace
;
207 spin_lock_bh(&wsm
.lock
);
208 for (i
= 0; i
< ZSTD_BTRFS_MAX_LEVEL
; i
++) {
209 while (!list_empty(&wsm
.idle_ws
[i
])) {
210 workspace
= container_of(wsm
.idle_ws
[i
].next
,
211 struct workspace
, list
);
212 list_del(&workspace
->list
);
213 list_del(&workspace
->lru_list
);
214 zstd_free_workspace(&workspace
->list
);
217 spin_unlock_bh(&wsm
.lock
);
219 del_timer_sync(&wsm
.timer
);
223 * Find workspace for given level.
225 * @level: compression level
227 * This iterates over the set bits in the active_map beginning at the requested
228 * compression level. This lets us utilize already allocated workspaces before
229 * allocating a new one. If the workspace is of a larger size, it is used, but
230 * the place in the lru_list and last_used times are not updated. This is to
231 * offer the opportunity to reclaim the workspace in favor of allocating an
232 * appropriately sized one in the future.
234 static struct list_head
*zstd_find_workspace(unsigned int level
)
236 struct list_head
*ws
;
237 struct workspace
*workspace
;
240 spin_lock_bh(&wsm
.lock
);
241 for_each_set_bit_from(i
, &wsm
.active_map
, ZSTD_BTRFS_MAX_LEVEL
) {
242 if (!list_empty(&wsm
.idle_ws
[i
])) {
243 ws
= wsm
.idle_ws
[i
].next
;
244 workspace
= list_to_workspace(ws
);
246 /* keep its place if it's a lower level using this */
247 workspace
->req_level
= level
;
248 if (level
== workspace
->level
)
249 list_del(&workspace
->lru_list
);
250 if (list_empty(&wsm
.idle_ws
[i
]))
251 clear_bit(i
, &wsm
.active_map
);
252 spin_unlock_bh(&wsm
.lock
);
256 spin_unlock_bh(&wsm
.lock
);
262 * Zstd get_workspace for level.
264 * @level: compression level
266 * If @level is 0, then any compression level can be used. Therefore, we begin
267 * scanning from 1. We first scan through possible workspaces and then after
268 * attempt to allocate a new workspace. If we fail to allocate one due to
269 * memory pressure, go to sleep waiting for the max level workspace to free up.
271 struct list_head
*zstd_get_workspace(unsigned int level
)
273 struct list_head
*ws
;
274 unsigned int nofs_flag
;
276 /* level == 0 means we can use any workspace */
281 ws
= zstd_find_workspace(level
);
285 nofs_flag
= memalloc_nofs_save();
286 ws
= zstd_alloc_workspace(level
);
287 memalloc_nofs_restore(nofs_flag
);
292 prepare_to_wait(&wsm
.wait
, &wait
, TASK_UNINTERRUPTIBLE
);
294 finish_wait(&wsm
.wait
, &wait
);
303 * Zstd put_workspace.
305 * @ws: list_head for the workspace
307 * When putting back a workspace, we only need to update the LRU if we are of
308 * the requested compression level. Here is where we continue to protect the
309 * max level workspace or update last_used accordingly. If the reclaim timer
310 * isn't set, it is also set here. Only the max level workspace tries and wakes
311 * up waiting workspaces.
313 void zstd_put_workspace(struct list_head
*ws
)
315 struct workspace
*workspace
= list_to_workspace(ws
);
317 spin_lock_bh(&wsm
.lock
);
319 /* A node is only taken off the lru if we are the corresponding level */
320 if (workspace
->req_level
== workspace
->level
) {
321 /* Hide a max level workspace from reclaim */
322 if (list_empty(&wsm
.idle_ws
[ZSTD_BTRFS_MAX_LEVEL
- 1])) {
323 INIT_LIST_HEAD(&workspace
->lru_list
);
325 workspace
->last_used
= jiffies
;
326 list_add(&workspace
->lru_list
, &wsm
.lru_list
);
327 if (!timer_pending(&wsm
.timer
))
328 mod_timer(&wsm
.timer
,
329 jiffies
+ ZSTD_BTRFS_RECLAIM_JIFFIES
);
333 set_bit(workspace
->level
- 1, &wsm
.active_map
);
334 list_add(&workspace
->list
, &wsm
.idle_ws
[workspace
->level
- 1]);
335 workspace
->req_level
= 0;
337 spin_unlock_bh(&wsm
.lock
);
339 if (workspace
->level
== ZSTD_BTRFS_MAX_LEVEL
)
340 cond_wake_up(&wsm
.wait
);
343 void zstd_free_workspace(struct list_head
*ws
)
345 struct workspace
*workspace
= list_entry(ws
, struct workspace
, list
);
347 kvfree(workspace
->mem
);
348 kfree(workspace
->buf
);
352 struct list_head
*zstd_alloc_workspace(unsigned int level
)
354 struct workspace
*workspace
;
356 workspace
= kzalloc(sizeof(*workspace
), GFP_KERNEL
);
358 return ERR_PTR(-ENOMEM
);
360 workspace
->size
= zstd_ws_mem_sizes
[level
- 1];
361 workspace
->level
= level
;
362 workspace
->req_level
= level
;
363 workspace
->last_used
= jiffies
;
364 workspace
->mem
= kvmalloc(workspace
->size
, GFP_KERNEL
| __GFP_NOWARN
);
365 workspace
->buf
= kmalloc(PAGE_SIZE
, GFP_KERNEL
);
366 if (!workspace
->mem
|| !workspace
->buf
)
369 INIT_LIST_HEAD(&workspace
->list
);
370 INIT_LIST_HEAD(&workspace
->lru_list
);
372 return &workspace
->list
;
374 zstd_free_workspace(&workspace
->list
);
375 return ERR_PTR(-ENOMEM
);
378 int zstd_compress_folios(struct list_head
*ws
, struct address_space
*mapping
,
379 u64 start
, struct folio
**folios
, unsigned long *out_folios
,
380 unsigned long *total_in
, unsigned long *total_out
)
382 struct workspace
*workspace
= list_entry(ws
, struct workspace
, list
);
383 zstd_cstream
*stream
;
386 struct folio
*in_folio
= NULL
; /* The current folio to read. */
387 struct folio
*out_folio
= NULL
; /* The current folio to write to. */
388 unsigned long tot_in
= 0;
389 unsigned long tot_out
= 0;
390 unsigned long len
= *total_out
;
391 const unsigned long nr_dest_folios
= *out_folios
;
392 const u64 orig_end
= start
+ len
;
393 unsigned long max_out
= nr_dest_folios
* PAGE_SIZE
;
395 unsigned int cur_len
;
396 zstd_parameters params
= zstd_get_btrfs_parameters(workspace
->req_level
,
403 /* Initialize the stream */
404 stream
= zstd_init_cstream(¶ms
, len
, workspace
->mem
,
406 if (unlikely(!stream
)) {
407 struct btrfs_inode
*inode
= BTRFS_I(mapping
->host
);
409 btrfs_err(inode
->root
->fs_info
,
410 "zstd compression init level %d failed, root %llu inode %llu offset %llu",
411 workspace
->req_level
, btrfs_root_id(inode
->root
),
412 btrfs_ino(inode
), start
);
417 /* map in the first page of input data */
418 ret
= btrfs_compress_filemap_get_folio(mapping
, start
, &in_folio
);
421 pg_off
= offset_in_page(start
);
422 cur_len
= btrfs_calc_input_length(orig_end
, start
);
423 workspace
->in_buf
.src
= kmap_local_folio(in_folio
, pg_off
);
424 workspace
->in_buf
.pos
= 0;
425 workspace
->in_buf
.size
= cur_len
;
427 /* Allocate and map in the output buffer */
428 out_folio
= btrfs_alloc_compr_folio();
429 if (out_folio
== NULL
) {
433 folios
[nr_folios
++] = out_folio
;
434 workspace
->out_buf
.dst
= folio_address(out_folio
);
435 workspace
->out_buf
.pos
= 0;
436 workspace
->out_buf
.size
= min_t(size_t, max_out
, PAGE_SIZE
);
441 ret2
= zstd_compress_stream(stream
, &workspace
->out_buf
,
443 if (unlikely(zstd_is_error(ret2
))) {
444 struct btrfs_inode
*inode
= BTRFS_I(mapping
->host
);
446 btrfs_warn(inode
->root
->fs_info
,
447 "zstd compression level %d failed, error %d root %llu inode %llu offset %llu",
448 workspace
->req_level
, zstd_get_error_code(ret2
),
449 btrfs_root_id(inode
->root
), btrfs_ino(inode
),
455 /* Check to see if we are making it bigger */
456 if (tot_in
+ workspace
->in_buf
.pos
> 8192 &&
457 tot_in
+ workspace
->in_buf
.pos
<
458 tot_out
+ workspace
->out_buf
.pos
) {
463 /* We've reached the end of our output range */
464 if (workspace
->out_buf
.pos
>= max_out
) {
465 tot_out
+= workspace
->out_buf
.pos
;
470 /* Check if we need more output space */
471 if (workspace
->out_buf
.pos
== workspace
->out_buf
.size
) {
472 tot_out
+= PAGE_SIZE
;
473 max_out
-= PAGE_SIZE
;
474 if (nr_folios
== nr_dest_folios
) {
478 out_folio
= btrfs_alloc_compr_folio();
479 if (out_folio
== NULL
) {
483 folios
[nr_folios
++] = out_folio
;
484 workspace
->out_buf
.dst
= folio_address(out_folio
);
485 workspace
->out_buf
.pos
= 0;
486 workspace
->out_buf
.size
= min_t(size_t, max_out
,
490 /* We've reached the end of the input */
491 if (workspace
->in_buf
.pos
>= len
) {
492 tot_in
+= workspace
->in_buf
.pos
;
496 /* Check if we need more input */
497 if (workspace
->in_buf
.pos
== workspace
->in_buf
.size
) {
499 kunmap_local(workspace
->in_buf
.src
);
500 workspace
->in_buf
.src
= NULL
;
504 ret
= btrfs_compress_filemap_get_folio(mapping
, start
, &in_folio
);
507 pg_off
= offset_in_page(start
);
508 cur_len
= btrfs_calc_input_length(orig_end
, start
);
509 workspace
->in_buf
.src
= kmap_local_folio(in_folio
, pg_off
);
510 workspace
->in_buf
.pos
= 0;
511 workspace
->in_buf
.size
= cur_len
;
517 ret2
= zstd_end_stream(stream
, &workspace
->out_buf
);
518 if (unlikely(zstd_is_error(ret2
))) {
519 struct btrfs_inode
*inode
= BTRFS_I(mapping
->host
);
521 btrfs_err(inode
->root
->fs_info
,
522 "zstd compression end level %d failed, error %d root %llu inode %llu offset %llu",
523 workspace
->req_level
, zstd_get_error_code(ret2
),
524 btrfs_root_id(inode
->root
), btrfs_ino(inode
),
530 tot_out
+= workspace
->out_buf
.pos
;
533 if (workspace
->out_buf
.pos
>= max_out
) {
534 tot_out
+= workspace
->out_buf
.pos
;
539 tot_out
+= PAGE_SIZE
;
540 max_out
-= PAGE_SIZE
;
541 if (nr_folios
== nr_dest_folios
) {
545 out_folio
= btrfs_alloc_compr_folio();
546 if (out_folio
== NULL
) {
550 folios
[nr_folios
++] = out_folio
;
551 workspace
->out_buf
.dst
= folio_address(out_folio
);
552 workspace
->out_buf
.pos
= 0;
553 workspace
->out_buf
.size
= min_t(size_t, max_out
, PAGE_SIZE
);
556 if (tot_out
>= tot_in
) {
563 *total_out
= tot_out
;
565 *out_folios
= nr_folios
;
566 if (workspace
->in_buf
.src
) {
567 kunmap_local(workspace
->in_buf
.src
);
573 int zstd_decompress_bio(struct list_head
*ws
, struct compressed_bio
*cb
)
575 struct workspace
*workspace
= list_entry(ws
, struct workspace
, list
);
576 struct folio
**folios_in
= cb
->compressed_folios
;
577 size_t srclen
= cb
->compressed_len
;
578 zstd_dstream
*stream
;
580 unsigned long folio_in_index
= 0;
581 unsigned long total_folios_in
= DIV_ROUND_UP(srclen
, PAGE_SIZE
);
582 unsigned long buf_start
;
583 unsigned long total_out
= 0;
585 stream
= zstd_init_dstream(
586 ZSTD_BTRFS_MAX_INPUT
, workspace
->mem
, workspace
->size
);
587 if (unlikely(!stream
)) {
588 struct btrfs_inode
*inode
= cb
->bbio
.inode
;
590 btrfs_err(inode
->root
->fs_info
,
591 "zstd decompression init failed, root %llu inode %llu offset %llu",
592 btrfs_root_id(inode
->root
), btrfs_ino(inode
), cb
->start
);
597 workspace
->in_buf
.src
= kmap_local_folio(folios_in
[folio_in_index
], 0);
598 workspace
->in_buf
.pos
= 0;
599 workspace
->in_buf
.size
= min_t(size_t, srclen
, PAGE_SIZE
);
601 workspace
->out_buf
.dst
= workspace
->buf
;
602 workspace
->out_buf
.pos
= 0;
603 workspace
->out_buf
.size
= PAGE_SIZE
;
608 ret2
= zstd_decompress_stream(stream
, &workspace
->out_buf
,
610 if (unlikely(zstd_is_error(ret2
))) {
611 struct btrfs_inode
*inode
= cb
->bbio
.inode
;
613 btrfs_err(inode
->root
->fs_info
,
614 "zstd decompression failed, error %d root %llu inode %llu offset %llu",
615 zstd_get_error_code(ret2
), btrfs_root_id(inode
->root
),
616 btrfs_ino(inode
), cb
->start
);
620 buf_start
= total_out
;
621 total_out
+= workspace
->out_buf
.pos
;
622 workspace
->out_buf
.pos
= 0;
624 ret
= btrfs_decompress_buf2page(workspace
->out_buf
.dst
,
625 total_out
- buf_start
, cb
, buf_start
);
629 if (workspace
->in_buf
.pos
>= srclen
)
632 /* Check if we've hit the end of a frame */
636 if (workspace
->in_buf
.pos
== workspace
->in_buf
.size
) {
637 kunmap_local(workspace
->in_buf
.src
);
639 if (folio_in_index
>= total_folios_in
) {
640 workspace
->in_buf
.src
= NULL
;
645 workspace
->in_buf
.src
=
646 kmap_local_folio(folios_in
[folio_in_index
], 0);
647 workspace
->in_buf
.pos
= 0;
648 workspace
->in_buf
.size
= min_t(size_t, srclen
, PAGE_SIZE
);
653 if (workspace
->in_buf
.src
)
654 kunmap_local(workspace
->in_buf
.src
);
658 int zstd_decompress(struct list_head
*ws
, const u8
*data_in
,
659 struct folio
*dest_folio
, unsigned long dest_pgoff
, size_t srclen
,
662 struct workspace
*workspace
= list_entry(ws
, struct workspace
, list
);
663 struct btrfs_fs_info
*fs_info
= btrfs_sb(folio_inode(dest_folio
)->i_sb
);
664 const u32 sectorsize
= fs_info
->sectorsize
;
665 zstd_dstream
*stream
;
667 unsigned long to_copy
= 0;
669 stream
= zstd_init_dstream(
670 ZSTD_BTRFS_MAX_INPUT
, workspace
->mem
, workspace
->size
);
671 if (unlikely(!stream
)) {
672 struct btrfs_inode
*inode
= folio_to_inode(dest_folio
);
674 btrfs_err(inode
->root
->fs_info
,
675 "zstd decompression init failed, root %llu inode %llu offset %llu",
676 btrfs_root_id(inode
->root
), btrfs_ino(inode
),
677 folio_pos(dest_folio
));
682 workspace
->in_buf
.src
= data_in
;
683 workspace
->in_buf
.pos
= 0;
684 workspace
->in_buf
.size
= srclen
;
686 workspace
->out_buf
.dst
= workspace
->buf
;
687 workspace
->out_buf
.pos
= 0;
688 workspace
->out_buf
.size
= sectorsize
;
691 * Since both input and output buffers should not exceed one sector,
692 * one call should end the decompression.
694 ret
= zstd_decompress_stream(stream
, &workspace
->out_buf
, &workspace
->in_buf
);
695 if (unlikely(zstd_is_error(ret
))) {
696 struct btrfs_inode
*inode
= folio_to_inode(dest_folio
);
698 btrfs_err(inode
->root
->fs_info
,
699 "zstd decompression failed, error %d root %llu inode %llu offset %llu",
700 zstd_get_error_code(ret
), btrfs_root_id(inode
->root
),
701 btrfs_ino(inode
), folio_pos(dest_folio
));
704 to_copy
= workspace
->out_buf
.pos
;
705 memcpy_to_folio(dest_folio
, dest_pgoff
, workspace
->out_buf
.dst
, to_copy
);
707 /* Error or early end. */
708 if (unlikely(to_copy
< destlen
)) {
710 folio_zero_range(dest_folio
, dest_pgoff
+ to_copy
, destlen
- to_copy
);
715 const struct btrfs_compress_op btrfs_zstd_compress
= {
716 /* ZSTD uses own workspace manager */
717 .workspace_manager
= NULL
,
718 .max_level
= ZSTD_BTRFS_MAX_LEVEL
,
719 .default_level
= ZSTD_BTRFS_DEFAULT_LEVEL
,