// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2016-present, Facebook, Inc.
 * All rights reserved.
 */

#include <linux/bio.h>
#include <linux/bitmap.h>
#include <linux/err.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/pagemap.h>
#include <linux/refcount.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/zstd.h>
#include "misc.h"
#include "compression.h"
#include "ctree.h"

#define ZSTD_BTRFS_MAX_WINDOWLOG 17
#define ZSTD_BTRFS_MAX_INPUT (1 << ZSTD_BTRFS_MAX_WINDOWLOG)
#define ZSTD_BTRFS_DEFAULT_LEVEL 3
#define ZSTD_BTRFS_MAX_LEVEL 15
/* 307s to avoid pathologically clashing with transaction commit */
#define ZSTD_BTRFS_RECLAIM_JIFFIES (307 * HZ)
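/*
 * Added note: 307 is prime, so (presumably) the reclaim period cannot stay
 * phase-locked with the default 30s transaction commit interval.
 */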
static ZSTD_parameters zstd_get_btrfs_parameters(unsigned int level,
						 size_t src_len)
{
	ZSTD_parameters params = ZSTD_getParams(level, src_len, 0);

	if (params.cParams.windowLog > ZSTD_BTRFS_MAX_WINDOWLOG)
		params.cParams.windowLog = ZSTD_BTRFS_MAX_WINDOWLOG;
	WARN_ON(src_len > ZSTD_BTRFS_MAX_INPUT);
	return params;
}
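/*
 * Illustrative note (an assumption about zstd defaults, not taken from the
 * original comments): at higher levels ZSTD_getParams() may choose a
 * windowLog above 17, so the clamp above keeps every stream within the 128K
 * window that the decompression workspace bounds below are sized for.
 */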
struct workspace {
	void *mem;
	size_t size;
	char *buf;
	unsigned int level;
	unsigned int req_level;
	unsigned long last_used; /* jiffies */
	struct list_head list;
	struct list_head lru_list;
	ZSTD_inBuffer in_buf;
	ZSTD_outBuffer out_buf;
};
/*
 * Zstd Workspace Management
 *
 * Zstd workspaces have different memory requirements depending on the level.
 * The zstd workspaces are managed by having individual lists for each level
 * and a global lru.  Forward progress is maintained by protecting a max
 * level workspace.
 *
 * Getting a workspace is done by using the bitmap to identify the levels that
 * have available workspaces and scans up.  This lets us recycle higher level
 * workspaces because of the monotonic memory guarantee.  A workspace's
 * last_used is only updated if it is being used by the corresponding memory
 * level.  Putting a workspace involves adding it back to the appropriate
 * places and adding it back to the lru if necessary.
 *
 * A timer is used to reclaim workspaces if they have not been used for
 * ZSTD_BTRFS_RECLAIM_JIFFIES.  This helps keep only active workspaces around.
 * The upper bound is provided by the workqueue limit which is 2 (percpu limit).
 */
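/*
 * Illustrative flow (a sketch inferred from the functions below, with
 * made-up level numbers):
 *
 *	ws = zstd_get_workspace(3);	// may reuse an idle level-7 workspace
 *	...				// compress using ws
 *	zstd_put_workspace(ws);		// re-idled under its own level, 7
 */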
struct zstd_workspace_manager {
	const struct btrfs_compress_op *ops;
	spinlock_t lock;
	struct list_head lru_list;
	struct list_head idle_ws[ZSTD_BTRFS_MAX_LEVEL];
	unsigned long active_map;
	wait_queue_head_t wait;
	struct timer_list timer;
};

static struct zstd_workspace_manager wsm;

static size_t zstd_ws_mem_sizes[ZSTD_BTRFS_MAX_LEVEL];

static inline struct workspace *list_to_workspace(struct list_head *list)
{
	return container_of(list, struct workspace, list);
}

void zstd_free_workspace(struct list_head *ws);
struct list_head *zstd_alloc_workspace(unsigned int level);
/*
 * zstd_reclaim_timer_fn - reclaim timer
 * @timer: timer
 *
 * This scans the lru_list and attempts to reclaim any workspace that hasn't
 * been used for ZSTD_BTRFS_RECLAIM_JIFFIES.
 */
static void zstd_reclaim_timer_fn(struct timer_list *timer)
{
	unsigned long reclaim_threshold = jiffies - ZSTD_BTRFS_RECLAIM_JIFFIES;
	struct list_head *pos, *next;

	spin_lock_bh(&wsm.lock);

	if (list_empty(&wsm.lru_list)) {
		spin_unlock_bh(&wsm.lock);
		return;
	}

	list_for_each_prev_safe(pos, next, &wsm.lru_list) {
		struct workspace *victim = container_of(pos, struct workspace,
							lru_list);
		unsigned int level;

		if (time_after(victim->last_used, reclaim_threshold))
			break;

		/* workspace is in use */
		if (victim->req_level)
			continue;

		level = victim->level;
		list_del(&victim->lru_list);
		list_del(&victim->list);
		zstd_free_workspace(&victim->list);

		if (list_empty(&wsm.idle_ws[level - 1]))
			clear_bit(level - 1, &wsm.active_map);
	}

	if (!list_empty(&wsm.lru_list))
		mod_timer(&wsm.timer, jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES);

	spin_unlock_bh(&wsm.lock);
}
/*
 * zstd_calc_ws_mem_sizes - calculate monotonic memory bounds
 *
 * It is possible based on the level configurations that a higher level
 * workspace uses less memory than a lower level workspace.  In order to reuse
 * workspaces, this must be made a monotonic relationship.  This precomputes
 * the required memory for each level and enforces the monotonicity between
 * level and memory required.
 */
static void zstd_calc_ws_mem_sizes(void)
{
	size_t max_size = 0;
	unsigned int level;

	for (level = 1; level <= ZSTD_BTRFS_MAX_LEVEL; level++) {
		ZSTD_parameters params =
			zstd_get_btrfs_parameters(level, ZSTD_BTRFS_MAX_INPUT);
		size_t level_size =
			max_t(size_t,
			      ZSTD_CStreamWorkspaceBound(params.cParams),
			      ZSTD_DStreamWorkspaceBound(ZSTD_BTRFS_MAX_INPUT));

		max_size = max_t(size_t, max_size, level_size);
		zstd_ws_mem_sizes[level - 1] = max_size;
	}
}
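/*
 * Worked example with made-up numbers: if the raw bound for level 4 came out
 * at 1.0MB while level 3 needed 1.1MB, max_size carries the 1.1MB forward,
 * so zstd_ws_mem_sizes[] never decreases with level and any level-N
 * workspace can serve a request for any level <= N.
 */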
void zstd_init_workspace_manager(void)
{
	struct list_head *ws;
	int i;

	zstd_calc_ws_mem_sizes();

	wsm.ops = &btrfs_zstd_compress;
	spin_lock_init(&wsm.lock);
	init_waitqueue_head(&wsm.wait);
	timer_setup(&wsm.timer, zstd_reclaim_timer_fn, 0);

	INIT_LIST_HEAD(&wsm.lru_list);
	for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++)
		INIT_LIST_HEAD(&wsm.idle_ws[i]);

	ws = zstd_alloc_workspace(ZSTD_BTRFS_MAX_LEVEL);
	if (IS_ERR(ws)) {
		pr_warn("BTRFS: cannot preallocate zstd compression workspace\n");
	} else {
		set_bit(ZSTD_BTRFS_MAX_LEVEL - 1, &wsm.active_map);
		list_add(ws, &wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1]);
	}
}
void zstd_cleanup_workspace_manager(void)
{
	struct workspace *workspace;
	int i;

	spin_lock_bh(&wsm.lock);
	for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++) {
		while (!list_empty(&wsm.idle_ws[i])) {
			workspace = container_of(wsm.idle_ws[i].next,
						 struct workspace, list);
			list_del(&workspace->list);
			list_del(&workspace->lru_list);
			zstd_free_workspace(&workspace->list);
		}
	}
	spin_unlock_bh(&wsm.lock);

	del_timer_sync(&wsm.timer);
}
/*
 * zstd_find_workspace - find workspace
 * @level: compression level
 *
 * This iterates over the set bits in the active_map beginning at the requested
 * compression level.  This lets us utilize already allocated workspaces before
 * allocating a new one.  If the workspace is of a larger size, it is used, but
 * the place in the lru_list and last_used times are not updated.  This is to
 * offer the opportunity to reclaim the workspace in favor of allocating an
 * appropriately sized one in the future.
 */
static struct list_head *zstd_find_workspace(unsigned int level)
{
	struct list_head *ws;
	struct workspace *workspace;
	int i = level - 1;

	spin_lock_bh(&wsm.lock);
	for_each_set_bit_from(i, &wsm.active_map, ZSTD_BTRFS_MAX_LEVEL) {
		if (!list_empty(&wsm.idle_ws[i])) {
			ws = wsm.idle_ws[i].next;
			workspace = list_to_workspace(ws);
			list_del_init(ws);
			/* keep its place if it's a lower level using this */
			workspace->req_level = level;
			if (level == workspace->level)
				list_del(&workspace->lru_list);
			if (list_empty(&wsm.idle_ws[i]))
				clear_bit(i, &wsm.active_map);
			spin_unlock_bh(&wsm.lock);
			return ws;
		}
	}
	spin_unlock_bh(&wsm.lock);

	return NULL;
}
/*
 * zstd_get_workspace - zstd's get_workspace
 * @level: compression level
 *
 * If @level is 0, then any compression level can be used.  Therefore, we begin
 * scanning from 1.  We first scan through possible workspaces and only then
 * attempt to allocate a new workspace.  If we fail to allocate one due to
 * memory pressure, go to sleep waiting for the max level workspace to free up.
 */
struct list_head *zstd_get_workspace(unsigned int level)
{
	struct list_head *ws;
	unsigned int nofs_flag;

	/* level == 0 means we can use any workspace */
	if (!level)
		level = 1;

again:
	ws = zstd_find_workspace(level);
	if (ws)
		return ws;

	nofs_flag = memalloc_nofs_save();
	ws = zstd_alloc_workspace(level);
	memalloc_nofs_restore(nofs_flag);

	if (IS_ERR(ws)) {
		DEFINE_WAIT(wait);

		prepare_to_wait(&wsm.wait, &wait, TASK_UNINTERRUPTIBLE);
		schedule();
		finish_wait(&wsm.wait, &wait);

		goto again;
	}

	return ws;
}
/*
 * zstd_put_workspace - zstd put_workspace
 * @ws: list_head for the workspace
 *
 * When putting back a workspace, we only need to update the LRU if we are of
 * the requested compression level.  Here is where we continue to protect the
 * max level workspace or update last_used accordingly.  If the reclaim timer
 * isn't set, it is also set here.  Only the max level workspace tries and
 * wakes up waiting workspaces.
 */
void zstd_put_workspace(struct list_head *ws)
{
	struct workspace *workspace = list_to_workspace(ws);

	spin_lock_bh(&wsm.lock);

	/* A node is only taken off the lru if we are the corresponding level */
	if (workspace->req_level == workspace->level) {
		/* Hide a max level workspace from reclaim */
		if (list_empty(&wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1])) {
			INIT_LIST_HEAD(&workspace->lru_list);
		} else {
			workspace->last_used = jiffies;
			list_add(&workspace->lru_list, &wsm.lru_list);
			if (!timer_pending(&wsm.timer))
				mod_timer(&wsm.timer,
					  jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES);
		}
	}

	set_bit(workspace->level - 1, &wsm.active_map);
	list_add(&workspace->list, &wsm.idle_ws[workspace->level - 1]);
	workspace->req_level = 0;

	spin_unlock_bh(&wsm.lock);

	if (workspace->level == ZSTD_BTRFS_MAX_LEVEL)
		cond_wake_up(&wsm.wait);
}
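/*
 * Added note: the wakeup above pairs with the TASK_UNINTERRUPTIBLE sleep in
 * zstd_get_workspace().  Only a returning max level workspace wakes sleepers,
 * which is why one is preallocated at init and shielded from reclaim above.
 */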
void zstd_free_workspace(struct list_head *ws)
{
	struct workspace *workspace = list_entry(ws, struct workspace, list);

	kvfree(workspace->mem);
	kfree(workspace->buf);
	kfree(workspace);
}
struct list_head *zstd_alloc_workspace(unsigned int level)
{
	struct workspace *workspace;

	workspace = kzalloc(sizeof(*workspace), GFP_KERNEL);
	if (!workspace)
		return ERR_PTR(-ENOMEM);

	workspace->size = zstd_ws_mem_sizes[level - 1];
	workspace->level = level;
	workspace->req_level = level;
	workspace->last_used = jiffies;
	workspace->mem = kvmalloc(workspace->size, GFP_KERNEL);
	workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
	if (!workspace->mem || !workspace->buf)
		goto fail;

	INIT_LIST_HEAD(&workspace->list);
	INIT_LIST_HEAD(&workspace->lru_list);

	return &workspace->list;
fail:
	zstd_free_workspace(&workspace->list);
	return ERR_PTR(-ENOMEM);
}
int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
		u64 start, struct page **pages, unsigned long *out_pages,
		unsigned long *total_in, unsigned long *total_out)
{
	struct workspace *workspace = list_entry(ws, struct workspace, list);
	ZSTD_CStream *stream;
	int ret = 0;
	int nr_pages = 0;
	struct page *in_page = NULL;  /* The current page to read */
	struct page *out_page = NULL; /* The current page to write to */
	unsigned long tot_in = 0;
	unsigned long tot_out = 0;
	unsigned long len = *total_out;
	const unsigned long nr_dest_pages = *out_pages;
	unsigned long max_out = nr_dest_pages * PAGE_SIZE;
	ZSTD_parameters params = zstd_get_btrfs_parameters(workspace->req_level,
							   len);

	*out_pages = 0;
	*total_out = 0;
	*total_in = 0;
	/* Initialize the stream */
	stream = ZSTD_initCStream(params, len, workspace->mem,
			workspace->size);
	if (!stream) {
		pr_warn("BTRFS: ZSTD_initCStream failed\n");
		ret = -EIO;
		goto out;
	}

	/* map in the first page of input data */
	in_page = find_get_page(mapping, start >> PAGE_SHIFT);
	workspace->in_buf.src = kmap(in_page);
	workspace->in_buf.pos = 0;
	workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE);

	/* Allocate and map in the output buffer */
	out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
	if (out_page == NULL) {
		ret = -ENOMEM;
		goto out;
	}
	pages[nr_pages++] = out_page;
	workspace->out_buf.dst = kmap(out_page);
	workspace->out_buf.pos = 0;
	workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
	while (1) {
		size_t ret2;

		ret2 = ZSTD_compressStream(stream, &workspace->out_buf,
				&workspace->in_buf);
		if (ZSTD_isError(ret2)) {
			pr_debug("BTRFS: ZSTD_compressStream returned %d\n",
					ZSTD_getErrorCode(ret2));
			ret = -EIO;
			goto out;
		}

		/* Check to see if we are making it bigger */
		if (tot_in + workspace->in_buf.pos > 8192 &&
				tot_in + workspace->in_buf.pos <
				tot_out + workspace->out_buf.pos) {
			ret = -E2BIG;
			goto out;
		}
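		/*
		 * (Added reading of the check above: the 8192 byte floor
		 * gives the stream a chance to amortize frame overhead
		 * before we declare the data incompressible.)
		 */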
		/* We've reached the end of our output range */
		if (workspace->out_buf.pos >= max_out) {
			tot_out += workspace->out_buf.pos;
			ret = -E2BIG;
			goto out;
		}

		/* Check if we need more output space */
		if (workspace->out_buf.pos == workspace->out_buf.size) {
			tot_out += PAGE_SIZE;
			max_out -= PAGE_SIZE;
			kunmap(out_page);
			if (nr_pages == nr_dest_pages) {
				out_page = NULL;
				ret = -E2BIG;
				goto out;
			}
			out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
			if (out_page == NULL) {
				ret = -ENOMEM;
				goto out;
			}
			pages[nr_pages++] = out_page;
			workspace->out_buf.dst = kmap(out_page);
			workspace->out_buf.pos = 0;
			workspace->out_buf.size = min_t(size_t, max_out,
							PAGE_SIZE);
		}

		/* We've reached the end of the input */
		if (workspace->in_buf.pos >= len) {
			tot_in += workspace->in_buf.pos;
			break;
		}

		/* Check if we need more input */
		if (workspace->in_buf.pos == workspace->in_buf.size) {
			tot_in += PAGE_SIZE;
			kunmap(in_page);
			put_page(in_page);

			start += PAGE_SIZE;
			len -= PAGE_SIZE;
			in_page = find_get_page(mapping, start >> PAGE_SHIFT);
			workspace->in_buf.src = kmap(in_page);
			workspace->in_buf.pos = 0;
			workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE);
		}
	}
	while (1) {
		size_t ret2;

		ret2 = ZSTD_endStream(stream, &workspace->out_buf);
		if (ZSTD_isError(ret2)) {
			pr_debug("BTRFS: ZSTD_endStream returned %d\n",
					ZSTD_getErrorCode(ret2));
			ret = -EIO;
			goto out;
		}
		if (ret2 == 0) {
			tot_out += workspace->out_buf.pos;
			break;
		}
		if (workspace->out_buf.pos >= max_out) {
			tot_out += workspace->out_buf.pos;
			ret = -E2BIG;
			goto out;
		}

		tot_out += PAGE_SIZE;
		max_out -= PAGE_SIZE;
		kunmap(out_page);
		if (nr_pages == nr_dest_pages) {
			out_page = NULL;
			ret = -E2BIG;
			goto out;
		}
		out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
		if (out_page == NULL) {
			ret = -ENOMEM;
			goto out;
		}
		pages[nr_pages++] = out_page;
		workspace->out_buf.dst = kmap(out_page);
		workspace->out_buf.pos = 0;
		workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
	}

	if (tot_out >= tot_in) {
		ret = -E2BIG;
		goto out;
	}

	ret = 0;
	*total_in = tot_in;
	*total_out = tot_out;
out:
	*out_pages = nr_pages;
	/* Cleanup */
	if (in_page) {
		kunmap(in_page);
		put_page(in_page);
	}
	if (out_page)
		kunmap(out_page);
	return ret;
}
int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
{
	struct workspace *workspace = list_entry(ws, struct workspace, list);
	struct page **pages_in = cb->compressed_pages;
	u64 disk_start = cb->start;
	struct bio *orig_bio = cb->orig_bio;
	size_t srclen = cb->compressed_len;
	ZSTD_DStream *stream;
	int ret = 0;
	unsigned long page_in_index = 0;
	unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
	unsigned long buf_start;
	unsigned long total_out = 0;

	stream = ZSTD_initDStream(
			ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size);
	if (!stream) {
		pr_debug("BTRFS: ZSTD_initDStream failed\n");
		ret = -EIO;
		goto done;
	}

	workspace->in_buf.src = kmap(pages_in[page_in_index]);
	workspace->in_buf.pos = 0;
	workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);

	workspace->out_buf.dst = workspace->buf;
	workspace->out_buf.pos = 0;
	workspace->out_buf.size = PAGE_SIZE;
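	/*
	 * Output is staged one page at a time in the preallocated
	 * workspace->buf and copied into the bio's pages by
	 * btrfs_decompress_buf2page() in the loop below.
	 */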
	while (1) {
		size_t ret2;

		ret2 = ZSTD_decompressStream(stream, &workspace->out_buf,
				&workspace->in_buf);
		if (ZSTD_isError(ret2)) {
			pr_debug("BTRFS: ZSTD_decompressStream returned %d\n",
					ZSTD_getErrorCode(ret2));
			ret = -EIO;
			goto done;
		}
		buf_start = total_out;
		total_out += workspace->out_buf.pos;
		workspace->out_buf.pos = 0;

		ret = btrfs_decompress_buf2page(workspace->out_buf.dst,
				buf_start, total_out, disk_start, orig_bio);
		if (ret == 0)
			break;

		if (workspace->in_buf.pos >= srclen)
			break;

		/* Check if we've hit the end of a frame */
		if (ret2 == 0)
			break;

		if (workspace->in_buf.pos == workspace->in_buf.size) {
			kunmap(pages_in[page_in_index++]);
			if (page_in_index >= total_pages_in) {
				workspace->in_buf.src = NULL;
				ret = -EIO;
				goto done;
			}
			srclen -= PAGE_SIZE;
			workspace->in_buf.src = kmap(pages_in[page_in_index]);
			workspace->in_buf.pos = 0;
			workspace->in_buf.size = min_t(size_t, srclen,
						       PAGE_SIZE);
		}
	}
	ret = 0;
	zero_fill_bio(orig_bio);
done:
	if (workspace->in_buf.src)
		kunmap(pages_in[page_in_index]);
	return ret;
}
int zstd_decompress(struct list_head *ws, unsigned char *data_in,
		struct page *dest_page, unsigned long start_byte, size_t srclen,
		size_t destlen)
{
	struct workspace *workspace = list_entry(ws, struct workspace, list);
	ZSTD_DStream *stream;
	int ret = 0;
	size_t ret2;
	unsigned long total_out = 0;
	unsigned long pg_offset = 0;
	char *kaddr;

	stream = ZSTD_initDStream(
			ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size);
	if (!stream) {
		pr_warn("BTRFS: ZSTD_initDStream failed\n");
		ret = -EIO;
		goto finish;
	}

	destlen = min_t(size_t, destlen, PAGE_SIZE);

	workspace->in_buf.src = data_in;
	workspace->in_buf.pos = 0;
	workspace->in_buf.size = srclen;

	workspace->out_buf.dst = workspace->buf;
	workspace->out_buf.pos = 0;
	workspace->out_buf.size = PAGE_SIZE;
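	/*
	 * Added note: zstd cannot seek within a frame, so decoding starts
	 * from the beginning of the stream and output before start_byte is
	 * discarded by the total_out check in the loop below.
	 */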
	ret2 = 1;
	while (pg_offset < destlen
	       && workspace->in_buf.pos < workspace->in_buf.size) {
		unsigned long buf_start;
		unsigned long buf_offset;
		unsigned long bytes;

		/* Check if the frame is over and we still need more input */
		if (ret2 == 0) {
			pr_debug("BTRFS: ZSTD_decompressStream ended early\n");
			ret = -EIO;
			goto finish;
		}
		ret2 = ZSTD_decompressStream(stream, &workspace->out_buf,
				&workspace->in_buf);
		if (ZSTD_isError(ret2)) {
			pr_debug("BTRFS: ZSTD_decompressStream returned %d\n",
					ZSTD_getErrorCode(ret2));
			ret = -EIO;
			goto finish;
		}

		buf_start = total_out;
		total_out += workspace->out_buf.pos;
		workspace->out_buf.pos = 0;

		if (total_out <= start_byte)
			continue;

		if (total_out > start_byte && buf_start < start_byte)
			buf_offset = start_byte - buf_start;
		else
			buf_offset = 0;

		bytes = min_t(unsigned long, destlen - pg_offset,
				workspace->out_buf.size - buf_offset);

		kaddr = kmap_atomic(dest_page);
		memcpy(kaddr + pg_offset, workspace->out_buf.dst + buf_offset,
				bytes);
		kunmap_atomic(kaddr);

		pg_offset += bytes;
	}
	ret = 0;
finish:
	if (pg_offset < destlen) {
		kaddr = kmap_atomic(dest_page);
		memset(kaddr + pg_offset, 0, destlen - pg_offset);
		kunmap_atomic(kaddr);
	}
	return ret;
}
const struct btrfs_compress_op btrfs_zstd_compress = {
	/* ZSTD uses own workspace manager */
	.workspace_manager = NULL,
	.max_level	= ZSTD_BTRFS_MAX_LEVEL,
	.default_level	= ZSTD_BTRFS_DEFAULT_LEVEL,
};