// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Red Hat. All rights reserved.
 */

#include <linux/pagemap.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/math64.h>
#include <linux/ratelimit.h>
#include <linux/error-injection.h>
#include <linux/sched/mm.h>
#include <linux/string_choices.h>
#include "free-space-cache.h"
#include "transaction.h"
#include "extent_io.h"
#include "space-info.h"
#include "block-group.h"
#include "inode-item.h"
#include "accessors.h"
#include "file-item.h"

#define BITS_PER_BITMAP		(PAGE_SIZE * 8UL)
#define MAX_CACHE_BYTES_PER_GIG	SZ_64K
#define FORCE_EXTENT_THRESHOLD	SZ_1M
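
/*
 * BITS_PER_BITMAP is the number of bits in one bitmap page (one bit per
 * ctl->unit bytes of space).  MAX_CACHE_BYTES_PER_GIG is the target amount
 * of cache memory per 1GiB of block group space, and free regions of at
 * least FORCE_EXTENT_THRESHOLD are kept as extent entries instead of being
 * folded into bitmaps (see recalculate_thresholds() and use_bitmap()).
 */
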
static struct kmem_cache *btrfs_free_space_cachep;
static struct kmem_cache *btrfs_free_space_bitmap_cachep;
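
/*
 * A btrfs_trim_range describes a region that is currently being trimmed.
 * These live on ctl->trimming_ranges so the cache writer can still include
 * them when writing out extent entries (see write_cache_extent_entries()).
 */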
struct btrfs_trim_range {
	u64 start;
	u64 bytes;
	struct list_head list;
};
static int link_free_space(struct btrfs_free_space_ctl *ctl,
			   struct btrfs_free_space *info);
static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_space *info, bool update_stat);
static int search_bitmap(struct btrfs_free_space_ctl *ctl,
			 struct btrfs_free_space *bitmap_info, u64 *offset,
			 u64 *bytes, bool for_alloc);
static void free_bitmap(struct btrfs_free_space_ctl *ctl,
			struct btrfs_free_space *bitmap_info);
static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_space *info, u64 offset,
			      u64 bytes, bool update_stats);
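
/*
 * Finalize a crc32c value: invert it and store it as a little-endian u32,
 * which is the form the per-page checksums take inside the cache file.
 */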
static void btrfs_crc32c_final(u32 crc, u8 *result)
{
	put_unaligned_le32(~crc, result);
}
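
/*
 * Empty the in-memory free space tree for @ctl, releasing every extent
 * entry and bitmap.  The caller holds ctl->tree_lock; the lock is briefly
 * dropped via cond_resched_lock() to avoid hogging the CPU.
 */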
static void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
{
	struct btrfs_free_space *info;
	struct rb_node *node;

	while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
		info = rb_entry(node, struct btrfs_free_space, offset_index);
		if (!info->bitmap) {
			unlink_free_space(ctl, info, true);
			kmem_cache_free(btrfs_free_space_cachep, info);
		} else {
			free_bitmap(ctl, info);
		}

		cond_resched_lock(&ctl->tree_lock);
	}
}
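
/*
 * Look up the free space inode for a block group: read the free space
 * header item from the root tree and load the inode it points at.  NOFS
 * allocation context is forced because we are usually under a transaction
 * handle here.
 */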
static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
					       struct btrfs_path *path,
					       u64 offset)
{
	struct btrfs_key key;
	struct btrfs_key location;
	struct btrfs_disk_key disk_key;
	struct btrfs_free_space_header *header;
	struct extent_buffer *leaf;
	struct inode *inode = NULL;
	unsigned nofs_flag;
	int ret;

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.offset = offset;
	key.type = 0;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ERR_PTR(ret);
	if (ret > 0) {
		btrfs_release_path(path);
		return ERR_PTR(-ENOENT);
	}

	leaf = path->nodes[0];
	header = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_free_space_header);
	btrfs_free_space_key(leaf, header, &disk_key);
	btrfs_disk_key_to_cpu(&location, &disk_key);
	btrfs_release_path(path);

	/*
	 * We are often under a trans handle at this point, so we need to make
	 * sure NOFS is set to keep us from deadlocking.
	 */
	nofs_flag = memalloc_nofs_save();
	inode = btrfs_iget_path(location.objectid, root, path);
	btrfs_release_path(path);
	memalloc_nofs_restore(nofs_flag);
	if (IS_ERR(inode))
		return inode;

	mapping_set_gfp_mask(inode->i_mapping,
			mapping_gfp_constraint(inode->i_mapping,
			~(__GFP_FS | __GFP_HIGHMEM)));

	return inode;
}
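
/*
 * Cached wrapper around __lookup_free_space_inode(): reuse the inode
 * pointer stashed on the block group when there is one, otherwise look it
 * up and, if needed, convert an old style (checksummed, COW) space inode
 * by setting NODATASUM and NODATACOW and forcing the cache to be rebuilt.
 */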
132 struct inode
*lookup_free_space_inode(struct btrfs_block_group
*block_group
,
133 struct btrfs_path
*path
)
135 struct btrfs_fs_info
*fs_info
= block_group
->fs_info
;
136 struct inode
*inode
= NULL
;
137 u32 flags
= BTRFS_INODE_NODATASUM
| BTRFS_INODE_NODATACOW
;
139 spin_lock(&block_group
->lock
);
140 if (block_group
->inode
)
141 inode
= igrab(&block_group
->inode
->vfs_inode
);
142 spin_unlock(&block_group
->lock
);
146 inode
= __lookup_free_space_inode(fs_info
->tree_root
, path
,
151 spin_lock(&block_group
->lock
);
152 if (!((BTRFS_I(inode
)->flags
& flags
) == flags
)) {
153 btrfs_info(fs_info
, "Old style space inode found, converting.");
154 BTRFS_I(inode
)->flags
|= BTRFS_INODE_NODATASUM
|
155 BTRFS_INODE_NODATACOW
;
156 block_group
->disk_cache_state
= BTRFS_DC_CLEAR
;
159 if (!test_and_set_bit(BLOCK_GROUP_FLAG_IREF
, &block_group
->runtime_flags
))
160 block_group
->inode
= BTRFS_I(igrab(inode
));
161 spin_unlock(&block_group
->lock
);
166 static int __create_free_space_inode(struct btrfs_root
*root
,
167 struct btrfs_trans_handle
*trans
,
168 struct btrfs_path
*path
,
171 struct btrfs_key key
;
172 struct btrfs_disk_key disk_key
;
173 struct btrfs_free_space_header
*header
;
174 struct btrfs_inode_item
*inode_item
;
175 struct extent_buffer
*leaf
;
176 /* We inline CRCs for the free disk space cache */
177 const u64 flags
= BTRFS_INODE_NOCOMPRESS
| BTRFS_INODE_PREALLOC
|
178 BTRFS_INODE_NODATASUM
| BTRFS_INODE_NODATACOW
;
181 ret
= btrfs_insert_empty_inode(trans
, root
, path
, ino
);
185 leaf
= path
->nodes
[0];
186 inode_item
= btrfs_item_ptr(leaf
, path
->slots
[0],
187 struct btrfs_inode_item
);
188 btrfs_item_key(leaf
, &disk_key
, path
->slots
[0]);
189 memzero_extent_buffer(leaf
, (unsigned long)inode_item
,
190 sizeof(*inode_item
));
191 btrfs_set_inode_generation(leaf
, inode_item
, trans
->transid
);
192 btrfs_set_inode_size(leaf
, inode_item
, 0);
193 btrfs_set_inode_nbytes(leaf
, inode_item
, 0);
194 btrfs_set_inode_uid(leaf
, inode_item
, 0);
195 btrfs_set_inode_gid(leaf
, inode_item
, 0);
196 btrfs_set_inode_mode(leaf
, inode_item
, S_IFREG
| 0600);
197 btrfs_set_inode_flags(leaf
, inode_item
, flags
);
198 btrfs_set_inode_nlink(leaf
, inode_item
, 1);
199 btrfs_set_inode_transid(leaf
, inode_item
, trans
->transid
);
200 btrfs_set_inode_block_group(leaf
, inode_item
, offset
);
201 btrfs_mark_buffer_dirty(trans
, leaf
);
202 btrfs_release_path(path
);
204 key
.objectid
= BTRFS_FREE_SPACE_OBJECTID
;
207 ret
= btrfs_insert_empty_item(trans
, root
, path
, &key
,
208 sizeof(struct btrfs_free_space_header
));
210 btrfs_release_path(path
);
214 leaf
= path
->nodes
[0];
215 header
= btrfs_item_ptr(leaf
, path
->slots
[0],
216 struct btrfs_free_space_header
);
217 memzero_extent_buffer(leaf
, (unsigned long)header
, sizeof(*header
));
218 btrfs_set_free_space_key(leaf
, header
, &disk_key
);
219 btrfs_mark_buffer_dirty(trans
, leaf
);
220 btrfs_release_path(path
);
225 int create_free_space_inode(struct btrfs_trans_handle
*trans
,
226 struct btrfs_block_group
*block_group
,
227 struct btrfs_path
*path
)
232 ret
= btrfs_get_free_objectid(trans
->fs_info
->tree_root
, &ino
);
236 return __create_free_space_inode(trans
->fs_info
->tree_root
, trans
, path
,
237 ino
, block_group
->start
);
/*
 * inode is an optional sink: if it is NULL, btrfs_remove_free_space_inode
 * handles lookup, otherwise it takes ownership and iputs the inode.
 * Don't reuse an inode pointer after passing it into this function.
 */
245 int btrfs_remove_free_space_inode(struct btrfs_trans_handle
*trans
,
247 struct btrfs_block_group
*block_group
)
249 struct btrfs_path
*path
;
250 struct btrfs_key key
;
253 path
= btrfs_alloc_path();
258 inode
= lookup_free_space_inode(block_group
, path
);
260 if (PTR_ERR(inode
) != -ENOENT
)
261 ret
= PTR_ERR(inode
);
264 ret
= btrfs_orphan_add(trans
, BTRFS_I(inode
));
266 btrfs_add_delayed_iput(BTRFS_I(inode
));
270 /* One for the block groups ref */
271 spin_lock(&block_group
->lock
);
272 if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF
, &block_group
->runtime_flags
)) {
273 block_group
->inode
= NULL
;
274 spin_unlock(&block_group
->lock
);
277 spin_unlock(&block_group
->lock
);
279 /* One for the lookup ref */
280 btrfs_add_delayed_iput(BTRFS_I(inode
));
282 key
.objectid
= BTRFS_FREE_SPACE_OBJECTID
;
284 key
.offset
= block_group
->start
;
285 ret
= btrfs_search_slot(trans
, trans
->fs_info
->tree_root
, &key
, path
,
292 ret
= btrfs_del_item(trans
, trans
->fs_info
->tree_root
, path
);
294 btrfs_free_path(path
);
298 int btrfs_truncate_free_space_cache(struct btrfs_trans_handle
*trans
,
299 struct btrfs_block_group
*block_group
,
300 struct inode
*vfs_inode
)
302 struct btrfs_truncate_control control
= {
303 .inode
= BTRFS_I(vfs_inode
),
305 .ino
= btrfs_ino(BTRFS_I(vfs_inode
)),
306 .min_type
= BTRFS_EXTENT_DATA_KEY
,
307 .clear_extent_range
= true,
309 struct btrfs_inode
*inode
= BTRFS_I(vfs_inode
);
310 struct btrfs_root
*root
= inode
->root
;
311 struct extent_state
*cached_state
= NULL
;
316 struct btrfs_path
*path
= btrfs_alloc_path();
323 mutex_lock(&trans
->transaction
->cache_write_mutex
);
324 if (!list_empty(&block_group
->io_list
)) {
325 list_del_init(&block_group
->io_list
);
327 btrfs_wait_cache_io(trans
, block_group
, path
);
328 btrfs_put_block_group(block_group
);
	/*
	 * Now that we've truncated the cache away, it's no longer valid on
	 * disk, so mark the block group to have its cache rebuilt.
	 */
335 spin_lock(&block_group
->lock
);
336 block_group
->disk_cache_state
= BTRFS_DC_CLEAR
;
337 spin_unlock(&block_group
->lock
);
338 btrfs_free_path(path
);
341 btrfs_i_size_write(inode
, 0);
342 truncate_pagecache(vfs_inode
, 0);
344 lock_extent(&inode
->io_tree
, 0, (u64
)-1, &cached_state
);
345 btrfs_drop_extent_map_range(inode
, 0, (u64
)-1, false);
	/*
	 * We skip the throttling logic for free space cache inodes, so we don't
	 * need to check for -EAGAIN.
	 */
351 ret
= btrfs_truncate_inode_items(trans
, root
, &control
);
353 inode_sub_bytes(&inode
->vfs_inode
, control
.sub_bytes
);
354 btrfs_inode_safe_disk_i_size_write(inode
, control
.last_size
);
356 unlock_extent(&inode
->io_tree
, 0, (u64
)-1, &cached_state
);
360 ret
= btrfs_update_inode(trans
, inode
);
364 mutex_unlock(&trans
->transaction
->cache_write_mutex
);
366 btrfs_abort_transaction(trans
, ret
);
static void readahead_cache(struct inode *inode)
{
	struct file_ra_state ra;
	unsigned long last_index;

	file_ra_state_init(&ra, inode->i_mapping);
	last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;

	page_cache_sync_readahead(inode->i_mapping, &ra, NULL, 0, last_index);
}
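
/*
 * io_ctl_init() sizes the page array from i_size and reserves room in the
 * first page for the per-page crc32c values plus the generation, which is
 * why a cache file whose crc area would not fit in one page is rejected.
 */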
382 static int io_ctl_init(struct btrfs_io_ctl
*io_ctl
, struct inode
*inode
,
387 num_pages
= DIV_ROUND_UP(i_size_read(inode
), PAGE_SIZE
);
389 /* Make sure we can fit our crcs and generation into the first page */
390 if (write
&& (num_pages
* sizeof(u32
) + sizeof(u64
)) > PAGE_SIZE
)
393 memset(io_ctl
, 0, sizeof(struct btrfs_io_ctl
));
395 io_ctl
->pages
= kcalloc(num_pages
, sizeof(struct page
*), GFP_NOFS
);
399 io_ctl
->num_pages
= num_pages
;
400 io_ctl
->fs_info
= inode_to_fs_info(inode
);
401 io_ctl
->inode
= inode
;
405 ALLOW_ERROR_INJECTION(io_ctl_init
, ERRNO
);
407 static void io_ctl_free(struct btrfs_io_ctl
*io_ctl
)
409 kfree(io_ctl
->pages
);
410 io_ctl
->pages
= NULL
;
413 static void io_ctl_unmap_page(struct btrfs_io_ctl
*io_ctl
)
421 static void io_ctl_map_page(struct btrfs_io_ctl
*io_ctl
, int clear
)
423 ASSERT(io_ctl
->index
< io_ctl
->num_pages
);
424 io_ctl
->page
= io_ctl
->pages
[io_ctl
->index
++];
425 io_ctl
->cur
= page_address(io_ctl
->page
);
426 io_ctl
->orig
= io_ctl
->cur
;
427 io_ctl
->size
= PAGE_SIZE
;
429 clear_page(io_ctl
->cur
);
432 static void io_ctl_drop_pages(struct btrfs_io_ctl
*io_ctl
)
436 io_ctl_unmap_page(io_ctl
);
438 for (i
= 0; i
< io_ctl
->num_pages
; i
++) {
439 if (io_ctl
->pages
[i
]) {
440 btrfs_folio_clear_checked(io_ctl
->fs_info
,
441 page_folio(io_ctl
->pages
[i
]),
442 page_offset(io_ctl
->pages
[i
]),
444 unlock_page(io_ctl
->pages
[i
]);
445 put_page(io_ctl
->pages
[i
]);
450 static int io_ctl_prepare_pages(struct btrfs_io_ctl
*io_ctl
, bool uptodate
)
453 struct inode
*inode
= io_ctl
->inode
;
454 gfp_t mask
= btrfs_alloc_write_mask(inode
->i_mapping
);
457 for (i
= 0; i
< io_ctl
->num_pages
; i
++) {
460 page
= find_or_create_page(inode
->i_mapping
, i
, mask
);
462 io_ctl_drop_pages(io_ctl
);
466 ret
= set_page_extent_mapped(page
);
470 io_ctl_drop_pages(io_ctl
);
474 io_ctl
->pages
[i
] = page
;
475 if (uptodate
&& !PageUptodate(page
)) {
476 btrfs_read_folio(NULL
, page_folio(page
));
478 if (page
->mapping
!= inode
->i_mapping
) {
479 btrfs_err(BTRFS_I(inode
)->root
->fs_info
,
480 "free space cache page truncated");
481 io_ctl_drop_pages(io_ctl
);
484 if (!PageUptodate(page
)) {
485 btrfs_err(BTRFS_I(inode
)->root
->fs_info
,
486 "error reading free space cache");
487 io_ctl_drop_pages(io_ctl
);
493 for (i
= 0; i
< io_ctl
->num_pages
; i
++)
494 clear_page_dirty_for_io(io_ctl
->pages
[i
]);
499 static void io_ctl_set_generation(struct btrfs_io_ctl
*io_ctl
, u64 generation
)
501 io_ctl_map_page(io_ctl
, 1);
504 * Skip the csum areas. If we don't check crcs then we just have a
505 * 64bit chunk at the front of the first page.
507 io_ctl
->cur
+= (sizeof(u32
) * io_ctl
->num_pages
);
508 io_ctl
->size
-= sizeof(u64
) + (sizeof(u32
) * io_ctl
->num_pages
);
510 put_unaligned_le64(generation
, io_ctl
->cur
);
511 io_ctl
->cur
+= sizeof(u64
);
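
/*
 * io_ctl_check_generation() is the read side counterpart of
 * io_ctl_set_generation(): it skips the crc area, reads the generation
 * stored in the first page and rejects the cache if it does not match the
 * generation the caller expects.
 */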
514 static int io_ctl_check_generation(struct btrfs_io_ctl
*io_ctl
, u64 generation
)
519 * Skip the crc area. If we don't check crcs then we just have a 64bit
520 * chunk at the front of the first page.
522 io_ctl
->cur
+= sizeof(u32
) * io_ctl
->num_pages
;
523 io_ctl
->size
-= sizeof(u64
) + (sizeof(u32
) * io_ctl
->num_pages
);
525 cache_gen
= get_unaligned_le64(io_ctl
->cur
);
526 if (cache_gen
!= generation
) {
527 btrfs_err_rl(io_ctl
->fs_info
,
528 "space cache generation (%llu) does not match inode (%llu)",
529 cache_gen
, generation
);
530 io_ctl_unmap_page(io_ctl
);
533 io_ctl
->cur
+= sizeof(u64
);
537 static void io_ctl_set_crc(struct btrfs_io_ctl
*io_ctl
, int index
)
544 offset
= sizeof(u32
) * io_ctl
->num_pages
;
546 crc
= crc32c(crc
, io_ctl
->orig
+ offset
, PAGE_SIZE
- offset
);
547 btrfs_crc32c_final(crc
, (u8
*)&crc
);
548 io_ctl_unmap_page(io_ctl
);
549 tmp
= page_address(io_ctl
->pages
[0]);
554 static int io_ctl_check_crc(struct btrfs_io_ctl
*io_ctl
, int index
)
561 offset
= sizeof(u32
) * io_ctl
->num_pages
;
563 tmp
= page_address(io_ctl
->pages
[0]);
567 io_ctl_map_page(io_ctl
, 0);
568 crc
= crc32c(crc
, io_ctl
->orig
+ offset
, PAGE_SIZE
- offset
);
569 btrfs_crc32c_final(crc
, (u8
*)&crc
);
571 btrfs_err_rl(io_ctl
->fs_info
,
572 "csum mismatch on free space cache");
573 io_ctl_unmap_page(io_ctl
);
580 static int io_ctl_add_entry(struct btrfs_io_ctl
*io_ctl
, u64 offset
, u64 bytes
,
583 struct btrfs_free_space_entry
*entry
;
589 put_unaligned_le64(offset
, &entry
->offset
);
590 put_unaligned_le64(bytes
, &entry
->bytes
);
591 entry
->type
= (bitmap
) ? BTRFS_FREE_SPACE_BITMAP
:
592 BTRFS_FREE_SPACE_EXTENT
;
593 io_ctl
->cur
+= sizeof(struct btrfs_free_space_entry
);
594 io_ctl
->size
-= sizeof(struct btrfs_free_space_entry
);
596 if (io_ctl
->size
>= sizeof(struct btrfs_free_space_entry
))
599 io_ctl_set_crc(io_ctl
, io_ctl
->index
- 1);
601 /* No more pages to map */
602 if (io_ctl
->index
>= io_ctl
->num_pages
)
605 /* map the next page */
606 io_ctl_map_page(io_ctl
, 1);
610 static int io_ctl_add_bitmap(struct btrfs_io_ctl
*io_ctl
, void *bitmap
)
616 * If we aren't at the start of the current page, unmap this one and
617 * map the next one if there is any left.
619 if (io_ctl
->cur
!= io_ctl
->orig
) {
620 io_ctl_set_crc(io_ctl
, io_ctl
->index
- 1);
621 if (io_ctl
->index
>= io_ctl
->num_pages
)
623 io_ctl_map_page(io_ctl
, 0);
626 copy_page(io_ctl
->cur
, bitmap
);
627 io_ctl_set_crc(io_ctl
, io_ctl
->index
- 1);
628 if (io_ctl
->index
< io_ctl
->num_pages
)
629 io_ctl_map_page(io_ctl
, 0);
633 static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl
*io_ctl
)
636 * If we're not on the boundary we know we've modified the page and we
637 * need to crc the page.
639 if (io_ctl
->cur
!= io_ctl
->orig
)
640 io_ctl_set_crc(io_ctl
, io_ctl
->index
- 1);
642 io_ctl_unmap_page(io_ctl
);
644 while (io_ctl
->index
< io_ctl
->num_pages
) {
645 io_ctl_map_page(io_ctl
, 1);
646 io_ctl_set_crc(io_ctl
, io_ctl
->index
- 1);
650 static int io_ctl_read_entry(struct btrfs_io_ctl
*io_ctl
,
651 struct btrfs_free_space
*entry
, u8
*type
)
653 struct btrfs_free_space_entry
*e
;
657 ret
= io_ctl_check_crc(io_ctl
, io_ctl
->index
);
663 entry
->offset
= get_unaligned_le64(&e
->offset
);
664 entry
->bytes
= get_unaligned_le64(&e
->bytes
);
666 io_ctl
->cur
+= sizeof(struct btrfs_free_space_entry
);
667 io_ctl
->size
-= sizeof(struct btrfs_free_space_entry
);
669 if (io_ctl
->size
>= sizeof(struct btrfs_free_space_entry
))
672 io_ctl_unmap_page(io_ctl
);
677 static int io_ctl_read_bitmap(struct btrfs_io_ctl
*io_ctl
,
678 struct btrfs_free_space
*entry
)
682 ret
= io_ctl_check_crc(io_ctl
, io_ctl
->index
);
686 copy_page(entry
->bitmap
, io_ctl
->cur
);
687 io_ctl_unmap_page(io_ctl
);
692 static void recalculate_thresholds(struct btrfs_free_space_ctl
*ctl
)
694 struct btrfs_block_group
*block_group
= ctl
->block_group
;
698 u64 size
= block_group
->length
;
699 u64 bytes_per_bg
= BITS_PER_BITMAP
* ctl
->unit
;
700 u64 max_bitmaps
= div64_u64(size
+ bytes_per_bg
- 1, bytes_per_bg
);
702 max_bitmaps
= max_t(u64
, max_bitmaps
, 1);
704 if (ctl
->total_bitmaps
> max_bitmaps
)
705 btrfs_err(block_group
->fs_info
,
706 "invalid free space control: bg start=%llu len=%llu total_bitmaps=%u unit=%u max_bitmaps=%llu bytes_per_bg=%llu",
707 block_group
->start
, block_group
->length
,
708 ctl
->total_bitmaps
, ctl
->unit
, max_bitmaps
,
710 ASSERT(ctl
->total_bitmaps
<= max_bitmaps
);
713 * We are trying to keep the total amount of memory used per 1GiB of
714 * space to be MAX_CACHE_BYTES_PER_GIG. However, with a reclamation
715 * mechanism of pulling extents >= FORCE_EXTENT_THRESHOLD out of
716 * bitmaps, we may end up using more memory than this.
719 max_bytes
= MAX_CACHE_BYTES_PER_GIG
;
721 max_bytes
= MAX_CACHE_BYTES_PER_GIG
* div_u64(size
, SZ_1G
);
723 bitmap_bytes
= ctl
->total_bitmaps
* ctl
->unit
;
726 * we want the extent entry threshold to always be at most 1/2 the max
727 * bytes we can have, or whatever is less than that.
729 extent_bytes
= max_bytes
- bitmap_bytes
;
730 extent_bytes
= min_t(u64
, extent_bytes
, max_bytes
>> 1);
732 ctl
->extents_thresh
=
733 div_u64(extent_bytes
, sizeof(struct btrfs_free_space
));
736 static int __load_free_space_cache(struct btrfs_root
*root
, struct inode
*inode
,
737 struct btrfs_free_space_ctl
*ctl
,
738 struct btrfs_path
*path
, u64 offset
)
740 struct btrfs_fs_info
*fs_info
= root
->fs_info
;
741 struct btrfs_free_space_header
*header
;
742 struct extent_buffer
*leaf
;
743 struct btrfs_io_ctl io_ctl
;
744 struct btrfs_key key
;
745 struct btrfs_free_space
*e
, *n
;
753 /* Nothing in the space cache, goodbye */
754 if (!i_size_read(inode
))
757 key
.objectid
= BTRFS_FREE_SPACE_OBJECTID
;
761 ret
= btrfs_search_slot(NULL
, root
, &key
, path
, 0, 0);
765 btrfs_release_path(path
);
771 leaf
= path
->nodes
[0];
772 header
= btrfs_item_ptr(leaf
, path
->slots
[0],
773 struct btrfs_free_space_header
);
774 num_entries
= btrfs_free_space_entries(leaf
, header
);
775 num_bitmaps
= btrfs_free_space_bitmaps(leaf
, header
);
776 generation
= btrfs_free_space_generation(leaf
, header
);
777 btrfs_release_path(path
);
779 if (!BTRFS_I(inode
)->generation
) {
781 "the free space cache file (%llu) is invalid, skip it",
786 if (BTRFS_I(inode
)->generation
!= generation
) {
788 "free space inode generation (%llu) did not match free space cache generation (%llu)",
789 BTRFS_I(inode
)->generation
, generation
);
796 ret
= io_ctl_init(&io_ctl
, inode
, 0);
800 readahead_cache(inode
);
802 ret
= io_ctl_prepare_pages(&io_ctl
, true);
806 ret
= io_ctl_check_crc(&io_ctl
, 0);
810 ret
= io_ctl_check_generation(&io_ctl
, generation
);
814 while (num_entries
) {
815 e
= kmem_cache_zalloc(btrfs_free_space_cachep
,
822 ret
= io_ctl_read_entry(&io_ctl
, e
, &type
);
824 kmem_cache_free(btrfs_free_space_cachep
, e
);
830 kmem_cache_free(btrfs_free_space_cachep
, e
);
834 if (type
== BTRFS_FREE_SPACE_EXTENT
) {
835 spin_lock(&ctl
->tree_lock
);
836 ret
= link_free_space(ctl
, e
);
837 spin_unlock(&ctl
->tree_lock
);
840 "Duplicate entries in free space cache, dumping");
841 kmem_cache_free(btrfs_free_space_cachep
, e
);
847 e
->bitmap
= kmem_cache_zalloc(
848 btrfs_free_space_bitmap_cachep
, GFP_NOFS
);
852 btrfs_free_space_cachep
, e
);
855 spin_lock(&ctl
->tree_lock
);
856 ret
= link_free_space(ctl
, e
);
858 spin_unlock(&ctl
->tree_lock
);
860 "Duplicate entries in free space cache, dumping");
861 kmem_cache_free(btrfs_free_space_bitmap_cachep
, e
->bitmap
);
862 kmem_cache_free(btrfs_free_space_cachep
, e
);
865 ctl
->total_bitmaps
++;
866 recalculate_thresholds(ctl
);
867 spin_unlock(&ctl
->tree_lock
);
868 list_add_tail(&e
->list
, &bitmaps
);
874 io_ctl_unmap_page(&io_ctl
);
	/*
	 * The bitmap pages are stored after all of the entries, in the same
	 * order that the bitmap entries were added to the cache.
	 */
880 list_for_each_entry_safe(e
, n
, &bitmaps
, list
) {
881 list_del_init(&e
->list
);
882 ret
= io_ctl_read_bitmap(&io_ctl
, e
);
887 io_ctl_drop_pages(&io_ctl
);
890 io_ctl_free(&io_ctl
);
893 io_ctl_drop_pages(&io_ctl
);
895 spin_lock(&ctl
->tree_lock
);
896 __btrfs_remove_free_space_cache(ctl
);
897 spin_unlock(&ctl
->tree_lock
);
901 static int copy_free_space_cache(struct btrfs_block_group
*block_group
,
902 struct btrfs_free_space_ctl
*ctl
)
904 struct btrfs_free_space
*info
;
908 while (!ret
&& (n
= rb_first(&ctl
->free_space_offset
)) != NULL
) {
909 info
= rb_entry(n
, struct btrfs_free_space
, offset_index
);
911 const u64 offset
= info
->offset
;
912 const u64 bytes
= info
->bytes
;
914 unlink_free_space(ctl
, info
, true);
915 spin_unlock(&ctl
->tree_lock
);
916 kmem_cache_free(btrfs_free_space_cachep
, info
);
917 ret
= btrfs_add_free_space(block_group
, offset
, bytes
);
918 spin_lock(&ctl
->tree_lock
);
920 u64 offset
= info
->offset
;
921 u64 bytes
= ctl
->unit
;
923 ret
= search_bitmap(ctl
, info
, &offset
, &bytes
, false);
925 bitmap_clear_bits(ctl
, info
, offset
, bytes
, true);
926 spin_unlock(&ctl
->tree_lock
);
927 ret
= btrfs_add_free_space(block_group
, offset
,
929 spin_lock(&ctl
->tree_lock
);
931 free_bitmap(ctl
, info
);
935 cond_resched_lock(&ctl
->tree_lock
);
940 static struct lock_class_key btrfs_free_space_inode_key
;
942 int load_free_space_cache(struct btrfs_block_group
*block_group
)
944 struct btrfs_fs_info
*fs_info
= block_group
->fs_info
;
945 struct btrfs_free_space_ctl
*ctl
= block_group
->free_space_ctl
;
946 struct btrfs_free_space_ctl tmp_ctl
= {};
948 struct btrfs_path
*path
;
951 u64 used
= block_group
->used
;
954 * Because we could potentially discard our loaded free space, we want
955 * to load everything into a temporary structure first, and then if it's
956 * valid copy it all into the actual free space ctl.
958 btrfs_init_free_space_ctl(block_group
, &tmp_ctl
);
961 * If this block group has been marked to be cleared for one reason or
962 * another then we can't trust the on disk cache, so just return.
964 spin_lock(&block_group
->lock
);
965 if (block_group
->disk_cache_state
!= BTRFS_DC_WRITTEN
) {
966 spin_unlock(&block_group
->lock
);
969 spin_unlock(&block_group
->lock
);
971 path
= btrfs_alloc_path();
974 path
->search_commit_root
= 1;
975 path
->skip_locking
= 1;
978 * We must pass a path with search_commit_root set to btrfs_iget in
979 * order to avoid a deadlock when allocating extents for the tree root.
981 * When we are COWing an extent buffer from the tree root, when looking
982 * for a free extent, at extent-tree.c:find_free_extent(), we can find
983 * block group without its free space cache loaded. When we find one
984 * we must load its space cache which requires reading its free space
985 * cache's inode item from the root tree. If this inode item is located
986 * in the same leaf that we started COWing before, then we end up in
987 * deadlock on the extent buffer (trying to read lock it when we
988 * previously write locked it).
990 * It's safe to read the inode item using the commit root because
991 * block groups, once loaded, stay in memory forever (until they are
992 * removed) as well as their space caches once loaded. New block groups
993 * once created get their ->cached field set to BTRFS_CACHE_FINISHED so
994 * we will never try to read their inode item while the fs is mounted.
996 inode
= lookup_free_space_inode(block_group
, path
);
998 btrfs_free_path(path
);
1002 /* We may have converted the inode and made the cache invalid. */
1003 spin_lock(&block_group
->lock
);
1004 if (block_group
->disk_cache_state
!= BTRFS_DC_WRITTEN
) {
1005 spin_unlock(&block_group
->lock
);
1006 btrfs_free_path(path
);
1009 spin_unlock(&block_group
->lock
);
	/*
	 * Reinitialize the class of struct inode's mapping->invalidate_lock for
	 * free space inodes to prevent false positives related to locks for
	 * normal inodes.
	 */
1016 lockdep_set_class(&(&inode
->i_data
)->invalidate_lock
,
1017 &btrfs_free_space_inode_key
);
1019 ret
= __load_free_space_cache(fs_info
->tree_root
, inode
, &tmp_ctl
,
1020 path
, block_group
->start
);
1021 btrfs_free_path(path
);
1025 matched
= (tmp_ctl
.free_space
== (block_group
->length
- used
-
1026 block_group
->bytes_super
));
1029 spin_lock(&tmp_ctl
.tree_lock
);
1030 ret
= copy_free_space_cache(block_group
, &tmp_ctl
);
1031 spin_unlock(&tmp_ctl
.tree_lock
);
1033 * ret == 1 means we successfully loaded the free space cache,
1034 * so we need to re-set it here.
1040 * We need to call the _locked variant so we don't try to update
1041 * the discard counters.
1043 spin_lock(&tmp_ctl
.tree_lock
);
1044 __btrfs_remove_free_space_cache(&tmp_ctl
);
1045 spin_unlock(&tmp_ctl
.tree_lock
);
1047 "block group %llu has wrong amount of free space",
1048 block_group
->start
);
1053 /* This cache is bogus, make sure it gets cleared */
1054 spin_lock(&block_group
->lock
);
1055 block_group
->disk_cache_state
= BTRFS_DC_CLEAR
;
1056 spin_unlock(&block_group
->lock
);
1060 "failed to load free space cache for block group %llu, rebuilding it now",
1061 block_group
->start
);
1064 spin_lock(&ctl
->tree_lock
);
1065 btrfs_discard_update_discardable(block_group
);
1066 spin_unlock(&ctl
->tree_lock
);
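
/*
 * The write-out path below dumps the in-memory tree back into the cache
 * file: plain extent entries (plus any ranges currently being trimmed) go
 * out first, then pinned extents, and the bitmap pages are written last,
 * all while holding cache_writeout_mutex.
 */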
static noinline_for_stack
int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
			      struct btrfs_free_space_ctl *ctl,
			      struct btrfs_block_group *block_group,
			      int *entries, int *bitmaps,
			      struct list_head *bitmap_list)
{
1079 struct btrfs_free_cluster
*cluster
= NULL
;
1080 struct btrfs_free_cluster
*cluster_locked
= NULL
;
1081 struct rb_node
*node
= rb_first(&ctl
->free_space_offset
);
1082 struct btrfs_trim_range
*trim_entry
;
1084 /* Get the cluster for this block_group if it exists */
1085 if (block_group
&& !list_empty(&block_group
->cluster_list
)) {
1086 cluster
= list_entry(block_group
->cluster_list
.next
,
1087 struct btrfs_free_cluster
,
1091 if (!node
&& cluster
) {
1092 cluster_locked
= cluster
;
1093 spin_lock(&cluster_locked
->lock
);
1094 node
= rb_first(&cluster
->root
);
1098 /* Write out the extent entries */
1100 struct btrfs_free_space
*e
;
1102 e
= rb_entry(node
, struct btrfs_free_space
, offset_index
);
1105 ret
= io_ctl_add_entry(io_ctl
, e
->offset
, e
->bytes
,
1111 list_add_tail(&e
->list
, bitmap_list
);
1114 node
= rb_next(node
);
1115 if (!node
&& cluster
) {
1116 node
= rb_first(&cluster
->root
);
1117 cluster_locked
= cluster
;
1118 spin_lock(&cluster_locked
->lock
);
1122 if (cluster_locked
) {
1123 spin_unlock(&cluster_locked
->lock
);
1124 cluster_locked
= NULL
;
1128 * Make sure we don't miss any range that was removed from our rbtree
1129 * because trimming is running. Otherwise after a umount+mount (or crash
1130 * after committing the transaction) we would leak free space and get
1131 * an inconsistent free space cache report from fsck.
1133 list_for_each_entry(trim_entry
, &ctl
->trimming_ranges
, list
) {
1134 ret
= io_ctl_add_entry(io_ctl
, trim_entry
->start
,
1135 trim_entry
->bytes
, NULL
);
1144 spin_unlock(&cluster_locked
->lock
);
1148 static noinline_for_stack
int
1149 update_cache_item(struct btrfs_trans_handle
*trans
,
1150 struct btrfs_root
*root
,
1151 struct inode
*inode
,
1152 struct btrfs_path
*path
, u64 offset
,
1153 int entries
, int bitmaps
)
1155 struct btrfs_key key
;
1156 struct btrfs_free_space_header
*header
;
1157 struct extent_buffer
*leaf
;
1160 key
.objectid
= BTRFS_FREE_SPACE_OBJECTID
;
1161 key
.offset
= offset
;
1164 ret
= btrfs_search_slot(trans
, root
, &key
, path
, 0, 1);
1166 clear_extent_bit(&BTRFS_I(inode
)->io_tree
, 0, inode
->i_size
- 1,
1167 EXTENT_DELALLOC
, NULL
);
1170 leaf
= path
->nodes
[0];
1172 struct btrfs_key found_key
;
1173 ASSERT(path
->slots
[0]);
1175 btrfs_item_key_to_cpu(leaf
, &found_key
, path
->slots
[0]);
1176 if (found_key
.objectid
!= BTRFS_FREE_SPACE_OBJECTID
||
1177 found_key
.offset
!= offset
) {
1178 clear_extent_bit(&BTRFS_I(inode
)->io_tree
, 0,
1179 inode
->i_size
- 1, EXTENT_DELALLOC
,
1181 btrfs_release_path(path
);
1186 BTRFS_I(inode
)->generation
= trans
->transid
;
1187 header
= btrfs_item_ptr(leaf
, path
->slots
[0],
1188 struct btrfs_free_space_header
);
1189 btrfs_set_free_space_entries(leaf
, header
, entries
);
1190 btrfs_set_free_space_bitmaps(leaf
, header
, bitmaps
);
1191 btrfs_set_free_space_generation(leaf
, header
, trans
->transid
);
1192 btrfs_mark_buffer_dirty(trans
, leaf
);
1193 btrfs_release_path(path
);
1201 static noinline_for_stack
int write_pinned_extent_entries(
1202 struct btrfs_trans_handle
*trans
,
1203 struct btrfs_block_group
*block_group
,
1204 struct btrfs_io_ctl
*io_ctl
,
1207 u64 start
, extent_start
, extent_end
, len
;
1208 struct extent_io_tree
*unpin
= NULL
;
	/*
	 * We want to add any pinned extents to our free space cache
	 * so we don't leak the space.
	 *
	 * We shouldn't have switched the pinned extents yet so this is the
	 * right one.
	 */
1221 unpin
= &trans
->transaction
->pinned_extents
;
1223 start
= block_group
->start
;
1225 while (start
< block_group
->start
+ block_group
->length
) {
1226 if (!find_first_extent_bit(unpin
, start
,
1227 &extent_start
, &extent_end
,
1228 EXTENT_DIRTY
, NULL
))
1231 /* This pinned extent is out of our range */
1232 if (extent_start
>= block_group
->start
+ block_group
->length
)
1235 extent_start
= max(extent_start
, start
);
1236 extent_end
= min(block_group
->start
+ block_group
->length
,
1238 len
= extent_end
- extent_start
;
1241 ret
= io_ctl_add_entry(io_ctl
, extent_start
, len
, NULL
);
1251 static noinline_for_stack
int
1252 write_bitmap_entries(struct btrfs_io_ctl
*io_ctl
, struct list_head
*bitmap_list
)
1254 struct btrfs_free_space
*entry
, *next
;
1257 /* Write out the bitmaps */
1258 list_for_each_entry_safe(entry
, next
, bitmap_list
, list
) {
1259 ret
= io_ctl_add_bitmap(io_ctl
, entry
->bitmap
);
1262 list_del_init(&entry
->list
);
1268 static int flush_dirty_cache(struct inode
*inode
)
1272 ret
= btrfs_wait_ordered_range(BTRFS_I(inode
), 0, (u64
)-1);
1274 clear_extent_bit(&BTRFS_I(inode
)->io_tree
, 0, inode
->i_size
- 1,
1275 EXTENT_DELALLOC
, NULL
);
1280 static void noinline_for_stack
1281 cleanup_bitmap_list(struct list_head
*bitmap_list
)
1283 struct btrfs_free_space
*entry
, *next
;
1285 list_for_each_entry_safe(entry
, next
, bitmap_list
, list
)
1286 list_del_init(&entry
->list
);
1289 static void noinline_for_stack
1290 cleanup_write_cache_enospc(struct inode
*inode
,
1291 struct btrfs_io_ctl
*io_ctl
,
1292 struct extent_state
**cached_state
)
1294 io_ctl_drop_pages(io_ctl
);
1295 unlock_extent(&BTRFS_I(inode
)->io_tree
, 0, i_size_read(inode
) - 1,
1299 static int __btrfs_wait_cache_io(struct btrfs_root
*root
,
1300 struct btrfs_trans_handle
*trans
,
1301 struct btrfs_block_group
*block_group
,
1302 struct btrfs_io_ctl
*io_ctl
,
1303 struct btrfs_path
*path
, u64 offset
)
1306 struct inode
*inode
= io_ctl
->inode
;
1311 /* Flush the dirty pages in the cache file. */
1312 ret
= flush_dirty_cache(inode
);
1316 /* Update the cache item to tell everyone this cache file is valid. */
1317 ret
= update_cache_item(trans
, root
, inode
, path
, offset
,
1318 io_ctl
->entries
, io_ctl
->bitmaps
);
1321 invalidate_inode_pages2(inode
->i_mapping
);
1322 BTRFS_I(inode
)->generation
= 0;
1324 btrfs_debug(root
->fs_info
,
1325 "failed to write free space cache for block group %llu error %d",
1326 block_group
->start
, ret
);
1328 btrfs_update_inode(trans
, BTRFS_I(inode
));
1331 /* the dirty list is protected by the dirty_bgs_lock */
1332 spin_lock(&trans
->transaction
->dirty_bgs_lock
);
1334 /* the disk_cache_state is protected by the block group lock */
1335 spin_lock(&block_group
->lock
);
1338 * only mark this as written if we didn't get put back on
1339 * the dirty list while waiting for IO. Otherwise our
1340 * cache state won't be right, and we won't get written again
1342 if (!ret
&& list_empty(&block_group
->dirty_list
))
1343 block_group
->disk_cache_state
= BTRFS_DC_WRITTEN
;
1345 block_group
->disk_cache_state
= BTRFS_DC_ERROR
;
1347 spin_unlock(&block_group
->lock
);
1348 spin_unlock(&trans
->transaction
->dirty_bgs_lock
);
1349 io_ctl
->inode
= NULL
;
1357 int btrfs_wait_cache_io(struct btrfs_trans_handle
*trans
,
1358 struct btrfs_block_group
*block_group
,
1359 struct btrfs_path
*path
)
1361 return __btrfs_wait_cache_io(block_group
->fs_info
->tree_root
, trans
,
1362 block_group
, &block_group
->io_ctl
,
1363 path
, block_group
->start
);
1367 * Write out cached info to an inode.
1369 * @inode: freespace inode we are writing out
1370 * @ctl: free space cache we are going to write out
1371 * @block_group: block_group for this cache if it belongs to a block_group
1372 * @io_ctl: holds context for the io
1373 * @trans: the trans handle
1375 * This function writes out a free space cache struct to disk for quick recovery
1376 * on mount. This will return 0 if it was successful in writing the cache out,
1377 * or an errno if it was not.
1379 static int __btrfs_write_out_cache(struct inode
*inode
,
1380 struct btrfs_free_space_ctl
*ctl
,
1381 struct btrfs_block_group
*block_group
,
1382 struct btrfs_io_ctl
*io_ctl
,
1383 struct btrfs_trans_handle
*trans
)
1385 struct extent_state
*cached_state
= NULL
;
1386 LIST_HEAD(bitmap_list
);
1393 if (!i_size_read(inode
))
1396 WARN_ON(io_ctl
->pages
);
1397 ret
= io_ctl_init(io_ctl
, inode
, 1);
1401 if (block_group
&& (block_group
->flags
& BTRFS_BLOCK_GROUP_DATA
)) {
1402 down_write(&block_group
->data_rwsem
);
1403 spin_lock(&block_group
->lock
);
1404 if (block_group
->delalloc_bytes
) {
1405 block_group
->disk_cache_state
= BTRFS_DC_WRITTEN
;
1406 spin_unlock(&block_group
->lock
);
1407 up_write(&block_group
->data_rwsem
);
1408 BTRFS_I(inode
)->generation
= 0;
1413 spin_unlock(&block_group
->lock
);
1416 /* Lock all pages first so we can lock the extent safely. */
1417 ret
= io_ctl_prepare_pages(io_ctl
, false);
1421 lock_extent(&BTRFS_I(inode
)->io_tree
, 0, i_size_read(inode
) - 1,
1424 io_ctl_set_generation(io_ctl
, trans
->transid
);
1426 mutex_lock(&ctl
->cache_writeout_mutex
);
1427 /* Write out the extent entries in the free space cache */
1428 spin_lock(&ctl
->tree_lock
);
1429 ret
= write_cache_extent_entries(io_ctl
, ctl
,
1430 block_group
, &entries
, &bitmaps
,
1433 goto out_nospc_locked
;
1436 * Some spaces that are freed in the current transaction are pinned,
1437 * they will be added into free space cache after the transaction is
1438 * committed, we shouldn't lose them.
1440 * If this changes while we are working we'll get added back to
1441 * the dirty list and redo it. No locking needed
1443 ret
= write_pinned_extent_entries(trans
, block_group
, io_ctl
, &entries
);
1445 goto out_nospc_locked
;
1448 * At last, we write out all the bitmaps and keep cache_writeout_mutex
1449 * locked while doing it because a concurrent trim can be manipulating
1450 * or freeing the bitmap.
1452 ret
= write_bitmap_entries(io_ctl
, &bitmap_list
);
1453 spin_unlock(&ctl
->tree_lock
);
1454 mutex_unlock(&ctl
->cache_writeout_mutex
);
1458 /* Zero out the rest of the pages just to make sure */
1459 io_ctl_zero_remaining_pages(io_ctl
);
1461 /* Everything is written out, now we dirty the pages in the file. */
1462 i_size
= i_size_read(inode
);
1463 for (int i
= 0; i
< round_up(i_size
, PAGE_SIZE
) / PAGE_SIZE
; i
++) {
1464 u64 dirty_start
= i
* PAGE_SIZE
;
1465 u64 dirty_len
= min_t(u64
, dirty_start
+ PAGE_SIZE
, i_size
) - dirty_start
;
1467 ret
= btrfs_dirty_folio(BTRFS_I(inode
), page_folio(io_ctl
->pages
[i
]),
1468 dirty_start
, dirty_len
, &cached_state
, false);
1473 if (block_group
&& (block_group
->flags
& BTRFS_BLOCK_GROUP_DATA
))
1474 up_write(&block_group
->data_rwsem
);
	/*
	 * Release the pages and unlock the extent, we will flush
	 * them out later.
	 */
1479 io_ctl_drop_pages(io_ctl
);
1480 io_ctl_free(io_ctl
);
1482 unlock_extent(&BTRFS_I(inode
)->io_tree
, 0, i_size_read(inode
) - 1,
1486 * at this point the pages are under IO and we're happy,
1487 * The caller is responsible for waiting on them and updating
1488 * the cache and the inode
1490 io_ctl
->entries
= entries
;
1491 io_ctl
->bitmaps
= bitmaps
;
1493 ret
= btrfs_fdatawrite_range(BTRFS_I(inode
), 0, (u64
)-1);
1500 cleanup_bitmap_list(&bitmap_list
);
1501 spin_unlock(&ctl
->tree_lock
);
1502 mutex_unlock(&ctl
->cache_writeout_mutex
);
1505 cleanup_write_cache_enospc(inode
, io_ctl
, &cached_state
);
1508 if (block_group
&& (block_group
->flags
& BTRFS_BLOCK_GROUP_DATA
))
1509 up_write(&block_group
->data_rwsem
);
1512 io_ctl
->inode
= NULL
;
1513 io_ctl_free(io_ctl
);
1515 invalidate_inode_pages2(inode
->i_mapping
);
1516 BTRFS_I(inode
)->generation
= 0;
1518 btrfs_update_inode(trans
, BTRFS_I(inode
));
1524 int btrfs_write_out_cache(struct btrfs_trans_handle
*trans
,
1525 struct btrfs_block_group
*block_group
,
1526 struct btrfs_path
*path
)
1528 struct btrfs_fs_info
*fs_info
= trans
->fs_info
;
1529 struct btrfs_free_space_ctl
*ctl
= block_group
->free_space_ctl
;
1530 struct inode
*inode
;
1533 spin_lock(&block_group
->lock
);
1534 if (block_group
->disk_cache_state
< BTRFS_DC_SETUP
) {
1535 spin_unlock(&block_group
->lock
);
1538 spin_unlock(&block_group
->lock
);
1540 inode
= lookup_free_space_inode(block_group
, path
);
1544 ret
= __btrfs_write_out_cache(inode
, ctl
, block_group
,
1545 &block_group
->io_ctl
, trans
);
1547 btrfs_debug(fs_info
,
1548 "failed to write free space cache for block group %llu error %d",
1549 block_group
->start
, ret
);
1550 spin_lock(&block_group
->lock
);
1551 block_group
->disk_cache_state
= BTRFS_DC_ERROR
;
1552 spin_unlock(&block_group
->lock
);
1554 block_group
->io_ctl
.inode
= NULL
;
1559 * if ret == 0 the caller is expected to call btrfs_wait_cache_io
1560 * to wait for IO and put the inode
static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit,
					  u64 offset)
{
	ASSERT(offset >= bitmap_start);
	offset -= bitmap_start;
	return (unsigned long)(div_u64(offset, unit));
}

static inline unsigned long bytes_to_bits(u64 bytes, u32 unit)
{
	return (unsigned long)(div_u64(bytes, unit));
}
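
/*
 * Example for the helpers above: with ctl->unit == 4096, an offset 12KiB
 * past bitmap_start maps to bit 3, and a 64KiB length covers 16 bits.
 */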
static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
				   u64 offset)
{
	u64 bitmap_start;
	u64 bytes_per_bitmap;

	bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit;
	bitmap_start = offset - ctl->start;
	bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
	bitmap_start *= bytes_per_bitmap;
	bitmap_start += ctl->start;

	return bitmap_start;
}
1594 static int tree_insert_offset(struct btrfs_free_space_ctl
*ctl
,
1595 struct btrfs_free_cluster
*cluster
,
1596 struct btrfs_free_space
*new_entry
)
1598 struct rb_root
*root
;
1600 struct rb_node
*parent
= NULL
;
1602 lockdep_assert_held(&ctl
->tree_lock
);
1605 lockdep_assert_held(&cluster
->lock
);
1606 root
= &cluster
->root
;
1608 root
= &ctl
->free_space_offset
;
1614 struct btrfs_free_space
*info
;
1617 info
= rb_entry(parent
, struct btrfs_free_space
, offset_index
);
1619 if (new_entry
->offset
< info
->offset
) {
1621 } else if (new_entry
->offset
> info
->offset
) {
1622 p
= &(*p
)->rb_right
;
			/*
			 * We could have a bitmap entry and an extent entry
			 * share the same offset. If this is the case, we want
			 * the extent entry to always be found first if we do a
			 * linear search through the tree, since we want to have
			 * the quickest allocation time, and allocating from an
			 * extent is faster than allocating from a bitmap. So
			 * if we're inserting a bitmap and we find an entry at
			 * this offset, we want to go right, or after this entry
			 * logically. If we are inserting an extent and we've
			 * found a bitmap, we want to go left, or before
			 * logically.
			 */
1637 if (new_entry
->bitmap
) {
1642 p
= &(*p
)->rb_right
;
1644 if (!info
->bitmap
) {
1653 rb_link_node(&new_entry
->offset_index
, parent
, p
);
1654 rb_insert_color(&new_entry
->offset_index
, root
);
/*
 * This is a little subtle. We *only* have ->max_extent_size set if we actually
 * searched through the bitmap and figured out the largest ->max_extent_size,
 * otherwise it's 0. In the case that it's 0 we don't want to tell the
 * allocator the wrong thing, we want to use the actual real max_extent_size
 * we've found already if it's larger, or we want to use ->bytes.
 *
 * This matters because find_free_space() will skip entries whose ->bytes is
 * less than the required bytes. So if we didn't search down this bitmap, we
 * may pick some previous entry that has a smaller ->max_extent_size than we
 * have. For example, assume we have two entries, one that has
 * ->max_extent_size set to 4K and ->bytes set to 1M. A second entry hasn't set
 * ->max_extent_size yet, has ->bytes set to 8K and it's contiguous. We will
 * call into find_free_space(), and return with max_extent_size == 4K, because
 * that first bitmap entry had ->max_extent_size set, but the second one did
 * not. If instead we returned 8K we'd come in searching for 8K, and find the
 * 8K contiguous range.
 *
 * Consider the other case, we have 2 8K chunks in that second entry and still
 * don't have ->max_extent_size set. We'll return 16K, and the next time the
 * allocator comes in it'll fully search our second bitmap, and this time it'll
 * get an uptodate value of 8K as the maximum chunk size. Then we'll get the
 * right allocation the next loop through.
 */
static inline u64 get_max_extent_size(const struct btrfs_free_space *entry)
{
	if (entry->bitmap && entry->max_extent_size)
		return entry->max_extent_size;
	return entry->bytes;
}
/*
 * We want the largest entry to be leftmost, so this is inverted from what you'd
 * normally expect.
 */
static bool entry_less(struct rb_node *node, const struct rb_node *parent)
{
	const struct btrfs_free_space *entry, *exist;

	entry = rb_entry(node, struct btrfs_free_space, bytes_index);
	exist = rb_entry(parent, struct btrfs_free_space, bytes_index);
	return get_max_extent_size(exist) < get_max_extent_size(entry);
}
/*
 * searches the tree for the given offset.
 *
 * fuzzy - If this is set, then we are trying to make an allocation, and we just
 * want a section that has at least bytes size and comes at or after the given
 * offset.
 */
static struct btrfs_free_space *
tree_search_offset(struct btrfs_free_space_ctl *ctl,
		   u64 offset, int bitmap_only, int fuzzy)
{
1714 struct rb_node
*n
= ctl
->free_space_offset
.rb_node
;
1715 struct btrfs_free_space
*entry
= NULL
, *prev
= NULL
;
1717 lockdep_assert_held(&ctl
->tree_lock
);
1719 /* find entry that is closest to the 'offset' */
1721 entry
= rb_entry(n
, struct btrfs_free_space
, offset_index
);
1724 if (offset
< entry
->offset
)
1726 else if (offset
> entry
->offset
)
1741 * bitmap entry and extent entry may share same offset,
1742 * in that case, bitmap entry comes after extent entry.
1747 entry
= rb_entry(n
, struct btrfs_free_space
, offset_index
);
1748 if (entry
->offset
!= offset
)
1751 WARN_ON(!entry
->bitmap
);
1754 if (entry
->bitmap
) {
1756 * if previous extent entry covers the offset,
1757 * we should return it instead of the bitmap entry
1759 n
= rb_prev(&entry
->offset_index
);
1761 prev
= rb_entry(n
, struct btrfs_free_space
,
1763 if (!prev
->bitmap
&&
1764 prev
->offset
+ prev
->bytes
> offset
)
1774 /* find last entry before the 'offset' */
1776 if (entry
->offset
> offset
) {
1777 n
= rb_prev(&entry
->offset_index
);
1779 entry
= rb_entry(n
, struct btrfs_free_space
,
1781 ASSERT(entry
->offset
<= offset
);
1790 if (entry
->bitmap
) {
1791 n
= rb_prev(&entry
->offset_index
);
1793 prev
= rb_entry(n
, struct btrfs_free_space
,
1795 if (!prev
->bitmap
&&
1796 prev
->offset
+ prev
->bytes
> offset
)
1799 if (entry
->offset
+ BITS_PER_BITMAP
* ctl
->unit
> offset
)
1801 } else if (entry
->offset
+ entry
->bytes
> offset
)
1808 n
= rb_next(&entry
->offset_index
);
1811 entry
= rb_entry(n
, struct btrfs_free_space
, offset_index
);
1812 if (entry
->bitmap
) {
1813 if (entry
->offset
+ BITS_PER_BITMAP
*
1817 if (entry
->offset
+ entry
->bytes
> offset
)
1824 static inline void unlink_free_space(struct btrfs_free_space_ctl
*ctl
,
1825 struct btrfs_free_space
*info
,
1828 lockdep_assert_held(&ctl
->tree_lock
);
1830 rb_erase(&info
->offset_index
, &ctl
->free_space_offset
);
1831 rb_erase_cached(&info
->bytes_index
, &ctl
->free_space_bytes
);
1832 ctl
->free_extents
--;
1834 if (!info
->bitmap
&& !btrfs_free_space_trimmed(info
)) {
1835 ctl
->discardable_extents
[BTRFS_STAT_CURR
]--;
1836 ctl
->discardable_bytes
[BTRFS_STAT_CURR
] -= info
->bytes
;
1840 ctl
->free_space
-= info
->bytes
;
1843 static int link_free_space(struct btrfs_free_space_ctl
*ctl
,
1844 struct btrfs_free_space
*info
)
1848 lockdep_assert_held(&ctl
->tree_lock
);
1850 ASSERT(info
->bytes
|| info
->bitmap
);
1851 ret
= tree_insert_offset(ctl
, NULL
, info
);
1855 rb_add_cached(&info
->bytes_index
, &ctl
->free_space_bytes
, entry_less
);
1857 if (!info
->bitmap
&& !btrfs_free_space_trimmed(info
)) {
1858 ctl
->discardable_extents
[BTRFS_STAT_CURR
]++;
1859 ctl
->discardable_bytes
[BTRFS_STAT_CURR
] += info
->bytes
;
1862 ctl
->free_space
+= info
->bytes
;
1863 ctl
->free_extents
++;
1867 static void relink_bitmap_entry(struct btrfs_free_space_ctl
*ctl
,
1868 struct btrfs_free_space
*info
)
1870 ASSERT(info
->bitmap
);
1873 * If our entry is empty it's because we're on a cluster and we don't
1874 * want to re-link it into our ctl bytes index.
1876 if (RB_EMPTY_NODE(&info
->bytes_index
))
1879 lockdep_assert_held(&ctl
->tree_lock
);
1881 rb_erase_cached(&info
->bytes_index
, &ctl
->free_space_bytes
);
1882 rb_add_cached(&info
->bytes_index
, &ctl
->free_space_bytes
, entry_less
);
1885 static inline void bitmap_clear_bits(struct btrfs_free_space_ctl
*ctl
,
1886 struct btrfs_free_space
*info
,
1887 u64 offset
, u64 bytes
, bool update_stat
)
1889 unsigned long start
, count
, end
;
1890 int extent_delta
= -1;
1892 start
= offset_to_bit(info
->offset
, ctl
->unit
, offset
);
1893 count
= bytes_to_bits(bytes
, ctl
->unit
);
1894 end
= start
+ count
;
1895 ASSERT(end
<= BITS_PER_BITMAP
);
1897 bitmap_clear(info
->bitmap
, start
, count
);
1899 info
->bytes
-= bytes
;
1900 if (info
->max_extent_size
> ctl
->unit
)
1901 info
->max_extent_size
= 0;
1903 relink_bitmap_entry(ctl
, info
);
1905 if (start
&& test_bit(start
- 1, info
->bitmap
))
1908 if (end
< BITS_PER_BITMAP
&& test_bit(end
, info
->bitmap
))
1911 info
->bitmap_extents
+= extent_delta
;
1912 if (!btrfs_free_space_trimmed(info
)) {
1913 ctl
->discardable_extents
[BTRFS_STAT_CURR
] += extent_delta
;
1914 ctl
->discardable_bytes
[BTRFS_STAT_CURR
] -= bytes
;
1918 ctl
->free_space
-= bytes
;
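
/*
 * btrfs_bitmap_set_bits() below mirrors bitmap_clear_bits(): both adjust
 * bitmap_extents by testing whether the bits neighbouring the affected
 * range are already set, so the count of discrete free extents inside the
 * bitmap stays correct.
 */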
1921 static void btrfs_bitmap_set_bits(struct btrfs_free_space_ctl
*ctl
,
1922 struct btrfs_free_space
*info
, u64 offset
,
1925 unsigned long start
, count
, end
;
1926 int extent_delta
= 1;
1928 start
= offset_to_bit(info
->offset
, ctl
->unit
, offset
);
1929 count
= bytes_to_bits(bytes
, ctl
->unit
);
1930 end
= start
+ count
;
1931 ASSERT(end
<= BITS_PER_BITMAP
);
1933 bitmap_set(info
->bitmap
, start
, count
);
	/*
	 * We set some bytes, we have no idea what the max extent size is
	 * anymore.
	 */
1939 info
->max_extent_size
= 0;
1940 info
->bytes
+= bytes
;
1941 ctl
->free_space
+= bytes
;
1943 relink_bitmap_entry(ctl
, info
);
1945 if (start
&& test_bit(start
- 1, info
->bitmap
))
1948 if (end
< BITS_PER_BITMAP
&& test_bit(end
, info
->bitmap
))
1951 info
->bitmap_extents
+= extent_delta
;
1952 if (!btrfs_free_space_trimmed(info
)) {
1953 ctl
->discardable_extents
[BTRFS_STAT_CURR
] += extent_delta
;
1954 ctl
->discardable_bytes
[BTRFS_STAT_CURR
] += bytes
;
/*
 * If we cannot find a suitable extent, we will use bytes to record
 * the size of the max extent.
 */
1962 static int search_bitmap(struct btrfs_free_space_ctl
*ctl
,
1963 struct btrfs_free_space
*bitmap_info
, u64
*offset
,
1964 u64
*bytes
, bool for_alloc
)
1966 unsigned long found_bits
= 0;
1967 unsigned long max_bits
= 0;
1968 unsigned long bits
, i
;
1969 unsigned long next_zero
;
1970 unsigned long extent_bits
;
1973 * Skip searching the bitmap if we don't have a contiguous section that
1974 * is large enough for this allocation.
1977 bitmap_info
->max_extent_size
&&
1978 bitmap_info
->max_extent_size
< *bytes
) {
1979 *bytes
= bitmap_info
->max_extent_size
;
1983 i
= offset_to_bit(bitmap_info
->offset
, ctl
->unit
,
1984 max_t(u64
, *offset
, bitmap_info
->offset
));
1985 bits
= bytes_to_bits(*bytes
, ctl
->unit
);
1987 for_each_set_bit_from(i
, bitmap_info
->bitmap
, BITS_PER_BITMAP
) {
1988 if (for_alloc
&& bits
== 1) {
1992 next_zero
= find_next_zero_bit(bitmap_info
->bitmap
,
1993 BITS_PER_BITMAP
, i
);
1994 extent_bits
= next_zero
- i
;
1995 if (extent_bits
>= bits
) {
1996 found_bits
= extent_bits
;
1998 } else if (extent_bits
> max_bits
) {
1999 max_bits
= extent_bits
;
2005 *offset
= (u64
)(i
* ctl
->unit
) + bitmap_info
->offset
;
2006 *bytes
= (u64
)(found_bits
) * ctl
->unit
;
2010 *bytes
= (u64
)(max_bits
) * ctl
->unit
;
2011 bitmap_info
->max_extent_size
= *bytes
;
2012 relink_bitmap_entry(ctl
, bitmap_info
);
2016 /* Cache the size of the max extent in bytes */
2017 static struct btrfs_free_space
*
2018 find_free_space(struct btrfs_free_space_ctl
*ctl
, u64
*offset
, u64
*bytes
,
2019 unsigned long align
, u64
*max_extent_size
, bool use_bytes_index
)
2021 struct btrfs_free_space
*entry
;
2022 struct rb_node
*node
;
2027 if (!ctl
->free_space_offset
.rb_node
)
2030 if (use_bytes_index
) {
2031 node
= rb_first_cached(&ctl
->free_space_bytes
);
2033 entry
= tree_search_offset(ctl
, offset_to_bitmap(ctl
, *offset
),
2037 node
= &entry
->offset_index
;
2040 for (; node
; node
= rb_next(node
)) {
2041 if (use_bytes_index
)
2042 entry
= rb_entry(node
, struct btrfs_free_space
,
2045 entry
= rb_entry(node
, struct btrfs_free_space
,
2049 * If we are using the bytes index then all subsequent entries
2050 * in this tree are going to be < bytes, so simply set the max
2051 * extent size and exit the loop.
2053 * If we're using the offset index then we need to keep going
2054 * through the rest of the tree.
2056 if (entry
->bytes
< *bytes
) {
2057 *max_extent_size
= max(get_max_extent_size(entry
),
2059 if (use_bytes_index
)
2064 /* make sure the space returned is big enough
2065 * to match our requested alignment
2067 if (*bytes
>= align
) {
2068 tmp
= entry
->offset
- ctl
->start
+ align
- 1;
2069 tmp
= div64_u64(tmp
, align
);
2070 tmp
= tmp
* align
+ ctl
->start
;
2071 align_off
= tmp
- entry
->offset
;
2074 tmp
= entry
->offset
;
2078 * We don't break here if we're using the bytes index because we
2079 * may have another entry that has the correct alignment that is
2080 * the right size, so we don't want to miss that possibility.
2081 * At worst this adds another loop through the logic, but if we
2082 * broke here we could prematurely ENOSPC.
2084 if (entry
->bytes
< *bytes
+ align_off
) {
2085 *max_extent_size
= max(get_max_extent_size(entry
),
2090 if (entry
->bitmap
) {
2091 struct rb_node
*old_next
= rb_next(node
);
2094 ret
= search_bitmap(ctl
, entry
, &tmp
, &size
, true);
2101 max(get_max_extent_size(entry
),
			/*
			 * The bitmap may have gotten re-arranged in the space
			 * index here because the max_extent_size may have been
			 * updated.  Start from the beginning again if this
			 * happened.
			 */
2111 if (use_bytes_index
&& old_next
!= rb_next(node
))
2117 *bytes
= entry
->bytes
- align_off
;
2124 static void add_new_bitmap(struct btrfs_free_space_ctl
*ctl
,
2125 struct btrfs_free_space
*info
, u64 offset
)
2127 info
->offset
= offset_to_bitmap(ctl
, offset
);
2129 info
->bitmap_extents
= 0;
2130 INIT_LIST_HEAD(&info
->list
);
2131 link_free_space(ctl
, info
);
2132 ctl
->total_bitmaps
++;
2133 recalculate_thresholds(ctl
);
2136 static void free_bitmap(struct btrfs_free_space_ctl
*ctl
,
2137 struct btrfs_free_space
*bitmap_info
)
2140 * Normally when this is called, the bitmap is completely empty. However,
2141 * if we are blowing up the free space cache for one reason or another
2142 * via __btrfs_remove_free_space_cache(), then it may not be freed and
2143 * we may leave stats on the table.
2145 if (bitmap_info
->bytes
&& !btrfs_free_space_trimmed(bitmap_info
)) {
2146 ctl
->discardable_extents
[BTRFS_STAT_CURR
] -=
2147 bitmap_info
->bitmap_extents
;
2148 ctl
->discardable_bytes
[BTRFS_STAT_CURR
] -= bitmap_info
->bytes
;
2151 unlink_free_space(ctl
, bitmap_info
, true);
2152 kmem_cache_free(btrfs_free_space_bitmap_cachep
, bitmap_info
->bitmap
);
2153 kmem_cache_free(btrfs_free_space_cachep
, bitmap_info
);
2154 ctl
->total_bitmaps
--;
2155 recalculate_thresholds(ctl
);
2158 static noinline
int remove_from_bitmap(struct btrfs_free_space_ctl
*ctl
,
2159 struct btrfs_free_space
*bitmap_info
,
2160 u64
*offset
, u64
*bytes
)
2163 u64 search_start
, search_bytes
;
2167 end
= bitmap_info
->offset
+ (u64
)(BITS_PER_BITMAP
* ctl
->unit
) - 1;
	/*
	 * We need to search for bits in this bitmap. We could only cover some
	 * of the extent in this bitmap thanks to how we add space, so we need
	 * to search for as much of it as we can and clear that amount, and then
	 * go searching for the next bit.
	 */
2175 search_start
= *offset
;
2176 search_bytes
= ctl
->unit
;
2177 search_bytes
= min(search_bytes
, end
- search_start
+ 1);
2178 ret
= search_bitmap(ctl
, bitmap_info
, &search_start
, &search_bytes
,
2180 if (ret
< 0 || search_start
!= *offset
)
2183 /* We may have found more bits than what we need */
2184 search_bytes
= min(search_bytes
, *bytes
);
2186 /* Cannot clear past the end of the bitmap */
2187 search_bytes
= min(search_bytes
, end
- search_start
+ 1);
2189 bitmap_clear_bits(ctl
, bitmap_info
, search_start
, search_bytes
, true);
2190 *offset
+= search_bytes
;
2191 *bytes
-= search_bytes
;
2194 struct rb_node
*next
= rb_next(&bitmap_info
->offset_index
);
2195 if (!bitmap_info
->bytes
)
2196 free_bitmap(ctl
, bitmap_info
);
2199 * no entry after this bitmap, but we still have bytes to
2200 * remove, so something has gone wrong.
2205 bitmap_info
= rb_entry(next
, struct btrfs_free_space
,
2209 * if the next entry isn't a bitmap we need to return to let the
2210 * extent stuff do its work.
2212 if (!bitmap_info
->bitmap
)
2216 * Ok the next item is a bitmap, but it may not actually hold
2217 * the information for the rest of this free space stuff, so
2218 * look for it, and if we don't find it return so we can try
2219 * everything over again.
2221 search_start
= *offset
;
2222 search_bytes
= ctl
->unit
;
2223 ret
= search_bitmap(ctl
, bitmap_info
, &search_start
,
2224 &search_bytes
, false);
2225 if (ret
< 0 || search_start
!= *offset
)
2229 } else if (!bitmap_info
->bytes
)
2230 free_bitmap(ctl
, bitmap_info
);
2235 static u64
add_bytes_to_bitmap(struct btrfs_free_space_ctl
*ctl
,
2236 struct btrfs_free_space
*info
, u64 offset
,
2237 u64 bytes
, enum btrfs_trim_state trim_state
)
2239 u64 bytes_to_set
= 0;
2243 * This is a tradeoff to make bitmap trim state minimal. We mark the
2244 * whole bitmap untrimmed if at any point we add untrimmed regions.
2246 if (trim_state
== BTRFS_TRIM_STATE_UNTRIMMED
) {
2247 if (btrfs_free_space_trimmed(info
)) {
2248 ctl
->discardable_extents
[BTRFS_STAT_CURR
] +=
2249 info
->bitmap_extents
;
2250 ctl
->discardable_bytes
[BTRFS_STAT_CURR
] += info
->bytes
;
2252 info
->trim_state
= BTRFS_TRIM_STATE_UNTRIMMED
;
2255 end
= info
->offset
+ (u64
)(BITS_PER_BITMAP
* ctl
->unit
);
2257 bytes_to_set
= min(end
- offset
, bytes
);
2259 btrfs_bitmap_set_bits(ctl
, info
, offset
, bytes_to_set
);
2261 return bytes_to_set
;
2265 static bool use_bitmap(struct btrfs_free_space_ctl
*ctl
,
2266 struct btrfs_free_space
*info
)
2268 struct btrfs_block_group
*block_group
= ctl
->block_group
;
2269 struct btrfs_fs_info
*fs_info
= block_group
->fs_info
;
2270 bool forced
= false;
2272 #ifdef CONFIG_BTRFS_DEBUG
2273 if (btrfs_should_fragment_free_space(block_group
))
2277 /* This is a way to reclaim large regions from the bitmaps. */
2278 if (!forced
&& info
->bytes
>= FORCE_EXTENT_THRESHOLD
)
2282 * If we are below the extents threshold then we can add this as an
2283 * extent, and don't have to deal with the bitmap
2285 if (!forced
&& ctl
->free_extents
< ctl
->extents_thresh
) {
		/*
		 * If this block group has some small extents we don't want to
		 * use up all of our free slots in the cache with them, we want
		 * to reserve them for larger extents.  However, if we have
		 * plenty of cache left then go ahead and add them, no sense in
		 * adding the overhead of a bitmap if we don't have to.
		 */
2293 if (info
->bytes
<= fs_info
->sectorsize
* 8) {
2294 if (ctl
->free_extents
* 3 <= ctl
->extents_thresh
)
2302 * The original block groups from mkfs can be really small, like 8
2303 * megabytes, so don't bother with a bitmap for those entries. However
2304 * some block groups can be smaller than what a bitmap would cover but
2305 * are still large enough that they could overflow the 32k memory limit,
2306 * so allow those block groups to still be allowed to have a bitmap
2309 if (((BITS_PER_BITMAP
* ctl
->unit
) >> 1) > block_group
->length
)
2315 static const struct btrfs_free_space_op free_space_op
= {
2316 .use_bitmap
= use_bitmap
,
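
/*
 * Illustrative summary of the policy above (assuming a 4KiB sectorsize; these
 * concrete numbers are not spelled out elsewhere): extents of at least
 * FORCE_EXTENT_THRESHOLD (1MiB) never go into a bitmap.  While free_extents
 * is below extents_thresh, extents larger than sectorsize * 8 = 32KiB stay as
 * extent entries, and extents of 32KiB or less stay as extent entries only
 * while under a third of the slots are in use.  Everything else is steered
 * into a bitmap, except in block groups smaller than half of one bitmap's
 * coverage.
 */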
static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_space *info)
{
	struct btrfs_free_space *bitmap_info;
	struct btrfs_block_group *block_group = NULL;
	int added = 0;
	u64 bytes, offset, bytes_added;
	enum btrfs_trim_state trim_state;
	int ret;

	bytes = info->bytes;
	offset = info->offset;
	trim_state = info->trim_state;

	if (!ctl->op->use_bitmap(ctl, info))
		return 0;

	if (ctl->op == &free_space_op)
		block_group = ctl->block_group;
again:
	/*
	 * Since we link bitmaps right into the cluster we need to see if we
	 * have a cluster here, and if so and it has our bitmap we need to add
	 * the free space to that bitmap.
	 */
	if (block_group && !list_empty(&block_group->cluster_list)) {
		struct btrfs_free_cluster *cluster;
		struct rb_node *node;
		struct btrfs_free_space *entry;

		cluster = list_entry(block_group->cluster_list.next,
				     struct btrfs_free_cluster,
				     block_group_list);
		spin_lock(&cluster->lock);
		node = rb_first(&cluster->root);
		if (!node) {
			spin_unlock(&cluster->lock);
			goto no_cluster_bitmap;
		}

		entry = rb_entry(node, struct btrfs_free_space, offset_index);
		if (!entry->bitmap) {
			spin_unlock(&cluster->lock);
			goto no_cluster_bitmap;
		}

		if (entry->offset == offset_to_bitmap(ctl, offset)) {
			bytes_added = add_bytes_to_bitmap(ctl, entry, offset,
							  bytes, trim_state);
			bytes -= bytes_added;
			offset += bytes_added;
		}
		spin_unlock(&cluster->lock);
		if (!bytes) {
			ret = 1;
			goto out;
		}
	}

no_cluster_bitmap:
	bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
					 1, 0);
	if (!bitmap_info) {
		ASSERT(added == 0);
		goto new_bitmap;
	}

	bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
					  trim_state);
	bytes -= bytes_added;
	offset += bytes_added;
	added = 0;

	if (!bytes) {
		ret = 1;
		goto out;
	} else
		goto again;

new_bitmap:
	if (info && info->bitmap) {
		add_new_bitmap(ctl, info, offset);
		added = 1;
		info = NULL;
		goto again;
	} else {
		spin_unlock(&ctl->tree_lock);

		/* no pre-allocated info, allocate a new one */
		if (!info) {
			info = kmem_cache_zalloc(btrfs_free_space_cachep,
						 GFP_NOFS);
			if (!info) {
				spin_lock(&ctl->tree_lock);
				ret = -ENOMEM;
				goto out;
			}
		}

		/* allocate the bitmap */
		info->bitmap = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep,
						 GFP_NOFS);
		info->trim_state = BTRFS_TRIM_STATE_TRIMMED;
		spin_lock(&ctl->tree_lock);
		if (!info->bitmap) {
			ret = -ENOMEM;
			goto out;
		}
		goto again;
	}

out:
	if (info) {
		if (info->bitmap)
			kmem_cache_free(btrfs_free_space_bitmap_cachep,
					info->bitmap);
		kmem_cache_free(btrfs_free_space_cachep, info);
	}

	return ret;
}
/*
 * Free space merging rules:
 *  1) Merge trimmed areas together
 *  2) Let untrimmed areas coalesce with trimmed areas
 *  3) Always pull neighboring regions from bitmaps
 *
 * The above rules are for when we merge free space based on btrfs_trim_state.
 * Rules 2 and 3 are subtle because they are suboptimal, but are done for the
 * same reason: to promote larger extent regions which makes life easier for
 * find_free_extent().  Rule 2 enables coalescing based on the common path
 * being returning free space from btrfs_finish_extent_commit().  So when free
 * space is trimmed, this prevents aggregating the newly trimmed region with
 * untrimmed regions in the rb_tree.  Rule 3 is purely to obtain larger extents
 * and provide find_free_extent() with the largest extents possible hoping for
 * the reuse path.
 */
static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
				 struct btrfs_free_space *info, bool update_stat)
{
	struct btrfs_free_space *left_info = NULL;
	struct btrfs_free_space *right_info;
	bool merged = false;
	u64 offset = info->offset;
	u64 bytes = info->bytes;
	const bool is_trimmed = btrfs_free_space_trimmed(info);
	struct rb_node *right_prev = NULL;

	/*
	 * first we want to see if there is free space adjacent to the range we
	 * are adding, if there is remove that struct and add a new one to
	 * cover the entire range
	 */
	right_info = tree_search_offset(ctl, offset + bytes, 0, 0);
	if (right_info)
		right_prev = rb_prev(&right_info->offset_index);

	if (right_prev)
		left_info = rb_entry(right_prev, struct btrfs_free_space, offset_index);
	else if (!right_info)
		left_info = tree_search_offset(ctl, offset - 1, 0, 0);

	/* See try_merge_free_space() comment. */
	if (right_info && !right_info->bitmap &&
	    (!is_trimmed || btrfs_free_space_trimmed(right_info))) {
		unlink_free_space(ctl, right_info, update_stat);
		info->bytes += right_info->bytes;
		kmem_cache_free(btrfs_free_space_cachep, right_info);
		merged = true;
	}

	/* See try_merge_free_space() comment. */
	if (left_info && !left_info->bitmap &&
	    left_info->offset + left_info->bytes == offset &&
	    (!is_trimmed || btrfs_free_space_trimmed(left_info))) {
		unlink_free_space(ctl, left_info, update_stat);
		info->offset = left_info->offset;
		info->bytes += left_info->bytes;
		kmem_cache_free(btrfs_free_space_cachep, left_info);
		merged = true;
	}

	return merged;
}
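
/*
 * Example (hypothetical offsets): adding a trimmed extent [64M, 65M) while
 * trimmed extent entries [63M, 64M) and [65M, 66M) already exist collapses
 * all three into a single [63M, 66M) entry.  If one of those neighbours were
 * untrimmed it would be left unmerged, since absorbing it into a trimmed
 * entry would wrongly report it as trimmed.
 */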
static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl,
				     struct btrfs_free_space *info,
				     bool update_stat)
{
	struct btrfs_free_space *bitmap;
	unsigned long i;
	unsigned long j;
	const u64 end = info->offset + info->bytes;
	const u64 bitmap_offset = offset_to_bitmap(ctl, end);
	u64 bytes;

	bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
	if (!bitmap)
		return false;

	i = offset_to_bit(bitmap->offset, ctl->unit, end);
	j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i);
	if (j == i)
		return false;
	bytes = (j - i) * ctl->unit;
	info->bytes += bytes;

	/* See try_merge_free_space() comment. */
	if (!btrfs_free_space_trimmed(bitmap))
		info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;

	bitmap_clear_bits(ctl, bitmap, end, bytes, update_stat);

	if (!bitmap->bytes)
		free_bitmap(ctl, bitmap);

	return true;
}
static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl,
				       struct btrfs_free_space *info,
				       bool update_stat)
{
	struct btrfs_free_space *bitmap;
	u64 bitmap_offset;
	unsigned long i;
	unsigned long j;
	unsigned long prev_j;
	u64 bytes;

	bitmap_offset = offset_to_bitmap(ctl, info->offset);
	/* If we're on a boundary, try the previous logical bitmap. */
	if (bitmap_offset == info->offset) {
		if (info->offset == 0)
			return false;
		bitmap_offset = offset_to_bitmap(ctl, info->offset - 1);
	}

	bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
	if (!bitmap)
		return false;

	i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1;
	j = 0;
	prev_j = (unsigned long)-1;
	/* Find the last clear bit that is not past bit i. */
	for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) {
		if (j > i)
			break;
		prev_j = j;
	}
	/* The bit right before our extent is clear, nothing to steal. */
	if (prev_j == i)
		return false;

	if (prev_j == (unsigned long)-1)
		bytes = (i + 1) * ctl->unit;
	else
		bytes = (i - prev_j) * ctl->unit;

	info->offset -= bytes;
	info->bytes += bytes;

	/* See try_merge_free_space() comment. */
	if (!btrfs_free_space_trimmed(bitmap))
		info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;

	bitmap_clear_bits(ctl, bitmap, info->offset, bytes, update_stat);

	if (!bitmap->bytes)
		free_bitmap(ctl, bitmap);

	return true;
}
/*
 * We prefer always to allocate from extent entries, both for clustered and
 * non-clustered allocation requests.  So when attempting to add a new extent
 * entry, try to see if there's adjacent free space in bitmap entries, and if
 * there is, migrate that space from the bitmaps to the extent.
 * Like this we get better chances of satisfying space allocation requests
 * because we attempt to satisfy them based on a single cache entry, and never
 * on 2 or more entries - even if the entries represent a contiguous free space
 * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry
 * ends).
 */
static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_space *info,
			      bool update_stat)
{
	/*
	 * Only work with disconnected entries, as we can change their offset,
	 * and must be extent entries.
	 */
	ASSERT(!info->bitmap);
	ASSERT(RB_EMPTY_NODE(&info->offset_index));

	if (ctl->total_bitmaps > 0) {
		bool stole_end;
		bool stole_front = false;

		stole_end = steal_from_bitmap_to_end(ctl, info, update_stat);
		if (ctl->total_bitmaps > 0)
			stole_front = steal_from_bitmap_to_front(ctl, info,
								 update_stat);

		if (stole_end || stole_front)
			try_merge_free_space(ctl, info, update_stat);
	}
}
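
/*
 * Example (hypothetical layout): if an extent entry ends right where a bitmap
 * with its leading bits set begins, steal_from_bitmap_to_end() clears those
 * bits and grows the extent entry instead, so a later allocation can be
 * served from one entry rather than an extent entry plus a bitmap entry.
 */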
static int __btrfs_add_free_space(struct btrfs_block_group *block_group,
				  u64 offset, u64 bytes,
				  enum btrfs_trim_state trim_state)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct btrfs_free_space *info;
	int ret = 0;
	u64 filter_bytes = bytes;

	ASSERT(!btrfs_is_zoned(fs_info));

	info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
	if (!info)
		return -ENOMEM;

	info->offset = offset;
	info->bytes = bytes;
	info->trim_state = trim_state;
	RB_CLEAR_NODE(&info->offset_index);
	RB_CLEAR_NODE(&info->bytes_index);

	spin_lock(&ctl->tree_lock);

	if (try_merge_free_space(ctl, info, true))
		goto link;

	/*
	 * If there was no extent directly to the left or right of this new
	 * extent then we know we're going to have to allocate a new extent,
	 * so before we do that see if we need to drop this into a bitmap.
	 */
	ret = insert_into_bitmap(ctl, info);
	if (ret < 0) {
		goto out;
	} else if (ret) {
		ret = 0;
		goto out;
	}
link:
	/*
	 * Only steal free space from adjacent bitmaps if we're sure we're not
	 * going to add the new free space to existing bitmap entries - because
	 * that would mean unnecessary work that would be reverted.  Therefore
	 * attempt to steal space from bitmaps if we're adding an extent entry.
	 */
	steal_from_bitmap(ctl, info, true);

	filter_bytes = max(filter_bytes, info->bytes);

	ret = link_free_space(ctl, info);
	if (ret)
		kmem_cache_free(btrfs_free_space_cachep, info);
out:
	btrfs_discard_update_discardable(block_group);
	spin_unlock(&ctl->tree_lock);

	if (ret) {
		btrfs_crit(fs_info, "unable to add free space :%d", ret);
		ASSERT(ret != -EEXIST);
	}

	if (trim_state != BTRFS_TRIM_STATE_TRIMMED) {
		btrfs_discard_check_filter(block_group, filter_bytes);
		btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
	}

	return ret;
}
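
/*
 * Note on the ordering above: a freed range is first merged with adjacent
 * extent entries, then offered to a bitmap via insert_into_bitmap(), and only
 * when it stays an extent entry do we steal neighbouring bits from bitmaps
 * before linking it.  Ranges that are not already trimmed are finally handed
 * to the async discard machinery.
 */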
static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
					u64 bytenr, u64 size, bool used)
{
	struct btrfs_space_info *sinfo = block_group->space_info;
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	u64 offset = bytenr - block_group->start;
	u64 to_free, to_unusable;
	int bg_reclaim_threshold = 0;
	bool initial;
	u64 reclaimable_unusable;

	spin_lock(&block_group->lock);

	initial = ((size == block_group->length) && (block_group->alloc_offset == 0));
	WARN_ON(!initial && offset + size > block_group->zone_capacity);
	if (!initial)
		bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold);

	if (!used)
		to_free = size;
	else if (initial)
		to_free = block_group->zone_capacity;
	else if (offset >= block_group->alloc_offset)
		to_free = size;
	else if (offset + size <= block_group->alloc_offset)
		to_free = 0;
	else
		to_free = offset + size - block_group->alloc_offset;
	to_unusable = size - to_free;

	spin_lock(&ctl->tree_lock);
	ctl->free_space += to_free;
	spin_unlock(&ctl->tree_lock);
	/*
	 * If the block group is read-only, we should account freed space into
	 * bytes_readonly.
	 */
	if (!block_group->ro) {
		block_group->zone_unusable += to_unusable;
		WARN_ON(block_group->zone_unusable > block_group->length);
	}
	if (!used)
		block_group->alloc_offset -= size;

	reclaimable_unusable = block_group->zone_unusable -
			       (block_group->length - block_group->zone_capacity);
	/* All the region is now unusable.  Mark it as unused and reclaim. */
	if (block_group->zone_unusable == block_group->length) {
		btrfs_mark_bg_unused(block_group);
	} else if (bg_reclaim_threshold &&
		   reclaimable_unusable >=
		   mult_perc(block_group->zone_capacity, bg_reclaim_threshold)) {
		btrfs_mark_bg_to_reclaim(block_group);
	}

	spin_unlock(&block_group->lock);

	return 0;
}
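
/*
 * Worked example (hypothetical values): with alloc_offset = 10MiB, freeing a
 * used 4MiB range at offset 8MiB gives to_free = 8M + 4M - 10M = 2MiB (the
 * portion at or beyond the write pointer) and to_unusable = 2MiB, which is
 * accounted as zone_unusable instead of being added to the free space
 * counter.
 */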
int btrfs_add_free_space(struct btrfs_block_group *block_group,
			 u64 bytenr, u64 size)
{
	enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;

	if (btrfs_is_zoned(block_group->fs_info))
		return __btrfs_add_free_space_zoned(block_group, bytenr, size,
						    true);

	if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC))
		trim_state = BTRFS_TRIM_STATE_TRIMMED;

	return __btrfs_add_free_space(block_group, bytenr, size, trim_state);
}

int btrfs_add_free_space_unused(struct btrfs_block_group *block_group,
				u64 bytenr, u64 size)
{
	if (btrfs_is_zoned(block_group->fs_info))
		return __btrfs_add_free_space_zoned(block_group, bytenr, size,
						    false);

	return btrfs_add_free_space(block_group, bytenr, size);
}

/*
 * This is a subtle distinction because when adding free space back in general,
 * we want it to be added as untrimmed for async.  But in the case where we add
 * it on loading of a block group, we want to consider it trimmed.
 */
int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group,
				       u64 bytenr, u64 size)
{
	enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;

	if (btrfs_is_zoned(block_group->fs_info))
		return __btrfs_add_free_space_zoned(block_group, bytenr, size,
						    true);

	if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC) ||
	    btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
		trim_state = BTRFS_TRIM_STATE_TRIMMED;

	return __btrfs_add_free_space(block_group, bytenr, size, trim_state);
}
2806 int btrfs_remove_free_space(struct btrfs_block_group
*block_group
,
2807 u64 offset
, u64 bytes
)
2809 struct btrfs_free_space_ctl
*ctl
= block_group
->free_space_ctl
;
2810 struct btrfs_free_space
*info
;
2812 bool re_search
= false;
2814 if (btrfs_is_zoned(block_group
->fs_info
)) {
2816 * This can happen with conventional zones when replaying log.
2817 * Since the allocation info of tree-log nodes are not recorded
2818 * to the extent-tree, calculate_alloc_pointer() failed to
2819 * advance the allocation pointer after last allocated tree log
2822 * This function is called from
2823 * btrfs_pin_extent_for_log_replay() when replaying the log.
2824 * Advance the pointer not to overwrite the tree-log nodes.
2826 if (block_group
->start
+ block_group
->alloc_offset
<
2828 block_group
->alloc_offset
=
2829 offset
+ bytes
- block_group
->start
;
2834 spin_lock(&ctl
->tree_lock
);
2841 info
= tree_search_offset(ctl
, offset
, 0, 0);
2844 * oops didn't find an extent that matched the space we wanted
2845 * to remove, look for a bitmap instead
2847 info
= tree_search_offset(ctl
, offset_to_bitmap(ctl
, offset
),
2851 * If we found a partial bit of our free space in a
2852 * bitmap but then couldn't find the other part this may
2853 * be a problem, so WARN about it.
2861 if (!info
->bitmap
) {
2862 unlink_free_space(ctl
, info
, true);
2863 if (offset
== info
->offset
) {
2864 u64 to_free
= min(bytes
, info
->bytes
);
2866 info
->bytes
-= to_free
;
2867 info
->offset
+= to_free
;
2869 ret
= link_free_space(ctl
, info
);
2872 kmem_cache_free(btrfs_free_space_cachep
, info
);
2879 u64 old_end
= info
->bytes
+ info
->offset
;
2881 info
->bytes
= offset
- info
->offset
;
2882 ret
= link_free_space(ctl
, info
);
2887 /* Not enough bytes in this entry to satisfy us */
2888 if (old_end
< offset
+ bytes
) {
2889 bytes
-= old_end
- offset
;
2892 } else if (old_end
== offset
+ bytes
) {
2896 spin_unlock(&ctl
->tree_lock
);
2898 ret
= __btrfs_add_free_space(block_group
,
2900 old_end
- (offset
+ bytes
),
2907 ret
= remove_from_bitmap(ctl
, info
, &offset
, &bytes
);
2908 if (ret
== -EAGAIN
) {
2913 btrfs_discard_update_discardable(block_group
);
2914 spin_unlock(&ctl
->tree_lock
);
void btrfs_dump_free_space(struct btrfs_block_group *block_group,
			   u64 bytes)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct btrfs_free_space *info;
	struct rb_node *n;
	int count = 0;

	/*
	 * Zoned btrfs does not use the free space tree or clusters.  Just
	 * print out the free space after the allocation offset.
	 */
	if (btrfs_is_zoned(fs_info)) {
		btrfs_info(fs_info, "free space %llu active %d",
			   block_group->zone_capacity - block_group->alloc_offset,
			   test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
				    &block_group->runtime_flags));
		return;
	}

	spin_lock(&ctl->tree_lock);
	for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
		info = rb_entry(n, struct btrfs_free_space, offset_index);
		if (info->bytes >= bytes && !block_group->ro)
			count++;
		btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s",
			   info->offset, info->bytes, str_yes_no(info->bitmap));
	}
	spin_unlock(&ctl->tree_lock);
	btrfs_info(fs_info, "block group has cluster?: %s",
		   str_no_yes(list_empty(&block_group->cluster_list)));
	btrfs_info(fs_info,
		   "%d free space entries at or bigger than %llu bytes",
		   count, bytes);
}
void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group,
			       struct btrfs_free_space_ctl *ctl)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;

	spin_lock_init(&ctl->tree_lock);
	ctl->unit = fs_info->sectorsize;
	ctl->start = block_group->start;
	ctl->block_group = block_group;
	ctl->op = &free_space_op;
	ctl->free_space_bytes = RB_ROOT_CACHED;
	INIT_LIST_HEAD(&ctl->trimming_ranges);
	mutex_init(&ctl->cache_writeout_mutex);

	/*
	 * We only want to have 32k of RAM per block group for keeping track
	 * of free space, and if we pass 1/2 of that we want to start
	 * converting things over to using bitmaps.
	 */
	ctl->extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space);
}
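
/*
 * Note: extents_thresh is half of the 32KiB per-block-group budget divided by
 * the size of a struct btrfs_free_space, so the exact count is architecture
 * dependent; once that many extent entries exist, use_bitmap() starts
 * steering newly added space into bitmaps.
 */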
/*
 * For a given cluster, put all of its extents back into the free space
 * cache.  If the block group passed doesn't match the block group pointed to
 * by the cluster, someone else raced in and freed the cluster already.  In
 * that case, we just return without changing anything.
 */
static void __btrfs_return_cluster_to_free_space(
			     struct btrfs_block_group *block_group,
			     struct btrfs_free_cluster *cluster)
{
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct rb_node *node;

	lockdep_assert_held(&ctl->tree_lock);

	spin_lock(&cluster->lock);
	if (cluster->block_group != block_group) {
		spin_unlock(&cluster->lock);
		return;
	}

	cluster->block_group = NULL;
	cluster->window_start = 0;
	list_del_init(&cluster->block_group_list);

	node = rb_first(&cluster->root);
	while (node) {
		struct btrfs_free_space *entry;

		entry = rb_entry(node, struct btrfs_free_space, offset_index);
		node = rb_next(&entry->offset_index);
		rb_erase(&entry->offset_index, &cluster->root);
		RB_CLEAR_NODE(&entry->offset_index);

		if (!entry->bitmap) {
			/* Merging treats extents as if they were new */
			if (!btrfs_free_space_trimmed(entry)) {
				ctl->discardable_extents[BTRFS_STAT_CURR]--;
				ctl->discardable_bytes[BTRFS_STAT_CURR] -=
					entry->bytes;
			}

			try_merge_free_space(ctl, entry, false);
			steal_from_bitmap(ctl, entry, false);

			/* As we insert directly, update these statistics */
			if (!btrfs_free_space_trimmed(entry)) {
				ctl->discardable_extents[BTRFS_STAT_CURR]++;
				ctl->discardable_bytes[BTRFS_STAT_CURR] +=
					entry->bytes;
			}
		}
		tree_insert_offset(ctl, NULL, entry);
		rb_add_cached(&entry->bytes_index, &ctl->free_space_bytes,
			      entry_less);
	}
	cluster->root = RB_ROOT;
	spin_unlock(&cluster->lock);
	btrfs_put_block_group(block_group);
}
void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group)
{
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct btrfs_free_cluster *cluster;
	struct list_head *head;

	spin_lock(&ctl->tree_lock);
	while ((head = block_group->cluster_list.next) !=
	       &block_group->cluster_list) {
		cluster = list_entry(head, struct btrfs_free_cluster,
				     block_group_list);

		WARN_ON(cluster->block_group != block_group);
		__btrfs_return_cluster_to_free_space(block_group, cluster);

		cond_resched_lock(&ctl->tree_lock);
	}
	__btrfs_remove_free_space_cache(ctl);
	btrfs_discard_update_discardable(block_group);
	spin_unlock(&ctl->tree_lock);
}
/*
 * Walk @block_group's free space rb_tree to determine if everything is trimmed.
 */
bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group)
{
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct btrfs_free_space *info;
	struct rb_node *node;
	bool ret = true;

	spin_lock(&ctl->tree_lock);
	node = rb_first(&ctl->free_space_offset);

	while (node) {
		info = rb_entry(node, struct btrfs_free_space, offset_index);

		if (!btrfs_free_space_trimmed(info)) {
			ret = false;
			break;
		}

		node = rb_next(node);
	}

	spin_unlock(&ctl->tree_lock);
	return ret;
}
3090 u64
btrfs_find_space_for_alloc(struct btrfs_block_group
*block_group
,
3091 u64 offset
, u64 bytes
, u64 empty_size
,
3092 u64
*max_extent_size
)
3094 struct btrfs_free_space_ctl
*ctl
= block_group
->free_space_ctl
;
3095 struct btrfs_discard_ctl
*discard_ctl
=
3096 &block_group
->fs_info
->discard_ctl
;
3097 struct btrfs_free_space
*entry
= NULL
;
3098 u64 bytes_search
= bytes
+ empty_size
;
3101 u64 align_gap_len
= 0;
3102 enum btrfs_trim_state align_gap_trim_state
= BTRFS_TRIM_STATE_UNTRIMMED
;
3103 bool use_bytes_index
= (offset
== block_group
->start
);
3105 ASSERT(!btrfs_is_zoned(block_group
->fs_info
));
3107 spin_lock(&ctl
->tree_lock
);
3108 entry
= find_free_space(ctl
, &offset
, &bytes_search
,
3109 block_group
->full_stripe_len
, max_extent_size
,
3115 if (entry
->bitmap
) {
3116 bitmap_clear_bits(ctl
, entry
, offset
, bytes
, true);
3118 if (!btrfs_free_space_trimmed(entry
))
3119 atomic64_add(bytes
, &discard_ctl
->discard_bytes_saved
);
3122 free_bitmap(ctl
, entry
);
3124 unlink_free_space(ctl
, entry
, true);
3125 align_gap_len
= offset
- entry
->offset
;
3126 align_gap
= entry
->offset
;
3127 align_gap_trim_state
= entry
->trim_state
;
3129 if (!btrfs_free_space_trimmed(entry
))
3130 atomic64_add(bytes
, &discard_ctl
->discard_bytes_saved
);
3132 entry
->offset
= offset
+ bytes
;
3133 WARN_ON(entry
->bytes
< bytes
+ align_gap_len
);
3135 entry
->bytes
-= bytes
+ align_gap_len
;
3137 kmem_cache_free(btrfs_free_space_cachep
, entry
);
3139 link_free_space(ctl
, entry
);
3142 btrfs_discard_update_discardable(block_group
);
3143 spin_unlock(&ctl
->tree_lock
);
3146 __btrfs_add_free_space(block_group
, align_gap
, align_gap_len
,
3147 align_gap_trim_state
);
3152 * given a cluster, put all of its extents back into the free space
3153 * cache. If a block group is passed, this function will only free
3154 * a cluster that belongs to the passed block group.
3156 * Otherwise, it'll get a reference on the block group pointed to by the
3157 * cluster and remove the cluster from it.
3159 void btrfs_return_cluster_to_free_space(
3160 struct btrfs_block_group
*block_group
,
3161 struct btrfs_free_cluster
*cluster
)
3163 struct btrfs_free_space_ctl
*ctl
;
3165 /* first, get a safe pointer to the block group */
3166 spin_lock(&cluster
->lock
);
3168 block_group
= cluster
->block_group
;
3170 spin_unlock(&cluster
->lock
);
3173 } else if (cluster
->block_group
!= block_group
) {
3174 /* someone else has already freed it don't redo their work */
3175 spin_unlock(&cluster
->lock
);
3178 btrfs_get_block_group(block_group
);
3179 spin_unlock(&cluster
->lock
);
3181 ctl
= block_group
->free_space_ctl
;
3183 /* now return any extents the cluster had on it */
3184 spin_lock(&ctl
->tree_lock
);
3185 __btrfs_return_cluster_to_free_space(block_group
, cluster
);
3186 spin_unlock(&ctl
->tree_lock
);
3188 btrfs_discard_queue_work(&block_group
->fs_info
->discard_ctl
, block_group
);
3190 /* finally drop our ref */
3191 btrfs_put_block_group(block_group
);
3194 static u64
btrfs_alloc_from_bitmap(struct btrfs_block_group
*block_group
,
3195 struct btrfs_free_cluster
*cluster
,
3196 struct btrfs_free_space
*entry
,
3197 u64 bytes
, u64 min_start
,
3198 u64
*max_extent_size
)
3200 struct btrfs_free_space_ctl
*ctl
= block_group
->free_space_ctl
;
3202 u64 search_start
= cluster
->window_start
;
3203 u64 search_bytes
= bytes
;
3206 search_start
= min_start
;
3207 search_bytes
= bytes
;
3209 err
= search_bitmap(ctl
, entry
, &search_start
, &search_bytes
, true);
3211 *max_extent_size
= max(get_max_extent_size(entry
),
3217 bitmap_clear_bits(ctl
, entry
, ret
, bytes
, false);
3223 * given a cluster, try to allocate 'bytes' from it, returns 0
3224 * if it couldn't find anything suitably large, or a logical disk offset
3225 * if things worked out
3227 u64
btrfs_alloc_from_cluster(struct btrfs_block_group
*block_group
,
3228 struct btrfs_free_cluster
*cluster
, u64 bytes
,
3229 u64 min_start
, u64
*max_extent_size
)
3231 struct btrfs_free_space_ctl
*ctl
= block_group
->free_space_ctl
;
3232 struct btrfs_discard_ctl
*discard_ctl
=
3233 &block_group
->fs_info
->discard_ctl
;
3234 struct btrfs_free_space
*entry
= NULL
;
3235 struct rb_node
*node
;
3238 ASSERT(!btrfs_is_zoned(block_group
->fs_info
));
3240 spin_lock(&cluster
->lock
);
3241 if (bytes
> cluster
->max_size
)
3244 if (cluster
->block_group
!= block_group
)
3247 node
= rb_first(&cluster
->root
);
3251 entry
= rb_entry(node
, struct btrfs_free_space
, offset_index
);
3253 if (entry
->bytes
< bytes
)
3254 *max_extent_size
= max(get_max_extent_size(entry
),
3257 if (entry
->bytes
< bytes
||
3258 (!entry
->bitmap
&& entry
->offset
< min_start
)) {
3259 node
= rb_next(&entry
->offset_index
);
3262 entry
= rb_entry(node
, struct btrfs_free_space
,
3267 if (entry
->bitmap
) {
3268 ret
= btrfs_alloc_from_bitmap(block_group
,
3269 cluster
, entry
, bytes
,
3270 cluster
->window_start
,
3273 node
= rb_next(&entry
->offset_index
);
3276 entry
= rb_entry(node
, struct btrfs_free_space
,
3280 cluster
->window_start
+= bytes
;
3282 ret
= entry
->offset
;
3284 entry
->offset
+= bytes
;
3285 entry
->bytes
-= bytes
;
3291 spin_unlock(&cluster
->lock
);
3296 spin_lock(&ctl
->tree_lock
);
3298 if (!btrfs_free_space_trimmed(entry
))
3299 atomic64_add(bytes
, &discard_ctl
->discard_bytes_saved
);
3301 ctl
->free_space
-= bytes
;
3302 if (!entry
->bitmap
&& !btrfs_free_space_trimmed(entry
))
3303 ctl
->discardable_bytes
[BTRFS_STAT_CURR
] -= bytes
;
3305 spin_lock(&cluster
->lock
);
3306 if (entry
->bytes
== 0) {
3307 rb_erase(&entry
->offset_index
, &cluster
->root
);
3308 ctl
->free_extents
--;
3309 if (entry
->bitmap
) {
3310 kmem_cache_free(btrfs_free_space_bitmap_cachep
,
3312 ctl
->total_bitmaps
--;
3313 recalculate_thresholds(ctl
);
3314 } else if (!btrfs_free_space_trimmed(entry
)) {
3315 ctl
->discardable_extents
[BTRFS_STAT_CURR
]--;
3317 kmem_cache_free(btrfs_free_space_cachep
, entry
);
3320 spin_unlock(&cluster
->lock
);
3321 spin_unlock(&ctl
->tree_lock
);
3326 static int btrfs_bitmap_cluster(struct btrfs_block_group
*block_group
,
3327 struct btrfs_free_space
*entry
,
3328 struct btrfs_free_cluster
*cluster
,
3329 u64 offset
, u64 bytes
,
3330 u64 cont1_bytes
, u64 min_bytes
)
3332 struct btrfs_free_space_ctl
*ctl
= block_group
->free_space_ctl
;
3333 unsigned long next_zero
;
3335 unsigned long want_bits
;
3336 unsigned long min_bits
;
3337 unsigned long found_bits
;
3338 unsigned long max_bits
= 0;
3339 unsigned long start
= 0;
3340 unsigned long total_found
= 0;
3343 lockdep_assert_held(&ctl
->tree_lock
);
3345 i
= offset_to_bit(entry
->offset
, ctl
->unit
,
3346 max_t(u64
, offset
, entry
->offset
));
3347 want_bits
= bytes_to_bits(bytes
, ctl
->unit
);
3348 min_bits
= bytes_to_bits(min_bytes
, ctl
->unit
);
3351 * Don't bother looking for a cluster in this bitmap if it's heavily
3354 if (entry
->max_extent_size
&&
3355 entry
->max_extent_size
< cont1_bytes
)
3359 for_each_set_bit_from(i
, entry
->bitmap
, BITS_PER_BITMAP
) {
3360 next_zero
= find_next_zero_bit(entry
->bitmap
,
3361 BITS_PER_BITMAP
, i
);
3362 if (next_zero
- i
>= min_bits
) {
3363 found_bits
= next_zero
- i
;
3364 if (found_bits
> max_bits
)
3365 max_bits
= found_bits
;
3368 if (next_zero
- i
> max_bits
)
3369 max_bits
= next_zero
- i
;
3374 entry
->max_extent_size
= (u64
)max_bits
* ctl
->unit
;
3380 cluster
->max_size
= 0;
3383 total_found
+= found_bits
;
3385 if (cluster
->max_size
< found_bits
* ctl
->unit
)
3386 cluster
->max_size
= found_bits
* ctl
->unit
;
3388 if (total_found
< want_bits
|| cluster
->max_size
< cont1_bytes
) {
3393 cluster
->window_start
= start
* ctl
->unit
+ entry
->offset
;
3394 rb_erase(&entry
->offset_index
, &ctl
->free_space_offset
);
3395 rb_erase_cached(&entry
->bytes_index
, &ctl
->free_space_bytes
);
3398 * We need to know if we're currently on the normal space index when we
3399 * manipulate the bitmap so that we know we need to remove and re-insert
3400 * it into the space_index tree. Clear the bytes_index node here so the
3401 * bitmap manipulation helpers know not to mess with the space_index
3402 * until this bitmap entry is added back into the normal cache.
3404 RB_CLEAR_NODE(&entry
->bytes_index
);
3406 ret
= tree_insert_offset(ctl
, cluster
, entry
);
3407 ASSERT(!ret
); /* -EEXIST; Logic error */
3409 trace_btrfs_setup_cluster(block_group
, cluster
,
3410 total_found
* ctl
->unit
, 1);
3415 * This searches the block group for just extents to fill the cluster with.
3416 * Try to find a cluster with at least bytes total bytes, at least one
3417 * extent of cont1_bytes, and other clusters of at least min_bytes.
3420 setup_cluster_no_bitmap(struct btrfs_block_group
*block_group
,
3421 struct btrfs_free_cluster
*cluster
,
3422 struct list_head
*bitmaps
, u64 offset
, u64 bytes
,
3423 u64 cont1_bytes
, u64 min_bytes
)
3425 struct btrfs_free_space_ctl
*ctl
= block_group
->free_space_ctl
;
3426 struct btrfs_free_space
*first
= NULL
;
3427 struct btrfs_free_space
*entry
= NULL
;
3428 struct btrfs_free_space
*last
;
3429 struct rb_node
*node
;
3434 lockdep_assert_held(&ctl
->tree_lock
);
3436 entry
= tree_search_offset(ctl
, offset
, 0, 1);
3441 * We don't want bitmaps, so just move along until we find a normal
3444 while (entry
->bitmap
|| entry
->bytes
< min_bytes
) {
3445 if (entry
->bitmap
&& list_empty(&entry
->list
))
3446 list_add_tail(&entry
->list
, bitmaps
);
3447 node
= rb_next(&entry
->offset_index
);
3450 entry
= rb_entry(node
, struct btrfs_free_space
, offset_index
);
3453 window_free
= entry
->bytes
;
3454 max_extent
= entry
->bytes
;
3458 for (node
= rb_next(&entry
->offset_index
); node
;
3459 node
= rb_next(&entry
->offset_index
)) {
3460 entry
= rb_entry(node
, struct btrfs_free_space
, offset_index
);
3462 if (entry
->bitmap
) {
3463 if (list_empty(&entry
->list
))
3464 list_add_tail(&entry
->list
, bitmaps
);
3468 if (entry
->bytes
< min_bytes
)
3472 window_free
+= entry
->bytes
;
3473 if (entry
->bytes
> max_extent
)
3474 max_extent
= entry
->bytes
;
3477 if (window_free
< bytes
|| max_extent
< cont1_bytes
)
3480 cluster
->window_start
= first
->offset
;
3482 node
= &first
->offset_index
;
3485 * now we've found our entries, pull them out of the free space
3486 * cache and put them into the cluster rbtree
3491 entry
= rb_entry(node
, struct btrfs_free_space
, offset_index
);
3492 node
= rb_next(&entry
->offset_index
);
3493 if (entry
->bitmap
|| entry
->bytes
< min_bytes
)
3496 rb_erase(&entry
->offset_index
, &ctl
->free_space_offset
);
3497 rb_erase_cached(&entry
->bytes_index
, &ctl
->free_space_bytes
);
3498 ret
= tree_insert_offset(ctl
, cluster
, entry
);
3499 total_size
+= entry
->bytes
;
3500 ASSERT(!ret
); /* -EEXIST; Logic error */
3501 } while (node
&& entry
!= last
);
3503 cluster
->max_size
= max_extent
;
3504 trace_btrfs_setup_cluster(block_group
, cluster
, total_size
, 0);
3509 * This specifically looks for bitmaps that may work in the cluster, we assume
3510 * that we have already failed to find extents that will work.
3513 setup_cluster_bitmap(struct btrfs_block_group
*block_group
,
3514 struct btrfs_free_cluster
*cluster
,
3515 struct list_head
*bitmaps
, u64 offset
, u64 bytes
,
3516 u64 cont1_bytes
, u64 min_bytes
)
3518 struct btrfs_free_space_ctl
*ctl
= block_group
->free_space_ctl
;
3519 struct btrfs_free_space
*entry
= NULL
;
3521 u64 bitmap_offset
= offset_to_bitmap(ctl
, offset
);
3523 if (ctl
->total_bitmaps
== 0)
3527 * The bitmap that covers offset won't be in the list unless offset
3528 * is just its start offset.
3530 if (!list_empty(bitmaps
))
3531 entry
= list_first_entry(bitmaps
, struct btrfs_free_space
, list
);
3533 if (!entry
|| entry
->offset
!= bitmap_offset
) {
3534 entry
= tree_search_offset(ctl
, bitmap_offset
, 1, 0);
3535 if (entry
&& list_empty(&entry
->list
))
3536 list_add(&entry
->list
, bitmaps
);
3539 list_for_each_entry(entry
, bitmaps
, list
) {
3540 if (entry
->bytes
< bytes
)
3542 ret
= btrfs_bitmap_cluster(block_group
, entry
, cluster
, offset
,
3543 bytes
, cont1_bytes
, min_bytes
);
3549 * The bitmaps list has all the bitmaps that record free space
3550 * starting after offset, so no more search is required.
3556 * here we try to find a cluster of blocks in a block group. The goal
3557 * is to find at least bytes+empty_size.
3558 * We might not find them all in one contiguous area.
3560 * returns zero and sets up cluster if things worked out, otherwise
3561 * it returns -enospc
3563 int btrfs_find_space_cluster(struct btrfs_block_group
*block_group
,
3564 struct btrfs_free_cluster
*cluster
,
3565 u64 offset
, u64 bytes
, u64 empty_size
)
3567 struct btrfs_fs_info
*fs_info
= block_group
->fs_info
;
3568 struct btrfs_free_space_ctl
*ctl
= block_group
->free_space_ctl
;
3569 struct btrfs_free_space
*entry
, *tmp
;
3576 * Choose the minimum extent size we'll require for this
3577 * cluster. For SSD_SPREAD, don't allow any fragmentation.
3578 * For metadata, allow allocates with smaller extents. For
3579 * data, keep it dense.
3581 if (btrfs_test_opt(fs_info
, SSD_SPREAD
)) {
3582 cont1_bytes
= bytes
+ empty_size
;
3583 min_bytes
= cont1_bytes
;
3584 } else if (block_group
->flags
& BTRFS_BLOCK_GROUP_METADATA
) {
3585 cont1_bytes
= bytes
;
3586 min_bytes
= fs_info
->sectorsize
;
3588 cont1_bytes
= max(bytes
, (bytes
+ empty_size
) >> 2);
3589 min_bytes
= fs_info
->sectorsize
;
3592 spin_lock(&ctl
->tree_lock
);
3595 * If we know we don't have enough space to make a cluster don't even
3596 * bother doing all the work to try and find one.
3598 if (ctl
->free_space
< bytes
) {
3599 spin_unlock(&ctl
->tree_lock
);
3603 spin_lock(&cluster
->lock
);
3605 /* someone already found a cluster, hooray */
3606 if (cluster
->block_group
) {
3611 trace_btrfs_find_cluster(block_group
, offset
, bytes
, empty_size
,
3614 ret
= setup_cluster_no_bitmap(block_group
, cluster
, &bitmaps
, offset
,
3616 cont1_bytes
, min_bytes
);
3618 ret
= setup_cluster_bitmap(block_group
, cluster
, &bitmaps
,
3619 offset
, bytes
+ empty_size
,
3620 cont1_bytes
, min_bytes
);
3622 /* Clear our temporary list */
3623 list_for_each_entry_safe(entry
, tmp
, &bitmaps
, list
)
3624 list_del_init(&entry
->list
);
3627 btrfs_get_block_group(block_group
);
3628 list_add_tail(&cluster
->block_group_list
,
3629 &block_group
->cluster_list
);
3630 cluster
->block_group
= block_group
;
3632 trace_btrfs_failed_cluster_setup(block_group
);
3635 spin_unlock(&cluster
->lock
);
3636 spin_unlock(&ctl
->tree_lock
);
/*
 * Simple code to zero out a cluster.
 */
void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
{
	spin_lock_init(&cluster->lock);
	spin_lock_init(&cluster->refill_lock);
	cluster->root = RB_ROOT;
	cluster->max_size = 0;
	cluster->fragmented = false;
	INIT_LIST_HEAD(&cluster->block_group_list);
	cluster->block_group = NULL;
}
3655 static int do_trimming(struct btrfs_block_group
*block_group
,
3656 u64
*total_trimmed
, u64 start
, u64 bytes
,
3657 u64 reserved_start
, u64 reserved_bytes
,
3658 enum btrfs_trim_state reserved_trim_state
,
3659 struct btrfs_trim_range
*trim_entry
)
3661 struct btrfs_space_info
*space_info
= block_group
->space_info
;
3662 struct btrfs_fs_info
*fs_info
= block_group
->fs_info
;
3663 struct btrfs_free_space_ctl
*ctl
= block_group
->free_space_ctl
;
3666 const u64 end
= start
+ bytes
;
3667 const u64 reserved_end
= reserved_start
+ reserved_bytes
;
3668 enum btrfs_trim_state trim_state
= BTRFS_TRIM_STATE_UNTRIMMED
;
3671 spin_lock(&space_info
->lock
);
3672 spin_lock(&block_group
->lock
);
3673 if (!block_group
->ro
) {
3674 block_group
->reserved
+= reserved_bytes
;
3675 space_info
->bytes_reserved
+= reserved_bytes
;
3678 spin_unlock(&block_group
->lock
);
3679 spin_unlock(&space_info
->lock
);
3681 ret
= btrfs_discard_extent(fs_info
, start
, bytes
, &trimmed
);
3683 *total_trimmed
+= trimmed
;
3684 trim_state
= BTRFS_TRIM_STATE_TRIMMED
;
3687 mutex_lock(&ctl
->cache_writeout_mutex
);
3688 if (reserved_start
< start
)
3689 __btrfs_add_free_space(block_group
, reserved_start
,
3690 start
- reserved_start
,
3691 reserved_trim_state
);
3692 if (end
< reserved_end
)
3693 __btrfs_add_free_space(block_group
, end
, reserved_end
- end
,
3694 reserved_trim_state
);
3695 __btrfs_add_free_space(block_group
, start
, bytes
, trim_state
);
3696 list_del(&trim_entry
->list
);
3697 mutex_unlock(&ctl
->cache_writeout_mutex
);
3700 spin_lock(&space_info
->lock
);
3701 spin_lock(&block_group
->lock
);
3702 if (block_group
->ro
)
3703 space_info
->bytes_readonly
+= reserved_bytes
;
3704 block_group
->reserved
-= reserved_bytes
;
3705 space_info
->bytes_reserved
-= reserved_bytes
;
3706 spin_unlock(&block_group
->lock
);
3707 spin_unlock(&space_info
->lock
);
3714 * If @async is set, then we will trim 1 region and return.
3716 static int trim_no_bitmap(struct btrfs_block_group
*block_group
,
3717 u64
*total_trimmed
, u64 start
, u64 end
, u64 minlen
,
3720 struct btrfs_discard_ctl
*discard_ctl
=
3721 &block_group
->fs_info
->discard_ctl
;
3722 struct btrfs_free_space_ctl
*ctl
= block_group
->free_space_ctl
;
3723 struct btrfs_free_space
*entry
;
3724 struct rb_node
*node
;
3728 enum btrfs_trim_state extent_trim_state
;
3730 const u64 max_discard_size
= READ_ONCE(discard_ctl
->max_discard_size
);
3732 while (start
< end
) {
3733 struct btrfs_trim_range trim_entry
;
3735 mutex_lock(&ctl
->cache_writeout_mutex
);
3736 spin_lock(&ctl
->tree_lock
);
3738 if (ctl
->free_space
< minlen
)
3741 entry
= tree_search_offset(ctl
, start
, 0, 1);
3745 /* Skip bitmaps and if async, already trimmed entries */
3746 while (entry
->bitmap
||
3747 (async
&& btrfs_free_space_trimmed(entry
))) {
3748 node
= rb_next(&entry
->offset_index
);
3751 entry
= rb_entry(node
, struct btrfs_free_space
,
3755 if (entry
->offset
>= end
)
3758 extent_start
= entry
->offset
;
3759 extent_bytes
= entry
->bytes
;
3760 extent_trim_state
= entry
->trim_state
;
3762 start
= entry
->offset
;
3763 bytes
= entry
->bytes
;
3764 if (bytes
< minlen
) {
3765 spin_unlock(&ctl
->tree_lock
);
3766 mutex_unlock(&ctl
->cache_writeout_mutex
);
3769 unlink_free_space(ctl
, entry
, true);
3771 * Let bytes = BTRFS_MAX_DISCARD_SIZE + X.
3772 * If X < BTRFS_ASYNC_DISCARD_MIN_FILTER, we won't trim
3773 * X when we come back around. So trim it now.
3775 if (max_discard_size
&&
3776 bytes
>= (max_discard_size
+
3777 BTRFS_ASYNC_DISCARD_MIN_FILTER
)) {
3778 bytes
= max_discard_size
;
3779 extent_bytes
= max_discard_size
;
3780 entry
->offset
+= max_discard_size
;
3781 entry
->bytes
-= max_discard_size
;
3782 link_free_space(ctl
, entry
);
3784 kmem_cache_free(btrfs_free_space_cachep
, entry
);
3787 start
= max(start
, extent_start
);
3788 bytes
= min(extent_start
+ extent_bytes
, end
) - start
;
3789 if (bytes
< minlen
) {
3790 spin_unlock(&ctl
->tree_lock
);
3791 mutex_unlock(&ctl
->cache_writeout_mutex
);
3795 unlink_free_space(ctl
, entry
, true);
3796 kmem_cache_free(btrfs_free_space_cachep
, entry
);
3799 spin_unlock(&ctl
->tree_lock
);
3800 trim_entry
.start
= extent_start
;
3801 trim_entry
.bytes
= extent_bytes
;
3802 list_add_tail(&trim_entry
.list
, &ctl
->trimming_ranges
);
3803 mutex_unlock(&ctl
->cache_writeout_mutex
);
3805 ret
= do_trimming(block_group
, total_trimmed
, start
, bytes
,
3806 extent_start
, extent_bytes
, extent_trim_state
,
3809 block_group
->discard_cursor
= start
+ bytes
;
3814 block_group
->discard_cursor
= start
;
3815 if (async
&& *total_trimmed
)
3818 if (btrfs_trim_interrupted()) {
3829 block_group
->discard_cursor
= btrfs_block_group_end(block_group
);
3830 spin_unlock(&ctl
->tree_lock
);
3831 mutex_unlock(&ctl
->cache_writeout_mutex
);
/*
 * If we break out of trimming a bitmap prematurely, we should reset the
 * trimming bit.  In a rather contrived case, it's possible to race here so
 * reset the state to BTRFS_TRIM_STATE_UNTRIMMED.
 *
 * start = start of bitmap
 * end = near end of bitmap
 *
 * Thread 1:			Thread 2:
 * trim_bitmaps(start)
 *				trim_bitmaps(end)
 * end_trimming_bitmap()
 *				reset_trimming_bitmap()
 */
static void reset_trimming_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset)
{
	struct btrfs_free_space *entry;

	spin_lock(&ctl->tree_lock);
	entry = tree_search_offset(ctl, offset, 1, 0);
	if (entry) {
		if (btrfs_free_space_trimmed(entry)) {
			ctl->discardable_extents[BTRFS_STAT_CURR] +=
				entry->bitmap_extents;
			ctl->discardable_bytes[BTRFS_STAT_CURR] += entry->bytes;
		}
		entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
	}

	spin_unlock(&ctl->tree_lock);
}

static void end_trimming_bitmap(struct btrfs_free_space_ctl *ctl,
				struct btrfs_free_space *entry)
{
	if (btrfs_free_space_trimming_bitmap(entry)) {
		entry->trim_state = BTRFS_TRIM_STATE_TRIMMED;
		ctl->discardable_extents[BTRFS_STAT_CURR] -=
			entry->bitmap_extents;
		ctl->discardable_bytes[BTRFS_STAT_CURR] -= entry->bytes;
	}
}
3880 * If @async is set, then we will trim 1 region and return.
3882 static int trim_bitmaps(struct btrfs_block_group
*block_group
,
3883 u64
*total_trimmed
, u64 start
, u64 end
, u64 minlen
,
3884 u64 maxlen
, bool async
)
3886 struct btrfs_discard_ctl
*discard_ctl
=
3887 &block_group
->fs_info
->discard_ctl
;
3888 struct btrfs_free_space_ctl
*ctl
= block_group
->free_space_ctl
;
3889 struct btrfs_free_space
*entry
;
3893 u64 offset
= offset_to_bitmap(ctl
, start
);
3894 const u64 max_discard_size
= READ_ONCE(discard_ctl
->max_discard_size
);
3896 while (offset
< end
) {
3897 bool next_bitmap
= false;
3898 struct btrfs_trim_range trim_entry
;
3900 mutex_lock(&ctl
->cache_writeout_mutex
);
3901 spin_lock(&ctl
->tree_lock
);
3903 if (ctl
->free_space
< minlen
) {
3904 block_group
->discard_cursor
=
3905 btrfs_block_group_end(block_group
);
3906 spin_unlock(&ctl
->tree_lock
);
3907 mutex_unlock(&ctl
->cache_writeout_mutex
);
3911 entry
= tree_search_offset(ctl
, offset
, 1, 0);
3913 * Bitmaps are marked trimmed lossily now to prevent constant
3914 * discarding of the same bitmap (the reason why we are bound
3915 * by the filters). So, retrim the block group bitmaps when we
3916 * are preparing to punt to the unused_bgs list. This uses
3917 * @minlen to determine if we are in BTRFS_DISCARD_INDEX_UNUSED
3918 * which is the only discard index which sets minlen to 0.
3920 if (!entry
|| (async
&& minlen
&& start
== offset
&&
3921 btrfs_free_space_trimmed(entry
))) {
3922 spin_unlock(&ctl
->tree_lock
);
3923 mutex_unlock(&ctl
->cache_writeout_mutex
);
3929 * Async discard bitmap trimming begins at by setting the start
3930 * to be key.objectid and the offset_to_bitmap() aligns to the
3931 * start of the bitmap. This lets us know we are fully
3932 * scanning the bitmap rather than only some portion of it.
3934 if (start
== offset
)
3935 entry
->trim_state
= BTRFS_TRIM_STATE_TRIMMING
;
3938 ret2
= search_bitmap(ctl
, entry
, &start
, &bytes
, false);
3939 if (ret2
|| start
>= end
) {
3941 * We lossily consider a bitmap trimmed if we only skip
3942 * over regions <= BTRFS_ASYNC_DISCARD_MIN_FILTER.
3944 if (ret2
&& minlen
<= BTRFS_ASYNC_DISCARD_MIN_FILTER
)
3945 end_trimming_bitmap(ctl
, entry
);
3947 entry
->trim_state
= BTRFS_TRIM_STATE_UNTRIMMED
;
3948 spin_unlock(&ctl
->tree_lock
);
3949 mutex_unlock(&ctl
->cache_writeout_mutex
);
3955 * We already trimmed a region, but are using the locking above
3956 * to reset the trim_state.
3958 if (async
&& *total_trimmed
) {
3959 spin_unlock(&ctl
->tree_lock
);
3960 mutex_unlock(&ctl
->cache_writeout_mutex
);
3964 bytes
= min(bytes
, end
- start
);
3965 if (bytes
< minlen
|| (async
&& maxlen
&& bytes
> maxlen
)) {
3966 spin_unlock(&ctl
->tree_lock
);
3967 mutex_unlock(&ctl
->cache_writeout_mutex
);
3972 * Let bytes = BTRFS_MAX_DISCARD_SIZE + X.
3973 * If X < @minlen, we won't trim X when we come back around.
3974 * So trim it now. We differ here from trimming extents as we
3975 * don't keep individual state per bit.
3979 bytes
> (max_discard_size
+ minlen
))
3980 bytes
= max_discard_size
;
3982 bitmap_clear_bits(ctl
, entry
, start
, bytes
, true);
3983 if (entry
->bytes
== 0)
3984 free_bitmap(ctl
, entry
);
3986 spin_unlock(&ctl
->tree_lock
);
3987 trim_entry
.start
= start
;
3988 trim_entry
.bytes
= bytes
;
3989 list_add_tail(&trim_entry
.list
, &ctl
->trimming_ranges
);
3990 mutex_unlock(&ctl
->cache_writeout_mutex
);
3992 ret
= do_trimming(block_group
, total_trimmed
, start
, bytes
,
3993 start
, bytes
, 0, &trim_entry
);
3995 reset_trimming_bitmap(ctl
, offset
);
3996 block_group
->discard_cursor
=
3997 btrfs_block_group_end(block_group
);
4002 offset
+= BITS_PER_BITMAP
* ctl
->unit
;
4007 block_group
->discard_cursor
= start
;
4009 if (btrfs_trim_interrupted()) {
4010 if (start
!= offset
)
4011 reset_trimming_bitmap(ctl
, offset
);
4020 block_group
->discard_cursor
= end
;
4026 int btrfs_trim_block_group(struct btrfs_block_group
*block_group
,
4027 u64
*trimmed
, u64 start
, u64 end
, u64 minlen
)
4029 struct btrfs_free_space_ctl
*ctl
= block_group
->free_space_ctl
;
4033 ASSERT(!btrfs_is_zoned(block_group
->fs_info
));
4037 spin_lock(&block_group
->lock
);
4038 if (test_bit(BLOCK_GROUP_FLAG_REMOVED
, &block_group
->runtime_flags
)) {
4039 spin_unlock(&block_group
->lock
);
4042 btrfs_freeze_block_group(block_group
);
4043 spin_unlock(&block_group
->lock
);
4045 ret
= trim_no_bitmap(block_group
, trimmed
, start
, end
, minlen
, false);
4049 ret
= trim_bitmaps(block_group
, trimmed
, start
, end
, minlen
, 0, false);
4050 div64_u64_rem(end
, BITS_PER_BITMAP
* ctl
->unit
, &rem
);
4051 /* If we ended in the middle of a bitmap, reset the trimming flag */
4053 reset_trimming_bitmap(ctl
, offset_to_bitmap(ctl
, end
));
4055 btrfs_unfreeze_block_group(block_group
);
4059 int btrfs_trim_block_group_extents(struct btrfs_block_group
*block_group
,
4060 u64
*trimmed
, u64 start
, u64 end
, u64 minlen
,
4067 spin_lock(&block_group
->lock
);
4068 if (test_bit(BLOCK_GROUP_FLAG_REMOVED
, &block_group
->runtime_flags
)) {
4069 spin_unlock(&block_group
->lock
);
4072 btrfs_freeze_block_group(block_group
);
4073 spin_unlock(&block_group
->lock
);
4075 ret
= trim_no_bitmap(block_group
, trimmed
, start
, end
, minlen
, async
);
4076 btrfs_unfreeze_block_group(block_group
);
4081 int btrfs_trim_block_group_bitmaps(struct btrfs_block_group
*block_group
,
4082 u64
*trimmed
, u64 start
, u64 end
, u64 minlen
,
4083 u64 maxlen
, bool async
)
4089 spin_lock(&block_group
->lock
);
4090 if (test_bit(BLOCK_GROUP_FLAG_REMOVED
, &block_group
->runtime_flags
)) {
4091 spin_unlock(&block_group
->lock
);
4094 btrfs_freeze_block_group(block_group
);
4095 spin_unlock(&block_group
->lock
);
4097 ret
= trim_bitmaps(block_group
, trimmed
, start
, end
, minlen
, maxlen
,
4100 btrfs_unfreeze_block_group(block_group
);
bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info)
{
	return btrfs_super_cache_generation(fs_info->super_copy);
}

static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info,
				       struct btrfs_trans_handle *trans)
{
	struct btrfs_block_group *block_group;
	struct rb_node *node;
	int ret = 0;

	btrfs_info(fs_info, "cleaning free space cache v1");

	node = rb_first_cached(&fs_info->block_group_cache_tree);
	while (node) {
		block_group = rb_entry(node, struct btrfs_block_group, cache_node);
		ret = btrfs_remove_free_space_inode(trans, NULL, block_group);
		if (ret)
			goto out;
		node = rb_next(node);
	}
out:
	return ret;
}

int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active)
{
	struct btrfs_trans_handle *trans;
	int ret = 0;

	/*
	 * update_super_roots will appropriately set or unset
	 * super_copy->cache_generation based on SPACE_CACHE and
	 * BTRFS_FS_CLEANUP_SPACE_CACHE_V1.  For this reason, we need a
	 * transaction commit whether we are enabling space cache v1 and don't
	 * have any other work to do, or are disabling it and removing free
	 * space inodes.
	 */
	trans = btrfs_start_transaction(fs_info->tree_root, 0);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	if (!active) {
		set_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);
		ret = cleanup_free_space_cache_v1(fs_info, trans);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			btrfs_end_transaction(trans);
			goto out;
		}
	}

	ret = btrfs_commit_transaction(trans);
out:
	clear_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);

	return ret;
}

int __init btrfs_free_space_init(void)
{
	btrfs_free_space_cachep = KMEM_CACHE(btrfs_free_space, 0);
	if (!btrfs_free_space_cachep)
		return -ENOMEM;

	btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap",
							PAGE_SIZE, PAGE_SIZE,
							0, NULL);
	if (!btrfs_free_space_bitmap_cachep) {
		kmem_cache_destroy(btrfs_free_space_cachep);
		return -ENOMEM;
	}

	return 0;
}

void __cold btrfs_free_space_exit(void)
{
	kmem_cache_destroy(btrfs_free_space_cachep);
	kmem_cache_destroy(btrfs_free_space_bitmap_cachep);
}
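
/*
 * Note: btrfs_free_space_bitmap_cachep is created with both an object size
 * and an alignment of PAGE_SIZE, so each free space bitmap occupies exactly
 * one page; with 4KiB pages that is 32768 bits, matching BITS_PER_BITMAP.
 */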
4188 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
4190 * Use this if you need to make a bitmap or extent entry specifically, it
4191 * doesn't do any of the merging that add_free_space does, this acts a lot like
4192 * how the free space cache loading stuff works, so you can get really weird
4195 int test_add_free_space_entry(struct btrfs_block_group
*cache
,
4196 u64 offset
, u64 bytes
, bool bitmap
)
4198 struct btrfs_free_space_ctl
*ctl
= cache
->free_space_ctl
;
4199 struct btrfs_free_space
*info
= NULL
, *bitmap_info
;
4201 enum btrfs_trim_state trim_state
= BTRFS_TRIM_STATE_TRIMMED
;
4207 info
= kmem_cache_zalloc(btrfs_free_space_cachep
, GFP_NOFS
);
4213 spin_lock(&ctl
->tree_lock
);
4214 info
->offset
= offset
;
4215 info
->bytes
= bytes
;
4216 info
->max_extent_size
= 0;
4217 ret
= link_free_space(ctl
, info
);
4218 spin_unlock(&ctl
->tree_lock
);
4220 kmem_cache_free(btrfs_free_space_cachep
, info
);
4225 map
= kmem_cache_zalloc(btrfs_free_space_bitmap_cachep
, GFP_NOFS
);
4227 kmem_cache_free(btrfs_free_space_cachep
, info
);
4232 spin_lock(&ctl
->tree_lock
);
4233 bitmap_info
= tree_search_offset(ctl
, offset_to_bitmap(ctl
, offset
),
4238 add_new_bitmap(ctl
, info
, offset
);
4243 bytes_added
= add_bytes_to_bitmap(ctl
, bitmap_info
, offset
, bytes
,
4246 bytes
-= bytes_added
;
4247 offset
+= bytes_added
;
4248 spin_unlock(&ctl
->tree_lock
);
4254 kmem_cache_free(btrfs_free_space_cachep
, info
);
4256 kmem_cache_free(btrfs_free_space_bitmap_cachep
, map
);
4261 * Checks to see if the given range is in the free space cache. This is really
4262 * just used to check the absence of space, so if there is free space in the
4263 * range at all we will return 1.
4265 int test_check_exists(struct btrfs_block_group
*cache
,
4266 u64 offset
, u64 bytes
)
4268 struct btrfs_free_space_ctl
*ctl
= cache
->free_space_ctl
;
4269 struct btrfs_free_space
*info
;
4272 spin_lock(&ctl
->tree_lock
);
4273 info
= tree_search_offset(ctl
, offset
, 0, 0);
4275 info
= tree_search_offset(ctl
, offset_to_bitmap(ctl
, offset
),
4283 u64 bit_off
, bit_bytes
;
4285 struct btrfs_free_space
*tmp
;
4288 bit_bytes
= ctl
->unit
;
4289 ret
= search_bitmap(ctl
, info
, &bit_off
, &bit_bytes
, false);
4291 if (bit_off
== offset
) {
4294 } else if (bit_off
> offset
&&
4295 offset
+ bytes
> bit_off
) {
4301 n
= rb_prev(&info
->offset_index
);
4303 tmp
= rb_entry(n
, struct btrfs_free_space
,
4305 if (tmp
->offset
+ tmp
->bytes
< offset
)
4307 if (offset
+ bytes
< tmp
->offset
) {
4308 n
= rb_prev(&tmp
->offset_index
);
4315 n
= rb_next(&info
->offset_index
);
4317 tmp
= rb_entry(n
, struct btrfs_free_space
,
4319 if (offset
+ bytes
< tmp
->offset
)
4321 if (tmp
->offset
+ tmp
->bytes
< offset
) {
4322 n
= rb_next(&tmp
->offset_index
);
4333 if (info
->offset
== offset
) {
4338 if (offset
> info
->offset
&& offset
< info
->offset
+ info
->bytes
)
4341 spin_unlock(&ctl
->tree_lock
);
4344 #endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */