// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Red Hat.  All rights reserved.
 */

#include <linux/pagemap.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/math64.h>
#include <linux/ratelimit.h>
#include <linux/error-injection.h>
#include <linux/sched/mm.h>
#include <linux/string_choices.h>
#include "extent-tree.h"
#include "free-space-cache.h"
#include "transaction.h"
#include "extent_io.h"
#include "space-info.h"
#include "block-group.h"
#include "inode-item.h"
#include "accessors.h"
#include "file-item.h"

#define BITS_PER_BITMAP		(PAGE_SIZE * 8UL)
#define MAX_CACHE_BYTES_PER_GIG	SZ_64K
#define FORCE_EXTENT_THRESHOLD	SZ_1M
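/*
 * Illustrative sizing (assuming 4KiB pages and a 4KiB sectorsize unit):
 * BITS_PER_BITMAP is 4096 * 8 = 32768 bits, so one bitmap entry can describe
 * 32768 * 4KiB = 128MiB of a block group, while the in-memory footprint
 * target is 64KiB of entries per 1GiB of space.
 */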
static struct kmem_cache *btrfs_free_space_cachep;
static struct kmem_cache *btrfs_free_space_bitmap_cachep;

struct btrfs_trim_range {
	u64 start;
	u64 bytes;
	struct list_head list;
};
static int link_free_space(struct btrfs_free_space_ctl *ctl,
			   struct btrfs_free_space *info);
static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_space *info, bool update_stat);
static int search_bitmap(struct btrfs_free_space_ctl *ctl,
			 struct btrfs_free_space *bitmap_info, u64 *offset,
			 u64 *bytes, bool for_alloc);
static void free_bitmap(struct btrfs_free_space_ctl *ctl,
			struct btrfs_free_space *bitmap_info);
static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_space *info, u64 offset,
			      u64 bytes, bool update_stats);
static void btrfs_crc32c_final(u32 crc, u8 *result)
{
	put_unaligned_le32(~crc, result);
}
static void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
{
	struct btrfs_free_space *info;
	struct rb_node *node;

	while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
		info = rb_entry(node, struct btrfs_free_space, offset_index);
		if (!info->bitmap) {
			unlink_free_space(ctl, info, true);
			kmem_cache_free(btrfs_free_space_cachep, info);
		} else {
			free_bitmap(ctl, info);
		}

		cond_resched_lock(&ctl->tree_lock);
	}
}
82 static struct inode
*__lookup_free_space_inode(struct btrfs_root
*root
,
83 struct btrfs_path
*path
,
87 struct btrfs_key location
;
88 struct btrfs_disk_key disk_key
;
89 struct btrfs_free_space_header
*header
;
90 struct extent_buffer
*leaf
;
91 struct inode
*inode
= NULL
;
95 key
.objectid
= BTRFS_FREE_SPACE_OBJECTID
;
99 ret
= btrfs_search_slot(NULL
, root
, &key
, path
, 0, 0);
103 btrfs_release_path(path
);
104 return ERR_PTR(-ENOENT
);
107 leaf
= path
->nodes
[0];
108 header
= btrfs_item_ptr(leaf
, path
->slots
[0],
109 struct btrfs_free_space_header
);
110 btrfs_free_space_key(leaf
, header
, &disk_key
);
111 btrfs_disk_key_to_cpu(&location
, &disk_key
);
112 btrfs_release_path(path
);
115 * We are often under a trans handle at this point, so we need to make
116 * sure NOFS is set to keep us from deadlocking.
118 nofs_flag
= memalloc_nofs_save();
119 inode
= btrfs_iget_path(location
.objectid
, root
, path
);
120 btrfs_release_path(path
);
121 memalloc_nofs_restore(nofs_flag
);
125 mapping_set_gfp_mask(inode
->i_mapping
,
126 mapping_gfp_constraint(inode
->i_mapping
,
127 ~(__GFP_FS
| __GFP_HIGHMEM
)));
132 struct inode
*lookup_free_space_inode(struct btrfs_block_group
*block_group
,
133 struct btrfs_path
*path
)
135 struct btrfs_fs_info
*fs_info
= block_group
->fs_info
;
136 struct inode
*inode
= NULL
;
137 u32 flags
= BTRFS_INODE_NODATASUM
| BTRFS_INODE_NODATACOW
;
139 spin_lock(&block_group
->lock
);
140 if (block_group
->inode
)
141 inode
= igrab(&block_group
->inode
->vfs_inode
);
142 spin_unlock(&block_group
->lock
);
146 inode
= __lookup_free_space_inode(fs_info
->tree_root
, path
,
151 spin_lock(&block_group
->lock
);
152 if (!((BTRFS_I(inode
)->flags
& flags
) == flags
)) {
153 btrfs_info(fs_info
, "Old style space inode found, converting.");
154 BTRFS_I(inode
)->flags
|= BTRFS_INODE_NODATASUM
|
155 BTRFS_INODE_NODATACOW
;
156 block_group
->disk_cache_state
= BTRFS_DC_CLEAR
;
159 if (!test_and_set_bit(BLOCK_GROUP_FLAG_IREF
, &block_group
->runtime_flags
))
160 block_group
->inode
= BTRFS_I(igrab(inode
));
161 spin_unlock(&block_group
->lock
);
166 static int __create_free_space_inode(struct btrfs_root
*root
,
167 struct btrfs_trans_handle
*trans
,
168 struct btrfs_path
*path
,
171 struct btrfs_key key
;
172 struct btrfs_disk_key disk_key
;
173 struct btrfs_free_space_header
*header
;
174 struct btrfs_inode_item
*inode_item
;
175 struct extent_buffer
*leaf
;
176 /* We inline CRCs for the free disk space cache */
177 const u64 flags
= BTRFS_INODE_NOCOMPRESS
| BTRFS_INODE_PREALLOC
|
178 BTRFS_INODE_NODATASUM
| BTRFS_INODE_NODATACOW
;
181 ret
= btrfs_insert_empty_inode(trans
, root
, path
, ino
);
185 leaf
= path
->nodes
[0];
186 inode_item
= btrfs_item_ptr(leaf
, path
->slots
[0],
187 struct btrfs_inode_item
);
188 btrfs_item_key(leaf
, &disk_key
, path
->slots
[0]);
189 memzero_extent_buffer(leaf
, (unsigned long)inode_item
,
190 sizeof(*inode_item
));
191 btrfs_set_inode_generation(leaf
, inode_item
, trans
->transid
);
192 btrfs_set_inode_size(leaf
, inode_item
, 0);
193 btrfs_set_inode_nbytes(leaf
, inode_item
, 0);
194 btrfs_set_inode_uid(leaf
, inode_item
, 0);
195 btrfs_set_inode_gid(leaf
, inode_item
, 0);
196 btrfs_set_inode_mode(leaf
, inode_item
, S_IFREG
| 0600);
197 btrfs_set_inode_flags(leaf
, inode_item
, flags
);
198 btrfs_set_inode_nlink(leaf
, inode_item
, 1);
199 btrfs_set_inode_transid(leaf
, inode_item
, trans
->transid
);
200 btrfs_set_inode_block_group(leaf
, inode_item
, offset
);
201 btrfs_release_path(path
);
203 key
.objectid
= BTRFS_FREE_SPACE_OBJECTID
;
206 ret
= btrfs_insert_empty_item(trans
, root
, path
, &key
,
207 sizeof(struct btrfs_free_space_header
));
209 btrfs_release_path(path
);
213 leaf
= path
->nodes
[0];
214 header
= btrfs_item_ptr(leaf
, path
->slots
[0],
215 struct btrfs_free_space_header
);
216 memzero_extent_buffer(leaf
, (unsigned long)header
, sizeof(*header
));
217 btrfs_set_free_space_key(leaf
, header
, &disk_key
);
218 btrfs_release_path(path
);
223 int create_free_space_inode(struct btrfs_trans_handle
*trans
,
224 struct btrfs_block_group
*block_group
,
225 struct btrfs_path
*path
)
230 ret
= btrfs_get_free_objectid(trans
->fs_info
->tree_root
, &ino
);
234 return __create_free_space_inode(trans
->fs_info
->tree_root
, trans
, path
,
235 ino
, block_group
->start
);
239 * inode is an optional sink: if it is NULL, btrfs_remove_free_space_inode
240 * handles lookup, otherwise it takes ownership and iputs the inode.
241 * Don't reuse an inode pointer after passing it into this function.
243 int btrfs_remove_free_space_inode(struct btrfs_trans_handle
*trans
,
245 struct btrfs_block_group
*block_group
)
247 struct btrfs_path
*path
;
248 struct btrfs_key key
;
251 path
= btrfs_alloc_path();
256 inode
= lookup_free_space_inode(block_group
, path
);
258 if (PTR_ERR(inode
) != -ENOENT
)
259 ret
= PTR_ERR(inode
);
262 ret
= btrfs_orphan_add(trans
, BTRFS_I(inode
));
264 btrfs_add_delayed_iput(BTRFS_I(inode
));
268 /* One for the block groups ref */
269 spin_lock(&block_group
->lock
);
270 if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF
, &block_group
->runtime_flags
)) {
271 block_group
->inode
= NULL
;
272 spin_unlock(&block_group
->lock
);
275 spin_unlock(&block_group
->lock
);
277 /* One for the lookup ref */
278 btrfs_add_delayed_iput(BTRFS_I(inode
));
280 key
.objectid
= BTRFS_FREE_SPACE_OBJECTID
;
282 key
.offset
= block_group
->start
;
283 ret
= btrfs_search_slot(trans
, trans
->fs_info
->tree_root
, &key
, path
,
290 ret
= btrfs_del_item(trans
, trans
->fs_info
->tree_root
, path
);
292 btrfs_free_path(path
);
296 int btrfs_truncate_free_space_cache(struct btrfs_trans_handle
*trans
,
297 struct btrfs_block_group
*block_group
,
298 struct inode
*vfs_inode
)
300 struct btrfs_truncate_control control
= {
301 .inode
= BTRFS_I(vfs_inode
),
303 .ino
= btrfs_ino(BTRFS_I(vfs_inode
)),
304 .min_type
= BTRFS_EXTENT_DATA_KEY
,
305 .clear_extent_range
= true,
307 struct btrfs_inode
*inode
= BTRFS_I(vfs_inode
);
308 struct btrfs_root
*root
= inode
->root
;
309 struct extent_state
*cached_state
= NULL
;
314 struct btrfs_path
*path
= btrfs_alloc_path();
321 mutex_lock(&trans
->transaction
->cache_write_mutex
);
322 if (!list_empty(&block_group
->io_list
)) {
323 list_del_init(&block_group
->io_list
);
325 btrfs_wait_cache_io(trans
, block_group
, path
);
326 btrfs_put_block_group(block_group
);
330 * now that we've truncated the cache away, its no longer
333 spin_lock(&block_group
->lock
);
334 block_group
->disk_cache_state
= BTRFS_DC_CLEAR
;
335 spin_unlock(&block_group
->lock
);
336 btrfs_free_path(path
);
339 btrfs_i_size_write(inode
, 0);
340 truncate_pagecache(vfs_inode
, 0);
342 lock_extent(&inode
->io_tree
, 0, (u64
)-1, &cached_state
);
343 btrfs_drop_extent_map_range(inode
, 0, (u64
)-1, false);
346 * We skip the throttling logic for free space cache inodes, so we don't
347 * need to check for -EAGAIN.
349 ret
= btrfs_truncate_inode_items(trans
, root
, &control
);
351 inode_sub_bytes(&inode
->vfs_inode
, control
.sub_bytes
);
352 btrfs_inode_safe_disk_i_size_write(inode
, control
.last_size
);
354 unlock_extent(&inode
->io_tree
, 0, (u64
)-1, &cached_state
);
358 ret
= btrfs_update_inode(trans
, inode
);
362 mutex_unlock(&trans
->transaction
->cache_write_mutex
);
364 btrfs_abort_transaction(trans
, ret
);
static void readahead_cache(struct inode *inode)
{
	struct file_ra_state ra;
	unsigned long last_index;

	file_ra_state_init(&ra, inode->i_mapping);
	last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;

	page_cache_sync_readahead(inode->i_mapping, &ra, NULL, 0, last_index);
}
380 static int io_ctl_init(struct btrfs_io_ctl
*io_ctl
, struct inode
*inode
,
385 num_pages
= DIV_ROUND_UP(i_size_read(inode
), PAGE_SIZE
);
387 /* Make sure we can fit our crcs and generation into the first page */
388 if (write
&& (num_pages
* sizeof(u32
) + sizeof(u64
)) > PAGE_SIZE
)
391 memset(io_ctl
, 0, sizeof(struct btrfs_io_ctl
));
393 io_ctl
->pages
= kcalloc(num_pages
, sizeof(struct page
*), GFP_NOFS
);
397 io_ctl
->num_pages
= num_pages
;
398 io_ctl
->fs_info
= inode_to_fs_info(inode
);
399 io_ctl
->inode
= inode
;
403 ALLOW_ERROR_INJECTION(io_ctl_init
, ERRNO
);
405 static void io_ctl_free(struct btrfs_io_ctl
*io_ctl
)
407 kfree(io_ctl
->pages
);
408 io_ctl
->pages
= NULL
;
411 static void io_ctl_unmap_page(struct btrfs_io_ctl
*io_ctl
)
419 static void io_ctl_map_page(struct btrfs_io_ctl
*io_ctl
, int clear
)
421 ASSERT(io_ctl
->index
< io_ctl
->num_pages
);
422 io_ctl
->page
= io_ctl
->pages
[io_ctl
->index
++];
423 io_ctl
->cur
= page_address(io_ctl
->page
);
424 io_ctl
->orig
= io_ctl
->cur
;
425 io_ctl
->size
= PAGE_SIZE
;
427 clear_page(io_ctl
->cur
);
430 static void io_ctl_drop_pages(struct btrfs_io_ctl
*io_ctl
)
434 io_ctl_unmap_page(io_ctl
);
436 for (i
= 0; i
< io_ctl
->num_pages
; i
++) {
437 if (io_ctl
->pages
[i
]) {
438 btrfs_folio_clear_checked(io_ctl
->fs_info
,
439 page_folio(io_ctl
->pages
[i
]),
440 page_offset(io_ctl
->pages
[i
]),
442 unlock_page(io_ctl
->pages
[i
]);
443 put_page(io_ctl
->pages
[i
]);
448 static int io_ctl_prepare_pages(struct btrfs_io_ctl
*io_ctl
, bool uptodate
)
451 struct inode
*inode
= io_ctl
->inode
;
452 gfp_t mask
= btrfs_alloc_write_mask(inode
->i_mapping
);
455 for (i
= 0; i
< io_ctl
->num_pages
; i
++) {
458 page
= find_or_create_page(inode
->i_mapping
, i
, mask
);
460 io_ctl_drop_pages(io_ctl
);
464 ret
= set_folio_extent_mapped(page_folio(page
));
468 io_ctl_drop_pages(io_ctl
);
472 io_ctl
->pages
[i
] = page
;
473 if (uptodate
&& !PageUptodate(page
)) {
474 btrfs_read_folio(NULL
, page_folio(page
));
476 if (page
->mapping
!= inode
->i_mapping
) {
477 btrfs_err(BTRFS_I(inode
)->root
->fs_info
,
478 "free space cache page truncated");
479 io_ctl_drop_pages(io_ctl
);
482 if (!PageUptodate(page
)) {
483 btrfs_err(BTRFS_I(inode
)->root
->fs_info
,
484 "error reading free space cache");
485 io_ctl_drop_pages(io_ctl
);
491 for (i
= 0; i
< io_ctl
->num_pages
; i
++)
492 clear_page_dirty_for_io(io_ctl
->pages
[i
]);
static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
{
	io_ctl_map_page(io_ctl, 1);

	/*
	 * Skip the csum areas.  If we don't check crcs then we just have a
	 * 64bit chunk at the front of the first page.
	 */
	io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
	io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);

	put_unaligned_le64(generation, io_ctl->cur);
	io_ctl->cur += sizeof(u64);
}
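/*
 * On-disk layout of the first cache page, as set up above: an array of one
 * u32 crc per page, followed by the u64 generation.  For example, a cache
 * file with 8 pages reserves 8 * 4 = 32 bytes of crcs and then the 8-byte
 * generation before any free space entries.
 */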
512 static int io_ctl_check_generation(struct btrfs_io_ctl
*io_ctl
, u64 generation
)
517 * Skip the crc area. If we don't check crcs then we just have a 64bit
518 * chunk at the front of the first page.
520 io_ctl
->cur
+= sizeof(u32
) * io_ctl
->num_pages
;
521 io_ctl
->size
-= sizeof(u64
) + (sizeof(u32
) * io_ctl
->num_pages
);
523 cache_gen
= get_unaligned_le64(io_ctl
->cur
);
524 if (cache_gen
!= generation
) {
525 btrfs_err_rl(io_ctl
->fs_info
,
526 "space cache generation (%llu) does not match inode (%llu)",
527 cache_gen
, generation
);
528 io_ctl_unmap_page(io_ctl
);
531 io_ctl
->cur
+= sizeof(u64
);
535 static void io_ctl_set_crc(struct btrfs_io_ctl
*io_ctl
, int index
)
542 offset
= sizeof(u32
) * io_ctl
->num_pages
;
544 crc
= crc32c(crc
, io_ctl
->orig
+ offset
, PAGE_SIZE
- offset
);
545 btrfs_crc32c_final(crc
, (u8
*)&crc
);
546 io_ctl_unmap_page(io_ctl
);
547 tmp
= page_address(io_ctl
->pages
[0]);
552 static int io_ctl_check_crc(struct btrfs_io_ctl
*io_ctl
, int index
)
559 offset
= sizeof(u32
) * io_ctl
->num_pages
;
561 tmp
= page_address(io_ctl
->pages
[0]);
565 io_ctl_map_page(io_ctl
, 0);
566 crc
= crc32c(crc
, io_ctl
->orig
+ offset
, PAGE_SIZE
- offset
);
567 btrfs_crc32c_final(crc
, (u8
*)&crc
);
569 btrfs_err_rl(io_ctl
->fs_info
,
570 "csum mismatch on free space cache");
571 io_ctl_unmap_page(io_ctl
);
578 static int io_ctl_add_entry(struct btrfs_io_ctl
*io_ctl
, u64 offset
, u64 bytes
,
581 struct btrfs_free_space_entry
*entry
;
587 put_unaligned_le64(offset
, &entry
->offset
);
588 put_unaligned_le64(bytes
, &entry
->bytes
);
589 entry
->type
= (bitmap
) ? BTRFS_FREE_SPACE_BITMAP
:
590 BTRFS_FREE_SPACE_EXTENT
;
591 io_ctl
->cur
+= sizeof(struct btrfs_free_space_entry
);
592 io_ctl
->size
-= sizeof(struct btrfs_free_space_entry
);
594 if (io_ctl
->size
>= sizeof(struct btrfs_free_space_entry
))
597 io_ctl_set_crc(io_ctl
, io_ctl
->index
- 1);
599 /* No more pages to map */
600 if (io_ctl
->index
>= io_ctl
->num_pages
)
603 /* map the next page */
604 io_ctl_map_page(io_ctl
, 1);
608 static int io_ctl_add_bitmap(struct btrfs_io_ctl
*io_ctl
, void *bitmap
)
614 * If we aren't at the start of the current page, unmap this one and
615 * map the next one if there is any left.
617 if (io_ctl
->cur
!= io_ctl
->orig
) {
618 io_ctl_set_crc(io_ctl
, io_ctl
->index
- 1);
619 if (io_ctl
->index
>= io_ctl
->num_pages
)
621 io_ctl_map_page(io_ctl
, 0);
624 copy_page(io_ctl
->cur
, bitmap
);
625 io_ctl_set_crc(io_ctl
, io_ctl
->index
- 1);
626 if (io_ctl
->index
< io_ctl
->num_pages
)
627 io_ctl_map_page(io_ctl
, 0);
631 static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl
*io_ctl
)
634 * If we're not on the boundary we know we've modified the page and we
635 * need to crc the page.
637 if (io_ctl
->cur
!= io_ctl
->orig
)
638 io_ctl_set_crc(io_ctl
, io_ctl
->index
- 1);
640 io_ctl_unmap_page(io_ctl
);
642 while (io_ctl
->index
< io_ctl
->num_pages
) {
643 io_ctl_map_page(io_ctl
, 1);
644 io_ctl_set_crc(io_ctl
, io_ctl
->index
- 1);
648 static int io_ctl_read_entry(struct btrfs_io_ctl
*io_ctl
,
649 struct btrfs_free_space
*entry
, u8
*type
)
651 struct btrfs_free_space_entry
*e
;
655 ret
= io_ctl_check_crc(io_ctl
, io_ctl
->index
);
661 entry
->offset
= get_unaligned_le64(&e
->offset
);
662 entry
->bytes
= get_unaligned_le64(&e
->bytes
);
664 io_ctl
->cur
+= sizeof(struct btrfs_free_space_entry
);
665 io_ctl
->size
-= sizeof(struct btrfs_free_space_entry
);
667 if (io_ctl
->size
>= sizeof(struct btrfs_free_space_entry
))
670 io_ctl_unmap_page(io_ctl
);
675 static int io_ctl_read_bitmap(struct btrfs_io_ctl
*io_ctl
,
676 struct btrfs_free_space
*entry
)
680 ret
= io_ctl_check_crc(io_ctl
, io_ctl
->index
);
684 copy_page(entry
->bitmap
, io_ctl
->cur
);
685 io_ctl_unmap_page(io_ctl
);
690 static void recalculate_thresholds(struct btrfs_free_space_ctl
*ctl
)
692 struct btrfs_block_group
*block_group
= ctl
->block_group
;
696 u64 size
= block_group
->length
;
697 u64 bytes_per_bg
= BITS_PER_BITMAP
* ctl
->unit
;
698 u64 max_bitmaps
= div64_u64(size
+ bytes_per_bg
- 1, bytes_per_bg
);
700 max_bitmaps
= max_t(u64
, max_bitmaps
, 1);
702 if (ctl
->total_bitmaps
> max_bitmaps
)
703 btrfs_err(block_group
->fs_info
,
704 "invalid free space control: bg start=%llu len=%llu total_bitmaps=%u unit=%u max_bitmaps=%llu bytes_per_bg=%llu",
705 block_group
->start
, block_group
->length
,
706 ctl
->total_bitmaps
, ctl
->unit
, max_bitmaps
,
708 ASSERT(ctl
->total_bitmaps
<= max_bitmaps
);
711 * We are trying to keep the total amount of memory used per 1GiB of
712 * space to be MAX_CACHE_BYTES_PER_GIG. However, with a reclamation
713 * mechanism of pulling extents >= FORCE_EXTENT_THRESHOLD out of
714 * bitmaps, we may end up using more memory than this.
717 max_bytes
= MAX_CACHE_BYTES_PER_GIG
;
719 max_bytes
= MAX_CACHE_BYTES_PER_GIG
* div_u64(size
, SZ_1G
);
721 bitmap_bytes
= ctl
->total_bitmaps
* ctl
->unit
;
 * We want the extent entry threshold to be whatever space is left after
 * the bitmaps, capped at half of the maximum bytes we are allowed to use.
727 extent_bytes
= max_bytes
- bitmap_bytes
;
728 extent_bytes
= min_t(u64
, extent_bytes
, max_bytes
>> 1);
730 ctl
->extents_thresh
=
731 div_u64(extent_bytes
, sizeof(struct btrfs_free_space
));
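/*
 * Worked example: for a 1GiB block group with no bitmaps yet, max_bytes is
 * 64KiB (MAX_CACHE_BYTES_PER_GIG), bitmap_bytes is 0 and extent_bytes is
 * capped at max_bytes >> 1 = 32KiB, so extents_thresh ends up being 32KiB
 * divided by sizeof(struct btrfs_free_space).
 */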
734 static int __load_free_space_cache(struct btrfs_root
*root
, struct inode
*inode
,
735 struct btrfs_free_space_ctl
*ctl
,
736 struct btrfs_path
*path
, u64 offset
)
738 struct btrfs_fs_info
*fs_info
= root
->fs_info
;
739 struct btrfs_free_space_header
*header
;
740 struct extent_buffer
*leaf
;
741 struct btrfs_io_ctl io_ctl
;
742 struct btrfs_key key
;
743 struct btrfs_free_space
*e
, *n
;
751 /* Nothing in the space cache, goodbye */
752 if (!i_size_read(inode
))
755 key
.objectid
= BTRFS_FREE_SPACE_OBJECTID
;
759 ret
= btrfs_search_slot(NULL
, root
, &key
, path
, 0, 0);
763 btrfs_release_path(path
);
769 leaf
= path
->nodes
[0];
770 header
= btrfs_item_ptr(leaf
, path
->slots
[0],
771 struct btrfs_free_space_header
);
772 num_entries
= btrfs_free_space_entries(leaf
, header
);
773 num_bitmaps
= btrfs_free_space_bitmaps(leaf
, header
);
774 generation
= btrfs_free_space_generation(leaf
, header
);
775 btrfs_release_path(path
);
777 if (!BTRFS_I(inode
)->generation
) {
779 "the free space cache file (%llu) is invalid, skip it",
784 if (BTRFS_I(inode
)->generation
!= generation
) {
786 "free space inode generation (%llu) did not match free space cache generation (%llu)",
787 BTRFS_I(inode
)->generation
, generation
);
794 ret
= io_ctl_init(&io_ctl
, inode
, 0);
798 readahead_cache(inode
);
800 ret
= io_ctl_prepare_pages(&io_ctl
, true);
804 ret
= io_ctl_check_crc(&io_ctl
, 0);
808 ret
= io_ctl_check_generation(&io_ctl
, generation
);
812 while (num_entries
) {
813 e
= kmem_cache_zalloc(btrfs_free_space_cachep
,
820 ret
= io_ctl_read_entry(&io_ctl
, e
, &type
);
822 kmem_cache_free(btrfs_free_space_cachep
, e
);
828 kmem_cache_free(btrfs_free_space_cachep
, e
);
832 if (type
== BTRFS_FREE_SPACE_EXTENT
) {
833 spin_lock(&ctl
->tree_lock
);
834 ret
= link_free_space(ctl
, e
);
835 spin_unlock(&ctl
->tree_lock
);
838 "Duplicate entries in free space cache, dumping");
839 kmem_cache_free(btrfs_free_space_cachep
, e
);
845 e
->bitmap
= kmem_cache_zalloc(
846 btrfs_free_space_bitmap_cachep
, GFP_NOFS
);
850 btrfs_free_space_cachep
, e
);
853 spin_lock(&ctl
->tree_lock
);
854 ret
= link_free_space(ctl
, e
);
856 spin_unlock(&ctl
->tree_lock
);
858 "Duplicate entries in free space cache, dumping");
859 kmem_cache_free(btrfs_free_space_bitmap_cachep
, e
->bitmap
);
860 kmem_cache_free(btrfs_free_space_cachep
, e
);
863 ctl
->total_bitmaps
++;
864 recalculate_thresholds(ctl
);
865 spin_unlock(&ctl
->tree_lock
);
866 list_add_tail(&e
->list
, &bitmaps
);
872 io_ctl_unmap_page(&io_ctl
);
 * The bitmaps are stored after all of the extent entries, in the same
 * order in which their bitmap entries were added to the cache.
878 list_for_each_entry_safe(e
, n
, &bitmaps
, list
) {
879 list_del_init(&e
->list
);
880 ret
= io_ctl_read_bitmap(&io_ctl
, e
);
885 io_ctl_drop_pages(&io_ctl
);
888 io_ctl_free(&io_ctl
);
891 io_ctl_drop_pages(&io_ctl
);
893 spin_lock(&ctl
->tree_lock
);
894 __btrfs_remove_free_space_cache(ctl
);
895 spin_unlock(&ctl
->tree_lock
);
899 static int copy_free_space_cache(struct btrfs_block_group
*block_group
,
900 struct btrfs_free_space_ctl
*ctl
)
902 struct btrfs_free_space
*info
;
906 while (!ret
&& (n
= rb_first(&ctl
->free_space_offset
)) != NULL
) {
907 info
= rb_entry(n
, struct btrfs_free_space
, offset_index
);
909 const u64 offset
= info
->offset
;
910 const u64 bytes
= info
->bytes
;
912 unlink_free_space(ctl
, info
, true);
913 spin_unlock(&ctl
->tree_lock
);
914 kmem_cache_free(btrfs_free_space_cachep
, info
);
915 ret
= btrfs_add_free_space(block_group
, offset
, bytes
);
916 spin_lock(&ctl
->tree_lock
);
918 u64 offset
= info
->offset
;
919 u64 bytes
= ctl
->unit
;
921 ret
= search_bitmap(ctl
, info
, &offset
, &bytes
, false);
923 bitmap_clear_bits(ctl
, info
, offset
, bytes
, true);
924 spin_unlock(&ctl
->tree_lock
);
925 ret
= btrfs_add_free_space(block_group
, offset
,
927 spin_lock(&ctl
->tree_lock
);
929 free_bitmap(ctl
, info
);
933 cond_resched_lock(&ctl
->tree_lock
);
938 static struct lock_class_key btrfs_free_space_inode_key
;
940 int load_free_space_cache(struct btrfs_block_group
*block_group
)
942 struct btrfs_fs_info
*fs_info
= block_group
->fs_info
;
943 struct btrfs_free_space_ctl
*ctl
= block_group
->free_space_ctl
;
944 struct btrfs_free_space_ctl tmp_ctl
= {};
946 struct btrfs_path
*path
;
949 u64 used
= block_group
->used
;
952 * Because we could potentially discard our loaded free space, we want
953 * to load everything into a temporary structure first, and then if it's
954 * valid copy it all into the actual free space ctl.
956 btrfs_init_free_space_ctl(block_group
, &tmp_ctl
);
959 * If this block group has been marked to be cleared for one reason or
960 * another then we can't trust the on disk cache, so just return.
962 spin_lock(&block_group
->lock
);
963 if (block_group
->disk_cache_state
!= BTRFS_DC_WRITTEN
) {
964 spin_unlock(&block_group
->lock
);
967 spin_unlock(&block_group
->lock
);
969 path
= btrfs_alloc_path();
972 path
->search_commit_root
= 1;
973 path
->skip_locking
= 1;
976 * We must pass a path with search_commit_root set to btrfs_iget in
977 * order to avoid a deadlock when allocating extents for the tree root.
979 * When we are COWing an extent buffer from the tree root, when looking
980 * for a free extent, at extent-tree.c:find_free_extent(), we can find
981 * block group without its free space cache loaded. When we find one
982 * we must load its space cache which requires reading its free space
983 * cache's inode item from the root tree. If this inode item is located
984 * in the same leaf that we started COWing before, then we end up in
985 * deadlock on the extent buffer (trying to read lock it when we
986 * previously write locked it).
988 * It's safe to read the inode item using the commit root because
989 * block groups, once loaded, stay in memory forever (until they are
990 * removed) as well as their space caches once loaded. New block groups
991 * once created get their ->cached field set to BTRFS_CACHE_FINISHED so
992 * we will never try to read their inode item while the fs is mounted.
994 inode
= lookup_free_space_inode(block_group
, path
);
996 btrfs_free_path(path
);
1000 /* We may have converted the inode and made the cache invalid. */
1001 spin_lock(&block_group
->lock
);
1002 if (block_group
->disk_cache_state
!= BTRFS_DC_WRITTEN
) {
1003 spin_unlock(&block_group
->lock
);
1004 btrfs_free_path(path
);
1007 spin_unlock(&block_group
->lock
);
1010 * Reinitialize the class of struct inode's mapping->invalidate_lock for
1011 * free space inodes to prevent false positives related to locks for normal
1014 lockdep_set_class(&(&inode
->i_data
)->invalidate_lock
,
1015 &btrfs_free_space_inode_key
);
1017 ret
= __load_free_space_cache(fs_info
->tree_root
, inode
, &tmp_ctl
,
1018 path
, block_group
->start
);
1019 btrfs_free_path(path
);
1023 matched
= (tmp_ctl
.free_space
== (block_group
->length
- used
-
1024 block_group
->bytes_super
));
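/*
 * The cache only "matches" if the free space it recorded equals the block
 * group length minus the allocated bytes and the superblock-reserved bytes;
 * any mismatch means the cache is stale, so it is thrown away and rebuilt.
 */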
1027 spin_lock(&tmp_ctl
.tree_lock
);
1028 ret
= copy_free_space_cache(block_group
, &tmp_ctl
);
1029 spin_unlock(&tmp_ctl
.tree_lock
);
1031 * ret == 1 means we successfully loaded the free space cache,
1032 * so we need to re-set it here.
1038 * We need to call the _locked variant so we don't try to update
1039 * the discard counters.
1041 spin_lock(&tmp_ctl
.tree_lock
);
1042 __btrfs_remove_free_space_cache(&tmp_ctl
);
1043 spin_unlock(&tmp_ctl
.tree_lock
);
1045 "block group %llu has wrong amount of free space",
1046 block_group
->start
);
1051 /* This cache is bogus, make sure it gets cleared */
1052 spin_lock(&block_group
->lock
);
1053 block_group
->disk_cache_state
= BTRFS_DC_CLEAR
;
1054 spin_unlock(&block_group
->lock
);
1058 "failed to load free space cache for block group %llu, rebuilding it now",
1059 block_group
->start
);
1062 spin_lock(&ctl
->tree_lock
);
1063 btrfs_discard_update_discardable(block_group
);
1064 spin_unlock(&ctl
->tree_lock
);
1069 static noinline_for_stack
1070 int write_cache_extent_entries(struct btrfs_io_ctl
*io_ctl
,
1071 struct btrfs_free_space_ctl
*ctl
,
1072 struct btrfs_block_group
*block_group
,
1073 int *entries
, int *bitmaps
,
1074 struct list_head
*bitmap_list
)
1077 struct btrfs_free_cluster
*cluster
= NULL
;
1078 struct btrfs_free_cluster
*cluster_locked
= NULL
;
1079 struct rb_node
*node
= rb_first(&ctl
->free_space_offset
);
1080 struct btrfs_trim_range
*trim_entry
;
1082 /* Get the cluster for this block_group if it exists */
1083 if (block_group
&& !list_empty(&block_group
->cluster_list
)) {
1084 cluster
= list_entry(block_group
->cluster_list
.next
,
1085 struct btrfs_free_cluster
,
1089 if (!node
&& cluster
) {
1090 cluster_locked
= cluster
;
1091 spin_lock(&cluster_locked
->lock
);
1092 node
= rb_first(&cluster
->root
);
1096 /* Write out the extent entries */
1098 struct btrfs_free_space
*e
;
1100 e
= rb_entry(node
, struct btrfs_free_space
, offset_index
);
1103 ret
= io_ctl_add_entry(io_ctl
, e
->offset
, e
->bytes
,
1109 list_add_tail(&e
->list
, bitmap_list
);
1112 node
= rb_next(node
);
1113 if (!node
&& cluster
) {
1114 node
= rb_first(&cluster
->root
);
1115 cluster_locked
= cluster
;
1116 spin_lock(&cluster_locked
->lock
);
1120 if (cluster_locked
) {
1121 spin_unlock(&cluster_locked
->lock
);
1122 cluster_locked
= NULL
;
1126 * Make sure we don't miss any range that was removed from our rbtree
1127 * because trimming is running. Otherwise after a umount+mount (or crash
1128 * after committing the transaction) we would leak free space and get
1129 * an inconsistent free space cache report from fsck.
1131 list_for_each_entry(trim_entry
, &ctl
->trimming_ranges
, list
) {
1132 ret
= io_ctl_add_entry(io_ctl
, trim_entry
->start
,
1133 trim_entry
->bytes
, NULL
);
1142 spin_unlock(&cluster_locked
->lock
);
1146 static noinline_for_stack
int
1147 update_cache_item(struct btrfs_trans_handle
*trans
,
1148 struct btrfs_root
*root
,
1149 struct inode
*inode
,
1150 struct btrfs_path
*path
, u64 offset
,
1151 int entries
, int bitmaps
)
1153 struct btrfs_key key
;
1154 struct btrfs_free_space_header
*header
;
1155 struct extent_buffer
*leaf
;
1158 key
.objectid
= BTRFS_FREE_SPACE_OBJECTID
;
1159 key
.offset
= offset
;
1162 ret
= btrfs_search_slot(trans
, root
, &key
, path
, 0, 1);
1164 clear_extent_bit(&BTRFS_I(inode
)->io_tree
, 0, inode
->i_size
- 1,
1165 EXTENT_DELALLOC
, NULL
);
1168 leaf
= path
->nodes
[0];
1170 struct btrfs_key found_key
;
1171 ASSERT(path
->slots
[0]);
1173 btrfs_item_key_to_cpu(leaf
, &found_key
, path
->slots
[0]);
1174 if (found_key
.objectid
!= BTRFS_FREE_SPACE_OBJECTID
||
1175 found_key
.offset
!= offset
) {
1176 clear_extent_bit(&BTRFS_I(inode
)->io_tree
, 0,
1177 inode
->i_size
- 1, EXTENT_DELALLOC
,
1179 btrfs_release_path(path
);
1184 BTRFS_I(inode
)->generation
= trans
->transid
;
1185 header
= btrfs_item_ptr(leaf
, path
->slots
[0],
1186 struct btrfs_free_space_header
);
1187 btrfs_set_free_space_entries(leaf
, header
, entries
);
1188 btrfs_set_free_space_bitmaps(leaf
, header
, bitmaps
);
1189 btrfs_set_free_space_generation(leaf
, header
, trans
->transid
);
1190 btrfs_release_path(path
);
1198 static noinline_for_stack
int write_pinned_extent_entries(
1199 struct btrfs_trans_handle
*trans
,
1200 struct btrfs_block_group
*block_group
,
1201 struct btrfs_io_ctl
*io_ctl
,
1204 u64 start
, extent_start
, extent_end
, len
;
1205 struct extent_io_tree
*unpin
= NULL
;
1212 * We want to add any pinned extents to our free space cache
1213 * so we don't leak the space
1215 * We shouldn't have switched the pinned extents yet so this is the
1218 unpin
= &trans
->transaction
->pinned_extents
;
1220 start
= block_group
->start
;
1222 while (start
< block_group
->start
+ block_group
->length
) {
1223 if (!find_first_extent_bit(unpin
, start
,
1224 &extent_start
, &extent_end
,
1225 EXTENT_DIRTY
, NULL
))
1228 /* This pinned extent is out of our range */
1229 if (extent_start
>= block_group
->start
+ block_group
->length
)
1232 extent_start
= max(extent_start
, start
);
1233 extent_end
= min(block_group
->start
+ block_group
->length
,
1235 len
= extent_end
- extent_start
;
1238 ret
= io_ctl_add_entry(io_ctl
, extent_start
, len
, NULL
);
1248 static noinline_for_stack
int
1249 write_bitmap_entries(struct btrfs_io_ctl
*io_ctl
, struct list_head
*bitmap_list
)
1251 struct btrfs_free_space
*entry
, *next
;
1254 /* Write out the bitmaps */
1255 list_for_each_entry_safe(entry
, next
, bitmap_list
, list
) {
1256 ret
= io_ctl_add_bitmap(io_ctl
, entry
->bitmap
);
1259 list_del_init(&entry
->list
);
1265 static int flush_dirty_cache(struct inode
*inode
)
1269 ret
= btrfs_wait_ordered_range(BTRFS_I(inode
), 0, (u64
)-1);
1271 clear_extent_bit(&BTRFS_I(inode
)->io_tree
, 0, inode
->i_size
- 1,
1272 EXTENT_DELALLOC
, NULL
);
1277 static void noinline_for_stack
1278 cleanup_bitmap_list(struct list_head
*bitmap_list
)
1280 struct btrfs_free_space
*entry
, *next
;
1282 list_for_each_entry_safe(entry
, next
, bitmap_list
, list
)
1283 list_del_init(&entry
->list
);
1286 static void noinline_for_stack
1287 cleanup_write_cache_enospc(struct inode
*inode
,
1288 struct btrfs_io_ctl
*io_ctl
,
1289 struct extent_state
**cached_state
)
1291 io_ctl_drop_pages(io_ctl
);
1292 unlock_extent(&BTRFS_I(inode
)->io_tree
, 0, i_size_read(inode
) - 1,
1296 static int __btrfs_wait_cache_io(struct btrfs_root
*root
,
1297 struct btrfs_trans_handle
*trans
,
1298 struct btrfs_block_group
*block_group
,
1299 struct btrfs_io_ctl
*io_ctl
,
1300 struct btrfs_path
*path
, u64 offset
)
1303 struct inode
*inode
= io_ctl
->inode
;
1308 /* Flush the dirty pages in the cache file. */
1309 ret
= flush_dirty_cache(inode
);
1313 /* Update the cache item to tell everyone this cache file is valid. */
1314 ret
= update_cache_item(trans
, root
, inode
, path
, offset
,
1315 io_ctl
->entries
, io_ctl
->bitmaps
);
1318 invalidate_inode_pages2(inode
->i_mapping
);
1319 BTRFS_I(inode
)->generation
= 0;
1321 btrfs_debug(root
->fs_info
,
1322 "failed to write free space cache for block group %llu error %d",
1323 block_group
->start
, ret
);
1325 btrfs_update_inode(trans
, BTRFS_I(inode
));
1328 /* the dirty list is protected by the dirty_bgs_lock */
1329 spin_lock(&trans
->transaction
->dirty_bgs_lock
);
1331 /* the disk_cache_state is protected by the block group lock */
1332 spin_lock(&block_group
->lock
);
1335 * only mark this as written if we didn't get put back on
1336 * the dirty list while waiting for IO. Otherwise our
1337 * cache state won't be right, and we won't get written again
1339 if (!ret
&& list_empty(&block_group
->dirty_list
))
1340 block_group
->disk_cache_state
= BTRFS_DC_WRITTEN
;
1342 block_group
->disk_cache_state
= BTRFS_DC_ERROR
;
1344 spin_unlock(&block_group
->lock
);
1345 spin_unlock(&trans
->transaction
->dirty_bgs_lock
);
1346 io_ctl
->inode
= NULL
;
1354 int btrfs_wait_cache_io(struct btrfs_trans_handle
*trans
,
1355 struct btrfs_block_group
*block_group
,
1356 struct btrfs_path
*path
)
1358 return __btrfs_wait_cache_io(block_group
->fs_info
->tree_root
, trans
,
1359 block_group
, &block_group
->io_ctl
,
1360 path
, block_group
->start
);
/*
 * Write out cached info to an inode.
 *
 * @inode:       freespace inode we are writing out
 * @ctl:         free space cache we are going to write out
 * @block_group: block_group for this cache if it belongs to a block_group
 * @io_ctl:      holds context for the io
 * @trans:       the trans handle
 *
 * This function writes out a free space cache struct to disk for quick recovery
 * on mount.  This will return 0 if it was successful in writing the cache out,
 * or an errno if it was not.
 */
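/*
 * Rough sequence (see the body below): pin and lock the cache pages, lock
 * the extent range, write the generation, then the extent entries, the
 * pinned extents and finally the bitmaps, zero the remaining pages, dirty
 * them and kick off writeback; the caller waits for the IO later.
 */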
1376 static int __btrfs_write_out_cache(struct inode
*inode
,
1377 struct btrfs_free_space_ctl
*ctl
,
1378 struct btrfs_block_group
*block_group
,
1379 struct btrfs_io_ctl
*io_ctl
,
1380 struct btrfs_trans_handle
*trans
)
1382 struct extent_state
*cached_state
= NULL
;
1383 LIST_HEAD(bitmap_list
);
1390 if (!i_size_read(inode
))
1393 WARN_ON(io_ctl
->pages
);
1394 ret
= io_ctl_init(io_ctl
, inode
, 1);
1398 if (block_group
&& (block_group
->flags
& BTRFS_BLOCK_GROUP_DATA
)) {
1399 down_write(&block_group
->data_rwsem
);
1400 spin_lock(&block_group
->lock
);
1401 if (block_group
->delalloc_bytes
) {
1402 block_group
->disk_cache_state
= BTRFS_DC_WRITTEN
;
1403 spin_unlock(&block_group
->lock
);
1404 up_write(&block_group
->data_rwsem
);
1405 BTRFS_I(inode
)->generation
= 0;
1410 spin_unlock(&block_group
->lock
);
1413 /* Lock all pages first so we can lock the extent safely. */
1414 ret
= io_ctl_prepare_pages(io_ctl
, false);
1418 lock_extent(&BTRFS_I(inode
)->io_tree
, 0, i_size_read(inode
) - 1,
1421 io_ctl_set_generation(io_ctl
, trans
->transid
);
1423 mutex_lock(&ctl
->cache_writeout_mutex
);
1424 /* Write out the extent entries in the free space cache */
1425 spin_lock(&ctl
->tree_lock
);
1426 ret
= write_cache_extent_entries(io_ctl
, ctl
,
1427 block_group
, &entries
, &bitmaps
,
1430 goto out_nospc_locked
;
1433 * Some spaces that are freed in the current transaction are pinned,
1434 * they will be added into free space cache after the transaction is
1435 * committed, we shouldn't lose them.
1437 * If this changes while we are working we'll get added back to
1438 * the dirty list and redo it. No locking needed
1440 ret
= write_pinned_extent_entries(trans
, block_group
, io_ctl
, &entries
);
1442 goto out_nospc_locked
;
1445 * At last, we write out all the bitmaps and keep cache_writeout_mutex
1446 * locked while doing it because a concurrent trim can be manipulating
1447 * or freeing the bitmap.
1449 ret
= write_bitmap_entries(io_ctl
, &bitmap_list
);
1450 spin_unlock(&ctl
->tree_lock
);
1451 mutex_unlock(&ctl
->cache_writeout_mutex
);
1455 /* Zero out the rest of the pages just to make sure */
1456 io_ctl_zero_remaining_pages(io_ctl
);
1458 /* Everything is written out, now we dirty the pages in the file. */
1459 i_size
= i_size_read(inode
);
1460 for (int i
= 0; i
< round_up(i_size
, PAGE_SIZE
) / PAGE_SIZE
; i
++) {
1461 u64 dirty_start
= i
* PAGE_SIZE
;
1462 u64 dirty_len
= min_t(u64
, dirty_start
+ PAGE_SIZE
, i_size
) - dirty_start
;
1464 ret
= btrfs_dirty_folio(BTRFS_I(inode
), page_folio(io_ctl
->pages
[i
]),
1465 dirty_start
, dirty_len
, &cached_state
, false);
1470 if (block_group
&& (block_group
->flags
& BTRFS_BLOCK_GROUP_DATA
))
1471 up_write(&block_group
->data_rwsem
);
1473 * Release the pages and unlock the extent, we will flush
1476 io_ctl_drop_pages(io_ctl
);
1477 io_ctl_free(io_ctl
);
1479 unlock_extent(&BTRFS_I(inode
)->io_tree
, 0, i_size_read(inode
) - 1,
1483 * at this point the pages are under IO and we're happy,
1484 * The caller is responsible for waiting on them and updating
1485 * the cache and the inode
1487 io_ctl
->entries
= entries
;
1488 io_ctl
->bitmaps
= bitmaps
;
1490 ret
= btrfs_fdatawrite_range(BTRFS_I(inode
), 0, (u64
)-1);
1497 cleanup_bitmap_list(&bitmap_list
);
1498 spin_unlock(&ctl
->tree_lock
);
1499 mutex_unlock(&ctl
->cache_writeout_mutex
);
1502 cleanup_write_cache_enospc(inode
, io_ctl
, &cached_state
);
1505 if (block_group
&& (block_group
->flags
& BTRFS_BLOCK_GROUP_DATA
))
1506 up_write(&block_group
->data_rwsem
);
1509 io_ctl
->inode
= NULL
;
1510 io_ctl_free(io_ctl
);
1512 invalidate_inode_pages2(inode
->i_mapping
);
1513 BTRFS_I(inode
)->generation
= 0;
1515 btrfs_update_inode(trans
, BTRFS_I(inode
));
1521 int btrfs_write_out_cache(struct btrfs_trans_handle
*trans
,
1522 struct btrfs_block_group
*block_group
,
1523 struct btrfs_path
*path
)
1525 struct btrfs_fs_info
*fs_info
= trans
->fs_info
;
1526 struct btrfs_free_space_ctl
*ctl
= block_group
->free_space_ctl
;
1527 struct inode
*inode
;
1530 spin_lock(&block_group
->lock
);
1531 if (block_group
->disk_cache_state
< BTRFS_DC_SETUP
) {
1532 spin_unlock(&block_group
->lock
);
1535 spin_unlock(&block_group
->lock
);
1537 inode
= lookup_free_space_inode(block_group
, path
);
1541 ret
= __btrfs_write_out_cache(inode
, ctl
, block_group
,
1542 &block_group
->io_ctl
, trans
);
1544 btrfs_debug(fs_info
,
1545 "failed to write free space cache for block group %llu error %d",
1546 block_group
->start
, ret
);
1547 spin_lock(&block_group
->lock
);
1548 block_group
->disk_cache_state
= BTRFS_DC_ERROR
;
1549 spin_unlock(&block_group
->lock
);
1551 block_group
->io_ctl
.inode
= NULL
;
1556 * if ret == 0 the caller is expected to call btrfs_wait_cache_io
1557 * to wait for IO and put the inode
static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit,
					  u64 offset)
{
	ASSERT(offset >= bitmap_start);
	offset -= bitmap_start;
	return (unsigned long)(div_u64(offset, unit));
}

static inline unsigned long bytes_to_bits(u64 bytes, u32 unit)
{
	return (unsigned long)(div_u64(bytes, unit));
}

static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
				   u64 offset)
{
	u64 bitmap_start;
	u64 bytes_per_bitmap;

	bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit;
	bitmap_start = offset - ctl->start;
	bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
	bitmap_start *= bytes_per_bitmap;
	bitmap_start += ctl->start;

	return bitmap_start;
}
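/*
 * Example (assuming a 4KiB unit): one bitmap spans BITS_PER_BITMAP * 4KiB =
 * 128MiB, so an offset 200MiB past ctl->start rounds down to the bitmap
 * starting at ctl->start + 128MiB.
 */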
1591 static int tree_insert_offset(struct btrfs_free_space_ctl
*ctl
,
1592 struct btrfs_free_cluster
*cluster
,
1593 struct btrfs_free_space
*new_entry
)
1595 struct rb_root
*root
;
1597 struct rb_node
*parent
= NULL
;
1599 lockdep_assert_held(&ctl
->tree_lock
);
1602 lockdep_assert_held(&cluster
->lock
);
1603 root
= &cluster
->root
;
1605 root
= &ctl
->free_space_offset
;
1611 struct btrfs_free_space
*info
;
1614 info
= rb_entry(parent
, struct btrfs_free_space
, offset_index
);
1616 if (new_entry
->offset
< info
->offset
) {
1618 } else if (new_entry
->offset
> info
->offset
) {
1619 p
= &(*p
)->rb_right
;
1622 * we could have a bitmap entry and an extent entry
1623 * share the same offset. If this is the case, we want
1624 * the extent entry to always be found first if we do a
1625 * linear search through the tree, since we want to have
1626 * the quickest allocation time, and allocating from an
1627 * extent is faster than allocating from a bitmap. So
1628 * if we're inserting a bitmap and we find an entry at
1629 * this offset, we want to go right, or after this entry
1630 * logically. If we are inserting an extent and we've
1631 * found a bitmap, we want to go left, or before
1634 if (new_entry
->bitmap
) {
1639 p
= &(*p
)->rb_right
;
1641 if (!info
->bitmap
) {
1650 rb_link_node(&new_entry
->offset_index
, parent
, p
);
1651 rb_insert_color(&new_entry
->offset_index
, root
);
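/*
 * Net effect of the tie-break above: when an extent entry and a bitmap
 * entry start at the same offset, the extent entry sorts first (to the
 * left), so an in-order walk tries the cheaper extent allocation before
 * falling back to the bitmap.
 */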
/*
 * This is a little subtle.  We *only* have ->max_extent_size set if we actually
 * searched through the bitmap and figured out the largest ->max_extent_size,
 * otherwise it's 0.  In the case that it's 0 we don't want to tell the
 * allocator the wrong thing, we want to use the actual real max_extent_size
 * we've found already if it's larger, or we want to use ->bytes.
 *
 * This matters because find_free_space() will skip entries whose ->bytes is
 * less than the required bytes.  So if we didn't search down this bitmap, we
 * may pick some previous entry that has a smaller ->max_extent_size than we
 * have.  For example, assume we have two entries, one that has
 * ->max_extent_size set to 4K and ->bytes set to 1M.  A second entry hasn't set
 * ->max_extent_size yet, has ->bytes set to 8K and it's contiguous.  We will
 * call into find_free_space(), and return with max_extent_size == 4K, because
 * that first bitmap entry had ->max_extent_size set, but the second one did
 * not.  If instead we returned 8K we'd come in searching for 8K, and find the
 * 8K contiguous range.
 *
 * Consider the other case, we have 2 8K chunks in that second entry and still
 * don't have ->max_extent_size set.  We'll return 16K, and the next time the
 * allocator comes in it'll fully search our second bitmap, and this time it'll
 * get an uptodate value of 8K as the maximum chunk size.  Then we'll get the
 * right allocation the next loop through.
 */
static inline u64 get_max_extent_size(const struct btrfs_free_space *entry)
{
	if (entry->bitmap && entry->max_extent_size)
		return entry->max_extent_size;
	return entry->bytes;
}
/*
 * We want the largest entry to be leftmost, so this is inverted from what you'd
 * normally expect.
 */
static bool entry_less(struct rb_node *node, const struct rb_node *parent)
{
	const struct btrfs_free_space *entry, *exist;

	entry = rb_entry(node, struct btrfs_free_space, bytes_index);
	exist = rb_entry(parent, struct btrfs_free_space, bytes_index);
	return get_max_extent_size(exist) < get_max_extent_size(entry);
}
1701 * searches the tree for the given offset.
1703 * fuzzy - If this is set, then we are trying to make an allocation, and we just
1704 * want a section that has at least bytes size and comes at or after the given
1707 static struct btrfs_free_space
*
1708 tree_search_offset(struct btrfs_free_space_ctl
*ctl
,
1709 u64 offset
, int bitmap_only
, int fuzzy
)
1711 struct rb_node
*n
= ctl
->free_space_offset
.rb_node
;
1712 struct btrfs_free_space
*entry
= NULL
, *prev
= NULL
;
1714 lockdep_assert_held(&ctl
->tree_lock
);
1716 /* find entry that is closest to the 'offset' */
1718 entry
= rb_entry(n
, struct btrfs_free_space
, offset_index
);
1721 if (offset
< entry
->offset
)
1723 else if (offset
> entry
->offset
)
1738 * bitmap entry and extent entry may share same offset,
1739 * in that case, bitmap entry comes after extent entry.
1744 entry
= rb_entry(n
, struct btrfs_free_space
, offset_index
);
1745 if (entry
->offset
!= offset
)
1748 WARN_ON(!entry
->bitmap
);
1751 if (entry
->bitmap
) {
1753 * if previous extent entry covers the offset,
1754 * we should return it instead of the bitmap entry
1756 n
= rb_prev(&entry
->offset_index
);
1758 prev
= rb_entry(n
, struct btrfs_free_space
,
1760 if (!prev
->bitmap
&&
1761 prev
->offset
+ prev
->bytes
> offset
)
1771 /* find last entry before the 'offset' */
1773 if (entry
->offset
> offset
) {
1774 n
= rb_prev(&entry
->offset_index
);
1776 entry
= rb_entry(n
, struct btrfs_free_space
,
1778 ASSERT(entry
->offset
<= offset
);
1787 if (entry
->bitmap
) {
1788 n
= rb_prev(&entry
->offset_index
);
1790 prev
= rb_entry(n
, struct btrfs_free_space
,
1792 if (!prev
->bitmap
&&
1793 prev
->offset
+ prev
->bytes
> offset
)
1796 if (entry
->offset
+ BITS_PER_BITMAP
* ctl
->unit
> offset
)
1798 } else if (entry
->offset
+ entry
->bytes
> offset
)
1805 n
= rb_next(&entry
->offset_index
);
1808 entry
= rb_entry(n
, struct btrfs_free_space
, offset_index
);
1809 if (entry
->bitmap
) {
1810 if (entry
->offset
+ BITS_PER_BITMAP
*
1814 if (entry
->offset
+ entry
->bytes
> offset
)
1821 static inline void unlink_free_space(struct btrfs_free_space_ctl
*ctl
,
1822 struct btrfs_free_space
*info
,
1825 lockdep_assert_held(&ctl
->tree_lock
);
1827 rb_erase(&info
->offset_index
, &ctl
->free_space_offset
);
1828 rb_erase_cached(&info
->bytes_index
, &ctl
->free_space_bytes
);
1829 ctl
->free_extents
--;
1831 if (!info
->bitmap
&& !btrfs_free_space_trimmed(info
)) {
1832 ctl
->discardable_extents
[BTRFS_STAT_CURR
]--;
1833 ctl
->discardable_bytes
[BTRFS_STAT_CURR
] -= info
->bytes
;
1837 ctl
->free_space
-= info
->bytes
;
1840 static int link_free_space(struct btrfs_free_space_ctl
*ctl
,
1841 struct btrfs_free_space
*info
)
1845 lockdep_assert_held(&ctl
->tree_lock
);
1847 ASSERT(info
->bytes
|| info
->bitmap
);
1848 ret
= tree_insert_offset(ctl
, NULL
, info
);
1852 rb_add_cached(&info
->bytes_index
, &ctl
->free_space_bytes
, entry_less
);
1854 if (!info
->bitmap
&& !btrfs_free_space_trimmed(info
)) {
1855 ctl
->discardable_extents
[BTRFS_STAT_CURR
]++;
1856 ctl
->discardable_bytes
[BTRFS_STAT_CURR
] += info
->bytes
;
1859 ctl
->free_space
+= info
->bytes
;
1860 ctl
->free_extents
++;
1864 static void relink_bitmap_entry(struct btrfs_free_space_ctl
*ctl
,
1865 struct btrfs_free_space
*info
)
1867 ASSERT(info
->bitmap
);
1870 * If our entry is empty it's because we're on a cluster and we don't
1871 * want to re-link it into our ctl bytes index.
1873 if (RB_EMPTY_NODE(&info
->bytes_index
))
1876 lockdep_assert_held(&ctl
->tree_lock
);
1878 rb_erase_cached(&info
->bytes_index
, &ctl
->free_space_bytes
);
1879 rb_add_cached(&info
->bytes_index
, &ctl
->free_space_bytes
, entry_less
);
1882 static inline void bitmap_clear_bits(struct btrfs_free_space_ctl
*ctl
,
1883 struct btrfs_free_space
*info
,
1884 u64 offset
, u64 bytes
, bool update_stat
)
1886 unsigned long start
, count
, end
;
1887 int extent_delta
= -1;
1889 start
= offset_to_bit(info
->offset
, ctl
->unit
, offset
);
1890 count
= bytes_to_bits(bytes
, ctl
->unit
);
1891 end
= start
+ count
;
1892 ASSERT(end
<= BITS_PER_BITMAP
);
1894 bitmap_clear(info
->bitmap
, start
, count
);
1896 info
->bytes
-= bytes
;
1897 if (info
->max_extent_size
> ctl
->unit
)
1898 info
->max_extent_size
= 0;
1900 relink_bitmap_entry(ctl
, info
);
1902 if (start
&& test_bit(start
- 1, info
->bitmap
))
1905 if (end
< BITS_PER_BITMAP
&& test_bit(end
, info
->bitmap
))
1908 info
->bitmap_extents
+= extent_delta
;
1909 if (!btrfs_free_space_trimmed(info
)) {
1910 ctl
->discardable_extents
[BTRFS_STAT_CURR
] += extent_delta
;
1911 ctl
->discardable_bytes
[BTRFS_STAT_CURR
] -= bytes
;
1915 ctl
->free_space
-= bytes
;
1918 static void btrfs_bitmap_set_bits(struct btrfs_free_space_ctl
*ctl
,
1919 struct btrfs_free_space
*info
, u64 offset
,
1922 unsigned long start
, count
, end
;
1923 int extent_delta
= 1;
1925 start
= offset_to_bit(info
->offset
, ctl
->unit
, offset
);
1926 count
= bytes_to_bits(bytes
, ctl
->unit
);
1927 end
= start
+ count
;
1928 ASSERT(end
<= BITS_PER_BITMAP
);
1930 bitmap_set(info
->bitmap
, start
, count
);
1933 * We set some bytes, we have no idea what the max extent size is
1936 info
->max_extent_size
= 0;
1937 info
->bytes
+= bytes
;
1938 ctl
->free_space
+= bytes
;
1940 relink_bitmap_entry(ctl
, info
);
1942 if (start
&& test_bit(start
- 1, info
->bitmap
))
1945 if (end
< BITS_PER_BITMAP
&& test_bit(end
, info
->bitmap
))
1948 info
->bitmap_extents
+= extent_delta
;
1949 if (!btrfs_free_space_trimmed(info
)) {
1950 ctl
->discardable_extents
[BTRFS_STAT_CURR
] += extent_delta
;
1951 ctl
->discardable_bytes
[BTRFS_STAT_CURR
] += bytes
;
1956 * If we can not find suitable extent, we will use bytes to record
1957 * the size of the max extent.
1959 static int search_bitmap(struct btrfs_free_space_ctl
*ctl
,
1960 struct btrfs_free_space
*bitmap_info
, u64
*offset
,
1961 u64
*bytes
, bool for_alloc
)
1963 unsigned long found_bits
= 0;
1964 unsigned long max_bits
= 0;
1965 unsigned long bits
, i
;
1966 unsigned long next_zero
;
1967 unsigned long extent_bits
;
1970 * Skip searching the bitmap if we don't have a contiguous section that
1971 * is large enough for this allocation.
1974 bitmap_info
->max_extent_size
&&
1975 bitmap_info
->max_extent_size
< *bytes
) {
1976 *bytes
= bitmap_info
->max_extent_size
;
1980 i
= offset_to_bit(bitmap_info
->offset
, ctl
->unit
,
1981 max_t(u64
, *offset
, bitmap_info
->offset
));
1982 bits
= bytes_to_bits(*bytes
, ctl
->unit
);
1984 for_each_set_bit_from(i
, bitmap_info
->bitmap
, BITS_PER_BITMAP
) {
1985 if (for_alloc
&& bits
== 1) {
1989 next_zero
= find_next_zero_bit(bitmap_info
->bitmap
,
1990 BITS_PER_BITMAP
, i
);
1991 extent_bits
= next_zero
- i
;
1992 if (extent_bits
>= bits
) {
1993 found_bits
= extent_bits
;
1995 } else if (extent_bits
> max_bits
) {
1996 max_bits
= extent_bits
;
2002 *offset
= (u64
)(i
* ctl
->unit
) + bitmap_info
->offset
;
2003 *bytes
= (u64
)(found_bits
) * ctl
->unit
;
2007 *bytes
= (u64
)(max_bits
) * ctl
->unit
;
2008 bitmap_info
->max_extent_size
= *bytes
;
2009 relink_bitmap_entry(ctl
, bitmap_info
);
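/*
 * search_bitmap() return convention: on success *offset/*bytes describe the
 * run that was found; on failure *bytes is set to the largest contiguous run
 * seen, which is also cached in max_extent_size so later searches can skip
 * this bitmap early.
 */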
2013 /* Cache the size of the max extent in bytes */
2014 static struct btrfs_free_space
*
2015 find_free_space(struct btrfs_free_space_ctl
*ctl
, u64
*offset
, u64
*bytes
,
2016 unsigned long align
, u64
*max_extent_size
, bool use_bytes_index
)
2018 struct btrfs_free_space
*entry
;
2019 struct rb_node
*node
;
2024 if (!ctl
->free_space_offset
.rb_node
)
2027 if (use_bytes_index
) {
2028 node
= rb_first_cached(&ctl
->free_space_bytes
);
2030 entry
= tree_search_offset(ctl
, offset_to_bitmap(ctl
, *offset
),
2034 node
= &entry
->offset_index
;
2037 for (; node
; node
= rb_next(node
)) {
2038 if (use_bytes_index
)
2039 entry
= rb_entry(node
, struct btrfs_free_space
,
2042 entry
= rb_entry(node
, struct btrfs_free_space
,
2046 * If we are using the bytes index then all subsequent entries
2047 * in this tree are going to be < bytes, so simply set the max
2048 * extent size and exit the loop.
2050 * If we're using the offset index then we need to keep going
2051 * through the rest of the tree.
2053 if (entry
->bytes
< *bytes
) {
2054 *max_extent_size
= max(get_max_extent_size(entry
),
2056 if (use_bytes_index
)
2061 /* make sure the space returned is big enough
2062 * to match our requested alignment
2064 if (*bytes
>= align
) {
2065 tmp
= entry
->offset
- ctl
->start
+ align
- 1;
2066 tmp
= div64_u64(tmp
, align
);
2067 tmp
= tmp
* align
+ ctl
->start
;
2068 align_off
= tmp
- entry
->offset
;
2071 tmp
= entry
->offset
;
2075 * We don't break here if we're using the bytes index because we
2076 * may have another entry that has the correct alignment that is
2077 * the right size, so we don't want to miss that possibility.
2078 * At worst this adds another loop through the logic, but if we
2079 * broke here we could prematurely ENOSPC.
2081 if (entry
->bytes
< *bytes
+ align_off
) {
2082 *max_extent_size
= max(get_max_extent_size(entry
),
2087 if (entry
->bitmap
) {
2088 struct rb_node
*old_next
= rb_next(node
);
2091 ret
= search_bitmap(ctl
, entry
, &tmp
, &size
, true);
2098 max(get_max_extent_size(entry
),
2103 * The bitmap may have gotten re-arranged in the space
2104 * index here because the max_extent_size may have been
2105 * updated. Start from the beginning again if this
2108 if (use_bytes_index
&& old_next
!= rb_next(node
))
2114 *bytes
= entry
->bytes
- align_off
;
2121 static void add_new_bitmap(struct btrfs_free_space_ctl
*ctl
,
2122 struct btrfs_free_space
*info
, u64 offset
)
2124 info
->offset
= offset_to_bitmap(ctl
, offset
);
2126 info
->bitmap_extents
= 0;
2127 INIT_LIST_HEAD(&info
->list
);
2128 link_free_space(ctl
, info
);
2129 ctl
->total_bitmaps
++;
2130 recalculate_thresholds(ctl
);
2133 static void free_bitmap(struct btrfs_free_space_ctl
*ctl
,
2134 struct btrfs_free_space
*bitmap_info
)
2137 * Normally when this is called, the bitmap is completely empty. However,
2138 * if we are blowing up the free space cache for one reason or another
2139 * via __btrfs_remove_free_space_cache(), then it may not be freed and
2140 * we may leave stats on the table.
2142 if (bitmap_info
->bytes
&& !btrfs_free_space_trimmed(bitmap_info
)) {
2143 ctl
->discardable_extents
[BTRFS_STAT_CURR
] -=
2144 bitmap_info
->bitmap_extents
;
2145 ctl
->discardable_bytes
[BTRFS_STAT_CURR
] -= bitmap_info
->bytes
;
2148 unlink_free_space(ctl
, bitmap_info
, true);
2149 kmem_cache_free(btrfs_free_space_bitmap_cachep
, bitmap_info
->bitmap
);
2150 kmem_cache_free(btrfs_free_space_cachep
, bitmap_info
);
2151 ctl
->total_bitmaps
--;
2152 recalculate_thresholds(ctl
);
2155 static noinline
int remove_from_bitmap(struct btrfs_free_space_ctl
*ctl
,
2156 struct btrfs_free_space
*bitmap_info
,
2157 u64
*offset
, u64
*bytes
)
2160 u64 search_start
, search_bytes
;
2164 end
= bitmap_info
->offset
+ (u64
)(BITS_PER_BITMAP
* ctl
->unit
) - 1;
2167 * We need to search for bits in this bitmap. We could only cover some
2168 * of the extent in this bitmap thanks to how we add space, so we need
 * to search for as much of it as we can and clear that amount, and then
2170 * go searching for the next bit.
2172 search_start
= *offset
;
2173 search_bytes
= ctl
->unit
;
2174 search_bytes
= min(search_bytes
, end
- search_start
+ 1);
2175 ret
= search_bitmap(ctl
, bitmap_info
, &search_start
, &search_bytes
,
2177 if (ret
< 0 || search_start
!= *offset
)
2180 /* We may have found more bits than what we need */
2181 search_bytes
= min(search_bytes
, *bytes
);
2183 /* Cannot clear past the end of the bitmap */
2184 search_bytes
= min(search_bytes
, end
- search_start
+ 1);
2186 bitmap_clear_bits(ctl
, bitmap_info
, search_start
, search_bytes
, true);
2187 *offset
+= search_bytes
;
2188 *bytes
-= search_bytes
;
2191 struct rb_node
*next
= rb_next(&bitmap_info
->offset_index
);
2192 if (!bitmap_info
->bytes
)
2193 free_bitmap(ctl
, bitmap_info
);
2196 * no entry after this bitmap, but we still have bytes to
2197 * remove, so something has gone wrong.
2202 bitmap_info
= rb_entry(next
, struct btrfs_free_space
,
2206 * if the next entry isn't a bitmap we need to return to let the
2207 * extent stuff do its work.
2209 if (!bitmap_info
->bitmap
)
2213 * Ok the next item is a bitmap, but it may not actually hold
2214 * the information for the rest of this free space stuff, so
2215 * look for it, and if we don't find it return so we can try
2216 * everything over again.
2218 search_start
= *offset
;
2219 search_bytes
= ctl
->unit
;
2220 ret
= search_bitmap(ctl
, bitmap_info
, &search_start
,
2221 &search_bytes
, false);
2222 if (ret
< 0 || search_start
!= *offset
)
2226 } else if (!bitmap_info
->bytes
)
2227 free_bitmap(ctl
, bitmap_info
);
2232 static u64
add_bytes_to_bitmap(struct btrfs_free_space_ctl
*ctl
,
2233 struct btrfs_free_space
*info
, u64 offset
,
2234 u64 bytes
, enum btrfs_trim_state trim_state
)
2236 u64 bytes_to_set
= 0;
2240 * This is a tradeoff to make bitmap trim state minimal. We mark the
2241 * whole bitmap untrimmed if at any point we add untrimmed regions.
2243 if (trim_state
== BTRFS_TRIM_STATE_UNTRIMMED
) {
2244 if (btrfs_free_space_trimmed(info
)) {
2245 ctl
->discardable_extents
[BTRFS_STAT_CURR
] +=
2246 info
->bitmap_extents
;
2247 ctl
->discardable_bytes
[BTRFS_STAT_CURR
] += info
->bytes
;
2249 info
->trim_state
= BTRFS_TRIM_STATE_UNTRIMMED
;
2252 end
= info
->offset
+ (u64
)(BITS_PER_BITMAP
* ctl
->unit
);
2254 bytes_to_set
= min(end
- offset
, bytes
);
2256 btrfs_bitmap_set_bits(ctl
, info
, offset
, bytes_to_set
);
2258 return bytes_to_set
;
2262 static bool use_bitmap(struct btrfs_free_space_ctl
*ctl
,
2263 struct btrfs_free_space
*info
)
2265 struct btrfs_block_group
*block_group
= ctl
->block_group
;
2266 struct btrfs_fs_info
*fs_info
= block_group
->fs_info
;
2267 bool forced
= false;
2269 #ifdef CONFIG_BTRFS_DEBUG
2270 if (btrfs_should_fragment_free_space(block_group
))
2274 /* This is a way to reclaim large regions from the bitmaps. */
2275 if (!forced
&& info
->bytes
>= FORCE_EXTENT_THRESHOLD
)
2279 * If we are below the extents threshold then we can add this as an
2280 * extent, and don't have to deal with the bitmap
2282 if (!forced
&& ctl
->free_extents
< ctl
->extents_thresh
) {
 * If this block group has some small extents we don't want to
 * use up all of our free slots in the cache with them, we want
 * to reserve them for larger extents.  However, if we have plenty
 * of cache left then go ahead and add them, no sense in adding
 * the overhead of a bitmap if we don't have to.
2290 if (info
->bytes
<= fs_info
->sectorsize
* 8) {
2291 if (ctl
->free_extents
* 3 <= ctl
->extents_thresh
)
2299 * The original block groups from mkfs can be really small, like 8
2300 * megabytes, so don't bother with a bitmap for those entries. However
2301 * some block groups can be smaller than what a bitmap would cover but
2302 * are still large enough that they could overflow the 32k memory limit,
2303 * so allow those block groups to still be allowed to have a bitmap
2306 if (((BITS_PER_BITMAP
* ctl
->unit
) >> 1) > block_group
->length
)
2312 static const struct btrfs_free_space_op free_space_op
= {
2313 .use_bitmap
= use_bitmap
,
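/*
 * Example of the heuristic above (illustrative, assuming a 4K sectorsize):
 * an extent of at most 32K (sectorsize * 8) is only pushed into a bitmap
 * once more than a third of the extent slots are already in use; larger
 * extents stay as extent entries while we are below extents_thresh; and an
 * extent of FORCE_EXTENT_THRESHOLD (1M) or more always stays an extent entry
 * unless debug fragmentation forces bitmaps.
 */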
2316 static int insert_into_bitmap(struct btrfs_free_space_ctl
*ctl
,
2317 struct btrfs_free_space
*info
)
2319 struct btrfs_free_space
*bitmap_info
;
2320 struct btrfs_block_group
*block_group
= NULL
;
2322 u64 bytes
, offset
, bytes_added
;
2323 enum btrfs_trim_state trim_state
;
2326 bytes
= info
->bytes
;
2327 offset
= info
->offset
;
2328 trim_state
= info
->trim_state
;
2330 if (!ctl
->op
->use_bitmap(ctl
, info
))
2333 if (ctl
->op
== &free_space_op
)
2334 block_group
= ctl
->block_group
;
2337 * Since we link bitmaps right into the cluster we need to see if we
2338 * have a cluster here, and if so and it has our bitmap we need to add
2339 * the free space to that bitmap.
2341 if (block_group
&& !list_empty(&block_group
->cluster_list
)) {
2342 struct btrfs_free_cluster
*cluster
;
2343 struct rb_node
*node
;
2344 struct btrfs_free_space
*entry
;
2346 cluster
= list_entry(block_group
->cluster_list
.next
,
2347 struct btrfs_free_cluster
,
2349 spin_lock(&cluster
->lock
);
2350 node
= rb_first(&cluster
->root
);
2352 spin_unlock(&cluster
->lock
);
2353 goto no_cluster_bitmap
;
2356 entry
= rb_entry(node
, struct btrfs_free_space
, offset_index
);
2357 if (!entry
->bitmap
) {
2358 spin_unlock(&cluster
->lock
);
2359 goto no_cluster_bitmap
;
2362 if (entry
->offset
== offset_to_bitmap(ctl
, offset
)) {
2363 bytes_added
= add_bytes_to_bitmap(ctl
, entry
, offset
,
2365 bytes
-= bytes_added
;
2366 offset
+= bytes_added
;
2368 spin_unlock(&cluster
->lock
);
2376 bitmap_info
= tree_search_offset(ctl
, offset_to_bitmap(ctl
, offset
),
2383 bytes_added
= add_bytes_to_bitmap(ctl
, bitmap_info
, offset
, bytes
,
2385 bytes
-= bytes_added
;
2386 offset
+= bytes_added
;
2396 if (info
&& info
->bitmap
) {
2397 add_new_bitmap(ctl
, info
, offset
);
2402 spin_unlock(&ctl
->tree_lock
);
2404 /* no pre-allocated info, allocate a new one */
2406 info
= kmem_cache_zalloc(btrfs_free_space_cachep
,
2409 spin_lock(&ctl
->tree_lock
);
2415 /* allocate the bitmap */
2416 info
->bitmap
= kmem_cache_zalloc(btrfs_free_space_bitmap_cachep
,
2418 info
->trim_state
= BTRFS_TRIM_STATE_TRIMMED
;
2419 spin_lock(&ctl
->tree_lock
);
2420 if (!info
->bitmap
) {
2430 kmem_cache_free(btrfs_free_space_bitmap_cachep
,
2432 kmem_cache_free(btrfs_free_space_cachep
, info
);
/*
 * Free space merging rules:
 *  1) Merge trimmed areas together
 *  2) Let untrimmed areas coalesce with trimmed areas
 *  3) Always pull neighboring regions from bitmaps
 *
 * The above rules are for when we merge free space based on btrfs_trim_state.
 * Rules 2 and 3 are subtle because they are suboptimal, but are done for the
 * same reason: to promote larger extent regions which makes life easier for
 * find_free_extent().  Rule 2 enables coalescing based on the common path
 * being the return of free space from btrfs_finish_extent_commit().  So when
 * free space is trimmed, it prevents aggregating a trimmed new region with
 * untrimmed regions in the rb_tree.  Rule 3 is purely to obtain larger extents
 * and provide find_free_extent() with the largest extents possible hoping for
 * the reuse path.
 */
static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
				 struct btrfs_free_space *info, bool update_stat)
{
	struct btrfs_free_space *left_info = NULL;
	struct btrfs_free_space *right_info;
	bool merged = false;
	u64 offset = info->offset;
	u64 bytes = info->bytes;
	const bool is_trimmed = btrfs_free_space_trimmed(info);
	struct rb_node *right_prev = NULL;

	/*
	 * first we want to see if there is free space adjacent to the range we
	 * are adding, if there is remove that struct and add a new one to
	 * cover the entire range
	 */
	right_info = tree_search_offset(ctl, offset + bytes, 0, 0);
	if (right_info)
		right_prev = rb_prev(&right_info->offset_index);

	if (right_prev)
		left_info = rb_entry(right_prev, struct btrfs_free_space, offset_index);
	else if (!right_info)
		left_info = tree_search_offset(ctl, offset - 1, 0, 0);

	/* See try_merge_free_space() comment. */
	if (right_info && !right_info->bitmap &&
	    (!is_trimmed || btrfs_free_space_trimmed(right_info))) {
		unlink_free_space(ctl, right_info, update_stat);
		info->bytes += right_info->bytes;
		kmem_cache_free(btrfs_free_space_cachep, right_info);
		merged = true;
	}

	/* See try_merge_free_space() comment. */
	if (left_info && !left_info->bitmap &&
	    left_info->offset + left_info->bytes == offset &&
	    (!is_trimmed || btrfs_free_space_trimmed(left_info))) {
		unlink_free_space(ctl, left_info, update_stat);
		info->offset = left_info->offset;
		info->bytes += left_info->bytes;
		kmem_cache_free(btrfs_free_space_cachep, left_info);
		merged = true;
	}

	return merged;
}
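/*
 * Example of the rules above: freeing 4K at offset X when trimmed extent
 * entries already cover [X - 8K, X) and [X + 4K, X + 12K) collapses all
 * three into one 20K extent entry starting at X - 8K. Because the new
 * region is untrimmed, the merged entry is left untrimmed as well (rule 2),
 * and any bits a neighboring bitmap holds right next to it can then be
 * pulled in too (rule 3, via steal_from_bitmap()).
 */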
static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl,
				     struct btrfs_free_space *info,
				     bool update_stat)
{
	struct btrfs_free_space *bitmap;
	unsigned long i;
	unsigned long j;
	const u64 end = info->offset + info->bytes;
	const u64 bitmap_offset = offset_to_bitmap(ctl, end);
	u64 bytes;

	bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
	if (!bitmap)
		return false;

	i = offset_to_bit(bitmap->offset, ctl->unit, end);
	j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i);
	if (j == i)
		return false;
	bytes = (j - i) * ctl->unit;
	info->bytes += bytes;

	/* See try_merge_free_space() comment. */
	if (!btrfs_free_space_trimmed(bitmap))
		info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;

	bitmap_clear_bits(ctl, bitmap, end, bytes, update_stat);

	if (!bitmap->bytes)
		free_bitmap(ctl, bitmap);

	return true;
}
2536 static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl
*ctl
,
2537 struct btrfs_free_space
*info
,
2540 struct btrfs_free_space
*bitmap
;
2544 unsigned long prev_j
;
2547 bitmap_offset
= offset_to_bitmap(ctl
, info
->offset
);
2548 /* If we're on a boundary, try the previous logical bitmap. */
2549 if (bitmap_offset
== info
->offset
) {
2550 if (info
->offset
== 0)
2552 bitmap_offset
= offset_to_bitmap(ctl
, info
->offset
- 1);
2555 bitmap
= tree_search_offset(ctl
, bitmap_offset
, 1, 0);
2559 i
= offset_to_bit(bitmap
->offset
, ctl
->unit
, info
->offset
) - 1;
2561 prev_j
= (unsigned long)-1;
2562 for_each_clear_bit_from(j
, bitmap
->bitmap
, BITS_PER_BITMAP
) {
2570 if (prev_j
== (unsigned long)-1)
2571 bytes
= (i
+ 1) * ctl
->unit
;
2573 bytes
= (i
- prev_j
) * ctl
->unit
;
2575 info
->offset
-= bytes
;
2576 info
->bytes
+= bytes
;
2578 /* See try_merge_free_space() comment. */
2579 if (!btrfs_free_space_trimmed(bitmap
))
2580 info
->trim_state
= BTRFS_TRIM_STATE_UNTRIMMED
;
2582 bitmap_clear_bits(ctl
, bitmap
, info
->offset
, bytes
, update_stat
);
2585 free_bitmap(ctl
, bitmap
);
/*
 * We prefer always to allocate from extent entries, both for clustered and
 * non-clustered allocation requests. So when attempting to add a new extent
 * entry, try to see if there's adjacent free space in bitmap entries, and if
 * there is, migrate that space from the bitmaps to the extent.
 * Like this we get better chances of satisfying space allocation requests
 * because we attempt to satisfy them based on a single cache entry, and never
 * on 2 or more entries - even if the entries represent a contiguous free space
 * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry
 * ends).
 */
static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_space *info,
			      bool update_stat)
{
	/*
	 * Only work with disconnected entries, as we can change their offset,
	 * and must be extent entries.
	 */
	ASSERT(!info->bitmap);
	ASSERT(RB_EMPTY_NODE(&info->offset_index));

	if (ctl->total_bitmaps > 0) {
		bool stole_end;
		bool stole_front = false;

		stole_end = steal_from_bitmap_to_end(ctl, info, update_stat);
		if (ctl->total_bitmaps > 0)
			stole_front = steal_from_bitmap_to_front(ctl, info,
								 update_stat);

		if (stole_end || stole_front)
			try_merge_free_space(ctl, info, update_stat);
	}
}
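/*
 * Example: if the extent entry being added is [X, X + 64K) and the bitmap
 * covering X + 64K has its first 16K of bits set, steal_from_bitmap_to_end()
 * clears those bits and grows the entry to [X, X + 80K); stealing from the
 * bitmap in front of X works the same way in the other direction. The entry
 * must still be unlinked (RB_EMPTY_NODE) because its offset may move.
 */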
static int __btrfs_add_free_space(struct btrfs_block_group *block_group,
				  u64 offset, u64 bytes,
				  enum btrfs_trim_state trim_state)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct btrfs_free_space *info;
	int ret = 0;
	u64 filter_bytes = bytes;

	ASSERT(!btrfs_is_zoned(fs_info));

	info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
	if (!info)
		return -ENOMEM;

	info->offset = offset;
	info->bytes = bytes;
	info->trim_state = trim_state;
	RB_CLEAR_NODE(&info->offset_index);
	RB_CLEAR_NODE(&info->bytes_index);

	spin_lock(&ctl->tree_lock);

	if (try_merge_free_space(ctl, info, true))
		goto link;

	/*
	 * There was no extent directly to the left or right of this new
	 * extent then we know we're going to have to allocate a new extent, so
	 * before we do that see if we need to drop this into a bitmap
	 */
	ret = insert_into_bitmap(ctl, info);
	if (ret < 0) {
		goto out;
	} else if (ret) {
		ret = 0;
		goto out;
	}
link:
	/*
	 * Only steal free space from adjacent bitmaps if we're sure we're not
	 * going to add the new free space to existing bitmap entries - because
	 * that would mean unnecessary work that would be reverted. Therefore
	 * attempt to steal space from bitmaps if we're adding an extent entry.
	 */
	steal_from_bitmap(ctl, info, true);

	filter_bytes = max(filter_bytes, info->bytes);

	ret = link_free_space(ctl, info);
	if (ret)
		kmem_cache_free(btrfs_free_space_cachep, info);
out:
	btrfs_discard_update_discardable(block_group);
	spin_unlock(&ctl->tree_lock);

	if (ret) {
		btrfs_crit(fs_info, "unable to add free space :%d", ret);
		ASSERT(ret != -EEXIST);
	}

	if (trim_state != BTRFS_TRIM_STATE_TRIMMED) {
		btrfs_discard_check_filter(block_group, filter_bytes);
		btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
	}

	return ret;
}
static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
					u64 bytenr, u64 size, bool used)
{
	struct btrfs_space_info *sinfo = block_group->space_info;
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	u64 offset = bytenr - block_group->start;
	u64 to_free, to_unusable;
	int bg_reclaim_threshold = 0;
	bool initial;
	u64 reclaimable_unusable;

	spin_lock(&block_group->lock);

	initial = ((size == block_group->length) && (block_group->alloc_offset == 0));
	WARN_ON(!initial && offset + size > block_group->zone_capacity);
	if (!initial)
		bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold);

	if (!used)
		to_free = size;
	else if (initial)
		to_free = block_group->zone_capacity;
	else if (offset >= block_group->alloc_offset)
		to_free = size;
	else if (offset + size <= block_group->alloc_offset)
		to_free = 0;
	else
		to_free = offset + size - block_group->alloc_offset;
	to_unusable = size - to_free;

	spin_lock(&ctl->tree_lock);
	ctl->free_space += to_free;
	spin_unlock(&ctl->tree_lock);
	/*
	 * If the block group is read-only, we should account freed space into
	 * bytes_readonly.
	 */
	if (!block_group->ro) {
		block_group->zone_unusable += to_unusable;
		WARN_ON(block_group->zone_unusable > block_group->length);
	}
	if (!used)
		block_group->alloc_offset -= size;

	reclaimable_unusable = block_group->zone_unusable -
			       (block_group->length - block_group->zone_capacity);
	/* All the region is now unusable. Mark it as unused and reclaim */
	if (block_group->zone_unusable == block_group->length) {
		btrfs_mark_bg_unused(block_group);
	} else if (bg_reclaim_threshold &&
		   reclaimable_unusable >=
		   mult_perc(block_group->zone_capacity, bg_reclaim_threshold)) {
		btrfs_mark_bg_to_reclaim(block_group);
	}

	spin_unlock(&block_group->lock);

	return 0;
}
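/*
 * Worked example for the accounting above (numbers are illustrative): a
 * zoned block group with alloc_offset at 64M frees [bytenr, bytenr + 16M)
 * where offset = bytenr - start = 56M. Only the part at or beyond the
 * allocation pointer is returned as usable free space
 * (to_free = 56M + 16M - 64M = 8M); the other 8M becomes zone_unusable
 * until the zone can be reset.
 */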
int btrfs_add_free_space(struct btrfs_block_group *block_group,
			 u64 bytenr, u64 size)
{
	enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;

	if (btrfs_is_zoned(block_group->fs_info))
		return __btrfs_add_free_space_zoned(block_group, bytenr, size,
						    true);

	if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC))
		trim_state = BTRFS_TRIM_STATE_TRIMMED;

	return __btrfs_add_free_space(block_group, bytenr, size, trim_state);
}

int btrfs_add_free_space_unused(struct btrfs_block_group *block_group,
				u64 bytenr, u64 size)
{
	if (btrfs_is_zoned(block_group->fs_info))
		return __btrfs_add_free_space_zoned(block_group, bytenr, size,
						    false);

	return btrfs_add_free_space(block_group, bytenr, size);
}

/*
 * This is a subtle distinction because when adding free space back in general,
 * we want it to be added as untrimmed for async. But in the case where we add
 * it on loading of a block group, we want to consider it trimmed.
 */
int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group,
				       u64 bytenr, u64 size)
{
	enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;

	if (btrfs_is_zoned(block_group->fs_info))
		return __btrfs_add_free_space_zoned(block_group, bytenr, size,
						    true);

	if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC) ||
	    btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
		trim_state = BTRFS_TRIM_STATE_TRIMMED;

	return __btrfs_add_free_space(block_group, bytenr, size, trim_state);
}
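/*
 * Example: when a block group is loaded with -o discard=async, space added
 * through this helper is marked BTRFS_TRIM_STATE_TRIMMED, so the async
 * discard worker does not re-discard regions that were never allocated
 * since the last mount; space freed at runtime still arrives untrimmed via
 * btrfs_add_free_space().
 */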
2803 int btrfs_remove_free_space(struct btrfs_block_group
*block_group
,
2804 u64 offset
, u64 bytes
)
2806 struct btrfs_free_space_ctl
*ctl
= block_group
->free_space_ctl
;
2807 struct btrfs_free_space
*info
;
2809 bool re_search
= false;
2811 if (btrfs_is_zoned(block_group
->fs_info
)) {
2813 * This can happen with conventional zones when replaying log.
2814 * Since the allocation info of tree-log nodes are not recorded
2815 * to the extent-tree, calculate_alloc_pointer() failed to
2816 * advance the allocation pointer after last allocated tree log
2819 * This function is called from
2820 * btrfs_pin_extent_for_log_replay() when replaying the log.
2821 * Advance the pointer not to overwrite the tree-log nodes.
2823 if (block_group
->start
+ block_group
->alloc_offset
<
2825 block_group
->alloc_offset
=
2826 offset
+ bytes
- block_group
->start
;
2831 spin_lock(&ctl
->tree_lock
);
2838 info
= tree_search_offset(ctl
, offset
, 0, 0);
2841 * oops didn't find an extent that matched the space we wanted
2842 * to remove, look for a bitmap instead
2844 info
= tree_search_offset(ctl
, offset_to_bitmap(ctl
, offset
),
2848 * If we found a partial bit of our free space in a
2849 * bitmap but then couldn't find the other part this may
2850 * be a problem, so WARN about it.
2858 if (!info
->bitmap
) {
2859 unlink_free_space(ctl
, info
, true);
2860 if (offset
== info
->offset
) {
2861 u64 to_free
= min(bytes
, info
->bytes
);
2863 info
->bytes
-= to_free
;
2864 info
->offset
+= to_free
;
2866 ret
= link_free_space(ctl
, info
);
2869 kmem_cache_free(btrfs_free_space_cachep
, info
);
2876 u64 old_end
= info
->bytes
+ info
->offset
;
2878 info
->bytes
= offset
- info
->offset
;
2879 ret
= link_free_space(ctl
, info
);
2884 /* Not enough bytes in this entry to satisfy us */
2885 if (old_end
< offset
+ bytes
) {
2886 bytes
-= old_end
- offset
;
2889 } else if (old_end
== offset
+ bytes
) {
2893 spin_unlock(&ctl
->tree_lock
);
2895 ret
= __btrfs_add_free_space(block_group
,
2897 old_end
- (offset
+ bytes
),
2904 ret
= remove_from_bitmap(ctl
, info
, &offset
, &bytes
);
2905 if (ret
== -EAGAIN
) {
2910 btrfs_discard_update_discardable(block_group
);
2911 spin_unlock(&ctl
->tree_lock
);
2916 void btrfs_dump_free_space(struct btrfs_block_group
*block_group
,
2919 struct btrfs_fs_info
*fs_info
= block_group
->fs_info
;
2920 struct btrfs_free_space_ctl
*ctl
= block_group
->free_space_ctl
;
2921 struct btrfs_free_space
*info
;
2926 * Zoned btrfs does not use free space tree and cluster. Just print
2927 * out the free space after the allocation offset.
2929 if (btrfs_is_zoned(fs_info
)) {
2930 btrfs_info(fs_info
, "free space %llu active %d",
2931 block_group
->zone_capacity
- block_group
->alloc_offset
,
2932 test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE
,
2933 &block_group
->runtime_flags
));
2937 spin_lock(&ctl
->tree_lock
);
2938 for (n
= rb_first(&ctl
->free_space_offset
); n
; n
= rb_next(n
)) {
2939 info
= rb_entry(n
, struct btrfs_free_space
, offset_index
);
2940 if (info
->bytes
>= bytes
&& !block_group
->ro
)
2942 btrfs_crit(fs_info
, "entry offset %llu, bytes %llu, bitmap %s",
2943 info
->offset
, info
->bytes
, str_yes_no(info
->bitmap
));
2945 spin_unlock(&ctl
->tree_lock
);
2946 btrfs_info(fs_info
, "block group has cluster?: %s",
2947 str_no_yes(list_empty(&block_group
->cluster_list
)));
2949 "%d free space entries at or bigger than %llu bytes",
void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group,
			       struct btrfs_free_space_ctl *ctl)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;

	spin_lock_init(&ctl->tree_lock);
	ctl->unit = fs_info->sectorsize;
	ctl->start = block_group->start;
	ctl->block_group = block_group;
	ctl->op = &free_space_op;
	ctl->free_space_bytes = RB_ROOT_CACHED;
	INIT_LIST_HEAD(&ctl->trimming_ranges);
	mutex_init(&ctl->cache_writeout_mutex);

	/*
	 * we only want to have 32k of ram per block group for keeping
	 * track of free space, and if we pass 1/2 of that we want to
	 * start converting things over to using bitmaps
	 */
	ctl->extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space);
}
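/*
 * Example of the threshold above: with a struct btrfs_free_space of roughly
 * 64-72 bytes (the exact size depends on the architecture and config), 16K
 * of entries works out to a couple hundred extent entries per block group
 * before new free space starts being folded into bitmaps instead.
 */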
2976 * for a given cluster, put all of its extents back into the free
2977 * space cache. If the block group passed doesn't match the block group
2978 * pointed to by the cluster, someone else raced in and freed the
2979 * cluster already. In that case, we just return without changing anything
2981 static void __btrfs_return_cluster_to_free_space(
2982 struct btrfs_block_group
*block_group
,
2983 struct btrfs_free_cluster
*cluster
)
2985 struct btrfs_free_space_ctl
*ctl
= block_group
->free_space_ctl
;
2986 struct rb_node
*node
;
2988 lockdep_assert_held(&ctl
->tree_lock
);
2990 spin_lock(&cluster
->lock
);
2991 if (cluster
->block_group
!= block_group
) {
2992 spin_unlock(&cluster
->lock
);
2996 cluster
->block_group
= NULL
;
2997 cluster
->window_start
= 0;
2998 list_del_init(&cluster
->block_group_list
);
3000 node
= rb_first(&cluster
->root
);
3002 struct btrfs_free_space
*entry
;
3004 entry
= rb_entry(node
, struct btrfs_free_space
, offset_index
);
3005 node
= rb_next(&entry
->offset_index
);
3006 rb_erase(&entry
->offset_index
, &cluster
->root
);
3007 RB_CLEAR_NODE(&entry
->offset_index
);
3009 if (!entry
->bitmap
) {
3010 /* Merging treats extents as if they were new */
3011 if (!btrfs_free_space_trimmed(entry
)) {
3012 ctl
->discardable_extents
[BTRFS_STAT_CURR
]--;
3013 ctl
->discardable_bytes
[BTRFS_STAT_CURR
] -=
3017 try_merge_free_space(ctl
, entry
, false);
3018 steal_from_bitmap(ctl
, entry
, false);
3020 /* As we insert directly, update these statistics */
3021 if (!btrfs_free_space_trimmed(entry
)) {
3022 ctl
->discardable_extents
[BTRFS_STAT_CURR
]++;
3023 ctl
->discardable_bytes
[BTRFS_STAT_CURR
] +=
3027 tree_insert_offset(ctl
, NULL
, entry
);
3028 rb_add_cached(&entry
->bytes_index
, &ctl
->free_space_bytes
,
3031 cluster
->root
= RB_ROOT
;
3032 spin_unlock(&cluster
->lock
);
3033 btrfs_put_block_group(block_group
);
void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group)
{
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct btrfs_free_cluster *cluster;
	struct list_head *head;

	spin_lock(&ctl->tree_lock);
	while ((head = block_group->cluster_list.next) !=
	       &block_group->cluster_list) {
		cluster = list_entry(head, struct btrfs_free_cluster,
				     block_group_list);

		WARN_ON(cluster->block_group != block_group);
		__btrfs_return_cluster_to_free_space(block_group, cluster);

		cond_resched_lock(&ctl->tree_lock);
	}
	__btrfs_remove_free_space_cache(ctl);
	btrfs_discard_update_discardable(block_group);
	spin_unlock(&ctl->tree_lock);
}
/*
 * Walk @block_group's free space rb_tree to determine if everything is trimmed.
 */
bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group)
{
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct btrfs_free_space *info;
	struct rb_node *node;
	bool ret = true;

	spin_lock(&ctl->tree_lock);
	node = rb_first(&ctl->free_space_offset);

	while (node) {
		info = rb_entry(node, struct btrfs_free_space, offset_index);

		if (!btrfs_free_space_trimmed(info)) {
			ret = false;
			break;
		}

		node = rb_next(node);
	}

	spin_unlock(&ctl->tree_lock);
	return ret;
}
3087 u64
btrfs_find_space_for_alloc(struct btrfs_block_group
*block_group
,
3088 u64 offset
, u64 bytes
, u64 empty_size
,
3089 u64
*max_extent_size
)
3091 struct btrfs_free_space_ctl
*ctl
= block_group
->free_space_ctl
;
3092 struct btrfs_discard_ctl
*discard_ctl
=
3093 &block_group
->fs_info
->discard_ctl
;
3094 struct btrfs_free_space
*entry
= NULL
;
3095 u64 bytes_search
= bytes
+ empty_size
;
3098 u64 align_gap_len
= 0;
3099 enum btrfs_trim_state align_gap_trim_state
= BTRFS_TRIM_STATE_UNTRIMMED
;
3100 bool use_bytes_index
= (offset
== block_group
->start
);
3102 ASSERT(!btrfs_is_zoned(block_group
->fs_info
));
3104 spin_lock(&ctl
->tree_lock
);
3105 entry
= find_free_space(ctl
, &offset
, &bytes_search
,
3106 block_group
->full_stripe_len
, max_extent_size
,
3112 if (entry
->bitmap
) {
3113 bitmap_clear_bits(ctl
, entry
, offset
, bytes
, true);
3115 if (!btrfs_free_space_trimmed(entry
))
3116 atomic64_add(bytes
, &discard_ctl
->discard_bytes_saved
);
3119 free_bitmap(ctl
, entry
);
3121 unlink_free_space(ctl
, entry
, true);
3122 align_gap_len
= offset
- entry
->offset
;
3123 align_gap
= entry
->offset
;
3124 align_gap_trim_state
= entry
->trim_state
;
3126 if (!btrfs_free_space_trimmed(entry
))
3127 atomic64_add(bytes
, &discard_ctl
->discard_bytes_saved
);
3129 entry
->offset
= offset
+ bytes
;
3130 WARN_ON(entry
->bytes
< bytes
+ align_gap_len
);
3132 entry
->bytes
-= bytes
+ align_gap_len
;
3134 kmem_cache_free(btrfs_free_space_cachep
, entry
);
3136 link_free_space(ctl
, entry
);
3139 btrfs_discard_update_discardable(block_group
);
3140 spin_unlock(&ctl
->tree_lock
);
3143 __btrfs_add_free_space(block_group
, align_gap
, align_gap_len
,
3144 align_gap_trim_state
);
3149 * given a cluster, put all of its extents back into the free space
3150 * cache. If a block group is passed, this function will only free
3151 * a cluster that belongs to the passed block group.
3153 * Otherwise, it'll get a reference on the block group pointed to by the
3154 * cluster and remove the cluster from it.
3156 void btrfs_return_cluster_to_free_space(
3157 struct btrfs_block_group
*block_group
,
3158 struct btrfs_free_cluster
*cluster
)
3160 struct btrfs_free_space_ctl
*ctl
;
3162 /* first, get a safe pointer to the block group */
3163 spin_lock(&cluster
->lock
);
3165 block_group
= cluster
->block_group
;
3167 spin_unlock(&cluster
->lock
);
3170 } else if (cluster
->block_group
!= block_group
) {
3171 /* someone else has already freed it don't redo their work */
3172 spin_unlock(&cluster
->lock
);
3175 btrfs_get_block_group(block_group
);
3176 spin_unlock(&cluster
->lock
);
3178 ctl
= block_group
->free_space_ctl
;
3180 /* now return any extents the cluster had on it */
3181 spin_lock(&ctl
->tree_lock
);
3182 __btrfs_return_cluster_to_free_space(block_group
, cluster
);
3183 spin_unlock(&ctl
->tree_lock
);
3185 btrfs_discard_queue_work(&block_group
->fs_info
->discard_ctl
, block_group
);
3187 /* finally drop our ref */
3188 btrfs_put_block_group(block_group
);
3191 static u64
btrfs_alloc_from_bitmap(struct btrfs_block_group
*block_group
,
3192 struct btrfs_free_cluster
*cluster
,
3193 struct btrfs_free_space
*entry
,
3194 u64 bytes
, u64 min_start
,
3195 u64
*max_extent_size
)
3197 struct btrfs_free_space_ctl
*ctl
= block_group
->free_space_ctl
;
3199 u64 search_start
= cluster
->window_start
;
3200 u64 search_bytes
= bytes
;
3203 search_start
= min_start
;
3204 search_bytes
= bytes
;
3206 err
= search_bitmap(ctl
, entry
, &search_start
, &search_bytes
, true);
3208 *max_extent_size
= max(get_max_extent_size(entry
),
3214 bitmap_clear_bits(ctl
, entry
, ret
, bytes
, false);
3220 * given a cluster, try to allocate 'bytes' from it, returns 0
3221 * if it couldn't find anything suitably large, or a logical disk offset
3222 * if things worked out
3224 u64
btrfs_alloc_from_cluster(struct btrfs_block_group
*block_group
,
3225 struct btrfs_free_cluster
*cluster
, u64 bytes
,
3226 u64 min_start
, u64
*max_extent_size
)
3228 struct btrfs_free_space_ctl
*ctl
= block_group
->free_space_ctl
;
3229 struct btrfs_discard_ctl
*discard_ctl
=
3230 &block_group
->fs_info
->discard_ctl
;
3231 struct btrfs_free_space
*entry
= NULL
;
3232 struct rb_node
*node
;
3235 ASSERT(!btrfs_is_zoned(block_group
->fs_info
));
3237 spin_lock(&cluster
->lock
);
3238 if (bytes
> cluster
->max_size
)
3241 if (cluster
->block_group
!= block_group
)
3244 node
= rb_first(&cluster
->root
);
3248 entry
= rb_entry(node
, struct btrfs_free_space
, offset_index
);
3250 if (entry
->bytes
< bytes
)
3251 *max_extent_size
= max(get_max_extent_size(entry
),
3254 if (entry
->bytes
< bytes
||
3255 (!entry
->bitmap
&& entry
->offset
< min_start
)) {
3256 node
= rb_next(&entry
->offset_index
);
3259 entry
= rb_entry(node
, struct btrfs_free_space
,
3264 if (entry
->bitmap
) {
3265 ret
= btrfs_alloc_from_bitmap(block_group
,
3266 cluster
, entry
, bytes
,
3267 cluster
->window_start
,
3270 node
= rb_next(&entry
->offset_index
);
3273 entry
= rb_entry(node
, struct btrfs_free_space
,
3277 cluster
->window_start
+= bytes
;
3279 ret
= entry
->offset
;
3281 entry
->offset
+= bytes
;
3282 entry
->bytes
-= bytes
;
3288 spin_unlock(&cluster
->lock
);
3293 spin_lock(&ctl
->tree_lock
);
3295 if (!btrfs_free_space_trimmed(entry
))
3296 atomic64_add(bytes
, &discard_ctl
->discard_bytes_saved
);
3298 ctl
->free_space
-= bytes
;
3299 if (!entry
->bitmap
&& !btrfs_free_space_trimmed(entry
))
3300 ctl
->discardable_bytes
[BTRFS_STAT_CURR
] -= bytes
;
3302 spin_lock(&cluster
->lock
);
3303 if (entry
->bytes
== 0) {
3304 rb_erase(&entry
->offset_index
, &cluster
->root
);
3305 ctl
->free_extents
--;
3306 if (entry
->bitmap
) {
3307 kmem_cache_free(btrfs_free_space_bitmap_cachep
,
3309 ctl
->total_bitmaps
--;
3310 recalculate_thresholds(ctl
);
3311 } else if (!btrfs_free_space_trimmed(entry
)) {
3312 ctl
->discardable_extents
[BTRFS_STAT_CURR
]--;
3314 kmem_cache_free(btrfs_free_space_cachep
, entry
);
3317 spin_unlock(&cluster
->lock
);
3318 spin_unlock(&ctl
->tree_lock
);
3323 static int btrfs_bitmap_cluster(struct btrfs_block_group
*block_group
,
3324 struct btrfs_free_space
*entry
,
3325 struct btrfs_free_cluster
*cluster
,
3326 u64 offset
, u64 bytes
,
3327 u64 cont1_bytes
, u64 min_bytes
)
3329 struct btrfs_free_space_ctl
*ctl
= block_group
->free_space_ctl
;
3330 unsigned long next_zero
;
3332 unsigned long want_bits
;
3333 unsigned long min_bits
;
3334 unsigned long found_bits
;
3335 unsigned long max_bits
= 0;
3336 unsigned long start
= 0;
3337 unsigned long total_found
= 0;
3340 lockdep_assert_held(&ctl
->tree_lock
);
3342 i
= offset_to_bit(entry
->offset
, ctl
->unit
,
3343 max_t(u64
, offset
, entry
->offset
));
3344 want_bits
= bytes_to_bits(bytes
, ctl
->unit
);
3345 min_bits
= bytes_to_bits(min_bytes
, ctl
->unit
);
3348 * Don't bother looking for a cluster in this bitmap if it's heavily
3351 if (entry
->max_extent_size
&&
3352 entry
->max_extent_size
< cont1_bytes
)
3356 for_each_set_bit_from(i
, entry
->bitmap
, BITS_PER_BITMAP
) {
3357 next_zero
= find_next_zero_bit(entry
->bitmap
,
3358 BITS_PER_BITMAP
, i
);
3359 if (next_zero
- i
>= min_bits
) {
3360 found_bits
= next_zero
- i
;
3361 if (found_bits
> max_bits
)
3362 max_bits
= found_bits
;
3365 if (next_zero
- i
> max_bits
)
3366 max_bits
= next_zero
- i
;
3371 entry
->max_extent_size
= (u64
)max_bits
* ctl
->unit
;
3377 cluster
->max_size
= 0;
3380 total_found
+= found_bits
;
3382 if (cluster
->max_size
< found_bits
* ctl
->unit
)
3383 cluster
->max_size
= found_bits
* ctl
->unit
;
3385 if (total_found
< want_bits
|| cluster
->max_size
< cont1_bytes
) {
3390 cluster
->window_start
= start
* ctl
->unit
+ entry
->offset
;
3391 rb_erase(&entry
->offset_index
, &ctl
->free_space_offset
);
3392 rb_erase_cached(&entry
->bytes_index
, &ctl
->free_space_bytes
);
3395 * We need to know if we're currently on the normal space index when we
3396 * manipulate the bitmap so that we know we need to remove and re-insert
3397 * it into the space_index tree. Clear the bytes_index node here so the
3398 * bitmap manipulation helpers know not to mess with the space_index
3399 * until this bitmap entry is added back into the normal cache.
3401 RB_CLEAR_NODE(&entry
->bytes_index
);
3403 ret
= tree_insert_offset(ctl
, cluster
, entry
);
3404 ASSERT(!ret
); /* -EEXIST; Logic error */
3406 trace_btrfs_setup_cluster(block_group
, cluster
,
3407 total_found
* ctl
->unit
, 1);
3412 * This searches the block group for just extents to fill the cluster with.
3413 * Try to find a cluster with at least bytes total bytes, at least one
3414 * extent of cont1_bytes, and other clusters of at least min_bytes.
3417 setup_cluster_no_bitmap(struct btrfs_block_group
*block_group
,
3418 struct btrfs_free_cluster
*cluster
,
3419 struct list_head
*bitmaps
, u64 offset
, u64 bytes
,
3420 u64 cont1_bytes
, u64 min_bytes
)
3422 struct btrfs_free_space_ctl
*ctl
= block_group
->free_space_ctl
;
3423 struct btrfs_free_space
*first
= NULL
;
3424 struct btrfs_free_space
*entry
= NULL
;
3425 struct btrfs_free_space
*last
;
3426 struct rb_node
*node
;
3431 lockdep_assert_held(&ctl
->tree_lock
);
3433 entry
= tree_search_offset(ctl
, offset
, 0, 1);
3438 * We don't want bitmaps, so just move along until we find a normal
3441 while (entry
->bitmap
|| entry
->bytes
< min_bytes
) {
3442 if (entry
->bitmap
&& list_empty(&entry
->list
))
3443 list_add_tail(&entry
->list
, bitmaps
);
3444 node
= rb_next(&entry
->offset_index
);
3447 entry
= rb_entry(node
, struct btrfs_free_space
, offset_index
);
3450 window_free
= entry
->bytes
;
3451 max_extent
= entry
->bytes
;
3455 for (node
= rb_next(&entry
->offset_index
); node
;
3456 node
= rb_next(&entry
->offset_index
)) {
3457 entry
= rb_entry(node
, struct btrfs_free_space
, offset_index
);
3459 if (entry
->bitmap
) {
3460 if (list_empty(&entry
->list
))
3461 list_add_tail(&entry
->list
, bitmaps
);
3465 if (entry
->bytes
< min_bytes
)
3469 window_free
+= entry
->bytes
;
3470 if (entry
->bytes
> max_extent
)
3471 max_extent
= entry
->bytes
;
3474 if (window_free
< bytes
|| max_extent
< cont1_bytes
)
3477 cluster
->window_start
= first
->offset
;
3479 node
= &first
->offset_index
;
3482 * now we've found our entries, pull them out of the free space
3483 * cache and put them into the cluster rbtree
3488 entry
= rb_entry(node
, struct btrfs_free_space
, offset_index
);
3489 node
= rb_next(&entry
->offset_index
);
3490 if (entry
->bitmap
|| entry
->bytes
< min_bytes
)
3493 rb_erase(&entry
->offset_index
, &ctl
->free_space_offset
);
3494 rb_erase_cached(&entry
->bytes_index
, &ctl
->free_space_bytes
);
3495 ret
= tree_insert_offset(ctl
, cluster
, entry
);
3496 total_size
+= entry
->bytes
;
3497 ASSERT(!ret
); /* -EEXIST; Logic error */
3498 } while (node
&& entry
!= last
);
3500 cluster
->max_size
= max_extent
;
3501 trace_btrfs_setup_cluster(block_group
, cluster
, total_size
, 0);
3506 * This specifically looks for bitmaps that may work in the cluster, we assume
3507 * that we have already failed to find extents that will work.
3510 setup_cluster_bitmap(struct btrfs_block_group
*block_group
,
3511 struct btrfs_free_cluster
*cluster
,
3512 struct list_head
*bitmaps
, u64 offset
, u64 bytes
,
3513 u64 cont1_bytes
, u64 min_bytes
)
3515 struct btrfs_free_space_ctl
*ctl
= block_group
->free_space_ctl
;
3516 struct btrfs_free_space
*entry
= NULL
;
3518 u64 bitmap_offset
= offset_to_bitmap(ctl
, offset
);
3520 if (ctl
->total_bitmaps
== 0)
3524 * The bitmap that covers offset won't be in the list unless offset
3525 * is just its start offset.
3527 if (!list_empty(bitmaps
))
3528 entry
= list_first_entry(bitmaps
, struct btrfs_free_space
, list
);
3530 if (!entry
|| entry
->offset
!= bitmap_offset
) {
3531 entry
= tree_search_offset(ctl
, bitmap_offset
, 1, 0);
3532 if (entry
&& list_empty(&entry
->list
))
3533 list_add(&entry
->list
, bitmaps
);
3536 list_for_each_entry(entry
, bitmaps
, list
) {
3537 if (entry
->bytes
< bytes
)
3539 ret
= btrfs_bitmap_cluster(block_group
, entry
, cluster
, offset
,
3540 bytes
, cont1_bytes
, min_bytes
);
3546 * The bitmaps list has all the bitmaps that record free space
3547 * starting after offset, so no more search is required.
3553 * here we try to find a cluster of blocks in a block group. The goal
3554 * is to find at least bytes+empty_size.
3555 * We might not find them all in one contiguous area.
3557 * returns zero and sets up cluster if things worked out, otherwise
3558 * it returns -enospc
3560 int btrfs_find_space_cluster(struct btrfs_block_group
*block_group
,
3561 struct btrfs_free_cluster
*cluster
,
3562 u64 offset
, u64 bytes
, u64 empty_size
)
3564 struct btrfs_fs_info
*fs_info
= block_group
->fs_info
;
3565 struct btrfs_free_space_ctl
*ctl
= block_group
->free_space_ctl
;
3566 struct btrfs_free_space
*entry
, *tmp
;
3573 * Choose the minimum extent size we'll require for this
3574 * cluster. For SSD_SPREAD, don't allow any fragmentation.
3575 * For metadata, allow allocates with smaller extents. For
3576 * data, keep it dense.
3578 if (btrfs_test_opt(fs_info
, SSD_SPREAD
)) {
3579 cont1_bytes
= bytes
+ empty_size
;
3580 min_bytes
= cont1_bytes
;
3581 } else if (block_group
->flags
& BTRFS_BLOCK_GROUP_METADATA
) {
3582 cont1_bytes
= bytes
;
3583 min_bytes
= fs_info
->sectorsize
;
3585 cont1_bytes
= max(bytes
, (bytes
+ empty_size
) >> 2);
3586 min_bytes
= fs_info
->sectorsize
;
3589 spin_lock(&ctl
->tree_lock
);
3592 * If we know we don't have enough space to make a cluster don't even
3593 * bother doing all the work to try and find one.
3595 if (ctl
->free_space
< bytes
) {
3596 spin_unlock(&ctl
->tree_lock
);
3600 spin_lock(&cluster
->lock
);
3602 /* someone already found a cluster, hooray */
3603 if (cluster
->block_group
) {
3608 trace_btrfs_find_cluster(block_group
, offset
, bytes
, empty_size
,
3611 ret
= setup_cluster_no_bitmap(block_group
, cluster
, &bitmaps
, offset
,
3613 cont1_bytes
, min_bytes
);
3615 ret
= setup_cluster_bitmap(block_group
, cluster
, &bitmaps
,
3616 offset
, bytes
+ empty_size
,
3617 cont1_bytes
, min_bytes
);
3619 /* Clear our temporary list */
3620 list_for_each_entry_safe(entry
, tmp
, &bitmaps
, list
)
3621 list_del_init(&entry
->list
);
3624 btrfs_get_block_group(block_group
);
3625 list_add_tail(&cluster
->block_group_list
,
3626 &block_group
->cluster_list
);
3627 cluster
->block_group
= block_group
;
3629 trace_btrfs_failed_cluster_setup(block_group
);
3632 spin_unlock(&cluster
->lock
);
3633 spin_unlock(&ctl
->tree_lock
);
/*
 * simple code to zero out a cluster
 */
void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
{
	spin_lock_init(&cluster->lock);
	spin_lock_init(&cluster->refill_lock);
	cluster->root = RB_ROOT;
	cluster->max_size = 0;
	cluster->fragmented = false;
	INIT_LIST_HEAD(&cluster->block_group_list);
	cluster->block_group = NULL;
}
3652 static int do_trimming(struct btrfs_block_group
*block_group
,
3653 u64
*total_trimmed
, u64 start
, u64 bytes
,
3654 u64 reserved_start
, u64 reserved_bytes
,
3655 enum btrfs_trim_state reserved_trim_state
,
3656 struct btrfs_trim_range
*trim_entry
)
3658 struct btrfs_space_info
*space_info
= block_group
->space_info
;
3659 struct btrfs_fs_info
*fs_info
= block_group
->fs_info
;
3660 struct btrfs_free_space_ctl
*ctl
= block_group
->free_space_ctl
;
3663 const u64 end
= start
+ bytes
;
3664 const u64 reserved_end
= reserved_start
+ reserved_bytes
;
3665 enum btrfs_trim_state trim_state
= BTRFS_TRIM_STATE_UNTRIMMED
;
3668 spin_lock(&space_info
->lock
);
3669 spin_lock(&block_group
->lock
);
3670 if (!block_group
->ro
) {
3671 block_group
->reserved
+= reserved_bytes
;
3672 space_info
->bytes_reserved
+= reserved_bytes
;
3675 spin_unlock(&block_group
->lock
);
3676 spin_unlock(&space_info
->lock
);
3678 ret
= btrfs_discard_extent(fs_info
, start
, bytes
, &trimmed
);
3680 *total_trimmed
+= trimmed
;
3681 trim_state
= BTRFS_TRIM_STATE_TRIMMED
;
3684 mutex_lock(&ctl
->cache_writeout_mutex
);
3685 if (reserved_start
< start
)
3686 __btrfs_add_free_space(block_group
, reserved_start
,
3687 start
- reserved_start
,
3688 reserved_trim_state
);
3689 if (end
< reserved_end
)
3690 __btrfs_add_free_space(block_group
, end
, reserved_end
- end
,
3691 reserved_trim_state
);
3692 __btrfs_add_free_space(block_group
, start
, bytes
, trim_state
);
3693 list_del(&trim_entry
->list
);
3694 mutex_unlock(&ctl
->cache_writeout_mutex
);
3697 spin_lock(&space_info
->lock
);
3698 spin_lock(&block_group
->lock
);
3699 if (block_group
->ro
)
3700 space_info
->bytes_readonly
+= reserved_bytes
;
3701 block_group
->reserved
-= reserved_bytes
;
3702 space_info
->bytes_reserved
-= reserved_bytes
;
3703 spin_unlock(&block_group
->lock
);
3704 spin_unlock(&space_info
->lock
);
3711 * If @async is set, then we will trim 1 region and return.
3713 static int trim_no_bitmap(struct btrfs_block_group
*block_group
,
3714 u64
*total_trimmed
, u64 start
, u64 end
, u64 minlen
,
3717 struct btrfs_discard_ctl
*discard_ctl
=
3718 &block_group
->fs_info
->discard_ctl
;
3719 struct btrfs_free_space_ctl
*ctl
= block_group
->free_space_ctl
;
3720 struct btrfs_free_space
*entry
;
3721 struct rb_node
*node
;
3725 enum btrfs_trim_state extent_trim_state
;
3727 const u64 max_discard_size
= READ_ONCE(discard_ctl
->max_discard_size
);
3729 while (start
< end
) {
3730 struct btrfs_trim_range trim_entry
;
3732 mutex_lock(&ctl
->cache_writeout_mutex
);
3733 spin_lock(&ctl
->tree_lock
);
3735 if (ctl
->free_space
< minlen
)
3738 entry
= tree_search_offset(ctl
, start
, 0, 1);
3742 /* Skip bitmaps and if async, already trimmed entries */
3743 while (entry
->bitmap
||
3744 (async
&& btrfs_free_space_trimmed(entry
))) {
3745 node
= rb_next(&entry
->offset_index
);
3748 entry
= rb_entry(node
, struct btrfs_free_space
,
3752 if (entry
->offset
>= end
)
3755 extent_start
= entry
->offset
;
3756 extent_bytes
= entry
->bytes
;
3757 extent_trim_state
= entry
->trim_state
;
3759 start
= entry
->offset
;
3760 bytes
= entry
->bytes
;
3761 if (bytes
< minlen
) {
3762 spin_unlock(&ctl
->tree_lock
);
3763 mutex_unlock(&ctl
->cache_writeout_mutex
);
3766 unlink_free_space(ctl
, entry
, true);
3768 * Let bytes = BTRFS_MAX_DISCARD_SIZE + X.
3769 * If X < BTRFS_ASYNC_DISCARD_MIN_FILTER, we won't trim
3770 * X when we come back around. So trim it now.
3772 if (max_discard_size
&&
3773 bytes
>= (max_discard_size
+
3774 BTRFS_ASYNC_DISCARD_MIN_FILTER
)) {
3775 bytes
= max_discard_size
;
3776 extent_bytes
= max_discard_size
;
3777 entry
->offset
+= max_discard_size
;
3778 entry
->bytes
-= max_discard_size
;
3779 link_free_space(ctl
, entry
);
3781 kmem_cache_free(btrfs_free_space_cachep
, entry
);
3784 start
= max(start
, extent_start
);
3785 bytes
= min(extent_start
+ extent_bytes
, end
) - start
;
3786 if (bytes
< minlen
) {
3787 spin_unlock(&ctl
->tree_lock
);
3788 mutex_unlock(&ctl
->cache_writeout_mutex
);
3792 unlink_free_space(ctl
, entry
, true);
3793 kmem_cache_free(btrfs_free_space_cachep
, entry
);
3796 spin_unlock(&ctl
->tree_lock
);
3797 trim_entry
.start
= extent_start
;
3798 trim_entry
.bytes
= extent_bytes
;
3799 list_add_tail(&trim_entry
.list
, &ctl
->trimming_ranges
);
3800 mutex_unlock(&ctl
->cache_writeout_mutex
);
3802 ret
= do_trimming(block_group
, total_trimmed
, start
, bytes
,
3803 extent_start
, extent_bytes
, extent_trim_state
,
3806 block_group
->discard_cursor
= start
+ bytes
;
3811 block_group
->discard_cursor
= start
;
3812 if (async
&& *total_trimmed
)
3815 if (btrfs_trim_interrupted()) {
3826 block_group
->discard_cursor
= btrfs_block_group_end(block_group
);
3827 spin_unlock(&ctl
->tree_lock
);
3828 mutex_unlock(&ctl
->cache_writeout_mutex
);
/*
 * If we break out of trimming a bitmap prematurely, we should reset the
 * trimming bit.  In a rather contrived case, it's possible to race here so
 * reset the state to BTRFS_TRIM_STATE_UNTRIMMED.
 *
 * start = start of bitmap
 * end = near end of bitmap
 *
 * Thread 1:			Thread 2:
 * trim_bitmaps(start)
 *				trim_bitmaps(end)
 * end_trimming_bitmap()
 * reset_trimming_bitmap()
 */
static void reset_trimming_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset)
{
	struct btrfs_free_space *entry;

	spin_lock(&ctl->tree_lock);
	entry = tree_search_offset(ctl, offset, 1, 0);
	if (entry) {
		if (btrfs_free_space_trimmed(entry)) {
			ctl->discardable_extents[BTRFS_STAT_CURR] +=
				entry->bitmap_extents;
			ctl->discardable_bytes[BTRFS_STAT_CURR] += entry->bytes;
		}
		entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
	}

	spin_unlock(&ctl->tree_lock);
}

static void end_trimming_bitmap(struct btrfs_free_space_ctl *ctl,
				struct btrfs_free_space *entry)
{
	if (btrfs_free_space_trimming_bitmap(entry)) {
		entry->trim_state = BTRFS_TRIM_STATE_TRIMMED;
		ctl->discardable_extents[BTRFS_STAT_CURR] -=
			entry->bitmap_extents;
		ctl->discardable_bytes[BTRFS_STAT_CURR] -= entry->bytes;
	}
}
3877 * If @async is set, then we will trim 1 region and return.
3879 static int trim_bitmaps(struct btrfs_block_group
*block_group
,
3880 u64
*total_trimmed
, u64 start
, u64 end
, u64 minlen
,
3881 u64 maxlen
, bool async
)
3883 struct btrfs_discard_ctl
*discard_ctl
=
3884 &block_group
->fs_info
->discard_ctl
;
3885 struct btrfs_free_space_ctl
*ctl
= block_group
->free_space_ctl
;
3886 struct btrfs_free_space
*entry
;
3890 u64 offset
= offset_to_bitmap(ctl
, start
);
3891 const u64 max_discard_size
= READ_ONCE(discard_ctl
->max_discard_size
);
3893 while (offset
< end
) {
3894 bool next_bitmap
= false;
3895 struct btrfs_trim_range trim_entry
;
3897 mutex_lock(&ctl
->cache_writeout_mutex
);
3898 spin_lock(&ctl
->tree_lock
);
3900 if (ctl
->free_space
< minlen
) {
3901 block_group
->discard_cursor
=
3902 btrfs_block_group_end(block_group
);
3903 spin_unlock(&ctl
->tree_lock
);
3904 mutex_unlock(&ctl
->cache_writeout_mutex
);
3908 entry
= tree_search_offset(ctl
, offset
, 1, 0);
3910 * Bitmaps are marked trimmed lossily now to prevent constant
3911 * discarding of the same bitmap (the reason why we are bound
3912 * by the filters). So, retrim the block group bitmaps when we
3913 * are preparing to punt to the unused_bgs list. This uses
3914 * @minlen to determine if we are in BTRFS_DISCARD_INDEX_UNUSED
3915 * which is the only discard index which sets minlen to 0.
3917 if (!entry
|| (async
&& minlen
&& start
== offset
&&
3918 btrfs_free_space_trimmed(entry
))) {
3919 spin_unlock(&ctl
->tree_lock
);
3920 mutex_unlock(&ctl
->cache_writeout_mutex
);
3926 * Async discard bitmap trimming begins at by setting the start
3927 * to be key.objectid and the offset_to_bitmap() aligns to the
3928 * start of the bitmap. This lets us know we are fully
3929 * scanning the bitmap rather than only some portion of it.
3931 if (start
== offset
)
3932 entry
->trim_state
= BTRFS_TRIM_STATE_TRIMMING
;
3935 ret2
= search_bitmap(ctl
, entry
, &start
, &bytes
, false);
3936 if (ret2
|| start
>= end
) {
3938 * We lossily consider a bitmap trimmed if we only skip
3939 * over regions <= BTRFS_ASYNC_DISCARD_MIN_FILTER.
3941 if (ret2
&& minlen
<= BTRFS_ASYNC_DISCARD_MIN_FILTER
)
3942 end_trimming_bitmap(ctl
, entry
);
3944 entry
->trim_state
= BTRFS_TRIM_STATE_UNTRIMMED
;
3945 spin_unlock(&ctl
->tree_lock
);
3946 mutex_unlock(&ctl
->cache_writeout_mutex
);
3952 * We already trimmed a region, but are using the locking above
3953 * to reset the trim_state.
3955 if (async
&& *total_trimmed
) {
3956 spin_unlock(&ctl
->tree_lock
);
3957 mutex_unlock(&ctl
->cache_writeout_mutex
);
3961 bytes
= min(bytes
, end
- start
);
3962 if (bytes
< minlen
|| (async
&& maxlen
&& bytes
> maxlen
)) {
3963 spin_unlock(&ctl
->tree_lock
);
3964 mutex_unlock(&ctl
->cache_writeout_mutex
);
3969 * Let bytes = BTRFS_MAX_DISCARD_SIZE + X.
3970 * If X < @minlen, we won't trim X when we come back around.
3971 * So trim it now. We differ here from trimming extents as we
3972 * don't keep individual state per bit.
3976 bytes
> (max_discard_size
+ minlen
))
3977 bytes
= max_discard_size
;
3979 bitmap_clear_bits(ctl
, entry
, start
, bytes
, true);
3980 if (entry
->bytes
== 0)
3981 free_bitmap(ctl
, entry
);
3983 spin_unlock(&ctl
->tree_lock
);
3984 trim_entry
.start
= start
;
3985 trim_entry
.bytes
= bytes
;
3986 list_add_tail(&trim_entry
.list
, &ctl
->trimming_ranges
);
3987 mutex_unlock(&ctl
->cache_writeout_mutex
);
3989 ret
= do_trimming(block_group
, total_trimmed
, start
, bytes
,
3990 start
, bytes
, 0, &trim_entry
);
3992 reset_trimming_bitmap(ctl
, offset
);
3993 block_group
->discard_cursor
=
3994 btrfs_block_group_end(block_group
);
3999 offset
+= BITS_PER_BITMAP
* ctl
->unit
;
4004 block_group
->discard_cursor
= start
;
4006 if (btrfs_trim_interrupted()) {
4007 if (start
!= offset
)
4008 reset_trimming_bitmap(ctl
, offset
);
4017 block_group
->discard_cursor
= end
;
int btrfs_trim_block_group(struct btrfs_block_group *block_group,
			   u64 *trimmed, u64 start, u64 end, u64 minlen)
{
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	int ret;
	u64 rem = 0;

	ASSERT(!btrfs_is_zoned(block_group->fs_info));

	*trimmed = 0;

	spin_lock(&block_group->lock);
	if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) {
		spin_unlock(&block_group->lock);
		return 0;
	}
	btrfs_freeze_block_group(block_group);
	spin_unlock(&block_group->lock);

	ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, false);
	if (ret)
		goto out;

	ret = trim_bitmaps(block_group, trimmed, start, end, minlen, 0, false);
	div64_u64_rem(end, BITS_PER_BITMAP * ctl->unit, &rem);
	/* If we ended in the middle of a bitmap, reset the trimming flag */
	if (rem)
		reset_trimming_bitmap(ctl, offset_to_bitmap(ctl, end));
out:
	btrfs_unfreeze_block_group(block_group);

	return ret;
}
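/*
 * Example for the remainder check above (assuming a 4K sectorsize, so one
 * bitmap spans 128MiB): trimming a range that ends at 1GiB + 5MiB leaves
 * rem == 5MiB, meaning the last bitmap was only partially scanned, so its
 * BTRFS_TRIM_STATE_TRIMMING state is reset instead of being promoted to
 * trimmed.
 */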
int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group,
				   u64 *trimmed, u64 start, u64 end, u64 minlen,
				   bool async)
{
	int ret;

	*trimmed = 0;

	spin_lock(&block_group->lock);
	if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) {
		spin_unlock(&block_group->lock);
		return 0;
	}
	btrfs_freeze_block_group(block_group);
	spin_unlock(&block_group->lock);

	ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, async);
	btrfs_unfreeze_block_group(block_group);

	return ret;
}

int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group,
				   u64 *trimmed, u64 start, u64 end, u64 minlen,
				   u64 maxlen, bool async)
{
	int ret;

	*trimmed = 0;

	spin_lock(&block_group->lock);
	if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) {
		spin_unlock(&block_group->lock);
		return 0;
	}
	btrfs_freeze_block_group(block_group);
	spin_unlock(&block_group->lock);

	ret = trim_bitmaps(block_group, trimmed, start, end, minlen, maxlen,
			   async);

	btrfs_unfreeze_block_group(block_group);

	return ret;
}
bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info)
{
	return btrfs_super_cache_generation(fs_info->super_copy);
}
static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info,
				       struct btrfs_trans_handle *trans)
{
	struct btrfs_block_group *block_group;
	struct rb_node *node;
	int ret = 0;

	btrfs_info(fs_info, "cleaning free space cache v1");

	node = rb_first_cached(&fs_info->block_group_cache_tree);
	while (node) {
		block_group = rb_entry(node, struct btrfs_block_group, cache_node);
		ret = btrfs_remove_free_space_inode(trans, NULL, block_group);
		if (ret)
			goto out;
		node = rb_next(node);
	}
out:
	return ret;
}

int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active)
{
	struct btrfs_trans_handle *trans;
	int ret;

	/*
	 * update_super_roots will appropriately set or unset
	 * super_copy->cache_generation based on SPACE_CACHE and
	 * BTRFS_FS_CLEANUP_SPACE_CACHE_V1. For this reason, we need a
	 * transaction commit whether we are enabling space cache v1 and don't
	 * have any other work to do, or are disabling it and removing free
	 * space inodes.
	 */
	trans = btrfs_start_transaction(fs_info->tree_root, 0);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	if (!active) {
		set_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);
		ret = cleanup_free_space_cache_v1(fs_info, trans);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			btrfs_end_transaction(trans);
			goto out;
		}
	}

	ret = btrfs_commit_transaction(trans);
out:
	clear_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);

	return ret;
}
int __init btrfs_free_space_init(void)
{
	btrfs_free_space_cachep = KMEM_CACHE(btrfs_free_space, 0);
	if (!btrfs_free_space_cachep)
		return -ENOMEM;

	btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap",
							PAGE_SIZE, PAGE_SIZE,
							0, NULL);
	if (!btrfs_free_space_bitmap_cachep) {
		kmem_cache_destroy(btrfs_free_space_cachep);
		return -ENOMEM;
	}

	return 0;
}

void __cold btrfs_free_space_exit(void)
{
	kmem_cache_destroy(btrfs_free_space_cachep);
	kmem_cache_destroy(btrfs_free_space_bitmap_cachep);
}
4185 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
4187 * Use this if you need to make a bitmap or extent entry specifically, it
4188 * doesn't do any of the merging that add_free_space does, this acts a lot like
4189 * how the free space cache loading stuff works, so you can get really weird
4192 int test_add_free_space_entry(struct btrfs_block_group
*cache
,
4193 u64 offset
, u64 bytes
, bool bitmap
)
4195 struct btrfs_free_space_ctl
*ctl
= cache
->free_space_ctl
;
4196 struct btrfs_free_space
*info
= NULL
, *bitmap_info
;
4198 enum btrfs_trim_state trim_state
= BTRFS_TRIM_STATE_TRIMMED
;
4204 info
= kmem_cache_zalloc(btrfs_free_space_cachep
, GFP_NOFS
);
4210 spin_lock(&ctl
->tree_lock
);
4211 info
->offset
= offset
;
4212 info
->bytes
= bytes
;
4213 info
->max_extent_size
= 0;
4214 ret
= link_free_space(ctl
, info
);
4215 spin_unlock(&ctl
->tree_lock
);
4217 kmem_cache_free(btrfs_free_space_cachep
, info
);
4222 map
= kmem_cache_zalloc(btrfs_free_space_bitmap_cachep
, GFP_NOFS
);
4224 kmem_cache_free(btrfs_free_space_cachep
, info
);
4229 spin_lock(&ctl
->tree_lock
);
4230 bitmap_info
= tree_search_offset(ctl
, offset_to_bitmap(ctl
, offset
),
4235 add_new_bitmap(ctl
, info
, offset
);
4240 bytes_added
= add_bytes_to_bitmap(ctl
, bitmap_info
, offset
, bytes
,
4243 bytes
-= bytes_added
;
4244 offset
+= bytes_added
;
4245 spin_unlock(&ctl
->tree_lock
);
4251 kmem_cache_free(btrfs_free_space_cachep
, info
);
4253 kmem_cache_free(btrfs_free_space_bitmap_cachep
, map
);
4258 * Checks to see if the given range is in the free space cache. This is really
4259 * just used to check the absence of space, so if there is free space in the
4260 * range at all we will return 1.
4262 int test_check_exists(struct btrfs_block_group
*cache
,
4263 u64 offset
, u64 bytes
)
4265 struct btrfs_free_space_ctl
*ctl
= cache
->free_space_ctl
;
4266 struct btrfs_free_space
*info
;
4269 spin_lock(&ctl
->tree_lock
);
4270 info
= tree_search_offset(ctl
, offset
, 0, 0);
4272 info
= tree_search_offset(ctl
, offset_to_bitmap(ctl
, offset
),
4280 u64 bit_off
, bit_bytes
;
4282 struct btrfs_free_space
*tmp
;
4285 bit_bytes
= ctl
->unit
;
4286 ret
= search_bitmap(ctl
, info
, &bit_off
, &bit_bytes
, false);
4288 if (bit_off
== offset
) {
4291 } else if (bit_off
> offset
&&
4292 offset
+ bytes
> bit_off
) {
4298 n
= rb_prev(&info
->offset_index
);
4300 tmp
= rb_entry(n
, struct btrfs_free_space
,
4302 if (tmp
->offset
+ tmp
->bytes
< offset
)
4304 if (offset
+ bytes
< tmp
->offset
) {
4305 n
= rb_prev(&tmp
->offset_index
);
4312 n
= rb_next(&info
->offset_index
);
4314 tmp
= rb_entry(n
, struct btrfs_free_space
,
4316 if (offset
+ bytes
< tmp
->offset
)
4318 if (tmp
->offset
+ tmp
->bytes
< offset
) {
4319 n
= rb_next(&tmp
->offset_index
);
4330 if (info
->offset
== offset
) {
4335 if (offset
> info
->offset
&& offset
< info
->offset
+ info
->bytes
)
4338 spin_unlock(&ctl
->tree_lock
);
4341 #endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */