// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include "transaction.h"
#include "accessors.h"
#include "delalloc-space.h"
#include "file-item.h"

static struct kmem_cache *btrfs_inode_defrag_cachep;
/*
 * When auto defrag is enabled we queue up these defrag structs to remember
 * which inodes need defragging passes.
 */
struct inode_defrag {
	struct rb_node rb_node;
	/* Inode number */
	u64 ino;
	/*
	 * Transid where the defrag was added, we search for extents newer than
	 * this.
	 */
	u64 transid;

	/* Root objectid */
	u64 root;

	/*
	 * The extent size threshold for autodefrag.
	 *
	 * This value is different for compressed/non-compressed extents, thus
	 * it needs to be passed from the higher layer (see inode_should_defrag()).
	 */
	u32 extent_thresh;
};
static int compare_inode_defrag(const struct inode_defrag *defrag1,
				const struct inode_defrag *defrag2)
{
	if (defrag1->root > defrag2->root)
		return 1;
	else if (defrag1->root < defrag2->root)
		return -1;
	else if (defrag1->ino > defrag2->ino)
		return 1;
	else if (defrag1->ino < defrag2->ino)
		return -1;
	else
		return 0;
}
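
/*
 * Illustration of the resulting ordering (made-up values): records are
 * compared first by root, then by inode number, so {root=5, ino=257} sorts
 * before {root=5, ino=260}, which sorts before {root=256, ino=256}. This
 * keeps all records of one root contiguous in the rbtree, which is what the
 * (root_objectid, first_ino) walk in btrfs_run_defrag_inodes() relies on.
 */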
/*
 * Insert a record for an inode into the defrag tree.  The lock must be held
 * already.
 *
 * If you're inserting a record for an older transid than an existing record,
 * the transid already in the tree is lowered.
 */
static int btrfs_insert_inode_defrag(struct btrfs_inode *inode,
				     struct inode_defrag *defrag)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct inode_defrag *entry;
	struct rb_node **p;
	struct rb_node *parent = NULL;
	int ret;

	p = &fs_info->defrag_inodes.rb_node;
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct inode_defrag, rb_node);

		ret = compare_inode_defrag(defrag, entry);
		if (ret < 0)
			p = &parent->rb_left;
		else if (ret > 0)
			p = &parent->rb_right;
		else {
			/*
			 * If we're reinserting an entry for an old defrag run,
			 * make sure to lower the transid of our existing
			 * record.
			 */
			if (defrag->transid < entry->transid)
				entry->transid = defrag->transid;
			entry->extent_thresh = min(defrag->extent_thresh,
						   entry->extent_thresh);
			return -EEXIST;
		}
	}
	set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
	rb_link_node(&defrag->rb_node, parent, p);
	rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes);
	return 0;
}
static inline int need_auto_defrag(struct btrfs_fs_info *fs_info)
{
	if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
		return 0;

	if (btrfs_fs_closing(fs_info))
		return 0;

	return 1;
}
/*
 * Insert a defrag record for this inode if auto defrag is enabled. No errors
 * are returned as they're not considered fatal.
 */
void btrfs_add_inode_defrag(struct btrfs_inode *inode, u32 extent_thresh)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct inode_defrag *defrag;
	int ret;

	if (!need_auto_defrag(fs_info))
		return;

	if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags))
		return;

	defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
	if (!defrag)
		return;

	defrag->ino = btrfs_ino(inode);
	defrag->transid = btrfs_get_root_last_trans(root);
	defrag->root = btrfs_root_id(root);
	defrag->extent_thresh = extent_thresh;

	spin_lock(&fs_info->defrag_inodes_lock);
	if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) {
		/*
		 * If we set the IN_DEFRAG flag and evict the inode from
		 * memory, and then re-read the inode, the new in-memory inode
		 * won't have the IN_DEFRAG flag set. In that case we may find
		 * an existing defrag record for it in the tree.
		 */
		ret = btrfs_insert_inode_defrag(inode, defrag);
		if (ret)
			kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	} else {
		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	}
	spin_unlock(&fs_info->defrag_inodes_lock);
}
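
/*
 * The double check of BTRFS_INODE_IN_DEFRAG above covers this sequence: the
 * flag is set and a record inserted, the inode is evicted (clearing the
 * runtime flag with it) and later read back in, and a new write queues it
 * again. The rbtree still holds the old record, so the insert returns
 * -EEXIST and we simply free the duplicate allocation.
 */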
/*
 * Pick the defraggable inode that we want; if it doesn't exist, we will get
 * the next one.
 */
static struct inode_defrag *btrfs_pick_defrag_inode(
			struct btrfs_fs_info *fs_info, u64 root, u64 ino)
{
	struct inode_defrag *entry = NULL;
	struct inode_defrag tmp;
	struct rb_node *p;
	struct rb_node *parent = NULL;
	int ret;

	tmp.ino = ino;
	tmp.root = root;

	spin_lock(&fs_info->defrag_inodes_lock);
	p = fs_info->defrag_inodes.rb_node;
	while (p) {
		parent = p;
		entry = rb_entry(parent, struct inode_defrag, rb_node);

		ret = compare_inode_defrag(&tmp, entry);
		if (ret < 0)
			p = parent->rb_left;
		else if (ret > 0)
			p = parent->rb_right;
		else
			goto out;
	}

	if (parent && compare_inode_defrag(&tmp, entry) > 0) {
		parent = rb_next(parent);
		if (parent)
			entry = rb_entry(parent, struct inode_defrag, rb_node);
		else
			entry = NULL;
	}
out:
	if (entry)
		rb_erase(parent, &fs_info->defrag_inodes);
	spin_unlock(&fs_info->defrag_inodes_lock);
	return entry;
}
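
/*
 * Note: this is a ceiling lookup. If the exact (root, ino) record was
 * already processed or never existed, the next record in (root, ino) order
 * is returned instead. The returned record is erased from the rbtree and
 * ownership passes to the caller, which must free it (see
 * btrfs_run_defrag_inode()).
 */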
void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
{
	struct inode_defrag *defrag, *next;

	spin_lock(&fs_info->defrag_inodes_lock);

	rbtree_postorder_for_each_entry_safe(defrag, next,
					     &fs_info->defrag_inodes, rb_node)
		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);

	fs_info->defrag_inodes = RB_ROOT;

	spin_unlock(&fs_info->defrag_inodes_lock);
}
#define BTRFS_DEFRAG_BATCH	1024
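
/*
 * BTRFS_DEFRAG_BATCH is passed as @max_to_defrag to btrfs_defrag_file(), so
 * each autodefrag pass over an inode defrags at most 1024 sectors before
 * rechecking the mount state; with a 4K sector size that is at most 4MiB of
 * data per pass.
 */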
static int btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
				  struct inode_defrag *defrag,
				  struct file_ra_state *ra)
{
	struct btrfs_root *inode_root;
	struct inode *inode;
	struct btrfs_ioctl_defrag_range_args range;
	int ret = 0;
	u64 cur = 0;

again:
	if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
		goto cleanup;
	if (!need_auto_defrag(fs_info))
		goto cleanup;

	/* Get the inode */
	inode_root = btrfs_get_fs_root(fs_info, defrag->root, true);
	if (IS_ERR(inode_root)) {
		ret = PTR_ERR(inode_root);
		goto cleanup;
	}

	inode = btrfs_iget(defrag->ino, inode_root);
	btrfs_put_root(inode_root);
	if (IS_ERR(inode)) {
		ret = PTR_ERR(inode);
		goto cleanup;
	}

	if (cur >= i_size_read(inode)) {
		iput(inode);
		goto cleanup;
	}

	/* Do a chunk of defrag */
	clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
	memset(&range, 0, sizeof(range));
	range.len = (u64)-1;
	range.start = cur;
	range.extent_thresh = defrag->extent_thresh;
	file_ra_state_init(ra, inode->i_mapping);

	sb_start_write(fs_info->sb);
	ret = btrfs_defrag_file(inode, ra, &range, defrag->transid,
				BTRFS_DEFRAG_BATCH);
	sb_end_write(fs_info->sb);
	iput(inode);

	if (ret < 0)
		goto cleanup;

	cur = max(cur + fs_info->sectorsize, range.start);
	goto again;

cleanup:
	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	return ret;
}
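
/*
 * btrfs_defrag_file() updates range.start to the offset where the next pass
 * should resume, so taking max(cur + sectorsize, range.start) above
 * guarantees forward progress even when a pass defragged nothing.
 */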
/*
 * Run through the list of inodes in the FS that need defragging.
 */
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
{
	struct inode_defrag *defrag;
	u64 first_ino = 0;
	u64 root_objectid = 0;

	atomic_inc(&fs_info->defrag_running);
	while (1) {
		struct file_ra_state ra = { 0 };

		/* Pause the auto defragger. */
		if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
			break;

		if (!need_auto_defrag(fs_info))
			break;

		/* find an inode to defrag */
		defrag = btrfs_pick_defrag_inode(fs_info, root_objectid, first_ino);
		if (!defrag) {
			if (root_objectid || first_ino) {
				root_objectid = 0;
				first_ino = 0;
				continue;
			} else {
				break;
			}
		}

		first_ino = defrag->ino + 1;
		root_objectid = defrag->root;

		btrfs_run_defrag_inode(fs_info, defrag, &ra);
	}
	atomic_dec(&fs_info->defrag_running);

	/*
	 * During unmount, we use the transaction_wait queue to wait for the
	 * defragger to stop.
	 */
	wake_up(&fs_info->transaction_wait);
	return 0;
}
/*
 * Check if two block addresses are close together, used by defrag.
 */
static bool close_blocks(u64 blocknr, u64 other, u32 blocksize)
{
	if (blocknr < other && other - (blocknr + blocksize) < SZ_32K)
		return true;
	if (blocknr > other && blocknr - (other + blocksize) < SZ_32K)
		return true;
	return false;
}
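
/*
 * Example with a 16K nodesize: blocks at 0 and 40K are considered close
 * because 40K - (0 + 16K) = 24K < 32K, while blocks at 0 and 64K are not
 * (64K - 16K = 48K >= 32K).
 */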
/*
 * Go through all the leaves pointed to by a node and reallocate them so that
 * disk order is close to key order.
 */
static int btrfs_realloc_node(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct extent_buffer *parent,
			      int start_slot, u64 *last_ret,
			      struct btrfs_key *progress)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	const u32 blocksize = fs_info->nodesize;
	const int end_slot = btrfs_header_nritems(parent) - 1;
	u64 search_start = *last_ret;
	u64 last_block = 0;
	int ret = 0;
	bool progress_passed = false;

	/*
	 * COWing must happen through a running transaction, which always
	 * matches the current fs generation (it's a transaction with a state
	 * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs
	 * into error state to prevent the commit of any transaction.
	 */
	if (unlikely(trans->transaction != fs_info->running_transaction ||
		     trans->transid != fs_info->generation)) {
		btrfs_abort_transaction(trans, -EUCLEAN);
		btrfs_crit(fs_info,
"unexpected transaction when attempting to reallocate parent %llu for root %llu, transaction %llu running transaction %llu fs generation %llu",
			   parent->start, btrfs_root_id(root), trans->transid,
			   fs_info->running_transaction->transid,
			   fs_info->generation);
		return -EUCLEAN;
	}

	if (btrfs_header_nritems(parent) <= 1)
		return 0;

	for (int i = start_slot; i <= end_slot; i++) {
		struct extent_buffer *cur;
		struct btrfs_disk_key disk_key;
		u64 blocknr;
		u64 other;
		bool close = true;

		btrfs_node_key(parent, &disk_key, i);
		if (!progress_passed && btrfs_comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = true;
		blocknr = btrfs_node_blockptr(parent, i);
		if (last_block == 0)
			last_block = blocknr;

		if (i > 0) {
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (!close && i < end_slot) {
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (close) {
			last_block = blocknr;
			continue;
		}

		cur = btrfs_read_node_slot(parent, i);
		if (IS_ERR(cur))
			return PTR_ERR(cur);
		if (search_start == 0)
			search_start = last_block;

		btrfs_tree_lock(cur);
		ret = btrfs_force_cow_block(trans, root, cur, parent, i,
					    &cur, search_start,
					    min(16 * blocksize,
						(end_slot - i) * blocksize),
					    BTRFS_NESTING_COW);
		if (ret) {
			btrfs_tree_unlock(cur);
			free_extent_buffer(cur);
			break;
		}
		search_start = cur->start;
		last_block = cur->start;
		*last_ret = search_start;
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
	}
	return ret;
}
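
/*
 * The size hint passed to btrfs_force_cow_block() (at most 16 nodes worth of
 * space, or what is left of this parent) is meant to steer the allocator
 * towards a chunk of free space near @search_start that is big enough for
 * the following leaves as well, so consecutive leaves end up close to each
 * other on disk.
 */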
/*
 * Defrag all the leaves in a given btree.
 * Read all the leaves and try to make their disk order better reflect key
 * order.
 */
static int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
{
	struct btrfs_path *path = NULL;
	struct btrfs_key key;
	int ret = 0;
	int wret;
	int level;
	int next_key_ret = 0;
	u64 last_ret = 0;

	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
		goto out;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	level = btrfs_header_level(root->node);

	if (level == 0)
		goto out;

	if (root->defrag_progress.objectid == 0) {
		struct extent_buffer *root_node;
		u32 nritems;

		root_node = btrfs_lock_root_node(root);
		nritems = btrfs_header_nritems(root_node);
		root->defrag_max.objectid = 0;
		/* from above we know this is not a leaf */
		btrfs_node_key_to_cpu(root_node, &root->defrag_max,
				      nritems - 1);
		btrfs_tree_unlock(root_node);
		free_extent_buffer(root_node);
		memset(&key, 0, sizeof(key));
	} else {
		memcpy(&key, &root->defrag_progress, sizeof(key));
	}

	path->keep_locks = 1;

	ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = 0;
		goto out;
	}
	btrfs_release_path(path);
	/*
	 * We don't need a lock on a leaf. btrfs_realloc_node() will lock all
	 * leaves from path->nodes[1], so set lowest_level to 1 to avoid a
	 * deadlock later (attempting to write lock an already write locked
	 * leaf).
	 */
	path->lowest_level = 1;
	wret = btrfs_search_slot(trans, root, &key, path, 0, 1);

	if (wret < 0) {
		ret = wret;
		goto out;
	}
	if (!path->nodes[1]) {
		ret = 0;
		goto out;
	}
	/*
	 * The node at level 1 must always be locked when our path has
	 * keep_locks set and lowest_level is 1, regardless of the value of
	 * path->slots[1].
	 */
	ASSERT(path->locks[1] != 0);
	ret = btrfs_realloc_node(trans, root,
				 path->nodes[1], 0,
				 &last_ret,
				 &root->defrag_progress);
	if (ret) {
		WARN_ON(ret == -EAGAIN);
		goto out;
	}
	/*
	 * Now that we reallocated the node we can find the next key. Note that
	 * btrfs_find_next_key() can release our path and do another search
	 * without COWing, this is because even with path->keep_locks = 1,
	 * btrfs_search_slot() / ctree.c:unlock_up() does not keep a lock on a
	 * node when path->slots[node_level - 1] does not point to the last
	 * item or a slot beyond the last item (ctree.c:unlock_up()). Therefore
	 * we search for the next key after reallocating our node.
	 */
	path->slots[1] = btrfs_header_nritems(path->nodes[1]);
	next_key_ret = btrfs_find_next_key(root, path, &key, 1,
					   BTRFS_OLDEST_GENERATION);
	if (next_key_ret == 0) {
		memcpy(&root->defrag_progress, &key, sizeof(key));
		ret = -EAGAIN;
	}
out:
	btrfs_free_path(path);
	if (ret == -EAGAIN) {
		if (root->defrag_max.objectid > root->defrag_progress.objectid)
			goto done;
		if (root->defrag_max.type > root->defrag_progress.type)
			goto done;
		if (root->defrag_max.offset > root->defrag_progress.offset)
			goto done;
		ret = 0;
	}
done:
	if (ret != -EAGAIN)
		memset(&root->defrag_progress, 0,
		       sizeof(root->defrag_progress));

	return ret;
}
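
/*
 * Return contract: 0 means this root is fully defragged, -EAGAIN means
 * root->defrag_progress was advanced and the caller should come back with a
 * fresh transaction, any other negative value is a hard error.
 * btrfs_defrag_root() below loops on -EAGAIN.
 */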
/*
 * Defrag a given btree.  Every leaf in the btree is read and defragmented.
 */
int btrfs_defrag_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;

	if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state))
		return 0;

	while (1) {
		struct btrfs_trans_handle *trans;

		trans = btrfs_start_transaction(root, 0);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}

		ret = btrfs_defrag_leaves(trans, root);

		btrfs_end_transaction(trans);
		btrfs_btree_balance_dirty(fs_info);
		cond_resched();

		if (btrfs_fs_closing(fs_info) || ret != -EAGAIN)
			break;

		if (btrfs_defrag_cancelled(fs_info)) {
			btrfs_debug(fs_info, "defrag_root cancelled");
			ret = -EAGAIN;
			break;
		}
	}
	clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state);
	return ret;
}
/*
 * Defrag specific helper to get an extent map.
 *
 * Differences between this and btrfs_get_extent() are:
 *
 * - No extent_map will be added to inode->extent_tree
 *   To reduce memory usage in the long run.
 *
 * - Extra optimization to skip file extents older than @newer_than
 *   By using btrfs_search_forward() we can skip entire file ranges that
 *   have extents created in past transactions, because btrfs_search_forward()
 *   will not visit leaves and nodes with a generation smaller than the given
 *   minimal generation threshold (@newer_than).
 *
 * Return a valid em if we find a file extent matching the requirement.
 * Return NULL if we can not find a file extent matching the requirement.
 *
 * Return ERR_PTR() for error.
 */
static struct extent_map *defrag_get_extent(struct btrfs_inode *inode,
					    u64 start, u64 newer_than)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_file_extent_item *fi;
	struct btrfs_path path = { 0 };
	struct extent_map *em;
	struct btrfs_key key;
	u64 ino = btrfs_ino(inode);
	int ret;

	em = alloc_extent_map();
	if (!em)
		return ERR_PTR(-ENOMEM);

	key.objectid = ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = start;

	if (newer_than) {
		ret = btrfs_search_forward(root, &key, &path, newer_than);
		if (ret < 0)
			goto err;
		/* Can't find anything newer */
		if (ret > 0)
			goto not_found;
	} else {
		ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
		if (ret < 0)
			goto err;
	}
	if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
		/*
		 * If btrfs_search_slot() leaves the path pointing beyond
		 * nritems, we should not have an empty leaf, as this inode
		 * must at least have its INODE_ITEM.
		 */
		ASSERT(btrfs_header_nritems(path.nodes[0]));
		path.slots[0] = btrfs_header_nritems(path.nodes[0]) - 1;
	}
	btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
	/* Perfect match, no need to go one slot back */
	if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY &&
	    key.offset == start)
		goto iterate;

	/* We didn't find a perfect match, need to go one slot back */
	if (path.slots[0] > 0) {
		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
		if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
			path.slots[0]--;
	}

iterate:
	/* Iterate through the path to find a file extent covering @start */
	while (true) {
		u64 extent_end;

		if (path.slots[0] >= btrfs_header_nritems(path.nodes[0]))
			goto next;

		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);

		/*
		 * We may go one slot back to an INODE_REF/XATTR item, then
		 * need to go forward until we reach an EXTENT_DATA.
		 * But it should still have the correct ino as key.objectid.
		 */
		if (WARN_ON(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY)
			goto next;

		/* It's beyond our target range, definitely no extent found */
		if (key.objectid > ino || key.type > BTRFS_EXTENT_DATA_KEY)
			goto not_found;

		/*
		 *	|	|<- File extent ->|
		 *	\- start
		 *
		 * This means there is a hole between start and key.offset.
		 */
		if (key.offset > start) {
			em->start = start;
			em->disk_bytenr = EXTENT_MAP_HOLE;
			em->disk_num_bytes = 0;
			em->ram_bytes = 0;
			em->offset = 0;
			em->len = key.offset - start;
			break;
		}

		fi = btrfs_item_ptr(path.nodes[0], path.slots[0],
				    struct btrfs_file_extent_item);
		extent_end = btrfs_file_extent_end(&path);

		/*
		 *	|<- file extent ->|	|
		 *				\- start
		 *
		 * We haven't reached start, search next slot.
		 */
		if (extent_end <= start)
			goto next;

		/* Now this extent covers @start, convert it to em */
		btrfs_extent_item_to_extent_map(inode, &path, fi, em);
		break;
next:
		ret = btrfs_next_item(root, &path);
		if (ret < 0)
			goto err;
		if (ret > 0)
			goto not_found;
	}
	btrfs_release_path(&path);
	return em;

not_found:
	btrfs_release_path(&path);
	free_extent_map(em);
	return NULL;

err:
	btrfs_release_path(&path);
	free_extent_map(em);
	return ERR_PTR(ret);
}
static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
					       u64 newer_than, bool locked)
{
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct extent_map *em;
	const u32 sectorsize = BTRFS_I(inode)->root->fs_info->sectorsize;

	/*
	 * Hopefully we have this extent in the tree already, try without the
	 * full extent lock.
	 */
	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, start, sectorsize);
	read_unlock(&em_tree->lock);

	/*
	 * We can get a merged extent, in that case, we need to re-search the
	 * tree to get the original em for defrag.
	 *
	 * If @newer_than is 0 or em::generation < newer_than, we can trust
	 * this em, as either we don't care about the generation, or the
	 * merged extent map will be rejected anyway.
	 */
	if (em && (em->flags & EXTENT_FLAG_MERGED) &&
	    newer_than && em->generation >= newer_than) {
		free_extent_map(em);
		em = NULL;
	}

	if (!em) {
		struct extent_state *cached = NULL;
		u64 end = start + sectorsize - 1;

		/* Get the big lock and read metadata off disk. */
		if (!locked)
			lock_extent(io_tree, start, end, &cached);
		em = defrag_get_extent(BTRFS_I(inode), start, newer_than);
		if (!locked)
			unlock_extent(io_tree, start, end, &cached);

		if (IS_ERR(em))
			return NULL;
	}

	return em;
}
static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info,
				   const struct extent_map *em)
{
	if (extent_map_is_compressed(em))
		return BTRFS_MAX_COMPRESSED;
	return fs_info->max_extent_size;
}
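
/*
 * BTRFS_MAX_COMPRESSED is 128K, so a compressed extent can never grow past
 * that no matter how much we defrag, while regular extents are bounded by
 * fs_info->max_extent_size (typically 128M on non-zoned filesystems).
 */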
static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
				     u32 extent_thresh, u64 newer_than, bool locked)
{
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	struct extent_map *next;
	bool ret = false;

	/* This is the last extent */
	if (em->start + em->len >= i_size_read(inode))
		return false;

	/*
	 * Here we need to pass @newer_than when checking the next extent, or
	 * we may end up marking the current extent for defrag while the next
	 * one is not a target.
	 * This will just cause extra IO without really reducing the fragments.
	 */
	next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked);
	/* No more em or hole */
	if (!next || next->disk_bytenr >= EXTENT_MAP_LAST_BYTE)
		goto out;
	if (next->flags & EXTENT_FLAG_PREALLOC)
		goto out;
	/*
	 * If the next extent is at its max capacity, defragging current extent
	 * makes no sense, as the total number of extents won't change.
	 */
	if (next->len >= get_extent_max_capacity(fs_info, em))
		goto out;
	/* Skip older extent */
	if (next->generation < newer_than)
		goto out;
	/* Also check extent size */
	if (next->len >= extent_thresh)
		goto out;

	ret = true;
out:
	free_extent_map(next);
	return ret;
}
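
/*
 * In other words: only report the current extent as worth defragging if the
 * extent right after it is itself a defrag target, i.e. a regular, recent,
 * small enough extent. E.g. a 32K extent followed by a 16K extent (with a
 * 256K threshold) is worth defragging, but a 32K extent followed by a 1M
 * extent is not, since the 1M neighbour would be left untouched anyway.
 */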
/*
 * Prepare one page to be defragged.
 *
 * This will ensure:
 *
 * - Returned page is locked and has been set up properly.
 * - No ordered extent exists in the page.
 * - The page is uptodate.
 *
 * NOTE: Caller should also wait for page writeback after the cluster is
 * prepared, here we don't do writeback wait for each page.
 */
static struct folio *defrag_prepare_one_folio(struct btrfs_inode *inode, pgoff_t index)
{
	struct address_space *mapping = inode->vfs_inode.i_mapping;
	gfp_t mask = btrfs_alloc_write_mask(mapping);
	u64 page_start = (u64)index << PAGE_SHIFT;
	u64 page_end = page_start + PAGE_SIZE - 1;
	struct extent_state *cached_state = NULL;
	struct folio *folio;
	int ret;

again:
	folio = __filemap_get_folio(mapping, index,
				    FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask);
	if (IS_ERR(folio))
		return folio;

	/*
	 * Since we can defragment files opened read-only, we can encounter
	 * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We
	 * can't do I/O using huge pages yet, so return an error for now.
	 * Filesystem transparent huge pages are typically only used for
	 * executables that explicitly enable them, so this isn't very
	 * restrictive.
	 */
	if (folio_test_large(folio)) {
		folio_unlock(folio);
		folio_put(folio);
		return ERR_PTR(-ETXTBSY);
	}

	ret = set_folio_extent_mapped(folio);
	if (ret < 0) {
		folio_unlock(folio);
		folio_put(folio);
		return ERR_PTR(ret);
	}

	/* Wait for any existing ordered extent in the range */
	while (1) {
		struct btrfs_ordered_extent *ordered;

		lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
		ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
		unlock_extent(&inode->io_tree, page_start, page_end,
			      &cached_state);
		if (!ordered)
			break;

		folio_unlock(folio);
		btrfs_start_ordered_extent(ordered);
		btrfs_put_ordered_extent(ordered);
		folio_lock(folio);
		/*
		 * We unlocked the folio above, so we need to check whether it
		 * was released or not.
		 */
		if (folio->mapping != mapping || !folio->private) {
			folio_unlock(folio);
			folio_put(folio);
			goto again;
		}
	}

	/*
	 * Now the page range has no ordered extent any more.  Read the page to
	 * make it uptodate.
	 */
	if (!folio_test_uptodate(folio)) {
		btrfs_read_folio(NULL, folio);
		folio_lock(folio);
		if (folio->mapping != mapping || !folio->private) {
			folio_unlock(folio);
			folio_put(folio);
			goto again;
		}
		if (!folio_test_uptodate(folio)) {
			folio_unlock(folio);
			folio_put(folio);
			return ERR_PTR(-EIO);
		}
	}
	return folio;
}
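
/*
 * The read above matters because defrag only redirties the folio: the data
 * must already be uptodate in the page cache so that the later writeback of
 * the whole cluster writes correct contents into the new extent.
 */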
struct defrag_target_range {
	struct list_head list;
	u64 start;
	u64 len;
};
/*
 * Collect all valid target extents.
 *
 * @start:	   file offset to lookup
 * @len:	   length to lookup
 * @extent_thresh: file extent size threshold, any extent size >= this value
 *		   will be ignored
 * @newer_than:    only defrag extents newer than this value
 * @do_compress:   whether the defrag is doing compression
 *		   if true, @extent_thresh will be ignored and all regular
 *		   file extents meeting @newer_than will be targets.
 * @locked:	   if the range has already held extent lock
 * @target_list:   list of target file extents
 */
static int defrag_collect_targets(struct btrfs_inode *inode,
				  u64 start, u64 len, u32 extent_thresh,
				  u64 newer_than, bool do_compress,
				  bool locked, struct list_head *target_list,
				  u64 *last_scanned_ret)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	bool last_is_target = false;
	u64 cur = start;
	int ret = 0;

	while (cur < start + len) {
		struct extent_map *em;
		struct defrag_target_range *new;
		bool next_mergeable = true;
		u64 range_len;

		last_is_target = false;
		em = defrag_lookup_extent(&inode->vfs_inode, cur, newer_than, locked);
		if (!em)
			break;

		/*
		 * If the file extent is an inlined one, we may still want to
		 * defrag it (fallthrough) if it will cause a regular extent.
		 * This is for users who want to convert inline extents to
		 * regular ones through the max_inline= mount option.
		 */
		if (em->disk_bytenr == EXTENT_MAP_INLINE &&
		    em->len <= inode->root->fs_info->max_inline)
			goto next;

		/* Skip holes and preallocated extents. */
		if (em->disk_bytenr == EXTENT_MAP_HOLE ||
		    (em->flags & EXTENT_FLAG_PREALLOC))
			goto next;

		/* Skip older extent */
		if (em->generation < newer_than)
			goto next;

		/* This em is under writeback, no need to defrag */
		if (em->generation == (u64)-1)
			goto next;

		/*
		 * Our start offset might be in the middle of an existing extent
		 * map, so take that into account.
		 */
		range_len = em->len - (cur - em->start);
		/*
		 * If this range of the extent map is already flagged for delalloc,
		 * skip it, because:
		 *
		 * 1) We could deadlock later, when trying to reserve space for
		 *    delalloc, because in case we can't immediately reserve space
		 *    the flusher can start delalloc and wait for the respective
		 *    ordered extents to complete. The deadlock would happen
		 *    because we do the space reservation while holding the range
		 *    locked, and starting writeback, or finishing an ordered
		 *    extent, requires locking the range;
		 *
		 * 2) If there's delalloc there, it means there are dirty pages for
		 *    which writeback has not started yet (we clean the delalloc
		 *    flag when starting writeback and after creating an ordered
		 *    extent). If we mark pages in an adjacent range for defrag,
		 *    then we will have a larger contiguous range for delalloc,
		 *    very likely resulting in a larger extent after writeback is
		 *    triggered (except in a case of free space fragmentation).
		 */
		if (test_range_bit_exists(&inode->io_tree, cur, cur + range_len - 1,
					  EXTENT_DELALLOC))
			goto next;

		/*
		 * For the do_compress case, we want to compress all valid file
		 * extents, thus no @extent_thresh or mergeable check.
		 */
		if (do_compress)
			goto add;

		/* Skip too large extent */
		if (em->len >= extent_thresh)
			goto next;

		/*
		 * Skip extents already at their max capacity, this is mostly
		 * for compressed extents, whose max capacity is only 128K.
		 */
		if (em->len >= get_extent_max_capacity(fs_info, em))
			goto next;

		/*
		 * Normally there are no more extents after an inline one, thus
		 * @next_mergeable will normally be false and not defragged.
		 * So if an inline extent passed all the above checks, just add
		 * it for defrag so it can be converted to a regular extent.
		 */
		if (em->disk_bytenr == EXTENT_MAP_INLINE)
			goto add;

		next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
						extent_thresh, newer_than, locked);
		if (!next_mergeable) {
			struct defrag_target_range *last;

			/* Empty target list, no way to merge with last entry */
			if (list_empty(target_list))
				goto next;
			last = list_entry(target_list->prev,
					  struct defrag_target_range, list);
			/* Not mergeable with last entry */
			if (last->start + last->len != cur)
				goto next;

			/* Mergeable, fall through to add it to @target_list. */
		}

add:
		last_is_target = true;
		range_len = min(extent_map_end(em), start + len) - cur;
		/*
		 * This one is a good target, check if it can be merged into
		 * the last range of the target list.
		 */
		if (!list_empty(target_list)) {
			struct defrag_target_range *last;

			last = list_entry(target_list->prev,
					  struct defrag_target_range, list);
			ASSERT(last->start + last->len <= cur);
			if (last->start + last->len == cur) {
				/* Mergeable, enlarge the last entry */
				last->len += range_len;
				goto next;
			}
			/* Fall through to allocate a new entry */
		}

		/* Allocate new defrag_target_range */
		new = kmalloc(sizeof(*new), GFP_NOFS);
		if (!new) {
			free_extent_map(em);
			ret = -ENOMEM;
			break;
		}
		new->start = cur;
		new->len = range_len;
		list_add_tail(&new->list, target_list);

next:
		cur = extent_map_end(em);
		free_extent_map(em);
	}
	if (ret < 0) {
		struct defrag_target_range *entry;
		struct defrag_target_range *tmp;

		list_for_each_entry_safe(entry, tmp, target_list, list) {
			list_del_init(&entry->list);
			kfree(entry);
		}
	}
	if (!ret && last_scanned_ret) {
		/*
		 * If the last extent is not a target, the caller can skip to
		 * the end of that extent.
		 * Otherwise, we can only go to the end of the specified range.
		 */
		if (!last_is_target)
			*last_scanned_ret = max(cur, *last_scanned_ret);
		else
			*last_scanned_ret = max(start + len, *last_scanned_ret);
	}
	return ret;
}
#define CLUSTER_SIZE	(SZ_256K)
static_assert(PAGE_ALIGNED(CLUSTER_SIZE));
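
/*
 * With 4K pages a 256K cluster is 64 folios, which is where the nr_pages
 * bound asserted in defrag_one_range() comes from.
 */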
/*
 * Defrag one contiguous target range.
 *
 * @inode:	target inode
 * @target:	target range to defrag
 * @folios:	locked folios covering the defrag range
 * @nr_pages:	number of locked folios
 *
 * Caller should ensure:
 *
 * - Pages are prepared
 *   Pages should be locked, no ordered extent in the pages range,
 *   no writeback.
 *
 * - Extent bits are locked
 */
static int defrag_one_locked_target(struct btrfs_inode *inode,
				    struct defrag_target_range *target,
				    struct folio **folios, int nr_pages,
				    struct extent_state **cached_state)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct extent_changeset *data_reserved = NULL;
	const u64 start = target->start;
	const u64 len = target->len;
	unsigned long last_index = (start + len - 1) >> PAGE_SHIFT;
	unsigned long start_index = start >> PAGE_SHIFT;
	unsigned long first_index = folios[0]->index;
	int ret = 0;
	int i;

	ASSERT(last_index - first_index + 1 <= nr_pages);

	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len);
	if (ret < 0)
		return ret;
	clear_extent_bit(&inode->io_tree, start, start + len - 1,
			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
			 EXTENT_DEFRAG, cached_state);
	set_extent_bit(&inode->io_tree, start, start + len - 1,
		       EXTENT_DELALLOC | EXTENT_DEFRAG, cached_state);

	/* Update the page status */
	for (i = start_index - first_index; i <= last_index - first_index; i++) {
		folio_clear_checked(folios[i]);
		btrfs_folio_clamp_set_dirty(fs_info, folios[i], start, len);
	}
	btrfs_delalloc_release_extents(inode, len);
	extent_changeset_free(data_reserved);

	return ret;
}
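
/*
 * Note that no data is moved here: we only reserve delalloc space, tag the
 * range with EXTENT_DELALLOC | EXTENT_DEFRAG and redirty the folios. The
 * actual relocation into a new, larger extent happens later through the
 * normal delalloc writeback path.
 */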
static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
			    u32 extent_thresh, u64 newer_than, bool do_compress,
			    u64 *last_scanned_ret)
{
	struct extent_state *cached_state = NULL;
	struct defrag_target_range *entry;
	struct defrag_target_range *tmp;
	LIST_HEAD(target_list);
	struct folio **folios;
	const u32 sectorsize = inode->root->fs_info->sectorsize;
	u64 last_index = (start + len - 1) >> PAGE_SHIFT;
	u64 start_index = start >> PAGE_SHIFT;
	unsigned int nr_pages = last_index - start_index + 1;
	int ret = 0;
	int i;

	ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE);
	ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize));

	folios = kcalloc(nr_pages, sizeof(struct folio *), GFP_NOFS);
	if (!folios)
		return -ENOMEM;

	/* Prepare all pages */
	for (i = 0; i < nr_pages; i++) {
		folios[i] = defrag_prepare_one_folio(inode, start_index + i);
		if (IS_ERR(folios[i])) {
			ret = PTR_ERR(folios[i]);
			nr_pages = i;
			goto free_folios;
		}
	}
	for (i = 0; i < nr_pages; i++)
		folio_wait_writeback(folios[i]);

	/* Lock the pages range */
	lock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
		    (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
		    &cached_state);
	/*
	 * Now we have a consistent view of the extent maps, re-check which
	 * ranges really need to be defragged.
	 *
	 * And this time we have the extent locked already, pass @locked = true
	 * so that we won't relock the extent range and cause a deadlock.
	 */
	ret = defrag_collect_targets(inode, start, len, extent_thresh,
				     newer_than, do_compress, true,
				     &target_list, last_scanned_ret);
	if (ret < 0)
		goto unlock_extent;

	list_for_each_entry(entry, &target_list, list) {
		ret = defrag_one_locked_target(inode, entry, folios, nr_pages,
					       &cached_state);
		if (ret < 0)
			break;
	}

	list_for_each_entry_safe(entry, tmp, &target_list, list) {
		list_del_init(&entry->list);
		kfree(entry);
	}
unlock_extent:
	unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
		      (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
		      &cached_state);
free_folios:
	for (i = 0; i < nr_pages; i++) {
		folio_unlock(folios[i]);
		folio_put(folios[i]);
	}
	kfree(folios);
	return ret;
}
static int defrag_one_cluster(struct btrfs_inode *inode,
			      struct file_ra_state *ra,
			      u64 start, u32 len, u32 extent_thresh,
			      u64 newer_than, bool do_compress,
			      unsigned long *sectors_defragged,
			      unsigned long max_sectors,
			      u64 *last_scanned_ret)
{
	const u32 sectorsize = inode->root->fs_info->sectorsize;
	struct defrag_target_range *entry;
	struct defrag_target_range *tmp;
	LIST_HEAD(target_list);
	int ret;

	ret = defrag_collect_targets(inode, start, len, extent_thresh,
				     newer_than, do_compress, false,
				     &target_list, NULL);
	if (ret < 0)
		goto out;

	list_for_each_entry(entry, &target_list, list) {
		u32 range_len = entry->len;

		/* Reached or beyond the limit */
		if (max_sectors && *sectors_defragged >= max_sectors) {
			ret = 1;
			break;
		}

		if (max_sectors)
			range_len = min_t(u32, range_len,
				(max_sectors - *sectors_defragged) * sectorsize);

		/*
		 * If defrag_one_range() has updated last_scanned_ret,
		 * our range may already be invalid (e.g. hole punched).
		 * Skip if our range is before last_scanned_ret, as there is
		 * no need to defrag the range anymore.
		 */
		if (entry->start + range_len <= *last_scanned_ret)
			continue;

		page_cache_sync_readahead(inode->vfs_inode.i_mapping,
				ra, NULL, entry->start >> PAGE_SHIFT,
				((entry->start + range_len - 1) >> PAGE_SHIFT) -
				(entry->start >> PAGE_SHIFT) + 1);
		/*
		 * Here we may not defrag any range if holes are punched before
		 * we locked the pages.
		 * But that's fine, it only affects the @sectors_defragged
		 * accounting.
		 */
		ret = defrag_one_range(inode, entry->start, range_len,
				       extent_thresh, newer_than, do_compress,
				       last_scanned_ret);
		if (ret < 0)
			break;
		*sectors_defragged += range_len >>
				      inode->root->fs_info->sectorsize_bits;
	}
out:
	list_for_each_entry_safe(entry, tmp, &target_list, list) {
		list_del_init(&entry->list);
		kfree(entry);
	}
	if (ret >= 0)
		*last_scanned_ret = max(*last_scanned_ret, start + len);
	return ret;
}
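
/*
 * Target collection is done twice on purpose: once here without the extent
 * lock (cheap, possibly stale) to decide what to read ahead, and once more
 * in defrag_one_range() with the extent range locked to get the
 * authoritative list. That is also why entries whose range falls behind
 * @last_scanned_ret are skipped above.
 */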
/*
 * Entry point to file defragmentation.
 *
 * @inode:	   inode to be defragged
 * @ra:		   readahead state
 * @range:	   defrag options including range and flags
 * @newer_than:	   minimum transid to defrag
 * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode
 *		   will be defragged.
 *
 * Return <0 for error.
 * Return >=0 for the number of sectors defragged, and range->start will be updated
 * to indicate the file offset where the next defrag should start.
 * (Mostly for autodefrag, which sets @max_to_defrag so we may exit early without
 *  defragging the whole range).
 */
int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
		      struct btrfs_ioctl_defrag_range_args *range,
		      u64 newer_than, unsigned long max_to_defrag)
{
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	unsigned long sectors_defragged = 0;
	u64 isize = i_size_read(inode);
	u64 cur;
	u64 last_byte;
	bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS);
	int compress_type = BTRFS_COMPRESS_ZLIB;
	int ret = 0;
	u32 extent_thresh = range->extent_thresh;
	pgoff_t start_index;

	if (isize == 0)
		return 0;

	if (range->start >= isize)
		return -EINVAL;

	if (do_compress) {
		if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES)
			return -EINVAL;
		if (range->compress_type)
			compress_type = range->compress_type;
	}

	if (extent_thresh == 0)
		extent_thresh = SZ_256K;

	if (range->start + range->len > range->start) {
		/* Got a specific range */
		last_byte = min(isize, range->start + range->len);
	} else {
		/* Defrag until file end */
		last_byte = isize;
	}

	/* Align the range */
	cur = round_down(range->start, fs_info->sectorsize);
	last_byte = round_up(last_byte, fs_info->sectorsize) - 1;

	/*
	 * Make writeback start from the beginning of the range, so that the
	 * defrag range can be written sequentially.
	 */
	start_index = cur >> PAGE_SHIFT;
	if (start_index < inode->i_mapping->writeback_index)
		inode->i_mapping->writeback_index = start_index;

	while (cur < last_byte) {
		const unsigned long prev_sectors_defragged = sectors_defragged;
		u64 last_scanned = cur;
		u64 cluster_end;

		if (btrfs_defrag_cancelled(fs_info)) {
			ret = -EAGAIN;
			break;
		}

		/* We want the cluster end at page boundary when possible */
		cluster_end = (((cur >> PAGE_SHIFT) +
			       (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1;
		cluster_end = min(cluster_end, last_byte);

		btrfs_inode_lock(BTRFS_I(inode), 0);
		if (IS_SWAPFILE(inode)) {
			ret = -ETXTBSY;
			btrfs_inode_unlock(BTRFS_I(inode), 0);
			break;
		}
		if (!(inode->i_sb->s_flags & SB_ACTIVE)) {
			btrfs_inode_unlock(BTRFS_I(inode), 0);
			break;
		}
		if (do_compress)
			BTRFS_I(inode)->defrag_compress = compress_type;
		ret = defrag_one_cluster(BTRFS_I(inode), ra, cur,
				cluster_end + 1 - cur, extent_thresh,
				newer_than, do_compress, &sectors_defragged,
				max_to_defrag, &last_scanned);

		if (sectors_defragged > prev_sectors_defragged)
			balance_dirty_pages_ratelimited(inode->i_mapping);

		btrfs_inode_unlock(BTRFS_I(inode), 0);
		if (ret < 0)
			break;
		cur = max(cluster_end + 1, last_scanned);
		if (ret > 0) {
			ret = 0;
			break;
		}
		cond_resched();
	}

	/*
	 * Update range.start for autodefrag, this will indicate where to start
	 * in the next run.
	 */
	range->start = cur;
	if (sectors_defragged) {
		/*
		 * We have defragged some sectors, for the compression case they
		 * need to be written back immediately.
		 */
		if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) {
			filemap_flush(inode->i_mapping);
			if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
				     &BTRFS_I(inode)->runtime_flags))
				filemap_flush(inode->i_mapping);
		}
		if (range->compress_type == BTRFS_COMPRESS_LZO)
			btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
		else if (range->compress_type == BTRFS_COMPRESS_ZSTD)
			btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
		ret = sectors_defragged;
	}
	if (do_compress) {
		btrfs_inode_lock(BTRFS_I(inode), 0);
		BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE;
		btrfs_inode_unlock(BTRFS_I(inode), 0);
	}
	return ret;
}
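
/*
 * The two callers differ mainly in their limits: autodefrag (see
 * btrfs_run_defrag_inode()) passes @newer_than = defrag->transid and
 * @max_to_defrag = BTRFS_DEFRAG_BATCH, while the manual defrag ioctl is
 * expected to pass 0 for both, defragging the whole requested range
 * regardless of extent age.
 */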
void __cold btrfs_auto_defrag_exit(void)
{
	kmem_cache_destroy(btrfs_inode_defrag_cachep);
}

int __init btrfs_auto_defrag_init(void)
{
	btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
					sizeof(struct inode_defrag), 0, 0, NULL);
	if (!btrfs_inode_defrag_cachep)
		return -ENOMEM;

	return 0;
}