// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2011 STRATO. All rights reserved.
 */

#include <linux/rbtree.h>
#include <trace/events/btrfs.h>
#include "transaction.h"
#include "delayed-ref.h"
#include "tree-mod-log.h"
#include "accessors.h"
#include "extent-tree.h"
#include "relocation.h"
#include "tree-checker.h"

/* Just arbitrary numbers so we can be sure one of these happened. */
#define BACKREF_FOUND_SHARED     6
#define BACKREF_FOUND_NOT_SHARED 7

struct extent_inode_elem {
	struct extent_inode_elem *next;
};
static int check_extent_in_eb(struct btrfs_backref_walk_ctx *ctx,
			      const struct btrfs_key *key,
			      const struct extent_buffer *eb,
			      const struct btrfs_file_extent_item *fi,
			      struct extent_inode_elem **eie)
{
	const u64 data_len = btrfs_file_extent_num_bytes(eb, fi);
	u64 offset = key->offset;
	struct extent_inode_elem *e;
	if (!ctx->ignore_extent_item_pos &&
	    !btrfs_file_extent_compression(eb, fi) &&
	    !btrfs_file_extent_encryption(eb, fi) &&
	    !btrfs_file_extent_other_encoding(eb, fi)) {
		data_offset = btrfs_file_extent_offset(eb, fi);

		if (ctx->extent_item_pos < data_offset ||
		    ctx->extent_item_pos >= data_offset + data_len)
			return 1;
		offset += ctx->extent_item_pos - data_offset;
	if (!ctx->indirect_ref_iterator || !ctx->cache_lookup)

	cached = ctx->cache_lookup(eb->start, ctx->user_ctx, &root_ids,

	for (int i = 0; i < root_count; i++) {
		ret = ctx->indirect_ref_iterator(key->objectid, offset,
						 data_len, root_ids[i],
	e = kmalloc(sizeof(*e), GFP_NOFS);

	e->inum = key->objectid;
	e->num_bytes = data_len;
static void free_inode_elem_list(struct extent_inode_elem *eie)
{
	struct extent_inode_elem *eie_next;

	for (; eie; eie = eie_next) {
static int find_extent_in_eb(struct btrfs_backref_walk_ctx *ctx,
			     const struct extent_buffer *eb,
			     struct extent_inode_elem **eie)
{
	struct btrfs_key key;
	struct btrfs_file_extent_item *fi;

	/*
	 * from the shared data ref, we only have the leaf but we need
	 * the key. thus, we must look into all items and see that we
	 * find one (some) with a reference to our extent item.
	 */
	nritems = btrfs_header_nritems(eb);
	for (slot = 0; slot < nritems; ++slot) {
		btrfs_item_key_to_cpu(eb, &key, slot);
		if (key.type != BTRFS_EXTENT_DATA_KEY)

		fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
		extent_type = btrfs_file_extent_type(eb, fi);
		if (extent_type == BTRFS_FILE_EXTENT_INLINE)

		/* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */
		disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
		if (disk_byte != ctx->bytenr)

		ret = check_extent_in_eb(ctx, &key, eb, fi, eie);
		if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || ret < 0)
	struct rb_root_cached root;

#define PREFTREE_INIT	{ .root = RB_ROOT_CACHED, .count = 0 }

	struct preftree direct;		/* BTRFS_SHARED_[DATA|BLOCK]_REF_KEY */
	struct preftree indirect;	/* BTRFS_[TREE_BLOCK|EXTENT_DATA]_REF_KEY */
	struct preftree indirect_missing_keys;
/*
 * Checks for a shared extent during backref search.
 *
 * The share_count tracks prelim_refs (direct and indirect) having a
 * ref->count > 0:
 *  - incremented when a ref->count transitions to > 0
 *  - decremented when a ref->count transitions to < 1
 */
struct share_check {
	struct btrfs_backref_share_check_ctx *ctx;
	struct btrfs_root *root;

	/*
	 * Counts the number of inodes that refer to an extent (different
	 * inodes in the same root or different roots) that we could find. The
	 * sharedness check typically stops once this counter gets greater than
	 * 1, so it may not reflect the total number of inodes.
	 */

	/*
	 * The number of times we found that our inode refers to the data
	 * extent whose sharedness we are determining. In other words, how many
	 * file extent items we could find for our inode that point to our
	 * target data extent. The value we get here after finishing the extent
	 * sharedness check may be smaller than reality, but if it ends up
	 * being greater than 1, then we know for sure the inode has multiple
	 * file extent items that point to the data extent, and we can safely
	 * assume it's useful to cache the sharedness check result.
	 */

	bool have_delayed_delete_refs;
static inline int extent_is_shared(struct share_check *sc)
{
	return (sc && sc->share_count > 1) ? BACKREF_FOUND_SHARED : 0;
}

static struct kmem_cache *btrfs_prelim_ref_cache;
int __init btrfs_prelim_ref_init(void)
{
	btrfs_prelim_ref_cache = kmem_cache_create("btrfs_prelim_ref",
						   sizeof(struct prelim_ref), 0, 0, NULL);
	if (!btrfs_prelim_ref_cache)

void __cold btrfs_prelim_ref_exit(void)
{
	kmem_cache_destroy(btrfs_prelim_ref_cache);
}

static void free_pref(struct prelim_ref *ref)
{
	kmem_cache_free(btrfs_prelim_ref_cache, ref);
}
/*
 * Return 0 when both refs are for the same block (and can be merged).
 * A -1 return indicates ref1 is a 'lower' block than ref2, while 1
 * indicates a 'higher' block.
 */
static int prelim_ref_compare(const struct prelim_ref *ref1,
			      const struct prelim_ref *ref2)
{
	if (ref1->level < ref2->level)
		return -1;
	if (ref1->level > ref2->level)
		return 1;
	if (ref1->root_id < ref2->root_id)
		return -1;
	if (ref1->root_id > ref2->root_id)
		return 1;
	if (ref1->key_for_search.type < ref2->key_for_search.type)
		return -1;
	if (ref1->key_for_search.type > ref2->key_for_search.type)
		return 1;
	if (ref1->key_for_search.objectid < ref2->key_for_search.objectid)
		return -1;
	if (ref1->key_for_search.objectid > ref2->key_for_search.objectid)
		return 1;
	if (ref1->key_for_search.offset < ref2->key_for_search.offset)
		return -1;
	if (ref1->key_for_search.offset > ref2->key_for_search.offset)
		return 1;
	if (ref1->parent < ref2->parent)
		return -1;
	if (ref1->parent > ref2->parent)
		return 1;

	return 0;
}
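
/*
 * Note: prelim_ref_compare() orders refs by level, root_id, key_for_search and
 * finally parent, so two refs describing the same block from the same root
 * compare equal and are merged by prelim_ref_insert() below.
 */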
static void update_share_count(struct share_check *sc, int oldcount,
			       int newcount, const struct prelim_ref *newref)
{
	if ((!sc) || (oldcount == 0 && newcount < 1))
		return;

	if (oldcount > 0 && newcount < 1)
		sc->share_count--;
	else if (oldcount < 1 && newcount > 0)
		sc->share_count++;

	if (newref->root_id == btrfs_root_id(sc->root) &&
	    newref->wanted_disk_byte == sc->data_bytenr &&
	    newref->key_for_search.objectid == sc->inum)
		sc->self_ref_count += newref->count;
}
/*
 * Add @newref to the @root rbtree, merging identical refs.
 *
 * Callers should assume that newref has been freed after calling.
 */
static void prelim_ref_insert(const struct btrfs_fs_info *fs_info,
			      struct preftree *preftree,
			      struct prelim_ref *newref,
			      struct share_check *sc)
{
	struct rb_root_cached *root;
	struct rb_node *parent = NULL;
	struct prelim_ref *ref;
	bool leftmost = true;

	root = &preftree->root;
	p = &root->rb_root.rb_node;

		ref = rb_entry(parent, struct prelim_ref, rbnode);
		result = prelim_ref_compare(ref, newref);

		} else if (result > 0) {

			/* Identical refs, merge them and free @newref */
			struct extent_inode_elem *eie = ref->inode_list;

			while (eie && eie->next)
				eie = eie->next;

			if (!eie)
				ref->inode_list = newref->inode_list;
			else
				eie->next = newref->inode_list;
			trace_btrfs_prelim_ref_merge(fs_info, ref, newref,

			/*
			 * A delayed ref can have newref->count < 0.
			 * The ref->count is updated to follow any
			 * BTRFS_[ADD|DROP]_DELAYED_REF actions.
			 */
			update_share_count(sc, ref->count,
					   ref->count + newref->count, newref);
			ref->count += newref->count;

	update_share_count(sc, 0, newref->count, newref);

	trace_btrfs_prelim_ref_insert(fs_info, newref, NULL, preftree->count);
	rb_link_node(&newref->rbnode, parent, p);
	rb_insert_color_cached(&newref->rbnode, root, leftmost);
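
/*
 * The preftrees use rb_root_cached, so prelim_ref_insert() tracks whether the
 * new node is the leftmost one and passes that to rb_insert_color_cached().
 * This keeps rb_first_cached() cheap for the iterations done later in
 * resolve_indirect_refs() and find_parent_nodes().
 */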
/*
 * Release the entire tree. We don't care about internal consistency so
 * just free everything and then reset the tree root.
 */
static void prelim_release(struct preftree *preftree)
{
	struct prelim_ref *ref, *next_ref;

	rbtree_postorder_for_each_entry_safe(ref, next_ref,
					     &preftree->root.rb_root, rbnode) {
		free_inode_elem_list(ref->inode_list);

	preftree->root = RB_ROOT_CACHED;
/*
 * the rules for all callers of this function are:
 * - obtaining the parent is the goal
 * - if you add a key, you must know that it is a correct key
 * - if you cannot add the parent or a correct key, then we will look into the
 *   block later to set a correct key
 *
 * backref type        | shared | indirect | shared | indirect
 * information         |   tree |     tree |   data |     data
 * --------------------+--------+----------+--------+----------
 * parent logical      |    y   |     -    |    -   |     -
 * key to resolve      |    -   |     y    |    y   |     y
 * tree block logical  |    -   |     -    |    -   |     -
 * root for resolving  |    y   |     y    |    y   |     y
 *
 * - column 1: we've the parent -> done
 * - column 2, 3, 4: we use the key to find the parent
 *
 * on disk refs (inline or keyed)
 * ==============================
 * backref type        | shared | indirect | shared | indirect
 * information         |   tree |     tree |   data |     data
 * --------------------+--------+----------+--------+----------
 * parent logical      |    y   |     -    |    y   |     -
 * key to resolve      |    -   |     -    |    -   |     y
 * tree block logical  |    y   |     y    |    y   |     y
 * root for resolving  |    -   |     y    |    y   |     y
 *
 * - column 1, 3: we've the parent -> done
 * - column 2: we take the first key from the block to find the parent
 *   (see add_missing_keys)
 * - column 4: we use the key to find the parent
 *
 * additional information that's available but not required to find the parent
 * block might help in merging entries to gain some speed.
 */
static int add_prelim_ref(const struct btrfs_fs_info *fs_info,
			  struct preftree *preftree, u64 root_id,
			  const struct btrfs_key *key, int level, u64 parent,
			  u64 wanted_disk_byte, int count,
			  struct share_check *sc, gfp_t gfp_mask)
{
	struct prelim_ref *ref;

	if (root_id == BTRFS_DATA_RELOC_TREE_OBJECTID)

	ref = kmem_cache_alloc(btrfs_prelim_ref_cache, gfp_mask);

	ref->root_id = root_id;
	if (key)
		ref->key_for_search = *key;
	else
		memset(&ref->key_for_search, 0, sizeof(ref->key_for_search));

	ref->inode_list = NULL;

	ref->parent = parent;
	ref->wanted_disk_byte = wanted_disk_byte;
	prelim_ref_insert(fs_info, preftree, ref, sc);
	return extent_is_shared(sc);
}
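
/*
 * add_prelim_ref() returns the result of extent_is_shared(), so when a share
 * check context is given, callers can stop walking as soon as this insertion
 * makes the extent count as shared (BACKREF_FOUND_SHARED).
 */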
/* direct refs use root == 0, key == NULL */
static int add_direct_ref(const struct btrfs_fs_info *fs_info,
			  struct preftrees *preftrees, int level, u64 parent,
			  u64 wanted_disk_byte, int count,
			  struct share_check *sc, gfp_t gfp_mask)
{
	return add_prelim_ref(fs_info, &preftrees->direct, 0, NULL, level,
			      parent, wanted_disk_byte, count, sc, gfp_mask);
}
/* indirect refs use parent == 0 */
static int add_indirect_ref(const struct btrfs_fs_info *fs_info,
			    struct preftrees *preftrees, u64 root_id,
			    const struct btrfs_key *key, int level,
			    u64 wanted_disk_byte, int count,
			    struct share_check *sc, gfp_t gfp_mask)
{
	struct preftree *tree = &preftrees->indirect;

	if (!key)
		tree = &preftrees->indirect_missing_keys;
	return add_prelim_ref(fs_info, tree, root_id, key, level, 0,
			      wanted_disk_byte, count, sc, gfp_mask);
}
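
/*
 * Indirect refs added without a key go to the indirect_missing_keys tree;
 * their key_for_search is filled in later by add_missing_keys() before the
 * refs are resolved.
 */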
static int is_shared_data_backref(struct preftrees *preftrees, u64 bytenr)
{
	struct rb_node **p = &preftrees->direct.root.rb_root.rb_node;
	struct rb_node *parent = NULL;
	struct prelim_ref *ref = NULL;
	struct prelim_ref target = {};

	target.parent = bytenr;

		ref = rb_entry(parent, struct prelim_ref, rbnode);
		result = prelim_ref_compare(ref, &target);
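
/*
 * Only target.parent is set above; since direct refs are added with root == 0
 * and key == NULL (and level 0 for data), the comparison against the
 * zero-initialized target effectively reduces to the parent bytenr for shared
 * data backrefs.
 */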
static int add_all_parents(struct btrfs_backref_walk_ctx *ctx,
			   struct btrfs_root *root, struct btrfs_path *path,
			   struct ulist *parents,
			   struct preftrees *preftrees, struct prelim_ref *ref,
			   int level)
{
	struct extent_buffer *eb;
	struct btrfs_key key;
	struct btrfs_key *key_for_search = &ref->key_for_search;
	struct btrfs_file_extent_item *fi;
	struct extent_inode_elem *eie = NULL, *old = NULL;
	u64 wanted_disk_byte = ref->wanted_disk_byte;

		eb = path->nodes[level];
		ret = ulist_add(parents, eb->start, 0, GFP_NOFS);

	/*
	 * 1. We normally enter this function with the path already pointing to
	 *    the first item to check. But sometimes, we may enter it with
	 *    slot == nritems.
	 * 2. We are searching for normal backref but bytenr of this leaf
	 *    matches shared data backref
	 * 3. The leaf owner is not equal to the root we are searching
	 *
	 * For these cases, go to the next leaf before we continue.
	 */
	if (path->slots[0] >= btrfs_header_nritems(eb) ||
	    is_shared_data_backref(preftrees, eb->start) ||
	    ref->root_id != btrfs_header_owner(eb)) {
		if (ctx->time_seq == BTRFS_SEQ_LAST)
			ret = btrfs_next_leaf(root, path);
		else
			ret = btrfs_next_old_leaf(root, path, ctx->time_seq);

	while (!ret && count < ref->count) {
		slot = path->slots[0];

		btrfs_item_key_to_cpu(eb, &key, slot);

		if (key.objectid != key_for_search->objectid ||
		    key.type != BTRFS_EXTENT_DATA_KEY)

		/*
		 * We are searching for normal backref but bytenr of this leaf
		 * matches shared data backref, OR
		 * the leaf owner is not equal to the root we are searching for
		 */
		    (is_shared_data_backref(preftrees, eb->start) ||
		     ref->root_id != btrfs_header_owner(eb))) {
			if (ctx->time_seq == BTRFS_SEQ_LAST)
				ret = btrfs_next_leaf(root, path);
			else
				ret = btrfs_next_old_leaf(root, path, ctx->time_seq);

		fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
		type = btrfs_file_extent_type(eb, fi);
		if (type == BTRFS_FILE_EXTENT_INLINE)

		disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
		data_offset = btrfs_file_extent_offset(eb, fi);

		if (disk_byte == wanted_disk_byte) {

			if (ref->key_for_search.offset == key.offset - data_offset)

			if (!ctx->skip_inode_ref_list) {
				ret = check_extent_in_eb(ctx, &key, eb, fi, &eie);
				if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP ||

			ret = ulist_add_merge_ptr(parents, eb->start,
						  eie, (void **)&old, GFP_NOFS);

			if (!ret && !ctx->skip_inode_ref_list) {

		if (ctx->time_seq == BTRFS_SEQ_LAST)
			ret = btrfs_next_item(root, path);
		else
			ret = btrfs_next_old_item(root, path, ctx->time_seq);

	if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || ret < 0)
		free_inode_elem_list(eie);
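
/*
 * add_all_parents() walks the file extent items of one resolved root and adds
 * the bytenr of every leaf referencing wanted_disk_byte to the @parents ulist,
 * attaching an extent_inode_elem list (built by check_extent_in_eb()) as the
 * ulist aux value unless ctx->skip_inode_ref_list is set.
 */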
/*
 * resolve an indirect backref in the form (root_id, key, level)
 * to a logical address
 */
static int resolve_indirect_ref(struct btrfs_backref_walk_ctx *ctx,
				struct btrfs_path *path,
				struct preftrees *preftrees,
				struct prelim_ref *ref, struct ulist *parents)
{
	struct btrfs_root *root;
	struct extent_buffer *eb;

	int level = ref->level;
	struct btrfs_key search_key = ref->key_for_search;

	/*
	 * If we're search_commit_root we could possibly be holding locks on
	 * other tree nodes. This happens when qgroups do backref walks when
	 * adding new delayed refs. To deal with this we need to look in cache
	 * for the root, and if we don't find it then we need to search the
	 * tree_root's commit root, thus the btrfs_get_fs_root_commit_root usage
	 * here.
	 */
	if (path->search_commit_root)
		root = btrfs_get_fs_root_commit_root(ctx->fs_info, path, ref->root_id);
	else
		root = btrfs_get_fs_root(ctx->fs_info, ref->root_id, false);

	if (!path->search_commit_root &&
	    test_bit(BTRFS_ROOT_DELETING, &root->state)) {

	if (btrfs_is_testing(ctx->fs_info)) {

	if (path->search_commit_root)
		root_level = btrfs_header_level(root->commit_root);
	else if (ctx->time_seq == BTRFS_SEQ_LAST)
		root_level = btrfs_header_level(root->node);
	else
		root_level = btrfs_old_root_level(root, ctx->time_seq);

	if (root_level + 1 == level)

	/*
	 * We can often find data backrefs with an offset that is too large
	 * (>= LLONG_MAX, maximum allowed file offset) due to underflows when
	 * subtracting a file's offset with the data offset of its
	 * corresponding extent data item. This can happen for example in the
	 * clone ioctl.
	 *
	 * So if we detect such case we set the search key's offset to zero to
	 * make sure we will find the matching file extent item at
	 * add_all_parents(), otherwise we will miss it because the offset
	 * taken from the backref is much larger than the offset of the file
	 * extent item. This can make us scan a very large number of file
	 * extent items, but at least it will not make us miss any.
	 *
	 * This is an ugly workaround for a behaviour that should have never
	 * existed, but it does and a fix for the clone ioctl would touch a lot
	 * of places, cause backwards incompatibility and would not fix the
	 * problem for extents cloned with older kernels.
	 */
	if (search_key.type == BTRFS_EXTENT_DATA_KEY &&
	    search_key.offset >= LLONG_MAX)
		search_key.offset = 0;

	path->lowest_level = level;
	if (ctx->time_seq == BTRFS_SEQ_LAST)
		ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
	else
		ret = btrfs_search_old_slot(root, &search_key, path, ctx->time_seq);

	btrfs_debug(ctx->fs_info,
		    "search slot in root %llu (level %d, ref count %d) returned %d for key (%llu %u %llu)",
		    ref->root_id, level, ref->count, ret,
		    ref->key_for_search.objectid, ref->key_for_search.type,
		    ref->key_for_search.offset);

	eb = path->nodes[level];

	if (WARN_ON(!level)) {

		eb = path->nodes[level];

	ret = add_all_parents(ctx, root, path, parents, preftrees, ref, level);

	btrfs_put_root(root);

	path->lowest_level = 0;
	btrfs_release_path(path);
static struct extent_inode_elem *
unode_aux_to_inode_list(struct ulist_node *node)
{
	if (!node)
		return NULL;
	return (struct extent_inode_elem *)(uintptr_t)node->aux;
}

static void free_leaf_list(struct ulist *ulist)
{
	struct ulist_node *node;
	struct ulist_iterator uiter;

	ULIST_ITER_INIT(&uiter);
	while ((node = ulist_next(ulist, &uiter)))
		free_inode_elem_list(unode_aux_to_inode_list(node));
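
/*
 * The aux value of each ulist node holds an extent_inode_elem list (see
 * unode_aux_to_inode_list()), which is why ulists that may carry inode lists
 * must be freed with free_leaf_list() rather than a plain ulist_free().
 */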
/*
 * We maintain three separate rbtrees: one for direct refs, one for
 * indirect refs which have a key, and one for indirect refs which do not
 * have a key. Each tree does merge on insertion.
 *
 * Once all of the references are located, we iterate over the tree of
 * indirect refs with missing keys. An appropriate key is located and
 * the ref is moved onto the tree for indirect refs. After all missing
 * keys are thus located, we iterate over the indirect ref tree, resolve
 * each reference, and then insert the resolved reference onto the
 * direct tree (merging there too).
 *
 * New backrefs (i.e., for parent nodes) are added to the appropriate
 * rbtree as they are encountered. The new backrefs are subsequently
 * resolved as above.
 */
static int resolve_indirect_refs(struct btrfs_backref_walk_ctx *ctx,
				 struct btrfs_path *path,
				 struct preftrees *preftrees,
				 struct share_check *sc)
{
	struct ulist *parents;
	struct ulist_node *node;
	struct ulist_iterator uiter;
	struct rb_node *rnode;

	parents = ulist_alloc(GFP_NOFS);

	/*
	 * We could trade memory usage for performance here by iterating
	 * the tree, allocating new refs for each insertion, and then
	 * freeing the entire indirect tree when we're done. In some test
	 * cases, the tree can grow quite large (~200k objects).
	 */
	while ((rnode = rb_first_cached(&preftrees->indirect.root))) {
		struct prelim_ref *ref;

		ref = rb_entry(rnode, struct prelim_ref, rbnode);
		if (WARN(ref->parent,
			 "BUG: direct ref found in indirect tree")) {

		rb_erase_cached(&ref->rbnode, &preftrees->indirect.root);
		preftrees->indirect.count--;

		if (ref->count == 0) {

		if (sc && ref->root_id != btrfs_root_id(sc->root)) {

			ret = BACKREF_FOUND_SHARED;

		err = resolve_indirect_ref(ctx, path, preftrees, ref, parents);
		/*
		 * We can only tolerate ENOENT, otherwise we should catch the
		 * error and return directly.
		 */
		if (err == -ENOENT) {
			prelim_ref_insert(ctx->fs_info, &preftrees->direct, ref,

		/* we put the first parent into the ref at hand */
		ULIST_ITER_INIT(&uiter);
		node = ulist_next(parents, &uiter);
		ref->parent = node ? node->val : 0;
		ref->inode_list = unode_aux_to_inode_list(node);

		/* Add a prelim_ref(s) for any other parent(s). */
		while ((node = ulist_next(parents, &uiter))) {
			struct prelim_ref *new_ref;

			new_ref = kmem_cache_alloc(btrfs_prelim_ref_cache,

			memcpy(new_ref, ref, sizeof(*ref));
			new_ref->parent = node->val;
			new_ref->inode_list = unode_aux_to_inode_list(node);
			prelim_ref_insert(ctx->fs_info, &preftrees->direct,

		/*
		 * Now it's a direct ref, put it in the direct tree. We must
		 * do this last because the ref could be merged/freed here.
		 */
		prelim_ref_insert(ctx->fs_info, &preftrees->direct, ref, NULL);

		ulist_reinit(parents);

	/*
	 * We may have inode lists attached to refs in the parents ulist, so we
	 * must free them before freeing the ulist and its refs.
	 */
	free_leaf_list(parents);
/*
 * read tree blocks and add keys where required.
 */
static int add_missing_keys(struct btrfs_fs_info *fs_info,
			    struct preftrees *preftrees, bool lock)
{
	struct prelim_ref *ref;
	struct extent_buffer *eb;
	struct preftree *tree = &preftrees->indirect_missing_keys;
	struct rb_node *node;

	while ((node = rb_first_cached(&tree->root))) {
		struct btrfs_tree_parent_check check = { 0 };

		ref = rb_entry(node, struct prelim_ref, rbnode);
		rb_erase_cached(node, &tree->root);

		BUG_ON(ref->parent);	/* should not be a direct ref */
		BUG_ON(ref->key_for_search.type);
		BUG_ON(!ref->wanted_disk_byte);

		check.level = ref->level - 1;
		check.owner_root = ref->root_id;

		eb = read_tree_block(fs_info, ref->wanted_disk_byte, &check);

		if (!extent_buffer_uptodate(eb)) {
			free_extent_buffer(eb);

		if (lock)
			btrfs_tree_read_lock(eb);
		if (btrfs_header_level(eb) == 0)
			btrfs_item_key_to_cpu(eb, &ref->key_for_search, 0);
		else
			btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0);
		if (lock)
			btrfs_tree_read_unlock(eb);
		free_extent_buffer(eb);
		prelim_ref_insert(fs_info, &preftrees->indirect, ref, NULL);
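
/*
 * For each ref queued without a key, add_missing_keys() reads the referenced
 * tree block and takes its first key (item key for a leaf, node key otherwise)
 * as key_for_search, then moves the ref to the regular indirect tree.
 */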
/*
 * add all currently queued delayed refs from this head whose seq nr is
 * smaller than or equal to that seq to the list
 */
static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
			    struct btrfs_delayed_ref_head *head, u64 seq,
			    struct preftrees *preftrees, struct share_check *sc)
{
	struct btrfs_delayed_ref_node *node;
	struct btrfs_key key;

	spin_lock(&head->lock);
	for (n = rb_first_cached(&head->ref_tree); n; n = rb_next(n)) {
		node = rb_entry(n, struct btrfs_delayed_ref_node,

		switch (node->action) {
		case BTRFS_ADD_DELAYED_EXTENT:
		case BTRFS_UPDATE_DELAYED_HEAD:

		case BTRFS_ADD_DELAYED_REF:
			count = node->ref_mod;

		case BTRFS_DROP_DELAYED_REF:
			count = node->ref_mod * -1;

		switch (node->type) {
		case BTRFS_TREE_BLOCK_REF_KEY: {
			/* NORMAL INDIRECT METADATA backref */
			struct btrfs_key *key_ptr = NULL;
			/* The owner of a tree block ref is the level. */
			int level = btrfs_delayed_ref_owner(node);

			if (head->extent_op && head->extent_op->update_key) {
				btrfs_disk_key_to_cpu(&key, &head->extent_op->key);

			ret = add_indirect_ref(fs_info, preftrees, node->ref_root,
					       key_ptr, level + 1, node->bytenr,
					       count, sc, GFP_ATOMIC);

		case BTRFS_SHARED_BLOCK_REF_KEY: {
			/*
			 * SHARED DIRECT METADATA backref
			 *
			 * The owner of a tree block ref is the level.
			 */
			int level = btrfs_delayed_ref_owner(node);

			ret = add_direct_ref(fs_info, preftrees, level + 1,
					     node->parent, node->bytenr, count,

		case BTRFS_EXTENT_DATA_REF_KEY: {
			/* NORMAL INDIRECT DATA backref */
			key.objectid = btrfs_delayed_ref_owner(node);
			key.type = BTRFS_EXTENT_DATA_KEY;
			key.offset = btrfs_delayed_ref_offset(node);

			/*
			 * If we have a share check context and a reference for
			 * another inode, we can't exit immediately. This is
			 * because even if this is a BTRFS_ADD_DELAYED_REF
			 * reference we may find next a BTRFS_DROP_DELAYED_REF
			 * which cancels out this ADD reference.
			 *
			 * If this is a DROP reference and there was no previous
			 * ADD reference, then we need to signal that when we
			 * process references from the extent tree (through
			 * add_inline_refs() and add_keyed_refs()), we should
			 * not exit early if we find a reference for another
			 * inode, because one of the delayed DROP references
			 * may cancel that reference in the extent tree.
			 */
				sc->have_delayed_delete_refs = true;

			ret = add_indirect_ref(fs_info, preftrees, node->ref_root,
					       &key, 0, node->bytenr, count, sc,

		case BTRFS_SHARED_DATA_REF_KEY: {
			/* SHARED DIRECT FULL backref */
			ret = add_direct_ref(fs_info, preftrees, 0, node->parent,
					     node->bytenr, count, sc,

		/*
		 * We must ignore BACKREF_FOUND_SHARED until all delayed
		 * refs have been checked.
		 */
		if (ret && (ret != BACKREF_FOUND_SHARED))

		ret = extent_is_shared(sc);

	spin_unlock(&head->lock);
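
/*
 * Delayed refs contribute a positive count for BTRFS_ADD_DELAYED_REF and a
 * negative one for BTRFS_DROP_DELAYED_REF, so when prelim_ref_insert() merges
 * them with the on-disk refs for the same block, the net reference count is
 * what the sharedness check ends up seeing.
 */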
/*
 * add all inline backrefs for bytenr to the list
 *
 * Returns 0 on success, <0 on error, or BACKREF_FOUND_SHARED.
 */
static int add_inline_refs(struct btrfs_backref_walk_ctx *ctx,
			   struct btrfs_path *path,
			   int *info_level, struct preftrees *preftrees,
			   struct share_check *sc)
{
	struct extent_buffer *leaf;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_extent_item *ei;

	/*
	 * enumerate all inline refs
	 */
	leaf = path->nodes[0];
	slot = path->slots[0];

	item_size = btrfs_item_size(leaf, slot);
	ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);

	if (ctx->check_extent_item) {
		ret = ctx->check_extent_item(ctx->bytenr, ei, leaf, ctx->user_ctx);

	flags = btrfs_extent_flags(leaf, ei);
	btrfs_item_key_to_cpu(leaf, &found_key, slot);

	ptr = (unsigned long)(ei + 1);
	end = (unsigned long)ei + item_size;

	if (found_key.type == BTRFS_EXTENT_ITEM_KEY &&
	    flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
		struct btrfs_tree_block_info *info;

		info = (struct btrfs_tree_block_info *)ptr;
		*info_level = btrfs_tree_block_level(leaf, info);
		ptr += sizeof(struct btrfs_tree_block_info);

	} else if (found_key.type == BTRFS_METADATA_ITEM_KEY) {
		*info_level = found_key.offset;

		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));

		struct btrfs_extent_inline_ref *iref;

		iref = (struct btrfs_extent_inline_ref *)ptr;
		type = btrfs_get_extent_inline_ref_type(leaf, iref,
							BTRFS_REF_TYPE_ANY);
		if (type == BTRFS_REF_TYPE_INVALID)

		offset = btrfs_extent_inline_ref_offset(leaf, iref);

		case BTRFS_SHARED_BLOCK_REF_KEY:
			ret = add_direct_ref(ctx->fs_info, preftrees,
					     *info_level + 1, offset,
					     ctx->bytenr, 1, NULL, GFP_NOFS);

		case BTRFS_SHARED_DATA_REF_KEY: {
			struct btrfs_shared_data_ref *sdref;

			sdref = (struct btrfs_shared_data_ref *)(iref + 1);
			count = btrfs_shared_data_ref_count(leaf, sdref);

			ret = add_direct_ref(ctx->fs_info, preftrees, 0, offset,
					     ctx->bytenr, count, sc, GFP_NOFS);

		case BTRFS_TREE_BLOCK_REF_KEY:
			ret = add_indirect_ref(ctx->fs_info, preftrees, offset,
					       NULL, *info_level + 1,
					       ctx->bytenr, 1, NULL, GFP_NOFS);

		case BTRFS_EXTENT_DATA_REF_KEY: {
			struct btrfs_extent_data_ref *dref;

			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
			count = btrfs_extent_data_ref_count(leaf, dref);
			key.objectid = btrfs_extent_data_ref_objectid(leaf,

			key.type = BTRFS_EXTENT_DATA_KEY;
			key.offset = btrfs_extent_data_ref_offset(leaf, dref);

			if (sc && key.objectid != sc->inum &&
			    !sc->have_delayed_delete_refs) {
				ret = BACKREF_FOUND_SHARED;

			root = btrfs_extent_data_ref_root(leaf, dref);

			if (!ctx->skip_data_ref ||
			    !ctx->skip_data_ref(root, key.objectid, key.offset,
				ret = add_indirect_ref(ctx->fs_info, preftrees,
						       root, &key, 0, ctx->bytenr,
						       count, sc, GFP_NOFS);

		case BTRFS_EXTENT_OWNER_REF_KEY:
			ASSERT(btrfs_fs_incompat(ctx->fs_info, SIMPLE_QUOTA));

		ptr += btrfs_extent_inline_ref_size(type);
/*
 * add all non-inline backrefs for bytenr to the list
 *
 * Returns 0 on success, <0 on error, or BACKREF_FOUND_SHARED.
 */
static int add_keyed_refs(struct btrfs_backref_walk_ctx *ctx,
			  struct btrfs_root *extent_root,
			  struct btrfs_path *path,
			  int info_level, struct preftrees *preftrees,
			  struct share_check *sc)
{
	struct btrfs_fs_info *fs_info = extent_root->fs_info;

	struct extent_buffer *leaf;
	struct btrfs_key key;

		ret = btrfs_next_item(extent_root, path);

		slot = path->slots[0];
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &key, slot);

		if (key.objectid != ctx->bytenr)

		if (key.type < BTRFS_TREE_BLOCK_REF_KEY)

		if (key.type > BTRFS_SHARED_DATA_REF_KEY)

		case BTRFS_SHARED_BLOCK_REF_KEY:
			/* SHARED DIRECT METADATA backref */
			ret = add_direct_ref(fs_info, preftrees,
					     info_level + 1, key.offset,
					     ctx->bytenr, 1, NULL, GFP_NOFS);

		case BTRFS_SHARED_DATA_REF_KEY: {
			/* SHARED DIRECT FULL backref */
			struct btrfs_shared_data_ref *sdref;

			sdref = btrfs_item_ptr(leaf, slot,
					       struct btrfs_shared_data_ref);
			count = btrfs_shared_data_ref_count(leaf, sdref);
			ret = add_direct_ref(fs_info, preftrees, 0,
					     key.offset, ctx->bytenr, count,

		case BTRFS_TREE_BLOCK_REF_KEY:
			/* NORMAL INDIRECT METADATA backref */
			ret = add_indirect_ref(fs_info, preftrees, key.offset,
					       NULL, info_level + 1, ctx->bytenr,

		case BTRFS_EXTENT_DATA_REF_KEY: {
			/* NORMAL INDIRECT DATA backref */
			struct btrfs_extent_data_ref *dref;

			dref = btrfs_item_ptr(leaf, slot,
					      struct btrfs_extent_data_ref);
			count = btrfs_extent_data_ref_count(leaf, dref);
			key.objectid = btrfs_extent_data_ref_objectid(leaf,

			key.type = BTRFS_EXTENT_DATA_KEY;
			key.offset = btrfs_extent_data_ref_offset(leaf, dref);

			if (sc && key.objectid != sc->inum &&
			    !sc->have_delayed_delete_refs) {
				ret = BACKREF_FOUND_SHARED;

			root = btrfs_extent_data_ref_root(leaf, dref);

			if (!ctx->skip_data_ref ||
			    !ctx->skip_data_ref(root, key.objectid, key.offset,
				ret = add_indirect_ref(fs_info, preftrees, root,
						       &key, 0, ctx->bytenr,
						       count, sc, GFP_NOFS);
/*
 * The caller has joined a transaction or is holding a read lock on the
 * fs_info->commit_root_sem semaphore, so no need to worry about the root's last
 * snapshot field changing while updating or checking the cache.
 */
static bool lookup_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx,
					struct btrfs_root *root,
					u64 bytenr, int level, bool *is_shared)
{
	const struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_backref_shared_cache_entry *entry;

	if (!current->journal_info)
		lockdep_assert_held(&fs_info->commit_root_sem);

	if (!ctx->use_path_cache)

	if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL))

	/*
	 * Level -1 is used for the data extent, which is not reliable to cache
	 * because its reference count can increase or decrease without us
	 * realizing. We cache results only for extent buffers that lead from
	 * the root node down to the leaf with the file extent item.
	 */

	entry = &ctx->path_cache_entries[level];

	/* Unused cache entry or being used for some other extent buffer. */
	if (entry->bytenr != bytenr)

	/*
	 * We cached a false result, but the last snapshot generation of the
	 * root changed, so we now have a snapshot. Don't trust the result.
	 */
	if (!entry->is_shared &&
	    entry->gen != btrfs_root_last_snapshot(&root->root_item))

	/*
	 * If we cached a true result and the last generation used for dropping
	 * a root changed, we can not trust the result, because the dropped root
	 * could be a snapshot sharing this extent buffer.
	 */
	if (entry->is_shared &&
	    entry->gen != btrfs_get_last_root_drop_gen(fs_info))

	*is_shared = entry->is_shared;
	/*
	 * If the node at this level is shared, then all nodes below are also
	 * shared. Currently some of the nodes below may be marked as not shared
	 * because we have just switched from one leaf to another, and switched
	 * also other nodes above the leaf and below the current level, so mark
	 * them as shared.
	 */
		for (int i = 0; i < level; i++) {
			ctx->path_cache_entries[i].is_shared = true;
			ctx->path_cache_entries[i].gen = entry->gen;
/*
 * The caller has joined a transaction or is holding a read lock on the
 * fs_info->commit_root_sem semaphore, so no need to worry about the root's last
 * snapshot field changing while updating or checking the cache.
 */
static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx,
				       struct btrfs_root *root,
				       u64 bytenr, int level, bool is_shared)
{
	const struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_backref_shared_cache_entry *entry;

	if (!current->journal_info)
		lockdep_assert_held(&fs_info->commit_root_sem);

	if (!ctx->use_path_cache)

	if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL))

	/*
	 * Level -1 is used for the data extent, which is not reliable to cache
	 * because its reference count can increase or decrease without us
	 * realizing. We cache results only for extent buffers that lead from
	 * the root node down to the leaf with the file extent item.
	 */

		gen = btrfs_get_last_root_drop_gen(fs_info);

		gen = btrfs_root_last_snapshot(&root->root_item);

	entry = &ctx->path_cache_entries[level];
	entry->bytenr = bytenr;
	entry->is_shared = is_shared;

	/*
	 * If we found an extent buffer is shared, set the cache result for all
	 * extent buffers below it to true. As nodes in the path are COWed,
	 * their sharedness is moved to their children, and if a leaf is COWed,
	 * then the sharedness of a data extent becomes direct, i.e. the
	 * refcount of the data extent is increased in the extent item in the
	 * extent tree.
	 */
		for (int i = 0; i < level; i++) {
			entry = &ctx->path_cache_entries[i];
			entry->is_shared = is_shared;
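
/*
 * The generation stored with a cache entry is what validates it later in
 * lookup_backref_shared_cache(): the last root drop generation for "shared"
 * results and the root's last snapshot generation for "not shared" results.
 */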
/*
 * this adds all existing backrefs (inline backrefs, backrefs and delayed
 * refs) for the given bytenr to the refs list, merges duplicates and resolves
 * indirect refs to their parent bytenr.
 * When roots are found, they're added to the roots list.
 *
 * @ctx:     Backref walking context object, must not be NULL.
 * @sc:      If !NULL, then immediately return BACKREF_FOUND_SHARED when a
 *           shared extent is detected.
 *
 * Otherwise this returns 0 for success and <0 for an error.
 *
 * FIXME some caching might speed things up
 */
static int find_parent_nodes(struct btrfs_backref_walk_ctx *ctx,
			     struct share_check *sc)
{
	struct btrfs_root *root = btrfs_extent_root(ctx->fs_info, ctx->bytenr);
	struct btrfs_key key;
	struct btrfs_path *path;
	struct btrfs_delayed_ref_root *delayed_refs = NULL;
	struct btrfs_delayed_ref_head *head;

	struct prelim_ref *ref;
	struct rb_node *node;
	struct extent_inode_elem *eie = NULL;
	struct preftrees preftrees = {
		.direct = PREFTREE_INIT,
		.indirect = PREFTREE_INIT,
		.indirect_missing_keys = PREFTREE_INIT
	};
	/* Roots ulist is not needed when using a sharedness check context. */
		ASSERT(ctx->roots == NULL);

	key.objectid = ctx->bytenr;
	key.offset = (u64)-1;
	if (btrfs_fs_incompat(ctx->fs_info, SKINNY_METADATA))
		key.type = BTRFS_METADATA_ITEM_KEY;
	else
		key.type = BTRFS_EXTENT_ITEM_KEY;

	path = btrfs_alloc_path();

		path->search_commit_root = 1;
		path->skip_locking = 1;

	if (ctx->time_seq == BTRFS_SEQ_LAST)
		path->skip_locking = 1;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);

	/*
	 * Key with offset -1 found, there would have to exist an extent
	 * item with such offset, but this is out of the valid range.
	 */

	if (ctx->trans && likely(ctx->trans->type != __TRANS_DUMMY) &&
	    ctx->time_seq != BTRFS_SEQ_LAST) {
		/*
		 * We have a specific time_seq we care about and trans which
		 * means we have the path lock, we need to grab the ref head and
		 * lock it so we have a consistent view of the refs at the given
		 * time.
		 */
		delayed_refs = &ctx->trans->transaction->delayed_refs;
		spin_lock(&delayed_refs->lock);
		head = btrfs_find_delayed_ref_head(ctx->fs_info, delayed_refs,

			if (!mutex_trylock(&head->mutex)) {
				refcount_inc(&head->refs);
				spin_unlock(&delayed_refs->lock);

				btrfs_release_path(path);

				/*
				 * Mutex was contended, block until it's
				 * released and try again
				 */
				mutex_lock(&head->mutex);
				mutex_unlock(&head->mutex);
				btrfs_put_delayed_ref_head(head);

			spin_unlock(&delayed_refs->lock);
			ret = add_delayed_refs(ctx->fs_info, head, ctx->time_seq,

			mutex_unlock(&head->mutex);

			spin_unlock(&delayed_refs->lock);
	if (path->slots[0]) {
		struct extent_buffer *leaf;

		leaf = path->nodes[0];
		slot = path->slots[0];
		btrfs_item_key_to_cpu(leaf, &key, slot);
		if (key.objectid == ctx->bytenr &&
		    (key.type == BTRFS_EXTENT_ITEM_KEY ||
		     key.type == BTRFS_METADATA_ITEM_KEY)) {
			ret = add_inline_refs(ctx, path, &info_level,

			ret = add_keyed_refs(ctx, root, path, info_level,

	/*
	 * If we have a share context and we reached here, it means the extent
	 * is not directly shared (no multiple reference items for it),
	 * otherwise we would have exited earlier with a return value of
	 * BACKREF_FOUND_SHARED after processing delayed references or while
	 * processing inline or keyed references from the extent tree.
	 * The extent may however be indirectly shared through shared subtrees
	 * as a result from creating snapshots, so we determine below what is
	 * its parent node, in case we are dealing with a metadata extent, or
	 * what's the leaf (or leaves), from a fs tree, that has a file extent
	 * item pointing to it in case we are dealing with a data extent.
	 */
	ASSERT(extent_is_shared(sc) == 0);
	/*
	 * If we are here for a data extent and we have a share_check structure
	 * it means the data extent is not directly shared (does not have
	 * multiple reference items), so we have to check if a path in the fs
	 * tree (going from the root node down to the leaf that has the file
	 * extent item pointing to the data extent) is shared, that is, if any
	 * of the extent buffers in the path is referenced by other trees.
	 */
	if (sc && ctx->bytenr == sc->data_bytenr) {
		/*
		 * If our data extent is from a generation more recent than the
		 * last generation used to snapshot the root, then we know that
		 * it can not be shared through subtrees, so we can skip
		 * resolving indirect references, there's no point in
		 * determining the extent buffers for the path from the fs tree
		 * root node down to the leaf that has the file extent item that
		 * points to the data extent.
		 */
		if (sc->data_extent_gen >
		    btrfs_root_last_snapshot(&sc->root->root_item)) {
			ret = BACKREF_FOUND_NOT_SHARED;

		/*
		 * If we are only determining if a data extent is shared or not
		 * and the corresponding file extent item is located in the same
		 * leaf as the previous file extent item, we can skip resolving
		 * indirect references for a data extent, since the fs tree path
		 * is the same (same leaf, so same path). We skip as long as the
		 * cached result for the leaf is valid and only if there's only
		 * one file extent item pointing to the data extent, because in
		 * the case of multiple file extent items, they may be located
		 * in different leaves and therefore we have multiple paths.
		 */
		if (sc->ctx->curr_leaf_bytenr == sc->ctx->prev_leaf_bytenr &&
		    sc->self_ref_count == 1) {

			cached = lookup_backref_shared_cache(sc->ctx, sc->root,
							     sc->ctx->curr_leaf_bytenr,

					ret = BACKREF_FOUND_SHARED;
				else
					ret = BACKREF_FOUND_NOT_SHARED;
	btrfs_release_path(path);

	ret = add_missing_keys(ctx->fs_info, &preftrees, path->skip_locking == 0);

	WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root.rb_root));

	ret = resolve_indirect_refs(ctx, path, &preftrees, sc);

	WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect.root.rb_root));

	/*
	 * This walks the tree of merged and resolved refs. Tree blocks are
	 * read in as needed. Unique entries are added to the ulist, and
	 * the list of found roots is updated.
	 *
	 * We release the entire tree in one go before returning.
	 */
	node = rb_first_cached(&preftrees.direct.root);

		ref = rb_entry(node, struct prelim_ref, rbnode);
		node = rb_next(&ref->rbnode);
		/*
		 * ref->count < 0 can happen here if there are delayed
		 * refs with a node->action of BTRFS_DROP_DELAYED_REF.
		 * prelim_ref_insert() relies on this when merging
		 * identical refs to keep the overall count correct.
		 * prelim_ref_insert() will merge only those refs
		 * which compare identically. Any refs having
		 * e.g. different offsets would not be merged,
		 * and would retain their original ref->count < 0.
		 */
		if (ctx->roots && ref->count && ref->root_id && ref->parent == 0) {
			/* no parent == root of tree */
			ret = ulist_add(ctx->roots, ref->root_id, 0, GFP_NOFS);

		if (ref->count && ref->parent) {
			if (!ctx->skip_inode_ref_list && !ref->inode_list &&

				struct btrfs_tree_parent_check check = { 0 };
				struct extent_buffer *eb;

				check.level = ref->level;

				eb = read_tree_block(ctx->fs_info, ref->parent,

				if (!extent_buffer_uptodate(eb)) {
					free_extent_buffer(eb);

				if (!path->skip_locking)
					btrfs_tree_read_lock(eb);
				ret = find_extent_in_eb(ctx, eb, &eie);
				if (!path->skip_locking)
					btrfs_tree_read_unlock(eb);
				free_extent_buffer(eb);
				if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP ||

				ref->inode_list = eie;
				/*
				 * We transferred the list ownership to the ref,
				 * so set to NULL to avoid a double free in case
				 * an error happens after this.
				 */

			ret = ulist_add_merge_ptr(ctx->refs, ref->parent,
						  (void **)&eie, GFP_NOFS);

			if (!ret && !ctx->skip_inode_ref_list) {
				/*
				 * We've recorded that parent, so we must extend
				 * its inode list here.
				 *
				 * However if there was corruption we may not
				 * have found an eie, return an error in this
				 * case.
				 */

				eie->next = ref->inode_list;

			/*
			 * We have transferred the inode list ownership from
			 * this ref to the ref we added to the 'refs' ulist.
			 * So set this ref's inode list to NULL to avoid
			 * use-after-free when our caller uses it or double
			 * frees in case an error happens before we return.
			 */
			ref->inode_list = NULL;

	btrfs_free_path(path);

	prelim_release(&preftrees.direct);
	prelim_release(&preftrees.indirect);
	prelim_release(&preftrees.indirect_missing_keys);

	if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || ret < 0)
		free_inode_elem_list(eie);
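
/*
 * Summary of find_parent_nodes(): collect delayed refs and on-disk (inline and
 * keyed) refs into the three preftrees, fill in missing keys, resolve indirect
 * refs to parent bytenrs, and finally walk the merged direct tree, adding tree
 * roots to ctx->roots and parents (with their inode lists) to ctx->refs.
 */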
/*
 * Finds all leaves with a reference to the specified combination of
 * @ctx->bytenr and @ctx->extent_item_pos. The bytenr of the found leaves are
 * added to the ulist at @ctx->refs, and that ulist is allocated by this
 * function. The caller should free the ulist with free_leaf_list() if
 * @ctx->ignore_extent_item_pos is false, otherwise a simple ulist_free() is
 * enough.
 *
 * Returns 0 on success and < 0 on error. On error @ctx->refs is not allocated.
 */
int btrfs_find_all_leafs(struct btrfs_backref_walk_ctx *ctx)
{
	ASSERT(ctx->refs == NULL);

	ctx->refs = ulist_alloc(GFP_NOFS);

	ret = find_parent_nodes(ctx, NULL);
	if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP ||
	    (ret < 0 && ret != -ENOENT)) {
		free_leaf_list(ctx->refs);
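
/*
 * Usage sketch for btrfs_find_all_leafs() (hypothetical caller, error handling
 * elided): fill a struct btrfs_backref_walk_ctx with at least fs_info, bytenr
 * and extent_item_pos (or set ignore_extent_item_pos), call the function, then
 * iterate ctx->refs with ULIST_ITER_INIT()/ulist_next() and release it with
 * free_leaf_list() (or ulist_free() if ignore_extent_item_pos was set).
 */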
/*
 * Walk all backrefs for a given extent to find all roots that reference this
 * extent. Walking a backref means finding all extents that reference this
 * extent and in turn walk the backrefs of those, too. Naturally this is a
 * recursive process, but here it is implemented in an iterative fashion: We
 * find all referencing extents for the extent in question and put them on a
 * list. In turn, we find all referencing extents for those, further appending
 * to the list. The way we iterate the list allows adding more elements after
 * the current while iterating. The process stops when we reach the end of the
 * list.
 *
 * Found roots are added to @ctx->roots, which is allocated by this function if
 * it points to NULL, in which case the caller is responsible for freeing it
 * after it's not needed anymore.
 * This function requires @ctx->refs to be NULL, as it uses it for allocating a
 * ulist to do temporary work, and frees it before returning.
 *
 * Returns 0 on success, < 0 on error.
 */
static int btrfs_find_all_roots_safe(struct btrfs_backref_walk_ctx *ctx)
{
	const u64 orig_bytenr = ctx->bytenr;
	const bool orig_skip_inode_ref_list = ctx->skip_inode_ref_list;
	bool roots_ulist_allocated = false;
	struct ulist_iterator uiter;

	ASSERT(ctx->refs == NULL);

	ctx->refs = ulist_alloc(GFP_NOFS);

		ctx->roots = ulist_alloc(GFP_NOFS);

			ulist_free(ctx->refs);

		roots_ulist_allocated = true;

	ctx->skip_inode_ref_list = true;

	ULIST_ITER_INIT(&uiter);

		struct ulist_node *node;

		ret = find_parent_nodes(ctx, NULL);
		if (ret < 0 && ret != -ENOENT) {
			if (roots_ulist_allocated) {
				ulist_free(ctx->roots);

		node = ulist_next(ctx->refs, &uiter);

		ctx->bytenr = node->val;

	ulist_free(ctx->refs);

	ctx->bytenr = orig_bytenr;
	ctx->skip_inode_ref_list = orig_skip_inode_ref_list;
int btrfs_find_all_roots(struct btrfs_backref_walk_ctx *ctx,
			 bool skip_commit_root_sem)
{
	if (!ctx->trans && !skip_commit_root_sem)
		down_read(&ctx->fs_info->commit_root_sem);
	ret = btrfs_find_all_roots_safe(ctx);
	if (!ctx->trans && !skip_commit_root_sem)
		up_read(&ctx->fs_info->commit_root_sem);
struct btrfs_backref_share_check_ctx *btrfs_alloc_backref_share_check_ctx(void)
{
	struct btrfs_backref_share_check_ctx *ctx;

	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);

	ulist_init(&ctx->refs);

void btrfs_free_backref_share_ctx(struct btrfs_backref_share_check_ctx *ctx)
{
	ulist_release(&ctx->refs);
/*
 * Check if a data extent is shared or not.
 *
 * @inode:       The inode whose extent we are checking.
 * @bytenr:      Logical bytenr of the extent we are checking.
 * @extent_gen:  Generation of the extent (file extent item) or 0 if it is
 *               unknown.
 * @ctx:         A backref sharedness check context.
 *
 * btrfs_is_data_extent_shared uses the backref walking code but will short
 * circuit as soon as it finds a root or inode that doesn't match the
 * one passed in. This provides a significant performance benefit for
 * callers (such as fiemap) which want to know whether the extent is
 * shared but do not need a ref count.
 *
 * This attempts to attach to the running transaction in order to account for
 * delayed refs, but continues on even when no running transaction exists.
 *
 * Return: 0 if extent is not shared, 1 if it is shared, < 0 on error.
 */
int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr,
				u64 extent_gen,
				struct btrfs_backref_share_check_ctx *ctx)
{
	struct btrfs_backref_walk_ctx walk_ctx = { 0 };
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_trans_handle *trans;
	struct ulist_iterator uiter;
	struct ulist_node *node;
	struct btrfs_seq_list elem = BTRFS_SEQ_LIST_INIT(elem);
	struct share_check shared = {
		.inum = btrfs_ino(inode),
		.data_bytenr = bytenr,
		.data_extent_gen = extent_gen,
		.self_ref_count = 0,
		.have_delayed_delete_refs = false,
	};
	bool leaf_is_shared;
	for (int i = 0; i < BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE; i++) {
		if (ctx->prev_extents_cache[i].bytenr == bytenr)
			return ctx->prev_extents_cache[i].is_shared;

	ulist_init(&ctx->refs);
	trans = btrfs_join_transaction_nostart(root);
	if (IS_ERR(trans)) {
		if (PTR_ERR(trans) != -ENOENT && PTR_ERR(trans) != -EROFS) {
			ret = PTR_ERR(trans);

		down_read(&fs_info->commit_root_sem);

		btrfs_get_tree_mod_seq(fs_info, &elem);
		walk_ctx.time_seq = elem.seq;

	ctx->use_path_cache = true;

	/*
	 * We may have previously determined that the current leaf is shared.
	 * If it is, then we have a data extent that is shared due to a shared
	 * subtree (caused by snapshotting) and we don't need to check for data
	 * backrefs. If the leaf is not shared, then we must do backref walking
	 * to determine if the data extent is shared through reflinks.
	 */
	leaf_cached = lookup_backref_shared_cache(ctx, root,
						  ctx->curr_leaf_bytenr, 0,

	if (leaf_cached && leaf_is_shared) {

	walk_ctx.skip_inode_ref_list = true;
	walk_ctx.trans = trans;
	walk_ctx.fs_info = fs_info;
	walk_ctx.refs = &ctx->refs;

	/* -1 means we are in the bytenr of the data extent. */
	ULIST_ITER_INIT(&uiter);

		const unsigned long prev_ref_count = ctx->refs.nnodes;

		walk_ctx.bytenr = bytenr;
		ret = find_parent_nodes(&walk_ctx, &shared);
		if (ret == BACKREF_FOUND_SHARED ||
		    ret == BACKREF_FOUND_NOT_SHARED) {
			/* If shared must return 1, otherwise return 0. */
			ret = (ret == BACKREF_FOUND_SHARED) ? 1 : 0;

				store_backref_shared_cache(ctx, root, bytenr,

		if (ret < 0 && ret != -ENOENT)

		/*
		 * More than one extent buffer (bytenr) may have been added to
		 * the ctx->refs ulist, in which case we have to check multiple
		 * tree paths in case the first one is not shared, so we can not
		 * use the path cache which is made for a single path. Multiple
		 * extent buffers at the current level happen when:
		 *
		 * 1) level -1, the data extent: If our data extent was not
		 *    directly shared (without multiple reference items), then
		 *    it might have a single reference item with a count > 1 for
		 *    the same offset, which means there are 2 (or more) file
		 *    extent items that point to the data extent - this happens
		 *    when a file extent item needs to be split and then one
		 *    item gets moved to another leaf due to a b+tree leaf split
		 *    when inserting some item. In this case the file extent
		 *    items may be located in different leaves and therefore
		 *    some of the leaves may be referenced through shared
		 *    subtrees while others are not. Since our extent buffer
		 *    cache only works for a single path (by far the most common
		 *    case and simpler to deal with), we can not use it if we
		 *    have multiple leaves (which implies multiple paths).
		 *
		 * 2) level >= 0, a tree node/leaf: We can have a mix of direct
		 *    and indirect references on a b+tree node/leaf, so we have
		 *    to check multiple paths, and the extent buffer (the
		 *    current bytenr) may be shared or not. One example is
		 *    during relocation as we may get a shared tree block ref
		 *    (direct ref) and a non-shared tree block ref (indirect
		 *    ref) for the same node/leaf.
		 */
		if ((ctx->refs.nnodes - prev_ref_count) > 1)
			ctx->use_path_cache = false;

			store_backref_shared_cache(ctx, root, bytenr,

		node = ulist_next(&ctx->refs, &uiter);
		if (ctx->use_path_cache) {

			cached = lookup_backref_shared_cache(ctx, root, bytenr,

				ret = (is_shared ? 1 : 0);

		shared.share_count = 0;
		shared.have_delayed_delete_refs = false;
	/*
	 * If the path cache is disabled, then it means at some tree level we
	 * got multiple parents due to a mix of direct and indirect backrefs or
	 * multiple leaves with file extent items pointing to the same data
	 * extent. We have to invalidate the cache and cache only the sharedness
	 * result for the levels where we got only one node/reference.
	 */
	if (!ctx->use_path_cache) {

		if (ret >= 0 && level >= 0) {
			bytenr = ctx->path_cache_entries[level].bytenr;
			ctx->use_path_cache = true;
			store_backref_shared_cache(ctx, root, bytenr, level, ret);

		for ( ; i < BTRFS_MAX_LEVEL; i++)
			ctx->path_cache_entries[i].bytenr = 0;
	/*
	 * Cache the sharedness result for the data extent if we know our inode
	 * has more than 1 file extent item that refers to the data extent.
	 */
	if (ret >= 0 && shared.self_ref_count > 1) {
		int slot = ctx->prev_extents_cache_slot;

		ctx->prev_extents_cache[slot].bytenr = shared.data_bytenr;
		ctx->prev_extents_cache[slot].is_shared = (ret == 1);

		slot = (slot + 1) % BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE;
		ctx->prev_extents_cache_slot = slot;

		btrfs_put_tree_mod_seq(fs_info, &elem);
		btrfs_end_transaction(trans);

		up_read(&fs_info->commit_root_sem);

	ulist_release(&ctx->refs);
	ctx->prev_leaf_bytenr = ctx->curr_leaf_bytenr;
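
/*
 * Two caches back this check: the per-level path cache
 * (ctx->path_cache_entries), which is only valid while walking a single path,
 * and the small prev_extents_cache ring, which remembers the final sharedness
 * verdict for recently checked data extents that have multiple file extent
 * items in our inode.
 */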
int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
			  u64 start_off, struct btrfs_path *path,
			  struct btrfs_inode_extref **ret_extref,
			  u64 *found_off)
{
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_inode_extref *extref;
	const struct extent_buffer *leaf;

	key.objectid = inode_objectid;
	key.type = BTRFS_INODE_EXTREF_KEY;
	key.offset = start_off;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);

		leaf = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(leaf)) {
			/*
			 * If the item at offset is not found,
			 * btrfs_search_slot will point us to the slot
			 * where it should be inserted. In our case
			 * that will be the slot directly before the
			 * next INODE_REF_KEY_V2 item. In the case
			 * that we're pointing to the last slot in a
			 * leaf, we must move one leaf over.
			 */
			ret = btrfs_next_leaf(root, path);

		btrfs_item_key_to_cpu(leaf, &found_key, slot);

		/*
		 * Check that we're still looking at an extended ref key for
		 * this particular objectid. If we have different
		 * objectid or type then there are no more to be found
		 * in the tree and we can exit.
		 */
		if (found_key.objectid != inode_objectid)

		if (found_key.type != BTRFS_INODE_EXTREF_KEY)

		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		extref = (struct btrfs_inode_extref *)ptr;
		*ret_extref = extref;

			*found_off = found_key.offset;
/*
 * This iterates to turn a name (from iref/extref) into a full filesystem path.
 * Elements of the path are separated by '/' and the path is guaranteed to be
 * 0-terminated. The path is only given within the current file system.
 * Therefore, it never starts with a '/'. The caller is responsible to provide
 * "size" bytes in "dest". The dest buffer will be filled backwards. Finally,
 * the start point of the resulting string is returned. This pointer is within
 * dest, normally.
 * In case the path buffer would overflow, the pointer is decremented further
 * as if output was written to the buffer, though no more output is actually
 * generated. That way, the caller can determine how much space would be
 * required for the path to fit into the buffer. In that case, the returned
 * value will be smaller than dest. Callers must check this!
 */
char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
			u32 name_len, unsigned long name_off,
			struct extent_buffer *eb_in, u64 parent,
			char *dest, u32 size)
{
	int slot;
	u64 next_inum;
	int ret;
	s64 bytes_left = ((s64)size) - 1;
	struct extent_buffer *eb = eb_in;
	struct btrfs_key found_key;
	struct btrfs_inode_ref *iref;

	if (bytes_left >= 0)
		dest[bytes_left] = '\0';

	while (1) {
		bytes_left -= name_len;
		if (bytes_left >= 0)
			read_extent_buffer(eb, dest + bytes_left,
					   name_off, name_len);
		if (eb != eb_in) {
			if (!path->skip_locking)
				btrfs_tree_read_unlock(eb);
			free_extent_buffer(eb);
		}
		ret = btrfs_find_item(fs_root, path, parent, 0,
				      BTRFS_INODE_REF_KEY, &found_key);
		if (ret > 0)
			ret = -ENOENT;
		if (ret)
			break;

		next_inum = found_key.offset;

		/* regular exit ahead */
		if (parent == next_inum)
			break;

		slot = path->slots[0];
		eb = path->nodes[0];
		/* make sure we can use eb after releasing the path */
		if (eb != eb_in) {
			path->nodes[0] = NULL;
			path->locks[0] = 0;
		}
		btrfs_release_path(path);
		iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);

		name_len = btrfs_inode_ref_name_len(eb, iref);
		name_off = (unsigned long)(iref + 1);

		parent = next_inum;
		--bytes_left;
		if (bytes_left >= 0)
			dest[bytes_left] = '/';
	}

	btrfs_release_path(path);

	if (ret)
		return ERR_PTR(ret);

	return dest + bytes_left;
}
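
/*
 * A minimal usage sketch (hypothetical caller, not part of this file): the
 * returned pointer must be compared against the start of the buffer to detect
 * truncation, as described in the comment above:
 *
 *	char buf[256];
 *	char *p;
 *
 *	p = btrfs_ref_to_path(fs_root, path, name_len, name_off, eb, parent,
 *			      buf, sizeof(buf));
 *	if (IS_ERR(p))
 *		return PTR_ERR(p);
 *	if (p < buf)
 *		// truncated: buf + sizeof(buf) - p bytes would be needed
 *	else
 *		// p is the 0-terminated path, filled backwards inside buf
 */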
/*
 * this makes the path point to (logical EXTENT_ITEM *)
 * returns BTRFS_EXTENT_FLAG_DATA for data, BTRFS_EXTENT_FLAG_TREE_BLOCK for
 * tree blocks and <0 on error.
 */
int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
			struct btrfs_path *path, struct btrfs_key *found_key,
			u64 *flags_ret)
{
	struct btrfs_root *extent_root = btrfs_extent_root(fs_info, logical);
	int ret;
	u64 flags;
	u64 size = 0;
	u32 item_size;
	const struct extent_buffer *eb;
	struct btrfs_extent_item *ei;
	struct btrfs_key key;

	if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
		key.type = BTRFS_METADATA_ITEM_KEY;
	else
		key.type = BTRFS_EXTENT_ITEM_KEY;
	key.objectid = logical;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret < 0)
		return ret;
	if (ret == 0) {
		/*
		 * Key with offset -1 found, there would have to exist an extent
		 * item with such offset, but this is out of the valid range.
		 */
		return -EUCLEAN;
	}

	ret = btrfs_previous_extent_item(extent_root, path, 0);
	if (ret) {
		if (ret > 0)
			ret = -ENOENT;
		return ret;
	}
	btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
	if (found_key->type == BTRFS_METADATA_ITEM_KEY)
		size = fs_info->nodesize;
	else if (found_key->type == BTRFS_EXTENT_ITEM_KEY)
		size = found_key->offset;

	if (found_key->objectid > logical ||
	    found_key->objectid + size <= logical) {
		btrfs_debug(fs_info,
			    "logical %llu is not within any extent", logical);
		return -ENOENT;
	}

	eb = path->nodes[0];
	item_size = btrfs_item_size(eb, path->slots[0]);

	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
	flags = btrfs_extent_flags(eb, ei);

	btrfs_debug(fs_info,
		"logical %llu is at position %llu within the extent (%llu EXTENT_ITEM %llu) flags %#llx size %u",
		logical, logical - found_key->objectid, found_key->objectid,
		found_key->offset, flags, item_size);

	WARN_ON(!flags_ret);
	if (flags_ret) {
		if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
			*flags_ret = BTRFS_EXTENT_FLAG_TREE_BLOCK;
		else if (flags & BTRFS_EXTENT_FLAG_DATA)
			*flags_ret = BTRFS_EXTENT_FLAG_DATA;
		else
			BUG();
		return 0;
	}

	return -EIO;
}
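
/*
 * A minimal usage sketch (hypothetical caller, not part of this file):
 *
 *	struct btrfs_key found_key;
 *	u64 flags = 0;
 *	int ret;
 *
 *	ret = extent_from_logical(fs_info, logical, path, &found_key, &flags);
 *	btrfs_release_path(path);
 *	if (ret < 0)
 *		return ret;
 *	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
 *		// metadata extent: resolve with tree_backref_for_extent()
 *	else
 *		// data extent: found_key.objectid is the extent bytenr
 */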
/*
 * helper function to iterate extent inline refs. ptr must point to a 0 value
 * for the first call and may be modified. it is used to track state.
 * if more refs exist, 0 is returned and the next call to
 * get_extent_inline_ref must pass the modified ptr parameter to get the
 * next ref. after the last ref was processed, 1 is returned.
 * returns <0 on error
 */
static int get_extent_inline_ref(unsigned long *ptr,
				 const struct extent_buffer *eb,
				 const struct btrfs_key *key,
				 const struct btrfs_extent_item *ei,
				 u32 item_size,
				 struct btrfs_extent_inline_ref **out_eiref,
				 int *out_type)
{
	unsigned long end;
	u64 flags;
	struct btrfs_tree_block_info *info;

	if (!*ptr) {
		/* first call */
		flags = btrfs_extent_flags(eb, ei);
		if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
			if (key->type == BTRFS_METADATA_ITEM_KEY) {
				/* a skinny metadata extent */
				*out_eiref =
				     (struct btrfs_extent_inline_ref *)(ei + 1);
			} else {
				WARN_ON(key->type != BTRFS_EXTENT_ITEM_KEY);
				info = (struct btrfs_tree_block_info *)(ei + 1);
				*out_eiref =
				   (struct btrfs_extent_inline_ref *)(info + 1);
			}
		} else {
			*out_eiref = (struct btrfs_extent_inline_ref *)(ei + 1);
		}
		*ptr = (unsigned long)*out_eiref;
		if ((unsigned long)(*ptr) >= (unsigned long)ei + item_size)
			return -ENOENT;
	}

	end = (unsigned long)ei + item_size;
	*out_eiref = (struct btrfs_extent_inline_ref *)(*ptr);
	*out_type = btrfs_get_extent_inline_ref_type(eb, *out_eiref,
						     BTRFS_REF_TYPE_ANY);
	if (*out_type == BTRFS_REF_TYPE_INVALID)
		return -EUCLEAN;

	*ptr += btrfs_extent_inline_ref_size(*out_type);
	WARN_ON(*ptr > end);
	if (*ptr == end)
		return 1; /* last */

	return 0;
}
/*
 * reads the tree block backref for an extent. tree level and root are returned
 * through out_level and out_root. ptr must point to a 0 value for the first
 * call and may be modified (see get_extent_inline_ref comment).
 * returns 0 if data was provided, 1 if there was no more data to provide or
 * <0 on error.
 */
int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
			    struct btrfs_key *key, struct btrfs_extent_item *ei,
			    u32 item_size, u64 *out_root, u8 *out_level)
{
	int ret;
	int type;
	struct btrfs_extent_inline_ref *eiref;

	if (*ptr == (unsigned long)-1)
		return 1;

	while (1) {
		ret = get_extent_inline_ref(ptr, eb, key, ei, item_size,
					    &eiref, &type);
		if (ret < 0)
			return ret;

		if (type == BTRFS_TREE_BLOCK_REF_KEY ||
		    type == BTRFS_SHARED_BLOCK_REF_KEY)
			break;

		if (ret == 1)
			return 1;
	}

	/* we can treat both ref types equally here */
	*out_root = btrfs_extent_inline_ref_offset(eb, eiref);

	if (key->type == BTRFS_EXTENT_ITEM_KEY) {
		struct btrfs_tree_block_info *info;

		info = (struct btrfs_tree_block_info *)(ei + 1);
		*out_level = btrfs_tree_block_level(eb, info);
	} else {
		ASSERT(key->type == BTRFS_METADATA_ITEM_KEY);
		*out_level = (u8)key->offset;
	}

	if (ret == 1)
		*ptr = (unsigned long)-1;

	return 0;
}
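
/*
 * A minimal usage sketch (hypothetical caller, not part of this file):
 * resolve every tree backref of a metadata extent located by
 * extent_from_logical() above.
 *
 *	unsigned long ptr = 0;
 *	u64 root_id;
 *	u8 level;
 *	int ret;
 *
 *	do {
 *		ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
 *					      item_size, &root_id, &level);
 *		if (ret < 0)
 *			return ret;
 *		if (ret == 0)
 *			// the block is at 'level' in the tree 'root_id'
 *	} while (ret == 0);
 */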
static int iterate_leaf_refs(struct btrfs_fs_info *fs_info,
			     struct extent_inode_elem *inode_list,
			     u64 root, u64 extent_item_objectid,
			     iterate_extent_inodes_t *iterate, void *ctx)
{
	struct extent_inode_elem *eie;
	int ret = 0;

	for (eie = inode_list; eie; eie = eie->next) {
		btrfs_debug(fs_info,
			    "ref for %llu resolved, key (%llu EXTENT_DATA %llu), root %llu",
			    extent_item_objectid, eie->inum,
			    eie->offset, root);
		ret = iterate(eie->inum, eie->offset, eie->num_bytes, root, ctx);
		if (ret) {
			btrfs_debug(fs_info,
				    "stopping iteration for %llu due to ret=%d",
				    extent_item_objectid, ret);
			break;
		}
	}

	return ret;
}
/*
 * calls iterate() for every inode that references the extent identified by
 * the given parameters.
 * when the iterator function returns a non-zero value, iteration stops.
 */
int iterate_extent_inodes(struct btrfs_backref_walk_ctx *ctx,
			  bool search_commit_root,
			  iterate_extent_inodes_t *iterate, void *user_ctx)
{
	int ret;
	struct ulist *refs;
	struct ulist_node *ref_node;
	struct btrfs_seq_list seq_elem = BTRFS_SEQ_LIST_INIT(seq_elem);
	struct ulist_iterator ref_uiter;

	btrfs_debug(ctx->fs_info, "resolving all inodes for extent %llu",
		    ctx->bytenr);

	ASSERT(ctx->trans == NULL);
	ASSERT(ctx->roots == NULL);

	if (!search_commit_root) {
		struct btrfs_trans_handle *trans;

		trans = btrfs_attach_transaction(ctx->fs_info->tree_root);
		if (IS_ERR(trans)) {
			if (PTR_ERR(trans) != -ENOENT &&
			    PTR_ERR(trans) != -EROFS)
				return PTR_ERR(trans);
			trans = NULL;
		}
		ctx->trans = trans;
	}

	if (ctx->trans) {
		btrfs_get_tree_mod_seq(ctx->fs_info, &seq_elem);
		ctx->time_seq = seq_elem.seq;
	} else {
		down_read(&ctx->fs_info->commit_root_sem);
	}

	ret = btrfs_find_all_leafs(ctx);
	if (ret)
		goto out;
	refs = ctx->refs;
	ctx->refs = NULL;

	ULIST_ITER_INIT(&ref_uiter);
	while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) {
		const u64 leaf_bytenr = ref_node->val;
		struct ulist_node *root_node;
		struct ulist_iterator root_uiter;
		struct extent_inode_elem *inode_list;

		inode_list = (struct extent_inode_elem *)(uintptr_t)ref_node->aux;

		if (ctx->cache_lookup) {
			const u64 *root_ids;
			int root_count;
			bool cached;

			cached = ctx->cache_lookup(leaf_bytenr, ctx->user_ctx,
						   &root_ids, &root_count);
			if (cached) {
				for (int i = 0; i < root_count; i++) {
					ret = iterate_leaf_refs(ctx->fs_info,
								inode_list,
								root_ids[i],
								leaf_bytenr,
								iterate,
								user_ctx);
					if (ret)
						break;
				}
				continue;
			}
		}

		if (!ctx->roots) {
			ctx->roots = ulist_alloc(GFP_NOFS);
			if (!ctx->roots) {
				ret = -ENOMEM;
				break;
			}
		}

		ctx->bytenr = leaf_bytenr;
		ret = btrfs_find_all_roots_safe(ctx);
		if (ret)
			break;

		if (ctx->cache_store)
			ctx->cache_store(leaf_bytenr, ctx->roots, ctx->user_ctx);

		ULIST_ITER_INIT(&root_uiter);
		while (!ret && (root_node = ulist_next(ctx->roots, &root_uiter))) {
			btrfs_debug(ctx->fs_info,
				    "root %llu references leaf %llu, data list %#llx",
				    root_node->val, ref_node->val,
				    ref_node->aux);
			ret = iterate_leaf_refs(ctx->fs_info, inode_list,
						root_node->val, ctx->bytenr,
						iterate, user_ctx);
		}
		ulist_reinit(ctx->roots);
	}

	free_leaf_list(refs);
out:
	if (ctx->trans) {
		btrfs_put_tree_mod_seq(ctx->fs_info, &seq_elem);
		btrfs_end_transaction(ctx->trans);
		ctx->trans = NULL;
	} else {
		up_read(&ctx->fs_info->commit_root_sem);
	}

	ulist_free(ctx->roots);
	ctx->roots = NULL;

	if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP)
		ret = 0;

	return ret;
}
static int build_ino_list(u64 inum, u64 offset, u64 num_bytes, u64 root, void *ctx)
{
	struct btrfs_data_container *inodes = ctx;
	const size_t c = 3 * sizeof(u64);

	if (inodes->bytes_left >= c) {
		inodes->bytes_left -= c;
		inodes->val[inodes->elem_cnt] = inum;
		inodes->val[inodes->elem_cnt + 1] = offset;
		inodes->val[inodes->elem_cnt + 2] = root;
		inodes->elem_cnt += 3;
	} else {
		inodes->bytes_missing += c - inodes->bytes_left;
		inodes->bytes_left = 0;
		inodes->elem_missed += 3;
	}

	return 0;
}
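
/*
 * The container built above packs one (inum, offset, root) triplet of u64s per
 * reference into val[]. A hypothetical consumer (not part of this file) would
 * read it back like this:
 *
 *	for (i = 0; i < inodes->elem_cnt; i += 3) {
 *		u64 inum   = inodes->val[i];
 *		u64 offset = inodes->val[i + 1];
 *		u64 root   = inodes->val[i + 2];
 *		// one inode/offset/root that references the extent
 *	}
 */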
int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
				struct btrfs_path *path,
				void *ctx, bool ignore_offset)
{
	struct btrfs_backref_walk_ctx walk_ctx = { 0 };
	int ret;
	u64 flags = 0;
	struct btrfs_key found_key;
	int search_commit_root = path->search_commit_root;

	ret = extent_from_logical(fs_info, logical, path, &found_key, &flags);
	btrfs_release_path(path);
	if (ret < 0)
		return ret;
	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
		return -EINVAL;

	walk_ctx.bytenr = found_key.objectid;
	if (ignore_offset)
		walk_ctx.ignore_extent_item_pos = true;
	else
		walk_ctx.extent_item_pos = logical - found_key.objectid;
	walk_ctx.fs_info = fs_info;

	return iterate_extent_inodes(&walk_ctx, search_commit_root,
				     build_ino_list, ctx);
}
static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
			 struct extent_buffer *eb, struct inode_fs_paths *ipath);
static int iterate_inode_refs(u64 inum, struct inode_fs_paths *ipath)
{
	int ret = 0;
	int slot;
	u32 cur;
	u32 len;
	u32 name_len;
	u64 parent = 0;
	int found = 0;
	struct btrfs_root *fs_root = ipath->fs_root;
	struct btrfs_path *path = ipath->btrfs_path;
	struct extent_buffer *eb;
	struct btrfs_inode_ref *iref;
	struct btrfs_key found_key;

	while (!ret) {
		ret = btrfs_find_item(fs_root, path, inum,
				      parent ? parent + 1 : 0, BTRFS_INODE_REF_KEY,
				      &found_key);
		if (ret < 0)
			break;
		if (ret) {
			ret = found ? 0 : -ENOENT;
			break;
		}
		++found;

		parent = found_key.offset;
		slot = path->slots[0];
		eb = btrfs_clone_extent_buffer(path->nodes[0]);
		if (!eb) {
			ret = -ENOMEM;
			break;
		}
		btrfs_release_path(path);

		iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);

		for (cur = 0; cur < btrfs_item_size(eb, slot); cur += len) {
			name_len = btrfs_inode_ref_name_len(eb, iref);
			/* path must be released before calling iterate()! */
			btrfs_debug(fs_root->fs_info,
				    "following ref at offset %u for inode %llu in tree %llu",
				    cur, found_key.objectid,
				    btrfs_root_id(fs_root));
			ret = inode_to_path(parent, name_len,
					    (unsigned long)(iref + 1), eb, ipath);
			if (ret)
				break;
			len = sizeof(*iref) + name_len;
			iref = (struct btrfs_inode_ref *)((char *)iref + len);
		}
		free_extent_buffer(eb);
	}

	btrfs_release_path(path);

	return ret;
}
static int iterate_inode_extrefs(u64 inum, struct inode_fs_paths *ipath)
{
	int ret;
	int slot;
	u64 offset = 0;
	u64 parent;
	int found = 0;
	struct btrfs_root *fs_root = ipath->fs_root;
	struct btrfs_path *path = ipath->btrfs_path;
	struct extent_buffer *eb;
	struct btrfs_inode_extref *extref;
	u32 item_size;
	u32 cur_offset;
	unsigned long ptr;

	while (1) {
		ret = btrfs_find_one_extref(fs_root, inum, offset, path, &extref,
					    &offset);
		if (ret < 0)
			break;
		if (ret) {
			ret = found ? 0 : -ENOENT;
			break;
		}
		++found;

		slot = path->slots[0];
		eb = btrfs_clone_extent_buffer(path->nodes[0]);
		if (!eb) {
			ret = -ENOMEM;
			break;
		}
		btrfs_release_path(path);

		item_size = btrfs_item_size(eb, slot);
		ptr = btrfs_item_ptr_offset(eb, slot);
		cur_offset = 0;

		while (cur_offset < item_size) {
			u32 name_len;

			extref = (struct btrfs_inode_extref *)(ptr + cur_offset);
			parent = btrfs_inode_extref_parent(eb, extref);
			name_len = btrfs_inode_extref_name_len(eb, extref);
			ret = inode_to_path(parent, name_len,
					    (unsigned long)&extref->name, eb, ipath);
			if (ret)
				break;

			cur_offset += btrfs_inode_extref_name_len(eb, extref);
			cur_offset += sizeof(*extref);
		}
		free_extent_buffer(eb);

		offset++;
	}

	btrfs_release_path(path);

	return ret;
}
/*
 * returns 0 if the path could be dumped (probably truncated)
 * returns <0 in case of an error
 */
static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
			 struct extent_buffer *eb, struct inode_fs_paths *ipath)
{
	char *fspath;
	char *fspath_min;
	int i = ipath->fspath->elem_cnt;
	const int s_ptr = sizeof(char *);
	u32 bytes_left;

	bytes_left = ipath->fspath->bytes_left > s_ptr ?
					ipath->fspath->bytes_left - s_ptr : 0;

	fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr;
	fspath = btrfs_ref_to_path(ipath->fs_root, ipath->btrfs_path, name_len,
				   name_off, eb, inum, fspath_min, bytes_left);
	if (IS_ERR(fspath))
		return PTR_ERR(fspath);

	if (fspath > fspath_min) {
		ipath->fspath->val[i] = (u64)(unsigned long)fspath;
		++ipath->fspath->elem_cnt;
		ipath->fspath->bytes_left = fspath - fspath_min;
	} else {
		++ipath->fspath->elem_missed;
		ipath->fspath->bytes_missing += fspath_min - fspath;
		ipath->fspath->bytes_left = 0;
	}

	return 0;
}
/*
 * this dumps all file system paths to the inode into the ipath struct, provided
 * it has been created large enough. each path is zero-terminated and accessed
 * from ipath->fspath->val[i].
 * when it returns, there are ipath->fspath->elem_cnt number of paths available
 * in ipath->fspath->val[]. when the allocated space wasn't sufficient, the
 * number of missed paths is recorded in ipath->fspath->elem_missed, otherwise,
 * it's zero. ipath->fspath->bytes_missing holds the number of bytes that would
 * have been needed to return all paths.
 */
int paths_from_inode(u64 inum, struct inode_fs_paths *ipath)
{
	int ret;
	int found_refs = 0;

	ret = iterate_inode_refs(inum, ipath);
	if (!ret)
		++found_refs;
	else if (ret != -ENOENT)
		return ret;

	ret = iterate_inode_extrefs(inum, ipath);
	if (ret == -ENOENT && found_refs)
		return 0;

	return ret;
}
struct btrfs_data_container *init_data_container(u32 total_bytes)
{
	struct btrfs_data_container *data;
	size_t alloc_bytes;

	alloc_bytes = max_t(size_t, total_bytes, sizeof(*data));
	data = kvzalloc(alloc_bytes, GFP_KERNEL);
	if (!data)
		return ERR_PTR(-ENOMEM);

	if (total_bytes >= sizeof(*data))
		data->bytes_left = total_bytes - sizeof(*data);
	else
		data->bytes_missing = sizeof(*data) - total_bytes;

	return data;
}
/*
 * allocates space to return multiple file system paths for an inode.
 * total_bytes to allocate are passed, note that space usable for actual path
 * information will be total_bytes - sizeof(struct inode_fs_paths).
 * the returned pointer must be freed with free_ipath() in the end.
 */
struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
				  struct btrfs_path *path)
{
	struct inode_fs_paths *ifp;
	struct btrfs_data_container *fspath;

	fspath = init_data_container(total_bytes);
	if (IS_ERR(fspath))
		return ERR_CAST(fspath);

	ifp = kmalloc(sizeof(*ifp), GFP_KERNEL);
	if (!ifp) {
		kvfree(fspath);
		return ERR_PTR(-ENOMEM);
	}

	ifp->btrfs_path = path;
	ifp->fspath = fspath;
	ifp->fs_root = fs_root;

	return ifp;
}

void free_ipath(struct inode_fs_paths *ipath)
{
	if (!ipath)
		return;
	kvfree(ipath->fspath);
	kfree(ipath);
}
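
/*
 * A minimal usage sketch (hypothetical caller, not part of this file; error
 * handling trimmed), roughly the pattern an ioctl handler follows: allocate
 * an ipath, collect the paths, then read them out of fspath->val[]:
 *
 *	struct inode_fs_paths *ipath;
 *	int i;
 *
 *	ipath = init_ipath(4096, fs_root, path);
 *	if (IS_ERR(ipath))
 *		return PTR_ERR(ipath);
 *	if (paths_from_inode(inum, ipath) == 0) {
 *		for (i = 0; i < ipath->fspath->elem_cnt; i++)
 *			// (char *)(unsigned long)ipath->fspath->val[i] is one
 *			// 0-terminated path; elem_missed tells how many did
 *			// not fit into the 4096 bytes
 *	}
 *	free_ipath(ipath);
 */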
struct btrfs_backref_iter *btrfs_backref_iter_alloc(struct btrfs_fs_info *fs_info)
{
	struct btrfs_backref_iter *ret;

	ret = kzalloc(sizeof(*ret), GFP_NOFS);
	if (!ret)
		return NULL;

	ret->path = btrfs_alloc_path();
	if (!ret->path) {
		kfree(ret);
		return NULL;
	}

	/* Current backref iterator only supports iteration in commit root */
	ret->path->search_commit_root = 1;
	ret->path->skip_locking = 1;
	ret->fs_info = fs_info;

	return ret;
}
static void btrfs_backref_iter_release(struct btrfs_backref_iter *iter)
{
	iter->bytenr = 0;
	iter->item_ptr = 0;
	iter->cur_ptr = 0;
	iter->end_ptr = 0;
	btrfs_release_path(iter->path);
	memset(&iter->cur_key, 0, sizeof(iter->cur_key));
}
int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr)
{
	struct btrfs_fs_info *fs_info = iter->fs_info;
	struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr);
	struct btrfs_path *path = iter->path;
	struct btrfs_extent_item *ei;
	struct btrfs_key key;
	int ret;

	key.objectid = bytenr;
	key.type = BTRFS_METADATA_ITEM_KEY;
	key.offset = (u64)-1;
	iter->bytenr = bytenr;

	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret < 0)
		return ret;
	if (ret == 0) {
		/*
		 * Key with offset -1 found, there would have to exist an extent
		 * item with such offset, but this is out of the valid range.
		 */
		ret = -EUCLEAN;
		goto release;
	}
	if (path->slots[0] == 0) {
		WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
		ret = -EUCLEAN;
		goto release;
	}
	path->slots[0]--;

	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
	if ((key.type != BTRFS_EXTENT_ITEM_KEY &&
	     key.type != BTRFS_METADATA_ITEM_KEY) || key.objectid != bytenr) {
		ret = -ENOENT;
		goto release;
	}
	memcpy(&iter->cur_key, &key, sizeof(key));
	iter->item_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0],
						    path->slots[0]);
	iter->end_ptr = (u32)(iter->item_ptr +
			btrfs_item_size(path->nodes[0], path->slots[0]));
	ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
			    struct btrfs_extent_item);

	/*
	 * Only support iteration on tree backref yet.
	 *
	 * This is an extra precaution for non skinny-metadata, where
	 * EXTENT_ITEM is also used for tree blocks, that we can only use
	 * extent flags to determine if it's a tree block.
	 */
	if (btrfs_extent_flags(path->nodes[0], ei) & BTRFS_EXTENT_FLAG_DATA) {
		ret = -ENOTSUPP;
		goto release;
	}
	iter->cur_ptr = (u32)(iter->item_ptr + sizeof(*ei));

	/* If there is no inline backref, go search for keyed backref */
	if (iter->cur_ptr >= iter->end_ptr) {
		ret = btrfs_next_item(extent_root, path);

		/* No inline nor keyed ref */
		if (ret > 0) {
			ret = -ENOENT;
			goto release;
		}
		if (ret < 0)
			goto release;

		btrfs_item_key_to_cpu(path->nodes[0], &iter->cur_key,
				      path->slots[0]);
		if (iter->cur_key.objectid != bytenr ||
		    (iter->cur_key.type != BTRFS_SHARED_BLOCK_REF_KEY &&
		     iter->cur_key.type != BTRFS_TREE_BLOCK_REF_KEY)) {
			ret = -ENOENT;
			goto release;
		}
		iter->cur_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0],
							   path->slots[0]);
		iter->item_ptr = iter->cur_ptr;
		iter->end_ptr = (u32)(iter->item_ptr + btrfs_item_size(
				path->nodes[0], path->slots[0]));
	}

	return 0;
release:
	btrfs_backref_iter_release(iter);
	return ret;
}
static bool btrfs_backref_iter_is_inline_ref(struct btrfs_backref_iter *iter)
{
	if (iter->cur_key.type == BTRFS_EXTENT_ITEM_KEY ||
	    iter->cur_key.type == BTRFS_METADATA_ITEM_KEY)
		return true;
	return false;
}
/*
 * Go to the next backref item of current bytenr, can be either inlined or
 * keyed.
 *
 * Caller needs to check whether it's inline ref or not by iter->cur_key.
 *
 * Return 0 if we get next backref without problem.
 * Return >0 if there is no extra backref for this bytenr.
 * Return <0 if something went wrong.
 */
int btrfs_backref_iter_next(struct btrfs_backref_iter *iter)
{
	struct extent_buffer *eb = iter->path->nodes[0];
	struct btrfs_root *extent_root;
	struct btrfs_path *path = iter->path;
	struct btrfs_extent_inline_ref *iref;
	int ret;
	u32 size;

	if (btrfs_backref_iter_is_inline_ref(iter)) {
		/* We're still inside the inline refs */
		ASSERT(iter->cur_ptr < iter->end_ptr);

		if (btrfs_backref_has_tree_block_info(iter)) {
			/* First tree block info */
			size = sizeof(struct btrfs_tree_block_info);
		} else {
			/* Use inline ref type to determine the size */
			int type;

			iref = (struct btrfs_extent_inline_ref *)
				((unsigned long)iter->cur_ptr);
			type = btrfs_extent_inline_ref_type(eb, iref);

			size = btrfs_extent_inline_ref_size(type);
		}
		iter->cur_ptr += size;
		if (iter->cur_ptr < iter->end_ptr)
			return 0;

		/* All inline items iterated, fall through */
	}

	/* We're at keyed items, there is no inline item, go to the next one */
	extent_root = btrfs_extent_root(iter->fs_info, iter->bytenr);
	ret = btrfs_next_item(extent_root, iter->path);
	if (ret)
		return ret;

	btrfs_item_key_to_cpu(path->nodes[0], &iter->cur_key, path->slots[0]);
	if (iter->cur_key.objectid != iter->bytenr ||
	    (iter->cur_key.type != BTRFS_TREE_BLOCK_REF_KEY &&
	     iter->cur_key.type != BTRFS_SHARED_BLOCK_REF_KEY))
		return 1;
	iter->item_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0],
						    path->slots[0]);
	iter->cur_ptr = iter->item_ptr;
	iter->end_ptr = iter->item_ptr + (u32)btrfs_item_size(path->nodes[0],
							      path->slots[0]);
	return 0;
}
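
/*
 * A minimal usage sketch of the backref iterator (hypothetical caller, not
 * part of this file; error handling trimmed). It visits every tree backref of
 * a tree block, which is how btrfs_backref_add_tree_node() below drives the
 * iterator:
 *
 *	struct btrfs_backref_iter *iter;
 *	int ret;
 *
 *	iter = btrfs_backref_iter_alloc(fs_info);
 *	if (!iter)
 *		return -ENOMEM;
 *	ret = btrfs_backref_iter_start(iter, bytenr);
 *	while (ret == 0) {
 *		// iter->cur_key describes the current backref; for inline
 *		// refs iter->cur_ptr locates the btrfs_extent_inline_ref
 *		// within the extent item
 *		ret = btrfs_backref_iter_next(iter);
 *	}
 *	btrfs_free_path(iter->path);
 *	kfree(iter);
 */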
void btrfs_backref_init_cache(struct btrfs_fs_info *fs_info,
			      struct btrfs_backref_cache *cache, bool is_reloc)
{
	int i;

	cache->rb_root = RB_ROOT;
	for (i = 0; i < BTRFS_MAX_LEVEL; i++)
		INIT_LIST_HEAD(&cache->pending[i]);
	INIT_LIST_HEAD(&cache->changed);
	INIT_LIST_HEAD(&cache->detached);
	INIT_LIST_HEAD(&cache->leaves);
	INIT_LIST_HEAD(&cache->pending_edge);
	INIT_LIST_HEAD(&cache->useless_node);
	cache->fs_info = fs_info;
	cache->is_reloc = is_reloc;
}
struct btrfs_backref_node *btrfs_backref_alloc_node(
		struct btrfs_backref_cache *cache, u64 bytenr, int level)
{
	struct btrfs_backref_node *node;

	ASSERT(level >= 0 && level < BTRFS_MAX_LEVEL);
	node = kzalloc(sizeof(*node), GFP_NOFS);
	if (!node)
		return node;

	INIT_LIST_HEAD(&node->list);
	INIT_LIST_HEAD(&node->upper);
	INIT_LIST_HEAD(&node->lower);
	RB_CLEAR_NODE(&node->rb_node);
	cache->nr_nodes++;
	node->level = level;
	node->bytenr = bytenr;

	return node;
}
void btrfs_backref_free_node(struct btrfs_backref_cache *cache,
			     struct btrfs_backref_node *node)
{
	if (node) {
		ASSERT(list_empty(&node->list));
		ASSERT(list_empty(&node->lower));
		ASSERT(node->eb == NULL);
		cache->nr_nodes--;
		btrfs_put_root(node->root);
		kfree(node);
	}
}
struct btrfs_backref_edge *btrfs_backref_alloc_edge(
		struct btrfs_backref_cache *cache)
{
	struct btrfs_backref_edge *edge;

	edge = kzalloc(sizeof(*edge), GFP_NOFS);
	if (edge)
		cache->nr_edges++;
	return edge;
}

void btrfs_backref_free_edge(struct btrfs_backref_cache *cache,
			     struct btrfs_backref_edge *edge)
{
	if (edge) {
		cache->nr_edges--;
		kfree(edge);
	}
}
void btrfs_backref_unlock_node_buffer(struct btrfs_backref_node *node)
{
	if (node->locked) {
		btrfs_tree_unlock(node->eb);
		node->locked = 0;
	}
}

void btrfs_backref_drop_node_buffer(struct btrfs_backref_node *node)
{
	if (node->eb) {
		btrfs_backref_unlock_node_buffer(node);
		free_extent_buffer(node->eb);
		node->eb = NULL;
	}
}
/*
 * Drop the backref node from cache without cleaning up its children
 * edges.
 *
 * This can only be called on node without parent edges.
 * The children edges are still kept as is.
 */
void btrfs_backref_drop_node(struct btrfs_backref_cache *tree,
			     struct btrfs_backref_node *node)
{
	ASSERT(list_empty(&node->upper));

	btrfs_backref_drop_node_buffer(node);
	list_del_init(&node->list);
	list_del_init(&node->lower);
	if (!RB_EMPTY_NODE(&node->rb_node))
		rb_erase(&node->rb_node, &tree->rb_root);
	btrfs_backref_free_node(tree, node);
}
/*
 * Drop the backref node from cache, also cleaning up all its
 * upper edges and any uncached nodes in the path.
 *
 * This cleanup happens bottom up, thus the node should either
 * be the lowest node in the cache or a detached node.
 */
void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache,
				struct btrfs_backref_node *node)
{
	struct btrfs_backref_node *upper;
	struct btrfs_backref_edge *edge;

	if (!node)
		return;

	BUG_ON(!node->lowest && !node->detached);
	while (!list_empty(&node->upper)) {
		edge = list_entry(node->upper.next, struct btrfs_backref_edge,
				  list[LOWER]);
		upper = edge->node[UPPER];
		list_del(&edge->list[LOWER]);
		list_del(&edge->list[UPPER]);
		btrfs_backref_free_edge(cache, edge);

		/*
		 * Add the node to leaf node list if no other child block
		 * cached.
		 */
		if (list_empty(&upper->lower)) {
			list_add_tail(&upper->lower, &cache->leaves);
			upper->lowest = 1;
		}
	}

	btrfs_backref_drop_node(cache, node);
}
/*
 * Release all nodes/edges from current cache
 */
void btrfs_backref_release_cache(struct btrfs_backref_cache *cache)
{
	struct btrfs_backref_node *node;
	int i;

	while (!list_empty(&cache->detached)) {
		node = list_entry(cache->detached.next,
				  struct btrfs_backref_node, list);
		btrfs_backref_cleanup_node(cache, node);
	}

	while (!list_empty(&cache->leaves)) {
		node = list_entry(cache->leaves.next,
				  struct btrfs_backref_node, lower);
		btrfs_backref_cleanup_node(cache, node);
	}

	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		while (!list_empty(&cache->pending[i])) {
			node = list_first_entry(&cache->pending[i],
						struct btrfs_backref_node,
						list);
			btrfs_backref_cleanup_node(cache, node);
		}
	}
	ASSERT(list_empty(&cache->pending_edge));
	ASSERT(list_empty(&cache->useless_node));
	ASSERT(list_empty(&cache->changed));
	ASSERT(list_empty(&cache->detached));
	ASSERT(RB_EMPTY_ROOT(&cache->rb_root));
	ASSERT(!cache->nr_nodes);
	ASSERT(!cache->nr_edges);
}
void btrfs_backref_link_edge(struct btrfs_backref_edge *edge,
			     struct btrfs_backref_node *lower,
			     struct btrfs_backref_node *upper,
			     int link_which)
{
	ASSERT(upper && lower && upper->level == lower->level + 1);
	edge->node[LOWER] = lower;
	edge->node[UPPER] = upper;
	if (link_which & LINK_LOWER)
		list_add_tail(&edge->list[LOWER], &lower->upper);
	if (link_which & LINK_UPPER)
		list_add_tail(&edge->list[UPPER], &upper->lower);
}
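
/*
 * A minimal sketch of how the allocation and linking helpers above fit
 * together (hypothetical snippet, not part of this file; names such as
 * child_bytenr/parent_bytenr are placeholders and error handling is trimmed).
 * handle_direct_tree_backref() below is a real user:
 *
 *	struct btrfs_backref_node *lower, *upper;
 *	struct btrfs_backref_edge *edge;
 *
 *	lower = btrfs_backref_alloc_node(cache, child_bytenr, level);
 *	upper = btrfs_backref_alloc_node(cache, parent_bytenr, level + 1);
 *	edge = btrfs_backref_alloc_edge(cache);
 *	// link only the lower side for now; the upper side is completed
 *	// later by btrfs_backref_finish_upper_links()
 *	btrfs_backref_link_edge(edge, lower, upper, LINK_LOWER);
 */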
/*
 * Handle direct tree backref
 *
 * Direct tree backref means, the backref item shows its parent bytenr
 * directly. This is for SHARED_BLOCK_REF backref (keyed or inlined).
 *
 * @ref_key:	The converted backref key.
 *		For keyed backref, it's the item key.
 *		For inlined backref, objectid is the bytenr,
 *		type is btrfs_inline_ref_type, offset is
 *		btrfs_inline_ref_offset.
 */
static int handle_direct_tree_backref(struct btrfs_backref_cache *cache,
				      struct btrfs_key *ref_key,
				      struct btrfs_backref_node *cur)
{
	struct btrfs_backref_edge *edge;
	struct btrfs_backref_node *upper;
	struct rb_node *rb_node;

	ASSERT(ref_key->type == BTRFS_SHARED_BLOCK_REF_KEY);

	/* Only reloc root uses backref pointing to itself */
	if (ref_key->objectid == ref_key->offset) {
		struct btrfs_root *root;

		cur->is_reloc_root = 1;
		/* Only reloc backref cache cares about a specific root */
		if (cache->is_reloc) {
			root = find_reloc_root(cache->fs_info, cur->bytenr);
			if (!root)
				return -ENOENT;
			cur->root = root;
		} else {
			/*
			 * For generic purpose backref cache, reloc root node
			 * is useless.
			 */
			list_add(&cur->list, &cache->useless_node);
		}
		return 0;
	}

	edge = btrfs_backref_alloc_edge(cache);
	if (!edge)
		return -ENOMEM;

	rb_node = rb_simple_search(&cache->rb_root, ref_key->offset);
	if (!rb_node) {
		/* Parent node not yet cached */
		upper = btrfs_backref_alloc_node(cache, ref_key->offset,
						 cur->level + 1);
		if (!upper) {
			btrfs_backref_free_edge(cache, edge);
			return -ENOMEM;
		}

		/*
		 * Backrefs for the upper level block isn't cached, add the
		 * block to pending list
		 */
		list_add_tail(&edge->list[UPPER], &cache->pending_edge);
	} else {
		/* Parent node already cached */
		upper = rb_entry(rb_node, struct btrfs_backref_node, rb_node);
		ASSERT(upper->checked);
		INIT_LIST_HEAD(&edge->list[UPPER]);
	}
	btrfs_backref_link_edge(edge, cur, upper, LINK_LOWER);
	return 0;
}
/*
 * Handle indirect tree backref
 *
 * Indirect tree backref means, we only know which tree the node belongs to.
 * We still need to do a tree search to find out the parents. This is for
 * TREE_BLOCK_REF backref (keyed or inlined).
 *
 * @trans:	Transaction handle.
 * @ref_key:	The same as @ref_key in handle_direct_tree_backref()
 * @tree_key:	The first key of this tree block.
 * @path:	A clean (released) path, to avoid allocating path every time
 *		the function gets called.
 */
static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans,
					struct btrfs_backref_cache *cache,
					struct btrfs_path *path,
					struct btrfs_key *ref_key,
					struct btrfs_key *tree_key,
					struct btrfs_backref_node *cur)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct btrfs_backref_node *upper;
	struct btrfs_backref_node *lower;
	struct btrfs_backref_edge *edge;
	struct extent_buffer *eb;
	struct btrfs_root *root;
	struct rb_node *rb_node;
	int level;
	bool need_check = true;
	int ret;

	root = btrfs_get_fs_root(fs_info, ref_key->offset, false);
	if (IS_ERR(root))
		return PTR_ERR(root);
	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
		cur->cowonly = 1;

	if (btrfs_root_level(&root->root_item) == cur->level) {
		/* Tree root */
		ASSERT(btrfs_root_bytenr(&root->root_item) == cur->bytenr);
		/*
		 * For reloc backref cache, we may ignore reloc root. But for
		 * general purpose backref cache, we can't rely on
		 * btrfs_should_ignore_reloc_root() as it may conflict with
		 * current running relocation and lead to missing root.
		 *
		 * For general purpose backref cache, reloc root detection is
		 * completely relying on direct backref (key->offset is parent
		 * bytenr), thus only do such check for reloc cache.
		 */
		if (btrfs_should_ignore_reloc_root(root) && cache->is_reloc) {
			btrfs_put_root(root);
			list_add(&cur->list, &cache->useless_node);
		} else {
			cur->root = root;
		}
		return 0;
	}

	level = cur->level + 1;

	/* Search the tree to find parent blocks referring to the block */
	path->search_commit_root = 1;
	path->skip_locking = 1;
	path->lowest_level = level;
	ret = btrfs_search_slot(NULL, root, tree_key, path, 0, 0);
	path->lowest_level = 0;
	if (ret < 0) {
		btrfs_put_root(root);
		return ret;
	}
	if (ret > 0 && path->slots[level] > 0)
		path->slots[level]--;

	eb = path->nodes[level];
	if (btrfs_node_blockptr(eb, path->slots[level]) != cur->bytenr) {
		btrfs_err(fs_info,
"couldn't find block (%llu) (level %d) in tree (%llu) with key (%llu %u %llu)",
			  cur->bytenr, level - 1, btrfs_root_id(root),
			  tree_key->objectid, tree_key->type, tree_key->offset);
		btrfs_put_root(root);
		ret = -ENOENT;
		goto out;
	}
	lower = cur;

	/* Add all nodes and edges in the path */
	for (; level < BTRFS_MAX_LEVEL; level++) {
		if (!path->nodes[level]) {
			ASSERT(btrfs_root_bytenr(&root->root_item) ==
			       lower->bytenr);
			/* Same as previous should_ignore_reloc_root() call */
			if (btrfs_should_ignore_reloc_root(root) &&
			    cache->is_reloc) {
				btrfs_put_root(root);
				list_add(&lower->list, &cache->useless_node);
			} else {
				lower->root = root;
			}
			break;
		}

		edge = btrfs_backref_alloc_edge(cache);
		if (!edge) {
			btrfs_put_root(root);
			ret = -ENOMEM;
			goto out;
		}

		eb = path->nodes[level];
		rb_node = rb_simple_search(&cache->rb_root, eb->start);
		if (!rb_node) {
			upper = btrfs_backref_alloc_node(cache, eb->start,
							 lower->level + 1);
			if (!upper) {
				btrfs_put_root(root);
				btrfs_backref_free_edge(cache, edge);
				ret = -ENOMEM;
				goto out;
			}
			upper->owner = btrfs_header_owner(eb);
			if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
				upper->cowonly = 1;

			/*
			 * If we know the block isn't shared we can avoid
			 * checking its backrefs.
			 */
			if (btrfs_block_can_be_shared(trans, root, eb))
				upper->checked = 0;
			else
				upper->checked = 1;

			/*
			 * Add the block to pending list if we need to check its
			 * backrefs, we only do this once while walking up a
			 * tree as we will catch anything else later on.
			 */
			if (!upper->checked && need_check) {
				need_check = false;
				list_add_tail(&edge->list[UPPER],
					      &cache->pending_edge);
			} else {
				if (upper->checked)
					need_check = true;
				INIT_LIST_HEAD(&edge->list[UPPER]);
			}
		} else {
			upper = rb_entry(rb_node, struct btrfs_backref_node,
					 rb_node);
			ASSERT(upper->checked);
			INIT_LIST_HEAD(&edge->list[UPPER]);
			if (!upper->owner)
				upper->owner = btrfs_header_owner(eb);
		}
		btrfs_backref_link_edge(edge, lower, upper, LINK_LOWER);

		if (rb_node) {
			btrfs_put_root(root);
			break;
		}
		lower = upper;
	}
out:
	btrfs_release_path(path);
	return ret;
}
/*
 * Add backref node @cur into @cache.
 *
 * NOTE: Even if the function returned 0, @cur is not yet cached as its upper
 *	 links aren't yet bi-directional. Needs to finish such links.
 *	 Use btrfs_backref_finish_upper_links() to finish such linkage.
 *
 * @trans:	Transaction handle.
 * @path:	Released path for indirect tree backref lookup
 * @iter:	Released backref iter for extent tree search
 * @node_key:	The first key of the tree block
 */
int btrfs_backref_add_tree_node(struct btrfs_trans_handle *trans,
				struct btrfs_backref_cache *cache,
				struct btrfs_path *path,
				struct btrfs_backref_iter *iter,
				struct btrfs_key *node_key,
				struct btrfs_backref_node *cur)
{
	struct btrfs_backref_edge *edge;
	struct btrfs_backref_node *exist;
	int ret;

	ret = btrfs_backref_iter_start(iter, cur->bytenr);
	if (ret < 0)
		return ret;
	/*
	 * We skip the first btrfs_tree_block_info, as we don't use the key
	 * stored in it, but fetch it from the tree block
	 */
	if (btrfs_backref_has_tree_block_info(iter)) {
		ret = btrfs_backref_iter_next(iter);
		if (ret < 0)
			goto out;
		/* No extra backref? This means the tree block is corrupted */
		if (ret > 0) {
			ret = -EUCLEAN;
			goto out;
		}
	}
	WARN_ON(cur->checked);
	if (!list_empty(&cur->upper)) {
		/*
		 * The backref was added previously when processing backref of
		 * type BTRFS_TREE_BLOCK_REF_KEY
		 */
		ASSERT(list_is_singular(&cur->upper));
		edge = list_entry(cur->upper.next, struct btrfs_backref_edge,
				  list[LOWER]);
		ASSERT(list_empty(&edge->list[UPPER]));
		exist = edge->node[UPPER];
		/*
		 * Add the upper level block to pending list if we need check
		 * its backrefs
		 */
		if (!exist->checked)
			list_add_tail(&edge->list[UPPER], &cache->pending_edge);
	} else {
		exist = NULL;
	}

	for (; ret == 0; ret = btrfs_backref_iter_next(iter)) {
		struct extent_buffer *eb;
		struct btrfs_key key;
		int type;

		cond_resched();
		eb = iter->path->nodes[0];

		key.objectid = iter->bytenr;
		if (btrfs_backref_iter_is_inline_ref(iter)) {
			struct btrfs_extent_inline_ref *iref;

			/* Update key for inline backref */
			iref = (struct btrfs_extent_inline_ref *)
				((unsigned long)iter->cur_ptr);
			type = btrfs_get_extent_inline_ref_type(eb, iref,
							BTRFS_REF_TYPE_BLOCK);
			if (type == BTRFS_REF_TYPE_INVALID) {
				ret = -EUCLEAN;
				goto out;
			}
			key.type = type;
			key.offset = btrfs_extent_inline_ref_offset(eb, iref);
		} else {
			key.type = iter->cur_key.type;
			key.offset = iter->cur_key.offset;
		}

		/*
		 * Parent node found and matches current inline ref, no need to
		 * rebuild this node for this inline ref
		 */
		if (exist &&
		    ((key.type == BTRFS_TREE_BLOCK_REF_KEY &&
		      exist->owner == key.offset) ||
		     (key.type == BTRFS_SHARED_BLOCK_REF_KEY &&
		      exist->bytenr == key.offset))) {
			exist = NULL;
			continue;
		}

		/* SHARED_BLOCK_REF means key.offset is the parent bytenr */
		if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
			ret = handle_direct_tree_backref(cache, &key, cur);
			if (ret < 0)
				goto out;
		} else if (key.type == BTRFS_TREE_BLOCK_REF_KEY) {
			/*
			 * key.type == BTRFS_TREE_BLOCK_REF_KEY, inline ref
			 * offset means the root objectid. We need to search
			 * the tree to get its parent bytenr.
			 */
			ret = handle_indirect_tree_backref(trans, cache, path,
							   &key, node_key, cur);
			if (ret < 0)
				goto out;
		}
		/*
		 * Unrecognized tree backref items (if it can pass tree-checker)
		 * are unexpected here, just skip them.
		 */
	}
	ret = 0;
	cur->checked = 1;
	WARN_ON(exist);
out:
	btrfs_backref_iter_release(iter);
	return ret;
}
/*
 * Finish the upwards linkage created by btrfs_backref_add_tree_node()
 */
int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache,
				     struct btrfs_backref_node *start)
{
	struct list_head *useless_node = &cache->useless_node;
	struct btrfs_backref_edge *edge;
	struct rb_node *rb_node;
	LIST_HEAD(pending_edge);

	ASSERT(start->checked);

	/* Insert this node to cache if it's not COW-only */
	if (!start->cowonly) {
		rb_node = rb_simple_insert(&cache->rb_root, start->bytenr,
					   &start->rb_node);
		if (rb_node)
			btrfs_backref_panic(cache->fs_info, start->bytenr,
					    -EEXIST);
		list_add_tail(&start->lower, &cache->leaves);
	}

	/*
	 * Use breadth first search to iterate all related edges.
	 *
	 * The starting points are all the edges of this node
	 */
	list_for_each_entry(edge, &start->upper, list[LOWER])
		list_add_tail(&edge->list[UPPER], &pending_edge);

	while (!list_empty(&pending_edge)) {
		struct btrfs_backref_node *upper;
		struct btrfs_backref_node *lower;

		edge = list_first_entry(&pending_edge,
					struct btrfs_backref_edge, list[UPPER]);
		list_del_init(&edge->list[UPPER]);
		upper = edge->node[UPPER];
		lower = edge->node[LOWER];

		/* Parent is detached, no need to keep any edges */
		if (upper->detached) {
			list_del(&edge->list[LOWER]);
			btrfs_backref_free_edge(cache, edge);

			/* Lower node is orphan, queue for cleanup */
			if (list_empty(&lower->upper))
				list_add(&lower->list, useless_node);
			continue;
		}

		/*
		 * All new nodes added in current build_backref_tree() haven't
		 * been linked to the cache rb tree.
		 * So if we have upper->rb_node populated, this means a cache
		 * hit. We only need to link the edge, as @upper and all its
		 * parents have already been linked.
		 */
		if (!RB_EMPTY_NODE(&upper->rb_node)) {
			if (upper->lowest) {
				list_del_init(&upper->lower);
				upper->lowest = 0;
			}

			list_add_tail(&edge->list[UPPER], &upper->lower);
			continue;
		}

		/* Sanity check, we shouldn't have any unchecked nodes */
		if (!upper->checked) {
			ASSERT(0);
			return -EUCLEAN;
		}

		/* Sanity check, COW-only node has non-COW-only parent */
		if (start->cowonly != upper->cowonly) {
			ASSERT(0);
			return -EUCLEAN;
		}

		/* Only cache non-COW-only (subvolume trees) tree blocks */
		if (!upper->cowonly) {
			rb_node = rb_simple_insert(&cache->rb_root, upper->bytenr,
						   &upper->rb_node);
			if (rb_node) {
				btrfs_backref_panic(cache->fs_info,
						    upper->bytenr, -EEXIST);
				return -EUCLEAN;
			}
		}

		list_add_tail(&edge->list[UPPER], &upper->lower);

		/*
		 * Also queue all the parent edges of this uncached node
		 * to finish the upper linkage
		 */
		list_for_each_entry(edge, &upper->upper, list[LOWER])
			list_add_tail(&edge->list[UPPER], &pending_edge);
	}
	return 0;
}
void btrfs_backref_error_cleanup(struct btrfs_backref_cache *cache,
				 struct btrfs_backref_node *node)
{
	struct btrfs_backref_node *lower;
	struct btrfs_backref_node *upper;
	struct btrfs_backref_edge *edge;

	while (!list_empty(&cache->useless_node)) {
		lower = list_first_entry(&cache->useless_node,
					 struct btrfs_backref_node, list);
		list_del_init(&lower->list);
	}
	while (!list_empty(&cache->pending_edge)) {
		edge = list_first_entry(&cache->pending_edge,
					struct btrfs_backref_edge, list[UPPER]);
		list_del(&edge->list[UPPER]);
		list_del(&edge->list[LOWER]);
		lower = edge->node[LOWER];
		upper = edge->node[UPPER];
		btrfs_backref_free_edge(cache, edge);

		/*
		 * Lower is no longer linked to any upper backref nodes and
		 * isn't in the cache, we can free it ourselves.
		 */
		if (list_empty(&lower->upper) &&
		    RB_EMPTY_NODE(&lower->rb_node))
			list_add(&lower->list, &cache->useless_node);

		if (!RB_EMPTY_NODE(&upper->rb_node))
			continue;

		/* Add this guy's upper edges to the list to process */
		list_for_each_entry(edge, &upper->upper, list[LOWER])
			list_add_tail(&edge->list[UPPER],
				      &cache->pending_edge);
		if (list_empty(&upper->upper))
			list_add(&upper->list, &cache->useless_node);
	}

	while (!list_empty(&cache->useless_node)) {
		lower = list_first_entry(&cache->useless_node,
					 struct btrfs_backref_node, list);
		list_del_init(&lower->list);
		if (lower == node)
			node = NULL;
		btrfs_backref_drop_node(cache, lower);
	}

	btrfs_backref_cleanup_node(cache, node);
	ASSERT(list_empty(&cache->useless_node) &&
	       list_empty(&cache->pending_edge));
}