// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2011 STRATO.  All rights reserved.
 */

#include <linux/mm.h>
#include <linux/rbtree.h>
#include <trace/events/btrfs.h>
#include "ctree.h"
#include "disk-io.h"
#include "backref.h"
#include "ulist.h"
#include "transaction.h"
#include "delayed-ref.h"
#include "locking.h"
#include "misc.h"
#include "tree-mod-log.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "relocation.h"
#include "tree-checker.h"
/* Just arbitrary numbers so we can be sure one of these happened. */
#define BACKREF_FOUND_SHARED     6
#define BACKREF_FOUND_NOT_SHARED 7
struct extent_inode_elem {
	u64 inum;
	u64 offset;
	u64 num_bytes;
	struct extent_inode_elem *next;
};
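/*
 * Check whether the file extent item @fi in leaf @eb refers to the extent
 * item position we are resolving and, if it does, record the inode number,
 * file offset and length in a new element added to the @eie list.
 */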
static int check_extent_in_eb(struct btrfs_backref_walk_ctx *ctx,
			      const struct btrfs_key *key,
			      const struct extent_buffer *eb,
			      const struct btrfs_file_extent_item *fi,
			      struct extent_inode_elem **eie)
{
	const u64 data_len = btrfs_file_extent_num_bytes(eb, fi);
	u64 offset = key->offset;
	struct extent_inode_elem *e;
	const u64 *root_ids;
	int root_count;
	bool cached;

	if (!ctx->ignore_extent_item_pos &&
	    !btrfs_file_extent_compression(eb, fi) &&
	    !btrfs_file_extent_encryption(eb, fi) &&
	    !btrfs_file_extent_other_encoding(eb, fi)) {
		u64 data_offset;

		data_offset = btrfs_file_extent_offset(eb, fi);

		if (ctx->extent_item_pos < data_offset ||
		    ctx->extent_item_pos >= data_offset + data_len)
			return 1;
		offset += ctx->extent_item_pos - data_offset;
	}

	if (!ctx->indirect_ref_iterator || !ctx->cache_lookup)
		goto add_inode_elem;

	cached = ctx->cache_lookup(eb->start, ctx->user_ctx, &root_ids,
				   &root_count);
	if (!cached)
		goto add_inode_elem;

	for (int i = 0; i < root_count; i++) {
		int ret;

		ret = ctx->indirect_ref_iterator(key->objectid, offset,
						 data_len, root_ids[i],
						 ctx->user_ctx);
		if (ret)
			return ret;
	}

add_inode_elem:
	e = kmalloc(sizeof(*e), GFP_NOFS);
	if (!e)
		return -ENOMEM;

	e->next = *eie;
	e->inum = key->objectid;
	e->offset = offset;
	e->num_bytes = data_len;
	*eie = e;

	return 0;
}
static void free_inode_elem_list(struct extent_inode_elem *eie)
{
	struct extent_inode_elem *eie_next;

	for (; eie; eie = eie_next) {
		eie_next = eie->next;
		kfree(eie);
	}
}
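/*
 * Scan all items in the leaf @eb and collect, via check_extent_in_eb(), the
 * inode elements of every file extent item that references ctx->bytenr.
 */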
static int find_extent_in_eb(struct btrfs_backref_walk_ctx *ctx,
			     const struct extent_buffer *eb,
			     struct extent_inode_elem **eie)
{
	u64 disk_byte;
	struct btrfs_key key;
	struct btrfs_file_extent_item *fi;
	int slot;
	int nritems;
	int extent_type;
	int ret;

	/*
	 * From the shared data ref, we only have the leaf but we need
	 * the key. Thus, we must look into all items and see that we
	 * find one (some) with a reference to our extent item.
	 */
	nritems = btrfs_header_nritems(eb);
	for (slot = 0; slot < nritems; ++slot) {
		btrfs_item_key_to_cpu(eb, &key, slot);
		if (key.type != BTRFS_EXTENT_DATA_KEY)
			continue;
		fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
		extent_type = btrfs_file_extent_type(eb, fi);
		if (extent_type == BTRFS_FILE_EXTENT_INLINE)
			continue;
		/* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */
		disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
		if (disk_byte != ctx->bytenr)
			continue;

		ret = check_extent_in_eb(ctx, &key, eb, fi, eie);
		if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || ret < 0)
			return ret;
	}

	return 0;
}
struct preftree {
	struct rb_root_cached root;
	unsigned int count;
};

#define PREFTREE_INIT	{ .root = RB_ROOT_CACHED, .count = 0 }

struct preftrees {
	struct preftree direct;    /* BTRFS_SHARED_[DATA|BLOCK]_REF_KEY */
	struct preftree indirect;  /* BTRFS_[TREE_BLOCK|EXTENT_DATA]_REF_KEY */
	struct preftree indirect_missing_keys;
};
/*
 * Checks for a shared extent during backref search.
 *
 * The share_count tracks prelim_refs (direct and indirect) having a
 * ref->count > 0:
 *  - incremented when a ref->count transitions to > 0
 *  - decremented when a ref->count transitions to < 1
 */
struct share_check {
	struct btrfs_backref_share_check_ctx *ctx;
	struct btrfs_root *root;
	u64 inum;
	u64 data_bytenr;
	u64 data_extent_gen;
	/*
	 * Counts number of inodes that refer to an extent (different inodes in
	 * the same root or different roots) that we could find. The sharedness
	 * check typically stops once this counter gets greater than 1, so it
	 * may not reflect the total number of inodes.
	 */
	int share_count;
	/*
	 * The number of times we found our inode refers to the data extent we
	 * are determining the sharedness of. In other words, how many file
	 * extent items we could find for our inode that point to our target
	 * data extent. The value we get here after finishing the extent
	 * sharedness check may be smaller than reality, but if it ends up
	 * being greater than 1, then we know for sure the inode has multiple
	 * file extent items that point to our target data extent, and we can
	 * safely assume it's useful to cache the sharedness check result.
	 */
	int self_ref_count;
	bool have_delayed_delete_refs;
};
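/*
 * Return BACKREF_FOUND_SHARED once more than one counted reference to the
 * extent has been seen during a sharedness check, 0 otherwise (including when
 * no sharedness check is being done).
 */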
static inline int extent_is_shared(struct share_check *sc)
{
	return (sc && sc->share_count > 1) ? BACKREF_FOUND_SHARED : 0;
}

static struct kmem_cache *btrfs_prelim_ref_cache;
int __init btrfs_prelim_ref_init(void)
{
	btrfs_prelim_ref_cache = kmem_cache_create("btrfs_prelim_ref",
					sizeof(struct prelim_ref), 0, 0, NULL);
	if (!btrfs_prelim_ref_cache)
		return -ENOMEM;

	return 0;
}

void __cold btrfs_prelim_ref_exit(void)
{
	kmem_cache_destroy(btrfs_prelim_ref_cache);
}

static void free_pref(struct prelim_ref *ref)
{
	kmem_cache_free(btrfs_prelim_ref_cache, ref);
}
/*
 * Return 0 when both refs are for the same block (and can be merged).
 * A -1 return indicates ref1 is a 'lower' block than ref2, while 1
 * indicates a 'higher' block.
 */
static int prelim_ref_compare(const struct prelim_ref *ref1,
			      const struct prelim_ref *ref2)
{
	if (ref1->level < ref2->level)
		return -1;
	if (ref1->level > ref2->level)
		return 1;
	if (ref1->root_id < ref2->root_id)
		return -1;
	if (ref1->root_id > ref2->root_id)
		return 1;
	if (ref1->key_for_search.type < ref2->key_for_search.type)
		return -1;
	if (ref1->key_for_search.type > ref2->key_for_search.type)
		return 1;
	if (ref1->key_for_search.objectid < ref2->key_for_search.objectid)
		return -1;
	if (ref1->key_for_search.objectid > ref2->key_for_search.objectid)
		return 1;
	if (ref1->key_for_search.offset < ref2->key_for_search.offset)
		return -1;
	if (ref1->key_for_search.offset > ref2->key_for_search.offset)
		return 1;
	if (ref1->parent < ref2->parent)
		return -1;
	if (ref1->parent > ref2->parent)
		return 1;

	return 0;
}
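/*
 * Update the sharedness counters for a newly inserted or merged ref: bump
 * share_count when a ref's count becomes positive, drop it when the count
 * falls below 1 again, and accumulate self_ref_count for refs that come from
 * the inode whose extent is being checked.
 */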
static void update_share_count(struct share_check *sc, int oldcount,
			       int newcount, const struct prelim_ref *newref)
{
	if ((!sc) || (oldcount == 0 && newcount < 1))
		return;

	if (oldcount > 0 && newcount < 1)
		sc->share_count--;
	else if (oldcount < 1 && newcount > 0)
		sc->share_count++;

	if (newref->root_id == btrfs_root_id(sc->root) &&
	    newref->wanted_disk_byte == sc->data_bytenr &&
	    newref->key_for_search.objectid == sc->inum)
		sc->self_ref_count += newref->count;
}
/*
 * Add @newref to the @root rbtree, merging identical refs.
 *
 * Callers should assume that newref has been freed after calling.
 */
275 static void prelim_ref_insert(const struct btrfs_fs_info
*fs_info
,
276 struct preftree
*preftree
,
277 struct prelim_ref
*newref
,
278 struct share_check
*sc
)
280 struct rb_root_cached
*root
;
282 struct rb_node
*parent
= NULL
;
283 struct prelim_ref
*ref
;
285 bool leftmost
= true;
287 root
= &preftree
->root
;
288 p
= &root
->rb_root
.rb_node
;
292 ref
= rb_entry(parent
, struct prelim_ref
, rbnode
);
293 result
= prelim_ref_compare(ref
, newref
);
296 } else if (result
> 0) {
300 /* Identical refs, merge them and free @newref */
301 struct extent_inode_elem
*eie
= ref
->inode_list
;
303 while (eie
&& eie
->next
)
307 ref
->inode_list
= newref
->inode_list
;
309 eie
->next
= newref
->inode_list
;
310 trace_btrfs_prelim_ref_merge(fs_info
, ref
, newref
,
			/*
			 * A delayed ref can have newref->count < 0.
			 * The ref->count is updated to follow any
			 * BTRFS_[ADD|DROP]_DELAYED_REF actions.
			 */
317 update_share_count(sc
, ref
->count
,
318 ref
->count
+ newref
->count
, newref
);
319 ref
->count
+= newref
->count
;
325 update_share_count(sc
, 0, newref
->count
, newref
);
327 trace_btrfs_prelim_ref_insert(fs_info
, newref
, NULL
, preftree
->count
);
328 rb_link_node(&newref
->rbnode
, parent
, p
);
329 rb_insert_color_cached(&newref
->rbnode
, root
, leftmost
);
/*
 * Release the entire tree. We don't care about internal consistency so
 * just free everything and then reset the tree root.
 */
336 static void prelim_release(struct preftree
*preftree
)
338 struct prelim_ref
*ref
, *next_ref
;
340 rbtree_postorder_for_each_entry_safe(ref
, next_ref
,
341 &preftree
->root
.rb_root
, rbnode
) {
342 free_inode_elem_list(ref
->inode_list
);
346 preftree
->root
= RB_ROOT_CACHED
;
/*
 * The rules for all callers of this function are:
 * - obtaining the parent is the goal
 * - if you add a key, you must know that it is a correct key
 * - if you cannot add the parent or a correct key, then we will look into the
 *   block later to set a correct key
 *
 * delayed refs
 * ============
 *        backref type | shared | indirect | shared | indirect
 * information         |   tree |     tree |   data |     data
 * --------------------+--------+----------+--------+----------
 *      parent logical |    y   |     -    |    -   |     -
 *      key to resolve |    -   |     y    |    y   |     y
 *  tree block logical |    -   |     -    |    -   |     -
 *  root for resolving |    y   |     y    |    y   |     y
 *
 * - column 1:       we have the parent -> done
 * - column 2, 3, 4: we use the key to find the parent
 *
 * on disk refs (inline or keyed)
 * ==============================
 *        backref type | shared | indirect | shared | indirect
 * information         |   tree |     tree |   data |     data
 * --------------------+--------+----------+--------+----------
 *      parent logical |    y   |     -    |    y   |     -
 *      key to resolve |    -   |     -    |    -   |     y
 *  tree block logical |    y   |     y    |    y   |     y
 *  root for resolving |    -   |     y    |    y   |     y
 *
 * - column 1, 3: we have the parent -> done
 * - column 2:    we take the first key from the block to find the parent
 *                (see add_missing_keys)
 * - column 4:    we use the key to find the parent
 *
 * Additional information that's available but not required to find the parent
 * block might help in merging entries to gain some speed.
 */
388 static int add_prelim_ref(const struct btrfs_fs_info
*fs_info
,
389 struct preftree
*preftree
, u64 root_id
,
390 const struct btrfs_key
*key
, int level
, u64 parent
,
391 u64 wanted_disk_byte
, int count
,
392 struct share_check
*sc
, gfp_t gfp_mask
)
394 struct prelim_ref
*ref
;
396 if (root_id
== BTRFS_DATA_RELOC_TREE_OBJECTID
)
399 ref
= kmem_cache_alloc(btrfs_prelim_ref_cache
, gfp_mask
);
403 ref
->root_id
= root_id
;
405 ref
->key_for_search
= *key
;
407 memset(&ref
->key_for_search
, 0, sizeof(ref
->key_for_search
));
409 ref
->inode_list
= NULL
;
412 ref
->parent
= parent
;
413 ref
->wanted_disk_byte
= wanted_disk_byte
;
414 prelim_ref_insert(fs_info
, preftree
, ref
, sc
);
415 return extent_is_shared(sc
);
418 /* direct refs use root == 0, key == NULL */
419 static int add_direct_ref(const struct btrfs_fs_info
*fs_info
,
420 struct preftrees
*preftrees
, int level
, u64 parent
,
421 u64 wanted_disk_byte
, int count
,
422 struct share_check
*sc
, gfp_t gfp_mask
)
424 return add_prelim_ref(fs_info
, &preftrees
->direct
, 0, NULL
, level
,
425 parent
, wanted_disk_byte
, count
, sc
, gfp_mask
);
428 /* indirect refs use parent == 0 */
429 static int add_indirect_ref(const struct btrfs_fs_info
*fs_info
,
430 struct preftrees
*preftrees
, u64 root_id
,
431 const struct btrfs_key
*key
, int level
,
432 u64 wanted_disk_byte
, int count
,
433 struct share_check
*sc
, gfp_t gfp_mask
)
435 struct preftree
*tree
= &preftrees
->indirect
;
438 tree
= &preftrees
->indirect_missing_keys
;
439 return add_prelim_ref(fs_info
, tree
, root_id
, key
, level
, 0,
440 wanted_disk_byte
, count
, sc
, gfp_mask
);
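/*
 * Return 1 if the direct preftree already contains a shared data backref
 * whose parent block is @bytenr, 0 otherwise.
 */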
443 static int is_shared_data_backref(struct preftrees
*preftrees
, u64 bytenr
)
445 struct rb_node
**p
= &preftrees
->direct
.root
.rb_root
.rb_node
;
446 struct rb_node
*parent
= NULL
;
447 struct prelim_ref
*ref
= NULL
;
448 struct prelim_ref target
= {};
451 target
.parent
= bytenr
;
455 ref
= rb_entry(parent
, struct prelim_ref
, rbnode
);
456 result
= prelim_ref_compare(ref
, &target
);
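/*
 * For a resolved indirect ref, walk the file extent items of the leaf the
 * path points at (moving to following items and leaves as needed) and add
 * every leaf that references ref->wanted_disk_byte to the @parents ulist,
 * together with the matching inode elements.
 */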
468 static int add_all_parents(struct btrfs_backref_walk_ctx
*ctx
,
469 struct btrfs_root
*root
, struct btrfs_path
*path
,
470 struct ulist
*parents
,
471 struct preftrees
*preftrees
, struct prelim_ref
*ref
,
476 struct extent_buffer
*eb
;
477 struct btrfs_key key
;
478 struct btrfs_key
*key_for_search
= &ref
->key_for_search
;
479 struct btrfs_file_extent_item
*fi
;
480 struct extent_inode_elem
*eie
= NULL
, *old
= NULL
;
482 u64 wanted_disk_byte
= ref
->wanted_disk_byte
;
488 eb
= path
->nodes
[level
];
489 ret
= ulist_add(parents
, eb
->start
, 0, GFP_NOFS
);
	/*
	 * 1. We normally enter this function with the path already pointing to
	 *    the first item to check. But sometimes, we may enter it with
	 *    slot == nritems.
	 * 2. We are searching for normal backref but bytenr of this leaf
	 *    matches shared data backref
	 * 3. The leaf owner is not equal to the root we are searching
	 *
	 * For these cases, go to the next leaf before we continue.
	 */
506 if (path
->slots
[0] >= btrfs_header_nritems(eb
) ||
507 is_shared_data_backref(preftrees
, eb
->start
) ||
508 ref
->root_id
!= btrfs_header_owner(eb
)) {
509 if (ctx
->time_seq
== BTRFS_SEQ_LAST
)
510 ret
= btrfs_next_leaf(root
, path
);
512 ret
= btrfs_next_old_leaf(root
, path
, ctx
->time_seq
);
515 while (!ret
&& count
< ref
->count
) {
517 slot
= path
->slots
[0];
519 btrfs_item_key_to_cpu(eb
, &key
, slot
);
521 if (key
.objectid
!= key_for_search
->objectid
||
522 key
.type
!= BTRFS_EXTENT_DATA_KEY
)
		/*
		 * We are searching for normal backref but bytenr of this leaf
		 * matches shared data backref, OR
		 * the leaf owner is not equal to the root we are searching for
		 */
531 (is_shared_data_backref(preftrees
, eb
->start
) ||
532 ref
->root_id
!= btrfs_header_owner(eb
))) {
533 if (ctx
->time_seq
== BTRFS_SEQ_LAST
)
534 ret
= btrfs_next_leaf(root
, path
);
536 ret
= btrfs_next_old_leaf(root
, path
, ctx
->time_seq
);
539 fi
= btrfs_item_ptr(eb
, slot
, struct btrfs_file_extent_item
);
540 type
= btrfs_file_extent_type(eb
, fi
);
541 if (type
== BTRFS_FILE_EXTENT_INLINE
)
543 disk_byte
= btrfs_file_extent_disk_bytenr(eb
, fi
);
544 data_offset
= btrfs_file_extent_offset(eb
, fi
);
546 if (disk_byte
== wanted_disk_byte
) {
549 if (ref
->key_for_search
.offset
== key
.offset
- data_offset
)
553 if (!ctx
->skip_inode_ref_list
) {
554 ret
= check_extent_in_eb(ctx
, &key
, eb
, fi
, &eie
);
555 if (ret
== BTRFS_ITERATE_EXTENT_INODES_STOP
||
561 ret
= ulist_add_merge_ptr(parents
, eb
->start
,
562 eie
, (void **)&old
, GFP_NOFS
);
565 if (!ret
&& !ctx
->skip_inode_ref_list
) {
573 if (ctx
->time_seq
== BTRFS_SEQ_LAST
)
574 ret
= btrfs_next_item(root
, path
);
576 ret
= btrfs_next_old_item(root
, path
, ctx
->time_seq
);
579 if (ret
== BTRFS_ITERATE_EXTENT_INODES_STOP
|| ret
< 0)
580 free_inode_elem_list(eie
);
/*
 * Resolve an indirect backref in the form (root_id, key, level)
 * to a logical address.
 */
591 static int resolve_indirect_ref(struct btrfs_backref_walk_ctx
*ctx
,
592 struct btrfs_path
*path
,
593 struct preftrees
*preftrees
,
594 struct prelim_ref
*ref
, struct ulist
*parents
)
596 struct btrfs_root
*root
;
597 struct extent_buffer
*eb
;
600 int level
= ref
->level
;
601 struct btrfs_key search_key
= ref
->key_for_search
;
	/*
	 * If we're search_commit_root we could possibly be holding locks on
	 * other tree nodes. This happens when qgroups does backref walks when
	 * adding new delayed refs. To deal with this we need to look in cache
	 * for the root, and if we don't find it then we need to search the
	 * tree_root's commit root, thus the btrfs_get_fs_root_commit_root usage
	 * here.
	 */
611 if (path
->search_commit_root
)
612 root
= btrfs_get_fs_root_commit_root(ctx
->fs_info
, path
, ref
->root_id
);
614 root
= btrfs_get_fs_root(ctx
->fs_info
, ref
->root_id
, false);
620 if (!path
->search_commit_root
&&
621 test_bit(BTRFS_ROOT_DELETING
, &root
->state
)) {
626 if (btrfs_is_testing(ctx
->fs_info
)) {
631 if (path
->search_commit_root
)
632 root_level
= btrfs_header_level(root
->commit_root
);
633 else if (ctx
->time_seq
== BTRFS_SEQ_LAST
)
634 root_level
= btrfs_header_level(root
->node
);
636 root_level
= btrfs_old_root_level(root
, ctx
->time_seq
);
638 if (root_level
+ 1 == level
)
	/*
	 * We can often find data backrefs with an offset that is too large
	 * (>= LLONG_MAX, maximum allowed file offset) due to underflows when
	 * subtracting a file's offset with the data offset of its
	 * corresponding extent data item. This can happen for example in the
	 * clone ioctl.
	 *
	 * So if we detect such case we set the search key's offset to zero to
	 * make sure we will find the matching file extent item at
	 * add_all_parents(), otherwise we will miss it because the offset
	 * taken from the backref is much larger than the offset of the file
	 * extent item. This can make us scan a very large number of file
	 * extent items, but at least it will not make us miss any.
	 *
	 * This is an ugly workaround for a behaviour that should have never
	 * existed, but it does and a fix for the clone ioctl would touch a lot
	 * of places, cause backwards incompatibility and would not fix the
	 * problem for extents cloned with older kernels.
	 */
660 if (search_key
.type
== BTRFS_EXTENT_DATA_KEY
&&
661 search_key
.offset
>= LLONG_MAX
)
662 search_key
.offset
= 0;
663 path
->lowest_level
= level
;
664 if (ctx
->time_seq
== BTRFS_SEQ_LAST
)
665 ret
= btrfs_search_slot(NULL
, root
, &search_key
, path
, 0, 0);
667 ret
= btrfs_search_old_slot(root
, &search_key
, path
, ctx
->time_seq
);
669 btrfs_debug(ctx
->fs_info
,
670 "search slot in root %llu (level %d, ref count %d) returned %d for key (%llu %u %llu)",
671 ref
->root_id
, level
, ref
->count
, ret
,
672 ref
->key_for_search
.objectid
, ref
->key_for_search
.type
,
673 ref
->key_for_search
.offset
);
677 eb
= path
->nodes
[level
];
679 if (WARN_ON(!level
)) {
684 eb
= path
->nodes
[level
];
687 ret
= add_all_parents(ctx
, root
, path
, parents
, preftrees
, ref
, level
);
689 btrfs_put_root(root
);
691 path
->lowest_level
= 0;
692 btrfs_release_path(path
);
696 static struct extent_inode_elem
*
697 unode_aux_to_inode_list(struct ulist_node
*node
)
701 return (struct extent_inode_elem
*)(uintptr_t)node
->aux
;
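/*
 * Free the inode element lists attached (via the aux field) to the nodes of
 * the ulist and then free the ulist itself.
 */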
704 static void free_leaf_list(struct ulist
*ulist
)
706 struct ulist_node
*node
;
707 struct ulist_iterator uiter
;
709 ULIST_ITER_INIT(&uiter
);
710 while ((node
= ulist_next(ulist
, &uiter
)))
711 free_inode_elem_list(unode_aux_to_inode_list(node
));
/*
 * We maintain three separate rbtrees: one for direct refs, one for
 * indirect refs which have a key, and one for indirect refs which do not
 * have a key. Each tree does merge on insertion.
 *
 * Once all of the references are located, we iterate over the tree of
 * indirect refs with missing keys. An appropriate key is located and
 * the ref is moved onto the tree for indirect refs. After all missing
 * keys are thus located, we iterate over the indirect ref tree, resolve
 * each reference, and then insert the resolved reference onto the
 * direct tree (merging there too).
 *
 * New backrefs (i.e., for parent nodes) are added to the appropriate
 * rbtree as they are encountered. The new backrefs are subsequently
 * resolved as above.
 */
732 static int resolve_indirect_refs(struct btrfs_backref_walk_ctx
*ctx
,
733 struct btrfs_path
*path
,
734 struct preftrees
*preftrees
,
735 struct share_check
*sc
)
739 struct ulist
*parents
;
740 struct ulist_node
*node
;
741 struct ulist_iterator uiter
;
742 struct rb_node
*rnode
;
744 parents
= ulist_alloc(GFP_NOFS
);
749 * We could trade memory usage for performance here by iterating
750 * the tree, allocating new refs for each insertion, and then
751 * freeing the entire indirect tree when we're done. In some test
752 * cases, the tree can grow quite large (~200k objects).
754 while ((rnode
= rb_first_cached(&preftrees
->indirect
.root
))) {
755 struct prelim_ref
*ref
;
757 ref
= rb_entry(rnode
, struct prelim_ref
, rbnode
);
758 if (WARN(ref
->parent
,
759 "BUG: direct ref found in indirect tree")) {
764 rb_erase_cached(&ref
->rbnode
, &preftrees
->indirect
.root
);
765 preftrees
->indirect
.count
--;
767 if (ref
->count
== 0) {
772 if (sc
&& ref
->root_id
!= btrfs_root_id(sc
->root
)) {
774 ret
= BACKREF_FOUND_SHARED
;
777 err
= resolve_indirect_ref(ctx
, path
, preftrees
, ref
, parents
);
		/*
		 * We can only tolerate ENOENT; otherwise, we should catch the
		 * error and return directly.
		 */
782 if (err
== -ENOENT
) {
783 prelim_ref_insert(ctx
->fs_info
, &preftrees
->direct
, ref
,
792 /* we put the first parent into the ref at hand */
793 ULIST_ITER_INIT(&uiter
);
794 node
= ulist_next(parents
, &uiter
);
795 ref
->parent
= node
? node
->val
: 0;
796 ref
->inode_list
= unode_aux_to_inode_list(node
);
798 /* Add a prelim_ref(s) for any other parent(s). */
799 while ((node
= ulist_next(parents
, &uiter
))) {
800 struct prelim_ref
*new_ref
;
802 new_ref
= kmem_cache_alloc(btrfs_prelim_ref_cache
,
809 memcpy(new_ref
, ref
, sizeof(*ref
));
810 new_ref
->parent
= node
->val
;
811 new_ref
->inode_list
= unode_aux_to_inode_list(node
);
812 prelim_ref_insert(ctx
->fs_info
, &preftrees
->direct
,
817 * Now it's a direct ref, put it in the direct tree. We must
818 * do this last because the ref could be merged/freed here.
820 prelim_ref_insert(ctx
->fs_info
, &preftrees
->direct
, ref
, NULL
);
822 ulist_reinit(parents
);
827 * We may have inode lists attached to refs in the parents ulist, so we
828 * must free them before freeing the ulist and its refs.
830 free_leaf_list(parents
);
/*
 * Read tree blocks and add keys where required.
 */
837 static int add_missing_keys(struct btrfs_fs_info
*fs_info
,
838 struct preftrees
*preftrees
, bool lock
)
840 struct prelim_ref
*ref
;
841 struct extent_buffer
*eb
;
842 struct preftree
*tree
= &preftrees
->indirect_missing_keys
;
843 struct rb_node
*node
;
845 while ((node
= rb_first_cached(&tree
->root
))) {
846 struct btrfs_tree_parent_check check
= { 0 };
848 ref
= rb_entry(node
, struct prelim_ref
, rbnode
);
849 rb_erase_cached(node
, &tree
->root
);
851 BUG_ON(ref
->parent
); /* should not be a direct ref */
852 BUG_ON(ref
->key_for_search
.type
);
853 BUG_ON(!ref
->wanted_disk_byte
);
855 check
.level
= ref
->level
- 1;
856 check
.owner_root
= ref
->root_id
;
858 eb
= read_tree_block(fs_info
, ref
->wanted_disk_byte
, &check
);
863 if (!extent_buffer_uptodate(eb
)) {
865 free_extent_buffer(eb
);
870 btrfs_tree_read_lock(eb
);
871 if (btrfs_header_level(eb
) == 0)
872 btrfs_item_key_to_cpu(eb
, &ref
->key_for_search
, 0);
874 btrfs_node_key_to_cpu(eb
, &ref
->key_for_search
, 0);
876 btrfs_tree_read_unlock(eb
);
877 free_extent_buffer(eb
);
878 prelim_ref_insert(fs_info
, &preftrees
->indirect
, ref
, NULL
);
/*
 * Add all currently queued delayed refs from this head whose seq nr is
 * smaller than or equal to @seq to the preftrees.
 */
888 static int add_delayed_refs(const struct btrfs_fs_info
*fs_info
,
889 struct btrfs_delayed_ref_head
*head
, u64 seq
,
890 struct preftrees
*preftrees
, struct share_check
*sc
)
892 struct btrfs_delayed_ref_node
*node
;
893 struct btrfs_key key
;
898 spin_lock(&head
->lock
);
899 for (n
= rb_first_cached(&head
->ref_tree
); n
; n
= rb_next(n
)) {
900 node
= rb_entry(n
, struct btrfs_delayed_ref_node
,
905 switch (node
->action
) {
906 case BTRFS_ADD_DELAYED_EXTENT
:
907 case BTRFS_UPDATE_DELAYED_HEAD
:
910 case BTRFS_ADD_DELAYED_REF
:
911 count
= node
->ref_mod
;
913 case BTRFS_DROP_DELAYED_REF
:
914 count
= node
->ref_mod
* -1;
919 switch (node
->type
) {
920 case BTRFS_TREE_BLOCK_REF_KEY
: {
921 /* NORMAL INDIRECT METADATA backref */
922 struct btrfs_key
*key_ptr
= NULL
;
923 /* The owner of a tree block ref is the level. */
924 int level
= btrfs_delayed_ref_owner(node
);
926 if (head
->extent_op
&& head
->extent_op
->update_key
) {
927 btrfs_disk_key_to_cpu(&key
, &head
->extent_op
->key
);
931 ret
= add_indirect_ref(fs_info
, preftrees
, node
->ref_root
,
932 key_ptr
, level
+ 1, node
->bytenr
,
933 count
, sc
, GFP_ATOMIC
);
936 case BTRFS_SHARED_BLOCK_REF_KEY
: {
938 * SHARED DIRECT METADATA backref
940 * The owner of a tree block ref is the level.
942 int level
= btrfs_delayed_ref_owner(node
);
944 ret
= add_direct_ref(fs_info
, preftrees
, level
+ 1,
945 node
->parent
, node
->bytenr
, count
,
949 case BTRFS_EXTENT_DATA_REF_KEY
: {
950 /* NORMAL INDIRECT DATA backref */
951 key
.objectid
= btrfs_delayed_ref_owner(node
);
952 key
.type
= BTRFS_EXTENT_DATA_KEY
;
953 key
.offset
= btrfs_delayed_ref_offset(node
);
			/*
			 * If we have a share check context and a reference for
			 * another inode, we can't exit immediately. This is
			 * because even if this is a BTRFS_ADD_DELAYED_REF
			 * reference we may find next a BTRFS_DROP_DELAYED_REF
			 * which cancels out this ADD reference.
			 *
			 * If this is a DROP reference and there was no previous
			 * ADD reference, then we need to signal that when we
			 * process references from the extent tree (through
			 * add_inline_refs() and add_keyed_refs()), we should
			 * not exit early if we find a reference for another
			 * inode, because one of the delayed DROP references
			 * may cancel that reference in the extent tree.
			 */
971 sc
->have_delayed_delete_refs
= true;
973 ret
= add_indirect_ref(fs_info
, preftrees
, node
->ref_root
,
974 &key
, 0, node
->bytenr
, count
, sc
,
978 case BTRFS_SHARED_DATA_REF_KEY
: {
979 /* SHARED DIRECT FULL backref */
980 ret
= add_direct_ref(fs_info
, preftrees
, 0, node
->parent
,
981 node
->bytenr
, count
, sc
,
989 * We must ignore BACKREF_FOUND_SHARED until all delayed
990 * refs have been checked.
992 if (ret
&& (ret
!= BACKREF_FOUND_SHARED
))
996 ret
= extent_is_shared(sc
);
998 spin_unlock(&head
->lock
);
/*
 * Add all inline backrefs for bytenr to the list.
 *
 * Returns 0 on success, < 0 on error, or BACKREF_FOUND_SHARED.
 */
1007 static int add_inline_refs(struct btrfs_backref_walk_ctx
*ctx
,
1008 struct btrfs_path
*path
,
1009 int *info_level
, struct preftrees
*preftrees
,
1010 struct share_check
*sc
)
1014 struct extent_buffer
*leaf
;
1015 struct btrfs_key key
;
1016 struct btrfs_key found_key
;
1019 struct btrfs_extent_item
*ei
;
1024 * enumerate all inline refs
1026 leaf
= path
->nodes
[0];
1027 slot
= path
->slots
[0];
1029 item_size
= btrfs_item_size(leaf
, slot
);
1030 ei
= btrfs_item_ptr(leaf
, slot
, struct btrfs_extent_item
);
1032 if (ctx
->check_extent_item
) {
1033 ret
= ctx
->check_extent_item(ctx
->bytenr
, ei
, leaf
, ctx
->user_ctx
);
1038 flags
= btrfs_extent_flags(leaf
, ei
);
1039 btrfs_item_key_to_cpu(leaf
, &found_key
, slot
);
1041 ptr
= (unsigned long)(ei
+ 1);
1042 end
= (unsigned long)ei
+ item_size
;
1044 if (found_key
.type
== BTRFS_EXTENT_ITEM_KEY
&&
1045 flags
& BTRFS_EXTENT_FLAG_TREE_BLOCK
) {
1046 struct btrfs_tree_block_info
*info
;
1048 info
= (struct btrfs_tree_block_info
*)ptr
;
1049 *info_level
= btrfs_tree_block_level(leaf
, info
);
1050 ptr
+= sizeof(struct btrfs_tree_block_info
);
1052 } else if (found_key
.type
== BTRFS_METADATA_ITEM_KEY
) {
1053 *info_level
= found_key
.offset
;
1055 BUG_ON(!(flags
& BTRFS_EXTENT_FLAG_DATA
));
1059 struct btrfs_extent_inline_ref
*iref
;
1063 iref
= (struct btrfs_extent_inline_ref
*)ptr
;
1064 type
= btrfs_get_extent_inline_ref_type(leaf
, iref
,
1065 BTRFS_REF_TYPE_ANY
);
1066 if (type
== BTRFS_REF_TYPE_INVALID
)
1069 offset
= btrfs_extent_inline_ref_offset(leaf
, iref
);
1072 case BTRFS_SHARED_BLOCK_REF_KEY
:
1073 ret
= add_direct_ref(ctx
->fs_info
, preftrees
,
1074 *info_level
+ 1, offset
,
1075 ctx
->bytenr
, 1, NULL
, GFP_NOFS
);
1077 case BTRFS_SHARED_DATA_REF_KEY
: {
1078 struct btrfs_shared_data_ref
*sdref
;
1081 sdref
= (struct btrfs_shared_data_ref
*)(iref
+ 1);
1082 count
= btrfs_shared_data_ref_count(leaf
, sdref
);
1084 ret
= add_direct_ref(ctx
->fs_info
, preftrees
, 0, offset
,
1085 ctx
->bytenr
, count
, sc
, GFP_NOFS
);
1088 case BTRFS_TREE_BLOCK_REF_KEY
:
1089 ret
= add_indirect_ref(ctx
->fs_info
, preftrees
, offset
,
1090 NULL
, *info_level
+ 1,
1091 ctx
->bytenr
, 1, NULL
, GFP_NOFS
);
1093 case BTRFS_EXTENT_DATA_REF_KEY
: {
1094 struct btrfs_extent_data_ref
*dref
;
1098 dref
= (struct btrfs_extent_data_ref
*)(&iref
->offset
);
1099 count
= btrfs_extent_data_ref_count(leaf
, dref
);
1100 key
.objectid
= btrfs_extent_data_ref_objectid(leaf
,
1102 key
.type
= BTRFS_EXTENT_DATA_KEY
;
1103 key
.offset
= btrfs_extent_data_ref_offset(leaf
, dref
);
1105 if (sc
&& key
.objectid
!= sc
->inum
&&
1106 !sc
->have_delayed_delete_refs
) {
1107 ret
= BACKREF_FOUND_SHARED
;
1111 root
= btrfs_extent_data_ref_root(leaf
, dref
);
1113 if (!ctx
->skip_data_ref
||
1114 !ctx
->skip_data_ref(root
, key
.objectid
, key
.offset
,
1116 ret
= add_indirect_ref(ctx
->fs_info
, preftrees
,
1117 root
, &key
, 0, ctx
->bytenr
,
1118 count
, sc
, GFP_NOFS
);
1121 case BTRFS_EXTENT_OWNER_REF_KEY
:
1122 ASSERT(btrfs_fs_incompat(ctx
->fs_info
, SIMPLE_QUOTA
));
1129 ptr
+= btrfs_extent_inline_ref_size(type
);
/*
 * Add all non-inline backrefs for bytenr to the list.
 *
 * Returns 0 on success, < 0 on error, or BACKREF_FOUND_SHARED.
 */
1140 static int add_keyed_refs(struct btrfs_backref_walk_ctx
*ctx
,
1141 struct btrfs_root
*extent_root
,
1142 struct btrfs_path
*path
,
1143 int info_level
, struct preftrees
*preftrees
,
1144 struct share_check
*sc
)
1146 struct btrfs_fs_info
*fs_info
= extent_root
->fs_info
;
1149 struct extent_buffer
*leaf
;
1150 struct btrfs_key key
;
1153 ret
= btrfs_next_item(extent_root
, path
);
1161 slot
= path
->slots
[0];
1162 leaf
= path
->nodes
[0];
1163 btrfs_item_key_to_cpu(leaf
, &key
, slot
);
1165 if (key
.objectid
!= ctx
->bytenr
)
1167 if (key
.type
< BTRFS_TREE_BLOCK_REF_KEY
)
1169 if (key
.type
> BTRFS_SHARED_DATA_REF_KEY
)
1173 case BTRFS_SHARED_BLOCK_REF_KEY
:
1174 /* SHARED DIRECT METADATA backref */
1175 ret
= add_direct_ref(fs_info
, preftrees
,
1176 info_level
+ 1, key
.offset
,
1177 ctx
->bytenr
, 1, NULL
, GFP_NOFS
);
1179 case BTRFS_SHARED_DATA_REF_KEY
: {
1180 /* SHARED DIRECT FULL backref */
1181 struct btrfs_shared_data_ref
*sdref
;
1184 sdref
= btrfs_item_ptr(leaf
, slot
,
1185 struct btrfs_shared_data_ref
);
1186 count
= btrfs_shared_data_ref_count(leaf
, sdref
);
1187 ret
= add_direct_ref(fs_info
, preftrees
, 0,
1188 key
.offset
, ctx
->bytenr
, count
,
1192 case BTRFS_TREE_BLOCK_REF_KEY
:
1193 /* NORMAL INDIRECT METADATA backref */
1194 ret
= add_indirect_ref(fs_info
, preftrees
, key
.offset
,
1195 NULL
, info_level
+ 1, ctx
->bytenr
,
1198 case BTRFS_EXTENT_DATA_REF_KEY
: {
1199 /* NORMAL INDIRECT DATA backref */
1200 struct btrfs_extent_data_ref
*dref
;
1204 dref
= btrfs_item_ptr(leaf
, slot
,
1205 struct btrfs_extent_data_ref
);
1206 count
= btrfs_extent_data_ref_count(leaf
, dref
);
1207 key
.objectid
= btrfs_extent_data_ref_objectid(leaf
,
1209 key
.type
= BTRFS_EXTENT_DATA_KEY
;
1210 key
.offset
= btrfs_extent_data_ref_offset(leaf
, dref
);
1212 if (sc
&& key
.objectid
!= sc
->inum
&&
1213 !sc
->have_delayed_delete_refs
) {
1214 ret
= BACKREF_FOUND_SHARED
;
1218 root
= btrfs_extent_data_ref_root(leaf
, dref
);
1220 if (!ctx
->skip_data_ref
||
1221 !ctx
->skip_data_ref(root
, key
.objectid
, key
.offset
,
1223 ret
= add_indirect_ref(fs_info
, preftrees
, root
,
1224 &key
, 0, ctx
->bytenr
,
1225 count
, sc
, GFP_NOFS
);
/*
 * The caller has joined a transaction or is holding a read lock on the
 * fs_info->commit_root_sem semaphore, so no need to worry about the root's
 * last snapshot field changing while updating or checking the cache.
 */
1244 static bool lookup_backref_shared_cache(struct btrfs_backref_share_check_ctx
*ctx
,
1245 struct btrfs_root
*root
,
1246 u64 bytenr
, int level
, bool *is_shared
)
1248 const struct btrfs_fs_info
*fs_info
= root
->fs_info
;
1249 struct btrfs_backref_shared_cache_entry
*entry
;
1251 if (!current
->journal_info
)
1252 lockdep_assert_held(&fs_info
->commit_root_sem
);
1254 if (!ctx
->use_path_cache
)
1257 if (WARN_ON_ONCE(level
>= BTRFS_MAX_LEVEL
))
	/*
	 * Level -1 is used for the data extent, which is not reliable to cache
	 * because its reference count can increase or decrease without us
	 * realizing. We cache results only for extent buffers that lead from
	 * the root node down to the leaf with the file extent item.
	 */
1268 entry
= &ctx
->path_cache_entries
[level
];
1270 /* Unused cache entry or being used for some other extent buffer. */
1271 if (entry
->bytenr
!= bytenr
)
1275 * We cached a false result, but the last snapshot generation of the
1276 * root changed, so we now have a snapshot. Don't trust the result.
1278 if (!entry
->is_shared
&&
1279 entry
->gen
!= btrfs_root_last_snapshot(&root
->root_item
))
1283 * If we cached a true result and the last generation used for dropping
1284 * a root changed, we can not trust the result, because the dropped root
1285 * could be a snapshot sharing this extent buffer.
1287 if (entry
->is_shared
&&
1288 entry
->gen
!= btrfs_get_last_root_drop_gen(fs_info
))
1291 *is_shared
= entry
->is_shared
;
	/*
	 * If the node at this level is shared, then all nodes below are also
	 * shared. Currently some of the nodes below may be marked as not shared
	 * because we have just switched from one leaf to another, and switched
	 * also other nodes above the leaf and below the current level, so mark
	 * them as shared.
	 */
1300 for (int i
= 0; i
< level
; i
++) {
1301 ctx
->path_cache_entries
[i
].is_shared
= true;
1302 ctx
->path_cache_entries
[i
].gen
= entry
->gen
;
/*
 * The caller has joined a transaction or is holding a read lock on the
 * fs_info->commit_root_sem semaphore, so no need to worry about the root's
 * last snapshot field changing while updating or checking the cache.
 */
1314 static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx
*ctx
,
1315 struct btrfs_root
*root
,
1316 u64 bytenr
, int level
, bool is_shared
)
1318 const struct btrfs_fs_info
*fs_info
= root
->fs_info
;
1319 struct btrfs_backref_shared_cache_entry
*entry
;
1322 if (!current
->journal_info
)
1323 lockdep_assert_held(&fs_info
->commit_root_sem
);
1325 if (!ctx
->use_path_cache
)
1328 if (WARN_ON_ONCE(level
>= BTRFS_MAX_LEVEL
))
	/*
	 * Level -1 is used for the data extent, which is not reliable to cache
	 * because its reference count can increase or decrease without us
	 * realizing. We cache results only for extent buffers that lead from
	 * the root node down to the leaf with the file extent item.
	 */
1340 gen
= btrfs_get_last_root_drop_gen(fs_info
);
1342 gen
= btrfs_root_last_snapshot(&root
->root_item
);
1344 entry
= &ctx
->path_cache_entries
[level
];
1345 entry
->bytenr
= bytenr
;
1346 entry
->is_shared
= is_shared
;
1350 * If we found an extent buffer is shared, set the cache result for all
1351 * extent buffers below it to true. As nodes in the path are COWed,
1352 * their sharedness is moved to their children, and if a leaf is COWed,
1353 * then the sharedness of a data extent becomes direct, the refcount of
1354 * data extent is increased in the extent item at the extent tree.
1357 for (int i
= 0; i
< level
; i
++) {
1358 entry
= &ctx
->path_cache_entries
[i
];
1359 entry
->is_shared
= is_shared
;
/*
 * This adds all existing backrefs (inline backrefs, backrefs and delayed
 * refs) for the given bytenr to the refs list, merges duplicates and resolves
 * indirect refs to their parent bytenr.
 * When roots are found, they're added to the roots list.
 *
 * @ctx:     Backref walking context object, must be not NULL.
 * @sc:      If !NULL, then immediately return BACKREF_FOUND_SHARED when a
 *           shared extent is detected.
 *
 * Otherwise this returns 0 for success and < 0 for an error.
 *
 * FIXME some caching might speed things up
 */
1379 static int find_parent_nodes(struct btrfs_backref_walk_ctx
*ctx
,
1380 struct share_check
*sc
)
1382 struct btrfs_root
*root
= btrfs_extent_root(ctx
->fs_info
, ctx
->bytenr
);
1383 struct btrfs_key key
;
1384 struct btrfs_path
*path
;
1385 struct btrfs_delayed_ref_root
*delayed_refs
= NULL
;
1386 struct btrfs_delayed_ref_head
*head
;
1389 struct prelim_ref
*ref
;
1390 struct rb_node
*node
;
1391 struct extent_inode_elem
*eie
= NULL
;
1392 struct preftrees preftrees
= {
1393 .direct
= PREFTREE_INIT
,
1394 .indirect
= PREFTREE_INIT
,
1395 .indirect_missing_keys
= PREFTREE_INIT
1398 /* Roots ulist is not needed when using a sharedness check context. */
1400 ASSERT(ctx
->roots
== NULL
);
1402 key
.objectid
= ctx
->bytenr
;
1403 key
.offset
= (u64
)-1;
1404 if (btrfs_fs_incompat(ctx
->fs_info
, SKINNY_METADATA
))
1405 key
.type
= BTRFS_METADATA_ITEM_KEY
;
1407 key
.type
= BTRFS_EXTENT_ITEM_KEY
;
1409 path
= btrfs_alloc_path();
1413 path
->search_commit_root
= 1;
1414 path
->skip_locking
= 1;
1417 if (ctx
->time_seq
== BTRFS_SEQ_LAST
)
1418 path
->skip_locking
= 1;
1423 ret
= btrfs_search_slot(NULL
, root
, &key
, path
, 0, 0);
		/*
		 * Key with offset -1 found, there would have to exist an extent
		 * item with such offset, but this is out of the valid range.
		 */
1435 if (ctx
->trans
&& likely(ctx
->trans
->type
!= __TRANS_DUMMY
) &&
1436 ctx
->time_seq
!= BTRFS_SEQ_LAST
) {
1438 * We have a specific time_seq we care about and trans which
1439 * means we have the path lock, we need to grab the ref head and
1440 * lock it so we have a consistent view of the refs at the given
1443 delayed_refs
= &ctx
->trans
->transaction
->delayed_refs
;
1444 spin_lock(&delayed_refs
->lock
);
1445 head
= btrfs_find_delayed_ref_head(delayed_refs
, ctx
->bytenr
);
1447 if (!mutex_trylock(&head
->mutex
)) {
1448 refcount_inc(&head
->refs
);
1449 spin_unlock(&delayed_refs
->lock
);
1451 btrfs_release_path(path
);
1454 * Mutex was contended, block until it's
1455 * released and try again
1457 mutex_lock(&head
->mutex
);
1458 mutex_unlock(&head
->mutex
);
1459 btrfs_put_delayed_ref_head(head
);
1462 spin_unlock(&delayed_refs
->lock
);
1463 ret
= add_delayed_refs(ctx
->fs_info
, head
, ctx
->time_seq
,
1465 mutex_unlock(&head
->mutex
);
1469 spin_unlock(&delayed_refs
->lock
);
1473 if (path
->slots
[0]) {
1474 struct extent_buffer
*leaf
;
1478 leaf
= path
->nodes
[0];
1479 slot
= path
->slots
[0];
1480 btrfs_item_key_to_cpu(leaf
, &key
, slot
);
1481 if (key
.objectid
== ctx
->bytenr
&&
1482 (key
.type
== BTRFS_EXTENT_ITEM_KEY
||
1483 key
.type
== BTRFS_METADATA_ITEM_KEY
)) {
1484 ret
= add_inline_refs(ctx
, path
, &info_level
,
1488 ret
= add_keyed_refs(ctx
, root
, path
, info_level
,
1496 * If we have a share context and we reached here, it means the extent
1497 * is not directly shared (no multiple reference items for it),
1498 * otherwise we would have exited earlier with a return value of
1499 * BACKREF_FOUND_SHARED after processing delayed references or while
1500 * processing inline or keyed references from the extent tree.
1501 * The extent may however be indirectly shared through shared subtrees
1502 * as a result from creating snapshots, so we determine below what is
1503 * its parent node, in case we are dealing with a metadata extent, or
1504 * what's the leaf (or leaves), from a fs tree, that has a file extent
1505 * item pointing to it in case we are dealing with a data extent.
1507 ASSERT(extent_is_shared(sc
) == 0);
1510 * If we are here for a data extent and we have a share_check structure
1511 * it means the data extent is not directly shared (does not have
1512 * multiple reference items), so we have to check if a path in the fs
1513 * tree (going from the root node down to the leaf that has the file
1514 * extent item pointing to the data extent) is shared, that is, if any
1515 * of the extent buffers in the path is referenced by other trees.
1517 if (sc
&& ctx
->bytenr
== sc
->data_bytenr
) {
1519 * If our data extent is from a generation more recent than the
1520 * last generation used to snapshot the root, then we know that
1521 * it can not be shared through subtrees, so we can skip
1522 * resolving indirect references, there's no point in
1523 * determining the extent buffers for the path from the fs tree
1524 * root node down to the leaf that has the file extent item that
1525 * points to the data extent.
1527 if (sc
->data_extent_gen
>
1528 btrfs_root_last_snapshot(&sc
->root
->root_item
)) {
1529 ret
= BACKREF_FOUND_NOT_SHARED
;
1534 * If we are only determining if a data extent is shared or not
1535 * and the corresponding file extent item is located in the same
1536 * leaf as the previous file extent item, we can skip resolving
1537 * indirect references for a data extent, since the fs tree path
1538 * is the same (same leaf, so same path). We skip as long as the
1539 * cached result for the leaf is valid and only if there's only
1540 * one file extent item pointing to the data extent, because in
1541 * the case of multiple file extent items, they may be located
1542 * in different leaves and therefore we have multiple paths.
1544 if (sc
->ctx
->curr_leaf_bytenr
== sc
->ctx
->prev_leaf_bytenr
&&
1545 sc
->self_ref_count
== 1) {
1549 cached
= lookup_backref_shared_cache(sc
->ctx
, sc
->root
,
1550 sc
->ctx
->curr_leaf_bytenr
,
1554 ret
= BACKREF_FOUND_SHARED
;
1556 ret
= BACKREF_FOUND_NOT_SHARED
;
1562 btrfs_release_path(path
);
1564 ret
= add_missing_keys(ctx
->fs_info
, &preftrees
, path
->skip_locking
== 0);
1568 WARN_ON(!RB_EMPTY_ROOT(&preftrees
.indirect_missing_keys
.root
.rb_root
));
1570 ret
= resolve_indirect_refs(ctx
, path
, &preftrees
, sc
);
1574 WARN_ON(!RB_EMPTY_ROOT(&preftrees
.indirect
.root
.rb_root
));
1577 * This walks the tree of merged and resolved refs. Tree blocks are
1578 * read in as needed. Unique entries are added to the ulist, and
1579 * the list of found roots is updated.
1581 * We release the entire tree in one go before returning.
1583 node
= rb_first_cached(&preftrees
.direct
.root
);
1585 ref
= rb_entry(node
, struct prelim_ref
, rbnode
);
1586 node
= rb_next(&ref
->rbnode
);
1588 * ref->count < 0 can happen here if there are delayed
1589 * refs with a node->action of BTRFS_DROP_DELAYED_REF.
1590 * prelim_ref_insert() relies on this when merging
1591 * identical refs to keep the overall count correct.
1592 * prelim_ref_insert() will merge only those refs
1593 * which compare identically. Any refs having
1594 * e.g. different offsets would not be merged,
1595 * and would retain their original ref->count < 0.
1597 if (ctx
->roots
&& ref
->count
&& ref
->root_id
&& ref
->parent
== 0) {
1598 /* no parent == root of tree */
1599 ret
= ulist_add(ctx
->roots
, ref
->root_id
, 0, GFP_NOFS
);
1603 if (ref
->count
&& ref
->parent
) {
1604 if (!ctx
->skip_inode_ref_list
&& !ref
->inode_list
&&
1606 struct btrfs_tree_parent_check check
= { 0 };
1607 struct extent_buffer
*eb
;
1609 check
.level
= ref
->level
;
1611 eb
= read_tree_block(ctx
->fs_info
, ref
->parent
,
1617 if (!extent_buffer_uptodate(eb
)) {
1618 free_extent_buffer(eb
);
1623 if (!path
->skip_locking
)
1624 btrfs_tree_read_lock(eb
);
1625 ret
= find_extent_in_eb(ctx
, eb
, &eie
);
1626 if (!path
->skip_locking
)
1627 btrfs_tree_read_unlock(eb
);
1628 free_extent_buffer(eb
);
1629 if (ret
== BTRFS_ITERATE_EXTENT_INODES_STOP
||
1632 ref
->inode_list
= eie
;
1634 * We transferred the list ownership to the ref,
1635 * so set to NULL to avoid a double free in case
1636 * an error happens after this.
1640 ret
= ulist_add_merge_ptr(ctx
->refs
, ref
->parent
,
1642 (void **)&eie
, GFP_NOFS
);
1645 if (!ret
&& !ctx
->skip_inode_ref_list
) {
1647 * We've recorded that parent, so we must extend
1648 * its inode list here.
1650 * However if there was corruption we may not
1651 * have found an eie, return an error in this
1661 eie
->next
= ref
->inode_list
;
1665 * We have transferred the inode list ownership from
1666 * this ref to the ref we added to the 'refs' ulist.
1667 * So set this ref's inode list to NULL to avoid
1668 * use-after-free when our caller uses it or double
1669 * frees in case an error happens before we return.
1671 ref
->inode_list
= NULL
;
1677 btrfs_free_path(path
);
1679 prelim_release(&preftrees
.direct
);
1680 prelim_release(&preftrees
.indirect
);
1681 prelim_release(&preftrees
.indirect_missing_keys
);
1683 if (ret
== BTRFS_ITERATE_EXTENT_INODES_STOP
|| ret
< 0)
1684 free_inode_elem_list(eie
);
/*
 * Finds all leaves with a reference to the specified combination of
 * @ctx->bytenr and @ctx->extent_item_pos. The bytenr of the found leaves are
 * added to the ulist at @ctx->refs, and that ulist is allocated by this
 * function. The caller should free the ulist with free_leaf_list() if
 * @ctx->ignore_extent_item_pos is false, otherwise a simple ulist_free() is
 * enough.
 *
 * Returns 0 on success and < 0 on error. On error @ctx->refs is not allocated.
 */
1698 int btrfs_find_all_leafs(struct btrfs_backref_walk_ctx
*ctx
)
1702 ASSERT(ctx
->refs
== NULL
);
1704 ctx
->refs
= ulist_alloc(GFP_NOFS
);
1708 ret
= find_parent_nodes(ctx
, NULL
);
1709 if (ret
== BTRFS_ITERATE_EXTENT_INODES_STOP
||
1710 (ret
< 0 && ret
!= -ENOENT
)) {
1711 free_leaf_list(ctx
->refs
);
/*
 * Walk all backrefs for a given extent to find all roots that reference this
 * extent. Walking a backref means finding all extents that reference this
 * extent and in turn walk the backrefs of those, too. Naturally this is a
 * recursive process, but here it is implemented in an iterative fashion: We
 * find all referencing extents for the extent in question and put them on a
 * list. In turn, we find all referencing extents for those, further appending
 * to the list. The way we iterate the list allows adding more elements after
 * the current while iterating. The process stops when we reach the end of the
 * list.
 *
 * Found roots are added to @ctx->roots, which is allocated by this function if
 * it points to NULL, in which case the caller is responsible for freeing it
 * after it's not needed anymore.
 * This function requires @ctx->refs to be NULL, as it uses it for allocating a
 * ulist to do temporary work, and frees it before returning.
 *
 * Returns 0 on success, < 0 on error.
 */
1738 static int btrfs_find_all_roots_safe(struct btrfs_backref_walk_ctx
*ctx
)
1740 const u64 orig_bytenr
= ctx
->bytenr
;
1741 const bool orig_skip_inode_ref_list
= ctx
->skip_inode_ref_list
;
1742 bool roots_ulist_allocated
= false;
1743 struct ulist_iterator uiter
;
1746 ASSERT(ctx
->refs
== NULL
);
1748 ctx
->refs
= ulist_alloc(GFP_NOFS
);
1753 ctx
->roots
= ulist_alloc(GFP_NOFS
);
1755 ulist_free(ctx
->refs
);
1759 roots_ulist_allocated
= true;
1762 ctx
->skip_inode_ref_list
= true;
1764 ULIST_ITER_INIT(&uiter
);
1766 struct ulist_node
*node
;
1768 ret
= find_parent_nodes(ctx
, NULL
);
1769 if (ret
< 0 && ret
!= -ENOENT
) {
1770 if (roots_ulist_allocated
) {
1771 ulist_free(ctx
->roots
);
1777 node
= ulist_next(ctx
->refs
, &uiter
);
1780 ctx
->bytenr
= node
->val
;
1784 ulist_free(ctx
->refs
);
1786 ctx
->bytenr
= orig_bytenr
;
1787 ctx
->skip_inode_ref_list
= orig_skip_inode_ref_list
;
1792 int btrfs_find_all_roots(struct btrfs_backref_walk_ctx
*ctx
,
1793 bool skip_commit_root_sem
)
1797 if (!ctx
->trans
&& !skip_commit_root_sem
)
1798 down_read(&ctx
->fs_info
->commit_root_sem
);
1799 ret
= btrfs_find_all_roots_safe(ctx
);
1800 if (!ctx
->trans
&& !skip_commit_root_sem
)
1801 up_read(&ctx
->fs_info
->commit_root_sem
);
1805 struct btrfs_backref_share_check_ctx
*btrfs_alloc_backref_share_check_ctx(void)
1807 struct btrfs_backref_share_check_ctx
*ctx
;
1809 ctx
= kzalloc(sizeof(*ctx
), GFP_KERNEL
);
1813 ulist_init(&ctx
->refs
);
1818 void btrfs_free_backref_share_ctx(struct btrfs_backref_share_check_ctx
*ctx
)
1823 ulist_release(&ctx
->refs
);
/*
 * Check if a data extent is shared or not.
 *
 * @inode:       The inode whose extent we are checking.
 * @bytenr:      Logical bytenr of the extent we are checking.
 * @extent_gen:  Generation of the extent (file extent item) or 0 if it is
 *               unknown.
 * @ctx:         A backref sharedness check context.
 *
 * btrfs_is_data_extent_shared uses the backref walking code but will short
 * circuit as soon as it finds a root or inode that doesn't match the
 * one passed in. This provides a significant performance benefit for
 * callers (such as fiemap) which want to know whether the extent is
 * shared but do not need a ref count.
 *
 * This attempts to attach to the running transaction in order to account for
 * delayed refs, but continues on even when no running transaction exists.
 *
 * Return: 0 if extent is not shared, 1 if it is shared, < 0 on error.
 */
1847 int btrfs_is_data_extent_shared(struct btrfs_inode
*inode
, u64 bytenr
,
1849 struct btrfs_backref_share_check_ctx
*ctx
)
1851 struct btrfs_backref_walk_ctx walk_ctx
= { 0 };
1852 struct btrfs_root
*root
= inode
->root
;
1853 struct btrfs_fs_info
*fs_info
= root
->fs_info
;
1854 struct btrfs_trans_handle
*trans
;
1855 struct ulist_iterator uiter
;
1856 struct ulist_node
*node
;
1857 struct btrfs_seq_list elem
= BTRFS_SEQ_LIST_INIT(elem
);
1859 struct share_check shared
= {
1862 .inum
= btrfs_ino(inode
),
1863 .data_bytenr
= bytenr
,
1864 .data_extent_gen
= extent_gen
,
1866 .self_ref_count
= 0,
1867 .have_delayed_delete_refs
= false,
1871 bool leaf_is_shared
;
1873 for (int i
= 0; i
< BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE
; i
++) {
1874 if (ctx
->prev_extents_cache
[i
].bytenr
== bytenr
)
1875 return ctx
->prev_extents_cache
[i
].is_shared
;
1878 ulist_init(&ctx
->refs
);
1880 trans
= btrfs_join_transaction_nostart(root
);
1881 if (IS_ERR(trans
)) {
1882 if (PTR_ERR(trans
) != -ENOENT
&& PTR_ERR(trans
) != -EROFS
) {
1883 ret
= PTR_ERR(trans
);
1887 down_read(&fs_info
->commit_root_sem
);
1889 btrfs_get_tree_mod_seq(fs_info
, &elem
);
1890 walk_ctx
.time_seq
= elem
.seq
;
1893 ctx
->use_path_cache
= true;
1896 * We may have previously determined that the current leaf is shared.
1897 * If it is, then we have a data extent that is shared due to a shared
1898 * subtree (caused by snapshotting) and we don't need to check for data
1899 * backrefs. If the leaf is not shared, then we must do backref walking
1900 * to determine if the data extent is shared through reflinks.
1902 leaf_cached
= lookup_backref_shared_cache(ctx
, root
,
1903 ctx
->curr_leaf_bytenr
, 0,
1905 if (leaf_cached
&& leaf_is_shared
) {
1910 walk_ctx
.skip_inode_ref_list
= true;
1911 walk_ctx
.trans
= trans
;
1912 walk_ctx
.fs_info
= fs_info
;
1913 walk_ctx
.refs
= &ctx
->refs
;
1915 /* -1 means we are in the bytenr of the data extent. */
1917 ULIST_ITER_INIT(&uiter
);
1919 const unsigned long prev_ref_count
= ctx
->refs
.nnodes
;
1921 walk_ctx
.bytenr
= bytenr
;
1922 ret
= find_parent_nodes(&walk_ctx
, &shared
);
1923 if (ret
== BACKREF_FOUND_SHARED
||
1924 ret
== BACKREF_FOUND_NOT_SHARED
) {
1925 /* If shared must return 1, otherwise return 0. */
1926 ret
= (ret
== BACKREF_FOUND_SHARED
) ? 1 : 0;
1928 store_backref_shared_cache(ctx
, root
, bytenr
,
1932 if (ret
< 0 && ret
!= -ENOENT
)
1937 * More than one extent buffer (bytenr) may have been added to
1938 * the ctx->refs ulist, in which case we have to check multiple
1939 * tree paths in case the first one is not shared, so we can not
1940 * use the path cache which is made for a single path. Multiple
1941 * extent buffers at the current level happen when:
1943 * 1) level -1, the data extent: If our data extent was not
1944 * directly shared (without multiple reference items), then
1945 * it might have a single reference item with a count > 1 for
1946 * the same offset, which means there are 2 (or more) file
1947 * extent items that point to the data extent - this happens
1948 * when a file extent item needs to be split and then one
1949 * item gets moved to another leaf due to a b+tree leaf split
1950 * when inserting some item. In this case the file extent
1951 * items may be located in different leaves and therefore
1952 * some of the leaves may be referenced through shared
1953 * subtrees while others are not. Since our extent buffer
1954 * cache only works for a single path (by far the most common
1955 * case and simpler to deal with), we can not use it if we
1956 * have multiple leaves (which implies multiple paths).
1958 * 2) level >= 0, a tree node/leaf: We can have a mix of direct
1959 * and indirect references on a b+tree node/leaf, so we have
1960 * to check multiple paths, and the extent buffer (the
1961 * current bytenr) may be shared or not. One example is
1962 * during relocation as we may get a shared tree block ref
1963 * (direct ref) and a non-shared tree block ref (indirect
1964 * ref) for the same node/leaf.
1966 if ((ctx
->refs
.nnodes
- prev_ref_count
) > 1)
1967 ctx
->use_path_cache
= false;
1970 store_backref_shared_cache(ctx
, root
, bytenr
,
1972 node
= ulist_next(&ctx
->refs
, &uiter
);
1976 if (ctx
->use_path_cache
) {
1981 cached
= lookup_backref_shared_cache(ctx
, root
, bytenr
,
1984 ret
= (is_shared
? 1 : 0);
1988 shared
.share_count
= 0;
1989 shared
.have_delayed_delete_refs
= false;
1994 * If the path cache is disabled, then it means at some tree level we
1995 * got multiple parents due to a mix of direct and indirect backrefs or
1996 * multiple leaves with file extent items pointing to the same data
1997 * extent. We have to invalidate the cache and cache only the sharedness
1998 * result for the levels where we got only one node/reference.
2000 if (!ctx
->use_path_cache
) {
2004 if (ret
>= 0 && level
>= 0) {
2005 bytenr
= ctx
->path_cache_entries
[level
].bytenr
;
2006 ctx
->use_path_cache
= true;
2007 store_backref_shared_cache(ctx
, root
, bytenr
, level
, ret
);
2011 for ( ; i
< BTRFS_MAX_LEVEL
; i
++)
2012 ctx
->path_cache_entries
[i
].bytenr
= 0;
2016 * Cache the sharedness result for the data extent if we know our inode
2017 * has more than 1 file extent item that refers to the data extent.
2019 if (ret
>= 0 && shared
.self_ref_count
> 1) {
2020 int slot
= ctx
->prev_extents_cache_slot
;
2022 ctx
->prev_extents_cache
[slot
].bytenr
= shared
.data_bytenr
;
2023 ctx
->prev_extents_cache
[slot
].is_shared
= (ret
== 1);
2025 slot
= (slot
+ 1) % BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE
;
2026 ctx
->prev_extents_cache_slot
= slot
;
2031 btrfs_put_tree_mod_seq(fs_info
, &elem
);
2032 btrfs_end_transaction(trans
);
2034 up_read(&fs_info
->commit_root_sem
);
2037 ulist_release(&ctx
->refs
);
2038 ctx
->prev_leaf_bytenr
= ctx
->curr_leaf_bytenr
;
2043 int btrfs_find_one_extref(struct btrfs_root
*root
, u64 inode_objectid
,
2044 u64 start_off
, struct btrfs_path
*path
,
2045 struct btrfs_inode_extref
**ret_extref
,
2049 struct btrfs_key key
;
2050 struct btrfs_key found_key
;
2051 struct btrfs_inode_extref
*extref
;
2052 const struct extent_buffer
*leaf
;
2055 key
.objectid
= inode_objectid
;
2056 key
.type
= BTRFS_INODE_EXTREF_KEY
;
2057 key
.offset
= start_off
;
2059 ret
= btrfs_search_slot(NULL
, root
, &key
, path
, 0, 0);
2064 leaf
= path
->nodes
[0];
2065 slot
= path
->slots
[0];
2066 if (slot
>= btrfs_header_nritems(leaf
)) {
2068 * If the item at offset is not found,
2069 * btrfs_search_slot will point us to the slot
2070 * where it should be inserted. In our case
2071 * that will be the slot directly before the
2072 * next INODE_REF_KEY_V2 item. In the case
2073 * that we're pointing to the last slot in a
2074 * leaf, we must move one leaf over.
2076 ret
= btrfs_next_leaf(root
, path
);
2085 btrfs_item_key_to_cpu(leaf
, &found_key
, slot
);
2088 * Check that we're still looking at an extended ref key for
2089 * this particular objectid. If we have different
2090 * objectid or type then there are no more to be found
2091 * in the tree and we can exit.
2094 if (found_key
.objectid
!= inode_objectid
)
2096 if (found_key
.type
!= BTRFS_INODE_EXTREF_KEY
)
2100 ptr
= btrfs_item_ptr_offset(leaf
, path
->slots
[0]);
2101 extref
= (struct btrfs_inode_extref
*)ptr
;
2102 *ret_extref
= extref
;
2104 *found_off
= found_key
.offset
;
/*
 * This iterates to turn a name (from iref/extref) into a full filesystem path.
 * Elements of the path are separated by '/' and the path is guaranteed to be
 * 0-terminated. The path is only given within the current file system.
 * Therefore, it never starts with a '/'. The caller is responsible to provide
 * "size" bytes in "dest". The dest buffer will be filled backwards. Finally,
 * the start point of the resulting string is returned. This pointer is within
 * dest, normally.
 * In case the path buffer would overflow, the pointer is decremented further
 * as if output was written to the buffer, though no more output is actually
 * generated. That way, the caller can determine how much space would be
 * required for the path to fit into the buffer. In that case, the returned
 * value will be smaller than dest. Callers must check this!
 */
2125 char *btrfs_ref_to_path(struct btrfs_root
*fs_root
, struct btrfs_path
*path
,
2126 u32 name_len
, unsigned long name_off
,
2127 struct extent_buffer
*eb_in
, u64 parent
,
2128 char *dest
, u32 size
)
2133 s64 bytes_left
= ((s64
)size
) - 1;
2134 struct extent_buffer
*eb
= eb_in
;
2135 struct btrfs_key found_key
;
2136 struct btrfs_inode_ref
*iref
;
2138 if (bytes_left
>= 0)
2139 dest
[bytes_left
] = '\0';
2142 bytes_left
-= name_len
;
2143 if (bytes_left
>= 0)
2144 read_extent_buffer(eb
, dest
+ bytes_left
,
2145 name_off
, name_len
);
2147 if (!path
->skip_locking
)
2148 btrfs_tree_read_unlock(eb
);
2149 free_extent_buffer(eb
);
2151 ret
= btrfs_find_item(fs_root
, path
, parent
, 0,
2152 BTRFS_INODE_REF_KEY
, &found_key
);
2158 next_inum
= found_key
.offset
;
2160 /* regular exit ahead */
2161 if (parent
== next_inum
)
2164 slot
= path
->slots
[0];
2165 eb
= path
->nodes
[0];
2166 /* make sure we can use eb after releasing the path */
2168 path
->nodes
[0] = NULL
;
2171 btrfs_release_path(path
);
2172 iref
= btrfs_item_ptr(eb
, slot
, struct btrfs_inode_ref
);
2174 name_len
= btrfs_inode_ref_name_len(eb
, iref
);
2175 name_off
= (unsigned long)(iref
+ 1);
		parent = next_inum;
		--bytes_left;
		if (bytes_left >= 0)
			dest[bytes_left] = '/';
	}

	btrfs_release_path(path);

	if (ret)
		return ERR_PTR(ret);

	return dest + bytes_left;
}
/*
 * this makes the path point to (logical EXTENT_ITEM *)
 * returns BTRFS_EXTENT_FLAG_DATA for data, BTRFS_EXTENT_FLAG_TREE_BLOCK for
 * tree blocks and <0 on error.
 */
2196 int extent_from_logical(struct btrfs_fs_info
*fs_info
, u64 logical
,
2197 struct btrfs_path
*path
, struct btrfs_key
*found_key
,
2200 struct btrfs_root
*extent_root
= btrfs_extent_root(fs_info
, logical
);
2205 const struct extent_buffer
*eb
;
2206 struct btrfs_extent_item
*ei
;
2207 struct btrfs_key key
;
2209 if (btrfs_fs_incompat(fs_info
, SKINNY_METADATA
))
2210 key
.type
= BTRFS_METADATA_ITEM_KEY
;
2212 key
.type
= BTRFS_EXTENT_ITEM_KEY
;
2213 key
.objectid
= logical
;
2214 key
.offset
= (u64
)-1;
2216 ret
= btrfs_search_slot(NULL
, extent_root
, &key
, path
, 0, 0);
2221 * Key with offset -1 found, there would have to exist an extent
2222 * item with such offset, but this is out of the valid range.
2227 ret
= btrfs_previous_extent_item(extent_root
, path
, 0);
2233 btrfs_item_key_to_cpu(path
->nodes
[0], found_key
, path
->slots
[0]);
2234 if (found_key
->type
== BTRFS_METADATA_ITEM_KEY
)
2235 size
= fs_info
->nodesize
;
2236 else if (found_key
->type
== BTRFS_EXTENT_ITEM_KEY
)
2237 size
= found_key
->offset
;
2239 if (found_key
->objectid
> logical
||
2240 found_key
->objectid
+ size
<= logical
) {
2241 btrfs_debug(fs_info
,
2242 "logical %llu is not within any extent", logical
);
2246 eb
= path
->nodes
[0];
2247 item_size
= btrfs_item_size(eb
, path
->slots
[0]);
2249 ei
= btrfs_item_ptr(eb
, path
->slots
[0], struct btrfs_extent_item
);
2250 flags
= btrfs_extent_flags(eb
, ei
);
2252 btrfs_debug(fs_info
,
2253 "logical %llu is at position %llu within the extent (%llu EXTENT_ITEM %llu) flags %#llx size %u",
2254 logical
, logical
- found_key
->objectid
, found_key
->objectid
,
2255 found_key
->offset
, flags
, item_size
);
2257 WARN_ON(!flags_ret
);
		if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
			*flags_ret = BTRFS_EXTENT_FLAG_TREE_BLOCK;
		else if (flags & BTRFS_EXTENT_FLAG_DATA)
			*flags_ret = BTRFS_EXTENT_FLAG_DATA;
/*
 * helper function to iterate extent inline refs. ptr must point to a 0 value
 * for the first call and may be modified. it is used to track state.
 * if more refs exist, 0 is returned and the next call to
 * get_extent_inline_ref must pass the modified ptr parameter to get the
 * next ref. after the last ref was processed, 1 is returned.
 * returns <0 on error
 */
2279 static int get_extent_inline_ref(unsigned long *ptr
,
2280 const struct extent_buffer
*eb
,
2281 const struct btrfs_key
*key
,
2282 const struct btrfs_extent_item
*ei
,
2284 struct btrfs_extent_inline_ref
**out_eiref
,
2289 struct btrfs_tree_block_info
*info
;
2293 flags
= btrfs_extent_flags(eb
, ei
);
2294 if (flags
& BTRFS_EXTENT_FLAG_TREE_BLOCK
) {
2295 if (key
->type
== BTRFS_METADATA_ITEM_KEY
) {
2296 /* a skinny metadata extent */
2298 (struct btrfs_extent_inline_ref
*)(ei
+ 1);
2300 WARN_ON(key
->type
!= BTRFS_EXTENT_ITEM_KEY
);
2301 info
= (struct btrfs_tree_block_info
*)(ei
+ 1);
2303 (struct btrfs_extent_inline_ref
*)(info
+ 1);
2306 *out_eiref
= (struct btrfs_extent_inline_ref
*)(ei
+ 1);
2308 *ptr
= (unsigned long)*out_eiref
;
2309 if ((unsigned long)(*ptr
) >= (unsigned long)ei
+ item_size
)
2313 end
= (unsigned long)ei
+ item_size
;
2314 *out_eiref
= (struct btrfs_extent_inline_ref
*)(*ptr
);
2315 *out_type
= btrfs_get_extent_inline_ref_type(eb
, *out_eiref
,
2316 BTRFS_REF_TYPE_ANY
);
2317 if (*out_type
== BTRFS_REF_TYPE_INVALID
)
		return -EUCLEAN;

	*ptr += btrfs_extent_inline_ref_size(*out_type);
	WARN_ON(*ptr > end);
	if (*ptr == end)
		return 1; /* last */

	return 0;
}
/*
 * reads the tree block backref for an extent. tree level and root are returned
 * through out_level and out_root. ptr must point to a 0 value for the first
 * call and may be modified (see get_extent_inline_ref comment).
 * returns 0 if data was provided, 1 if there was no more data to provide or
 * <0 on error.
 */
2335 int tree_backref_for_extent(unsigned long *ptr
, struct extent_buffer
*eb
,
2336 struct btrfs_key
*key
, struct btrfs_extent_item
*ei
,
2337 u32 item_size
, u64
*out_root
, u8
*out_level
)
2341 struct btrfs_extent_inline_ref
*eiref
;
2343 if (*ptr
== (unsigned long)-1)
2347 ret
= get_extent_inline_ref(ptr
, eb
, key
, ei
, item_size
,
2352 if (type
== BTRFS_TREE_BLOCK_REF_KEY
||
2353 type
== BTRFS_SHARED_BLOCK_REF_KEY
)
2360 /* we can treat both ref types equally here */
2361 *out_root
= btrfs_extent_inline_ref_offset(eb
, eiref
);
2363 if (key
->type
== BTRFS_EXTENT_ITEM_KEY
) {
2364 struct btrfs_tree_block_info
*info
;
2366 info
= (struct btrfs_tree_block_info
*)(ei
+ 1);
2367 *out_level
= btrfs_tree_block_level(eb
, info
);
2369 ASSERT(key
->type
== BTRFS_METADATA_ITEM_KEY
);
2370 *out_level
= (u8
)key
->offset
;
	*ptr = (unsigned long)-1;

	return 0;
}
static int iterate_leaf_refs(struct btrfs_fs_info *fs_info,
			     struct extent_inode_elem *inode_list,
			     u64 root, u64 extent_item_objectid,
			     iterate_extent_inodes_t *iterate, void *ctx)
{
	struct extent_inode_elem *eie;
	int ret = 0;

	for (eie = inode_list; eie; eie = eie->next) {
		btrfs_debug(fs_info,
			    "ref for %llu resolved, key (%llu EXTEND_DATA %llu), root %llu",
			    extent_item_objectid, eie->inum,
			    eie->offset, root);
		ret = iterate(eie->inum, eie->offset, eie->num_bytes, root, ctx);
		if (ret) {
			btrfs_debug(fs_info,
				    "stopping iteration for %llu due to ret=%d",
				    extent_item_objectid, ret);
			break;
		}
	}

	return ret;
}
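/*
 * A hypothetical iterate_extent_inodes_t callback, shown only to illustrate
 * the contract used by iterate_leaf_refs() above (build_ino_list() further
 * below is the real in-tree example): it receives one (inum, offset,
 * num_bytes, root) tuple per resolved reference and any non-zero return value
 * stops the iteration.
 *
 *	static int count_refs(u64 inum, u64 offset, u64 num_bytes, u64 root,
 *			      void *ctx)
 *	{
 *		(*(u64 *)ctx)++;
 *		return 0;
 *	}
 */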
/*
 * calls iterate() for every inode that references the extent identified by
 * the given parameters.
 * when the iterator function returns a non-zero value, iteration stops.
 */
2409 int iterate_extent_inodes(struct btrfs_backref_walk_ctx
*ctx
,
2410 bool search_commit_root
,
2411 iterate_extent_inodes_t
*iterate
, void *user_ctx
)
2415 struct ulist_node
*ref_node
;
2416 struct btrfs_seq_list seq_elem
= BTRFS_SEQ_LIST_INIT(seq_elem
);
2417 struct ulist_iterator ref_uiter
;
2419 btrfs_debug(ctx
->fs_info
, "resolving all inodes for extent %llu",
2422 ASSERT(ctx
->trans
== NULL
);
2423 ASSERT(ctx
->roots
== NULL
);
2425 if (!search_commit_root
) {
2426 struct btrfs_trans_handle
*trans
;
2428 trans
= btrfs_attach_transaction(ctx
->fs_info
->tree_root
);
2429 if (IS_ERR(trans
)) {
2430 if (PTR_ERR(trans
) != -ENOENT
&&
2431 PTR_ERR(trans
) != -EROFS
)
2432 return PTR_ERR(trans
);
2439 btrfs_get_tree_mod_seq(ctx
->fs_info
, &seq_elem
);
2440 ctx
->time_seq
= seq_elem
.seq
;
2442 down_read(&ctx
->fs_info
->commit_root_sem
);
2445 ret
= btrfs_find_all_leafs(ctx
);
2451 ULIST_ITER_INIT(&ref_uiter
);
2452 while (!ret
&& (ref_node
= ulist_next(refs
, &ref_uiter
))) {
2453 const u64 leaf_bytenr
= ref_node
->val
;
2454 struct ulist_node
*root_node
;
2455 struct ulist_iterator root_uiter
;
2456 struct extent_inode_elem
*inode_list
;
2458 inode_list
= (struct extent_inode_elem
*)(uintptr_t)ref_node
->aux
;
2460 if (ctx
->cache_lookup
) {
2461 const u64
*root_ids
;
2465 cached
= ctx
->cache_lookup(leaf_bytenr
, ctx
->user_ctx
,
2466 &root_ids
, &root_count
);
2468 for (int i
= 0; i
< root_count
; i
++) {
2469 ret
= iterate_leaf_refs(ctx
->fs_info
,
2483 ctx
->roots
= ulist_alloc(GFP_NOFS
);
2490 ctx
->bytenr
= leaf_bytenr
;
2491 ret
= btrfs_find_all_roots_safe(ctx
);
2495 if (ctx
->cache_store
)
2496 ctx
->cache_store(leaf_bytenr
, ctx
->roots
, ctx
->user_ctx
);
2498 ULIST_ITER_INIT(&root_uiter
);
2499 while (!ret
&& (root_node
= ulist_next(ctx
->roots
, &root_uiter
))) {
2500 btrfs_debug(ctx
->fs_info
,
2501 "root %llu references leaf %llu, data list %#llx",
2502 root_node
->val
, ref_node
->val
,
2504 ret
= iterate_leaf_refs(ctx
->fs_info
, inode_list
,
2505 root_node
->val
, ctx
->bytenr
,
2508 ulist_reinit(ctx
->roots
);
2511 free_leaf_list(refs
);
2514 btrfs_put_tree_mod_seq(ctx
->fs_info
, &seq_elem
);
2515 btrfs_end_transaction(ctx
->trans
);
2518 up_read(&ctx
->fs_info
->commit_root_sem
);
	ulist_free(ctx->roots);
	ctx->roots = NULL;

	if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP)
		ret = 0;

	return ret;
}
static int build_ino_list(u64 inum, u64 offset, u64 num_bytes, u64 root, void *ctx)
{
	struct btrfs_data_container *inodes = ctx;
	const size_t c = 3 * sizeof(u64);

	if (inodes->bytes_left >= c) {
		inodes->bytes_left -= c;
		inodes->val[inodes->elem_cnt] = inum;
		inodes->val[inodes->elem_cnt + 1] = offset;
		inodes->val[inodes->elem_cnt + 2] = root;
		inodes->elem_cnt += 3;
	} else {
		inodes->bytes_missing += c - inodes->bytes_left;
		inodes->bytes_left = 0;
		inodes->elem_missed += 3;
	}

	return 0;
}
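/*
 * Each record stored by build_ino_list() occupies three u64 slots in
 * btrfs_data_container::val: inode number, file offset and root id. Once the
 * container is full, elem_missed and bytes_missing are updated instead, so a
 * caller can tell how much larger the buffer would have needed to be.
 */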
int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
				struct btrfs_path *path,
				void *ctx, bool ignore_offset)
{
	struct btrfs_backref_walk_ctx walk_ctx = { 0 };
	int ret;
	u64 flags = 0;
	struct btrfs_key found_key;
	int search_commit_root = path->search_commit_root;

	ret = extent_from_logical(fs_info, logical, path, &found_key, &flags);
	btrfs_release_path(path);
	if (ret < 0)
		return ret;
	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
		return -EINVAL;

	walk_ctx.bytenr = found_key.objectid;
	if (ignore_offset)
		walk_ctx.ignore_extent_item_pos = true;
	else
		walk_ctx.extent_item_pos = logical - found_key.objectid;
	walk_ctx.fs_info = fs_info;

	return iterate_extent_inodes(&walk_ctx, search_commit_root,
				     build_ino_list, ctx);
}
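/*
 * Only data extents can be resolved to inodes here: if the extent found at
 * @logical is a tree block (BTRFS_EXTENT_FLAG_TREE_BLOCK), -EINVAL is
 * returned. When @ignore_offset is set, the position of @logical inside the
 * extent is not used to filter which file extent items are reported.
 */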
static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
			 struct extent_buffer *eb, struct inode_fs_paths *ipath);
2581 static int iterate_inode_refs(u64 inum
, struct inode_fs_paths
*ipath
)
2590 struct btrfs_root
*fs_root
= ipath
->fs_root
;
2591 struct btrfs_path
*path
= ipath
->btrfs_path
;
2592 struct extent_buffer
*eb
;
2593 struct btrfs_inode_ref
*iref
;
2594 struct btrfs_key found_key
;
2597 ret
= btrfs_find_item(fs_root
, path
, inum
,
2598 parent
? parent
+ 1 : 0, BTRFS_INODE_REF_KEY
,
2604 ret
= found
? 0 : -ENOENT
;
2609 parent
= found_key
.offset
;
2610 slot
= path
->slots
[0];
2611 eb
= btrfs_clone_extent_buffer(path
->nodes
[0]);
2616 btrfs_release_path(path
);
2618 iref
= btrfs_item_ptr(eb
, slot
, struct btrfs_inode_ref
);
2620 for (cur
= 0; cur
< btrfs_item_size(eb
, slot
); cur
+= len
) {
2621 name_len
= btrfs_inode_ref_name_len(eb
, iref
);
2622 /* path must be released before calling iterate()! */
2623 btrfs_debug(fs_root
->fs_info
,
2624 "following ref at offset %u for inode %llu in tree %llu",
2625 cur
, found_key
.objectid
,
2626 btrfs_root_id(fs_root
));
2627 ret
= inode_to_path(parent
, name_len
,
2628 (unsigned long)(iref
+ 1), eb
, ipath
);
2631 len
= sizeof(*iref
) + name_len
;
2632 iref
= (struct btrfs_inode_ref
*)((char *)iref
+ len
);
2634 free_extent_buffer(eb
);
2637 btrfs_release_path(path
);
2642 static int iterate_inode_extrefs(u64 inum
, struct inode_fs_paths
*ipath
)
2649 struct btrfs_root
*fs_root
= ipath
->fs_root
;
2650 struct btrfs_path
*path
= ipath
->btrfs_path
;
2651 struct extent_buffer
*eb
;
2652 struct btrfs_inode_extref
*extref
;
2658 ret
= btrfs_find_one_extref(fs_root
, inum
, offset
, path
, &extref
,
2663 ret
= found
? 0 : -ENOENT
;
2668 slot
= path
->slots
[0];
2669 eb
= btrfs_clone_extent_buffer(path
->nodes
[0]);
2674 btrfs_release_path(path
);
2676 item_size
= btrfs_item_size(eb
, slot
);
2677 ptr
= btrfs_item_ptr_offset(eb
, slot
);
2680 while (cur_offset
< item_size
) {
2683 extref
= (struct btrfs_inode_extref
*)(ptr
+ cur_offset
);
2684 parent
= btrfs_inode_extref_parent(eb
, extref
);
2685 name_len
= btrfs_inode_extref_name_len(eb
, extref
);
2686 ret
= inode_to_path(parent
, name_len
,
2687 (unsigned long)&extref
->name
, eb
, ipath
);
2691 cur_offset
+= btrfs_inode_extref_name_len(eb
, extref
);
2692 cur_offset
+= sizeof(*extref
);
2694 free_extent_buffer(eb
);
	btrfs_release_path(path);

	return ret;
}
/*
 * returns 0 if the path could be dumped (probably truncated)
 * returns <0 in case of an error
 */
static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
			 struct extent_buffer *eb, struct inode_fs_paths *ipath)
{
	char *fspath;
	char *fspath_min;
	int i = ipath->fspath->elem_cnt;
	const int s_ptr = sizeof(char *);
	u32 bytes_left;

	bytes_left = ipath->fspath->bytes_left > s_ptr ?
					ipath->fspath->bytes_left - s_ptr : 0;

	fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr;
	fspath = btrfs_ref_to_path(ipath->fs_root, ipath->btrfs_path, name_len,
				   name_off, eb, inum, fspath_min, bytes_left);
	if (IS_ERR(fspath))
		return PTR_ERR(fspath);

	if (fspath > fspath_min) {
		ipath->fspath->val[i] = (u64)(unsigned long)fspath;
		++ipath->fspath->elem_cnt;
		ipath->fspath->bytes_left = fspath - fspath_min;
	} else {
		++ipath->fspath->elem_missed;
		ipath->fspath->bytes_missing += fspath_min - fspath;
		ipath->fspath->bytes_left = 0;
	}

	return 0;
}
/*
 * this dumps all file system paths to the inode into the ipath struct, provided
 * it has been created large enough. each path is zero-terminated and accessed
 * from ipath->fspath->val[i].
 * when it returns, there are ipath->fspath->elem_cnt number of paths available
 * in ipath->fspath->val[]. when the allocated space wasn't sufficient, the
 * number of missed paths is recorded in ipath->fspath->elem_missed, otherwise,
 * it's zero. ipath->fspath->bytes_missing holds the number of bytes that would
 * have been needed to return all paths.
 */
int paths_from_inode(u64 inum, struct inode_fs_paths *ipath)
{
	int ret;
	int found_refs = 0;

	ret = iterate_inode_refs(inum, ipath);
	if (!ret)
		++found_refs;
	else if (ret != -ENOENT)
		return ret;

	ret = iterate_inode_extrefs(inum, ipath);
	if (ret == -ENOENT && found_refs)
		return 0;

	return ret;
}
struct btrfs_data_container *init_data_container(u32 total_bytes)
{
	struct btrfs_data_container *data;
	size_t alloc_bytes;

	alloc_bytes = max_t(size_t, total_bytes, sizeof(*data));
	data = kvzalloc(alloc_bytes, GFP_KERNEL);
	if (!data)
		return ERR_PTR(-ENOMEM);

	if (total_bytes >= sizeof(*data))
		data->bytes_left = total_bytes - sizeof(*data);
	else
		data->bytes_missing = sizeof(*data) - total_bytes;

	return data;
}
/*
 * allocates space to return multiple file system paths for an inode.
 * total_bytes to allocate are passed, note that space usable for actual path
 * information will be total_bytes - sizeof(struct inode_fs_paths).
 * the returned pointer must be freed with free_ipath() in the end.
 */
struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
				  struct btrfs_path *path)
{
	struct inode_fs_paths *ifp;
	struct btrfs_data_container *fspath;

	fspath = init_data_container(total_bytes);
	if (IS_ERR(fspath))
		return ERR_CAST(fspath);

	ifp = kmalloc(sizeof(*ifp), GFP_KERNEL);
	if (!ifp) {
		kvfree(fspath);
		return ERR_PTR(-ENOMEM);
	}

	ifp->btrfs_path = path;
	ifp->fspath = fspath;
	ifp->fs_root = fs_root;

	return ifp;
}
void free_ipath(struct inode_fs_paths *ipath)
{
	if (!ipath)
		return;
	kvfree(ipath->fspath);
	kfree(ipath);
}
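/*
 * A minimal, hypothetical usage sketch of the ipath helpers above:
 *
 *	struct inode_fs_paths *ipath;
 *	int i, ret;
 *
 *	ipath = init_ipath(4096, fs_root, path);
 *	if (IS_ERR(ipath))
 *		return PTR_ERR(ipath);
 *	ret = paths_from_inode(inum, ipath);
 *	for (i = 0; !ret && i < ipath->fspath->elem_cnt; i++)
 *		pr_info("path: %s\n",
 *			(char *)(unsigned long)ipath->fspath->val[i]);
 *	// a non-zero ipath->fspath->elem_missed means 4096 bytes were too few
 *	free_ipath(ipath);
 */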
struct btrfs_backref_iter *btrfs_backref_iter_alloc(struct btrfs_fs_info *fs_info)
{
	struct btrfs_backref_iter *ret;

	ret = kzalloc(sizeof(*ret), GFP_NOFS);
	if (!ret)
		return NULL;

	ret->path = btrfs_alloc_path();
	if (!ret->path) {
		kfree(ret);
		return NULL;
	}

	/* Current backref iterator only supports iteration in commit root */
	ret->path->search_commit_root = 1;
	ret->path->skip_locking = 1;
	ret->fs_info = fs_info;

	return ret;
}
static void btrfs_backref_iter_release(struct btrfs_backref_iter *iter)
{
	iter->bytenr = 0;
	iter->item_ptr = 0;
	iter->cur_ptr = 0;
	iter->end_ptr = 0;
	btrfs_release_path(iter->path);
	memset(&iter->cur_key, 0, sizeof(iter->cur_key));
}
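/*
 * Typical shape of a backref iteration (this mirrors how
 * btrfs_backref_add_tree_node() below drives the iterator):
 *
 *	ret = btrfs_backref_iter_start(iter, bytenr);
 *	if (ret < 0)
 *		goto out;
 *	for (; ret == 0; ret = btrfs_backref_iter_next(iter)) {
 *		// iter->cur_key describes the current backref item;
 *		// iter->cur_ptr points into the inline refs when
 *		// btrfs_backref_iter_is_inline_ref(iter) is true
 *	}
 *	btrfs_backref_iter_release(iter);
 */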
2854 int btrfs_backref_iter_start(struct btrfs_backref_iter
*iter
, u64 bytenr
)
2856 struct btrfs_fs_info
*fs_info
= iter
->fs_info
;
2857 struct btrfs_root
*extent_root
= btrfs_extent_root(fs_info
, bytenr
);
2858 struct btrfs_path
*path
= iter
->path
;
2859 struct btrfs_extent_item
*ei
;
2860 struct btrfs_key key
;
2863 key
.objectid
= bytenr
;
2864 key
.type
= BTRFS_METADATA_ITEM_KEY
;
2865 key
.offset
= (u64
)-1;
2866 iter
->bytenr
= bytenr
;
2868 ret
= btrfs_search_slot(NULL
, extent_root
, &key
, path
, 0, 0);
2873 * Key with offset -1 found, there would have to exist an extent
2874 * item with such offset, but this is out of the valid range.
2879 if (path
->slots
[0] == 0) {
2880 WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG
));
2886 btrfs_item_key_to_cpu(path
->nodes
[0], &key
, path
->slots
[0]);
2887 if ((key
.type
!= BTRFS_EXTENT_ITEM_KEY
&&
2888 key
.type
!= BTRFS_METADATA_ITEM_KEY
) || key
.objectid
!= bytenr
) {
2892 memcpy(&iter
->cur_key
, &key
, sizeof(key
));
2893 iter
->item_ptr
= (u32
)btrfs_item_ptr_offset(path
->nodes
[0],
2895 iter
->end_ptr
= (u32
)(iter
->item_ptr
+
2896 btrfs_item_size(path
->nodes
[0], path
->slots
[0]));
2897 ei
= btrfs_item_ptr(path
->nodes
[0], path
->slots
[0],
2898 struct btrfs_extent_item
);
2901 * Only support iteration on tree backref yet.
2903 * This is an extra precaution for non skinny-metadata, where
2904 * EXTENT_ITEM is also used for tree blocks, that we can only use
2905 * extent flags to determine if it's a tree block.
2907 if (btrfs_extent_flags(path
->nodes
[0], ei
) & BTRFS_EXTENT_FLAG_DATA
) {
2911 iter
->cur_ptr
= (u32
)(iter
->item_ptr
+ sizeof(*ei
));
2913 /* If there is no inline backref, go search for keyed backref */
2914 if (iter
->cur_ptr
>= iter
->end_ptr
) {
2915 ret
= btrfs_next_item(extent_root
, path
);
2917 /* No inline nor keyed ref */
2925 btrfs_item_key_to_cpu(path
->nodes
[0], &iter
->cur_key
,
2927 if (iter
->cur_key
.objectid
!= bytenr
||
2928 (iter
->cur_key
.type
!= BTRFS_SHARED_BLOCK_REF_KEY
&&
2929 iter
->cur_key
.type
!= BTRFS_TREE_BLOCK_REF_KEY
)) {
2933 iter
->cur_ptr
= (u32
)btrfs_item_ptr_offset(path
->nodes
[0],
2935 iter
->item_ptr
= iter
->cur_ptr
;
2936 iter
->end_ptr
= (u32
)(iter
->item_ptr
+ btrfs_item_size(
2937 path
->nodes
[0], path
->slots
[0]));
2942 btrfs_backref_iter_release(iter
);
static bool btrfs_backref_iter_is_inline_ref(struct btrfs_backref_iter *iter)
{
	if (iter->cur_key.type == BTRFS_EXTENT_ITEM_KEY ||
	    iter->cur_key.type == BTRFS_METADATA_ITEM_KEY)
		return true;
	return false;
}
/*
 * Go to the next backref item of current bytenr, can be either inlined or
 * keyed.
 *
 * Caller needs to check whether it's inline ref or not by iter->cur_key.
 *
 * Return 0 if we get next backref without problem.
 * Return >0 if there is no extra backref for this bytenr.
 * Return <0 if something went wrong.
 */
2964 int btrfs_backref_iter_next(struct btrfs_backref_iter
*iter
)
2966 struct extent_buffer
*eb
= iter
->path
->nodes
[0];
2967 struct btrfs_root
*extent_root
;
2968 struct btrfs_path
*path
= iter
->path
;
2969 struct btrfs_extent_inline_ref
*iref
;
2973 if (btrfs_backref_iter_is_inline_ref(iter
)) {
2974 /* We're still inside the inline refs */
2975 ASSERT(iter
->cur_ptr
< iter
->end_ptr
);
2977 if (btrfs_backref_has_tree_block_info(iter
)) {
2978 /* First tree block info */
2979 size
= sizeof(struct btrfs_tree_block_info
);
2981 /* Use inline ref type to determine the size */
2984 iref
= (struct btrfs_extent_inline_ref
*)
2985 ((unsigned long)iter
->cur_ptr
);
2986 type
= btrfs_extent_inline_ref_type(eb
, iref
);
2988 size
= btrfs_extent_inline_ref_size(type
);
2990 iter
->cur_ptr
+= size
;
2991 if (iter
->cur_ptr
< iter
->end_ptr
)
2994 /* All inline items iterated, fall through */
2997 /* We're at keyed items, there is no inline item, go to the next one */
2998 extent_root
= btrfs_extent_root(iter
->fs_info
, iter
->bytenr
);
2999 ret
= btrfs_next_item(extent_root
, iter
->path
);
3003 btrfs_item_key_to_cpu(path
->nodes
[0], &iter
->cur_key
, path
->slots
[0]);
3004 if (iter
->cur_key
.objectid
!= iter
->bytenr
||
3005 (iter
->cur_key
.type
!= BTRFS_TREE_BLOCK_REF_KEY
&&
3006 iter
->cur_key
.type
!= BTRFS_SHARED_BLOCK_REF_KEY
))
3008 iter
->item_ptr
= (u32
)btrfs_item_ptr_offset(path
->nodes
[0],
3010 iter
->cur_ptr
= iter
->item_ptr
;
3011 iter
->end_ptr
= iter
->item_ptr
+ (u32
)btrfs_item_size(path
->nodes
[0],
void btrfs_backref_init_cache(struct btrfs_fs_info *fs_info,
			      struct btrfs_backref_cache *cache, bool is_reloc)
{
	int i;

	cache->rb_root = RB_ROOT;
	for (i = 0; i < BTRFS_MAX_LEVEL; i++)
		INIT_LIST_HEAD(&cache->pending[i]);
	INIT_LIST_HEAD(&cache->changed);
	INIT_LIST_HEAD(&cache->detached);
	INIT_LIST_HEAD(&cache->leaves);
	INIT_LIST_HEAD(&cache->pending_edge);
	INIT_LIST_HEAD(&cache->useless_node);
	cache->fs_info = fs_info;
	cache->is_reloc = is_reloc;
}
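/*
 * Rough lifecycle of this cache, as used by the functions that follow:
 * btrfs_backref_init_cache() sets up the empty lists, nodes and edges are
 * added with btrfs_backref_add_tree_node() and their upper links completed by
 * btrfs_backref_finish_upper_links(); on failure btrfs_backref_error_cleanup()
 * undoes a partial build, and btrfs_backref_release_cache() frees everything.
 */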
struct btrfs_backref_node *btrfs_backref_alloc_node(
		struct btrfs_backref_cache *cache, u64 bytenr, int level)
{
	struct btrfs_backref_node *node;

	ASSERT(level >= 0 && level < BTRFS_MAX_LEVEL);
	node = kzalloc(sizeof(*node), GFP_NOFS);
	if (!node)
		return node;

	INIT_LIST_HEAD(&node->list);
	INIT_LIST_HEAD(&node->upper);
	INIT_LIST_HEAD(&node->lower);
	RB_CLEAR_NODE(&node->rb_node);
	cache->nr_nodes++;
	node->level = level;
	node->bytenr = bytenr;

	return node;
}
void btrfs_backref_free_node(struct btrfs_backref_cache *cache,
			     struct btrfs_backref_node *node)
{
	if (node) {
		ASSERT(list_empty(&node->list));
		ASSERT(list_empty(&node->lower));
		ASSERT(node->eb == NULL);
		cache->nr_nodes--;
		btrfs_put_root(node->root);
		kfree(node);
	}
}
struct btrfs_backref_edge *btrfs_backref_alloc_edge(
		struct btrfs_backref_cache *cache)
{
	struct btrfs_backref_edge *edge;

	edge = kzalloc(sizeof(*edge), GFP_NOFS);
	if (edge)
		cache->nr_edges++;
	return edge;
}

void btrfs_backref_free_edge(struct btrfs_backref_cache *cache,
			     struct btrfs_backref_edge *edge)
{
	if (edge) {
		cache->nr_edges--;
		kfree(edge);
	}
}

void btrfs_backref_unlock_node_buffer(struct btrfs_backref_node *node)
{
	if (node->locked) {
		btrfs_tree_unlock(node->eb);
		node->locked = 0;
	}
}

void btrfs_backref_drop_node_buffer(struct btrfs_backref_node *node)
{
	if (node->eb) {
		btrfs_backref_unlock_node_buffer(node);
		free_extent_buffer(node->eb);
		node->eb = NULL;
	}
}
/*
 * Drop the backref node from cache without cleaning up its children
 * edges.
 *
 * This can only be called on node without parent edges.
 * The children edges are still kept as is.
 */
void btrfs_backref_drop_node(struct btrfs_backref_cache *tree,
			     struct btrfs_backref_node *node)
{
	ASSERT(list_empty(&node->upper));

	btrfs_backref_drop_node_buffer(node);
	list_del_init(&node->list);
	list_del_init(&node->lower);
	if (!RB_EMPTY_NODE(&node->rb_node))
		rb_erase(&node->rb_node, &tree->rb_root);
	btrfs_backref_free_node(tree, node);
}
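/*
 * btrfs_backref_drop_node() above only detaches and frees a single node that
 * has no parent edges left. btrfs_backref_cleanup_node() below is the variant
 * that first strips the node's upper edges (moving a parent that loses its
 * last child onto the cache's leaves list) and only then drops the node.
 */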
/*
 * Drop the backref node from cache, also cleaning up all its
 * upper edges and any uncached nodes in the path.
 *
 * This cleanup happens bottom up, thus the node should either
 * be the lowest node in the cache or a detached node.
 */
3131 void btrfs_backref_cleanup_node(struct btrfs_backref_cache
*cache
,
3132 struct btrfs_backref_node
*node
)
3134 struct btrfs_backref_node
*upper
;
3135 struct btrfs_backref_edge
*edge
;
3140 BUG_ON(!node
->lowest
&& !node
->detached
);
3141 while (!list_empty(&node
->upper
)) {
3142 edge
= list_entry(node
->upper
.next
, struct btrfs_backref_edge
,
3144 upper
= edge
->node
[UPPER
];
3145 list_del(&edge
->list
[LOWER
]);
3146 list_del(&edge
->list
[UPPER
]);
3147 btrfs_backref_free_edge(cache
, edge
);
3150 * Add the node to leaf node list if no other child block
3153 if (list_empty(&upper
->lower
)) {
3154 list_add_tail(&upper
->lower
, &cache
->leaves
);
	btrfs_backref_drop_node(cache, node);
}
/*
 * Release all nodes/edges from current cache
 */
3165 void btrfs_backref_release_cache(struct btrfs_backref_cache
*cache
)
3167 struct btrfs_backref_node
*node
;
3170 while (!list_empty(&cache
->detached
)) {
3171 node
= list_entry(cache
->detached
.next
,
3172 struct btrfs_backref_node
, list
);
3173 btrfs_backref_cleanup_node(cache
, node
);
3176 while (!list_empty(&cache
->leaves
)) {
3177 node
= list_entry(cache
->leaves
.next
,
3178 struct btrfs_backref_node
, lower
);
3179 btrfs_backref_cleanup_node(cache
, node
);
3182 cache
->last_trans
= 0;
3184 for (i
= 0; i
< BTRFS_MAX_LEVEL
; i
++)
3185 ASSERT(list_empty(&cache
->pending
[i
]));
3186 ASSERT(list_empty(&cache
->pending_edge
));
3187 ASSERT(list_empty(&cache
->useless_node
));
3188 ASSERT(list_empty(&cache
->changed
));
3189 ASSERT(list_empty(&cache
->detached
));
3190 ASSERT(RB_EMPTY_ROOT(&cache
->rb_root
));
3191 ASSERT(!cache
->nr_nodes
);
3192 ASSERT(!cache
->nr_edges
);
void btrfs_backref_link_edge(struct btrfs_backref_edge *edge,
			     struct btrfs_backref_node *lower,
			     struct btrfs_backref_node *upper,
			     int link_which)
{
	ASSERT(upper && lower && upper->level == lower->level + 1);
	edge->node[LOWER] = lower;
	edge->node[UPPER] = upper;
	if (link_which & LINK_LOWER)
		list_add_tail(&edge->list[LOWER], &lower->upper);
	if (link_which & LINK_UPPER)
		list_add_tail(&edge->list[UPPER], &upper->lower);
}
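/*
 * Note that edges are usually linked in two phases: while a node's backrefs
 * are being resolved (handle_direct_tree_backref() and
 * handle_indirect_tree_backref() below) only LINK_LOWER is used and the
 * edge's list[UPPER] entry is either left on the cache's pending_edge list or
 * initialized empty; btrfs_backref_finish_upper_links() later completes the
 * linkage by adding list[UPPER] to the parent's lower list.
 */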
/*
 * Handle direct tree backref
 *
 * Direct tree backref means, the backref item shows its parent bytenr
 * directly. This is for SHARED_BLOCK_REF backref (keyed or inlined).
 *
 * @ref_key:	The converted backref key.
 *		For keyed backref, it's the item key.
 *		For inlined backref, objectid is the bytenr,
 *		type is btrfs_inline_ref_type, offset is
 *		btrfs_inline_ref_offset.
 */
3220 static int handle_direct_tree_backref(struct btrfs_backref_cache
*cache
,
3221 struct btrfs_key
*ref_key
,
3222 struct btrfs_backref_node
*cur
)
3224 struct btrfs_backref_edge
*edge
;
3225 struct btrfs_backref_node
*upper
;
3226 struct rb_node
*rb_node
;
3228 ASSERT(ref_key
->type
== BTRFS_SHARED_BLOCK_REF_KEY
);
3230 /* Only reloc root uses backref pointing to itself */
3231 if (ref_key
->objectid
== ref_key
->offset
) {
3232 struct btrfs_root
*root
;
3234 cur
->is_reloc_root
= 1;
3235 /* Only reloc backref cache cares about a specific root */
3236 if (cache
->is_reloc
) {
3237 root
= find_reloc_root(cache
->fs_info
, cur
->bytenr
);
3243 * For generic purpose backref cache, reloc root node
3246 list_add(&cur
->list
, &cache
->useless_node
);
3251 edge
= btrfs_backref_alloc_edge(cache
);
3255 rb_node
= rb_simple_search(&cache
->rb_root
, ref_key
->offset
);
3257 /* Parent node not yet cached */
3258 upper
= btrfs_backref_alloc_node(cache
, ref_key
->offset
,
3261 btrfs_backref_free_edge(cache
, edge
);
3266 * Backrefs for the upper level block isn't cached, add the
3267 * block to pending list
3269 list_add_tail(&edge
->list
[UPPER
], &cache
->pending_edge
);
3271 /* Parent node already cached */
3272 upper
= rb_entry(rb_node
, struct btrfs_backref_node
, rb_node
);
3273 ASSERT(upper
->checked
);
3274 INIT_LIST_HEAD(&edge
->list
[UPPER
]);
	btrfs_backref_link_edge(edge, cur, upper, LINK_LOWER);

	return 0;
}
/*
 * Handle indirect tree backref
 *
 * Indirect tree backref means, we only know which tree the node belongs to.
 * We still need to do a tree search to find out the parents. This is for
 * TREE_BLOCK_REF backref (keyed or inlined).
 *
 * @trans:	Transaction handle.
 * @ref_key:	The same as @ref_key in handle_direct_tree_backref()
 * @tree_key:	The first key of this tree block.
 * @path:	A clean (released) path, to avoid allocating a path every time
 *		the function gets called.
 */
3293 static int handle_indirect_tree_backref(struct btrfs_trans_handle
*trans
,
3294 struct btrfs_backref_cache
*cache
,
3295 struct btrfs_path
*path
,
3296 struct btrfs_key
*ref_key
,
3297 struct btrfs_key
*tree_key
,
3298 struct btrfs_backref_node
*cur
)
3300 struct btrfs_fs_info
*fs_info
= cache
->fs_info
;
3301 struct btrfs_backref_node
*upper
;
3302 struct btrfs_backref_node
*lower
;
3303 struct btrfs_backref_edge
*edge
;
3304 struct extent_buffer
*eb
;
3305 struct btrfs_root
*root
;
3306 struct rb_node
*rb_node
;
3308 bool need_check
= true;
3311 root
= btrfs_get_fs_root(fs_info
, ref_key
->offset
, false);
3313 return PTR_ERR(root
);
3314 if (!test_bit(BTRFS_ROOT_SHAREABLE
, &root
->state
))
3317 if (btrfs_root_level(&root
->root_item
) == cur
->level
) {
3319 ASSERT(btrfs_root_bytenr(&root
->root_item
) == cur
->bytenr
);
3321 * For reloc backref cache, we may ignore reloc root. But for
3322 * general purpose backref cache, we can't rely on
3323 * btrfs_should_ignore_reloc_root() as it may conflict with
3324 * current running relocation and lead to missing root.
3326 * For general purpose backref cache, reloc root detection is
3327 * completely relying on direct backref (key->offset is parent
3328 * bytenr), thus only do such check for reloc cache.
3330 if (btrfs_should_ignore_reloc_root(root
) && cache
->is_reloc
) {
3331 btrfs_put_root(root
);
3332 list_add(&cur
->list
, &cache
->useless_node
);
3339 level
= cur
->level
+ 1;
3341 /* Search the tree to find parent blocks referring to the block */
3342 path
->search_commit_root
= 1;
3343 path
->skip_locking
= 1;
3344 path
->lowest_level
= level
;
3345 ret
= btrfs_search_slot(NULL
, root
, tree_key
, path
, 0, 0);
3346 path
->lowest_level
= 0;
3348 btrfs_put_root(root
);
3351 if (ret
> 0 && path
->slots
[level
] > 0)
3352 path
->slots
[level
]--;
3354 eb
= path
->nodes
[level
];
3355 if (btrfs_node_blockptr(eb
, path
->slots
[level
]) != cur
->bytenr
) {
3357 "couldn't find block (%llu) (level %d) in tree (%llu) with key (%llu %u %llu)",
3358 cur
->bytenr
, level
- 1, btrfs_root_id(root
),
3359 tree_key
->objectid
, tree_key
->type
, tree_key
->offset
);
3360 btrfs_put_root(root
);
3366 /* Add all nodes and edges in the path */
3367 for (; level
< BTRFS_MAX_LEVEL
; level
++) {
3368 if (!path
->nodes
[level
]) {
3369 ASSERT(btrfs_root_bytenr(&root
->root_item
) ==
3371 /* Same as previous should_ignore_reloc_root() call */
3372 if (btrfs_should_ignore_reloc_root(root
) &&
3374 btrfs_put_root(root
);
3375 list_add(&lower
->list
, &cache
->useless_node
);
3382 edge
= btrfs_backref_alloc_edge(cache
);
3384 btrfs_put_root(root
);
3389 eb
= path
->nodes
[level
];
3390 rb_node
= rb_simple_search(&cache
->rb_root
, eb
->start
);
3392 upper
= btrfs_backref_alloc_node(cache
, eb
->start
,
3395 btrfs_put_root(root
);
3396 btrfs_backref_free_edge(cache
, edge
);
3400 upper
->owner
= btrfs_header_owner(eb
);
3401 if (!test_bit(BTRFS_ROOT_SHAREABLE
, &root
->state
))
3405 * If we know the block isn't shared we can avoid
3406 * checking its backrefs.
3408 if (btrfs_block_can_be_shared(trans
, root
, eb
))
3414 * Add the block to pending list if we need to check its
3415 * backrefs, we only do this once while walking up a
3416 * tree as we will catch anything else later on.
3418 if (!upper
->checked
&& need_check
) {
3420 list_add_tail(&edge
->list
[UPPER
],
3421 &cache
->pending_edge
);
3425 INIT_LIST_HEAD(&edge
->list
[UPPER
]);
3428 upper
= rb_entry(rb_node
, struct btrfs_backref_node
,
3430 ASSERT(upper
->checked
);
3431 INIT_LIST_HEAD(&edge
->list
[UPPER
]);
3433 upper
->owner
= btrfs_header_owner(eb
);
3435 btrfs_backref_link_edge(edge
, lower
, upper
, LINK_LOWER
);
3438 btrfs_put_root(root
);
	btrfs_release_path(path);

	return ret;
}
/*
 * Add backref node @cur into @cache.
 *
 * NOTE: Even if the function returned 0, @cur is not yet cached as its upper
 *       links aren't yet bi-directional. Needs to finish such links.
 *       Use btrfs_backref_finish_upper_links() to finish such linkage.
 *
 * @trans:	Transaction handle.
 * @path:	Released path for indirect tree backref lookup
 * @iter:	Released backref iter for extent tree search
 * @node_key:	The first key of the tree block
 */
3461 int btrfs_backref_add_tree_node(struct btrfs_trans_handle
*trans
,
3462 struct btrfs_backref_cache
*cache
,
3463 struct btrfs_path
*path
,
3464 struct btrfs_backref_iter
*iter
,
3465 struct btrfs_key
*node_key
,
3466 struct btrfs_backref_node
*cur
)
3468 struct btrfs_backref_edge
*edge
;
3469 struct btrfs_backref_node
*exist
;
3472 ret
= btrfs_backref_iter_start(iter
, cur
->bytenr
);
3476 * We skip the first btrfs_tree_block_info, as we don't use the key
3477 * stored in it, but fetch it from the tree block
3479 if (btrfs_backref_has_tree_block_info(iter
)) {
3480 ret
= btrfs_backref_iter_next(iter
);
3483 /* No extra backref? This means the tree block is corrupted */
3489 WARN_ON(cur
->checked
);
3490 if (!list_empty(&cur
->upper
)) {
3492 * The backref was added previously when processing backref of
3493 * type BTRFS_TREE_BLOCK_REF_KEY
3495 ASSERT(list_is_singular(&cur
->upper
));
3496 edge
= list_entry(cur
->upper
.next
, struct btrfs_backref_edge
,
3498 ASSERT(list_empty(&edge
->list
[UPPER
]));
3499 exist
= edge
->node
[UPPER
];
3501 * Add the upper level block to pending list if we need check
3504 if (!exist
->checked
)
3505 list_add_tail(&edge
->list
[UPPER
], &cache
->pending_edge
);
3510 for (; ret
== 0; ret
= btrfs_backref_iter_next(iter
)) {
3511 struct extent_buffer
*eb
;
3512 struct btrfs_key key
;
3516 eb
= iter
->path
->nodes
[0];
3518 key
.objectid
= iter
->bytenr
;
3519 if (btrfs_backref_iter_is_inline_ref(iter
)) {
3520 struct btrfs_extent_inline_ref
*iref
;
3522 /* Update key for inline backref */
3523 iref
= (struct btrfs_extent_inline_ref
*)
3524 ((unsigned long)iter
->cur_ptr
);
3525 type
= btrfs_get_extent_inline_ref_type(eb
, iref
,
3526 BTRFS_REF_TYPE_BLOCK
);
3527 if (type
== BTRFS_REF_TYPE_INVALID
) {
3532 key
.offset
= btrfs_extent_inline_ref_offset(eb
, iref
);
3534 key
.type
= iter
->cur_key
.type
;
3535 key
.offset
= iter
->cur_key
.offset
;
3539 * Parent node found and matches current inline ref, no need to
3540 * rebuild this node for this inline ref
3543 ((key
.type
== BTRFS_TREE_BLOCK_REF_KEY
&&
3544 exist
->owner
== key
.offset
) ||
3545 (key
.type
== BTRFS_SHARED_BLOCK_REF_KEY
&&
3546 exist
->bytenr
== key
.offset
))) {
3551 /* SHARED_BLOCK_REF means key.offset is the parent bytenr */
3552 if (key
.type
== BTRFS_SHARED_BLOCK_REF_KEY
) {
3553 ret
= handle_direct_tree_backref(cache
, &key
, cur
);
3556 } else if (key
.type
== BTRFS_TREE_BLOCK_REF_KEY
) {
3558 * key.type == BTRFS_TREE_BLOCK_REF_KEY, inline ref
3559 * offset means the root objectid. We need to search
3560 * the tree to get its parent bytenr.
3562 ret
= handle_indirect_tree_backref(trans
, cache
, path
,
3563 &key
, node_key
, cur
);
3568 * Unrecognized tree backref items (if it can pass tree-checker)
	btrfs_backref_iter_release(iter);

	return ret;
}
/*
 * Finish the upwards linkage created by btrfs_backref_add_tree_node()
 */
3583 int btrfs_backref_finish_upper_links(struct btrfs_backref_cache
*cache
,
3584 struct btrfs_backref_node
*start
)
3586 struct list_head
*useless_node
= &cache
->useless_node
;
3587 struct btrfs_backref_edge
*edge
;
3588 struct rb_node
*rb_node
;
3589 LIST_HEAD(pending_edge
);
3591 ASSERT(start
->checked
);
3593 /* Insert this node to cache if it's not COW-only */
3594 if (!start
->cowonly
) {
3595 rb_node
= rb_simple_insert(&cache
->rb_root
, start
->bytenr
,
3598 btrfs_backref_panic(cache
->fs_info
, start
->bytenr
,
3600 list_add_tail(&start
->lower
, &cache
->leaves
);
3604 * Use breadth first search to iterate all related edges.
3606 * The starting points are all the edges of this node
3608 list_for_each_entry(edge
, &start
->upper
, list
[LOWER
])
3609 list_add_tail(&edge
->list
[UPPER
], &pending_edge
);
3611 while (!list_empty(&pending_edge
)) {
3612 struct btrfs_backref_node
*upper
;
3613 struct btrfs_backref_node
*lower
;
3615 edge
= list_first_entry(&pending_edge
,
3616 struct btrfs_backref_edge
, list
[UPPER
]);
3617 list_del_init(&edge
->list
[UPPER
]);
3618 upper
= edge
->node
[UPPER
];
3619 lower
= edge
->node
[LOWER
];
3621 /* Parent is detached, no need to keep any edges */
3622 if (upper
->detached
) {
3623 list_del(&edge
->list
[LOWER
]);
3624 btrfs_backref_free_edge(cache
, edge
);
3626 /* Lower node is orphan, queue for cleanup */
3627 if (list_empty(&lower
->upper
))
3628 list_add(&lower
->list
, useless_node
);
3633 * All new nodes added in current build_backref_tree() haven't
3634 * been linked to the cache rb tree.
3635 * So if we have upper->rb_node populated, this means a cache
3636 * hit. We only need to link the edge, as @upper and all its
3637 * parents have already been linked.
3639 if (!RB_EMPTY_NODE(&upper
->rb_node
)) {
3640 if (upper
->lowest
) {
3641 list_del_init(&upper
->lower
);
3645 list_add_tail(&edge
->list
[UPPER
], &upper
->lower
);
3649 /* Sanity check, we shouldn't have any unchecked nodes */
3650 if (!upper
->checked
) {
3655 /* Sanity check, COW-only node has non-COW-only parent */
3656 if (start
->cowonly
!= upper
->cowonly
) {
3661 /* Only cache non-COW-only (subvolume trees) tree blocks */
3662 if (!upper
->cowonly
) {
3663 rb_node
= rb_simple_insert(&cache
->rb_root
, upper
->bytenr
,
3666 btrfs_backref_panic(cache
->fs_info
,
3667 upper
->bytenr
, -EEXIST
);
3672 list_add_tail(&edge
->list
[UPPER
], &upper
->lower
);
3675 * Also queue all the parent edges of this uncached node
3676 * to finish the upper linkage
3678 list_for_each_entry(edge
, &upper
->upper
, list
[LOWER
])
3679 list_add_tail(&edge
->list
[UPPER
], &pending_edge
);
3684 void btrfs_backref_error_cleanup(struct btrfs_backref_cache
*cache
,
3685 struct btrfs_backref_node
*node
)
3687 struct btrfs_backref_node
*lower
;
3688 struct btrfs_backref_node
*upper
;
3689 struct btrfs_backref_edge
*edge
;
3691 while (!list_empty(&cache
->useless_node
)) {
3692 lower
= list_first_entry(&cache
->useless_node
,
3693 struct btrfs_backref_node
, list
);
3694 list_del_init(&lower
->list
);
3696 while (!list_empty(&cache
->pending_edge
)) {
3697 edge
= list_first_entry(&cache
->pending_edge
,
3698 struct btrfs_backref_edge
, list
[UPPER
]);
3699 list_del(&edge
->list
[UPPER
]);
3700 list_del(&edge
->list
[LOWER
]);
3701 lower
= edge
->node
[LOWER
];
3702 upper
= edge
->node
[UPPER
];
3703 btrfs_backref_free_edge(cache
, edge
);
3706 * Lower is no longer linked to any upper backref nodes and
3707 * isn't in the cache, we can free it ourselves.
3709 if (list_empty(&lower
->upper
) &&
3710 RB_EMPTY_NODE(&lower
->rb_node
))
3711 list_add(&lower
->list
, &cache
->useless_node
);
3713 if (!RB_EMPTY_NODE(&upper
->rb_node
))
3716 /* Add this guy's upper edges to the list to process */
3717 list_for_each_entry(edge
, &upper
->upper
, list
[LOWER
])
3718 list_add_tail(&edge
->list
[UPPER
],
3719 &cache
->pending_edge
);
3720 if (list_empty(&upper
->upper
))
3721 list_add(&upper
->list
, &cache
->useless_node
);
3724 while (!list_empty(&cache
->useless_node
)) {
3725 lower
= list_first_entry(&cache
->useless_node
,
3726 struct btrfs_backref_node
, list
);
3727 list_del_init(&lower
->list
);
3730 btrfs_backref_drop_node(cache
, lower
);
	btrfs_backref_cleanup_node(cache, node);
	ASSERT(list_empty(&cache->useless_node) &&
	       list_empty(&cache->pending_edge));
}