// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/sort.h>
#include <linux/rcupdate.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/percpu_counter.h>
#include <linux/lockdep.h>
#include <linux/crc32c.h>
#include "extent-tree.h"
#include "transaction.h"
#include "print-tree.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
#include "ref-verify.h"
#include "space-info.h"
#include "block-rsv.h"
#include "dev-replace.h"
#include "accessors.h"
#include "root-tree.h"
#include "file-item.h"
#include "tree-checker.h"
#include "raid-stripe-tree.h"

#undef SCRAMBLE_DELAYED_REFS

static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
			       struct btrfs_delayed_ref_head *href,
			       struct btrfs_delayed_ref_node *node,
			       struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
				    struct extent_buffer *leaf,
				    struct btrfs_extent_item *ei);
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				      u64 parent, u64 root_objectid,
				      u64 flags, u64 owner, u64 offset,
				      struct btrfs_key *ins, int ref_mod, u64 oref_root);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
				     struct btrfs_delayed_ref_node *node,
				     struct btrfs_delayed_extent_op *extent_op);
static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key);

static int block_group_bits(struct btrfs_block_group *cache, u64 bits)
{
	return (cache->flags & bits) == bits;
}

/* simple helper to search for an existing data extent at a given offset */
int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
{
	struct btrfs_root *root = btrfs_extent_root(fs_info, start);
	struct btrfs_key key;
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = start;
	key.offset = len;
	key.type = BTRFS_EXTENT_ITEM_KEY;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	btrfs_free_path(path);
	return ret;
}

/*
 * Helper function to lookup the reference count and flags of a tree block.
 *
 * The head node for a delayed ref is used to store the sum of all the
 * reference count modifications queued up in the rbtree. The head node may
 * also store the extent flags to set. This way you can check what the
 * reference count and extent flags would be if all of the delayed refs were
 * processed.
 */
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
			     struct btrfs_fs_info *fs_info, u64 bytenr,
			     u64 offset, int metadata, u64 *refs, u64 *flags,
			     u64 *owning_root)
{
	struct btrfs_root *extent_root;
	struct btrfs_delayed_ref_head *head;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_path *path;
	struct btrfs_key key;
	u64 num_refs;
	u64 extent_flags;
	u64 owner = 0;
	int ret;

	/*
	 * If we don't have skinny metadata, don't bother doing anything
	 * different.
	 */
	if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
		offset = fs_info->nodesize;
		metadata = 0;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

search_again:
	key.objectid = bytenr;
	key.offset = offset;
	if (metadata)
		key.type = BTRFS_METADATA_ITEM_KEY;
	else
		key.type = BTRFS_EXTENT_ITEM_KEY;

	extent_root = btrfs_extent_root(fs_info, bytenr);
	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto out_free;

	if (ret > 0 && key.type == BTRFS_METADATA_ITEM_KEY) {
		if (path->slots[0]) {
			path->slots[0]--;
			btrfs_item_key_to_cpu(path->nodes[0], &key,
					      path->slots[0]);
			if (key.objectid == bytenr &&
			    key.type == BTRFS_EXTENT_ITEM_KEY &&
			    key.offset == fs_info->nodesize)
				ret = 0;
		}
	}

	if (ret == 0) {
		struct extent_buffer *leaf = path->nodes[0];
		struct btrfs_extent_item *ei;
		const u32 item_size = btrfs_item_size(leaf, path->slots[0]);

		if (unlikely(item_size < sizeof(*ei))) {
			ret = -EUCLEAN;
			btrfs_err(fs_info,
			"unexpected extent item size, has %u expect >= %zu",
				  item_size, sizeof(*ei));
			btrfs_abort_transaction(trans, ret);
			goto out_free;
		}

		ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
		num_refs = btrfs_extent_refs(leaf, ei);
		if (unlikely(num_refs == 0)) {
			ret = -EUCLEAN;
			btrfs_err(fs_info,
		"unexpected zero reference count for extent item (%llu %u %llu)",
				  key.objectid, key.type, key.offset);
			btrfs_abort_transaction(trans, ret);
			goto out_free;
		}
		extent_flags = btrfs_extent_flags(leaf, ei);
		owner = btrfs_get_extent_owner_root(fs_info, leaf, path->slots[0]);
	} else {
		num_refs = 0;
		extent_flags = 0;
		ret = 0;
	}

	if (!trans)
		goto out;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);
	head = btrfs_find_delayed_ref_head(fs_info, delayed_refs, bytenr);
	if (head) {
		if (!mutex_trylock(&head->mutex)) {
			refcount_inc(&head->refs);
			spin_unlock(&delayed_refs->lock);

			btrfs_release_path(path);

			/*
			 * Mutex was contended, block until it's released and try
			 * again.
			 */
			mutex_lock(&head->mutex);
			mutex_unlock(&head->mutex);
			btrfs_put_delayed_ref_head(head);
			goto search_again;
		}
		spin_lock(&head->lock);
		if (head->extent_op && head->extent_op->update_flags)
			extent_flags |= head->extent_op->flags_to_set;

		num_refs += head->ref_mod;
		spin_unlock(&head->lock);
		mutex_unlock(&head->mutex);
	}
	spin_unlock(&delayed_refs->lock);
out:
	WARN_ON(num_refs == 0);
	if (refs)
		*refs = num_refs;
	if (flags)
		*flags = extent_flags;
	if (owning_root)
		*owning_root = owner;
out_free:
	btrfs_free_path(path);
	return ret;
}

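/*
 * Illustrative sketch (not part of the original file): a typical caller of
 * btrfs_lookup_extent_info() queries the current reference count and flags
 * of a tree block, taking any queued delayed ref modifications into account.
 * The local variable names below are hypothetical.
 *
 *	u64 refs = 0;
 *	u64 flags = 0;
 *	int ret;
 *
 *	ret = btrfs_lookup_extent_info(trans, fs_info, eb->start,
 *				       btrfs_header_level(eb), 1,
 *				       &refs, &flags, NULL);
 *	if (ret == 0 && refs > 1) {
 *		// the block is shared, so full back refs may be required
 *	}
 */
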
/*
 * Back reference rules.  Back refs have three main goals:
 *
 * 1) differentiate between all holders of references to an extent so that
 *    when a reference is dropped we can make sure it was a valid reference
 *    before freeing the extent.
 *
 * 2) Provide enough information to quickly find the holders of an extent
 *    if we notice a given block is corrupted or bad.
 *
 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 *    maintenance.  This is actually the same as #2, but with a slightly
 *    different use case.
 *
 * There are two kinds of back refs. The implicit back ref is optimized
 * for pointers in non-shared tree blocks. For a given pointer in a block,
 * back refs of this kind provide information about the block's owner tree
 * and the pointer's key. This information allows us to find the block by
 * b-tree searching. The full back ref is for pointers in tree blocks not
 * referenced by their owner trees. The location of the tree block is
 * recorded in the back ref. The full back ref is actually generic, and can
 * be used in all cases the implicit back ref is used. The major shortcoming
 * of the full back ref is its overhead: every time a tree block gets COWed,
 * we have to update the back ref entries for all pointers in it.
 *
 * For a newly allocated tree block, we use implicit back refs for the
 * pointers in it. This means most tree related operations only involve
 * implicit back refs. For a tree block created in an old transaction, the
 * only way to drop a reference to it is to COW it. So we can detect the
 * event that a tree block loses its owner tree's reference and do the
 * back ref conversion.
 *
 * When a tree block is COWed through a tree, there are four cases:
 *
 * The reference count of the block is one and the tree is the block's
 * owner tree. Nothing to do in this case.
 *
 * The reference count of the block is one and the tree is not the
 * block's owner tree. In this case, full back refs are used for pointers
 * in the block. Remove these full back refs, add implicit back refs for
 * every pointer in the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * the block's owner tree. In this case, implicit back refs are used for
 * pointers in the block. Add full back refs for every pointer in the
 * block, increase lower level extents' reference counts. The original
 * implicit back refs are entailed to the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * not the block's owner tree. Add implicit back refs for every pointer in
 * the new block, increase lower level extents' reference count.
 *
 * Back Reference Key composing:
 *
 * The key objectid corresponds to the first byte in the extent,
 * the key type is used to differentiate between types of back refs.
 * There are different meanings of the key offset for different types
 * of back refs.
 *
 * File extents can be referenced by:
 *
 * - multiple snapshots, subvolumes, or different generations in one subvol
 * - different files inside a single subvolume
 * - different offsets inside a file (bookend extents in file.c)
 *
 * The extent ref structure for the implicit back refs has fields for:
 *
 * - objectid of the subvolume root
 * - objectid of the file holding the reference
 * - original offset in the file
 * - how many bookend extents
 *
 * The key offset for the implicit back refs is a hash of the first
 * three fields.
 *
 * The extent ref structure for the full back refs has a field for:
 *
 * - number of pointers in the tree leaf
 *
 * The key offset for the full back refs is the first byte of
 * the tree leaf.
 *
 * When a file extent is allocated, the implicit back refs are used.
 * The fields are filled in:
 *
 *     (root_key.objectid, inode objectid, offset in file, 1)
 *
 * When a file extent is removed by file truncation, we find the
 * corresponding implicit back refs and check the following fields:
 *
 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 *
 * Btree extents can be referenced by:
 *
 * - Different subvolumes
 *
 * Both the implicit back refs and the full back refs for tree blocks
 * only consist of a key. The key offset for the implicit back refs is the
 * objectid of the block's owner tree. The key offset for the full back refs
 * is the first byte of the parent block.
 *
 * When implicit back refs are used, information about the lowest key and
 * level of the tree block is required. This information is stored in the
 * tree block info structure.
 */

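/*
 * Illustrative sketch (not part of the original file): how the back ref keys
 * described above are composed for a data extent. The variable names and
 * values are made up for the example.
 *
 *	struct btrfs_key key;
 *
 *	// implicit (indirect) data back ref: the key offset is a hash of
 *	// (root objectid, inode objectid, file offset)
 *	key.objectid = extent_bytenr;
 *	key.type = BTRFS_EXTENT_DATA_REF_KEY;
 *	key.offset = hash_extent_data_ref(root_objectid, inode_objectid,
 *					  file_offset);
 *
 *	// full (shared) data back ref: the key offset is the bytenr of the
 *	// leaf that holds the file extent item
 *	key.objectid = extent_bytenr;
 *	key.type = BTRFS_SHARED_DATA_REF_KEY;
 *	key.offset = parent_leaf_bytenr;
 */
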
/*
 * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
 * is_data == BTRFS_REF_TYPE_DATA, data type is required,
 * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
 */
int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
				     struct btrfs_extent_inline_ref *iref,
				     enum btrfs_inline_ref_type is_data)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;
	int type = btrfs_extent_inline_ref_type(eb, iref);
	u64 offset = btrfs_extent_inline_ref_offset(eb, iref);

	if (type == BTRFS_EXTENT_OWNER_REF_KEY) {
		ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
		return type;
	}

	if (type == BTRFS_TREE_BLOCK_REF_KEY ||
	    type == BTRFS_SHARED_BLOCK_REF_KEY ||
	    type == BTRFS_SHARED_DATA_REF_KEY ||
	    type == BTRFS_EXTENT_DATA_REF_KEY) {
		if (is_data == BTRFS_REF_TYPE_BLOCK) {
			if (type == BTRFS_TREE_BLOCK_REF_KEY)
				return type;
			if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
				/*
				 * Every shared one has parent tree block,
				 * which must be aligned to sector size.
				 */
				if (offset && IS_ALIGNED(offset, fs_info->sectorsize))
					return type;
			}
		} else if (is_data == BTRFS_REF_TYPE_DATA) {
			if (type == BTRFS_EXTENT_DATA_REF_KEY)
				return type;
			if (type == BTRFS_SHARED_DATA_REF_KEY) {
				/*
				 * Every shared one has parent tree block,
				 * which must be aligned to sector size.
				 */
				if (offset &&
				    IS_ALIGNED(offset, fs_info->sectorsize))
					return type;
			}
		} else {
			ASSERT(is_data == BTRFS_REF_TYPE_ANY);
			return type;
		}
	}

	WARN_ON(1);
	btrfs_print_leaf(eb);
	btrfs_err(fs_info,
		  "eb %llu iref 0x%lx invalid extent inline ref type %d",
		  eb->start, (unsigned long)iref, type);

	return BTRFS_REF_TYPE_INVALID;
}

u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
{
	u32 high_crc = ~(u32)0;
	u32 low_crc = ~(u32)0;
	__le64 lenum;

	lenum = cpu_to_le64(root_objectid);
	high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
	lenum = cpu_to_le64(owner);
	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
	lenum = cpu_to_le64(offset);
	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));

	return ((u64)high_crc << 31) ^ (u64)low_crc;
}

static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
				     struct btrfs_extent_data_ref *ref)
{
	return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
				    btrfs_extent_data_ref_objectid(leaf, ref),
				    btrfs_extent_data_ref_offset(leaf, ref));
}

static int match_extent_data_ref(struct extent_buffer *leaf,
				 struct btrfs_extent_data_ref *ref,
				 u64 root_objectid, u64 owner, u64 offset)
{
	if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
	    btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
		return 0;
	return 1;
}

static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_path *path,
					   u64 bytenr, u64 parent,
					   u64 root_objectid,
					   u64 owner, u64 offset)
{
	struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
	struct btrfs_key key;
	struct btrfs_extent_data_ref *ref;
	struct extent_buffer *leaf;
	u32 nritems;
	int recow;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_DATA_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_EXTENT_DATA_REF_KEY;
		key.offset = hash_extent_data_ref(root_objectid,
						  owner, offset);
	}
again:
	recow = 0;
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0)
		return ret;

	if (parent) {
		if (ret)
			return -ENOENT;
		return 0;
	}

	ret = -ENOENT;
	leaf = path->nodes[0];
	nritems = btrfs_header_nritems(leaf);
	while (1) {
		if (path->slots[0] >= nritems) {
			ret = btrfs_next_leaf(root, path);
			if (ret) {
				if (ret > 0)
					return -ENOENT;
				return ret;
			}

			leaf = path->nodes[0];
			nritems = btrfs_header_nritems(leaf);
			recow = 1;
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != bytenr ||
		    key.type != BTRFS_EXTENT_DATA_REF_KEY)
			goto fail;

		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_data_ref);

		if (match_extent_data_ref(leaf, ref, root_objectid,
					  owner, offset)) {
			if (recow) {
				btrfs_release_path(path);
				goto again;
			}
			ret = 0;
			break;
		}
		path->slots[0]++;
	}
fail:
	return ret;
}

static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_path *path,
					   struct btrfs_delayed_ref_node *node,
					   u64 bytenr)
{
	struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
	struct btrfs_key key;
	struct extent_buffer *leaf;
	u64 owner = btrfs_delayed_ref_owner(node);
	u64 offset = btrfs_delayed_ref_offset(node);
	u32 size;
	u32 num_refs;
	int ret;

	key.objectid = bytenr;
	if (node->parent) {
		key.type = BTRFS_SHARED_DATA_REF_KEY;
		key.offset = node->parent;
		size = sizeof(struct btrfs_shared_data_ref);
	} else {
		key.type = BTRFS_EXTENT_DATA_REF_KEY;
		key.offset = hash_extent_data_ref(node->ref_root, owner, offset);
		size = sizeof(struct btrfs_extent_data_ref);
	}

	ret = btrfs_insert_empty_item(trans, root, path, &key, size);
	if (ret && ret != -EEXIST)
		goto fail;

	leaf = path->nodes[0];
	if (node->parent) {
		struct btrfs_shared_data_ref *ref;

		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_shared_data_ref);
		if (ret == 0) {
			btrfs_set_shared_data_ref_count(leaf, ref, node->ref_mod);
		} else {
			num_refs = btrfs_shared_data_ref_count(leaf, ref);
			num_refs += node->ref_mod;
			btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
		}
	} else {
		struct btrfs_extent_data_ref *ref;

		while (ret == -EEXIST) {
			ref = btrfs_item_ptr(leaf, path->slots[0],
					     struct btrfs_extent_data_ref);
			if (match_extent_data_ref(leaf, ref, node->ref_root,
						  owner, offset))
				break;
			btrfs_release_path(path);
			key.offset++;
			ret = btrfs_insert_empty_item(trans, root, path, &key,
						      size);
			if (ret && ret != -EEXIST)
				goto fail;

			leaf = path->nodes[0];
		}
		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_data_ref);
		if (ret == 0) {
			btrfs_set_extent_data_ref_root(leaf, ref, node->ref_root);
			btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
			btrfs_set_extent_data_ref_offset(leaf, ref, offset);
			btrfs_set_extent_data_ref_count(leaf, ref, node->ref_mod);
		} else {
			num_refs = btrfs_extent_data_ref_count(leaf, ref);
			num_refs += node->ref_mod;
			btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
		}
	}
	btrfs_mark_buffer_dirty(trans, leaf);
	ret = 0;
fail:
	btrfs_release_path(path);
	return ret;
}

static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct btrfs_path *path,
					   int refs_to_drop)
{
	struct btrfs_key key;
	struct btrfs_extent_data_ref *ref1 = NULL;
	struct btrfs_shared_data_ref *ref2 = NULL;
	struct extent_buffer *leaf;
	u32 num_refs = 0;
	int ret = 0;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
		ref1 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_data_ref);
		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
		ref2 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_shared_data_ref);
		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
	} else {
		btrfs_err(trans->fs_info,
			  "unrecognized backref key (%llu %u %llu)",
			  key.objectid, key.type, key.offset);
		btrfs_abort_transaction(trans, -EUCLEAN);
		return -EUCLEAN;
	}

	BUG_ON(num_refs < refs_to_drop);
	num_refs -= refs_to_drop;

	if (num_refs == 0) {
		ret = btrfs_del_item(trans, root, path);
	} else {
		if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
			btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
		else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
			btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
		btrfs_mark_buffer_dirty(trans, leaf);
	}
	return ret;
}

static noinline u32 extent_data_ref_count(struct btrfs_path *path,
					  struct btrfs_extent_inline_ref *iref)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_extent_data_ref *ref1;
	struct btrfs_shared_data_ref *ref2;
	u32 num_refs = 0;
	int type;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	if (iref) {
		/*
		 * If type is invalid, we should have bailed out earlier than
		 * this call.
		 */
		type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
		ASSERT(type != BTRFS_REF_TYPE_INVALID);
		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
			ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
			num_refs = btrfs_extent_data_ref_count(leaf, ref1);
		} else {
			ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
			num_refs = btrfs_shared_data_ref_count(leaf, ref2);
		}
	} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
		ref1 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_data_ref);
		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
		ref2 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_shared_data_ref);
		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
	} else {
		WARN_ON(1);
	}
	return num_refs;
}

static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
					  struct btrfs_path *path,
					  u64 bytenr, u64 parent,
					  u64 root_objectid)
{
	struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
	struct btrfs_key key;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_TREE_BLOCK_REF_KEY;
		key.offset = root_objectid;
	}

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0)
		ret = -ENOENT;
	return ret;
}

static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
					  struct btrfs_path *path,
					  struct btrfs_delayed_ref_node *node,
					  u64 bytenr)
{
	struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
	struct btrfs_key key;
	int ret;

	key.objectid = bytenr;
	if (node->parent) {
		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
		key.offset = node->parent;
	} else {
		key.type = BTRFS_TREE_BLOCK_REF_KEY;
		key.offset = node->ref_root;
	}

	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
	btrfs_release_path(path);
	return ret;
}

static inline int extent_ref_type(u64 parent, u64 owner)
{
	int type;

	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		if (parent > 0)
			type = BTRFS_SHARED_BLOCK_REF_KEY;
		else
			type = BTRFS_TREE_BLOCK_REF_KEY;
	} else {
		if (parent > 0)
			type = BTRFS_SHARED_DATA_REF_KEY;
		else
			type = BTRFS_EXTENT_DATA_REF_KEY;
	}
	return type;
}

static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key)
{
	for (; level < BTRFS_MAX_LEVEL; level++) {
		if (!path->nodes[level])
			break;
		if (path->slots[level] + 1 >=
		    btrfs_header_nritems(path->nodes[level]))
			continue;
		if (level == 0)
			btrfs_item_key_to_cpu(path->nodes[level], key,
					      path->slots[level] + 1);
		else
			btrfs_node_key_to_cpu(path->nodes[level], key,
					      path->slots[level] + 1);
		return 0;
	}
	return 1;
}

/*
 * Look for an inline back ref. If the back ref is found, *ref_ret is set
 * to the address of the inline back ref, and 0 is returned.
 *
 * If the back ref isn't found, *ref_ret is set to the address where it
 * should be inserted, and -ENOENT is returned.
 *
 * If insert is true and there are too many inline back refs, the path
 * points to the extent item, and -EAGAIN is returned.
 *
 * NOTE: inline back refs are ordered in the same way that back ref
 *	 items in the tree are ordered.
 */
static noinline_for_stack
int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref **ref_ret,
				 u64 bytenr, u64 num_bytes,
				 u64 parent, u64 root_objectid,
				 u64 owner, u64 offset, int insert)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *root = btrfs_extent_root(fs_info, bytenr);
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	struct btrfs_extent_inline_ref *iref;
	u64 flags;
	u64 item_size;
	unsigned long ptr;
	unsigned long end;
	int extra_size;
	int type;
	int want;
	int ret;
	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
	int needed;

	key.objectid = bytenr;
	key.type = BTRFS_EXTENT_ITEM_KEY;
	key.offset = num_bytes;

	want = extent_ref_type(parent, owner);
	if (insert) {
		extra_size = btrfs_extent_inline_ref_size(want);
		path->search_for_extension = 1;
	} else {
		extra_size = -1;
	}

	/*
	 * Owner is our level, so we can just add one to get the level for the
	 * block we are interested in.
	 */
	if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
		key.type = BTRFS_METADATA_ITEM_KEY;
		key.offset = owner;
	}

again:
	ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
	if (ret < 0)
		goto out;

	/*
	 * We may be a newly converted file system which still has the old fat
	 * extent entries for metadata, so try and see if we have one of those.
	 */
	if (ret > 0 && skinny_metadata) {
		skinny_metadata = false;
		if (path->slots[0]) {
			path->slots[0]--;
			btrfs_item_key_to_cpu(path->nodes[0], &key,
					      path->slots[0]);
			if (key.objectid == bytenr &&
			    key.type == BTRFS_EXTENT_ITEM_KEY &&
			    key.offset == num_bytes)
				ret = 0;
		}
		if (ret) {
			key.objectid = bytenr;
			key.type = BTRFS_EXTENT_ITEM_KEY;
			key.offset = num_bytes;
			btrfs_release_path(path);
			goto again;
		}
	}

	if (ret && !insert) {
		ret = -ENOENT;
		goto out;
	} else if (WARN_ON(ret)) {
		btrfs_print_leaf(path->nodes[0]);
		btrfs_err(fs_info,
"extent item not found for insert, bytenr %llu num_bytes %llu parent %llu root_objectid %llu owner %llu offset %llu",
			  bytenr, num_bytes, parent, root_objectid, owner,
			  offset);
		ret = -EUCLEAN;
		goto out;
	}

	leaf = path->nodes[0];
	item_size = btrfs_item_size(leaf, path->slots[0]);
	if (unlikely(item_size < sizeof(*ei))) {
		ret = -EUCLEAN;
		btrfs_err(fs_info,
			  "unexpected extent item size, has %llu expect >= %zu",
			  item_size, sizeof(*ei));
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	flags = btrfs_extent_flags(leaf, ei);

	ptr = (unsigned long)(ei + 1);
	end = (unsigned long)ei + item_size;

	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
		ptr += sizeof(struct btrfs_tree_block_info);
		BUG_ON(ptr > end);
	}

	if (owner >= BTRFS_FIRST_FREE_OBJECTID)
		needed = BTRFS_REF_TYPE_DATA;
	else
		needed = BTRFS_REF_TYPE_BLOCK;

	ret = -ENOENT;
	while (ptr < end) {
		iref = (struct btrfs_extent_inline_ref *)ptr;
		type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
		if (type == BTRFS_EXTENT_OWNER_REF_KEY) {
			ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
			ptr += btrfs_extent_inline_ref_size(type);
			continue;
		}
		if (type == BTRFS_REF_TYPE_INVALID) {
			ret = -EUCLEAN;
			goto out;
		}

		if (want < type)
			break;
		if (want > type) {
			ptr += btrfs_extent_inline_ref_size(type);
			continue;
		}

		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
			struct btrfs_extent_data_ref *dref;

			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
			if (match_extent_data_ref(leaf, dref, root_objectid,
						  owner, offset)) {
				ret = 0;
				break;
			}
			if (hash_extent_data_ref_item(leaf, dref) <
			    hash_extent_data_ref(root_objectid, owner, offset))
				break;
		} else {
			u64 ref_offset;

			ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
			if (parent > 0) {
				if (parent == ref_offset) {
					ret = 0;
					break;
				}
				if (ref_offset < parent)
					break;
			} else {
				if (root_objectid == ref_offset) {
					ret = 0;
					break;
				}
				if (ref_offset < root_objectid)
					break;
			}
		}
		ptr += btrfs_extent_inline_ref_size(type);
	}

	if (unlikely(ptr > end)) {
		ret = -EUCLEAN;
		btrfs_print_leaf(path->nodes[0]);
		btrfs_crit(fs_info,
"overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu",
			   path->slots[0], root_objectid, owner, offset, parent);
		goto out;
	}

	if (ret == -ENOENT && insert) {
		if (item_size + extra_size >=
		    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
			ret = -EAGAIN;
			goto out;
		}

		if (path->slots[0] + 1 < btrfs_header_nritems(path->nodes[0])) {
			struct btrfs_key tmp_key;

			btrfs_item_key_to_cpu(path->nodes[0], &tmp_key, path->slots[0] + 1);
			if (tmp_key.objectid == bytenr &&
			    tmp_key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
				ret = -EAGAIN;
				goto out;
			}
			goto out_no_entry;
		}

		if (!path->keep_locks) {
			btrfs_release_path(path);
			path->keep_locks = 1;
			goto again;
		}

		/*
		 * To add new inline back ref, we have to make sure
		 * there is no corresponding back ref item.
		 * For simplicity, we just do not add new inline back
		 * ref if there is any kind of item for this block.
		 */
		if (find_next_key(path, 0, &key) == 0 &&
		    key.objectid == bytenr &&
		    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
			ret = -EAGAIN;
			goto out;
		}
	}
out_no_entry:
	*ref_ret = (struct btrfs_extent_inline_ref *)ptr;
out:
	if (path->keep_locks) {
		path->keep_locks = 0;
		btrfs_unlock_up_safe(path, 1);
	}
	if (insert)
		path->search_for_extension = 0;
	return ret;
}

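/*
 * Illustrative sketch (not part of the original file): how callers such as
 * insert_inline_extent_backref() below typically react to the three return
 * values documented above lookup_inline_extent_backref(). This is a
 * simplified outline, not the exact upstream control flow.
 *
 *	ret = lookup_inline_extent_backref(trans, path, &iref, bytenr,
 *					   num_bytes, parent, root_objectid,
 *					   owner, offset, 1);
 *	if (ret == 0) {
 *		// found: update the existing inline ref in place
 *	} else if (ret == -ENOENT) {
 *		// not found: *iref points at the position to insert a new one
 *	} else if (ret == -EAGAIN) {
 *		// no room inline: fall back to a separate keyed backref item
 *	}
 */
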
/*
 * helper to add a new inline back ref
 */
static noinline_for_stack
void setup_inline_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref *iref,
				 u64 parent, u64 root_objectid,
				 u64 owner, u64 offset, int refs_to_add,
				 struct btrfs_delayed_extent_op *extent_op)
{
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	unsigned long ptr;
	unsigned long end;
	unsigned long item_offset;
	u64 refs;
	int size;
	int type;

	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	item_offset = (unsigned long)iref - (unsigned long)ei;

	type = extent_ref_type(parent, owner);
	size = btrfs_extent_inline_ref_size(type);

	btrfs_extend_item(trans, path, size);

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, ei);
	refs += refs_to_add;
	btrfs_set_extent_refs(leaf, ei, refs);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, ei);

	ptr = (unsigned long)ei + item_offset;
	end = (unsigned long)ei + btrfs_item_size(leaf, path->slots[0]);
	if (ptr < end - size)
		memmove_extent_buffer(leaf, ptr + size, ptr,
				      end - size - ptr);

	iref = (struct btrfs_extent_inline_ref *)ptr;
	btrfs_set_extent_inline_ref_type(leaf, iref, type);
	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
		struct btrfs_extent_data_ref *dref;

		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
		btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
		btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
		btrfs_set_extent_data_ref_offset(leaf, dref, offset);
		btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
		struct btrfs_shared_data_ref *sref;

		sref = (struct btrfs_shared_data_ref *)(iref + 1);
		btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
	} else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
	} else {
		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
	}
	btrfs_mark_buffer_dirty(trans, leaf);
}

static int lookup_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref **ref_ret,
				 u64 bytenr, u64 num_bytes, u64 parent,
				 u64 root_objectid, u64 owner, u64 offset)
{
	int ret;

	ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr,
					   num_bytes, parent, root_objectid,
					   owner, offset, 0);
	if (ret != -ENOENT)
		return ret;

	btrfs_release_path(path);
	*ref_ret = NULL;

	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		ret = lookup_tree_block_ref(trans, path, bytenr, parent,
					    root_objectid);
	} else {
		ret = lookup_extent_data_ref(trans, path, bytenr, parent,
					     root_objectid, owner, offset);
	}
	return ret;
}

/*
 * helper to update/remove an inline back ref
 */
static noinline_for_stack int update_inline_extent_backref(
				  struct btrfs_trans_handle *trans,
				  struct btrfs_path *path,
				  struct btrfs_extent_inline_ref *iref,
				  int refs_to_mod,
				  struct btrfs_delayed_extent_op *extent_op)
{
	struct extent_buffer *leaf = path->nodes[0];
	struct btrfs_fs_info *fs_info = leaf->fs_info;
	struct btrfs_extent_item *ei;
	struct btrfs_extent_data_ref *dref = NULL;
	struct btrfs_shared_data_ref *sref = NULL;
	unsigned long ptr;
	unsigned long end;
	u32 item_size;
	int size;
	int type;
	u64 refs;

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, ei);
	if (unlikely(refs_to_mod < 0 && refs + refs_to_mod <= 0)) {
		struct btrfs_key key;
		u32 extent_size;

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.type == BTRFS_METADATA_ITEM_KEY)
			extent_size = fs_info->nodesize;
		else
			extent_size = key.offset;
		btrfs_print_leaf(leaf);
		btrfs_err(fs_info,
	"invalid refs_to_mod for extent %llu num_bytes %u, has %d expect >= -%llu",
			  key.objectid, extent_size, refs_to_mod, refs);
		return -EUCLEAN;
	}
	refs += refs_to_mod;
	btrfs_set_extent_refs(leaf, ei, refs);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, ei);

	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
	/*
	 * Function btrfs_get_extent_inline_ref_type() has already printed
	 * error messages.
	 */
	if (unlikely(type == BTRFS_REF_TYPE_INVALID))
		return -EUCLEAN;

	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
		refs = btrfs_extent_data_ref_count(leaf, dref);
	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
		sref = (struct btrfs_shared_data_ref *)(iref + 1);
		refs = btrfs_shared_data_ref_count(leaf, sref);
	} else {
		refs = 1;
		/*
		 * For tree blocks we can only drop one ref for it, and tree
		 * blocks should not have refs > 1.
		 *
		 * Furthermore if we're inserting a new inline backref, we
		 * won't reach this path either. That would be
		 * setup_inline_extent_backref().
		 */
		if (unlikely(refs_to_mod != -1)) {
			struct btrfs_key key;

			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

			btrfs_print_leaf(leaf);
			btrfs_err(fs_info,
			"invalid refs_to_mod for tree block %llu, has %d expect -1",
				  key.objectid, refs_to_mod);
			return -EUCLEAN;
		}
	}

	if (unlikely(refs_to_mod < 0 && refs < -refs_to_mod)) {
		struct btrfs_key key;
		u32 extent_size;

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.type == BTRFS_METADATA_ITEM_KEY)
			extent_size = fs_info->nodesize;
		else
			extent_size = key.offset;
		btrfs_print_leaf(leaf);
		btrfs_err(fs_info,
"invalid refs_to_mod for backref entry, iref %lu extent %llu num_bytes %u, has %d expect >= -%llu",
			  (unsigned long)iref, key.objectid, extent_size,
			  refs_to_mod, refs);
		return -EUCLEAN;
	}
	refs += refs_to_mod;

	if (refs > 0) {
		if (type == BTRFS_EXTENT_DATA_REF_KEY)
			btrfs_set_extent_data_ref_count(leaf, dref, refs);
		else
			btrfs_set_shared_data_ref_count(leaf, sref, refs);
	} else {
		size = btrfs_extent_inline_ref_size(type);
		item_size = btrfs_item_size(leaf, path->slots[0]);
		ptr = (unsigned long)iref;
		end = (unsigned long)ei + item_size;
		if (ptr + size < end)
			memmove_extent_buffer(leaf, ptr, ptr + size,
					      end - ptr - size);
		item_size -= size;
		btrfs_truncate_item(trans, path, item_size, 1);
	}
	btrfs_mark_buffer_dirty(trans, leaf);
	return 0;
}

static noinline_for_stack
int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_path *path,
				 u64 bytenr, u64 num_bytes, u64 parent,
				 u64 root_objectid, u64 owner,
				 u64 offset, int refs_to_add,
				 struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_extent_inline_ref *iref;
	int ret;

	ret = lookup_inline_extent_backref(trans, path, &iref, bytenr,
					   num_bytes, parent, root_objectid,
					   owner, offset, 1);
	if (ret == 0) {
		/*
		 * We're adding refs to a tree block we already own, this
		 * should not happen at all.
		 */
		if (owner < BTRFS_FIRST_FREE_OBJECTID) {
			btrfs_print_leaf(path->nodes[0]);
			btrfs_crit(trans->fs_info,
"adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu slot %u",
				   bytenr, num_bytes, root_objectid, path->slots[0]);
			return -EUCLEAN;
		}
		ret = update_inline_extent_backref(trans, path, iref,
						   refs_to_add, extent_op);
	} else if (ret == -ENOENT) {
		setup_inline_extent_backref(trans, path, iref, parent,
					    root_objectid, owner, offset,
					    refs_to_add, extent_op);
		ret = 0;
	}
	return ret;
}

static int remove_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref *iref,
				 int refs_to_drop, int is_data)
{
	int ret = 0;

	BUG_ON(!is_data && refs_to_drop != 1);
	if (iref)
		ret = update_inline_extent_backref(trans, path, iref,
						   -refs_to_drop, NULL);
	else if (is_data)
		ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
	else
		ret = btrfs_del_item(trans, root, path);
	return ret;
}

static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
			       u64 *discarded_bytes)
{
	int j, ret = 0;
	u64 bytes_left, end;
	u64 aligned_start = ALIGN(start, 1 << SECTOR_SHIFT);

	/* Adjust the range to be aligned to 512B sectors if necessary. */
	if (start != aligned_start) {
		len -= aligned_start - start;
		len = round_down(len, 1 << SECTOR_SHIFT);
		start = aligned_start;
	}

	*discarded_bytes = 0;

	if (!len)
		return 0;

	end = start + len;
	bytes_left = len;

	/* Skip any superblocks on this device. */
	for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
		u64 sb_start = btrfs_sb_offset(j);
		u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
		u64 size = sb_start - start;

		if (!in_range(sb_start, start, bytes_left) &&
		    !in_range(sb_end, start, bytes_left) &&
		    !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
			continue;

		/*
		 * Superblock spans beginning of range.  Adjust start and
		 * try again.
		 */
		if (sb_start <= start) {
			start += sb_end - start;
			if (start > end) {
				bytes_left = 0;
				break;
			}
			bytes_left = end - start;
			continue;
		}

		if (size) {
			ret = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT,
						   size >> SECTOR_SHIFT,
						   GFP_NOFS);
			if (!ret)
				*discarded_bytes += size;
			else if (ret != -EOPNOTSUPP)
				return ret;
		}

		start = sb_end;
		if (start > end) {
			bytes_left = 0;
			break;
		}
		bytes_left = end - start;
	}

	while (bytes_left) {
		u64 bytes_to_discard = min(BTRFS_MAX_DISCARD_CHUNK_SIZE, bytes_left);

		ret = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT,
					   bytes_to_discard >> SECTOR_SHIFT,
					   GFP_NOFS);

		if (ret) {
			if (ret != -EOPNOTSUPP)
				break;
			ret = 0;
		}

		start += bytes_to_discard;
		bytes_left -= bytes_to_discard;
		*discarded_bytes += bytes_to_discard;

		if (btrfs_trim_interrupted()) {
			ret = -ERESTARTSYS;
			break;
		}
	}

	return ret;
}

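/*
 * Illustrative sketch (not part of the original file): the 512B alignment
 * adjustment done at the top of btrfs_issue_discard(), worked through with
 * example numbers. Assuming start = 1000 and len = 4096:
 *
 *	aligned_start = ALIGN(1000, 512)          = 1024
 *	len          -= aligned_start - start     : 4096 - 24 = 4072
 *	len           = round_down(4072, 512)     = 3584
 *	start         = 1024
 *
 * so the discard covers [1024, 1024 + 3584) and never touches the unaligned
 * head or tail of the original range.
 */
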
static int do_discard_extent(struct btrfs_discard_stripe *stripe, u64 *bytes)
{
	struct btrfs_device *dev = stripe->dev;
	struct btrfs_fs_info *fs_info = dev->fs_info;
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
	u64 phys = stripe->physical;
	u64 len = stripe->length;
	u64 discarded = 0;
	int ret = 0;

	/* Zone reset on a zoned filesystem */
	if (btrfs_can_zone_reset(dev, phys, len)) {
		u64 src_disc;

		ret = btrfs_reset_device_zone(dev, phys, len, &discarded);
		if (ret)
			goto out;

		if (!btrfs_dev_replace_is_ongoing(dev_replace) ||
		    dev != dev_replace->srcdev)
			goto out;

		src_disc = discarded;

		/* Send to replace target as well */
		ret = btrfs_reset_device_zone(dev_replace->tgtdev, phys, len,
					      &discarded);
		discarded += src_disc;
	} else if (bdev_max_discard_sectors(stripe->dev->bdev)) {
		ret = btrfs_issue_discard(dev->bdev, phys, len, &discarded);
	}

out:
	*bytes = discarded;
	return ret;
}

int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
			 u64 num_bytes, u64 *actual_bytes)
{
	int ret = 0;
	u64 discarded_bytes = 0;
	u64 end = bytenr + num_bytes;
	u64 cur = bytenr;

	/*
	 * Avoid races with device replace and make sure the devices in the
	 * stripes don't go away while we are discarding.
	 */
	btrfs_bio_counter_inc_blocked(fs_info);
	while (cur < end) {
		struct btrfs_discard_stripe *stripes;
		unsigned int num_stripes;
		int i;

		num_bytes = end - cur;
		stripes = btrfs_map_discard(fs_info, cur, &num_bytes, &num_stripes);
		if (IS_ERR(stripes)) {
			ret = PTR_ERR(stripes);
			if (ret == -EOPNOTSUPP)
				ret = 0;
			break;
		}

		for (i = 0; i < num_stripes; i++) {
			struct btrfs_discard_stripe *stripe = stripes + i;
			u64 bytes;

			if (!stripe->dev->bdev) {
				ASSERT(btrfs_test_opt(fs_info, DEGRADED));
				continue;
			}

			if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
					&stripe->dev->dev_state))
				continue;

			ret = do_discard_extent(stripe, &bytes);
			if (ret) {
				/*
				 * Keep going if discard is not supported by the
				 * device.
				 */
				if (ret != -EOPNOTSUPP)
					break;
				ret = 0;
			} else {
				discarded_bytes += bytes;
			}
		}
		kfree(stripes);
		if (ret)
			break;
		cur += num_bytes;
	}
	btrfs_bio_counter_dec(fs_info);
	if (actual_bytes)
		*actual_bytes = discarded_bytes;
	return ret;
}

/* Can return -ENOMEM */
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
			 struct btrfs_ref *generic_ref)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int ret;

	ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
	       generic_ref->action);
	BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
	       generic_ref->ref_root == BTRFS_TREE_LOG_OBJECTID);

	if (generic_ref->type == BTRFS_REF_METADATA)
		ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL);
	else
		ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0);

	btrfs_ref_tree_mod(fs_info, generic_ref);

	return ret;
}

/*
 * Insert a backreference for a given extent.
 *
 * The counterpart is in __btrfs_free_extent(), with examples and more details
 * on how it works.
 *
 * @trans:	    Handle of transaction
 *
 * @node:	    The delayed ref node used to get the bytenr/length for the
 *		    extent whose references are incremented.
 *
 * @extent_op:      Pointer to a structure, holding information necessary when
 *                  updating a tree block's flags
 *
 */
static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_delayed_ref_node *node,
				  struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_extent_item *item;
	struct btrfs_key key;
	u64 bytenr = node->bytenr;
	u64 num_bytes = node->num_bytes;
	u64 owner = btrfs_delayed_ref_owner(node);
	u64 offset = btrfs_delayed_ref_offset(node);
	u64 refs;
	int refs_to_add = node->ref_mod;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* this will setup the path even if it fails to insert the back ref */
	ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
					   node->parent, node->ref_root, owner,
					   offset, refs_to_add, extent_op);
	if ((ret < 0 && ret != -EAGAIN) || !ret)
		goto out;

	/*
	 * Ok we had -EAGAIN which means we didn't have space to insert an
	 * inline extent ref, so just update the reference count and add a
	 * normal backref.
	 */
	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, item);
	btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, item);

	btrfs_mark_buffer_dirty(trans, leaf);
	btrfs_release_path(path);

	/* now insert the actual backref */
	if (owner < BTRFS_FIRST_FREE_OBJECTID)
		ret = insert_tree_block_ref(trans, path, node, bytenr);
	else
		ret = insert_extent_data_ref(trans, path, node, bytenr);

	if (ret)
		btrfs_abort_transaction(trans, ret);
out:
	btrfs_free_path(path);
	return ret;
}

static void free_head_ref_squota_rsv(struct btrfs_fs_info *fs_info,
				     struct btrfs_delayed_ref_head *href)
{
	u64 root = href->owning_root;

	/*
	 * Don't check must_insert_reserved, as this is called from contexts
	 * where it has already been unset.
	 */
	if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE ||
	    !href->is_data || !is_fstree(root))
		return;

	btrfs_qgroup_free_refroot(fs_info, root, href->reserved_bytes,
				  BTRFS_QGROUP_RSV_DATA);
}

static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
				struct btrfs_delayed_ref_head *href,
				struct btrfs_delayed_ref_node *node,
				struct btrfs_delayed_extent_op *extent_op,
				bool insert_reserved)
{
	int ret = 0;
	u64 parent = 0;
	u64 flags = 0;

	trace_run_delayed_data_ref(trans->fs_info, node);

	if (node->type == BTRFS_SHARED_DATA_REF_KEY)
		parent = node->parent;

	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
		struct btrfs_key key;
		struct btrfs_squota_delta delta = {
			.root = href->owning_root,
			.num_bytes = node->num_bytes,
			.is_data = true,
			.is_inc = true,
			.generation = trans->transid,
		};
		u64 owner = btrfs_delayed_ref_owner(node);
		u64 offset = btrfs_delayed_ref_offset(node);

		if (extent_op)
			flags |= extent_op->flags_to_set;

		key.objectid = node->bytenr;
		key.type = BTRFS_EXTENT_ITEM_KEY;
		key.offset = node->num_bytes;

		ret = alloc_reserved_file_extent(trans, parent, node->ref_root,
						 flags, owner, offset, &key,
						 node->ref_mod,
						 href->owning_root);
		free_head_ref_squota_rsv(trans->fs_info, href);
		if (!ret)
			ret = btrfs_record_squota_delta(trans->fs_info, &delta);
	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
		ret = __btrfs_inc_extent_ref(trans, node, extent_op);
	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
		ret = __btrfs_free_extent(trans, href, node, extent_op);
	} else {
		BUG();
	}
	return ret;
}

static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
				    struct extent_buffer *leaf,
				    struct btrfs_extent_item *ei)
{
	u64 flags = btrfs_extent_flags(leaf, ei);

	if (extent_op->update_flags) {
		flags |= extent_op->flags_to_set;
		btrfs_set_extent_flags(leaf, ei, flags);
	}

	if (extent_op->update_key) {
		struct btrfs_tree_block_info *bi;

		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
		bi = (struct btrfs_tree_block_info *)(ei + 1);
		btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
	}
}

static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
				 struct btrfs_delayed_ref_head *head,
				 struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *root;
	struct btrfs_key key;
	struct btrfs_path *path;
	struct btrfs_extent_item *ei;
	struct extent_buffer *leaf;
	u32 item_size;
	int ret;
	int metadata = 1;

	if (TRANS_ABORTED(trans))
		return 0;

	if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
		metadata = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = head->bytenr;

	if (metadata) {
		key.type = BTRFS_METADATA_ITEM_KEY;
		key.offset = head->level;
	} else {
		key.type = BTRFS_EXTENT_ITEM_KEY;
		key.offset = head->num_bytes;
	}

	root = btrfs_extent_root(fs_info, key.objectid);
again:
	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
	if (ret < 0) {
		goto out;
	} else if (ret > 0) {
		if (metadata) {
			if (path->slots[0] > 0) {
				path->slots[0]--;
				btrfs_item_key_to_cpu(path->nodes[0], &key,
						      path->slots[0]);
				if (key.objectid == head->bytenr &&
				    key.type == BTRFS_EXTENT_ITEM_KEY &&
				    key.offset == head->num_bytes)
					ret = 0;
			}
			if (ret > 0) {
				btrfs_release_path(path);
				metadata = 0;

				key.objectid = head->bytenr;
				key.offset = head->num_bytes;
				key.type = BTRFS_EXTENT_ITEM_KEY;
				goto again;
			}
		} else {
			ret = -EUCLEAN;
			btrfs_err(fs_info,
		  "missing extent item for extent %llu num_bytes %llu level %d",
				  head->bytenr, head->num_bytes, head->level);
			goto out;
		}
	}

	leaf = path->nodes[0];
	item_size = btrfs_item_size(leaf, path->slots[0]);

	if (unlikely(item_size < sizeof(*ei))) {
		ret = -EUCLEAN;
		btrfs_err(fs_info,
			  "unexpected extent item size, has %u expect >= %zu",
			  item_size, sizeof(*ei));
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	__run_delayed_extent_op(extent_op, leaf, ei);

	btrfs_mark_buffer_dirty(trans, leaf);
out:
	btrfs_free_path(path);
	return ret;
}

static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
				struct btrfs_delayed_ref_head *href,
				struct btrfs_delayed_ref_node *node,
				struct btrfs_delayed_extent_op *extent_op,
				bool insert_reserved)
{
	int ret = 0;
	struct btrfs_fs_info *fs_info = trans->fs_info;
	u64 parent = 0;
	u64 ref_root = 0;

	trace_run_delayed_tree_ref(trans->fs_info, node);

	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
		parent = node->parent;
	ref_root = node->ref_root;

	if (unlikely(node->ref_mod != 1)) {
		btrfs_err(trans->fs_info,
	"btree block %llu has %d references rather than 1: action %d ref_root %llu parent %llu",
			  node->bytenr, node->ref_mod, node->action, ref_root,
			  parent);
		return -EUCLEAN;
	}
	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
		struct btrfs_squota_delta delta = {
			.root = href->owning_root,
			.num_bytes = fs_info->nodesize,
			.is_data = false,
			.is_inc = true,
			.generation = trans->transid,
		};

		ret = alloc_reserved_tree_block(trans, node, extent_op);
		if (!ret)
			btrfs_record_squota_delta(fs_info, &delta);
	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
		ret = __btrfs_inc_extent_ref(trans, node, extent_op);
	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
		ret = __btrfs_free_extent(trans, href, node, extent_op);
	} else {
		BUG();
	}
	return ret;
}

/* helper function to actually process a single delayed ref entry */
static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
			       struct btrfs_delayed_ref_head *href,
			       struct btrfs_delayed_ref_node *node,
			       struct btrfs_delayed_extent_op *extent_op,
			       bool insert_reserved)
{
	int ret = 0;

	if (TRANS_ABORTED(trans)) {
		if (insert_reserved) {
			btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
			free_head_ref_squota_rsv(trans->fs_info, href);
		}
		return 0;
	}

	if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
	    node->type == BTRFS_SHARED_BLOCK_REF_KEY)
		ret = run_delayed_tree_ref(trans, href, node, extent_op,
					   insert_reserved);
	else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
		 node->type == BTRFS_SHARED_DATA_REF_KEY)
		ret = run_delayed_data_ref(trans, href, node, extent_op,
					   insert_reserved);
	else if (node->type == BTRFS_EXTENT_OWNER_REF_KEY)
		ret = 0;
	else
		BUG();
	if (ret && insert_reserved)
		btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
	if (ret < 0)
		btrfs_err(trans->fs_info,
"failed to run delayed ref for logical %llu num_bytes %llu type %u action %u ref_mod %d: %d",
			  node->bytenr, node->num_bytes, node->type,
			  node->action, node->ref_mod, ret);
	return ret;
}

static inline struct btrfs_delayed_ref_node *
select_delayed_ref(struct btrfs_delayed_ref_head *head)
{
	struct btrfs_delayed_ref_node *ref;

	if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
		return NULL;

	/*
	 * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
	 * This is to prevent a ref count from going down to zero, which deletes
	 * the extent item from the extent tree, when there still are references
	 * to add, which would fail because they would not find the extent item.
	 */
	if (!list_empty(&head->ref_add_list))
		return list_first_entry(&head->ref_add_list,
					struct btrfs_delayed_ref_node, add_list);

	ref = rb_entry(rb_first_cached(&head->ref_tree),
		       struct btrfs_delayed_ref_node, ref_node);
	ASSERT(list_empty(&ref->add_list));
	return ref;
}

static struct btrfs_delayed_extent_op *cleanup_extent_op(
				struct btrfs_delayed_ref_head *head)
{
	struct btrfs_delayed_extent_op *extent_op = head->extent_op;

	if (!extent_op)
		return NULL;

	if (head->must_insert_reserved) {
		head->extent_op = NULL;
		btrfs_free_delayed_extent_op(extent_op);
		return NULL;
	}
	return extent_op;
}

static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
				     struct btrfs_delayed_ref_head *head)
{
	struct btrfs_delayed_extent_op *extent_op;
	int ret;

	extent_op = cleanup_extent_op(head);
	if (!extent_op)
		return 0;
	head->extent_op = NULL;
	spin_unlock(&head->lock);
	ret = run_delayed_extent_op(trans, head, extent_op);
	btrfs_free_delayed_extent_op(extent_op);
	return ret ? ret : 1;
}

u64 btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
				  struct btrfs_delayed_ref_root *delayed_refs,
				  struct btrfs_delayed_ref_head *head)
{
	u64 ret = 0;

	/*
	 * We had csum deletions accounted for in our delayed refs rsv, we need
	 * to drop the csum leaves for this update from our delayed_refs_rsv.
	 */
	if (head->total_ref_mod < 0 && head->is_data) {
		int nr_csums;

		spin_lock(&delayed_refs->lock);
		delayed_refs->pending_csums -= head->num_bytes;
		spin_unlock(&delayed_refs->lock);
		nr_csums = btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes);

		btrfs_delayed_refs_rsv_release(fs_info, 0, nr_csums);

		ret = btrfs_calc_delayed_ref_csum_bytes(fs_info, nr_csums);
	}
	/* must_insert_reserved can be set only if we didn't run the head ref. */
	if (head->must_insert_reserved)
		free_head_ref_squota_rsv(fs_info, head);

	return ret;
}

static int cleanup_ref_head(struct btrfs_trans_handle *trans,
			    struct btrfs_delayed_ref_head *head,
			    u64 *bytes_released)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_delayed_ref_root *delayed_refs;
	int ret;

	delayed_refs = &trans->transaction->delayed_refs;

	ret = run_and_cleanup_extent_op(trans, head);
	if (ret < 0) {
		btrfs_unselect_ref_head(delayed_refs, head);
		btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
		return ret;
	} else if (ret) {
		return ret;
	}

	/*
	 * Need to drop our head ref lock and re-acquire the delayed ref lock
	 * and then re-check to make sure nobody got added.
	 */
	spin_unlock(&head->lock);
	spin_lock(&delayed_refs->lock);
	spin_lock(&head->lock);
	if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
		spin_unlock(&head->lock);
		spin_unlock(&delayed_refs->lock);
		return 1;
	}
	btrfs_delete_ref_head(fs_info, delayed_refs, head);
	spin_unlock(&head->lock);
	spin_unlock(&delayed_refs->lock);

	if (head->must_insert_reserved) {
		btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1);
		if (head->is_data) {
			struct btrfs_root *csum_root;

			csum_root = btrfs_csum_root(fs_info, head->bytenr);
			ret = btrfs_del_csums(trans, csum_root, head->bytenr,
					      head->num_bytes);
		}
	}

	*bytes_released += btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);

	trace_run_delayed_ref_head(fs_info, head, 0);
	btrfs_delayed_ref_unlock(head);
	btrfs_put_delayed_ref_head(head);
	return ret;
}

static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
					   struct btrfs_delayed_ref_head *locked_ref,
					   u64 *bytes_released)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_delayed_extent_op *extent_op;
	struct btrfs_delayed_ref_node *ref;
	bool must_insert_reserved;
	int ret;

	delayed_refs = &trans->transaction->delayed_refs;

	lockdep_assert_held(&locked_ref->mutex);
	lockdep_assert_held(&locked_ref->lock);

	while ((ref = select_delayed_ref(locked_ref))) {
		if (ref->seq &&
		    btrfs_check_delayed_seq(fs_info, ref->seq)) {
			spin_unlock(&locked_ref->lock);
			btrfs_unselect_ref_head(delayed_refs, locked_ref);
			return -EAGAIN;
		}

		rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
		RB_CLEAR_NODE(&ref->ref_node);
		if (!list_empty(&ref->add_list))
			list_del(&ref->add_list);
		/*
		 * When we play the delayed ref, also correct the ref_mod on
		 * the head.
		 */
		switch (ref->action) {
		case BTRFS_ADD_DELAYED_REF:
		case BTRFS_ADD_DELAYED_EXTENT:
			locked_ref->ref_mod -= ref->ref_mod;
			break;
		case BTRFS_DROP_DELAYED_REF:
			locked_ref->ref_mod += ref->ref_mod;
			break;
		default:
			WARN_ON(1);
		}

		/*
		 * Record the must_insert_reserved flag before we drop the
		 * spin lock.
		 */
		must_insert_reserved = locked_ref->must_insert_reserved;
		/*
		 * Unsetting this on the head ref relinquishes ownership of
		 * the rsv_bytes, so it is critical that every possible code
		 * path from here forward frees all reserves including qgroup
		 * reserve.
		 */
		locked_ref->must_insert_reserved = false;

		extent_op = locked_ref->extent_op;
		locked_ref->extent_op = NULL;
		spin_unlock(&locked_ref->lock);

		ret = run_one_delayed_ref(trans, locked_ref, ref, extent_op,
					  must_insert_reserved);
		btrfs_delayed_refs_rsv_release(fs_info, 1, 0);
		*bytes_released += btrfs_calc_delayed_ref_bytes(fs_info, 1);

		btrfs_free_delayed_extent_op(extent_op);
		if (ret) {
			btrfs_unselect_ref_head(delayed_refs, locked_ref);
			btrfs_put_delayed_ref(ref);
			return ret;
		}

		btrfs_put_delayed_ref(ref);
		cond_resched();

		spin_lock(&locked_ref->lock);
		btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref);
	}

	return 0;
}

/*
 * Returns 0 on success or if called with an already aborted transaction.
 * Returns -ENOMEM or -EIO on failure and will abort the transaction.
 */
static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
					     u64 min_bytes)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_delayed_ref_head *locked_ref = NULL;
	int ret;
	unsigned long count = 0;
	unsigned long max_count = 0;
	u64 bytes_processed = 0;

	delayed_refs = &trans->transaction->delayed_refs;
	if (min_bytes == 0) {
		max_count = delayed_refs->num_heads_ready;
		min_bytes = U64_MAX;
	}

	do {
		if (!locked_ref) {
			locked_ref = btrfs_select_ref_head(fs_info, delayed_refs);
			if (IS_ERR_OR_NULL(locked_ref)) {
				if (PTR_ERR(locked_ref) == -EAGAIN)
					continue;
				break;
			}
			count++;
		}
		/*
		 * We need to try and merge add/drops of the same ref since we
		 * can run into issues with relocate dropping the implicit ref
		 * and then it being added back again before the drop can
		 * finish. If we merged anything we need to re-loop so we can
		 * get a good ref.
		 * Or we can get node references of the same type that weren't
		 * merged when created due to bumps in the tree mod seq, and
		 * we need to merge them to prevent adding an inline extent
		 * backref before dropping it (triggering a BUG_ON at
		 * insert_inline_extent_backref()).
		 */
		spin_lock(&locked_ref->lock);
		btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref);

		ret = btrfs_run_delayed_refs_for_head(trans, locked_ref, &bytes_processed);
		if (ret < 0 && ret != -EAGAIN) {
			/*
			 * Error, btrfs_run_delayed_refs_for_head already
			 * unlocked everything so just bail out
			 */
			return ret;
		} else if (!ret) {
			/*
			 * Success, perform the usual cleanup of a processed
			 * head
			 */
			ret = cleanup_ref_head(trans, locked_ref, &bytes_processed);
			if (ret > 0) {
				/* We dropped our lock, we need to loop. */
				ret = 0;
				continue;
			} else if (ret) {
				return ret;
			}
		}

		/*
		 * Either success case or btrfs_run_delayed_refs_for_head
		 * returned -EAGAIN, meaning we need to select another head
		 */

		locked_ref = NULL;
		cond_resched();
	} while ((min_bytes != U64_MAX && bytes_processed < min_bytes) ||
		 (max_count > 0 && count < max_count) ||
		 locked_ref);

	return 0;
}

#ifdef SCRAMBLE_DELAYED_REFS
/*
 * Normally delayed refs get processed in ascending bytenr order. This
 * correlates in most cases to the order added. To expose dependencies on this
 * order, we start to process the tree in the middle instead of the beginning.
 */
static u64 find_middle(struct rb_root *root)
{
	struct rb_node *n = root->rb_node;
	struct btrfs_delayed_ref_node *entry;
	int alt = 1;
	u64 middle = 0;
	u64 first = 0, last = 0;

	n = rb_first(root);
	if (n) {
		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
		first = entry->bytenr;
	}
	n = rb_last(root);
	if (n) {
		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
		last = entry->bytenr;
	}
	n = root->rb_node;

	while (n) {
		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
		WARN_ON(!entry->in_tree);

		middle = entry->bytenr;

		if (alt)
			n = n->rb_left;
		else
			n = n->rb_right;

		alt = 1 - alt;
	}
	return middle;
}
#endif

/*
 * Start processing the delayed reference count updates and extent insertions
 * we have queued up so far.
 *
 * @trans:	Transaction handle.
 * @min_bytes:	How many bytes of delayed references to process. After this
 *		many bytes we stop processing delayed references if there are
 *		any more. If 0 it means to run all existing delayed references,
 *		but not new ones added after running all existing ones.
 *		Use (u64)-1 (U64_MAX) to run all existing delayed references
 *		plus any new ones that are added.
 *
 * Returns 0 on success or if called with an aborted transaction
 * Returns <0 on error and aborts the transaction
 */
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, u64 min_bytes)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_delayed_ref_root *delayed_refs;
	int ret;

	/* We'll clean this up in btrfs_cleanup_transaction */
	if (TRANS_ABORTED(trans))
		return 0;

	if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
		return 0;

	delayed_refs = &trans->transaction->delayed_refs;
again:
#ifdef SCRAMBLE_DELAYED_REFS
	delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
#endif
	ret = __btrfs_run_delayed_refs(trans, min_bytes);
	if (ret < 0) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	if (min_bytes == U64_MAX) {
		btrfs_create_pending_block_groups(trans);

		spin_lock(&delayed_refs->lock);
		if (xa_empty(&delayed_refs->head_refs)) {
			spin_unlock(&delayed_refs->lock);
			return 0;
		}
		spin_unlock(&delayed_refs->lock);

		cond_resched();
		goto again;
	}

	return 0;
}

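/*
 * Illustrative sketch (not part of the original file): the common ways the
 * @min_bytes argument documented above is used. The call sites below are
 * paraphrased examples, not copied from elsewhere in the tree.
 *
 *	// run all delayed refs queued so far, but not ones added meanwhile
 *	ret = btrfs_run_delayed_refs(trans, 0);
 *
 *	// run everything, including refs added while we are running
 *	ret = btrfs_run_delayed_refs(trans, U64_MAX);
 *
 *	// run only part of the backlog, e.g. to throttle a heavy writer
 *	ret = btrfs_run_delayed_refs(trans,
 *				     btrfs_calc_delayed_ref_bytes(fs_info, nr));
 */
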
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
				struct extent_buffer *eb, u64 flags)
{
	struct btrfs_delayed_extent_op *extent_op;
	int ret;

	extent_op = btrfs_alloc_delayed_extent_op();
	if (!extent_op)
		return -ENOMEM;

	extent_op->flags_to_set = flags;
	extent_op->update_flags = true;
	extent_op->update_key = false;

	ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len,
					  btrfs_header_level(eb), extent_op);
	if (ret)
		btrfs_free_delayed_extent_op(extent_op);
	return ret;
}
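/*
 * Check the delayed refs queued for @bytenr in the running transaction for a
 * reference that does not belong to (root, objectid, offset), i.e. a cross
 * reference. Returns -EAGAIN when the head mutex is contended so that the
 * caller (btrfs_cross_ref_exist()) releases the path and retries.
 */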
static noinline int check_delayed_ref(struct btrfs_root *root,
				      struct btrfs_path *path,
				      u64 objectid, u64 offset, u64 bytenr)
{
	struct btrfs_delayed_ref_head *head;
	struct btrfs_delayed_ref_node *ref;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_transaction *cur_trans;
	struct rb_node *node;
	int ret = 0;

	spin_lock(&root->fs_info->trans_lock);
	cur_trans = root->fs_info->running_transaction;
	if (cur_trans)
		refcount_inc(&cur_trans->use_count);
	spin_unlock(&root->fs_info->trans_lock);
	if (!cur_trans)
		return 0;

	delayed_refs = &cur_trans->delayed_refs;
	spin_lock(&delayed_refs->lock);
	head = btrfs_find_delayed_ref_head(root->fs_info, delayed_refs, bytenr);
	if (!head) {
		spin_unlock(&delayed_refs->lock);
		btrfs_put_transaction(cur_trans);
		return 0;
	}

	if (!mutex_trylock(&head->mutex)) {
		if (path->nowait) {
			spin_unlock(&delayed_refs->lock);
			btrfs_put_transaction(cur_trans);
			return -EAGAIN;
		}

		refcount_inc(&head->refs);
		spin_unlock(&delayed_refs->lock);

		btrfs_release_path(path);

		/*
		 * Mutex was contended, block until it's released and let
		 * caller try again
		 */
		mutex_lock(&head->mutex);
		mutex_unlock(&head->mutex);
		btrfs_put_delayed_ref_head(head);
		btrfs_put_transaction(cur_trans);
		return -EAGAIN;
	}
	spin_unlock(&delayed_refs->lock);

	spin_lock(&head->lock);
	/*
	 * XXX: We should replace this with a proper search function in the
	 * future.
	 */
	for (node = rb_first_cached(&head->ref_tree); node;
	     node = rb_next(node)) {
		u64 ref_owner;
		u64 ref_offset;

		ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
		/* If it's a shared ref we know a cross reference exists */
		if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
			ret = 1;
			break;
		}

		ref_owner = btrfs_delayed_ref_owner(ref);
		ref_offset = btrfs_delayed_ref_offset(ref);

		/*
		 * If our ref doesn't match the one we're currently looking at
		 * then we have a cross reference.
		 */
		if (ref->ref_root != btrfs_root_id(root) ||
		    ref_owner != objectid || ref_offset != offset) {
			ret = 1;
			break;
		}
	}
	spin_unlock(&head->lock);
	mutex_unlock(&head->mutex);
	btrfs_put_transaction(cur_trans);
	return ret;
}
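/*
 * Check the committed extent tree for references to @bytenr other than the
 * single expected data ref for (root, objectid, offset). The extent is
 * treated as shared if it carries extra inline refs, a SHARED_DATA_REF, a
 * data ref for a different root/inode/offset, or (unless @strict) if it was
 * created before the root's last snapshot.
 */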
static noinline int check_committed_ref(struct btrfs_root *root,
					struct btrfs_path *path,
					u64 objectid, u64 offset, u64 bytenr,
					bool strict)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr);
	struct extent_buffer *leaf;
	struct btrfs_extent_data_ref *ref;
	struct btrfs_extent_inline_ref *iref;
	struct btrfs_extent_item *ei;
	struct btrfs_key key;
	u32 item_size;
	u32 expected_size;
	int type;
	int ret;

	key.objectid = bytenr;
	key.offset = (u64)-1;
	key.type = BTRFS_EXTENT_ITEM_KEY;

	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret == 0) {
		/*
		 * Key with offset -1 found, there would have to exist an extent
		 * item with such offset, but this is out of the valid range.
		 */
		ret = -EUCLEAN;
		goto out;
	}

	ret = -ENOENT;
	if (path->slots[0] == 0)
		goto out;

	path->slots[0]--;
	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
		goto out;

	ret = 1;
	item_size = btrfs_item_size(leaf, path->slots[0]);
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	expected_size = sizeof(*ei) + btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY);

	/* No inline refs; we need to bail before checking for owner ref. */
	if (item_size == sizeof(*ei))
		goto out;

	/* Check for an owner ref; skip over it to the real inline refs. */
	iref = (struct btrfs_extent_inline_ref *)(ei + 1);
	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
	if (btrfs_fs_incompat(fs_info, SIMPLE_QUOTA) && type == BTRFS_EXTENT_OWNER_REF_KEY) {
		expected_size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY);
		iref = (struct btrfs_extent_inline_ref *)(iref + 1);
	}

	/* If extent item has more than 1 inline ref then it's shared */
	if (item_size != expected_size)
		goto out;

	/*
	 * If extent created before last snapshot => it's shared unless the
	 * snapshot has been deleted. Use the heuristic if strict is false.
	 */
	if (!strict &&
	    (btrfs_extent_generation(leaf, ei) <=
	     btrfs_root_last_snapshot(&root->root_item)))
		goto out;

	/* If this extent has SHARED_DATA_REF then it's shared */
	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
	if (type != BTRFS_EXTENT_DATA_REF_KEY)
		goto out;

	ref = (struct btrfs_extent_data_ref *)(&iref->offset);
	if (btrfs_extent_refs(leaf, ei) !=
	    btrfs_extent_data_ref_count(leaf, ref) ||
	    btrfs_extent_data_ref_root(leaf, ref) != btrfs_root_id(root) ||
	    btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
		goto out;

	ret = 0;
out:
	return ret;
}
int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
			  u64 bytenr, bool strict, struct btrfs_path *path)
{
	int ret;

	do {
		ret = check_committed_ref(root, path, objectid,
					  offset, bytenr, strict);
		if (ret && ret != -ENOENT)
			goto out;

		ret = check_delayed_ref(root, path, objectid, offset, bytenr);
	} while (ret == -EAGAIN);

out:
	btrfs_release_path(path);
	if (btrfs_is_data_reloc_root(root))
		WARN_ON(ret > 0);
	return ret;
}
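/*
 * Add or drop one delayed reference for every extent pointed to by @buf:
 * file extent items when @buf is a leaf, child tree blocks when it is a
 * node. @inc selects BTRFS_ADD_DELAYED_REF vs BTRFS_DROP_DELAYED_REF and
 * @full_backref makes the refs parent-based (shared) instead of root-based.
 */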
static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct extent_buffer *buf,
			   int full_backref, int inc)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 parent;
	u64 ref_root;
	u32 nritems;
	struct btrfs_key key;
	struct btrfs_file_extent_item *fi;
	bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC);
	int i;
	int action;
	int level;
	int ret = 0;

	if (btrfs_is_testing(fs_info))
		return 0;

	ref_root = btrfs_header_owner(buf);
	nritems = btrfs_header_nritems(buf);
	level = btrfs_header_level(buf);

	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && level == 0)
		return 0;

	if (full_backref)
		parent = buf->start;
	else
		parent = 0;
	if (inc)
		action = BTRFS_ADD_DELAYED_REF;
	else
		action = BTRFS_DROP_DELAYED_REF;

	for (i = 0; i < nritems; i++) {
		struct btrfs_ref ref = {
			.action = action,
			.parent = parent,
			.ref_root = ref_root,
		};

		if (level == 0) {
			btrfs_item_key_to_cpu(buf, &key, i);
			if (key.type != BTRFS_EXTENT_DATA_KEY)
				continue;
			fi = btrfs_item_ptr(buf, i,
					    struct btrfs_file_extent_item);
			if (btrfs_file_extent_type(buf, fi) ==
			    BTRFS_FILE_EXTENT_INLINE)
				continue;
			ref.bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
			if (ref.bytenr == 0)
				continue;

			ref.num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
			ref.owning_root = ref_root;

			key.offset -= btrfs_file_extent_offset(buf, fi);
			btrfs_init_data_ref(&ref, key.objectid, key.offset,
					    btrfs_root_id(root), for_reloc);
			if (inc)
				ret = btrfs_inc_extent_ref(trans, &ref);
			else
				ret = btrfs_free_extent(trans, &ref);
			if (ret)
				goto fail;
		} else {
			/* We don't know the owning_root, leave as 0. */
			ref.bytenr = btrfs_node_blockptr(buf, i);
			ref.num_bytes = fs_info->nodesize;

			btrfs_init_tree_ref(&ref, level - 1,
					    btrfs_root_id(root), for_reloc);
			if (inc)
				ret = btrfs_inc_extent_ref(trans, &ref);
			else
				ret = btrfs_free_extent(trans, &ref);
			if (ret)
				goto fail;
		}
	}
	return 0;
fail:
	return ret;
}

int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		  struct extent_buffer *buf, int full_backref)
{
	return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
}

int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		  struct extent_buffer *buf, int full_backref)
{
	return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
}
static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 flags;
	u64 ret;

	if (data)
		flags = BTRFS_BLOCK_GROUP_DATA;
	else if (root == fs_info->chunk_root)
		flags = BTRFS_BLOCK_GROUP_SYSTEM;
	else
		flags = BTRFS_BLOCK_GROUP_METADATA;

	ret = btrfs_get_alloc_profile(fs_info, flags);
	return ret;
}

static u64 first_logical_byte(struct btrfs_fs_info *fs_info)
{
	struct rb_node *leftmost;
	u64 bytenr = 0;

	read_lock(&fs_info->block_group_cache_lock);
	/* Get the block group with the lowest logical start address. */
	leftmost = rb_first_cached(&fs_info->block_group_cache_tree);
	if (leftmost) {
		struct btrfs_block_group *bg;

		bg = rb_entry(leftmost, struct btrfs_block_group, cache_node);
		bytenr = bg->start;
	}
	read_unlock(&fs_info->block_group_cache_lock);

	return bytenr;
}
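/*
 * Account @num_bytes at @bytenr as pinned in the block group and its space
 * info (optionally moving it out of the reserved counters) and mark the range
 * EXTENT_DIRTY in the transaction's pinned_extents tree so it is only handed
 * back for reuse once the transaction commits.
 */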
static int pin_down_extent(struct btrfs_trans_handle *trans,
			   struct btrfs_block_group *cache,
			   u64 bytenr, u64 num_bytes, int reserved)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;

	spin_lock(&cache->space_info->lock);
	spin_lock(&cache->lock);
	cache->pinned += num_bytes;
	btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info,
					     num_bytes);
	if (reserved) {
		cache->reserved -= num_bytes;
		cache->space_info->bytes_reserved -= num_bytes;
	}
	spin_unlock(&cache->lock);
	spin_unlock(&cache->space_info->lock);

	set_extent_bit(&trans->transaction->pinned_extents, bytenr,
		       bytenr + num_bytes - 1, EXTENT_DIRTY, NULL);
	return 0;
}

int btrfs_pin_extent(struct btrfs_trans_handle *trans,
		     u64 bytenr, u64 num_bytes, int reserved)
{
	struct btrfs_block_group *cache;

	cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
	BUG_ON(!cache); /* Logic error */

	pin_down_extent(trans, cache, bytenr, num_bytes, reserved);

	btrfs_put_block_group(cache);
	return 0;
}
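/*
 * Used during log replay: pin the range covered by @eb and make sure it is
 * removed from the block group's free space cache, caching the block group
 * first so the removal actually sees the space.
 */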
int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
				    const struct extent_buffer *eb)
{
	struct btrfs_block_group *cache;
	int ret;

	cache = btrfs_lookup_block_group(trans->fs_info, eb->start);
	if (!cache)
		return -EINVAL;

	/*
	 * Fully cache the free space first so that our pin removes the free space
	 * from the cache.
	 */
	ret = btrfs_cache_block_group(cache, true);
	if (ret)
		goto out;

	pin_down_extent(trans, cache, eb->start, eb->len, 0);

	/* remove us from the free space cache (if we're there at all) */
	ret = btrfs_remove_free_space(cache, eb->start, eb->len);
out:
	btrfs_put_block_group(cache);
	return ret;
}

static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
				   u64 start, u64 num_bytes)
{
	int ret;
	struct btrfs_block_group *block_group;

	block_group = btrfs_lookup_block_group(fs_info, start);
	if (!block_group)
		return -EINVAL;

	ret = btrfs_cache_block_group(block_group, true);
	if (ret)
		goto out;

	ret = btrfs_remove_free_space(block_group, start, num_bytes);
out:
	btrfs_put_block_group(block_group);
	return ret;
}
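/*
 * For MIXED_GROUPS filesystems only: walk the file extent items of a logged
 * extent buffer and exclude their disk ranges from the free space cache via
 * __exclude_logged_extent(), so those ranges are not considered free while
 * the log is replayed.
 */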
int btrfs_exclude_logged_extents(struct extent_buffer *eb)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;
	struct btrfs_file_extent_item *item;
	struct btrfs_key key;
	int found_type;
	int i;
	int ret = 0;

	if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
		return 0;

	for (i = 0; i < btrfs_header_nritems(eb); i++) {
		btrfs_item_key_to_cpu(eb, &key, i);
		if (key.type != BTRFS_EXTENT_DATA_KEY)
			continue;
		item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
		found_type = btrfs_file_extent_type(eb, item);
		if (found_type == BTRFS_FILE_EXTENT_INLINE)
			continue;
		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
			continue;
		key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
		key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
		ret = __exclude_logged_extent(fs_info, key.objectid, key.offset);
		if (ret)
			break;
	}

	return ret;
}

void btrfs_inc_block_group_reservations(struct btrfs_block_group *bg)
{
	atomic_inc(&bg->reservations);
}
/*
 * Returns the free cluster for the given space info and sets empty_cluster to
 * what it should be based on the mount options.
 */
static struct btrfs_free_cluster *
fetch_cluster_info(struct btrfs_fs_info *fs_info,
		   struct btrfs_space_info *space_info, u64 *empty_cluster)
{
	struct btrfs_free_cluster *ret = NULL;

	*empty_cluster = 0;
	if (btrfs_mixed_space_info(space_info))
		return ret;

	if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
		ret = &fs_info->meta_alloc_cluster;
		if (btrfs_test_opt(fs_info, SSD))
			*empty_cluster = SZ_2M;
		else
			*empty_cluster = SZ_64K;
	} else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) &&
		   btrfs_test_opt(fs_info, SSD_SPREAD)) {
		*empty_cluster = SZ_2M;
		ret = &fs_info->data_alloc_cluster;
	}

	return ret;
}
2721 static int unpin_extent_range(struct btrfs_fs_info
*fs_info
,
2723 const bool return_free_space
)
2725 struct btrfs_block_group
*cache
= NULL
;
2726 struct btrfs_space_info
*space_info
;
2727 struct btrfs_block_rsv
*global_rsv
= &fs_info
->global_block_rsv
;
2728 struct btrfs_free_cluster
*cluster
= NULL
;
2730 u64 total_unpinned
= 0;
2731 u64 empty_cluster
= 0;
2735 while (start
<= end
) {
2738 start
>= cache
->start
+ cache
->length
) {
2740 btrfs_put_block_group(cache
);
2742 cache
= btrfs_lookup_block_group(fs_info
, start
);
2743 if (cache
== NULL
) {
2744 /* Logic error, something removed the block group. */
2749 cluster
= fetch_cluster_info(fs_info
,
2752 empty_cluster
<<= 1;
2755 len
= cache
->start
+ cache
->length
- start
;
2756 len
= min(len
, end
+ 1 - start
);
2758 if (return_free_space
)
2759 btrfs_add_free_space(cache
, start
, len
);
2762 total_unpinned
+= len
;
2763 space_info
= cache
->space_info
;
2766 * If this space cluster has been marked as fragmented and we've
2767 * unpinned enough in this block group to potentially allow a
2768 * cluster to be created inside of it go ahead and clear the
2771 if (cluster
&& cluster
->fragmented
&&
2772 total_unpinned
> empty_cluster
) {
2773 spin_lock(&cluster
->lock
);
2774 cluster
->fragmented
= 0;
2775 spin_unlock(&cluster
->lock
);
2778 spin_lock(&space_info
->lock
);
2779 spin_lock(&cache
->lock
);
2780 cache
->pinned
-= len
;
2781 btrfs_space_info_update_bytes_pinned(fs_info
, space_info
, -len
);
2782 space_info
->max_extent_size
= 0;
2784 space_info
->bytes_readonly
+= len
;
2786 } else if (btrfs_is_zoned(fs_info
)) {
2787 /* Need reset before reusing in a zoned block group */
2788 btrfs_space_info_update_bytes_zone_unusable(fs_info
, space_info
,
2792 spin_unlock(&cache
->lock
);
2793 if (!readonly
&& return_free_space
&&
2794 global_rsv
->space_info
== space_info
) {
2795 spin_lock(&global_rsv
->lock
);
2796 if (!global_rsv
->full
) {
2797 u64 to_add
= min(len
, global_rsv
->size
-
2798 global_rsv
->reserved
);
2800 global_rsv
->reserved
+= to_add
;
2801 btrfs_space_info_update_bytes_may_use(fs_info
,
2802 space_info
, to_add
);
2803 if (global_rsv
->reserved
>= global_rsv
->size
)
2804 global_rsv
->full
= 1;
2807 spin_unlock(&global_rsv
->lock
);
2809 /* Add to any tickets we may have */
2810 if (!readonly
&& return_free_space
&& len
)
2811 btrfs_try_granting_tickets(fs_info
, space_info
);
2812 spin_unlock(&space_info
->lock
);
2816 btrfs_put_block_group(cache
);
2821 int btrfs_finish_extent_commit(struct btrfs_trans_handle
*trans
)
2823 struct btrfs_fs_info
*fs_info
= trans
->fs_info
;
2824 struct btrfs_block_group
*block_group
, *tmp
;
2825 struct list_head
*deleted_bgs
;
2826 struct extent_io_tree
*unpin
;
2831 unpin
= &trans
->transaction
->pinned_extents
;
2833 while (!TRANS_ABORTED(trans
)) {
2834 struct extent_state
*cached_state
= NULL
;
2836 mutex_lock(&fs_info
->unused_bg_unpin_mutex
);
2837 if (!find_first_extent_bit(unpin
, 0, &start
, &end
,
2838 EXTENT_DIRTY
, &cached_state
)) {
2839 mutex_unlock(&fs_info
->unused_bg_unpin_mutex
);
2843 if (btrfs_test_opt(fs_info
, DISCARD_SYNC
))
2844 ret
= btrfs_discard_extent(fs_info
, start
,
2845 end
+ 1 - start
, NULL
);
2847 clear_extent_dirty(unpin
, start
, end
, &cached_state
);
2848 ret
= unpin_extent_range(fs_info
, start
, end
, true);
2850 mutex_unlock(&fs_info
->unused_bg_unpin_mutex
);
2851 free_extent_state(cached_state
);
2855 if (btrfs_test_opt(fs_info
, DISCARD_ASYNC
)) {
2856 btrfs_discard_calc_delay(&fs_info
->discard_ctl
);
2857 btrfs_discard_schedule_work(&fs_info
->discard_ctl
, true);
2861 * Transaction is finished. We don't need the lock anymore. We
2862 * do need to clean up the block groups in case of a transaction
2865 deleted_bgs
= &trans
->transaction
->deleted_bgs
;
2866 list_for_each_entry_safe(block_group
, tmp
, deleted_bgs
, bg_list
) {
2870 if (!TRANS_ABORTED(trans
))
2871 ret
= btrfs_discard_extent(fs_info
,
2873 block_group
->length
,
2876 list_del_init(&block_group
->bg_list
);
2877 btrfs_unfreeze_block_group(block_group
);
2878 btrfs_put_block_group(block_group
);
2881 const char *errstr
= btrfs_decode_error(ret
);
2883 "discard failed while removing blockgroup: errno=%d %s",
/*
 * Parse an extent item's inline extents looking for a simple quotas owner ref.
 *
 * @fs_info:	the btrfs_fs_info for this mount
 * @leaf:	a leaf in the extent tree containing the extent item
 * @slot:	the slot in the leaf where the extent item is found
 *
 * Returns the objectid of the root that originally allocated the extent item
 * if the inline owner ref is expected and present, otherwise 0.
 *
 * If an extent item has an owner ref item, it will be the first inline ref
 * item. Therefore the logic is to check whether there are any inline ref
 * items, then check the type of the first one.
 */
u64 btrfs_get_extent_owner_root(struct btrfs_fs_info *fs_info,
				struct extent_buffer *leaf, int slot)
{
	struct btrfs_extent_item *ei;
	struct btrfs_extent_inline_ref *iref;
	struct btrfs_extent_owner_ref *oref;
	unsigned long ptr;
	unsigned long end;
	int type;

	if (!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA))
		return 0;

	ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
	ptr = (unsigned long)(ei + 1);
	end = (unsigned long)ei + btrfs_item_size(leaf, slot);

	/* No inline ref items of any kind, can't check type. */
	if (ptr == end)
		return 0;

	iref = (struct btrfs_extent_inline_ref *)ptr;
	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);

	/* We found an owner ref, get the root out of it. */
	if (type == BTRFS_EXTENT_OWNER_REF_KEY) {
		oref = (struct btrfs_extent_owner_ref *)(&iref->offset);
		return btrfs_extent_owner_ref_root_id(leaf, oref);
	}

	/* We have inline refs, but not an owner ref. */
	return 0;
}
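/*
 * Common accounting once the last reference to an extent is dropped: delete
 * csums and any raid-stripe extent for data, record the simple quota delta,
 * return the range to the free space tree and update the block group
 * counters. Any failure aborts the transaction.
 */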
static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
				     u64 bytenr, struct btrfs_squota_delta *delta)
{
	int ret;
	u64 num_bytes = delta->num_bytes;

	if (delta->is_data) {
		struct btrfs_root *csum_root;

		csum_root = btrfs_csum_root(trans->fs_info, bytenr);
		ret = btrfs_del_csums(trans, csum_root, bytenr, num_bytes);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			return ret;
		}

		ret = btrfs_delete_raid_extent(trans, bytenr, num_bytes);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	}

	ret = btrfs_record_squota_delta(trans->fs_info, delta);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	ret = add_to_free_space_tree(trans, bytenr, num_bytes);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	ret = btrfs_update_block_group(trans, bytenr, num_bytes, false);
	if (ret)
		btrfs_abort_transaction(trans, ret);

	return ret;
}

#define abort_and_dump(trans, path, fmt, args...)	\
({							\
	btrfs_abort_transaction(trans, -EUCLEAN);	\
	btrfs_print_leaf(path->nodes[0]);		\
	btrfs_crit(trans->fs_info, fmt, ##args);	\
})
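/*
 * abort_and_dump() is used below (e.g. in __btrfs_free_extent()) when the
 * extent tree looks corrupted: it aborts the transaction with -EUCLEAN,
 * dumps the offending leaf and logs the formatted message.
 */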
2989 * Drop one or more refs of @node.
2991 * 1. Locate the extent refs.
2992 * It's either inline in EXTENT/METADATA_ITEM or in keyed SHARED_* item.
2993 * Locate it, then reduce the refs number or remove the ref line completely.
2995 * 2. Update the refs count in EXTENT/METADATA_ITEM
2997 * Inline backref case:
2999 * in extent tree we have:
3001 * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
3002 * refs 2 gen 6 flags DATA
3003 * extent data backref root FS_TREE objectid 258 offset 0 count 1
3004 * extent data backref root FS_TREE objectid 257 offset 0 count 1
3006 * This function gets called with:
3008 * node->bytenr = 13631488
3009 * node->num_bytes = 1048576
3010 * root_objectid = FS_TREE
3011 * owner_objectid = 257
3015 * Then we should get some like:
3017 * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
3018 * refs 1 gen 6 flags DATA
3019 * extent data backref root FS_TREE objectid 258 offset 0 count 1
3021 * Keyed backref case:
3023 * in extent tree we have:
3025 * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
3026 * refs 754 gen 6 flags DATA
3028 * item 2 key (13631488 EXTENT_DATA_REF <HASH>) itemoff 3915 itemsize 28
3029 * extent data backref root FS_TREE objectid 866 offset 0 count 1
3031 * This function get called with:
3033 * node->bytenr = 13631488
3034 * node->num_bytes = 1048576
3035 * root_objectid = FS_TREE
3036 * owner_objectid = 866
3040 * Then we should get some like:
3042 * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
3043 * refs 753 gen 6 flags DATA
3045 * And that (13631488 EXTENT_DATA_REF <HASH>) gets removed.
3047 static int __btrfs_free_extent(struct btrfs_trans_handle
*trans
,
3048 struct btrfs_delayed_ref_head
*href
,
3049 struct btrfs_delayed_ref_node
*node
,
3050 struct btrfs_delayed_extent_op
*extent_op
)
3052 struct btrfs_fs_info
*info
= trans
->fs_info
;
3053 struct btrfs_key key
;
3054 struct btrfs_path
*path
;
3055 struct btrfs_root
*extent_root
;
3056 struct extent_buffer
*leaf
;
3057 struct btrfs_extent_item
*ei
;
3058 struct btrfs_extent_inline_ref
*iref
;
3061 int extent_slot
= 0;
3062 int found_extent
= 0;
3064 int refs_to_drop
= node
->ref_mod
;
3067 u64 bytenr
= node
->bytenr
;
3068 u64 num_bytes
= node
->num_bytes
;
3069 u64 owner_objectid
= btrfs_delayed_ref_owner(node
);
3070 u64 owner_offset
= btrfs_delayed_ref_offset(node
);
3071 bool skinny_metadata
= btrfs_fs_incompat(info
, SKINNY_METADATA
);
3072 u64 delayed_ref_root
= href
->owning_root
;
3074 extent_root
= btrfs_extent_root(info
, bytenr
);
3075 ASSERT(extent_root
);
3077 path
= btrfs_alloc_path();
3081 is_data
= owner_objectid
>= BTRFS_FIRST_FREE_OBJECTID
;
3083 if (!is_data
&& refs_to_drop
!= 1) {
3085 "invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u",
3086 node
->bytenr
, refs_to_drop
);
3088 btrfs_abort_transaction(trans
, ret
);
3093 skinny_metadata
= false;
3095 ret
= lookup_extent_backref(trans
, path
, &iref
, bytenr
, num_bytes
,
3096 node
->parent
, node
->ref_root
, owner_objectid
,
3100 * Either the inline backref or the SHARED_DATA_REF/
3101 * SHARED_BLOCK_REF is found
3103 * Here is a quick path to locate EXTENT/METADATA_ITEM.
3104 * It's possible the EXTENT/METADATA_ITEM is near current slot.
3106 extent_slot
= path
->slots
[0];
3107 while (extent_slot
>= 0) {
3108 btrfs_item_key_to_cpu(path
->nodes
[0], &key
,
3110 if (key
.objectid
!= bytenr
)
3112 if (key
.type
== BTRFS_EXTENT_ITEM_KEY
&&
3113 key
.offset
== num_bytes
) {
3117 if (key
.type
== BTRFS_METADATA_ITEM_KEY
&&
3118 key
.offset
== owner_objectid
) {
3123 /* Quick path didn't find the EXTENT/METADATA_ITEM */
3124 if (path
->slots
[0] - extent_slot
> 5)
3129 if (!found_extent
) {
3131 abort_and_dump(trans
, path
,
3132 "invalid iref slot %u, no EXTENT/METADATA_ITEM found but has inline extent ref",
3137 /* Must be SHARED_* item, remove the backref first */
3138 ret
= remove_extent_backref(trans
, extent_root
, path
,
3139 NULL
, refs_to_drop
, is_data
);
3141 btrfs_abort_transaction(trans
, ret
);
3144 btrfs_release_path(path
);
3146 /* Slow path to locate EXTENT/METADATA_ITEM */
3147 key
.objectid
= bytenr
;
3148 key
.type
= BTRFS_EXTENT_ITEM_KEY
;
3149 key
.offset
= num_bytes
;
3151 if (!is_data
&& skinny_metadata
) {
3152 key
.type
= BTRFS_METADATA_ITEM_KEY
;
3153 key
.offset
= owner_objectid
;
3156 ret
= btrfs_search_slot(trans
, extent_root
,
3158 if (ret
> 0 && skinny_metadata
&& path
->slots
[0]) {
3160 * Couldn't find our skinny metadata item,
3161 * see if we have ye olde extent item.
3164 btrfs_item_key_to_cpu(path
->nodes
[0], &key
,
3166 if (key
.objectid
== bytenr
&&
3167 key
.type
== BTRFS_EXTENT_ITEM_KEY
&&
3168 key
.offset
== num_bytes
)
3172 if (ret
> 0 && skinny_metadata
) {
3173 skinny_metadata
= false;
3174 key
.objectid
= bytenr
;
3175 key
.type
= BTRFS_EXTENT_ITEM_KEY
;
3176 key
.offset
= num_bytes
;
3177 btrfs_release_path(path
);
3178 ret
= btrfs_search_slot(trans
, extent_root
,
3184 btrfs_print_leaf(path
->nodes
[0]);
3186 "umm, got %d back from search, was looking for %llu, slot %d",
3187 ret
, bytenr
, path
->slots
[0]);
3190 btrfs_abort_transaction(trans
, ret
);
3193 extent_slot
= path
->slots
[0];
3195 } else if (WARN_ON(ret
== -ENOENT
)) {
3196 abort_and_dump(trans
, path
,
3197 "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu slot %d",
3198 bytenr
, node
->parent
, node
->ref_root
, owner_objectid
,
3199 owner_offset
, path
->slots
[0]);
3202 btrfs_abort_transaction(trans
, ret
);
3206 leaf
= path
->nodes
[0];
3207 item_size
= btrfs_item_size(leaf
, extent_slot
);
3208 if (unlikely(item_size
< sizeof(*ei
))) {
3210 btrfs_err(trans
->fs_info
,
3211 "unexpected extent item size, has %u expect >= %zu",
3212 item_size
, sizeof(*ei
));
3213 btrfs_abort_transaction(trans
, ret
);
3216 ei
= btrfs_item_ptr(leaf
, extent_slot
,
3217 struct btrfs_extent_item
);
3218 if (owner_objectid
< BTRFS_FIRST_FREE_OBJECTID
&&
3219 key
.type
== BTRFS_EXTENT_ITEM_KEY
) {
3220 struct btrfs_tree_block_info
*bi
;
3222 if (item_size
< sizeof(*ei
) + sizeof(*bi
)) {
3223 abort_and_dump(trans
, path
,
3224 "invalid extent item size for key (%llu, %u, %llu) slot %u owner %llu, has %u expect >= %zu",
3225 key
.objectid
, key
.type
, key
.offset
,
3226 path
->slots
[0], owner_objectid
, item_size
,
3227 sizeof(*ei
) + sizeof(*bi
));
3231 bi
= (struct btrfs_tree_block_info
*)(ei
+ 1);
3232 WARN_ON(owner_objectid
!= btrfs_tree_block_level(leaf
, bi
));
3235 refs
= btrfs_extent_refs(leaf
, ei
);
3236 if (refs
< refs_to_drop
) {
3237 abort_and_dump(trans
, path
,
3238 "trying to drop %d refs but we only have %llu for bytenr %llu slot %u",
3239 refs_to_drop
, refs
, bytenr
, path
->slots
[0]);
3243 refs
-= refs_to_drop
;
3247 __run_delayed_extent_op(extent_op
, leaf
, ei
);
3249 * In the case of inline back ref, reference count will
3250 * be updated by remove_extent_backref
3253 if (!found_extent
) {
3254 abort_and_dump(trans
, path
,
3255 "invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found, slot %u",
3261 btrfs_set_extent_refs(leaf
, ei
, refs
);
3262 btrfs_mark_buffer_dirty(trans
, leaf
);
3265 ret
= remove_extent_backref(trans
, extent_root
, path
,
3266 iref
, refs_to_drop
, is_data
);
3268 btrfs_abort_transaction(trans
, ret
);
3273 struct btrfs_squota_delta delta
= {
3274 .root
= delayed_ref_root
,
3275 .num_bytes
= num_bytes
,
3278 .generation
= btrfs_extent_generation(leaf
, ei
),
3281 /* In this branch refs == 1 */
3283 if (is_data
&& refs_to_drop
!=
3284 extent_data_ref_count(path
, iref
)) {
3285 abort_and_dump(trans
, path
,
3286 "invalid refs_to_drop, current refs %u refs_to_drop %u slot %u",
3287 extent_data_ref_count(path
, iref
),
3288 refs_to_drop
, path
->slots
[0]);
3293 if (path
->slots
[0] != extent_slot
) {
3294 abort_and_dump(trans
, path
,
3295 "invalid iref, extent item key (%llu %u %llu) slot %u doesn't have wanted iref",
3296 key
.objectid
, key
.type
,
3297 key
.offset
, path
->slots
[0]);
3303 * No inline ref, we must be at SHARED_* item,
3304 * And it's single ref, it must be:
3305 * | extent_slot ||extent_slot + 1|
3306 * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ]
3308 if (path
->slots
[0] != extent_slot
+ 1) {
3309 abort_and_dump(trans
, path
,
3310 "invalid SHARED_* item slot %u, previous item is not EXTENT/METADATA_ITEM",
3315 path
->slots
[0] = extent_slot
;
3320 * We can't infer the data owner from the delayed ref, so we need
3321 * to try to get it from the owning ref item.
3323 * If it is not present, then that extent was not written under
3324 * simple quotas mode, so we don't need to account for its deletion.
3327 delta
.root
= btrfs_get_extent_owner_root(trans
->fs_info
,
3330 ret
= btrfs_del_items(trans
, extent_root
, path
, path
->slots
[0],
3333 btrfs_abort_transaction(trans
, ret
);
3336 btrfs_release_path(path
);
3338 ret
= do_free_extent_accounting(trans
, bytenr
, &delta
);
3340 btrfs_release_path(path
);
3343 btrfs_free_path(path
);
3348 * when we free an block, it is possible (and likely) that we free the last
3349 * delayed ref for that extent as well. This searches the delayed ref tree for
3350 * a given extent, and if there are no other delayed refs to be processed, it
3351 * removes it from the tree.
3353 static noinline
int check_ref_cleanup(struct btrfs_trans_handle
*trans
,
3356 struct btrfs_fs_info
*fs_info
= trans
->fs_info
;
3357 struct btrfs_delayed_ref_head
*head
;
3358 struct btrfs_delayed_ref_root
*delayed_refs
;
3361 delayed_refs
= &trans
->transaction
->delayed_refs
;
3362 spin_lock(&delayed_refs
->lock
);
3363 head
= btrfs_find_delayed_ref_head(fs_info
, delayed_refs
, bytenr
);
3365 goto out_delayed_unlock
;
3367 spin_lock(&head
->lock
);
3368 if (!RB_EMPTY_ROOT(&head
->ref_tree
.rb_root
))
3371 if (cleanup_extent_op(head
) != NULL
)
3375 * waiting for the lock here would deadlock. If someone else has it
3376 * locked they are already in the process of dropping it anyway
3378 if (!mutex_trylock(&head
->mutex
))
3381 btrfs_delete_ref_head(fs_info
, delayed_refs
, head
);
3382 head
->processing
= false;
3384 spin_unlock(&head
->lock
);
3385 spin_unlock(&delayed_refs
->lock
);
3387 BUG_ON(head
->extent_op
);
3388 if (head
->must_insert_reserved
)
3391 btrfs_cleanup_ref_head_accounting(fs_info
, delayed_refs
, head
);
3392 mutex_unlock(&head
->mutex
);
3393 btrfs_put_delayed_ref_head(head
);
3396 spin_unlock(&head
->lock
);
3399 spin_unlock(&delayed_refs
->lock
);
3403 int btrfs_free_tree_block(struct btrfs_trans_handle
*trans
,
3405 struct extent_buffer
*buf
,
3406 u64 parent
, int last_ref
)
3408 struct btrfs_fs_info
*fs_info
= trans
->fs_info
;
3409 struct btrfs_block_group
*bg
;
3412 if (root_id
!= BTRFS_TREE_LOG_OBJECTID
) {
3413 struct btrfs_ref generic_ref
= {
3414 .action
= BTRFS_DROP_DELAYED_REF
,
3415 .bytenr
= buf
->start
,
3416 .num_bytes
= buf
->len
,
3418 .owning_root
= btrfs_header_owner(buf
),
3419 .ref_root
= root_id
,
3423 * Assert that the extent buffer is not cleared due to
3424 * EXTENT_BUFFER_ZONED_ZEROOUT. Please refer
3425 * btrfs_clear_buffer_dirty() and btree_csum_one_bio() for
3428 ASSERT(btrfs_header_bytenr(buf
) != 0);
3430 btrfs_init_tree_ref(&generic_ref
, btrfs_header_level(buf
), 0, false);
3431 btrfs_ref_tree_mod(fs_info
, &generic_ref
);
3432 ret
= btrfs_add_delayed_tree_ref(trans
, &generic_ref
, NULL
);
3440 if (btrfs_header_generation(buf
) != trans
->transid
)
3443 if (root_id
!= BTRFS_TREE_LOG_OBJECTID
) {
3444 ret
= check_ref_cleanup(trans
, buf
->start
);
3449 bg
= btrfs_lookup_block_group(fs_info
, buf
->start
);
3451 if (btrfs_header_flag(buf
, BTRFS_HEADER_FLAG_WRITTEN
)) {
3452 pin_down_extent(trans
, bg
, buf
->start
, buf
->len
, 1);
3453 btrfs_put_block_group(bg
);
3458 * If there are tree mod log users we may have recorded mod log
3459 * operations for this node. If we re-allocate this node we
3460 * could replay operations on this node that happened when it
3461 * existed in a completely different root. For example if it
3462 * was part of root A, then was reallocated to root B, and we
3463 * are doing a btrfs_old_search_slot(root b), we could replay
3464 * operations that happened when the block was part of root A,
3465 * giving us an inconsistent view of the btree.
3467 * We are safe from races here because at this point no other
3468 * node or root points to this extent buffer, so if after this
3469 * check a new tree mod log user joins we will not have an
3470 * existing log of operations on this node that we have to
3474 if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS
, &fs_info
->flags
)
3475 || btrfs_is_zoned(fs_info
)) {
3476 pin_down_extent(trans
, bg
, buf
->start
, buf
->len
, 1);
3477 btrfs_put_block_group(bg
);
3481 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY
, &buf
->bflags
));
3483 btrfs_add_free_space(bg
, buf
->start
, buf
->len
);
3484 btrfs_free_reserved_bytes(bg
, buf
->len
, 0);
3485 btrfs_put_block_group(bg
);
3486 trace_btrfs_reserved_extent_free(fs_info
, buf
->start
, buf
->len
);
3491 * Deleting the buffer, clear the corrupt flag since it doesn't
3494 clear_bit(EXTENT_BUFFER_CORRUPT
, &buf
->bflags
);
3498 /* Can return -ENOMEM */
3499 int btrfs_free_extent(struct btrfs_trans_handle
*trans
, struct btrfs_ref
*ref
)
3501 struct btrfs_fs_info
*fs_info
= trans
->fs_info
;
3504 if (btrfs_is_testing(fs_info
))
3508 * tree log blocks never actually go into the extent allocation
3509 * tree, just update pinning info and exit early.
3511 if (ref
->ref_root
== BTRFS_TREE_LOG_OBJECTID
) {
3512 btrfs_pin_extent(trans
, ref
->bytenr
, ref
->num_bytes
, 1);
3514 } else if (ref
->type
== BTRFS_REF_METADATA
) {
3515 ret
= btrfs_add_delayed_tree_ref(trans
, ref
, NULL
);
3517 ret
= btrfs_add_delayed_data_ref(trans
, ref
, 0);
3520 if (ref
->ref_root
!= BTRFS_TREE_LOG_OBJECTID
)
3521 btrfs_ref_tree_mod(fs_info
, ref
);
3526 enum btrfs_loop_type
{
3528 * Start caching block groups but do not wait for progress or for them
3531 LOOP_CACHING_NOWAIT
,
3534 * Wait for the block group free_space >= the space we're waiting for if
3535 * the block group isn't cached.
3540 * Allow allocations to happen from block groups that do not yet have a
3541 * size classification.
3543 LOOP_UNSET_SIZE_CLASS
,
3546 * Allocate a chunk and then retry the allocation.
3551 * Ignore the size class restrictions for this allocation.
3553 LOOP_WRONG_SIZE_CLASS
,
3556 * Ignore the empty size, only try to allocate the number of bytes
3557 * needed for this allocation.
3563 btrfs_lock_block_group(struct btrfs_block_group
*cache
,
3567 down_read(&cache
->data_rwsem
);
3570 static inline void btrfs_grab_block_group(struct btrfs_block_group
*cache
,
3573 btrfs_get_block_group(cache
);
3575 down_read(&cache
->data_rwsem
);
3578 static struct btrfs_block_group
*btrfs_lock_cluster(
3579 struct btrfs_block_group
*block_group
,
3580 struct btrfs_free_cluster
*cluster
,
3582 __acquires(&cluster
->refill_lock
)
3584 struct btrfs_block_group
*used_bg
= NULL
;
3586 spin_lock(&cluster
->refill_lock
);
3588 used_bg
= cluster
->block_group
;
3592 if (used_bg
== block_group
)
3595 btrfs_get_block_group(used_bg
);
3600 if (down_read_trylock(&used_bg
->data_rwsem
))
3603 spin_unlock(&cluster
->refill_lock
);
3605 /* We should only have one-level nested. */
3606 down_read_nested(&used_bg
->data_rwsem
, SINGLE_DEPTH_NESTING
);
3608 spin_lock(&cluster
->refill_lock
);
3609 if (used_bg
== cluster
->block_group
)
3612 up_read(&used_bg
->data_rwsem
);
3613 btrfs_put_block_group(used_bg
);
3618 btrfs_release_block_group(struct btrfs_block_group
*cache
,
3622 up_read(&cache
->data_rwsem
);
3623 btrfs_put_block_group(cache
);
3627 * Helper function for find_free_extent().
3629 * Return -ENOENT to inform caller that we need fallback to unclustered mode.
3630 * Return >0 to inform caller that we find nothing
3631 * Return 0 means we have found a location and set ffe_ctl->found_offset.
3633 static int find_free_extent_clustered(struct btrfs_block_group
*bg
,
3634 struct find_free_extent_ctl
*ffe_ctl
,
3635 struct btrfs_block_group
**cluster_bg_ret
)
3637 struct btrfs_block_group
*cluster_bg
;
3638 struct btrfs_free_cluster
*last_ptr
= ffe_ctl
->last_ptr
;
3639 u64 aligned_cluster
;
3643 cluster_bg
= btrfs_lock_cluster(bg
, last_ptr
, ffe_ctl
->delalloc
);
3645 goto refill_cluster
;
3646 if (cluster_bg
!= bg
&& (cluster_bg
->ro
||
3647 !block_group_bits(cluster_bg
, ffe_ctl
->flags
)))
3648 goto release_cluster
;
3650 offset
= btrfs_alloc_from_cluster(cluster_bg
, last_ptr
,
3651 ffe_ctl
->num_bytes
, cluster_bg
->start
,
3652 &ffe_ctl
->max_extent_size
);
3654 /* We have a block, we're done */
3655 spin_unlock(&last_ptr
->refill_lock
);
3656 trace_btrfs_reserve_extent_cluster(cluster_bg
, ffe_ctl
);
3657 *cluster_bg_ret
= cluster_bg
;
3658 ffe_ctl
->found_offset
= offset
;
3661 WARN_ON(last_ptr
->block_group
!= cluster_bg
);
3665 * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new clusters, so
3666 * lets just skip it and let the allocator find whatever block it can
3667 * find. If we reach this point, we will have tried the cluster
3668 * allocator plenty of times and not have found anything, so we are
3669 * likely way too fragmented for the clustering stuff to find anything.
3671 * However, if the cluster is taken from the current block group,
3672 * release the cluster first, so that we stand a better chance of
3673 * succeeding in the unclustered allocation.
3675 if (ffe_ctl
->loop
>= LOOP_NO_EMPTY_SIZE
&& cluster_bg
!= bg
) {
3676 spin_unlock(&last_ptr
->refill_lock
);
3677 btrfs_release_block_group(cluster_bg
, ffe_ctl
->delalloc
);
3681 /* This cluster didn't work out, free it and start over */
3682 btrfs_return_cluster_to_free_space(NULL
, last_ptr
);
3684 if (cluster_bg
!= bg
)
3685 btrfs_release_block_group(cluster_bg
, ffe_ctl
->delalloc
);
3688 if (ffe_ctl
->loop
>= LOOP_NO_EMPTY_SIZE
) {
3689 spin_unlock(&last_ptr
->refill_lock
);
3693 aligned_cluster
= max_t(u64
,
3694 ffe_ctl
->empty_cluster
+ ffe_ctl
->empty_size
,
3695 bg
->full_stripe_len
);
3696 ret
= btrfs_find_space_cluster(bg
, last_ptr
, ffe_ctl
->search_start
,
3697 ffe_ctl
->num_bytes
, aligned_cluster
);
3699 /* Now pull our allocation out of this cluster */
3700 offset
= btrfs_alloc_from_cluster(bg
, last_ptr
,
3701 ffe_ctl
->num_bytes
, ffe_ctl
->search_start
,
3702 &ffe_ctl
->max_extent_size
);
3704 /* We found one, proceed */
3705 spin_unlock(&last_ptr
->refill_lock
);
3706 ffe_ctl
->found_offset
= offset
;
3707 trace_btrfs_reserve_extent_cluster(bg
, ffe_ctl
);
3712 * At this point we either didn't find a cluster or we weren't able to
3713 * allocate a block from our cluster. Free the cluster we've been
3714 * trying to use, and go to the next block group.
3716 btrfs_return_cluster_to_free_space(NULL
, last_ptr
);
3717 spin_unlock(&last_ptr
->refill_lock
);
3722 * Return >0 to inform caller that we find nothing
3723 * Return 0 when we found an free extent and set ffe_ctrl->found_offset
3725 static int find_free_extent_unclustered(struct btrfs_block_group
*bg
,
3726 struct find_free_extent_ctl
*ffe_ctl
)
3728 struct btrfs_free_cluster
*last_ptr
= ffe_ctl
->last_ptr
;
3732 * We are doing an unclustered allocation, set the fragmented flag so
3733 * we don't bother trying to setup a cluster again until we get more
3736 if (unlikely(last_ptr
)) {
3737 spin_lock(&last_ptr
->lock
);
3738 last_ptr
->fragmented
= 1;
3739 spin_unlock(&last_ptr
->lock
);
3741 if (ffe_ctl
->cached
) {
3742 struct btrfs_free_space_ctl
*free_space_ctl
;
3744 free_space_ctl
= bg
->free_space_ctl
;
3745 spin_lock(&free_space_ctl
->tree_lock
);
3746 if (free_space_ctl
->free_space
<
3747 ffe_ctl
->num_bytes
+ ffe_ctl
->empty_cluster
+
3748 ffe_ctl
->empty_size
) {
3749 ffe_ctl
->total_free_space
= max_t(u64
,
3750 ffe_ctl
->total_free_space
,
3751 free_space_ctl
->free_space
);
3752 spin_unlock(&free_space_ctl
->tree_lock
);
3755 spin_unlock(&free_space_ctl
->tree_lock
);
3758 offset
= btrfs_find_space_for_alloc(bg
, ffe_ctl
->search_start
,
3759 ffe_ctl
->num_bytes
, ffe_ctl
->empty_size
,
3760 &ffe_ctl
->max_extent_size
);
3763 ffe_ctl
->found_offset
= offset
;
3767 static int do_allocation_clustered(struct btrfs_block_group
*block_group
,
3768 struct find_free_extent_ctl
*ffe_ctl
,
3769 struct btrfs_block_group
**bg_ret
)
3773 /* We want to try and use the cluster allocator, so lets look there */
3774 if (ffe_ctl
->last_ptr
&& ffe_ctl
->use_cluster
) {
3775 ret
= find_free_extent_clustered(block_group
, ffe_ctl
, bg_ret
);
3778 /* ret == -ENOENT case falls through */
3781 return find_free_extent_unclustered(block_group
, ffe_ctl
);
3785 * Tree-log block group locking
3786 * ============================
3788 * fs_info::treelog_bg_lock protects the fs_info::treelog_bg which
3789 * indicates the starting address of a block group, which is reserved only
3790 * for tree-log metadata.
3797 * fs_info::treelog_bg_lock
3801 * Simple allocator for sequential-only block group. It only allows sequential
3802 * allocation. No need to play with trees. This function also reserves the
3803 * bytes as in btrfs_add_reserved_bytes.
3805 static int do_allocation_zoned(struct btrfs_block_group
*block_group
,
3806 struct find_free_extent_ctl
*ffe_ctl
,
3807 struct btrfs_block_group
**bg_ret
)
3809 struct btrfs_fs_info
*fs_info
= block_group
->fs_info
;
3810 struct btrfs_space_info
*space_info
= block_group
->space_info
;
3811 struct btrfs_free_space_ctl
*ctl
= block_group
->free_space_ctl
;
3812 u64 start
= block_group
->start
;
3813 u64 num_bytes
= ffe_ctl
->num_bytes
;
3815 u64 bytenr
= block_group
->start
;
3817 u64 data_reloc_bytenr
;
3821 ASSERT(btrfs_is_zoned(block_group
->fs_info
));
3824 * Do not allow non-tree-log blocks in the dedicated tree-log block
3825 * group, and vice versa.
3827 spin_lock(&fs_info
->treelog_bg_lock
);
3828 log_bytenr
= fs_info
->treelog_bg
;
3829 if (log_bytenr
&& ((ffe_ctl
->for_treelog
&& bytenr
!= log_bytenr
) ||
3830 (!ffe_ctl
->for_treelog
&& bytenr
== log_bytenr
)))
3832 spin_unlock(&fs_info
->treelog_bg_lock
);
3837 * Do not allow non-relocation blocks in the dedicated relocation block
3838 * group, and vice versa.
3840 spin_lock(&fs_info
->relocation_bg_lock
);
3841 data_reloc_bytenr
= fs_info
->data_reloc_bg
;
3842 if (data_reloc_bytenr
&&
3843 ((ffe_ctl
->for_data_reloc
&& bytenr
!= data_reloc_bytenr
) ||
3844 (!ffe_ctl
->for_data_reloc
&& bytenr
== data_reloc_bytenr
)))
3846 spin_unlock(&fs_info
->relocation_bg_lock
);
3850 /* Check RO and no space case before trying to activate it */
3851 spin_lock(&block_group
->lock
);
3852 if (block_group
->ro
|| btrfs_zoned_bg_is_full(block_group
)) {
3855 * May need to clear fs_info->{treelog,data_reloc}_bg.
3856 * Return the error after taking the locks.
3859 spin_unlock(&block_group
->lock
);
3861 /* Metadata block group is activated at write time. */
3862 if (!ret
&& (block_group
->flags
& BTRFS_BLOCK_GROUP_DATA
) &&
3863 !btrfs_zone_activate(block_group
)) {
3866 * May need to clear fs_info->{treelog,data_reloc}_bg.
3867 * Return the error after taking the locks.
3871 spin_lock(&space_info
->lock
);
3872 spin_lock(&block_group
->lock
);
3873 spin_lock(&fs_info
->treelog_bg_lock
);
3874 spin_lock(&fs_info
->relocation_bg_lock
);
3879 ASSERT(!ffe_ctl
->for_treelog
||
3880 block_group
->start
== fs_info
->treelog_bg
||
3881 fs_info
->treelog_bg
== 0);
3882 ASSERT(!ffe_ctl
->for_data_reloc
||
3883 block_group
->start
== fs_info
->data_reloc_bg
||
3884 fs_info
->data_reloc_bg
== 0);
3886 if (block_group
->ro
||
3887 (!ffe_ctl
->for_data_reloc
&&
3888 test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC
, &block_group
->runtime_flags
))) {
3894 * Do not allow currently using block group to be tree-log dedicated
3897 if (ffe_ctl
->for_treelog
&& !fs_info
->treelog_bg
&&
3898 (block_group
->used
|| block_group
->reserved
)) {
3904 * Do not allow currently used block group to be the data relocation
3905 * dedicated block group.
3907 if (ffe_ctl
->for_data_reloc
&& !fs_info
->data_reloc_bg
&&
3908 (block_group
->used
|| block_group
->reserved
)) {
3913 WARN_ON_ONCE(block_group
->alloc_offset
> block_group
->zone_capacity
);
3914 avail
= block_group
->zone_capacity
- block_group
->alloc_offset
;
3915 if (avail
< num_bytes
) {
3916 if (ffe_ctl
->max_extent_size
< avail
) {
3918 * With sequential allocator, free space is always
3921 ffe_ctl
->max_extent_size
= avail
;
3922 ffe_ctl
->total_free_space
= avail
;
3928 if (ffe_ctl
->for_treelog
&& !fs_info
->treelog_bg
)
3929 fs_info
->treelog_bg
= block_group
->start
;
3931 if (ffe_ctl
->for_data_reloc
) {
3932 if (!fs_info
->data_reloc_bg
)
3933 fs_info
->data_reloc_bg
= block_group
->start
;
3935 * Do not allow allocations from this block group, unless it is
3936 * for data relocation. Compared to increasing the ->ro, setting
3937 * the ->zoned_data_reloc_ongoing flag still allows nocow
3938 * writers to come in. See btrfs_inc_nocow_writers().
3940 * We need to disable an allocation to avoid an allocation of
3941 * regular (non-relocation data) extent. With mix of relocation
3942 * extents and regular extents, we can dispatch WRITE commands
3943 * (for relocation extents) and ZONE APPEND commands (for
3944 * regular extents) at the same time to the same zone, which
3945 * easily break the write pointer.
3947 * Also, this flag avoids this block group to be zone finished.
3949 set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC
, &block_group
->runtime_flags
);
3952 ffe_ctl
->found_offset
= start
+ block_group
->alloc_offset
;
3953 block_group
->alloc_offset
+= num_bytes
;
3954 spin_lock(&ctl
->tree_lock
);
3955 ctl
->free_space
-= num_bytes
;
3956 spin_unlock(&ctl
->tree_lock
);
3959 * We do not check if found_offset is aligned to stripesize. The
3960 * address is anyway rewritten when using zone append writing.
3963 ffe_ctl
->search_start
= ffe_ctl
->found_offset
;
3966 if (ret
&& ffe_ctl
->for_treelog
)
3967 fs_info
->treelog_bg
= 0;
3968 if (ret
&& ffe_ctl
->for_data_reloc
)
3969 fs_info
->data_reloc_bg
= 0;
3970 spin_unlock(&fs_info
->relocation_bg_lock
);
3971 spin_unlock(&fs_info
->treelog_bg_lock
);
3972 spin_unlock(&block_group
->lock
);
3973 spin_unlock(&space_info
->lock
);
3977 static int do_allocation(struct btrfs_block_group
*block_group
,
3978 struct find_free_extent_ctl
*ffe_ctl
,
3979 struct btrfs_block_group
**bg_ret
)
3981 switch (ffe_ctl
->policy
) {
3982 case BTRFS_EXTENT_ALLOC_CLUSTERED
:
3983 return do_allocation_clustered(block_group
, ffe_ctl
, bg_ret
);
3984 case BTRFS_EXTENT_ALLOC_ZONED
:
3985 return do_allocation_zoned(block_group
, ffe_ctl
, bg_ret
);
3991 static void release_block_group(struct btrfs_block_group
*block_group
,
3992 struct find_free_extent_ctl
*ffe_ctl
,
3995 switch (ffe_ctl
->policy
) {
3996 case BTRFS_EXTENT_ALLOC_CLUSTERED
:
3997 ffe_ctl
->retry_uncached
= false;
3999 case BTRFS_EXTENT_ALLOC_ZONED
:
4006 BUG_ON(btrfs_bg_flags_to_raid_index(block_group
->flags
) !=
4008 btrfs_release_block_group(block_group
, delalloc
);
4011 static void found_extent_clustered(struct find_free_extent_ctl
*ffe_ctl
,
4012 struct btrfs_key
*ins
)
4014 struct btrfs_free_cluster
*last_ptr
= ffe_ctl
->last_ptr
;
4016 if (!ffe_ctl
->use_cluster
&& last_ptr
) {
4017 spin_lock(&last_ptr
->lock
);
4018 last_ptr
->window_start
= ins
->objectid
;
4019 spin_unlock(&last_ptr
->lock
);
4023 static void found_extent(struct find_free_extent_ctl
*ffe_ctl
,
4024 struct btrfs_key
*ins
)
4026 switch (ffe_ctl
->policy
) {
4027 case BTRFS_EXTENT_ALLOC_CLUSTERED
:
4028 found_extent_clustered(ffe_ctl
, ins
);
4030 case BTRFS_EXTENT_ALLOC_ZONED
:
4038 static int can_allocate_chunk_zoned(struct btrfs_fs_info
*fs_info
,
4039 struct find_free_extent_ctl
*ffe_ctl
)
4041 /* Block group's activeness is not a requirement for METADATA block groups. */
4042 if (!(ffe_ctl
->flags
& BTRFS_BLOCK_GROUP_DATA
))
4045 /* If we can activate new zone, just allocate a chunk and use it */
4046 if (btrfs_can_activate_zone(fs_info
->fs_devices
, ffe_ctl
->flags
))
4050 * We already reached the max active zones. Try to finish one block
4051 * group to make a room for a new block group. This is only possible
4052 * for a data block group because btrfs_zone_finish() may need to wait
4053 * for a running transaction which can cause a deadlock for metadata
4056 if (ffe_ctl
->flags
& BTRFS_BLOCK_GROUP_DATA
) {
4057 int ret
= btrfs_zone_finish_one_bg(fs_info
);
4066 * If we have enough free space left in an already active block group
4067 * and we can't activate any other zone now, do not allow allocating a
4068 * new chunk and let find_free_extent() retry with a smaller size.
4070 if (ffe_ctl
->max_extent_size
>= ffe_ctl
->min_alloc_size
)
4074 * Even min_alloc_size is not left in any block groups. Since we cannot
4075 * activate a new block group, allocating it may not help. Let's tell a
4076 * caller to try again and hope it progress something by writing some
4077 * parts of the region. That is only possible for data block groups,
4078 * where a part of the region can be written.
4080 if (ffe_ctl
->flags
& BTRFS_BLOCK_GROUP_DATA
)
4084 * We cannot activate a new block group and no enough space left in any
4085 * block groups. So, allocating a new block group may not help. But,
4086 * there is nothing to do anyway, so let's go with it.
4091 static int can_allocate_chunk(struct btrfs_fs_info
*fs_info
,
4092 struct find_free_extent_ctl
*ffe_ctl
)
4094 switch (ffe_ctl
->policy
) {
4095 case BTRFS_EXTENT_ALLOC_CLUSTERED
:
4097 case BTRFS_EXTENT_ALLOC_ZONED
:
4098 return can_allocate_chunk_zoned(fs_info
, ffe_ctl
);
4105 * Return >0 means caller needs to re-search for free extent
4106 * Return 0 means we have the needed free extent.
4107 * Return <0 means we failed to locate any free extent.
4109 static int find_free_extent_update_loop(struct btrfs_fs_info
*fs_info
,
4110 struct btrfs_key
*ins
,
4111 struct find_free_extent_ctl
*ffe_ctl
,
4114 struct btrfs_root
*root
= fs_info
->chunk_root
;
4117 if ((ffe_ctl
->loop
== LOOP_CACHING_NOWAIT
) &&
4118 ffe_ctl
->have_caching_bg
&& !ffe_ctl
->orig_have_caching_bg
)
4119 ffe_ctl
->orig_have_caching_bg
= true;
4121 if (ins
->objectid
) {
4122 found_extent(ffe_ctl
, ins
);
4126 if (ffe_ctl
->loop
>= LOOP_CACHING_WAIT
&& ffe_ctl
->have_caching_bg
)
4130 if (ffe_ctl
->index
< BTRFS_NR_RAID_TYPES
)
4133 /* See the comments for btrfs_loop_type for an explanation of the phases. */
4134 if (ffe_ctl
->loop
< LOOP_NO_EMPTY_SIZE
) {
4137 * We want to skip the LOOP_CACHING_WAIT step if we don't have
4138 * any uncached bgs and we've already done a full search
4141 if (ffe_ctl
->loop
== LOOP_CACHING_NOWAIT
&&
4142 (!ffe_ctl
->orig_have_caching_bg
&& full_search
))
4146 if (ffe_ctl
->loop
== LOOP_ALLOC_CHUNK
) {
4147 struct btrfs_trans_handle
*trans
;
4150 /* Check if allocation policy allows to create a new chunk */
4151 ret
= can_allocate_chunk(fs_info
, ffe_ctl
);
4155 trans
= current
->journal_info
;
4159 trans
= btrfs_join_transaction(root
);
4161 if (IS_ERR(trans
)) {
4162 ret
= PTR_ERR(trans
);
4166 ret
= btrfs_chunk_alloc(trans
, ffe_ctl
->flags
,
4167 CHUNK_ALLOC_FORCE_FOR_EXTENT
);
4169 /* Do not bail out on ENOSPC since we can do more. */
4170 if (ret
== -ENOSPC
) {
4175 btrfs_abort_transaction(trans
, ret
);
4179 btrfs_end_transaction(trans
);
4184 if (ffe_ctl
->loop
== LOOP_NO_EMPTY_SIZE
) {
4185 if (ffe_ctl
->policy
!= BTRFS_EXTENT_ALLOC_CLUSTERED
)
4189 * Don't loop again if we already have no empty_size and
4192 if (ffe_ctl
->empty_size
== 0 &&
4193 ffe_ctl
->empty_cluster
== 0)
4195 ffe_ctl
->empty_size
= 0;
4196 ffe_ctl
->empty_cluster
= 0;
4203 static bool find_free_extent_check_size_class(struct find_free_extent_ctl
*ffe_ctl
,
4204 struct btrfs_block_group
*bg
)
4206 if (ffe_ctl
->policy
== BTRFS_EXTENT_ALLOC_ZONED
)
4208 if (!btrfs_block_group_should_use_size_class(bg
))
4210 if (ffe_ctl
->loop
>= LOOP_WRONG_SIZE_CLASS
)
4212 if (ffe_ctl
->loop
>= LOOP_UNSET_SIZE_CLASS
&&
4213 bg
->size_class
== BTRFS_BG_SZ_NONE
)
4215 return ffe_ctl
->size_class
== bg
->size_class
;
4218 static int prepare_allocation_clustered(struct btrfs_fs_info
*fs_info
,
4219 struct find_free_extent_ctl
*ffe_ctl
,
4220 struct btrfs_space_info
*space_info
,
4221 struct btrfs_key
*ins
)
4224 * If our free space is heavily fragmented we may not be able to make
4225 * big contiguous allocations, so instead of doing the expensive search
4226 * for free space, simply return ENOSPC with our max_extent_size so we
4227 * can go ahead and search for a more manageable chunk.
4229 * If our max_extent_size is large enough for our allocation simply
4230 * disable clustering since we will likely not be able to find enough
4231 * space to create a cluster and induce latency trying.
4233 if (space_info
->max_extent_size
) {
4234 spin_lock(&space_info
->lock
);
4235 if (space_info
->max_extent_size
&&
4236 ffe_ctl
->num_bytes
> space_info
->max_extent_size
) {
4237 ins
->offset
= space_info
->max_extent_size
;
4238 spin_unlock(&space_info
->lock
);
4240 } else if (space_info
->max_extent_size
) {
4241 ffe_ctl
->use_cluster
= false;
4243 spin_unlock(&space_info
->lock
);
4246 ffe_ctl
->last_ptr
= fetch_cluster_info(fs_info
, space_info
,
4247 &ffe_ctl
->empty_cluster
);
4248 if (ffe_ctl
->last_ptr
) {
4249 struct btrfs_free_cluster
*last_ptr
= ffe_ctl
->last_ptr
;
4251 spin_lock(&last_ptr
->lock
);
4252 if (last_ptr
->block_group
)
4253 ffe_ctl
->hint_byte
= last_ptr
->window_start
;
4254 if (last_ptr
->fragmented
) {
4256 * We still set window_start so we can keep track of the
4257 * last place we found an allocation to try and save
4260 ffe_ctl
->hint_byte
= last_ptr
->window_start
;
4261 ffe_ctl
->use_cluster
= false;
4263 spin_unlock(&last_ptr
->lock
);
4269 static int prepare_allocation_zoned(struct btrfs_fs_info
*fs_info
,
4270 struct find_free_extent_ctl
*ffe_ctl
)
4272 if (ffe_ctl
->for_treelog
) {
4273 spin_lock(&fs_info
->treelog_bg_lock
);
4274 if (fs_info
->treelog_bg
)
4275 ffe_ctl
->hint_byte
= fs_info
->treelog_bg
;
4276 spin_unlock(&fs_info
->treelog_bg_lock
);
4277 } else if (ffe_ctl
->for_data_reloc
) {
4278 spin_lock(&fs_info
->relocation_bg_lock
);
4279 if (fs_info
->data_reloc_bg
)
4280 ffe_ctl
->hint_byte
= fs_info
->data_reloc_bg
;
4281 spin_unlock(&fs_info
->relocation_bg_lock
);
4282 } else if (ffe_ctl
->flags
& BTRFS_BLOCK_GROUP_DATA
) {
4283 struct btrfs_block_group
*block_group
;
4285 spin_lock(&fs_info
->zone_active_bgs_lock
);
4286 list_for_each_entry(block_group
, &fs_info
->zone_active_bgs
, active_bg_list
) {
4288 * No lock is OK here because avail is monotinically
4289 * decreasing, and this is just a hint.
4291 u64 avail
= block_group
->zone_capacity
- block_group
->alloc_offset
;
4293 if (block_group_bits(block_group
, ffe_ctl
->flags
) &&
4294 avail
>= ffe_ctl
->num_bytes
) {
4295 ffe_ctl
->hint_byte
= block_group
->start
;
4299 spin_unlock(&fs_info
->zone_active_bgs_lock
);
4305 static int prepare_allocation(struct btrfs_fs_info
*fs_info
,
4306 struct find_free_extent_ctl
*ffe_ctl
,
4307 struct btrfs_space_info
*space_info
,
4308 struct btrfs_key
*ins
)
4310 switch (ffe_ctl
->policy
) {
4311 case BTRFS_EXTENT_ALLOC_CLUSTERED
:
4312 return prepare_allocation_clustered(fs_info
, ffe_ctl
,
4314 case BTRFS_EXTENT_ALLOC_ZONED
:
4315 return prepare_allocation_zoned(fs_info
, ffe_ctl
);
4322 * walks the btree of allocated extents and find a hole of a given size.
4323 * The key ins is changed to record the hole:
4324 * ins->objectid == start position
4325 * ins->flags = BTRFS_EXTENT_ITEM_KEY
4326 * ins->offset == the size of the hole.
4327 * Any available blocks before search_start are skipped.
4329 * If there is no suitable free space, we will record the max size of
4330 * the free space extent currently.
4332 * The overall logic and call chain:
4334 * find_free_extent()
4335 * |- Iterate through all block groups
4336 * | |- Get a valid block group
4337 * | |- Try to do clustered allocation in that block group
4338 * | |- Try to do unclustered allocation in that block group
4339 * | |- Check if the result is valid
4340 * | | |- If valid, then exit
4341 * | |- Jump to next block group
4343 * |- Push harder to find free extents
4344 * |- If not found, re-iterate all block groups
static noinline int find_free_extent(struct btrfs_root *root,
				     struct btrfs_key *ins,
				     struct find_free_extent_ctl *ffe_ctl)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;
	int cache_block_group_error = 0;
	struct btrfs_block_group *block_group = NULL;
	struct btrfs_space_info *space_info;
	bool full_search = false;

	WARN_ON(ffe_ctl->num_bytes < fs_info->sectorsize);

	ffe_ctl->search_start = 0;
	/* For clustered allocation */
	ffe_ctl->empty_cluster = 0;
	ffe_ctl->last_ptr = NULL;
	ffe_ctl->use_cluster = true;
	ffe_ctl->have_caching_bg = false;
	ffe_ctl->orig_have_caching_bg = false;
	ffe_ctl->index = btrfs_bg_flags_to_raid_index(ffe_ctl->flags);
	ffe_ctl->loop = 0;
	ffe_ctl->retry_uncached = false;
	ffe_ctl->cached = 0;
	ffe_ctl->max_extent_size = 0;
	ffe_ctl->total_free_space = 0;
	ffe_ctl->found_offset = 0;
	ffe_ctl->policy = BTRFS_EXTENT_ALLOC_CLUSTERED;
	ffe_ctl->size_class = btrfs_calc_block_group_size_class(ffe_ctl->num_bytes);

	if (btrfs_is_zoned(fs_info))
		ffe_ctl->policy = BTRFS_EXTENT_ALLOC_ZONED;

	ins->type = BTRFS_EXTENT_ITEM_KEY;
	ins->objectid = 0;
	ins->offset = 0;

	trace_find_free_extent(root, ffe_ctl);

	space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags);
	if (!space_info) {
		btrfs_err(fs_info, "No space info for %llu", ffe_ctl->flags);
		return -ENOSPC;
	}

	ret = prepare_allocation(fs_info, ffe_ctl, space_info, ins);
	if (ret < 0)
		return ret;

	ffe_ctl->search_start = max(ffe_ctl->search_start,
				    first_logical_byte(fs_info));
	ffe_ctl->search_start = max(ffe_ctl->search_start, ffe_ctl->hint_byte);
	if (ffe_ctl->search_start == ffe_ctl->hint_byte) {
		block_group = btrfs_lookup_block_group(fs_info,
						       ffe_ctl->search_start);
		/*
		 * we don't want to use the block group if it doesn't match our
		 * allocation bits, or if it's not cached.
		 *
		 * However if we are re-searching with an ideal block group
		 * picked out then we don't care that the block group is cached.
		 */
		if (block_group && block_group_bits(block_group, ffe_ctl->flags) &&
		    block_group->cached != BTRFS_CACHE_NO) {
			down_read(&space_info->groups_sem);
			if (list_empty(&block_group->list) ||
			    block_group->ro) {
				/*
				 * someone is removing this block group,
				 * we can't jump into the have_block_group
				 * target because our list pointers are not
				 * valid
				 */
				btrfs_put_block_group(block_group);
				up_read(&space_info->groups_sem);
			} else {
				ffe_ctl->index = btrfs_bg_flags_to_raid_index(
							block_group->flags);
				btrfs_lock_block_group(block_group,
						       ffe_ctl->delalloc);
				ffe_ctl->hinted = true;
				goto have_block_group;
			}
		} else if (block_group) {
			btrfs_put_block_group(block_group);
		}
	}
search:
	trace_find_free_extent_search_loop(root, ffe_ctl);
	ffe_ctl->have_caching_bg = false;
	if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) ||
	    ffe_ctl->index == 0)
		full_search = true;
	down_read(&space_info->groups_sem);
	list_for_each_entry(block_group,
			    &space_info->block_groups[ffe_ctl->index], list) {
		struct btrfs_block_group *bg_ret;

		ffe_ctl->hinted = false;
		/* If the block group is read-only, we can skip it entirely. */
		if (unlikely(block_group->ro)) {
			if (ffe_ctl->for_treelog)
				btrfs_clear_treelog_bg(block_group);
			if (ffe_ctl->for_data_reloc)
				btrfs_clear_data_reloc_bg(block_group);
			continue;
		}

		btrfs_grab_block_group(block_group, ffe_ctl->delalloc);
		ffe_ctl->search_start = block_group->start;

		/*
		 * this can happen if we end up cycling through all the
		 * raid types, but we want to make sure we only allocate
		 * for the proper type.
		 */
		if (!block_group_bits(block_group, ffe_ctl->flags)) {
			u64 extra = BTRFS_BLOCK_GROUP_DUP |
				    BTRFS_BLOCK_GROUP_RAID1_MASK |
				    BTRFS_BLOCK_GROUP_RAID56_MASK |
				    BTRFS_BLOCK_GROUP_RAID10;

			/*
			 * if they asked for extra copies and this block group
			 * doesn't provide them, bail.  This does allow us to
			 * fill raid0 from raid1.
			 */
			if ((ffe_ctl->flags & extra) && !(block_group->flags & extra))
				goto loop;

			/*
			 * This block group has different flags than we want.
			 * It's possible that we have MIXED_GROUP flag but no
			 * block group is mixed.  Just skip such block group.
			 */
			btrfs_release_block_group(block_group, ffe_ctl->delalloc);
			continue;
		}

have_block_group:
		trace_find_free_extent_have_block_group(root, ffe_ctl, block_group);
		ffe_ctl->cached = btrfs_block_group_done(block_group);
		if (unlikely(!ffe_ctl->cached)) {
			ffe_ctl->have_caching_bg = true;
			ret = btrfs_cache_block_group(block_group, false);

			/*
			 * If we get ENOMEM here or something else we want to
			 * try other block groups, because it may not be fatal.
			 * However if we can't find anything else we need to
			 * save our return here so that we return the actual
			 * error that caused problems, not ENOSPC.
			 */
			if (ret < 0) {
				if (!cache_block_group_error)
					cache_block_group_error = ret;
				ret = 0;
				goto loop;
			}
			ret = 0;
		}

		if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) {
			if (!cache_block_group_error)
				cache_block_group_error = -EIO;
			goto loop;
		}

		if (!find_free_extent_check_size_class(ffe_ctl, block_group))
			goto loop;

		ret = do_allocation(block_group, ffe_ctl, &bg_ret);
		if (ret > 0)
			goto loop;

		if (bg_ret && bg_ret != block_group) {
			btrfs_release_block_group(block_group, ffe_ctl->delalloc);
			block_group = bg_ret;
		}

		/* Checks */
		ffe_ctl->search_start = round_up(ffe_ctl->found_offset,
						 fs_info->stripesize);

		/* move on to the next group */
		if (ffe_ctl->search_start + ffe_ctl->num_bytes >
		    block_group->start + block_group->length) {
			btrfs_add_free_space_unused(block_group,
						    ffe_ctl->found_offset,
						    ffe_ctl->num_bytes);
			goto loop;
		}

		if (ffe_ctl->found_offset < ffe_ctl->search_start)
			btrfs_add_free_space_unused(block_group,
						    ffe_ctl->found_offset,
						    ffe_ctl->search_start - ffe_ctl->found_offset);

		ret = btrfs_add_reserved_bytes(block_group, ffe_ctl->ram_bytes,
					       ffe_ctl->num_bytes, ffe_ctl->delalloc,
					       ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS);
		if (ret == -EAGAIN) {
			btrfs_add_free_space_unused(block_group,
						    ffe_ctl->found_offset,
						    ffe_ctl->num_bytes);
			goto loop;
		}
		btrfs_inc_block_group_reservations(block_group);

		/* we are all good, let's return */
		ins->objectid = ffe_ctl->search_start;
		ins->offset = ffe_ctl->num_bytes;

		trace_btrfs_reserve_extent(block_group, ffe_ctl);
		btrfs_release_block_group(block_group, ffe_ctl->delalloc);
		break;
loop:
		if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
		    !ffe_ctl->retry_uncached) {
			ffe_ctl->retry_uncached = true;
			btrfs_wait_block_group_cache_progress(block_group,
							      ffe_ctl->num_bytes +
							      ffe_ctl->empty_cluster +
							      ffe_ctl->empty_size);
			goto have_block_group;
		}
		release_block_group(block_group, ffe_ctl, ffe_ctl->delalloc);
		cond_resched();
	}
	up_read(&space_info->groups_sem);

	ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, full_search);
	if (ret > 0)
		goto search;

	if (ret == -ENOSPC && !cache_block_group_error) {
		/*
		 * Use ffe_ctl->total_free_space as fallback if we can't find
		 * any contiguous hole.
		 */
		if (!ffe_ctl->max_extent_size)
			ffe_ctl->max_extent_size = ffe_ctl->total_free_space;
		spin_lock(&space_info->lock);
		space_info->max_extent_size = ffe_ctl->max_extent_size;
		spin_unlock(&space_info->lock);
		ins->offset = ffe_ctl->max_extent_size;
	} else if (ret == -ENOSPC) {
		ret = cache_block_group_error;
	}
	return ret;
}
/*
 * Entry point to the extent allocator. Tries to find a hole that is at least
 * as big as @num_bytes.
 *
 * @root           -	The root that will contain this extent
 *
 * @ram_bytes      -	The amount of space in ram that @num_bytes take. This
 *			is used for accounting purposes. This value differs
 *			from @num_bytes only in the case of compressed extents.
 *
 * @num_bytes      -	Number of bytes to allocate on-disk.
 *
 * @min_alloc_size -	Indicates the minimum amount of space that the
 *			allocator should try to satisfy. In some cases
 *			@num_bytes may be larger than what is required and if
 *			the filesystem is fragmented then allocation fails.
 *			However, the presence of @min_alloc_size gives a
 *			chance to try and satisfy the smaller allocation.
 *
 * @empty_size     -	A hint that you plan on doing more COW. This is the
 *			size in bytes the allocator should try to find free
 *			next to the block it returns.  This is just a hint and
 *			may be ignored by the allocator.
 *
 * @hint_byte      -	Hint to the allocator to start searching above the byte
 *			address passed. It might be ignored.
 *
 * @ins            -	This key is modified to record the found hole. It will
 *			have the following values:
 *			ins->objectid == start position
 *			ins->flags = BTRFS_EXTENT_ITEM_KEY
 *			ins->offset == the size of the hole.
 *
 * @is_data        -	Boolean flag indicating whether an extent is
 *			allocated for data (true) or metadata (false)
 *
 * @delalloc       -	Boolean flag indicating whether this allocation is for
 *			delalloc or not. If 'true' data_rwsem of block groups
 *			is going to be acquired.
 *
 * Returns 0 when an allocation succeeded or < 0 when an error occurred. In
 * case -ENOSPC is returned then @ins->offset will contain the size of the
 * largest available hole the allocator managed to find.
 */
int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
			 u64 num_bytes, u64 min_alloc_size,
			 u64 empty_size, u64 hint_byte,
			 struct btrfs_key *ins, int is_data, int delalloc)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct find_free_extent_ctl ffe_ctl = {};
	bool final_tried = num_bytes == min_alloc_size;
	u64 flags;
	int ret;
	bool for_treelog = (btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID);
	bool for_data_reloc = (btrfs_is_data_reloc_root(root) && is_data);

	flags = get_alloc_profile_by_root(root, is_data);
again:
	WARN_ON(num_bytes < fs_info->sectorsize);

	ffe_ctl.ram_bytes = ram_bytes;
	ffe_ctl.num_bytes = num_bytes;
	ffe_ctl.min_alloc_size = min_alloc_size;
	ffe_ctl.empty_size = empty_size;
	ffe_ctl.flags = flags;
	ffe_ctl.delalloc = delalloc;
	ffe_ctl.hint_byte = hint_byte;
	ffe_ctl.for_treelog = for_treelog;
	ffe_ctl.for_data_reloc = for_data_reloc;

	ret = find_free_extent(root, ins, &ffe_ctl);
	if (!ret && !is_data) {
		btrfs_dec_block_group_reservations(fs_info, ins->objectid);
	} else if (ret == -ENOSPC) {
		if (!final_tried && ins->offset) {
			num_bytes = min(num_bytes >> 1, ins->offset);
			num_bytes = round_down(num_bytes,
					       fs_info->sectorsize);
			num_bytes = max(num_bytes, min_alloc_size);
			ram_bytes = num_bytes;
			if (num_bytes == min_alloc_size)
				final_tried = true;
			goto again;
		} else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
			struct btrfs_space_info *sinfo;

			sinfo = btrfs_find_space_info(fs_info, flags);
			btrfs_err(fs_info,
	"allocation failed flags %llu, wanted %llu tree-log %d, relocation: %d",
				  flags, num_bytes, for_treelog, for_data_reloc);
			if (sinfo)
				btrfs_dump_space_info(fs_info, sinfo,
						      num_bytes, 1);
		}
	}

	return ret;
}
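/*
 * Editor's sketch (illustrative, not part of the function above): on -ENOSPC
 * the allocation is retried with a smaller size.  The new request is half the
 * previous one (or the largest hole reported back in ins->offset, whichever
 * is smaller), rounded down to the sector size and clamped to min_alloc_size,
 * so the sequence of sizes is monotonically decreasing and finite:
 *
 *	u64 next_try(u64 num_bytes, u64 largest_hole, u64 sectorsize, u64 min_alloc)
 *	{
 *		u64 n = min(num_bytes >> 1, largest_hole);
 *
 *		n = round_down(n, sectorsize);
 *		return max(n, min_alloc);
 *	}
 *
 * Once the retried size equals min_alloc_size the next failure is final.
 */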
int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
			       u64 start, u64 len, int delalloc)
{
	struct btrfs_block_group *cache;

	cache = btrfs_lookup_block_group(fs_info, start);
	if (!cache) {
		btrfs_err(fs_info, "Unable to find block group for %llu",
			  start);
		return -ENOSPC;
	}

	btrfs_add_free_space(cache, start, len);
	btrfs_free_reserved_bytes(cache, len, delalloc);
	trace_btrfs_reserved_extent_free(fs_info, start, len);

	btrfs_put_block_group(cache);
	return 0;
}

int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans,
			      const struct extent_buffer *eb)
{
	struct btrfs_block_group *cache;
	int ret;

	cache = btrfs_lookup_block_group(trans->fs_info, eb->start);
	if (!cache) {
		btrfs_err(trans->fs_info, "unable to find block group for %llu",
			  eb->start);
		return -ENOSPC;
	}

	ret = pin_down_extent(trans, cache, eb->start, eb->len, 1);
	btrfs_put_block_group(cache);
	return ret;
}

static int alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 bytenr,
				 u64 num_bytes)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int ret;

	ret = remove_from_free_space_tree(trans, bytenr, num_bytes);
	if (ret)
		return ret;

	ret = btrfs_update_block_group(trans, bytenr, num_bytes, true);
	if (ret) {
		btrfs_err(fs_info, "update block group failed for %llu %llu",
			  bytenr, num_bytes);
		return ret;
	}

	trace_btrfs_reserved_extent_alloc(fs_info, bytenr, num_bytes);
	return 0;
}
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				      u64 parent, u64 root_objectid,
				      u64 flags, u64 owner, u64 offset,
				      struct btrfs_key *ins, int ref_mod, u64 oref_root)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *extent_root;
	int ret;
	struct btrfs_extent_item *extent_item;
	struct btrfs_extent_owner_ref *oref;
	struct btrfs_extent_inline_ref *iref;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	int type;
	u32 size;
	const bool simple_quota = (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE);

	if (parent > 0)
		type = BTRFS_SHARED_DATA_REF_KEY;
	else
		type = BTRFS_EXTENT_DATA_REF_KEY;

	size = sizeof(*extent_item);
	if (simple_quota)
		size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY);
	size += btrfs_extent_inline_ref_size(type);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	extent_root = btrfs_extent_root(fs_info, ins->objectid);
	ret = btrfs_insert_empty_item(trans, extent_root, path, ins, size);
	if (ret) {
		btrfs_free_path(path);
		return ret;
	}

	leaf = path->nodes[0];
	extent_item = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_item);
	btrfs_set_extent_refs(leaf, extent_item, ref_mod);
	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
	btrfs_set_extent_flags(leaf, extent_item,
			       flags | BTRFS_EXTENT_FLAG_DATA);

	iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
	if (simple_quota) {
		btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_EXTENT_OWNER_REF_KEY);
		oref = (struct btrfs_extent_owner_ref *)(&iref->offset);
		btrfs_set_extent_owner_ref_root_id(leaf, oref, oref_root);
		iref = (struct btrfs_extent_inline_ref *)(oref + 1);
	}
	btrfs_set_extent_inline_ref_type(leaf, iref, type);

	if (parent > 0) {
		struct btrfs_shared_data_ref *ref;

		ref = (struct btrfs_shared_data_ref *)(iref + 1);
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
		btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
	} else {
		struct btrfs_extent_data_ref *ref;

		ref = (struct btrfs_extent_data_ref *)(&iref->offset);
		btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
		btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
		btrfs_set_extent_data_ref_offset(leaf, ref, offset);
		btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
	}

	btrfs_mark_buffer_dirty(trans, path->nodes[0]);
	btrfs_free_path(path);

	return alloc_reserved_extent(trans, ins->objectid, ins->offset);
}
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
				     struct btrfs_delayed_ref_node *node,
				     struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *extent_root;
	int ret;
	struct btrfs_extent_item *extent_item;
	struct btrfs_key extent_key;
	struct btrfs_tree_block_info *block_info;
	struct btrfs_extent_inline_ref *iref;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	u32 size = sizeof(*extent_item) + sizeof(*iref);
	const u64 flags = (extent_op ? extent_op->flags_to_set : 0);
	/* The owner of a tree block is the level. */
	int level = btrfs_delayed_ref_owner(node);
	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);

	extent_key.objectid = node->bytenr;
	if (skinny_metadata) {
		/* The owner of a tree block is the level. */
		extent_key.offset = level;
		extent_key.type = BTRFS_METADATA_ITEM_KEY;
	} else {
		extent_key.offset = node->num_bytes;
		extent_key.type = BTRFS_EXTENT_ITEM_KEY;
		size += sizeof(*block_info);
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	extent_root = btrfs_extent_root(fs_info, extent_key.objectid);
	ret = btrfs_insert_empty_item(trans, extent_root, path, &extent_key,
				      size);
	if (ret) {
		btrfs_free_path(path);
		return ret;
	}

	leaf = path->nodes[0];
	extent_item = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_item);
	btrfs_set_extent_refs(leaf, extent_item, 1);
	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
	btrfs_set_extent_flags(leaf, extent_item,
			       flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);

	if (skinny_metadata) {
		iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
	} else {
		block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
		btrfs_set_tree_block_key(leaf, block_info, &extent_op->key);
		btrfs_set_tree_block_level(leaf, block_info, level);
		iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
	}

	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
		btrfs_set_extent_inline_ref_type(leaf, iref,
						 BTRFS_SHARED_BLOCK_REF_KEY);
		btrfs_set_extent_inline_ref_offset(leaf, iref, node->parent);
	} else {
		btrfs_set_extent_inline_ref_type(leaf, iref,
						 BTRFS_TREE_BLOCK_REF_KEY);
		btrfs_set_extent_inline_ref_offset(leaf, iref, node->ref_root);
	}

	btrfs_mark_buffer_dirty(trans, leaf);
	btrfs_free_path(path);

	return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize);
}
int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root, u64 owner,
				     u64 offset, u64 ram_bytes,
				     struct btrfs_key *ins)
{
	struct btrfs_ref generic_ref = {
		.action = BTRFS_ADD_DELAYED_EXTENT,
		.bytenr = ins->objectid,
		.num_bytes = ins->offset,
		.owning_root = btrfs_root_id(root),
		.ref_root = btrfs_root_id(root),
	};

	ASSERT(generic_ref.ref_root != BTRFS_TREE_LOG_OBJECTID);

	if (btrfs_is_data_reloc_root(root) && is_fstree(root->relocation_src_root))
		generic_ref.owning_root = root->relocation_src_root;

	btrfs_init_data_ref(&generic_ref, owner, offset, 0, false);
	btrfs_ref_tree_mod(root->fs_info, &generic_ref);

	return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes);
}
/*
 * this is used by the tree logging recovery code.  It records that
 * an extent has been allocated and makes sure to clear the free
 * space cache bits as well
 */
int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
				   u64 root_objectid, u64 owner, u64 offset,
				   struct btrfs_key *ins)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int ret;
	struct btrfs_block_group *block_group;
	struct btrfs_space_info *space_info;
	struct btrfs_squota_delta delta = {
		.root = root_objectid,
		.num_bytes = ins->offset,
		.generation = trans->transid,
		.is_data = true,
		.is_inc = true,
	};

	/*
	 * Mixed block groups will exclude before processing the log so we only
	 * need to do the exclude dance if this fs isn't mixed.
	 */
	if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
		ret = __exclude_logged_extent(fs_info, ins->objectid,
					      ins->offset);
		if (ret)
			return ret;
	}

	block_group = btrfs_lookup_block_group(fs_info, ins->objectid);
	if (!block_group)
		return -EINVAL;

	space_info = block_group->space_info;
	spin_lock(&space_info->lock);
	spin_lock(&block_group->lock);
	space_info->bytes_reserved += ins->offset;
	block_group->reserved += ins->offset;
	spin_unlock(&block_group->lock);
	spin_unlock(&space_info->lock);

	ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
					 offset, ins, 1, root_objectid);
	if (ret)
		btrfs_pin_extent(trans, ins->objectid, ins->offset, 1);
	ret = btrfs_record_squota_delta(fs_info, &delta);
	btrfs_put_block_group(block_group);
	return ret;
}
#ifdef CONFIG_BTRFS_DEBUG
/*
 * Extra safety check in case the extent tree is corrupted and extent allocator
 * chooses to use a tree block which is already used and locked.
 */
static bool check_eb_lock_owner(const struct extent_buffer *eb)
{
	if (eb->lock_owner == current->pid) {
		btrfs_err_rl(eb->fs_info,
"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
			     eb->start, btrfs_header_owner(eb), current->pid);
		return true;
	}
	return false;
}
#else
static bool check_eb_lock_owner(struct extent_buffer *eb)
{
	return false;
}
#endif
static struct extent_buffer *
btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      u64 bytenr, int level, u64 owner,
		      enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *buf;
	u64 lockdep_owner = owner;

	buf = btrfs_find_create_tree_block(fs_info, bytenr, owner, level);
	if (IS_ERR(buf))
		return buf;

	if (check_eb_lock_owner(buf)) {
		free_extent_buffer(buf);
		return ERR_PTR(-EUCLEAN);
	}

	/*
	 * The reloc trees are just snapshots, so we need them to appear to be
	 * just like any other fs tree WRT lockdep.
	 *
	 * The exception however is in replace_path() in relocation, where we
	 * hold the lock on the original fs root and then search for the reloc
	 * root.  At that point we need to make sure any reloc root buffers are
	 * set to the BTRFS_TREE_RELOC_OBJECTID lockdep class in order to make
	 * lockdep happy.
	 */
	if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID &&
	    !test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state))
		lockdep_owner = BTRFS_FS_TREE_OBJECTID;

	/* btrfs_clear_buffer_dirty() accesses generation field. */
	btrfs_set_header_generation(buf, trans->transid);

	/*
	 * This needs to stay, because we could allocate a freed block from an
	 * old tree into a new tree, so we need to make sure this new block is
	 * set to the appropriate level and owner.
	 */
	btrfs_set_buffer_lockdep_class(lockdep_owner, buf, level);

	btrfs_tree_lock_nested(buf, nest);
	btrfs_clear_buffer_dirty(trans, buf);
	clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
	clear_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &buf->bflags);

	set_extent_buffer_uptodate(buf);

	memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
	btrfs_set_header_level(buf, level);
	btrfs_set_header_bytenr(buf, buf->start);
	btrfs_set_header_generation(buf, trans->transid);
	btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);
	btrfs_set_header_owner(buf, owner);
	write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid);
	write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
	if (btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID) {
		buf->log_index = root->log_transid % 2;
		/*
		 * we allow two log transactions at a time, use different
		 * EXTENT bit to differentiate dirty pages.
		 */
		if (buf->log_index == 0)
			set_extent_bit(&root->dirty_log_pages, buf->start,
				       buf->start + buf->len - 1,
				       EXTENT_DIRTY, NULL);
		else
			set_extent_bit(&root->dirty_log_pages, buf->start,
				       buf->start + buf->len - 1,
				       EXTENT_NEW, NULL);
	} else {
		buf->log_index = -1;
		set_extent_bit(&trans->transaction->dirty_pages, buf->start,
			       buf->start + buf->len - 1, EXTENT_DIRTY, NULL);
	}
	/* this returns a buffer locked for blocking */
	return buf;
}
/*
 * finds a free extent and does all the dirty work required for allocation
 * returns the tree buffer or an ERR_PTR on error.
 */
struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
					     struct btrfs_root *root,
					     u64 parent, u64 root_objectid,
					     const struct btrfs_disk_key *key,
					     int level, u64 hint,
					     u64 empty_size,
					     u64 reloc_src_root,
					     enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key ins;
	struct btrfs_block_rsv *block_rsv;
	struct extent_buffer *buf;
	u64 flags = 0;
	int ret;
	u32 blocksize = fs_info->nodesize;
	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
	u64 owning_root;

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
	if (btrfs_is_testing(fs_info)) {
		buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
					    level, root_objectid, nest);
		if (!IS_ERR(buf))
			root->alloc_bytenr += blocksize;
		return buf;
	}
#endif

	block_rsv = btrfs_use_block_rsv(trans, root, blocksize);
	if (IS_ERR(block_rsv))
		return ERR_CAST(block_rsv);

	ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
				   empty_size, hint, &ins, 0, 0);
	if (ret)
		goto out_unuse;

	buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
				    root_objectid, nest);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		goto out_free_reserved;
	}
	owning_root = btrfs_header_owner(buf);

	if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
		if (parent == 0)
			parent = ins.objectid;
		flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		owning_root = reloc_src_root;
	} else
		BUG_ON(parent > 0);

	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
		struct btrfs_delayed_extent_op *extent_op;
		struct btrfs_ref generic_ref = {
			.action = BTRFS_ADD_DELAYED_EXTENT,
			.bytenr = ins.objectid,
			.num_bytes = ins.offset,
			.parent = parent,
			.owning_root = owning_root,
			.ref_root = root_objectid,
		};

		if (!skinny_metadata || flags != 0) {
			extent_op = btrfs_alloc_delayed_extent_op();
			if (!extent_op) {
				ret = -ENOMEM;
				goto out_free_buf;
			}
			if (key)
				memcpy(&extent_op->key, key, sizeof(extent_op->key));
			else
				memset(&extent_op->key, 0, sizeof(extent_op->key));
			extent_op->flags_to_set = flags;
			extent_op->update_key = (skinny_metadata ? false : true);
			extent_op->update_flags = (flags != 0);
		} else {
			extent_op = NULL;
		}

		btrfs_init_tree_ref(&generic_ref, level, btrfs_root_id(root), false);
		btrfs_ref_tree_mod(fs_info, &generic_ref);
		ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op);
		if (ret) {
			btrfs_free_delayed_extent_op(extent_op);
			goto out_free_buf;
		}
	}
	return buf;

out_free_buf:
	btrfs_tree_unlock(buf);
	free_extent_buffer(buf);
out_free_reserved:
	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
out_unuse:
	btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize);
	return ERR_PTR(ret);
}
struct walk_control {
	u64 refs[BTRFS_MAX_LEVEL];
	u64 flags[BTRFS_MAX_LEVEL];
	struct btrfs_key update_progress;
	struct btrfs_key drop_progress;
	int drop_level;
	int stage;
	int level;
	int shared_level;
	int update_ref;
	int keep_locks;
	int reada_slot;
	int reada_count;
	int restarted;
	/* Indicate that extent info needs to be looked up when walking the tree. */
	int lookup_info;
};

/*
 * This is our normal stage.  We are traversing blocks the current snapshot owns
 * and we are dropping any of our references to any children we are able to, and
 * then freeing the block once we've processed all of the children.
 */
#define DROP_REFERENCE	1

/*
 * We enter this stage when we have to walk into a child block (meaning we can't
 * simply drop our reference to it from our current parent node) and there are
 * more than one reference on it.  If we are the owner of any of the children
 * blocks from the current parent node then we have to do the FULL_BACKREF dance
 * on them in order to drop our normal ref and add the shared ref.
 */
#define UPDATE_BACKREF	2
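/*
 * Editor's sketch of the stage machine (not code from this file): the walk
 * starts in DROP_REFERENCE.  When do_walk_down() meets a shared child
 * (refs > 1) whose backrefs still need converting, it switches to
 * UPDATE_BACKREF and records the level in wc->shared_level; once
 * walk_up_proc() climbs back to that level the stage flips back, roughly:
 *
 *	if (stage == DROP_REFERENCE && refs[level - 1] > 1) {
 *		stage = UPDATE_BACKREF;
 *		shared_level = level - 1;
 *	}
 *	...
 *	if (stage == UPDATE_BACKREF && level >= shared_level) {
 *		stage = DROP_REFERENCE;
 *		shared_level = -1;
 *	}
 */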
/*
 * Decide if we need to walk down into this node to adjust the references.
 *
 * @root:	the root we are currently deleting
 * @wc:		the walk control for this deletion
 * @eb:		the parent eb that we're currently visiting
 * @flags:	the flags for wc->level - 1
 * @slot:	the slot in the eb that we're currently checking
 *
 * This is meant to be called when we're evaluating if a node we point to at
 * wc->level should be read and walked into, or if we can simply delete our
 * reference to it.  We return true if we should walk into the node, false if
 * we can skip it.
 *
 * We have assertions in here to make sure this is called correctly.  We assume
 * that sanity checking on the blocks read to this point has been done, so any
 * corrupted file systems must have been caught before calling this function.
 */
static bool visit_node_for_delete(struct btrfs_root *root, struct walk_control *wc,
				  struct extent_buffer *eb, u64 flags, int slot)
{
	struct btrfs_key key;
	u64 generation;
	int level = wc->level;

	ASSERT(wc->refs[level - 1] > 0);

	/*
	 * The update backref stage we only want to skip if we already have
	 * FULL_BACKREF set, otherwise we need to read.
	 */
	if (wc->stage == UPDATE_BACKREF) {
		if (level == 1 && flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
			return false;
		return true;
	}

	/*
	 * We're the last ref on this block, we must walk into it and process
	 * any refs it's pointing at.
	 */
	if (wc->refs[level - 1] == 1)
		return true;

	/*
	 * If we're already FULL_BACKREF then we know we can just drop our
	 * current reference.
	 */
	if (level == 1 && flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
		return false;

	/*
	 * This block is older than our creation generation, we can drop our
	 * reference to it.
	 */
	generation = btrfs_node_ptr_generation(eb, slot);
	if (!wc->update_ref || generation <= root->root_key.offset)
		return false;

	/*
	 * This block was processed from a previous snapshot deletion run, we
	 * can skip it.
	 */
	btrfs_node_key_to_cpu(eb, &key, slot);
	if (btrfs_comp_cpu_keys(&key, &wc->update_progress) < 0)
		return false;

	/* All other cases we need to wander into the node. */
	return true;
}
static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     struct walk_control *wc,
				     struct btrfs_path *path)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 bytenr;
	u64 generation;
	u64 refs;
	u64 flags;
	u32 nritems;
	struct extent_buffer *eb;
	int ret;
	int slot;
	int nread = 0;

	if (path->slots[wc->level] < wc->reada_slot) {
		wc->reada_count = wc->reada_count * 2 / 3;
		wc->reada_count = max(wc->reada_count, 2);
	} else {
		wc->reada_count = wc->reada_count * 3 / 2;
		wc->reada_count = min_t(int, wc->reada_count,
					BTRFS_NODEPTRS_PER_BLOCK(fs_info));
	}

	eb = path->nodes[wc->level];
	nritems = btrfs_header_nritems(eb);

	for (slot = path->slots[wc->level]; slot < nritems; slot++) {
		if (nread >= wc->reada_count)
			break;

		cond_resched();
		bytenr = btrfs_node_blockptr(eb, slot);
		generation = btrfs_node_ptr_generation(eb, slot);

		if (slot == path->slots[wc->level])
			goto reada;

		if (wc->stage == UPDATE_BACKREF &&
		    generation <= root->root_key.offset)
			continue;

		/* We don't lock the tree block, it's OK to be racy here */
		ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
					       wc->level - 1, 1, &refs,
					       &flags, NULL);
		/* We don't care about errors in readahead. */
		if (ret < 0)
			continue;

		/*
		 * This could be racey, it's conceivable that we raced and end
		 * up with a bogus refs count, if that's the case just skip, if
		 * we are actually corrupt we will notice when we look up
		 * everything again with our locks.
		 */
		if (refs == 0)
			continue;

		/* If we don't need to visit this node don't reada. */
		if (!visit_node_for_delete(root, wc, eb, flags, slot))
			continue;
reada:
		btrfs_readahead_node_child(eb, slot);
		nread++;
	}
	wc->reada_slot = slot;
}
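/*
 * Editor's sketch (illustrative only; assumes reada_count starts at the
 * nodeptrs-per-block value the callers set up): the readahead window shrinks
 * while we are still behind the last readahead slot and grows again once we
 * catch up, clamped to [2, BTRFS_NODEPTRS_PER_BLOCK(fs_info)]:
 *
 *	if (slot < reada_slot)
 *		reada_count = max(reada_count * 2 / 3, 2);
 *	else
 *		reada_count = min(reada_count * 3 / 2, nodeptrs_per_block);
 *
 * This keeps readahead cheap when the walk repeatedly revisits the same node
 * and more aggressive when it is streaming forward.
 */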
/*
 * helper to process tree block while walking down the tree.
 *
 * when wc->stage == UPDATE_BACKREF, this function updates
 * back refs for pointers in the block.
 *
 * NOTE: return value 1 means we should stop walking down.
 */
static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct walk_control *wc)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int level = wc->level;
	struct extent_buffer *eb = path->nodes[level];
	u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
	int ret = 0;

	if (wc->stage == UPDATE_BACKREF && btrfs_header_owner(eb) != btrfs_root_id(root))
		return 1;

	/*
	 * when reference count of tree block is 1, it won't increase
	 * again. once full backref flag is set, we never clear it.
	 */
	if (wc->lookup_info &&
	    ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
	     (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
		ASSERT(path->locks[level]);
		ret = btrfs_lookup_extent_info(trans, fs_info,
					       eb->start, level, 1,
					       &wc->refs[level],
					       &wc->flags[level],
					       NULL);
		if (ret)
			return ret;
		if (unlikely(wc->refs[level] == 0)) {
			btrfs_err(fs_info, "bytenr %llu has 0 references, expect > 0",
				  eb->start);
			return -EUCLEAN;
		}
	}

	if (wc->stage == DROP_REFERENCE) {
		if (wc->refs[level] > 1)
			return 1;

		if (path->locks[level] && !wc->keep_locks) {
			btrfs_tree_unlock_rw(eb, path->locks[level]);
			path->locks[level] = 0;
		}
		return 0;
	}

	/* wc->stage == UPDATE_BACKREF */
	if (!(wc->flags[level] & flag)) {
		ASSERT(path->locks[level]);
		ret = btrfs_inc_ref(trans, root, eb, 1);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
		ret = btrfs_dec_ref(trans, root, eb, 0);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
		ret = btrfs_set_disk_extent_flags(trans, eb, flag);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
		wc->flags[level] |= flag;
	}

	/*
	 * the block is shared by multiple trees, so it's not good to
	 * keep the tree lock
	 */
	if (path->locks[level] && level > 0) {
		btrfs_tree_unlock_rw(eb, path->locks[level]);
		path->locks[level] = 0;
	}
	return 0;
}
/*
 * This is used to verify a ref exists for this root to deal with a bug where we
 * would have a drop_progress key that hadn't been updated properly.
 */
static int check_ref_exists(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root, u64 bytenr, u64 parent,
			    int level)
{
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_delayed_ref_head *head;
	struct btrfs_path *path;
	struct btrfs_extent_inline_ref *iref;
	int ret;
	bool exists = false;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
again:
	ret = lookup_extent_backref(trans, path, &iref, bytenr,
				    root->fs_info->nodesize, parent,
				    btrfs_root_id(root), level, 0);
	if (ret != -ENOENT) {
		/*
		 * If we get 0 then we found our reference, return 1, else
		 * return the error if it's not -ENOENT;
		 */
		btrfs_free_path(path);
		return (ret < 0) ? ret : 1;
	}

	/*
	 * We could have a delayed ref with this reference, so look it up while
	 * we're holding the path open to make sure we don't race with the
	 * delayed ref running.
	 */
	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);
	head = btrfs_find_delayed_ref_head(root->fs_info, delayed_refs, bytenr);
	if (!head)
		goto out;
	if (!mutex_trylock(&head->mutex)) {
		/*
		 * We're contended, means that the delayed ref is running, get a
		 * reference and wait for the ref head to be complete and then
		 * try again.
		 */
		refcount_inc(&head->refs);
		spin_unlock(&delayed_refs->lock);

		btrfs_release_path(path);

		mutex_lock(&head->mutex);
		mutex_unlock(&head->mutex);
		btrfs_put_delayed_ref_head(head);
		goto again;
	}

	exists = btrfs_find_delayed_tree_ref(head, root->root_key.objectid, parent);
	mutex_unlock(&head->mutex);
out:
	spin_unlock(&delayed_refs->lock);
	btrfs_free_path(path);
	return exists ? 1 : 0;
}
/*
 * We may not have an uptodate block, so if we are going to walk down into this
 * block we need to drop the lock, read it off of the disk, re-lock it and
 * return to continue dropping the snapshot.
 */
static int check_next_block_uptodate(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     struct btrfs_path *path,
				     struct walk_control *wc,
				     struct extent_buffer *next)
{
	struct btrfs_tree_parent_check check = { 0 };
	u64 generation;
	int level = wc->level;
	int ret;

	btrfs_assert_tree_write_locked(next);

	generation = btrfs_node_ptr_generation(path->nodes[level], path->slots[level]);

	if (btrfs_buffer_uptodate(next, generation, 0))
		return 0;

	check.level = level - 1;
	check.transid = generation;
	check.owner_root = btrfs_root_id(root);
	check.has_first_key = true;
	btrfs_node_key_to_cpu(path->nodes[level], &check.first_key, path->slots[level]);

	btrfs_tree_unlock(next);
	if (level == 1)
		reada_walk_down(trans, root, wc, path);
	ret = btrfs_read_extent_buffer(next, &check);
	if (ret) {
		free_extent_buffer(next);
		return ret;
	}
	btrfs_tree_lock(next);
	wc->lookup_info = 1;
	return 0;
}
/*
 * If we determine that we don't have to visit wc->level - 1 then we need to
 * determine if we can drop our reference.
 *
 * If we are UPDATE_BACKREF then we will not, we need to update our backrefs.
 *
 * If we are DROP_REFERENCE this will figure out if we need to drop our current
 * reference, skipping it if we dropped it from a previous incompleted drop, or
 * dropping it if we still have a reference to it.
 */
static int maybe_drop_reference(struct btrfs_trans_handle *trans, struct btrfs_root *root,
				struct btrfs_path *path, struct walk_control *wc,
				struct extent_buffer *next, u64 owner_root)
{
	struct btrfs_ref ref = {
		.action = BTRFS_DROP_DELAYED_REF,
		.bytenr = next->start,
		.num_bytes = root->fs_info->nodesize,
		.owning_root = owner_root,
		.ref_root = btrfs_root_id(root),
	};
	int level = wc->level;
	int ret;

	/* We are UPDATE_BACKREF, we're not dropping anything. */
	if (wc->stage == UPDATE_BACKREF)
		return 0;

	if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
		ref.parent = path->nodes[level]->start;
	} else {
		ASSERT(btrfs_root_id(root) == btrfs_header_owner(path->nodes[level]));
		if (btrfs_root_id(root) != btrfs_header_owner(path->nodes[level])) {
			btrfs_err(root->fs_info, "mismatched block owner");
			return -EIO;
		}
	}

	/*
	 * If we had a drop_progress we need to verify the refs are set as
	 * expected.  If we find our ref then we know that from here on out
	 * everything should be correct, and we can clear the
	 * ->restarted flag.
	 */
	if (wc->restarted) {
		ret = check_ref_exists(trans, root, next->start, ref.parent,
				       level - 1);
		if (ret < 0)
			return ret;
		if (ret == 0)
			return 0;
		wc->restarted = 0;
	}

	/*
	 * Reloc tree doesn't contribute to qgroup numbers, and we have already
	 * accounted them at merge time (replace_path), thus we could skip
	 * expensive subtree trace here.
	 */
	if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID &&
	    wc->refs[level - 1] > 1) {
		u64 generation = btrfs_node_ptr_generation(path->nodes[level],
							   path->slots[level]);

		ret = btrfs_qgroup_trace_subtree(trans, next, generation, level - 1);
		if (ret) {
			btrfs_err_rl(root->fs_info,
"error %d accounting shared subtree, quota is out of sync, rescan required",
				     ret);
		}
	}

	/*
	 * We need to update the next key in our walk control so we can update
	 * the drop_progress key accordingly.  We don't care if find_next_key
	 * doesn't find a key because that means we're at the end and are going
	 * to clean this up anyway.
	 */
	wc->drop_level = level;
	find_next_key(path, level, &wc->drop_progress);

	btrfs_init_tree_ref(&ref, level - 1, 0, false);
	return btrfs_free_extent(trans, &ref);
}
/*
 * helper to process tree block pointer.
 *
 * when wc->stage == DROP_REFERENCE, this function checks
 * reference count of the block pointed to. if the block
 * is shared and we need update back refs for the subtree
 * rooted at the block, this function changes wc->stage to
 * UPDATE_BACKREF. if the block is shared and there is no
 * need to update back, this function drops the reference
 * to the block.
 *
 * NOTE: return value 1 means we should stop walking down.
 */
static noinline int do_walk_down(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct walk_control *wc)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 bytenr;
	u64 generation;
	u64 owner_root = 0;
	struct extent_buffer *next;
	int level = wc->level;
	int ret = 0;

	generation = btrfs_node_ptr_generation(path->nodes[level],
					       path->slots[level]);
	/*
	 * if the lower level block was created before the snapshot
	 * was created, we know there is no need to update back refs
	 * for the subtree
	 */
	if (wc->stage == UPDATE_BACKREF &&
	    generation <= root->root_key.offset) {
		wc->lookup_info = 1;
		return 1;
	}

	bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);

	next = btrfs_find_create_tree_block(fs_info, bytenr, btrfs_root_id(root),
					    level - 1);
	if (IS_ERR(next))
		return PTR_ERR(next);

	btrfs_tree_lock(next);

	ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
				       &wc->refs[level - 1],
				       &wc->flags[level - 1],
				       &owner_root);
	if (ret < 0)
		goto out_unlock;

	if (unlikely(wc->refs[level - 1] == 0)) {
		btrfs_err(fs_info, "bytenr %llu has 0 references, expect > 0",
			  bytenr);
		ret = -EUCLEAN;
		goto out_unlock;
	}
	wc->lookup_info = 0;

	/* If we don't have to walk into this node skip it. */
	if (!visit_node_for_delete(root, wc, path->nodes[level],
				   wc->flags[level - 1], path->slots[level]))
		goto skip;

	/*
	 * We have to walk down into this node, and if we're currently at the
	 * DROP_REFERENCE stage and this block is shared then we need to switch
	 * to the UPDATE_BACKREF stage in order to convert to FULL_BACKREF.
	 */
	if (wc->stage == DROP_REFERENCE && wc->refs[level - 1] > 1) {
		wc->stage = UPDATE_BACKREF;
		wc->shared_level = level - 1;
	}

	ret = check_next_block_uptodate(trans, root, path, wc, next);
	if (ret)
		return ret;

	level--;
	ASSERT(level == btrfs_header_level(next));
	if (level != btrfs_header_level(next)) {
		btrfs_err(root->fs_info, "mismatched level");
		ret = -EIO;
		goto out_unlock;
	}
	path->nodes[level] = next;
	path->slots[level] = 0;
	path->locks[level] = BTRFS_WRITE_LOCK;
	wc->level = level;
	wc->lookup_info = 1;
	return 0;

skip:
	ret = maybe_drop_reference(trans, root, path, wc, next, owner_root);
	if (ret)
		goto out_unlock;
	wc->refs[level - 1] = 0;
	wc->flags[level - 1] = 0;
	wc->lookup_info = 1;
	ret = 1;

out_unlock:
	btrfs_tree_unlock(next);
	free_extent_buffer(next);

	return ret;
}
/*
 * helper to process tree block while walking up the tree.
 *
 * when wc->stage == DROP_REFERENCE, this function drops
 * reference count on the block.
 *
 * when wc->stage == UPDATE_BACKREF, this function changes
 * wc->stage back to DROP_REFERENCE if we changed wc->stage
 * to UPDATE_BACKREF previously while processing the block.
 *
 * NOTE: return value 1 means we should stop walking up.
 */
static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct walk_control *wc)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;
	int level = wc->level;
	struct extent_buffer *eb = path->nodes[level];
	u64 parent = 0;

	if (wc->stage == UPDATE_BACKREF) {
		ASSERT(wc->shared_level >= level);
		if (level < wc->shared_level)
			goto out;

		ret = find_next_key(path, level + 1, &wc->update_progress);
		if (ret > 0)
			wc->update_ref = 0;

		wc->stage = DROP_REFERENCE;
		wc->shared_level = -1;
		path->slots[level] = 0;

		/*
		 * check reference count again if the block isn't locked.
		 * we should start walking down the tree again if reference
		 * count is one.
		 */
		if (!path->locks[level]) {
			btrfs_tree_lock(eb);
			path->locks[level] = BTRFS_WRITE_LOCK;

			ret = btrfs_lookup_extent_info(trans, fs_info,
						       eb->start, level, 1,
						       &wc->refs[level],
						       &wc->flags[level],
						       NULL);
			if (ret < 0) {
				btrfs_tree_unlock_rw(eb, path->locks[level]);
				path->locks[level] = 0;
				return ret;
			}
			if (unlikely(wc->refs[level] == 0)) {
				btrfs_tree_unlock_rw(eb, path->locks[level]);
				btrfs_err(fs_info, "bytenr %llu has 0 references, expect > 0",
					  eb->start);
				return -EUCLEAN;
			}
			if (wc->refs[level] == 1) {
				btrfs_tree_unlock_rw(eb, path->locks[level]);
				path->locks[level] = 0;
				return 1;
			}
		}
	}

	/* wc->stage == DROP_REFERENCE */
	ASSERT(path->locks[level] || wc->refs[level] == 1);

	if (wc->refs[level] == 1) {
		if (level == 0) {
			if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
				ret = btrfs_dec_ref(trans, root, eb, 1);
			else
				ret = btrfs_dec_ref(trans, root, eb, 0);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
			if (is_fstree(btrfs_root_id(root))) {
				ret = btrfs_qgroup_trace_leaf_items(trans, eb);
				if (ret) {
					btrfs_err_rl(fs_info,
"error %d accounting leaf items, quota is out of sync, rescan required",
						     ret);
				}
			}
		}
		/* Make block locked assertion in btrfs_clear_buffer_dirty happy. */
		if (!path->locks[level]) {
			btrfs_tree_lock(eb);
			path->locks[level] = BTRFS_WRITE_LOCK;
		}
		btrfs_clear_buffer_dirty(trans, eb);
	}

	if (eb == root->node) {
		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
			parent = eb->start;
		else if (btrfs_root_id(root) != btrfs_header_owner(eb))
			goto owner_mismatch;
	} else {
		if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
			parent = path->nodes[level + 1]->start;
		else if (btrfs_root_id(root) !=
			 btrfs_header_owner(path->nodes[level + 1]))
			goto owner_mismatch;
	}

	ret = btrfs_free_tree_block(trans, btrfs_root_id(root), eb, parent,
				    wc->refs[level] == 1);
	if (ret < 0)
		btrfs_abort_transaction(trans, ret);
out:
	wc->refs[level] = 0;
	wc->flags[level] = 0;
	return ret;

owner_mismatch:
	btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
		     btrfs_header_owner(eb), btrfs_root_id(root));
	return -EUCLEAN;
}
/*
 * walk_down_tree consists of two steps.
 *
 * walk_down_proc().  Look up the reference count and reference of our current
 * wc->level.  At this point path->nodes[wc->level] should be populated and
 * uptodate, and in most cases should already be locked.  If we are in
 * DROP_REFERENCE and our refcount is > 1 then we've entered a shared node and
 * we can walk back up the tree.  If we are UPDATE_BACKREF we have to set
 * FULL_BACKREF on this node if it's not already set, and then do the
 * FULL_BACKREF conversion dance, which is to drop the root reference and add
 * the shared reference to all of this nodes children.
 *
 * do_walk_down().  This is where we actually start iterating on the children of
 * our current path->nodes[wc->level].  For DROP_REFERENCE that means dropping
 * our reference to the children that return false from visit_node_for_delete(),
 * which has various conditions where we know we can just drop our reference
 * without visiting the node.  For UPDATE_BACKREF we will skip any children that
 * visit_node_for_delete() returns false for, only walking down when necessary.
 * The bulk of the work for UPDATE_BACKREF occurs in the walk_up_tree() part of
 * the snapshot deletion.
 */
static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct walk_control *wc)
{
	int level = wc->level;
	int ret = 0;

	wc->lookup_info = 1;
	while (level >= 0) {
		ret = walk_down_proc(trans, root, path, wc);
		if (ret)
			break;

		if (level == 0)
			break;

		if (path->slots[level] >=
		    btrfs_header_nritems(path->nodes[level]))
			break;

		ret = do_walk_down(trans, root, path, wc);
		if (ret > 0) {
			path->slots[level]++;
			continue;
		} else if (ret < 0)
			break;
		level = wc->level;
	}
	return (ret == 1) ? 0 : ret;
}
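/*
 * Editor's sketch of the traversal skeleton (not a verbatim copy of the
 * driver code): the helper above and the one below cooperate roughly as
 *
 *	while (1) {
 *		ret = walk_down_tree(...);	// descend as far as possible
 *		if (ret < 0)
 *			break;
 *		ret = walk_up_tree(..., BTRFS_MAX_LEVEL);	// free/convert, pop a level
 *		if (ret)	// > 0 means the root was reached
 *			break;
 *	}
 *
 * btrfs_drop_snapshot() drives this loop and periodically persists
 * wc->drop_progress into the root item so an interrupted drop can resume.
 */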
/*
 * walk_up_tree() is responsible for making sure we visit every slot on our
 * current node, and if we're at the end of that node then we call
 * walk_up_proc() on our current node which will do one of a few things based on
 * our stage.
 *
 * UPDATE_BACKREF.  If wc->level is currently less than our wc->shared_level
 * then we need to walk back up the tree, and then go back down into the
 * other slots via walk_down_tree to update any other children from our original
 * wc->shared_level.  Once we're at or above our wc->shared_level we can switch
 * back to DROP_REFERENCE, lookup the current nodes refs and flags, and carry on.
 *
 * DROP_REFERENCE.  If our refs == 1 then we're going to free this tree block.
 * If we're level 0 then we need to btrfs_dec_ref() on all of the data extents
 * in our current leaf.  After that we call btrfs_free_tree_block() on the
 * current node and walk up to the next node to walk down the next slot.
 */
static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct walk_control *wc, int max_level)
{
	int level = wc->level;
	int ret;

	path->slots[level] = btrfs_header_nritems(path->nodes[level]);
	while (level < max_level && path->nodes[level]) {
		wc->level = level;
		if (path->slots[level] + 1 <
		    btrfs_header_nritems(path->nodes[level])) {
			path->slots[level]++;
			return 0;
		} else {
			ret = walk_up_proc(trans, root, path, wc);
			if (ret > 0)
				return 0;
			if (ret < 0)
				return ret;

			if (path->locks[level]) {
				btrfs_tree_unlock_rw(path->nodes[level],
						     path->locks[level]);
				path->locks[level] = 0;
			}
			free_extent_buffer(path->nodes[level]);
			path->nodes[level] = NULL;
			level++;
		}
	}
	return 1;
}
/*
 * drop a subvolume tree.
 *
 * this function traverses the tree freeing any blocks that are only
 * referenced by the tree.
 *
 * when a shared tree block is found, this function decreases its
 * reference count by one.  if update_ref is true, this function
 * also makes sure backrefs for the shared block and all lower level
 * blocks are properly updated.
 *
 * If called with for_reloc == 0, may exit early with -EAGAIN
 */
int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
{
	const bool is_reloc_root = (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID);
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_path *path;
	struct btrfs_trans_handle *trans;
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root_item *root_item = &root->root_item;
	struct walk_control *wc;
	struct btrfs_key key;
	const u64 rootid = btrfs_root_id(root);
	int ret = 0;
	int level;
	bool root_dropped = false;
	bool unfinished_drop = false;

	btrfs_debug(fs_info, "Drop subvolume %llu", btrfs_root_id(root));

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	wc = kzalloc(sizeof(*wc), GFP_NOFS);
	if (!wc) {
		btrfs_free_path(path);
		ret = -ENOMEM;
		goto out;
	}

	/*
	 * Use join to avoid potential EINTR from transaction start. See
	 * wait_reserve_ticket and the whole reservation callchain.
	 */
	if (for_reloc)
		trans = btrfs_join_transaction(tree_root);
	else
		trans = btrfs_start_transaction(tree_root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_free;
	}

	ret = btrfs_run_delayed_items(trans);
	if (ret)
		goto out_end_trans;

	/*
	 * This will help us catch people modifying the fs tree while we're
	 * dropping it.  It is unsafe to mess with the fs tree while it's being
	 * dropped as we unlock the root node and parent nodes as we walk down
	 * the tree, assuming nothing will change.  If something does change
	 * then we'll have stale information and drop references to blocks we've
	 * already dropped.
	 */
	set_bit(BTRFS_ROOT_DELETING, &root->state);
	unfinished_drop = test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state);

	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
		level = btrfs_header_level(root->node);
		path->nodes[level] = btrfs_lock_root_node(root);
		path->slots[level] = 0;
		path->locks[level] = BTRFS_WRITE_LOCK;
		memset(&wc->update_progress, 0,
		       sizeof(wc->update_progress));
	} else {
		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
		memcpy(&wc->update_progress, &key,
		       sizeof(wc->update_progress));

		level = btrfs_root_drop_level(root_item);
		path->lowest_level = level;
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		path->lowest_level = 0;
		if (ret < 0)
			goto out_end_trans;

		WARN_ON(ret > 0);
		ret = 0;

		/*
		 * unlock our path, this is safe because only this
		 * function is allowed to delete this snapshot
		 */
		btrfs_unlock_up_safe(path, 0);

		level = btrfs_header_level(root->node);
		while (1) {
			btrfs_tree_lock(path->nodes[level]);
			path->locks[level] = BTRFS_WRITE_LOCK;

			/*
			 * btrfs_lookup_extent_info() returns 0 for success,
			 * or < 0 for error.
			 */
			ret = btrfs_lookup_extent_info(trans, fs_info,
						       path->nodes[level]->start,
						       level, 1, &wc->refs[level],
						       &wc->flags[level], NULL);
			if (ret < 0)
				goto out_end_trans;

			BUG_ON(wc->refs[level] == 0);

			if (level == btrfs_root_drop_level(root_item))
				break;

			btrfs_tree_unlock(path->nodes[level]);
			path->locks[level] = 0;
			WARN_ON(wc->refs[level] != 1);
			level--;
		}
	}

	wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
	wc->level = level;
	wc->shared_level = -1;
	wc->stage = DROP_REFERENCE;
	wc->update_ref = update_ref;
	wc->keep_locks = 0;
	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);

	while (1) {

		ret = walk_down_tree(trans, root, path, wc);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		if (ret > 0) {
			BUG_ON(wc->stage != DROP_REFERENCE);
			ret = 0;
			break;
		}

		if (wc->stage == DROP_REFERENCE) {
			wc->drop_level = wc->level;
			btrfs_node_key_to_cpu(path->nodes[wc->drop_level],
					      &wc->drop_progress,
					      path->slots[wc->drop_level]);
		}
		btrfs_cpu_key_to_disk(&root_item->drop_progress,
				      &wc->drop_progress);
		btrfs_set_root_drop_level(root_item, wc->drop_level);

		BUG_ON(wc->level == 0);
		if (btrfs_should_end_transaction(trans) ||
		    (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) {
			ret = btrfs_update_root(trans, tree_root,
						&root->root_key,
						root_item);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				goto out_end_trans;
			}

			if (!is_reloc_root)
				btrfs_set_last_root_drop_gen(fs_info, trans->transid);

			btrfs_end_transaction_throttle(trans);
			if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
				btrfs_debug(fs_info,
					    "drop snapshot early exit");
				ret = -EAGAIN;
				goto out_free;
			}

			/*
			 * Use join to avoid potential EINTR from transaction
			 * start. See wait_reserve_ticket and the whole
			 * reservation callchain.
			 */
			if (for_reloc)
				trans = btrfs_join_transaction(tree_root);
			else
				trans = btrfs_start_transaction(tree_root, 0);
			if (IS_ERR(trans)) {
				ret = PTR_ERR(trans);
				goto out_free;
			}
		}
	}
	btrfs_release_path(path);
	if (ret)
		goto out_end_trans;

	ret = btrfs_del_root(trans, &root->root_key);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out_end_trans;
	}

	if (!is_reloc_root) {
		ret = btrfs_find_root(tree_root, &root->root_key, path,
				      NULL, NULL);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			goto out_end_trans;
		} else if (ret > 0) {
			ret = 0;
			/*
			 * If we fail to delete the orphan item this time
			 * around, it'll get picked up the next time.
			 *
			 * The most common failure here is just -ENOENT.
			 */
			btrfs_del_orphan_item(trans, tree_root, btrfs_root_id(root));
		}
	}

	/*
	 * This subvolume is going to be completely dropped, and won't be
	 * recorded as dirty roots, thus pertrans meta rsv will not be freed at
	 * commit transaction time.  So free it here manually.
	 */
	btrfs_qgroup_convert_reserved_meta(root, INT_MAX);
	btrfs_qgroup_free_meta_all_pertrans(root);

	if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state))
		btrfs_add_dropped_root(trans, root);
	else
		btrfs_put_root(root);
	root_dropped = true;
out_end_trans:
	if (!is_reloc_root)
		btrfs_set_last_root_drop_gen(fs_info, trans->transid);

	btrfs_end_transaction_throttle(trans);
out_free:
	kfree(wc);
	btrfs_free_path(path);
out:
	if (!ret && root_dropped) {
		ret = btrfs_qgroup_cleanup_dropped_subvolume(fs_info, rootid);
		if (ret < 0)
			btrfs_warn_rl(fs_info,
				      "failed to cleanup qgroup 0/%llu: %d",
				      rootid, ret);
		ret = 0;
	}
	/*
	 * We were an unfinished drop root, check to see if there are any
	 * pending, and if not clear and wake up any waiters.
	 */
	if (!ret && unfinished_drop)
		btrfs_maybe_wake_unfinished_drop(fs_info);

	/*
	 * So if we need to stop dropping the snapshot for whatever reason we
	 * need to make sure to add it back to the dead root list so that we
	 * keep trying to do the work later.  This also cleans up roots if we
	 * don't have it in the radix (like when we recover after a power fail
	 * or unmount) so we don't leak memory.
	 */
	if (!for_reloc && !root_dropped)
		btrfs_add_dead_root(root);
	return ret;
}
/*
 * drop subtree rooted at tree block 'node'.
 *
 * NOTE: this function will unlock and release tree block 'node'
 * only used by relocation code
 */
int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root,
		       struct extent_buffer *node,
		       struct extent_buffer *parent)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_path *path;
	struct walk_control *wc;
	int level;
	int parent_level;
	int ret = 0;

	BUG_ON(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	wc = kzalloc(sizeof(*wc), GFP_NOFS);
	if (!wc) {
		btrfs_free_path(path);
		return -ENOMEM;
	}

	btrfs_assert_tree_write_locked(parent);
	parent_level = btrfs_header_level(parent);
	atomic_inc(&parent->refs);
	path->nodes[parent_level] = parent;
	path->slots[parent_level] = btrfs_header_nritems(parent);

	btrfs_assert_tree_write_locked(node);
	level = btrfs_header_level(node);
	path->nodes[level] = node;
	path->slots[level] = 0;
	path->locks[level] = BTRFS_WRITE_LOCK;

	wc->refs[parent_level] = 1;
	wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
	wc->level = level;
	wc->shared_level = -1;
	wc->stage = DROP_REFERENCE;
	wc->update_ref = 0;
	wc->keep_locks = 1;
	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);

	while (1) {
		ret = walk_down_tree(trans, root, path, wc);
		if (ret < 0)
			break;

		ret = walk_up_tree(trans, root, path, wc, parent_level);
		if (ret) {
			if (ret > 0)
				ret = 0;
			break;
		}
	}

	kfree(wc);
	btrfs_free_path(path);
	return ret;
}
/*
 * Unpin the extent range in an error context and don't add the space back.
 * Errors are not propagated further.
 */
void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end)
{
	unpin_extent_range(fs_info, start, end, false);
}
/*
 * It used to be that old block groups would be left around forever.
 * Iterating over them would be enough to trim unused space.  Since we
 * now automatically remove them, we also need to iterate over unallocated
 * space.
 *
 * We don't want a transaction for this since the discard may take a
 * substantial amount of time.  We don't require that a transaction be
 * running, but we do need to take a running transaction into account
 * to ensure that we're not discarding chunks that were released or
 * allocated in the current transaction.
 *
 * Holding the chunks lock will prevent other threads from allocating
 * or releasing chunks, but it won't prevent a running transaction
 * from committing and releasing the memory that the pending chunks
 * list head uses.  For that, we need to take a reference to the
 * transaction and hold the commit root sem.  We only need to hold
 * it while performing the free space search since we have already
 * held back allocations.
 */
static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
{
	u64 start = BTRFS_DEVICE_RANGE_RESERVED, len = 0, end = 0;
	int ret;

	*trimmed = 0;

	/* Discard not supported = nothing to do. */
	if (!bdev_max_discard_sectors(device->bdev))
		return 0;

	/* Not writable = nothing to do. */
	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
		return 0;

	/* No free space = nothing to do. */
	if (device->total_bytes <= device->bytes_used)
		return 0;

	ret = 0;

	while (1) {
		struct btrfs_fs_info *fs_info = device->fs_info;
		u64 bytes;

		ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
		if (ret)
			break;

		find_first_clear_extent_bit(&device->alloc_state, start,
					    &start, &end,
					    CHUNK_TRIMMED | CHUNK_ALLOCATED);

		/* Check if there are any CHUNK_* bits left */
		if (start > device->total_bytes) {
			WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
			btrfs_warn_in_rcu(fs_info,
"ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu",
					  start, end - start + 1,
					  btrfs_dev_name(device),
					  device->total_bytes);
			mutex_unlock(&fs_info->chunk_mutex);
			ret = 0;
			break;
		}

		/* Ensure we skip the reserved space on each device. */
		start = max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED);

		/*
		 * If find_first_clear_extent_bit finds a range that spans the
		 * end of the device it will set end to -1, in this case it's up
		 * to the caller to trim the value to the size of the device.
		 */
		end = min(end, device->total_bytes - 1);

		len = end - start + 1;

		/* We didn't find any extents */
		if (!len) {
			mutex_unlock(&fs_info->chunk_mutex);
			ret = 0;
			break;
		}

		ret = btrfs_issue_discard(device->bdev, start, len,
					  &bytes);
		if (!ret)
			set_extent_bit(&device->alloc_state, start,
				       start + bytes - 1, CHUNK_TRIMMED, NULL);
		mutex_unlock(&fs_info->chunk_mutex);

		if (ret)
			break;

		start += len;
		*trimmed += bytes;

		if (btrfs_trim_interrupted()) {
			ret = -ERESTARTSYS;
			break;
		}

		cond_resched();
	}

	return ret;
}
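/*
 * Editor's sketch (illustrative only): each iteration of the loop above trims
 * one clear range of the device's alloc_state.  The end reported by
 * find_first_clear_extent_bit() may be -1 when the range runs past the
 * device, so it is clamped before the length is computed:
 *
 *	start = max(start, BTRFS_DEVICE_RANGE_RESERVED);
 *	end   = min(end, total_bytes - 1);
 *	len   = end - start + 1;
 *
 * and the next search resumes at start + len.
 */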
/*
 * Trim the whole filesystem by:
 * 1) trimming the free space in each block group
 * 2) trimming the unallocated space on each device
 *
 * This will also continue trimming even if a block group or device encounters
 * an error.  The return value will be the last error, or 0 if nothing bad
 * happens.
 */
int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_block_group *cache = NULL;
	struct btrfs_device *device;
	u64 group_trimmed;
	u64 range_end = U64_MAX;
	u64 start;
	u64 end;
	u64 trimmed = 0;
	u64 bg_failed = 0;
	u64 dev_failed = 0;
	int bg_ret = 0;
	int dev_ret = 0;
	int ret = 0;

	if (range->start == U64_MAX)
		return -EINVAL;

	/*
	 * Check range overflow if range->len is set.
	 * The default range->len is U64_MAX.
	 */
	if (range->len != U64_MAX &&
	    check_add_overflow(range->start, range->len, &range_end))
		return -EINVAL;

	cache = btrfs_lookup_first_block_group(fs_info, range->start);
	for (; cache; cache = btrfs_next_block_group(cache)) {
		if (cache->start >= range_end) {
			btrfs_put_block_group(cache);
			break;
		}

		start = max(range->start, cache->start);
		end = min(range_end, cache->start + cache->length);

		if (end - start >= range->minlen) {
			if (!btrfs_block_group_done(cache)) {
				ret = btrfs_cache_block_group(cache, true);
				if (ret) {
					bg_failed++;
					bg_ret = ret;
					continue;
				}
			}
			ret = btrfs_trim_block_group(cache,
						     &group_trimmed,
						     start,
						     end,
						     range->minlen);

			trimmed += group_trimmed;
			if (ret) {
				bg_failed++;
				bg_ret = ret;
			}
		}
	}

	if (bg_failed)
		btrfs_warn(fs_info,
			   "failed to trim %llu block group(s), last error %d",
			   bg_failed, bg_ret);

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
			continue;

		ret = btrfs_trim_free_extents(device, &group_trimmed);

		trimmed += group_trimmed;
		if (ret) {
			dev_failed++;
			dev_ret = ret;
			break;
		}
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	if (dev_failed)
		btrfs_warn(fs_info,
			   "failed to trim %llu device(s), last error %d",
			   dev_failed, dev_ret);

	range->len = trimmed;
	if (bg_ret)
		return bg_ret;
	return dev_ret;
}