1 // SPDX-License-Identifier: GPL-2.0
3 * Copyright (C) 2007 Oracle. All rights reserved.
6 #include <linux/sched.h>
7 #include <linux/sched/signal.h>
8 #include <linux/pagemap.h>
9 #include <linux/writeback.h>
10 #include <linux/blkdev.h>
11 #include <linux/sort.h>
12 #include <linux/rcupdate.h>
13 #include <linux/kthread.h>
14 #include <linux/slab.h>
15 #include <linux/ratelimit.h>
16 #include <linux/percpu_counter.h>
17 #include <linux/lockdep.h>
18 #include <linux/crc32c.h>
22 #include "print-tree.h"
26 #include "free-space-cache.h"
27 #include "free-space-tree.h"
30 #include "ref-verify.h"
31 #include "space-info.h"
32 #include "block-rsv.h"
33 #include "delalloc-space.h"
34 #include "block-group.h"
36 #undef SCRAMBLE_DELAYED_REFS
39 static int __btrfs_free_extent(struct btrfs_trans_handle
*trans
,
40 struct btrfs_delayed_ref_node
*node
, u64 parent
,
41 u64 root_objectid
, u64 owner_objectid
,
42 u64 owner_offset
, int refs_to_drop
,
43 struct btrfs_delayed_extent_op
*extra_op
);
44 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op
*extent_op
,
45 struct extent_buffer
*leaf
,
46 struct btrfs_extent_item
*ei
);
47 static int alloc_reserved_file_extent(struct btrfs_trans_handle
*trans
,
48 u64 parent
, u64 root_objectid
,
49 u64 flags
, u64 owner
, u64 offset
,
50 struct btrfs_key
*ins
, int ref_mod
);
51 static int alloc_reserved_tree_block(struct btrfs_trans_handle
*trans
,
52 struct btrfs_delayed_ref_node
*node
,
53 struct btrfs_delayed_extent_op
*extent_op
);
54 static int find_next_key(struct btrfs_path
*path
, int level
,
55 struct btrfs_key
*key
);
57 static int block_group_bits(struct btrfs_block_group_cache
*cache
, u64 bits
)
59 return (cache
->flags
& bits
) == bits
;
62 int btrfs_add_excluded_extent(struct btrfs_fs_info
*fs_info
,
63 u64 start
, u64 num_bytes
)
65 u64 end
= start
+ num_bytes
- 1;
66 set_extent_bits(&fs_info
->freed_extents
[0],
67 start
, end
, EXTENT_UPTODATE
);
68 set_extent_bits(&fs_info
->freed_extents
[1],
69 start
, end
, EXTENT_UPTODATE
);
73 void btrfs_free_excluded_extents(struct btrfs_block_group_cache
*cache
)
75 struct btrfs_fs_info
*fs_info
= cache
->fs_info
;
78 start
= cache
->key
.objectid
;
79 end
= start
+ cache
->key
.offset
- 1;
81 clear_extent_bits(&fs_info
->freed_extents
[0],
82 start
, end
, EXTENT_UPTODATE
);
83 clear_extent_bits(&fs_info
->freed_extents
[1],
84 start
, end
, EXTENT_UPTODATE
);
87 static u64
generic_ref_to_space_flags(struct btrfs_ref
*ref
)
89 if (ref
->type
== BTRFS_REF_METADATA
) {
90 if (ref
->tree_ref
.root
== BTRFS_CHUNK_TREE_OBJECTID
)
91 return BTRFS_BLOCK_GROUP_SYSTEM
;
93 return BTRFS_BLOCK_GROUP_METADATA
;
95 return BTRFS_BLOCK_GROUP_DATA
;
98 static void add_pinned_bytes(struct btrfs_fs_info
*fs_info
,
99 struct btrfs_ref
*ref
)
101 struct btrfs_space_info
*space_info
;
102 u64 flags
= generic_ref_to_space_flags(ref
);
104 space_info
= btrfs_find_space_info(fs_info
, flags
);
106 percpu_counter_add_batch(&space_info
->total_bytes_pinned
, ref
->len
,
107 BTRFS_TOTAL_BYTES_PINNED_BATCH
);
110 static void sub_pinned_bytes(struct btrfs_fs_info
*fs_info
,
111 struct btrfs_ref
*ref
)
113 struct btrfs_space_info
*space_info
;
114 u64 flags
= generic_ref_to_space_flags(ref
);
116 space_info
= btrfs_find_space_info(fs_info
, flags
);
118 percpu_counter_add_batch(&space_info
->total_bytes_pinned
, -ref
->len
,
119 BTRFS_TOTAL_BYTES_PINNED_BATCH
);
122 /* simple helper to search for an existing data extent at a given offset */
123 int btrfs_lookup_data_extent(struct btrfs_fs_info
*fs_info
, u64 start
, u64 len
)
126 struct btrfs_key key
;
127 struct btrfs_path
*path
;
129 path
= btrfs_alloc_path();
133 key
.objectid
= start
;
135 key
.type
= BTRFS_EXTENT_ITEM_KEY
;
136 ret
= btrfs_search_slot(NULL
, fs_info
->extent_root
, &key
, path
, 0, 0);
137 btrfs_free_path(path
);
142 * helper function to lookup reference count and flags of a tree block.
144 * the head node for delayed ref is used to store the sum of all the
145 * reference count modifications queued up in the rbtree. the head
146 * node may also store the extent flags to set. This way you can check
147 * to see what the reference count and extent flags would be if all of
148 * the delayed refs are not processed.
150 int btrfs_lookup_extent_info(struct btrfs_trans_handle
*trans
,
151 struct btrfs_fs_info
*fs_info
, u64 bytenr
,
152 u64 offset
, int metadata
, u64
*refs
, u64
*flags
)
154 struct btrfs_delayed_ref_head
*head
;
155 struct btrfs_delayed_ref_root
*delayed_refs
;
156 struct btrfs_path
*path
;
157 struct btrfs_extent_item
*ei
;
158 struct extent_buffer
*leaf
;
159 struct btrfs_key key
;
166 * If we don't have skinny metadata, don't bother doing anything
169 if (metadata
&& !btrfs_fs_incompat(fs_info
, SKINNY_METADATA
)) {
170 offset
= fs_info
->nodesize
;
174 path
= btrfs_alloc_path();
179 path
->skip_locking
= 1;
180 path
->search_commit_root
= 1;
184 key
.objectid
= bytenr
;
187 key
.type
= BTRFS_METADATA_ITEM_KEY
;
189 key
.type
= BTRFS_EXTENT_ITEM_KEY
;
191 ret
= btrfs_search_slot(trans
, fs_info
->extent_root
, &key
, path
, 0, 0);
195 if (ret
> 0 && metadata
&& key
.type
== BTRFS_METADATA_ITEM_KEY
) {
196 if (path
->slots
[0]) {
198 btrfs_item_key_to_cpu(path
->nodes
[0], &key
,
200 if (key
.objectid
== bytenr
&&
201 key
.type
== BTRFS_EXTENT_ITEM_KEY
&&
202 key
.offset
== fs_info
->nodesize
)
208 leaf
= path
->nodes
[0];
209 item_size
= btrfs_item_size_nr(leaf
, path
->slots
[0]);
210 if (item_size
>= sizeof(*ei
)) {
211 ei
= btrfs_item_ptr(leaf
, path
->slots
[0],
212 struct btrfs_extent_item
);
213 num_refs
= btrfs_extent_refs(leaf
, ei
);
214 extent_flags
= btrfs_extent_flags(leaf
, ei
);
217 btrfs_print_v0_err(fs_info
);
219 btrfs_abort_transaction(trans
, ret
);
221 btrfs_handle_fs_error(fs_info
, ret
, NULL
);
226 BUG_ON(num_refs
== 0);
236 delayed_refs
= &trans
->transaction
->delayed_refs
;
237 spin_lock(&delayed_refs
->lock
);
238 head
= btrfs_find_delayed_ref_head(delayed_refs
, bytenr
);
240 if (!mutex_trylock(&head
->mutex
)) {
241 refcount_inc(&head
->refs
);
242 spin_unlock(&delayed_refs
->lock
);
244 btrfs_release_path(path
);
247 * Mutex was contended, block until it's released and try
250 mutex_lock(&head
->mutex
);
251 mutex_unlock(&head
->mutex
);
252 btrfs_put_delayed_ref_head(head
);
255 spin_lock(&head
->lock
);
256 if (head
->extent_op
&& head
->extent_op
->update_flags
)
257 extent_flags
|= head
->extent_op
->flags_to_set
;
259 BUG_ON(num_refs
== 0);
261 num_refs
+= head
->ref_mod
;
262 spin_unlock(&head
->lock
);
263 mutex_unlock(&head
->mutex
);
265 spin_unlock(&delayed_refs
->lock
);
267 WARN_ON(num_refs
== 0);
271 *flags
= extent_flags
;
273 btrfs_free_path(path
);
278 * Back reference rules. Back refs have three main goals:
280 * 1) differentiate between all holders of references to an extent so that
281 * when a reference is dropped we can make sure it was a valid reference
282 * before freeing the extent.
284 * 2) Provide enough information to quickly find the holders of an extent
285 * if we notice a given block is corrupted or bad.
287 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
288 * maintenance. This is actually the same as #2, but with a slightly
289 * different use case.
291 * There are two kinds of back refs. The implicit back refs is optimized
292 * for pointers in non-shared tree blocks. For a given pointer in a block,
293 * back refs of this kind provide information about the block's owner tree
294 * and the pointer's key. These information allow us to find the block by
295 * b-tree searching. The full back refs is for pointers in tree blocks not
296 * referenced by their owner trees. The location of tree block is recorded
297 * in the back refs. Actually the full back refs is generic, and can be
298 * used in all cases the implicit back refs is used. The major shortcoming
299 * of the full back refs is its overhead. Every time a tree block gets
300 * COWed, we have to update back refs entry for all pointers in it.
302 * For a newly allocated tree block, we use implicit back refs for
303 * pointers in it. This means most tree related operations only involve
304 * implicit back refs. For a tree block created in old transaction, the
305 * only way to drop a reference to it is COW it. So we can detect the
306 * event that tree block loses its owner tree's reference and do the
307 * back refs conversion.
309 * When a tree block is COWed through a tree, there are four cases:
311 * The reference count of the block is one and the tree is the block's
312 * owner tree. Nothing to do in this case.
314 * The reference count of the block is one and the tree is not the
315 * block's owner tree. In this case, full back refs is used for pointers
316 * in the block. Remove these full back refs, add implicit back refs for
317 * every pointers in the new block.
319 * The reference count of the block is greater than one and the tree is
320 * the block's owner tree. In this case, implicit back refs is used for
321 * pointers in the block. Add full back refs for every pointers in the
322 * block, increase lower level extents' reference counts. The original
323 * implicit back refs are entailed to the new block.
325 * The reference count of the block is greater than one and the tree is
326 * not the block's owner tree. Add implicit back refs for every pointer in
327 * the new block, increase lower level extents' reference count.
329 * Back Reference Key composing:
331 * The key objectid corresponds to the first byte in the extent,
332 * The key type is used to differentiate between types of back refs.
333 * There are different meanings of the key offset for different types
336 * File extents can be referenced by:
338 * - multiple snapshots, subvolumes, or different generations in one subvol
339 * - different files inside a single subvolume
340 * - different offsets inside a file (bookend extents in file.c)
342 * The extent ref structure for the implicit back refs has fields for:
344 * - Objectid of the subvolume root
345 * - objectid of the file holding the reference
346 * - original offset in the file
347 * - how many bookend extents
349 * The key offset for the implicit back refs is hash of the first
352 * The extent ref structure for the full back refs has field for:
354 * - number of pointers in the tree leaf
356 * The key offset for the implicit back refs is the first byte of
359 * When a file extent is allocated, The implicit back refs is used.
360 * the fields are filled in:
362 * (root_key.objectid, inode objectid, offset in file, 1)
364 * When a file extent is removed file truncation, we find the
365 * corresponding implicit back refs and check the following fields:
367 * (btrfs_header_owner(leaf), inode objectid, offset in file)
369 * Btree extents can be referenced by:
371 * - Different subvolumes
373 * Both the implicit back refs and the full back refs for tree blocks
374 * only consist of key. The key offset for the implicit back refs is
375 * objectid of block's owner tree. The key offset for the full back refs
376 * is the first byte of parent block.
378 * When implicit back refs is used, information about the lowest key and
379 * level of the tree block are required. These information are stored in
380 * tree block info structure.
384 * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
385 * is_data == BTRFS_REF_TYPE_DATA, data type is requiried,
386 * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
388 int btrfs_get_extent_inline_ref_type(const struct extent_buffer
*eb
,
389 struct btrfs_extent_inline_ref
*iref
,
390 enum btrfs_inline_ref_type is_data
)
392 int type
= btrfs_extent_inline_ref_type(eb
, iref
);
393 u64 offset
= btrfs_extent_inline_ref_offset(eb
, iref
);
395 if (type
== BTRFS_TREE_BLOCK_REF_KEY
||
396 type
== BTRFS_SHARED_BLOCK_REF_KEY
||
397 type
== BTRFS_SHARED_DATA_REF_KEY
||
398 type
== BTRFS_EXTENT_DATA_REF_KEY
) {
399 if (is_data
== BTRFS_REF_TYPE_BLOCK
) {
400 if (type
== BTRFS_TREE_BLOCK_REF_KEY
)
402 if (type
== BTRFS_SHARED_BLOCK_REF_KEY
) {
405 * Every shared one has parent tree
406 * block, which must be aligned to
410 IS_ALIGNED(offset
, eb
->fs_info
->nodesize
))
413 } else if (is_data
== BTRFS_REF_TYPE_DATA
) {
414 if (type
== BTRFS_EXTENT_DATA_REF_KEY
)
416 if (type
== BTRFS_SHARED_DATA_REF_KEY
) {
419 * Every shared one has parent tree
420 * block, which must be aligned to
424 IS_ALIGNED(offset
, eb
->fs_info
->nodesize
))
428 ASSERT(is_data
== BTRFS_REF_TYPE_ANY
);
433 btrfs_print_leaf((struct extent_buffer
*)eb
);
434 btrfs_err(eb
->fs_info
, "eb %llu invalid extent inline ref type %d",
438 return BTRFS_REF_TYPE_INVALID
;
441 u64
hash_extent_data_ref(u64 root_objectid
, u64 owner
, u64 offset
)
443 u32 high_crc
= ~(u32
)0;
444 u32 low_crc
= ~(u32
)0;
447 lenum
= cpu_to_le64(root_objectid
);
448 high_crc
= btrfs_crc32c(high_crc
, &lenum
, sizeof(lenum
));
449 lenum
= cpu_to_le64(owner
);
450 low_crc
= btrfs_crc32c(low_crc
, &lenum
, sizeof(lenum
));
451 lenum
= cpu_to_le64(offset
);
452 low_crc
= btrfs_crc32c(low_crc
, &lenum
, sizeof(lenum
));
454 return ((u64
)high_crc
<< 31) ^ (u64
)low_crc
;
457 static u64
hash_extent_data_ref_item(struct extent_buffer
*leaf
,
458 struct btrfs_extent_data_ref
*ref
)
460 return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf
, ref
),
461 btrfs_extent_data_ref_objectid(leaf
, ref
),
462 btrfs_extent_data_ref_offset(leaf
, ref
));
465 static int match_extent_data_ref(struct extent_buffer
*leaf
,
466 struct btrfs_extent_data_ref
*ref
,
467 u64 root_objectid
, u64 owner
, u64 offset
)
469 if (btrfs_extent_data_ref_root(leaf
, ref
) != root_objectid
||
470 btrfs_extent_data_ref_objectid(leaf
, ref
) != owner
||
471 btrfs_extent_data_ref_offset(leaf
, ref
) != offset
)
476 static noinline
int lookup_extent_data_ref(struct btrfs_trans_handle
*trans
,
477 struct btrfs_path
*path
,
478 u64 bytenr
, u64 parent
,
480 u64 owner
, u64 offset
)
482 struct btrfs_root
*root
= trans
->fs_info
->extent_root
;
483 struct btrfs_key key
;
484 struct btrfs_extent_data_ref
*ref
;
485 struct extent_buffer
*leaf
;
491 key
.objectid
= bytenr
;
493 key
.type
= BTRFS_SHARED_DATA_REF_KEY
;
496 key
.type
= BTRFS_EXTENT_DATA_REF_KEY
;
497 key
.offset
= hash_extent_data_ref(root_objectid
,
502 ret
= btrfs_search_slot(trans
, root
, &key
, path
, -1, 1);
514 leaf
= path
->nodes
[0];
515 nritems
= btrfs_header_nritems(leaf
);
517 if (path
->slots
[0] >= nritems
) {
518 ret
= btrfs_next_leaf(root
, path
);
524 leaf
= path
->nodes
[0];
525 nritems
= btrfs_header_nritems(leaf
);
529 btrfs_item_key_to_cpu(leaf
, &key
, path
->slots
[0]);
530 if (key
.objectid
!= bytenr
||
531 key
.type
!= BTRFS_EXTENT_DATA_REF_KEY
)
534 ref
= btrfs_item_ptr(leaf
, path
->slots
[0],
535 struct btrfs_extent_data_ref
);
537 if (match_extent_data_ref(leaf
, ref
, root_objectid
,
540 btrfs_release_path(path
);
552 static noinline
int insert_extent_data_ref(struct btrfs_trans_handle
*trans
,
553 struct btrfs_path
*path
,
554 u64 bytenr
, u64 parent
,
555 u64 root_objectid
, u64 owner
,
556 u64 offset
, int refs_to_add
)
558 struct btrfs_root
*root
= trans
->fs_info
->extent_root
;
559 struct btrfs_key key
;
560 struct extent_buffer
*leaf
;
565 key
.objectid
= bytenr
;
567 key
.type
= BTRFS_SHARED_DATA_REF_KEY
;
569 size
= sizeof(struct btrfs_shared_data_ref
);
571 key
.type
= BTRFS_EXTENT_DATA_REF_KEY
;
572 key
.offset
= hash_extent_data_ref(root_objectid
,
574 size
= sizeof(struct btrfs_extent_data_ref
);
577 ret
= btrfs_insert_empty_item(trans
, root
, path
, &key
, size
);
578 if (ret
&& ret
!= -EEXIST
)
581 leaf
= path
->nodes
[0];
583 struct btrfs_shared_data_ref
*ref
;
584 ref
= btrfs_item_ptr(leaf
, path
->slots
[0],
585 struct btrfs_shared_data_ref
);
587 btrfs_set_shared_data_ref_count(leaf
, ref
, refs_to_add
);
589 num_refs
= btrfs_shared_data_ref_count(leaf
, ref
);
590 num_refs
+= refs_to_add
;
591 btrfs_set_shared_data_ref_count(leaf
, ref
, num_refs
);
594 struct btrfs_extent_data_ref
*ref
;
595 while (ret
== -EEXIST
) {
596 ref
= btrfs_item_ptr(leaf
, path
->slots
[0],
597 struct btrfs_extent_data_ref
);
598 if (match_extent_data_ref(leaf
, ref
, root_objectid
,
601 btrfs_release_path(path
);
603 ret
= btrfs_insert_empty_item(trans
, root
, path
, &key
,
605 if (ret
&& ret
!= -EEXIST
)
608 leaf
= path
->nodes
[0];
610 ref
= btrfs_item_ptr(leaf
, path
->slots
[0],
611 struct btrfs_extent_data_ref
);
613 btrfs_set_extent_data_ref_root(leaf
, ref
,
615 btrfs_set_extent_data_ref_objectid(leaf
, ref
, owner
);
616 btrfs_set_extent_data_ref_offset(leaf
, ref
, offset
);
617 btrfs_set_extent_data_ref_count(leaf
, ref
, refs_to_add
);
619 num_refs
= btrfs_extent_data_ref_count(leaf
, ref
);
620 num_refs
+= refs_to_add
;
621 btrfs_set_extent_data_ref_count(leaf
, ref
, num_refs
);
624 btrfs_mark_buffer_dirty(leaf
);
627 btrfs_release_path(path
);
631 static noinline
int remove_extent_data_ref(struct btrfs_trans_handle
*trans
,
632 struct btrfs_path
*path
,
633 int refs_to_drop
, int *last_ref
)
635 struct btrfs_key key
;
636 struct btrfs_extent_data_ref
*ref1
= NULL
;
637 struct btrfs_shared_data_ref
*ref2
= NULL
;
638 struct extent_buffer
*leaf
;
642 leaf
= path
->nodes
[0];
643 btrfs_item_key_to_cpu(leaf
, &key
, path
->slots
[0]);
645 if (key
.type
== BTRFS_EXTENT_DATA_REF_KEY
) {
646 ref1
= btrfs_item_ptr(leaf
, path
->slots
[0],
647 struct btrfs_extent_data_ref
);
648 num_refs
= btrfs_extent_data_ref_count(leaf
, ref1
);
649 } else if (key
.type
== BTRFS_SHARED_DATA_REF_KEY
) {
650 ref2
= btrfs_item_ptr(leaf
, path
->slots
[0],
651 struct btrfs_shared_data_ref
);
652 num_refs
= btrfs_shared_data_ref_count(leaf
, ref2
);
653 } else if (unlikely(key
.type
== BTRFS_EXTENT_REF_V0_KEY
)) {
654 btrfs_print_v0_err(trans
->fs_info
);
655 btrfs_abort_transaction(trans
, -EINVAL
);
661 BUG_ON(num_refs
< refs_to_drop
);
662 num_refs
-= refs_to_drop
;
665 ret
= btrfs_del_item(trans
, trans
->fs_info
->extent_root
, path
);
668 if (key
.type
== BTRFS_EXTENT_DATA_REF_KEY
)
669 btrfs_set_extent_data_ref_count(leaf
, ref1
, num_refs
);
670 else if (key
.type
== BTRFS_SHARED_DATA_REF_KEY
)
671 btrfs_set_shared_data_ref_count(leaf
, ref2
, num_refs
);
672 btrfs_mark_buffer_dirty(leaf
);
677 static noinline u32
extent_data_ref_count(struct btrfs_path
*path
,
678 struct btrfs_extent_inline_ref
*iref
)
680 struct btrfs_key key
;
681 struct extent_buffer
*leaf
;
682 struct btrfs_extent_data_ref
*ref1
;
683 struct btrfs_shared_data_ref
*ref2
;
687 leaf
= path
->nodes
[0];
688 btrfs_item_key_to_cpu(leaf
, &key
, path
->slots
[0]);
690 BUG_ON(key
.type
== BTRFS_EXTENT_REF_V0_KEY
);
693 * If type is invalid, we should have bailed out earlier than
696 type
= btrfs_get_extent_inline_ref_type(leaf
, iref
, BTRFS_REF_TYPE_DATA
);
697 ASSERT(type
!= BTRFS_REF_TYPE_INVALID
);
698 if (type
== BTRFS_EXTENT_DATA_REF_KEY
) {
699 ref1
= (struct btrfs_extent_data_ref
*)(&iref
->offset
);
700 num_refs
= btrfs_extent_data_ref_count(leaf
, ref1
);
702 ref2
= (struct btrfs_shared_data_ref
*)(iref
+ 1);
703 num_refs
= btrfs_shared_data_ref_count(leaf
, ref2
);
705 } else if (key
.type
== BTRFS_EXTENT_DATA_REF_KEY
) {
706 ref1
= btrfs_item_ptr(leaf
, path
->slots
[0],
707 struct btrfs_extent_data_ref
);
708 num_refs
= btrfs_extent_data_ref_count(leaf
, ref1
);
709 } else if (key
.type
== BTRFS_SHARED_DATA_REF_KEY
) {
710 ref2
= btrfs_item_ptr(leaf
, path
->slots
[0],
711 struct btrfs_shared_data_ref
);
712 num_refs
= btrfs_shared_data_ref_count(leaf
, ref2
);
719 static noinline
int lookup_tree_block_ref(struct btrfs_trans_handle
*trans
,
720 struct btrfs_path
*path
,
721 u64 bytenr
, u64 parent
,
724 struct btrfs_root
*root
= trans
->fs_info
->extent_root
;
725 struct btrfs_key key
;
728 key
.objectid
= bytenr
;
730 key
.type
= BTRFS_SHARED_BLOCK_REF_KEY
;
733 key
.type
= BTRFS_TREE_BLOCK_REF_KEY
;
734 key
.offset
= root_objectid
;
737 ret
= btrfs_search_slot(trans
, root
, &key
, path
, -1, 1);
743 static noinline
int insert_tree_block_ref(struct btrfs_trans_handle
*trans
,
744 struct btrfs_path
*path
,
745 u64 bytenr
, u64 parent
,
748 struct btrfs_key key
;
751 key
.objectid
= bytenr
;
753 key
.type
= BTRFS_SHARED_BLOCK_REF_KEY
;
756 key
.type
= BTRFS_TREE_BLOCK_REF_KEY
;
757 key
.offset
= root_objectid
;
760 ret
= btrfs_insert_empty_item(trans
, trans
->fs_info
->extent_root
,
762 btrfs_release_path(path
);
766 static inline int extent_ref_type(u64 parent
, u64 owner
)
769 if (owner
< BTRFS_FIRST_FREE_OBJECTID
) {
771 type
= BTRFS_SHARED_BLOCK_REF_KEY
;
773 type
= BTRFS_TREE_BLOCK_REF_KEY
;
776 type
= BTRFS_SHARED_DATA_REF_KEY
;
778 type
= BTRFS_EXTENT_DATA_REF_KEY
;
783 static int find_next_key(struct btrfs_path
*path
, int level
,
784 struct btrfs_key
*key
)
787 for (; level
< BTRFS_MAX_LEVEL
; level
++) {
788 if (!path
->nodes
[level
])
790 if (path
->slots
[level
] + 1 >=
791 btrfs_header_nritems(path
->nodes
[level
]))
794 btrfs_item_key_to_cpu(path
->nodes
[level
], key
,
795 path
->slots
[level
] + 1);
797 btrfs_node_key_to_cpu(path
->nodes
[level
], key
,
798 path
->slots
[level
] + 1);
805 * look for inline back ref. if back ref is found, *ref_ret is set
806 * to the address of inline back ref, and 0 is returned.
808 * if back ref isn't found, *ref_ret is set to the address where it
809 * should be inserted, and -ENOENT is returned.
811 * if insert is true and there are too many inline back refs, the path
812 * points to the extent item, and -EAGAIN is returned.
814 * NOTE: inline back refs are ordered in the same way that back ref
815 * items in the tree are ordered.
817 static noinline_for_stack
818 int lookup_inline_extent_backref(struct btrfs_trans_handle
*trans
,
819 struct btrfs_path
*path
,
820 struct btrfs_extent_inline_ref
**ref_ret
,
821 u64 bytenr
, u64 num_bytes
,
822 u64 parent
, u64 root_objectid
,
823 u64 owner
, u64 offset
, int insert
)
825 struct btrfs_fs_info
*fs_info
= trans
->fs_info
;
826 struct btrfs_root
*root
= fs_info
->extent_root
;
827 struct btrfs_key key
;
828 struct extent_buffer
*leaf
;
829 struct btrfs_extent_item
*ei
;
830 struct btrfs_extent_inline_ref
*iref
;
840 bool skinny_metadata
= btrfs_fs_incompat(fs_info
, SKINNY_METADATA
);
843 key
.objectid
= bytenr
;
844 key
.type
= BTRFS_EXTENT_ITEM_KEY
;
845 key
.offset
= num_bytes
;
847 want
= extent_ref_type(parent
, owner
);
849 extra_size
= btrfs_extent_inline_ref_size(want
);
850 path
->keep_locks
= 1;
855 * Owner is our level, so we can just add one to get the level for the
856 * block we are interested in.
858 if (skinny_metadata
&& owner
< BTRFS_FIRST_FREE_OBJECTID
) {
859 key
.type
= BTRFS_METADATA_ITEM_KEY
;
864 ret
= btrfs_search_slot(trans
, root
, &key
, path
, extra_size
, 1);
871 * We may be a newly converted file system which still has the old fat
872 * extent entries for metadata, so try and see if we have one of those.
874 if (ret
> 0 && skinny_metadata
) {
875 skinny_metadata
= false;
876 if (path
->slots
[0]) {
878 btrfs_item_key_to_cpu(path
->nodes
[0], &key
,
880 if (key
.objectid
== bytenr
&&
881 key
.type
== BTRFS_EXTENT_ITEM_KEY
&&
882 key
.offset
== num_bytes
)
886 key
.objectid
= bytenr
;
887 key
.type
= BTRFS_EXTENT_ITEM_KEY
;
888 key
.offset
= num_bytes
;
889 btrfs_release_path(path
);
894 if (ret
&& !insert
) {
897 } else if (WARN_ON(ret
)) {
902 leaf
= path
->nodes
[0];
903 item_size
= btrfs_item_size_nr(leaf
, path
->slots
[0]);
904 if (unlikely(item_size
< sizeof(*ei
))) {
906 btrfs_print_v0_err(fs_info
);
907 btrfs_abort_transaction(trans
, err
);
911 ei
= btrfs_item_ptr(leaf
, path
->slots
[0], struct btrfs_extent_item
);
912 flags
= btrfs_extent_flags(leaf
, ei
);
914 ptr
= (unsigned long)(ei
+ 1);
915 end
= (unsigned long)ei
+ item_size
;
917 if (flags
& BTRFS_EXTENT_FLAG_TREE_BLOCK
&& !skinny_metadata
) {
918 ptr
+= sizeof(struct btrfs_tree_block_info
);
922 if (owner
>= BTRFS_FIRST_FREE_OBJECTID
)
923 needed
= BTRFS_REF_TYPE_DATA
;
925 needed
= BTRFS_REF_TYPE_BLOCK
;
933 iref
= (struct btrfs_extent_inline_ref
*)ptr
;
934 type
= btrfs_get_extent_inline_ref_type(leaf
, iref
, needed
);
935 if (type
== BTRFS_REF_TYPE_INVALID
) {
943 ptr
+= btrfs_extent_inline_ref_size(type
);
947 if (type
== BTRFS_EXTENT_DATA_REF_KEY
) {
948 struct btrfs_extent_data_ref
*dref
;
949 dref
= (struct btrfs_extent_data_ref
*)(&iref
->offset
);
950 if (match_extent_data_ref(leaf
, dref
, root_objectid
,
955 if (hash_extent_data_ref_item(leaf
, dref
) <
956 hash_extent_data_ref(root_objectid
, owner
, offset
))
960 ref_offset
= btrfs_extent_inline_ref_offset(leaf
, iref
);
962 if (parent
== ref_offset
) {
966 if (ref_offset
< parent
)
969 if (root_objectid
== ref_offset
) {
973 if (ref_offset
< root_objectid
)
977 ptr
+= btrfs_extent_inline_ref_size(type
);
979 if (err
== -ENOENT
&& insert
) {
980 if (item_size
+ extra_size
>=
981 BTRFS_MAX_EXTENT_ITEM_SIZE(root
)) {
986 * To add new inline back ref, we have to make sure
987 * there is no corresponding back ref item.
988 * For simplicity, we just do not add new inline back
989 * ref if there is any kind of item for this block
991 if (find_next_key(path
, 0, &key
) == 0 &&
992 key
.objectid
== bytenr
&&
993 key
.type
< BTRFS_BLOCK_GROUP_ITEM_KEY
) {
998 *ref_ret
= (struct btrfs_extent_inline_ref
*)ptr
;
1001 path
->keep_locks
= 0;
1002 btrfs_unlock_up_safe(path
, 1);
1008 * helper to add new inline back ref
1010 static noinline_for_stack
1011 void setup_inline_extent_backref(struct btrfs_fs_info
*fs_info
,
1012 struct btrfs_path
*path
,
1013 struct btrfs_extent_inline_ref
*iref
,
1014 u64 parent
, u64 root_objectid
,
1015 u64 owner
, u64 offset
, int refs_to_add
,
1016 struct btrfs_delayed_extent_op
*extent_op
)
1018 struct extent_buffer
*leaf
;
1019 struct btrfs_extent_item
*ei
;
1022 unsigned long item_offset
;
1027 leaf
= path
->nodes
[0];
1028 ei
= btrfs_item_ptr(leaf
, path
->slots
[0], struct btrfs_extent_item
);
1029 item_offset
= (unsigned long)iref
- (unsigned long)ei
;
1031 type
= extent_ref_type(parent
, owner
);
1032 size
= btrfs_extent_inline_ref_size(type
);
1034 btrfs_extend_item(path
, size
);
1036 ei
= btrfs_item_ptr(leaf
, path
->slots
[0], struct btrfs_extent_item
);
1037 refs
= btrfs_extent_refs(leaf
, ei
);
1038 refs
+= refs_to_add
;
1039 btrfs_set_extent_refs(leaf
, ei
, refs
);
1041 __run_delayed_extent_op(extent_op
, leaf
, ei
);
1043 ptr
= (unsigned long)ei
+ item_offset
;
1044 end
= (unsigned long)ei
+ btrfs_item_size_nr(leaf
, path
->slots
[0]);
1045 if (ptr
< end
- size
)
1046 memmove_extent_buffer(leaf
, ptr
+ size
, ptr
,
1049 iref
= (struct btrfs_extent_inline_ref
*)ptr
;
1050 btrfs_set_extent_inline_ref_type(leaf
, iref
, type
);
1051 if (type
== BTRFS_EXTENT_DATA_REF_KEY
) {
1052 struct btrfs_extent_data_ref
*dref
;
1053 dref
= (struct btrfs_extent_data_ref
*)(&iref
->offset
);
1054 btrfs_set_extent_data_ref_root(leaf
, dref
, root_objectid
);
1055 btrfs_set_extent_data_ref_objectid(leaf
, dref
, owner
);
1056 btrfs_set_extent_data_ref_offset(leaf
, dref
, offset
);
1057 btrfs_set_extent_data_ref_count(leaf
, dref
, refs_to_add
);
1058 } else if (type
== BTRFS_SHARED_DATA_REF_KEY
) {
1059 struct btrfs_shared_data_ref
*sref
;
1060 sref
= (struct btrfs_shared_data_ref
*)(iref
+ 1);
1061 btrfs_set_shared_data_ref_count(leaf
, sref
, refs_to_add
);
1062 btrfs_set_extent_inline_ref_offset(leaf
, iref
, parent
);
1063 } else if (type
== BTRFS_SHARED_BLOCK_REF_KEY
) {
1064 btrfs_set_extent_inline_ref_offset(leaf
, iref
, parent
);
1066 btrfs_set_extent_inline_ref_offset(leaf
, iref
, root_objectid
);
1068 btrfs_mark_buffer_dirty(leaf
);
1071 static int lookup_extent_backref(struct btrfs_trans_handle
*trans
,
1072 struct btrfs_path
*path
,
1073 struct btrfs_extent_inline_ref
**ref_ret
,
1074 u64 bytenr
, u64 num_bytes
, u64 parent
,
1075 u64 root_objectid
, u64 owner
, u64 offset
)
1079 ret
= lookup_inline_extent_backref(trans
, path
, ref_ret
, bytenr
,
1080 num_bytes
, parent
, root_objectid
,
1085 btrfs_release_path(path
);
1088 if (owner
< BTRFS_FIRST_FREE_OBJECTID
) {
1089 ret
= lookup_tree_block_ref(trans
, path
, bytenr
, parent
,
1092 ret
= lookup_extent_data_ref(trans
, path
, bytenr
, parent
,
1093 root_objectid
, owner
, offset
);
1099 * helper to update/remove inline back ref
1101 static noinline_for_stack
1102 void update_inline_extent_backref(struct btrfs_path
*path
,
1103 struct btrfs_extent_inline_ref
*iref
,
1105 struct btrfs_delayed_extent_op
*extent_op
,
1108 struct extent_buffer
*leaf
= path
->nodes
[0];
1109 struct btrfs_extent_item
*ei
;
1110 struct btrfs_extent_data_ref
*dref
= NULL
;
1111 struct btrfs_shared_data_ref
*sref
= NULL
;
1119 ei
= btrfs_item_ptr(leaf
, path
->slots
[0], struct btrfs_extent_item
);
1120 refs
= btrfs_extent_refs(leaf
, ei
);
1121 WARN_ON(refs_to_mod
< 0 && refs
+ refs_to_mod
<= 0);
1122 refs
+= refs_to_mod
;
1123 btrfs_set_extent_refs(leaf
, ei
, refs
);
1125 __run_delayed_extent_op(extent_op
, leaf
, ei
);
1128 * If type is invalid, we should have bailed out after
1129 * lookup_inline_extent_backref().
1131 type
= btrfs_get_extent_inline_ref_type(leaf
, iref
, BTRFS_REF_TYPE_ANY
);
1132 ASSERT(type
!= BTRFS_REF_TYPE_INVALID
);
1134 if (type
== BTRFS_EXTENT_DATA_REF_KEY
) {
1135 dref
= (struct btrfs_extent_data_ref
*)(&iref
->offset
);
1136 refs
= btrfs_extent_data_ref_count(leaf
, dref
);
1137 } else if (type
== BTRFS_SHARED_DATA_REF_KEY
) {
1138 sref
= (struct btrfs_shared_data_ref
*)(iref
+ 1);
1139 refs
= btrfs_shared_data_ref_count(leaf
, sref
);
1142 BUG_ON(refs_to_mod
!= -1);
1145 BUG_ON(refs_to_mod
< 0 && refs
< -refs_to_mod
);
1146 refs
+= refs_to_mod
;
1149 if (type
== BTRFS_EXTENT_DATA_REF_KEY
)
1150 btrfs_set_extent_data_ref_count(leaf
, dref
, refs
);
1152 btrfs_set_shared_data_ref_count(leaf
, sref
, refs
);
1155 size
= btrfs_extent_inline_ref_size(type
);
1156 item_size
= btrfs_item_size_nr(leaf
, path
->slots
[0]);
1157 ptr
= (unsigned long)iref
;
1158 end
= (unsigned long)ei
+ item_size
;
1159 if (ptr
+ size
< end
)
1160 memmove_extent_buffer(leaf
, ptr
, ptr
+ size
,
1163 btrfs_truncate_item(path
, item_size
, 1);
1165 btrfs_mark_buffer_dirty(leaf
);
1168 static noinline_for_stack
1169 int insert_inline_extent_backref(struct btrfs_trans_handle
*trans
,
1170 struct btrfs_path
*path
,
1171 u64 bytenr
, u64 num_bytes
, u64 parent
,
1172 u64 root_objectid
, u64 owner
,
1173 u64 offset
, int refs_to_add
,
1174 struct btrfs_delayed_extent_op
*extent_op
)
1176 struct btrfs_extent_inline_ref
*iref
;
1179 ret
= lookup_inline_extent_backref(trans
, path
, &iref
, bytenr
,
1180 num_bytes
, parent
, root_objectid
,
1183 BUG_ON(owner
< BTRFS_FIRST_FREE_OBJECTID
);
1184 update_inline_extent_backref(path
, iref
, refs_to_add
,
1186 } else if (ret
== -ENOENT
) {
1187 setup_inline_extent_backref(trans
->fs_info
, path
, iref
, parent
,
1188 root_objectid
, owner
, offset
,
1189 refs_to_add
, extent_op
);
1195 static int insert_extent_backref(struct btrfs_trans_handle
*trans
,
1196 struct btrfs_path
*path
,
1197 u64 bytenr
, u64 parent
, u64 root_objectid
,
1198 u64 owner
, u64 offset
, int refs_to_add
)
1201 if (owner
< BTRFS_FIRST_FREE_OBJECTID
) {
1202 BUG_ON(refs_to_add
!= 1);
1203 ret
= insert_tree_block_ref(trans
, path
, bytenr
, parent
,
1206 ret
= insert_extent_data_ref(trans
, path
, bytenr
, parent
,
1207 root_objectid
, owner
, offset
,
1213 static int remove_extent_backref(struct btrfs_trans_handle
*trans
,
1214 struct btrfs_path
*path
,
1215 struct btrfs_extent_inline_ref
*iref
,
1216 int refs_to_drop
, int is_data
, int *last_ref
)
1220 BUG_ON(!is_data
&& refs_to_drop
!= 1);
1222 update_inline_extent_backref(path
, iref
, -refs_to_drop
, NULL
,
1224 } else if (is_data
) {
1225 ret
= remove_extent_data_ref(trans
, path
, refs_to_drop
,
1229 ret
= btrfs_del_item(trans
, trans
->fs_info
->extent_root
, path
);
1234 static int btrfs_issue_discard(struct block_device
*bdev
, u64 start
, u64 len
,
1235 u64
*discarded_bytes
)
1238 u64 bytes_left
, end
;
1239 u64 aligned_start
= ALIGN(start
, 1 << 9);
1241 if (WARN_ON(start
!= aligned_start
)) {
1242 len
-= aligned_start
- start
;
1243 len
= round_down(len
, 1 << 9);
1244 start
= aligned_start
;
1247 *discarded_bytes
= 0;
1255 /* Skip any superblocks on this device. */
1256 for (j
= 0; j
< BTRFS_SUPER_MIRROR_MAX
; j
++) {
1257 u64 sb_start
= btrfs_sb_offset(j
);
1258 u64 sb_end
= sb_start
+ BTRFS_SUPER_INFO_SIZE
;
1259 u64 size
= sb_start
- start
;
1261 if (!in_range(sb_start
, start
, bytes_left
) &&
1262 !in_range(sb_end
, start
, bytes_left
) &&
1263 !in_range(start
, sb_start
, BTRFS_SUPER_INFO_SIZE
))
1267 * Superblock spans beginning of range. Adjust start and
1270 if (sb_start
<= start
) {
1271 start
+= sb_end
- start
;
1276 bytes_left
= end
- start
;
1281 ret
= blkdev_issue_discard(bdev
, start
>> 9, size
>> 9,
1284 *discarded_bytes
+= size
;
1285 else if (ret
!= -EOPNOTSUPP
)
1294 bytes_left
= end
- start
;
1298 ret
= blkdev_issue_discard(bdev
, start
>> 9, bytes_left
>> 9,
1301 *discarded_bytes
+= bytes_left
;
1306 int btrfs_discard_extent(struct btrfs_fs_info
*fs_info
, u64 bytenr
,
1307 u64 num_bytes
, u64
*actual_bytes
)
1310 u64 discarded_bytes
= 0;
1311 struct btrfs_bio
*bbio
= NULL
;
1315 * Avoid races with device replace and make sure our bbio has devices
1316 * associated to its stripes that don't go away while we are discarding.
1318 btrfs_bio_counter_inc_blocked(fs_info
);
1319 /* Tell the block device(s) that the sectors can be discarded */
1320 ret
= btrfs_map_block(fs_info
, BTRFS_MAP_DISCARD
, bytenr
, &num_bytes
,
1322 /* Error condition is -ENOMEM */
1324 struct btrfs_bio_stripe
*stripe
= bbio
->stripes
;
1328 for (i
= 0; i
< bbio
->num_stripes
; i
++, stripe
++) {
1330 struct request_queue
*req_q
;
1332 if (!stripe
->dev
->bdev
) {
1333 ASSERT(btrfs_test_opt(fs_info
, DEGRADED
));
1336 req_q
= bdev_get_queue(stripe
->dev
->bdev
);
1337 if (!blk_queue_discard(req_q
))
1340 ret
= btrfs_issue_discard(stripe
->dev
->bdev
,
1345 discarded_bytes
+= bytes
;
1346 else if (ret
!= -EOPNOTSUPP
)
1347 break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
1350 * Just in case we get back EOPNOTSUPP for some reason,
1351 * just ignore the return value so we don't screw up
1352 * people calling discard_extent.
1356 btrfs_put_bbio(bbio
);
1358 btrfs_bio_counter_dec(fs_info
);
1361 *actual_bytes
= discarded_bytes
;
1364 if (ret
== -EOPNOTSUPP
)
1369 /* Can return -ENOMEM */
1370 int btrfs_inc_extent_ref(struct btrfs_trans_handle
*trans
,
1371 struct btrfs_ref
*generic_ref
)
1373 struct btrfs_fs_info
*fs_info
= trans
->fs_info
;
1374 int old_ref_mod
, new_ref_mod
;
1377 ASSERT(generic_ref
->type
!= BTRFS_REF_NOT_SET
&&
1378 generic_ref
->action
);
1379 BUG_ON(generic_ref
->type
== BTRFS_REF_METADATA
&&
1380 generic_ref
->tree_ref
.root
== BTRFS_TREE_LOG_OBJECTID
);
1382 if (generic_ref
->type
== BTRFS_REF_METADATA
)
1383 ret
= btrfs_add_delayed_tree_ref(trans
, generic_ref
,
1384 NULL
, &old_ref_mod
, &new_ref_mod
);
1386 ret
= btrfs_add_delayed_data_ref(trans
, generic_ref
, 0,
1387 &old_ref_mod
, &new_ref_mod
);
1389 btrfs_ref_tree_mod(fs_info
, generic_ref
);
1391 if (ret
== 0 && old_ref_mod
< 0 && new_ref_mod
>= 0)
1392 sub_pinned_bytes(fs_info
, generic_ref
);
1398 * __btrfs_inc_extent_ref - insert backreference for a given extent
1400 * @trans: Handle of transaction
1402 * @node: The delayed ref node used to get the bytenr/length for
1403 * extent whose references are incremented.
1405 * @parent: If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/
1406 * BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical
1407 * bytenr of the parent block. Since new extents are always
1408 * created with indirect references, this will only be the case
1409 * when relocating a shared extent. In that case, root_objectid
1410 * will be BTRFS_TREE_RELOC_OBJECTID. Otheriwse, parent must
1413 * @root_objectid: The id of the root where this modification has originated,
1414 * this can be either one of the well-known metadata trees or
1415 * the subvolume id which references this extent.
1417 * @owner: For data extents it is the inode number of the owning file.
1418 * For metadata extents this parameter holds the level in the
1419 * tree of the extent.
1421 * @offset: For metadata extents the offset is ignored and is currently
1422 * always passed as 0. For data extents it is the fileoffset
1423 * this extent belongs to.
1425 * @refs_to_add Number of references to add
1427 * @extent_op Pointer to a structure, holding information necessary when
1428 * updating a tree block's flags
1431 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle
*trans
,
1432 struct btrfs_delayed_ref_node
*node
,
1433 u64 parent
, u64 root_objectid
,
1434 u64 owner
, u64 offset
, int refs_to_add
,
1435 struct btrfs_delayed_extent_op
*extent_op
)
1437 struct btrfs_path
*path
;
1438 struct extent_buffer
*leaf
;
1439 struct btrfs_extent_item
*item
;
1440 struct btrfs_key key
;
1441 u64 bytenr
= node
->bytenr
;
1442 u64 num_bytes
= node
->num_bytes
;
1446 path
= btrfs_alloc_path();
1450 path
->reada
= READA_FORWARD
;
1451 path
->leave_spinning
= 1;
1452 /* this will setup the path even if it fails to insert the back ref */
1453 ret
= insert_inline_extent_backref(trans
, path
, bytenr
, num_bytes
,
1454 parent
, root_objectid
, owner
,
1455 offset
, refs_to_add
, extent_op
);
1456 if ((ret
< 0 && ret
!= -EAGAIN
) || !ret
)
1460 * Ok we had -EAGAIN which means we didn't have space to insert and
1461 * inline extent ref, so just update the reference count and add a
1464 leaf
= path
->nodes
[0];
1465 btrfs_item_key_to_cpu(leaf
, &key
, path
->slots
[0]);
1466 item
= btrfs_item_ptr(leaf
, path
->slots
[0], struct btrfs_extent_item
);
1467 refs
= btrfs_extent_refs(leaf
, item
);
1468 btrfs_set_extent_refs(leaf
, item
, refs
+ refs_to_add
);
1470 __run_delayed_extent_op(extent_op
, leaf
, item
);
1472 btrfs_mark_buffer_dirty(leaf
);
1473 btrfs_release_path(path
);
1475 path
->reada
= READA_FORWARD
;
1476 path
->leave_spinning
= 1;
1477 /* now insert the actual backref */
1478 ret
= insert_extent_backref(trans
, path
, bytenr
, parent
, root_objectid
,
1479 owner
, offset
, refs_to_add
);
1481 btrfs_abort_transaction(trans
, ret
);
1483 btrfs_free_path(path
);
1487 static int run_delayed_data_ref(struct btrfs_trans_handle
*trans
,
1488 struct btrfs_delayed_ref_node
*node
,
1489 struct btrfs_delayed_extent_op
*extent_op
,
1490 int insert_reserved
)
1493 struct btrfs_delayed_data_ref
*ref
;
1494 struct btrfs_key ins
;
1499 ins
.objectid
= node
->bytenr
;
1500 ins
.offset
= node
->num_bytes
;
1501 ins
.type
= BTRFS_EXTENT_ITEM_KEY
;
1503 ref
= btrfs_delayed_node_to_data_ref(node
);
1504 trace_run_delayed_data_ref(trans
->fs_info
, node
, ref
, node
->action
);
1506 if (node
->type
== BTRFS_SHARED_DATA_REF_KEY
)
1507 parent
= ref
->parent
;
1508 ref_root
= ref
->root
;
1510 if (node
->action
== BTRFS_ADD_DELAYED_REF
&& insert_reserved
) {
1512 flags
|= extent_op
->flags_to_set
;
1513 ret
= alloc_reserved_file_extent(trans
, parent
, ref_root
,
1514 flags
, ref
->objectid
,
1517 } else if (node
->action
== BTRFS_ADD_DELAYED_REF
) {
1518 ret
= __btrfs_inc_extent_ref(trans
, node
, parent
, ref_root
,
1519 ref
->objectid
, ref
->offset
,
1520 node
->ref_mod
, extent_op
);
1521 } else if (node
->action
== BTRFS_DROP_DELAYED_REF
) {
1522 ret
= __btrfs_free_extent(trans
, node
, parent
,
1523 ref_root
, ref
->objectid
,
1524 ref
->offset
, node
->ref_mod
,
1532 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op
*extent_op
,
1533 struct extent_buffer
*leaf
,
1534 struct btrfs_extent_item
*ei
)
1536 u64 flags
= btrfs_extent_flags(leaf
, ei
);
1537 if (extent_op
->update_flags
) {
1538 flags
|= extent_op
->flags_to_set
;
1539 btrfs_set_extent_flags(leaf
, ei
, flags
);
1542 if (extent_op
->update_key
) {
1543 struct btrfs_tree_block_info
*bi
;
1544 BUG_ON(!(flags
& BTRFS_EXTENT_FLAG_TREE_BLOCK
));
1545 bi
= (struct btrfs_tree_block_info
*)(ei
+ 1);
1546 btrfs_set_tree_block_key(leaf
, bi
, &extent_op
->key
);
1550 static int run_delayed_extent_op(struct btrfs_trans_handle
*trans
,
1551 struct btrfs_delayed_ref_head
*head
,
1552 struct btrfs_delayed_extent_op
*extent_op
)
1554 struct btrfs_fs_info
*fs_info
= trans
->fs_info
;
1555 struct btrfs_key key
;
1556 struct btrfs_path
*path
;
1557 struct btrfs_extent_item
*ei
;
1558 struct extent_buffer
*leaf
;
1562 int metadata
= !extent_op
->is_data
;
1567 if (metadata
&& !btrfs_fs_incompat(fs_info
, SKINNY_METADATA
))
1570 path
= btrfs_alloc_path();
1574 key
.objectid
= head
->bytenr
;
1577 key
.type
= BTRFS_METADATA_ITEM_KEY
;
1578 key
.offset
= extent_op
->level
;
1580 key
.type
= BTRFS_EXTENT_ITEM_KEY
;
1581 key
.offset
= head
->num_bytes
;
1585 path
->reada
= READA_FORWARD
;
1586 path
->leave_spinning
= 1;
1587 ret
= btrfs_search_slot(trans
, fs_info
->extent_root
, &key
, path
, 0, 1);
1594 if (path
->slots
[0] > 0) {
1596 btrfs_item_key_to_cpu(path
->nodes
[0], &key
,
1598 if (key
.objectid
== head
->bytenr
&&
1599 key
.type
== BTRFS_EXTENT_ITEM_KEY
&&
1600 key
.offset
== head
->num_bytes
)
1604 btrfs_release_path(path
);
1607 key
.objectid
= head
->bytenr
;
1608 key
.offset
= head
->num_bytes
;
1609 key
.type
= BTRFS_EXTENT_ITEM_KEY
;
1618 leaf
= path
->nodes
[0];
1619 item_size
= btrfs_item_size_nr(leaf
, path
->slots
[0]);
1621 if (unlikely(item_size
< sizeof(*ei
))) {
1623 btrfs_print_v0_err(fs_info
);
1624 btrfs_abort_transaction(trans
, err
);
1628 ei
= btrfs_item_ptr(leaf
, path
->slots
[0], struct btrfs_extent_item
);
1629 __run_delayed_extent_op(extent_op
, leaf
, ei
);
1631 btrfs_mark_buffer_dirty(leaf
);
1633 btrfs_free_path(path
);
1637 static int run_delayed_tree_ref(struct btrfs_trans_handle
*trans
,
1638 struct btrfs_delayed_ref_node
*node
,
1639 struct btrfs_delayed_extent_op
*extent_op
,
1640 int insert_reserved
)
1643 struct btrfs_delayed_tree_ref
*ref
;
1647 ref
= btrfs_delayed_node_to_tree_ref(node
);
1648 trace_run_delayed_tree_ref(trans
->fs_info
, node
, ref
, node
->action
);
1650 if (node
->type
== BTRFS_SHARED_BLOCK_REF_KEY
)
1651 parent
= ref
->parent
;
1652 ref_root
= ref
->root
;
1654 if (node
->ref_mod
!= 1) {
1655 btrfs_err(trans
->fs_info
,
1656 "btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu",
1657 node
->bytenr
, node
->ref_mod
, node
->action
, ref_root
,
1661 if (node
->action
== BTRFS_ADD_DELAYED_REF
&& insert_reserved
) {
1662 BUG_ON(!extent_op
|| !extent_op
->update_flags
);
1663 ret
= alloc_reserved_tree_block(trans
, node
, extent_op
);
1664 } else if (node
->action
== BTRFS_ADD_DELAYED_REF
) {
1665 ret
= __btrfs_inc_extent_ref(trans
, node
, parent
, ref_root
,
1666 ref
->level
, 0, 1, extent_op
);
1667 } else if (node
->action
== BTRFS_DROP_DELAYED_REF
) {
1668 ret
= __btrfs_free_extent(trans
, node
, parent
, ref_root
,
1669 ref
->level
, 0, 1, extent_op
);
1676 /* helper function to actually process a single delayed ref entry */
1677 static int run_one_delayed_ref(struct btrfs_trans_handle
*trans
,
1678 struct btrfs_delayed_ref_node
*node
,
1679 struct btrfs_delayed_extent_op
*extent_op
,
1680 int insert_reserved
)
1684 if (trans
->aborted
) {
1685 if (insert_reserved
)
1686 btrfs_pin_extent(trans
->fs_info
, node
->bytenr
,
1687 node
->num_bytes
, 1);
1691 if (node
->type
== BTRFS_TREE_BLOCK_REF_KEY
||
1692 node
->type
== BTRFS_SHARED_BLOCK_REF_KEY
)
1693 ret
= run_delayed_tree_ref(trans
, node
, extent_op
,
1695 else if (node
->type
== BTRFS_EXTENT_DATA_REF_KEY
||
1696 node
->type
== BTRFS_SHARED_DATA_REF_KEY
)
1697 ret
= run_delayed_data_ref(trans
, node
, extent_op
,
1701 if (ret
&& insert_reserved
)
1702 btrfs_pin_extent(trans
->fs_info
, node
->bytenr
,
1703 node
->num_bytes
, 1);
1707 static inline struct btrfs_delayed_ref_node
*
1708 select_delayed_ref(struct btrfs_delayed_ref_head
*head
)
1710 struct btrfs_delayed_ref_node
*ref
;
1712 if (RB_EMPTY_ROOT(&head
->ref_tree
.rb_root
))
1716 * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
1717 * This is to prevent a ref count from going down to zero, which deletes
1718 * the extent item from the extent tree, when there still are references
1719 * to add, which would fail because they would not find the extent item.
1721 if (!list_empty(&head
->ref_add_list
))
1722 return list_first_entry(&head
->ref_add_list
,
1723 struct btrfs_delayed_ref_node
, add_list
);
1725 ref
= rb_entry(rb_first_cached(&head
->ref_tree
),
1726 struct btrfs_delayed_ref_node
, ref_node
);
1727 ASSERT(list_empty(&ref
->add_list
));
1731 static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root
*delayed_refs
,
1732 struct btrfs_delayed_ref_head
*head
)
1734 spin_lock(&delayed_refs
->lock
);
1735 head
->processing
= 0;
1736 delayed_refs
->num_heads_ready
++;
1737 spin_unlock(&delayed_refs
->lock
);
1738 btrfs_delayed_ref_unlock(head
);
1741 static struct btrfs_delayed_extent_op
*cleanup_extent_op(
1742 struct btrfs_delayed_ref_head
*head
)
1744 struct btrfs_delayed_extent_op
*extent_op
= head
->extent_op
;
1749 if (head
->must_insert_reserved
) {
1750 head
->extent_op
= NULL
;
1751 btrfs_free_delayed_extent_op(extent_op
);
1757 static int run_and_cleanup_extent_op(struct btrfs_trans_handle
*trans
,
1758 struct btrfs_delayed_ref_head
*head
)
1760 struct btrfs_delayed_extent_op
*extent_op
;
1763 extent_op
= cleanup_extent_op(head
);
1766 head
->extent_op
= NULL
;
1767 spin_unlock(&head
->lock
);
1768 ret
= run_delayed_extent_op(trans
, head
, extent_op
);
1769 btrfs_free_delayed_extent_op(extent_op
);
1770 return ret
? ret
: 1;
1773 void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info
*fs_info
,
1774 struct btrfs_delayed_ref_root
*delayed_refs
,
1775 struct btrfs_delayed_ref_head
*head
)
1777 int nr_items
= 1; /* Dropping this ref head update. */
1779 if (head
->total_ref_mod
< 0) {
1780 struct btrfs_space_info
*space_info
;
1784 flags
= BTRFS_BLOCK_GROUP_DATA
;
1785 else if (head
->is_system
)
1786 flags
= BTRFS_BLOCK_GROUP_SYSTEM
;
1788 flags
= BTRFS_BLOCK_GROUP_METADATA
;
1789 space_info
= btrfs_find_space_info(fs_info
, flags
);
1791 percpu_counter_add_batch(&space_info
->total_bytes_pinned
,
1793 BTRFS_TOTAL_BYTES_PINNED_BATCH
);
1796 * We had csum deletions accounted for in our delayed refs rsv,
1797 * we need to drop the csum leaves for this update from our
1800 if (head
->is_data
) {
1801 spin_lock(&delayed_refs
->lock
);
1802 delayed_refs
->pending_csums
-= head
->num_bytes
;
1803 spin_unlock(&delayed_refs
->lock
);
1804 nr_items
+= btrfs_csum_bytes_to_leaves(fs_info
,
1809 btrfs_delayed_refs_rsv_release(fs_info
, nr_items
);
1812 static int cleanup_ref_head(struct btrfs_trans_handle
*trans
,
1813 struct btrfs_delayed_ref_head
*head
)
1816 struct btrfs_fs_info
*fs_info
= trans
->fs_info
;
1817 struct btrfs_delayed_ref_root
*delayed_refs
;
1820 delayed_refs
= &trans
->transaction
->delayed_refs
;
1822 ret
= run_and_cleanup_extent_op(trans
, head
);
1824 unselect_delayed_ref_head(delayed_refs
, head
);
1825 btrfs_debug(fs_info
, "run_delayed_extent_op returned %d", ret
);
1832 * Need to drop our head ref lock and re-acquire the delayed ref lock
1833 * and then re-check to make sure nobody got added.
1835 spin_unlock(&head
->lock
);
1836 spin_lock(&delayed_refs
->lock
);
1837 spin_lock(&head
->lock
);
1838 if (!RB_EMPTY_ROOT(&head
->ref_tree
.rb_root
) || head
->extent_op
) {
1839 spin_unlock(&head
->lock
);
1840 spin_unlock(&delayed_refs
->lock
);
1843 btrfs_delete_ref_head(delayed_refs
, head
);
1844 spin_unlock(&head
->lock
);
1845 spin_unlock(&delayed_refs
->lock
);
1847 if (head
->must_insert_reserved
) {
1848 btrfs_pin_extent(fs_info
, head
->bytenr
,
1849 head
->num_bytes
, 1);
1850 if (head
->is_data
) {
1851 ret
= btrfs_del_csums(trans
, fs_info
, head
->bytenr
,
1856 btrfs_cleanup_ref_head_accounting(fs_info
, delayed_refs
, head
);
1858 trace_run_delayed_ref_head(fs_info
, head
, 0);
1859 btrfs_delayed_ref_unlock(head
);
1860 btrfs_put_delayed_ref_head(head
);
1864 static struct btrfs_delayed_ref_head
*btrfs_obtain_ref_head(
1865 struct btrfs_trans_handle
*trans
)
1867 struct btrfs_delayed_ref_root
*delayed_refs
=
1868 &trans
->transaction
->delayed_refs
;
1869 struct btrfs_delayed_ref_head
*head
= NULL
;
1872 spin_lock(&delayed_refs
->lock
);
1873 head
= btrfs_select_ref_head(delayed_refs
);
1875 spin_unlock(&delayed_refs
->lock
);
1880 * Grab the lock that says we are going to process all the refs for
1883 ret
= btrfs_delayed_ref_lock(delayed_refs
, head
);
1884 spin_unlock(&delayed_refs
->lock
);
1887 * We may have dropped the spin lock to get the head mutex lock, and
1888 * that might have given someone else time to free the head. If that's
1889 * true, it has been removed from our list and we can move on.
1892 head
= ERR_PTR(-EAGAIN
);
1897 static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle
*trans
,
1898 struct btrfs_delayed_ref_head
*locked_ref
,
1899 unsigned long *run_refs
)
1901 struct btrfs_fs_info
*fs_info
= trans
->fs_info
;
1902 struct btrfs_delayed_ref_root
*delayed_refs
;
1903 struct btrfs_delayed_extent_op
*extent_op
;
1904 struct btrfs_delayed_ref_node
*ref
;
1905 int must_insert_reserved
= 0;
1908 delayed_refs
= &trans
->transaction
->delayed_refs
;
1910 lockdep_assert_held(&locked_ref
->mutex
);
1911 lockdep_assert_held(&locked_ref
->lock
);
1913 while ((ref
= select_delayed_ref(locked_ref
))) {
1915 btrfs_check_delayed_seq(fs_info
, ref
->seq
)) {
1916 spin_unlock(&locked_ref
->lock
);
1917 unselect_delayed_ref_head(delayed_refs
, locked_ref
);
1923 rb_erase_cached(&ref
->ref_node
, &locked_ref
->ref_tree
);
1924 RB_CLEAR_NODE(&ref
->ref_node
);
1925 if (!list_empty(&ref
->add_list
))
1926 list_del(&ref
->add_list
);
1928 * When we play the delayed ref, also correct the ref_mod on
1931 switch (ref
->action
) {
1932 case BTRFS_ADD_DELAYED_REF
:
1933 case BTRFS_ADD_DELAYED_EXTENT
:
1934 locked_ref
->ref_mod
-= ref
->ref_mod
;
1936 case BTRFS_DROP_DELAYED_REF
:
1937 locked_ref
->ref_mod
+= ref
->ref_mod
;
1942 atomic_dec(&delayed_refs
->num_entries
);
1945 * Record the must_insert_reserved flag before we drop the
1948 must_insert_reserved
= locked_ref
->must_insert_reserved
;
1949 locked_ref
->must_insert_reserved
= 0;
1951 extent_op
= locked_ref
->extent_op
;
1952 locked_ref
->extent_op
= NULL
;
1953 spin_unlock(&locked_ref
->lock
);
1955 ret
= run_one_delayed_ref(trans
, ref
, extent_op
,
1956 must_insert_reserved
);
1958 btrfs_free_delayed_extent_op(extent_op
);
1960 unselect_delayed_ref_head(delayed_refs
, locked_ref
);
1961 btrfs_put_delayed_ref(ref
);
1962 btrfs_debug(fs_info
, "run_one_delayed_ref returned %d",
1967 btrfs_put_delayed_ref(ref
);
1970 spin_lock(&locked_ref
->lock
);
1971 btrfs_merge_delayed_refs(trans
, delayed_refs
, locked_ref
);
1978 * Returns 0 on success or if called with an already aborted transaction.
1979 * Returns -ENOMEM or -EIO on failure and will abort the transaction.
1981 static noinline
int __btrfs_run_delayed_refs(struct btrfs_trans_handle
*trans
,
1984 struct btrfs_fs_info
*fs_info
= trans
->fs_info
;
1985 struct btrfs_delayed_ref_root
*delayed_refs
;
1986 struct btrfs_delayed_ref_head
*locked_ref
= NULL
;
1987 ktime_t start
= ktime_get();
1989 unsigned long count
= 0;
1990 unsigned long actual_count
= 0;
1992 delayed_refs
= &trans
->transaction
->delayed_refs
;
1995 locked_ref
= btrfs_obtain_ref_head(trans
);
1996 if (IS_ERR_OR_NULL(locked_ref
)) {
1997 if (PTR_ERR(locked_ref
) == -EAGAIN
) {
2006 * We need to try and merge add/drops of the same ref since we
2007 * can run into issues with relocate dropping the implicit ref
2008 * and then it being added back again before the drop can
2009 * finish. If we merged anything we need to re-loop so we can
2011 * Or we can get node references of the same type that weren't
2012 * merged when created due to bumps in the tree mod seq, and
2013 * we need to merge them to prevent adding an inline extent
2014 * backref before dropping it (triggering a BUG_ON at
2015 * insert_inline_extent_backref()).
2017 spin_lock(&locked_ref
->lock
);
2018 btrfs_merge_delayed_refs(trans
, delayed_refs
, locked_ref
);
2020 ret
= btrfs_run_delayed_refs_for_head(trans
, locked_ref
,
2022 if (ret
< 0 && ret
!= -EAGAIN
) {
2024 * Error, btrfs_run_delayed_refs_for_head already
2025 * unlocked everything so just bail out
2030 * Success, perform the usual cleanup of a processed
2033 ret
= cleanup_ref_head(trans
, locked_ref
);
2035 /* We dropped our lock, we need to loop. */
2044 * Either success case or btrfs_run_delayed_refs_for_head
2045 * returned -EAGAIN, meaning we need to select another head
2050 } while ((nr
!= -1 && count
< nr
) || locked_ref
);
2053 * We don't want to include ref heads since we can have empty ref heads
2054 * and those will drastically skew our runtime down since we just do
2055 * accounting, no actual extent tree updates.
2057 if (actual_count
> 0) {
2058 u64 runtime
= ktime_to_ns(ktime_sub(ktime_get(), start
));
2062 * We weigh the current average higher than our current runtime
2063 * to avoid large swings in the average.
2065 spin_lock(&delayed_refs
->lock
);
2066 avg
= fs_info
->avg_delayed_ref_runtime
* 3 + runtime
;
2067 fs_info
->avg_delayed_ref_runtime
= avg
>> 2; /* div by 4 */
2068 spin_unlock(&delayed_refs
->lock
);
#ifdef SCRAMBLE_DELAYED_REFS
/*
 * Normally delayed refs get processed in ascending bytenr order. This
 * correlates in most cases to the order added. To expose dependencies on this
 * order, we start to process the tree in the middle instead of the beginning
 */
static u64 find_middle(struct rb_root *root)
{
	struct rb_node *n = root->rb_node;
	struct btrfs_delayed_ref_node *entry;
	int alt = 1;
	u64 middle = 0;
	u64 first = 0, last = 0;

	n = rb_first(root);
	if (n) {
		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
		first = entry->bytenr;
	}
	n = rb_last(root);
	if (n) {
		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
		last = entry->bytenr;
	}
	n = root->rb_node;

	/* Walk down the tree, alternating left/right, to land near the middle. */
	while (n) {
		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
		WARN_ON(!entry->in_tree);

		middle = entry->bytenr;

		if (alt)
			n = n->rb_left;
		else
			n = n->rb_right;

		alt = 1 - alt;
	}
	return middle;
}
#endif
2116 static inline u64
heads_to_leaves(struct btrfs_fs_info
*fs_info
, u64 heads
)
2120 num_bytes
= heads
* (sizeof(struct btrfs_extent_item
) +
2121 sizeof(struct btrfs_extent_inline_ref
));
2122 if (!btrfs_fs_incompat(fs_info
, SKINNY_METADATA
))
2123 num_bytes
+= heads
* sizeof(struct btrfs_tree_block_info
);
2126 * We don't ever fill up leaves all the way so multiply by 2 just to be
2127 * closer to what we're really going to want to use.
2129 return div_u64(num_bytes
, BTRFS_LEAF_DATA_SIZE(fs_info
));
2133 * Takes the number of bytes to be csumm'ed and figures out how many leaves it
2134 * would require to store the csums for that many bytes.
2136 u64
btrfs_csum_bytes_to_leaves(struct btrfs_fs_info
*fs_info
, u64 csum_bytes
)
2139 u64 num_csums_per_leaf
;
2142 csum_size
= BTRFS_MAX_ITEM_SIZE(fs_info
);
2143 num_csums_per_leaf
= div64_u64(csum_size
,
2144 (u64
)btrfs_super_csum_size(fs_info
->super_copy
));
2145 num_csums
= div64_u64(csum_bytes
, fs_info
->sectorsize
);
2146 num_csums
+= num_csums_per_leaf
- 1;
2147 num_csums
= div64_u64(num_csums
, num_csums_per_leaf
);
2152 * this starts processing the delayed reference count updates and
2153 * extent insertions we have queued up so far. count can be
2154 * 0, which means to process everything in the tree at the start
2155 * of the run (but not newly added entries), or it can be some target
2156 * number you'd like to process.
2158 * Returns 0 on success or if called with an aborted transaction
2159 * Returns <0 on error and aborts the transaction
2161 int btrfs_run_delayed_refs(struct btrfs_trans_handle
*trans
,
2162 unsigned long count
)
2164 struct btrfs_fs_info
*fs_info
= trans
->fs_info
;
2165 struct rb_node
*node
;
2166 struct btrfs_delayed_ref_root
*delayed_refs
;
2167 struct btrfs_delayed_ref_head
*head
;
2169 int run_all
= count
== (unsigned long)-1;
2171 /* We'll clean this up in btrfs_cleanup_transaction */
2175 if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE
, &fs_info
->flags
))
2178 delayed_refs
= &trans
->transaction
->delayed_refs
;
2180 count
= atomic_read(&delayed_refs
->num_entries
) * 2;
2183 #ifdef SCRAMBLE_DELAYED_REFS
2184 delayed_refs
->run_delayed_start
= find_middle(&delayed_refs
->root
);
2186 ret
= __btrfs_run_delayed_refs(trans
, count
);
2188 btrfs_abort_transaction(trans
, ret
);
2193 btrfs_create_pending_block_groups(trans
);
2195 spin_lock(&delayed_refs
->lock
);
2196 node
= rb_first_cached(&delayed_refs
->href_root
);
2198 spin_unlock(&delayed_refs
->lock
);
2201 head
= rb_entry(node
, struct btrfs_delayed_ref_head
,
2203 refcount_inc(&head
->refs
);
2204 spin_unlock(&delayed_refs
->lock
);
2206 /* Mutex was contended, block until it's released and retry. */
2207 mutex_lock(&head
->mutex
);
2208 mutex_unlock(&head
->mutex
);
2210 btrfs_put_delayed_ref_head(head
);
2218 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle
*trans
,
2219 u64 bytenr
, u64 num_bytes
, u64 flags
,
2220 int level
, int is_data
)
2222 struct btrfs_delayed_extent_op
*extent_op
;
2225 extent_op
= btrfs_alloc_delayed_extent_op();
2229 extent_op
->flags_to_set
= flags
;
2230 extent_op
->update_flags
= true;
2231 extent_op
->update_key
= false;
2232 extent_op
->is_data
= is_data
? true : false;
2233 extent_op
->level
= level
;
2235 ret
= btrfs_add_delayed_extent_op(trans
, bytenr
, num_bytes
, extent_op
);
2237 btrfs_free_delayed_extent_op(extent_op
);
2241 static noinline
int check_delayed_ref(struct btrfs_root
*root
,
2242 struct btrfs_path
*path
,
2243 u64 objectid
, u64 offset
, u64 bytenr
)
2245 struct btrfs_delayed_ref_head
*head
;
2246 struct btrfs_delayed_ref_node
*ref
;
2247 struct btrfs_delayed_data_ref
*data_ref
;
2248 struct btrfs_delayed_ref_root
*delayed_refs
;
2249 struct btrfs_transaction
*cur_trans
;
2250 struct rb_node
*node
;
2253 spin_lock(&root
->fs_info
->trans_lock
);
2254 cur_trans
= root
->fs_info
->running_transaction
;
2256 refcount_inc(&cur_trans
->use_count
);
2257 spin_unlock(&root
->fs_info
->trans_lock
);
2261 delayed_refs
= &cur_trans
->delayed_refs
;
2262 spin_lock(&delayed_refs
->lock
);
2263 head
= btrfs_find_delayed_ref_head(delayed_refs
, bytenr
);
2265 spin_unlock(&delayed_refs
->lock
);
2266 btrfs_put_transaction(cur_trans
);
2270 if (!mutex_trylock(&head
->mutex
)) {
2271 refcount_inc(&head
->refs
);
2272 spin_unlock(&delayed_refs
->lock
);
2274 btrfs_release_path(path
);
2277 * Mutex was contended, block until it's released and let
2280 mutex_lock(&head
->mutex
);
2281 mutex_unlock(&head
->mutex
);
2282 btrfs_put_delayed_ref_head(head
);
2283 btrfs_put_transaction(cur_trans
);
2286 spin_unlock(&delayed_refs
->lock
);
2288 spin_lock(&head
->lock
);
2290 * XXX: We should replace this with a proper search function in the
2293 for (node
= rb_first_cached(&head
->ref_tree
); node
;
2294 node
= rb_next(node
)) {
2295 ref
= rb_entry(node
, struct btrfs_delayed_ref_node
, ref_node
);
2296 /* If it's a shared ref we know a cross reference exists */
2297 if (ref
->type
!= BTRFS_EXTENT_DATA_REF_KEY
) {
2302 data_ref
= btrfs_delayed_node_to_data_ref(ref
);
2305 * If our ref doesn't match the one we're currently looking at
2306 * then we have a cross reference.
2308 if (data_ref
->root
!= root
->root_key
.objectid
||
2309 data_ref
->objectid
!= objectid
||
2310 data_ref
->offset
!= offset
) {
2315 spin_unlock(&head
->lock
);
2316 mutex_unlock(&head
->mutex
);
2317 btrfs_put_transaction(cur_trans
);
2321 static noinline
int check_committed_ref(struct btrfs_root
*root
,
2322 struct btrfs_path
*path
,
2323 u64 objectid
, u64 offset
, u64 bytenr
)
2325 struct btrfs_fs_info
*fs_info
= root
->fs_info
;
2326 struct btrfs_root
*extent_root
= fs_info
->extent_root
;
2327 struct extent_buffer
*leaf
;
2328 struct btrfs_extent_data_ref
*ref
;
2329 struct btrfs_extent_inline_ref
*iref
;
2330 struct btrfs_extent_item
*ei
;
2331 struct btrfs_key key
;
2336 key
.objectid
= bytenr
;
2337 key
.offset
= (u64
)-1;
2338 key
.type
= BTRFS_EXTENT_ITEM_KEY
;
2340 ret
= btrfs_search_slot(NULL
, extent_root
, &key
, path
, 0, 0);
2343 BUG_ON(ret
== 0); /* Corruption */
2346 if (path
->slots
[0] == 0)
2350 leaf
= path
->nodes
[0];
2351 btrfs_item_key_to_cpu(leaf
, &key
, path
->slots
[0]);
2353 if (key
.objectid
!= bytenr
|| key
.type
!= BTRFS_EXTENT_ITEM_KEY
)
2357 item_size
= btrfs_item_size_nr(leaf
, path
->slots
[0]);
2358 ei
= btrfs_item_ptr(leaf
, path
->slots
[0], struct btrfs_extent_item
);
2360 /* If extent item has more than 1 inline ref then it's shared */
2361 if (item_size
!= sizeof(*ei
) +
2362 btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY
))
2365 /* If extent created before last snapshot => it's definitely shared */
2366 if (btrfs_extent_generation(leaf
, ei
) <=
2367 btrfs_root_last_snapshot(&root
->root_item
))
2370 iref
= (struct btrfs_extent_inline_ref
*)(ei
+ 1);
2372 /* If this extent has SHARED_DATA_REF then it's shared */
2373 type
= btrfs_get_extent_inline_ref_type(leaf
, iref
, BTRFS_REF_TYPE_DATA
);
2374 if (type
!= BTRFS_EXTENT_DATA_REF_KEY
)
2377 ref
= (struct btrfs_extent_data_ref
*)(&iref
->offset
);
2378 if (btrfs_extent_refs(leaf
, ei
) !=
2379 btrfs_extent_data_ref_count(leaf
, ref
) ||
2380 btrfs_extent_data_ref_root(leaf
, ref
) !=
2381 root
->root_key
.objectid
||
2382 btrfs_extent_data_ref_objectid(leaf
, ref
) != objectid
||
2383 btrfs_extent_data_ref_offset(leaf
, ref
) != offset
)
2391 int btrfs_cross_ref_exist(struct btrfs_root
*root
, u64 objectid
, u64 offset
,
2394 struct btrfs_path
*path
;
2397 path
= btrfs_alloc_path();
2402 ret
= check_committed_ref(root
, path
, objectid
,
2404 if (ret
&& ret
!= -ENOENT
)
2407 ret
= check_delayed_ref(root
, path
, objectid
, offset
, bytenr
);
2408 } while (ret
== -EAGAIN
);
2411 btrfs_free_path(path
);
2412 if (root
->root_key
.objectid
== BTRFS_DATA_RELOC_TREE_OBJECTID
)
2417 static int __btrfs_mod_ref(struct btrfs_trans_handle
*trans
,
2418 struct btrfs_root
*root
,
2419 struct extent_buffer
*buf
,
2420 int full_backref
, int inc
)
2422 struct btrfs_fs_info
*fs_info
= root
->fs_info
;
2428 struct btrfs_key key
;
2429 struct btrfs_file_extent_item
*fi
;
2430 struct btrfs_ref generic_ref
= { 0 };
2431 bool for_reloc
= btrfs_header_flag(buf
, BTRFS_HEADER_FLAG_RELOC
);
2437 if (btrfs_is_testing(fs_info
))
2440 ref_root
= btrfs_header_owner(buf
);
2441 nritems
= btrfs_header_nritems(buf
);
2442 level
= btrfs_header_level(buf
);
2444 if (!test_bit(BTRFS_ROOT_REF_COWS
, &root
->state
) && level
== 0)
2448 parent
= buf
->start
;
2452 action
= BTRFS_ADD_DELAYED_REF
;
2454 action
= BTRFS_DROP_DELAYED_REF
;
2456 for (i
= 0; i
< nritems
; i
++) {
2458 btrfs_item_key_to_cpu(buf
, &key
, i
);
2459 if (key
.type
!= BTRFS_EXTENT_DATA_KEY
)
2461 fi
= btrfs_item_ptr(buf
, i
,
2462 struct btrfs_file_extent_item
);
2463 if (btrfs_file_extent_type(buf
, fi
) ==
2464 BTRFS_FILE_EXTENT_INLINE
)
2466 bytenr
= btrfs_file_extent_disk_bytenr(buf
, fi
);
2470 num_bytes
= btrfs_file_extent_disk_num_bytes(buf
, fi
);
2471 key
.offset
-= btrfs_file_extent_offset(buf
, fi
);
2472 btrfs_init_generic_ref(&generic_ref
, action
, bytenr
,
2474 generic_ref
.real_root
= root
->root_key
.objectid
;
2475 btrfs_init_data_ref(&generic_ref
, ref_root
, key
.objectid
,
2477 generic_ref
.skip_qgroup
= for_reloc
;
2479 ret
= btrfs_inc_extent_ref(trans
, &generic_ref
);
2481 ret
= btrfs_free_extent(trans
, &generic_ref
);
2485 bytenr
= btrfs_node_blockptr(buf
, i
);
2486 num_bytes
= fs_info
->nodesize
;
2487 btrfs_init_generic_ref(&generic_ref
, action
, bytenr
,
2489 generic_ref
.real_root
= root
->root_key
.objectid
;
2490 btrfs_init_tree_ref(&generic_ref
, level
- 1, ref_root
);
2491 generic_ref
.skip_qgroup
= for_reloc
;
2493 ret
= btrfs_inc_extent_ref(trans
, &generic_ref
);
2495 ret
= btrfs_free_extent(trans
, &generic_ref
);
2505 int btrfs_inc_ref(struct btrfs_trans_handle
*trans
, struct btrfs_root
*root
,
2506 struct extent_buffer
*buf
, int full_backref
)
2508 return __btrfs_mod_ref(trans
, root
, buf
, full_backref
, 1);
2511 int btrfs_dec_ref(struct btrfs_trans_handle
*trans
, struct btrfs_root
*root
,
2512 struct extent_buffer
*buf
, int full_backref
)
2514 return __btrfs_mod_ref(trans
, root
, buf
, full_backref
, 0);
2517 int btrfs_extent_readonly(struct btrfs_fs_info
*fs_info
, u64 bytenr
)
2519 struct btrfs_block_group_cache
*block_group
;
2522 block_group
= btrfs_lookup_block_group(fs_info
, bytenr
);
2523 if (!block_group
|| block_group
->ro
)
2526 btrfs_put_block_group(block_group
);
2530 static u64
get_alloc_profile_by_root(struct btrfs_root
*root
, int data
)
2532 struct btrfs_fs_info
*fs_info
= root
->fs_info
;
2537 flags
= BTRFS_BLOCK_GROUP_DATA
;
2538 else if (root
== fs_info
->chunk_root
)
2539 flags
= BTRFS_BLOCK_GROUP_SYSTEM
;
2541 flags
= BTRFS_BLOCK_GROUP_METADATA
;
2543 ret
= btrfs_get_alloc_profile(fs_info
, flags
);
2547 static u64
first_logical_byte(struct btrfs_fs_info
*fs_info
, u64 search_start
)
2549 struct btrfs_block_group_cache
*cache
;
2552 spin_lock(&fs_info
->block_group_cache_lock
);
2553 bytenr
= fs_info
->first_logical_byte
;
2554 spin_unlock(&fs_info
->block_group_cache_lock
);
2556 if (bytenr
< (u64
)-1)
2559 cache
= btrfs_lookup_first_block_group(fs_info
, search_start
);
2563 bytenr
= cache
->key
.objectid
;
2564 btrfs_put_block_group(cache
);
2569 static int pin_down_extent(struct btrfs_block_group_cache
*cache
,
2570 u64 bytenr
, u64 num_bytes
, int reserved
)
2572 struct btrfs_fs_info
*fs_info
= cache
->fs_info
;
2574 spin_lock(&cache
->space_info
->lock
);
2575 spin_lock(&cache
->lock
);
2576 cache
->pinned
+= num_bytes
;
2577 btrfs_space_info_update_bytes_pinned(fs_info
, cache
->space_info
,
2580 cache
->reserved
-= num_bytes
;
2581 cache
->space_info
->bytes_reserved
-= num_bytes
;
2583 spin_unlock(&cache
->lock
);
2584 spin_unlock(&cache
->space_info
->lock
);
2586 percpu_counter_add_batch(&cache
->space_info
->total_bytes_pinned
,
2587 num_bytes
, BTRFS_TOTAL_BYTES_PINNED_BATCH
);
2588 set_extent_dirty(fs_info
->pinned_extents
, bytenr
,
2589 bytenr
+ num_bytes
- 1, GFP_NOFS
| __GFP_NOFAIL
);
2594 * this function must be called within transaction
2596 int btrfs_pin_extent(struct btrfs_fs_info
*fs_info
,
2597 u64 bytenr
, u64 num_bytes
, int reserved
)
2599 struct btrfs_block_group_cache
*cache
;
2601 cache
= btrfs_lookup_block_group(fs_info
, bytenr
);
2602 BUG_ON(!cache
); /* Logic error */
2604 pin_down_extent(cache
, bytenr
, num_bytes
, reserved
);
2606 btrfs_put_block_group(cache
);
2611 * this function must be called within transaction
2613 int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info
*fs_info
,
2614 u64 bytenr
, u64 num_bytes
)
2616 struct btrfs_block_group_cache
*cache
;
2619 cache
= btrfs_lookup_block_group(fs_info
, bytenr
);
2624 * pull in the free space cache (if any) so that our pin
2625 * removes the free space from the cache. We have load_only set
2626 * to one because the slow code to read in the free extents does check
2627 * the pinned extents.
2629 btrfs_cache_block_group(cache
, 1);
2631 pin_down_extent(cache
, bytenr
, num_bytes
, 0);
2633 /* remove us from the free space cache (if we're there at all) */
2634 ret
= btrfs_remove_free_space(cache
, bytenr
, num_bytes
);
2635 btrfs_put_block_group(cache
);
2639 static int __exclude_logged_extent(struct btrfs_fs_info
*fs_info
,
2640 u64 start
, u64 num_bytes
)
2643 struct btrfs_block_group_cache
*block_group
;
2644 struct btrfs_caching_control
*caching_ctl
;
2646 block_group
= btrfs_lookup_block_group(fs_info
, start
);
2650 btrfs_cache_block_group(block_group
, 0);
2651 caching_ctl
= btrfs_get_caching_control(block_group
);
2655 BUG_ON(!btrfs_block_group_cache_done(block_group
));
2656 ret
= btrfs_remove_free_space(block_group
, start
, num_bytes
);
2658 mutex_lock(&caching_ctl
->mutex
);
2660 if (start
>= caching_ctl
->progress
) {
2661 ret
= btrfs_add_excluded_extent(fs_info
, start
,
2663 } else if (start
+ num_bytes
<= caching_ctl
->progress
) {
2664 ret
= btrfs_remove_free_space(block_group
,
2667 num_bytes
= caching_ctl
->progress
- start
;
2668 ret
= btrfs_remove_free_space(block_group
,
2673 num_bytes
= (start
+ num_bytes
) -
2674 caching_ctl
->progress
;
2675 start
= caching_ctl
->progress
;
2676 ret
= btrfs_add_excluded_extent(fs_info
, start
,
2680 mutex_unlock(&caching_ctl
->mutex
);
2681 btrfs_put_caching_control(caching_ctl
);
2683 btrfs_put_block_group(block_group
);
2687 int btrfs_exclude_logged_extents(struct extent_buffer
*eb
)
2689 struct btrfs_fs_info
*fs_info
= eb
->fs_info
;
2690 struct btrfs_file_extent_item
*item
;
2691 struct btrfs_key key
;
2696 if (!btrfs_fs_incompat(fs_info
, MIXED_GROUPS
))
2699 for (i
= 0; i
< btrfs_header_nritems(eb
); i
++) {
2700 btrfs_item_key_to_cpu(eb
, &key
, i
);
2701 if (key
.type
!= BTRFS_EXTENT_DATA_KEY
)
2703 item
= btrfs_item_ptr(eb
, i
, struct btrfs_file_extent_item
);
2704 found_type
= btrfs_file_extent_type(eb
, item
);
2705 if (found_type
== BTRFS_FILE_EXTENT_INLINE
)
2707 if (btrfs_file_extent_disk_bytenr(eb
, item
) == 0)
2709 key
.objectid
= btrfs_file_extent_disk_bytenr(eb
, item
);
2710 key
.offset
= btrfs_file_extent_disk_num_bytes(eb
, item
);
2711 ret
= __exclude_logged_extent(fs_info
, key
.objectid
, key
.offset
);
2720 btrfs_inc_block_group_reservations(struct btrfs_block_group_cache
*bg
)
2722 atomic_inc(&bg
->reservations
);
2725 void btrfs_prepare_extent_commit(struct btrfs_fs_info
*fs_info
)
2727 struct btrfs_caching_control
*next
;
2728 struct btrfs_caching_control
*caching_ctl
;
2729 struct btrfs_block_group_cache
*cache
;
2731 down_write(&fs_info
->commit_root_sem
);
2733 list_for_each_entry_safe(caching_ctl
, next
,
2734 &fs_info
->caching_block_groups
, list
) {
2735 cache
= caching_ctl
->block_group
;
2736 if (btrfs_block_group_cache_done(cache
)) {
2737 cache
->last_byte_to_unpin
= (u64
)-1;
2738 list_del_init(&caching_ctl
->list
);
2739 btrfs_put_caching_control(caching_ctl
);
2741 cache
->last_byte_to_unpin
= caching_ctl
->progress
;
2745 if (fs_info
->pinned_extents
== &fs_info
->freed_extents
[0])
2746 fs_info
->pinned_extents
= &fs_info
->freed_extents
[1];
2748 fs_info
->pinned_extents
= &fs_info
->freed_extents
[0];
2750 up_write(&fs_info
->commit_root_sem
);
2752 btrfs_update_global_block_rsv(fs_info
);
2756 * Returns the free cluster for the given space info and sets empty_cluster to
2757 * what it should be based on the mount options.
2759 static struct btrfs_free_cluster
*
2760 fetch_cluster_info(struct btrfs_fs_info
*fs_info
,
2761 struct btrfs_space_info
*space_info
, u64
*empty_cluster
)
2763 struct btrfs_free_cluster
*ret
= NULL
;
2766 if (btrfs_mixed_space_info(space_info
))
2769 if (space_info
->flags
& BTRFS_BLOCK_GROUP_METADATA
) {
2770 ret
= &fs_info
->meta_alloc_cluster
;
2771 if (btrfs_test_opt(fs_info
, SSD
))
2772 *empty_cluster
= SZ_2M
;
2774 *empty_cluster
= SZ_64K
;
2775 } else if ((space_info
->flags
& BTRFS_BLOCK_GROUP_DATA
) &&
2776 btrfs_test_opt(fs_info
, SSD_SPREAD
)) {
2777 *empty_cluster
= SZ_2M
;
2778 ret
= &fs_info
->data_alloc_cluster
;
2784 static int unpin_extent_range(struct btrfs_fs_info
*fs_info
,
2786 const bool return_free_space
)
2788 struct btrfs_block_group_cache
*cache
= NULL
;
2789 struct btrfs_space_info
*space_info
;
2790 struct btrfs_block_rsv
*global_rsv
= &fs_info
->global_block_rsv
;
2791 struct btrfs_free_cluster
*cluster
= NULL
;
2793 u64 total_unpinned
= 0;
2794 u64 empty_cluster
= 0;
2797 while (start
<= end
) {
2800 start
>= cache
->key
.objectid
+ cache
->key
.offset
) {
2802 btrfs_put_block_group(cache
);
2804 cache
= btrfs_lookup_block_group(fs_info
, start
);
2805 BUG_ON(!cache
); /* Logic error */
2807 cluster
= fetch_cluster_info(fs_info
,
2810 empty_cluster
<<= 1;
2813 len
= cache
->key
.objectid
+ cache
->key
.offset
- start
;
2814 len
= min(len
, end
+ 1 - start
);
2816 if (start
< cache
->last_byte_to_unpin
) {
2817 len
= min(len
, cache
->last_byte_to_unpin
- start
);
2818 if (return_free_space
)
2819 btrfs_add_free_space(cache
, start
, len
);
2823 total_unpinned
+= len
;
2824 space_info
= cache
->space_info
;
2827 * If this space cluster has been marked as fragmented and we've
2828 * unpinned enough in this block group to potentially allow a
2829 * cluster to be created inside of it go ahead and clear the
2832 if (cluster
&& cluster
->fragmented
&&
2833 total_unpinned
> empty_cluster
) {
2834 spin_lock(&cluster
->lock
);
2835 cluster
->fragmented
= 0;
2836 spin_unlock(&cluster
->lock
);
2839 spin_lock(&space_info
->lock
);
2840 spin_lock(&cache
->lock
);
2841 cache
->pinned
-= len
;
2842 btrfs_space_info_update_bytes_pinned(fs_info
, space_info
, -len
);
2843 space_info
->max_extent_size
= 0;
2844 percpu_counter_add_batch(&space_info
->total_bytes_pinned
,
2845 -len
, BTRFS_TOTAL_BYTES_PINNED_BATCH
);
2847 space_info
->bytes_readonly
+= len
;
2850 spin_unlock(&cache
->lock
);
2851 if (!readonly
&& return_free_space
&&
2852 global_rsv
->space_info
== space_info
) {
2855 spin_lock(&global_rsv
->lock
);
2856 if (!global_rsv
->full
) {
2857 to_add
= min(len
, global_rsv
->size
-
2858 global_rsv
->reserved
);
2859 global_rsv
->reserved
+= to_add
;
2860 btrfs_space_info_update_bytes_may_use(fs_info
,
2861 space_info
, to_add
);
2862 if (global_rsv
->reserved
>= global_rsv
->size
)
2863 global_rsv
->full
= 1;
2866 spin_unlock(&global_rsv
->lock
);
2867 /* Add to any tickets we may have */
2869 btrfs_try_granting_tickets(fs_info
,
2872 spin_unlock(&space_info
->lock
);
2876 btrfs_put_block_group(cache
);
2880 int btrfs_finish_extent_commit(struct btrfs_trans_handle
*trans
)
2882 struct btrfs_fs_info
*fs_info
= trans
->fs_info
;
2883 struct btrfs_block_group_cache
*block_group
, *tmp
;
2884 struct list_head
*deleted_bgs
;
2885 struct extent_io_tree
*unpin
;
2890 if (fs_info
->pinned_extents
== &fs_info
->freed_extents
[0])
2891 unpin
= &fs_info
->freed_extents
[1];
2893 unpin
= &fs_info
->freed_extents
[0];
2895 while (!trans
->aborted
) {
2896 struct extent_state
*cached_state
= NULL
;
2898 mutex_lock(&fs_info
->unused_bg_unpin_mutex
);
2899 ret
= find_first_extent_bit(unpin
, 0, &start
, &end
,
2900 EXTENT_DIRTY
, &cached_state
);
2902 mutex_unlock(&fs_info
->unused_bg_unpin_mutex
);
2906 if (btrfs_test_opt(fs_info
, DISCARD
))
2907 ret
= btrfs_discard_extent(fs_info
, start
,
2908 end
+ 1 - start
, NULL
);
2910 clear_extent_dirty(unpin
, start
, end
, &cached_state
);
2911 unpin_extent_range(fs_info
, start
, end
, true);
2912 mutex_unlock(&fs_info
->unused_bg_unpin_mutex
);
2913 free_extent_state(cached_state
);
2918 * Transaction is finished. We don't need the lock anymore. We
2919 * do need to clean up the block groups in case of a transaction
2922 deleted_bgs
= &trans
->transaction
->deleted_bgs
;
2923 list_for_each_entry_safe(block_group
, tmp
, deleted_bgs
, bg_list
) {
2927 if (!trans
->aborted
)
2928 ret
= btrfs_discard_extent(fs_info
,
2929 block_group
->key
.objectid
,
2930 block_group
->key
.offset
,
2933 list_del_init(&block_group
->bg_list
);
2934 btrfs_put_block_group_trimming(block_group
);
2935 btrfs_put_block_group(block_group
);
2938 const char *errstr
= btrfs_decode_error(ret
);
2940 "discard failed while removing blockgroup: errno=%d %s",
2948 static int __btrfs_free_extent(struct btrfs_trans_handle
*trans
,
2949 struct btrfs_delayed_ref_node
*node
, u64 parent
,
2950 u64 root_objectid
, u64 owner_objectid
,
2951 u64 owner_offset
, int refs_to_drop
,
2952 struct btrfs_delayed_extent_op
*extent_op
)
2954 struct btrfs_fs_info
*info
= trans
->fs_info
;
2955 struct btrfs_key key
;
2956 struct btrfs_path
*path
;
2957 struct btrfs_root
*extent_root
= info
->extent_root
;
2958 struct extent_buffer
*leaf
;
2959 struct btrfs_extent_item
*ei
;
2960 struct btrfs_extent_inline_ref
*iref
;
2963 int extent_slot
= 0;
2964 int found_extent
= 0;
2968 u64 bytenr
= node
->bytenr
;
2969 u64 num_bytes
= node
->num_bytes
;
2971 bool skinny_metadata
= btrfs_fs_incompat(info
, SKINNY_METADATA
);
2973 path
= btrfs_alloc_path();
2977 path
->reada
= READA_FORWARD
;
2978 path
->leave_spinning
= 1;
2980 is_data
= owner_objectid
>= BTRFS_FIRST_FREE_OBJECTID
;
2981 BUG_ON(!is_data
&& refs_to_drop
!= 1);
2984 skinny_metadata
= false;
2986 ret
= lookup_extent_backref(trans
, path
, &iref
, bytenr
, num_bytes
,
2987 parent
, root_objectid
, owner_objectid
,
2990 extent_slot
= path
->slots
[0];
2991 while (extent_slot
>= 0) {
2992 btrfs_item_key_to_cpu(path
->nodes
[0], &key
,
2994 if (key
.objectid
!= bytenr
)
2996 if (key
.type
== BTRFS_EXTENT_ITEM_KEY
&&
2997 key
.offset
== num_bytes
) {
3001 if (key
.type
== BTRFS_METADATA_ITEM_KEY
&&
3002 key
.offset
== owner_objectid
) {
3006 if (path
->slots
[0] - extent_slot
> 5)
3011 if (!found_extent
) {
3013 ret
= remove_extent_backref(trans
, path
, NULL
,
3015 is_data
, &last_ref
);
3017 btrfs_abort_transaction(trans
, ret
);
3020 btrfs_release_path(path
);
3021 path
->leave_spinning
= 1;
3023 key
.objectid
= bytenr
;
3024 key
.type
= BTRFS_EXTENT_ITEM_KEY
;
3025 key
.offset
= num_bytes
;
3027 if (!is_data
&& skinny_metadata
) {
3028 key
.type
= BTRFS_METADATA_ITEM_KEY
;
3029 key
.offset
= owner_objectid
;
3032 ret
= btrfs_search_slot(trans
, extent_root
,
3034 if (ret
> 0 && skinny_metadata
&& path
->slots
[0]) {
3036 * Couldn't find our skinny metadata item,
3037 * see if we have ye olde extent item.
3040 btrfs_item_key_to_cpu(path
->nodes
[0], &key
,
3042 if (key
.objectid
== bytenr
&&
3043 key
.type
== BTRFS_EXTENT_ITEM_KEY
&&
3044 key
.offset
== num_bytes
)
3048 if (ret
> 0 && skinny_metadata
) {
3049 skinny_metadata
= false;
3050 key
.objectid
= bytenr
;
3051 key
.type
= BTRFS_EXTENT_ITEM_KEY
;
3052 key
.offset
= num_bytes
;
3053 btrfs_release_path(path
);
3054 ret
= btrfs_search_slot(trans
, extent_root
,
3060 "umm, got %d back from search, was looking for %llu",
3063 btrfs_print_leaf(path
->nodes
[0]);
3066 btrfs_abort_transaction(trans
, ret
);
3069 extent_slot
= path
->slots
[0];
3071 } else if (WARN_ON(ret
== -ENOENT
)) {
3072 btrfs_print_leaf(path
->nodes
[0]);
3074 "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu",
3075 bytenr
, parent
, root_objectid
, owner_objectid
,
3077 btrfs_abort_transaction(trans
, ret
);
3080 btrfs_abort_transaction(trans
, ret
);
3084 leaf
= path
->nodes
[0];
3085 item_size
= btrfs_item_size_nr(leaf
, extent_slot
);
3086 if (unlikely(item_size
< sizeof(*ei
))) {
3088 btrfs_print_v0_err(info
);
3089 btrfs_abort_transaction(trans
, ret
);
3092 ei
= btrfs_item_ptr(leaf
, extent_slot
,
3093 struct btrfs_extent_item
);
3094 if (owner_objectid
< BTRFS_FIRST_FREE_OBJECTID
&&
3095 key
.type
== BTRFS_EXTENT_ITEM_KEY
) {
3096 struct btrfs_tree_block_info
*bi
;
3097 BUG_ON(item_size
< sizeof(*ei
) + sizeof(*bi
));
3098 bi
= (struct btrfs_tree_block_info
*)(ei
+ 1);
3099 WARN_ON(owner_objectid
!= btrfs_tree_block_level(leaf
, bi
));
3102 refs
= btrfs_extent_refs(leaf
, ei
);
3103 if (refs
< refs_to_drop
) {
3105 "trying to drop %d refs but we only have %Lu for bytenr %Lu",
3106 refs_to_drop
, refs
, bytenr
);
3108 btrfs_abort_transaction(trans
, ret
);
3111 refs
-= refs_to_drop
;
3115 __run_delayed_extent_op(extent_op
, leaf
, ei
);
3117 * In the case of inline back ref, reference count will
3118 * be updated by remove_extent_backref
3121 BUG_ON(!found_extent
);
3123 btrfs_set_extent_refs(leaf
, ei
, refs
);
3124 btrfs_mark_buffer_dirty(leaf
);
3127 ret
= remove_extent_backref(trans
, path
, iref
,
3128 refs_to_drop
, is_data
,
3131 btrfs_abort_transaction(trans
, ret
);
3137 BUG_ON(is_data
&& refs_to_drop
!=
3138 extent_data_ref_count(path
, iref
));
3140 BUG_ON(path
->slots
[0] != extent_slot
);
3142 BUG_ON(path
->slots
[0] != extent_slot
+ 1);
3143 path
->slots
[0] = extent_slot
;
3149 ret
= btrfs_del_items(trans
, extent_root
, path
, path
->slots
[0],
3152 btrfs_abort_transaction(trans
, ret
);
3155 btrfs_release_path(path
);
3158 ret
= btrfs_del_csums(trans
, info
, bytenr
, num_bytes
);
3160 btrfs_abort_transaction(trans
, ret
);
3165 ret
= add_to_free_space_tree(trans
, bytenr
, num_bytes
);
3167 btrfs_abort_transaction(trans
, ret
);
3171 ret
= btrfs_update_block_group(trans
, bytenr
, num_bytes
, 0);
3173 btrfs_abort_transaction(trans
, ret
);
3177 btrfs_release_path(path
);
3180 btrfs_free_path(path
);
3185 * when we free an block, it is possible (and likely) that we free the last
3186 * delayed ref for that extent as well. This searches the delayed ref tree for
3187 * a given extent, and if there are no other delayed refs to be processed, it
3188 * removes it from the tree.
3190 static noinline
int check_ref_cleanup(struct btrfs_trans_handle
*trans
,
3193 struct btrfs_delayed_ref_head
*head
;
3194 struct btrfs_delayed_ref_root
*delayed_refs
;
3197 delayed_refs
= &trans
->transaction
->delayed_refs
;
3198 spin_lock(&delayed_refs
->lock
);
3199 head
= btrfs_find_delayed_ref_head(delayed_refs
, bytenr
);
3201 goto out_delayed_unlock
;
3203 spin_lock(&head
->lock
);
3204 if (!RB_EMPTY_ROOT(&head
->ref_tree
.rb_root
))
3207 if (cleanup_extent_op(head
) != NULL
)
3211 * waiting for the lock here would deadlock. If someone else has it
3212 * locked they are already in the process of dropping it anyway
3214 if (!mutex_trylock(&head
->mutex
))
3217 btrfs_delete_ref_head(delayed_refs
, head
);
3218 head
->processing
= 0;
3220 spin_unlock(&head
->lock
);
3221 spin_unlock(&delayed_refs
->lock
);
3223 BUG_ON(head
->extent_op
);
3224 if (head
->must_insert_reserved
)
3227 btrfs_cleanup_ref_head_accounting(trans
->fs_info
, delayed_refs
, head
);
3228 mutex_unlock(&head
->mutex
);
3229 btrfs_put_delayed_ref_head(head
);
3232 spin_unlock(&head
->lock
);
3235 spin_unlock(&delayed_refs
->lock
);
3239 void btrfs_free_tree_block(struct btrfs_trans_handle
*trans
,
3240 struct btrfs_root
*root
,
3241 struct extent_buffer
*buf
,
3242 u64 parent
, int last_ref
)
3244 struct btrfs_fs_info
*fs_info
= root
->fs_info
;
3245 struct btrfs_ref generic_ref
= { 0 };
3249 btrfs_init_generic_ref(&generic_ref
, BTRFS_DROP_DELAYED_REF
,
3250 buf
->start
, buf
->len
, parent
);
3251 btrfs_init_tree_ref(&generic_ref
, btrfs_header_level(buf
),
3252 root
->root_key
.objectid
);
3254 if (root
->root_key
.objectid
!= BTRFS_TREE_LOG_OBJECTID
) {
3255 int old_ref_mod
, new_ref_mod
;
3257 btrfs_ref_tree_mod(fs_info
, &generic_ref
);
3258 ret
= btrfs_add_delayed_tree_ref(trans
, &generic_ref
, NULL
,
3259 &old_ref_mod
, &new_ref_mod
);
3260 BUG_ON(ret
); /* -ENOMEM */
3261 pin
= old_ref_mod
>= 0 && new_ref_mod
< 0;
3264 if (last_ref
&& btrfs_header_generation(buf
) == trans
->transid
) {
3265 struct btrfs_block_group_cache
*cache
;
3267 if (root
->root_key
.objectid
!= BTRFS_TREE_LOG_OBJECTID
) {
3268 ret
= check_ref_cleanup(trans
, buf
->start
);
3274 cache
= btrfs_lookup_block_group(fs_info
, buf
->start
);
3276 if (btrfs_header_flag(buf
, BTRFS_HEADER_FLAG_WRITTEN
)) {
3277 pin_down_extent(cache
, buf
->start
, buf
->len
, 1);
3278 btrfs_put_block_group(cache
);
3282 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY
, &buf
->bflags
));
3284 btrfs_add_free_space(cache
, buf
->start
, buf
->len
);
3285 btrfs_free_reserved_bytes(cache
, buf
->len
, 0);
3286 btrfs_put_block_group(cache
);
3287 trace_btrfs_reserved_extent_free(fs_info
, buf
->start
, buf
->len
);
3291 add_pinned_bytes(fs_info
, &generic_ref
);
3295 * Deleting the buffer, clear the corrupt flag since it doesn't
3298 clear_bit(EXTENT_BUFFER_CORRUPT
, &buf
->bflags
);
3302 /* Can return -ENOMEM */
3303 int btrfs_free_extent(struct btrfs_trans_handle
*trans
, struct btrfs_ref
*ref
)
3305 struct btrfs_fs_info
*fs_info
= trans
->fs_info
;
3306 int old_ref_mod
, new_ref_mod
;
3309 if (btrfs_is_testing(fs_info
))
3313 * tree log blocks never actually go into the extent allocation
3314 * tree, just update pinning info and exit early.
3316 if ((ref
->type
== BTRFS_REF_METADATA
&&
3317 ref
->tree_ref
.root
== BTRFS_TREE_LOG_OBJECTID
) ||
3318 (ref
->type
== BTRFS_REF_DATA
&&
3319 ref
->data_ref
.ref_root
== BTRFS_TREE_LOG_OBJECTID
)) {
3320 /* unlocks the pinned mutex */
3321 btrfs_pin_extent(fs_info
, ref
->bytenr
, ref
->len
, 1);
3322 old_ref_mod
= new_ref_mod
= 0;
3324 } else if (ref
->type
== BTRFS_REF_METADATA
) {
3325 ret
= btrfs_add_delayed_tree_ref(trans
, ref
, NULL
,
3326 &old_ref_mod
, &new_ref_mod
);
3328 ret
= btrfs_add_delayed_data_ref(trans
, ref
, 0,
3329 &old_ref_mod
, &new_ref_mod
);
3332 if (!((ref
->type
== BTRFS_REF_METADATA
&&
3333 ref
->tree_ref
.root
== BTRFS_TREE_LOG_OBJECTID
) ||
3334 (ref
->type
== BTRFS_REF_DATA
&&
3335 ref
->data_ref
.ref_root
== BTRFS_TREE_LOG_OBJECTID
)))
3336 btrfs_ref_tree_mod(fs_info
, ref
);
3338 if (ret
== 0 && old_ref_mod
>= 0 && new_ref_mod
< 0)
3339 add_pinned_bytes(fs_info
, ref
);
3344 enum btrfs_loop_type
{
3345 LOOP_CACHING_NOWAIT
,
3352 btrfs_lock_block_group(struct btrfs_block_group_cache
*cache
,
3356 down_read(&cache
->data_rwsem
);
3360 btrfs_grab_block_group(struct btrfs_block_group_cache
*cache
,
3363 btrfs_get_block_group(cache
);
3365 down_read(&cache
->data_rwsem
);
3368 static struct btrfs_block_group_cache
*
3369 btrfs_lock_cluster(struct btrfs_block_group_cache
*block_group
,
3370 struct btrfs_free_cluster
*cluster
,
3373 struct btrfs_block_group_cache
*used_bg
= NULL
;
3375 spin_lock(&cluster
->refill_lock
);
3377 used_bg
= cluster
->block_group
;
3381 if (used_bg
== block_group
)
3384 btrfs_get_block_group(used_bg
);
3389 if (down_read_trylock(&used_bg
->data_rwsem
))
3392 spin_unlock(&cluster
->refill_lock
);
3394 /* We should only have one-level nested. */
3395 down_read_nested(&used_bg
->data_rwsem
, SINGLE_DEPTH_NESTING
);
3397 spin_lock(&cluster
->refill_lock
);
3398 if (used_bg
== cluster
->block_group
)
3401 up_read(&used_bg
->data_rwsem
);
3402 btrfs_put_block_group(used_bg
);
3407 btrfs_release_block_group(struct btrfs_block_group_cache
*cache
,
3411 up_read(&cache
->data_rwsem
);
3412 btrfs_put_block_group(cache
);
3416 * Structure used internally for find_free_extent() function. Wraps needed
3419 struct find_free_extent_ctl
{
3420 /* Basic allocation info */
3427 /* Where to start the search inside the bg */
3430 /* For clustered allocation */
3433 bool have_caching_bg
;
3434 bool orig_have_caching_bg
;
3436 /* RAID index, converted from flags */
3440 * Current loop number, check find_free_extent_update_loop() for details
3445 * Whether we're refilling a cluster, if true we need to re-search
3446 * current block group but don't try to refill the cluster again.
3448 bool retry_clustered
;
3451 * Whether we're updating free space cache, if true we need to re-search
3452 * current block group but don't try updating free space cache again.
3454 bool retry_unclustered
;
3456 /* If current block group is cached */
3459 /* Max contiguous hole found */
3460 u64 max_extent_size
;
3462 /* Total free space from free space cache, not always contiguous */
3463 u64 total_free_space
;
3471 * Helper function for find_free_extent().
3473 * Return -ENOENT to inform caller that we need fallback to unclustered mode.
3474 * Return -EAGAIN to inform caller that we need to re-search this block group
3475 * Return >0 to inform caller that we find nothing
3476 * Return 0 means we have found a location and set ffe_ctl->found_offset.
3478 static int find_free_extent_clustered(struct btrfs_block_group_cache
*bg
,
3479 struct btrfs_free_cluster
*last_ptr
,
3480 struct find_free_extent_ctl
*ffe_ctl
,
3481 struct btrfs_block_group_cache
**cluster_bg_ret
)
3483 struct btrfs_block_group_cache
*cluster_bg
;
3484 u64 aligned_cluster
;
3488 cluster_bg
= btrfs_lock_cluster(bg
, last_ptr
, ffe_ctl
->delalloc
);
3490 goto refill_cluster
;
3491 if (cluster_bg
!= bg
&& (cluster_bg
->ro
||
3492 !block_group_bits(cluster_bg
, ffe_ctl
->flags
)))
3493 goto release_cluster
;
3495 offset
= btrfs_alloc_from_cluster(cluster_bg
, last_ptr
,
3496 ffe_ctl
->num_bytes
, cluster_bg
->key
.objectid
,
3497 &ffe_ctl
->max_extent_size
);
3499 /* We have a block, we're done */
3500 spin_unlock(&last_ptr
->refill_lock
);
3501 trace_btrfs_reserve_extent_cluster(cluster_bg
,
3502 ffe_ctl
->search_start
, ffe_ctl
->num_bytes
);
3503 *cluster_bg_ret
= cluster_bg
;
3504 ffe_ctl
->found_offset
= offset
;
3507 WARN_ON(last_ptr
->block_group
!= cluster_bg
);
3511 * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new clusters, so
3512 * lets just skip it and let the allocator find whatever block it can
3513 * find. If we reach this point, we will have tried the cluster
3514 * allocator plenty of times and not have found anything, so we are
3515 * likely way too fragmented for the clustering stuff to find anything.
3517 * However, if the cluster is taken from the current block group,
3518 * release the cluster first, so that we stand a better chance of
3519 * succeeding in the unclustered allocation.
3521 if (ffe_ctl
->loop
>= LOOP_NO_EMPTY_SIZE
&& cluster_bg
!= bg
) {
3522 spin_unlock(&last_ptr
->refill_lock
);
3523 btrfs_release_block_group(cluster_bg
, ffe_ctl
->delalloc
);
3527 /* This cluster didn't work out, free it and start over */
3528 btrfs_return_cluster_to_free_space(NULL
, last_ptr
);
3530 if (cluster_bg
!= bg
)
3531 btrfs_release_block_group(cluster_bg
, ffe_ctl
->delalloc
);
3534 if (ffe_ctl
->loop
>= LOOP_NO_EMPTY_SIZE
) {
3535 spin_unlock(&last_ptr
->refill_lock
);
3539 aligned_cluster
= max_t(u64
,
3540 ffe_ctl
->empty_cluster
+ ffe_ctl
->empty_size
,
3541 bg
->full_stripe_len
);
3542 ret
= btrfs_find_space_cluster(bg
, last_ptr
, ffe_ctl
->search_start
,
3543 ffe_ctl
->num_bytes
, aligned_cluster
);
3545 /* Now pull our allocation out of this cluster */
3546 offset
= btrfs_alloc_from_cluster(bg
, last_ptr
,
3547 ffe_ctl
->num_bytes
, ffe_ctl
->search_start
,
3548 &ffe_ctl
->max_extent_size
);
3550 /* We found one, proceed */
3551 spin_unlock(&last_ptr
->refill_lock
);
3552 trace_btrfs_reserve_extent_cluster(bg
,
3553 ffe_ctl
->search_start
,
3554 ffe_ctl
->num_bytes
);
3555 ffe_ctl
->found_offset
= offset
;
3558 } else if (!ffe_ctl
->cached
&& ffe_ctl
->loop
> LOOP_CACHING_NOWAIT
&&
3559 !ffe_ctl
->retry_clustered
) {
3560 spin_unlock(&last_ptr
->refill_lock
);
3562 ffe_ctl
->retry_clustered
= true;
3563 btrfs_wait_block_group_cache_progress(bg
, ffe_ctl
->num_bytes
+
3564 ffe_ctl
->empty_cluster
+ ffe_ctl
->empty_size
);
3568 * At this point we either didn't find a cluster or we weren't able to
3569 * allocate a block from our cluster. Free the cluster we've been
3570 * trying to use, and go to the next block group.
3572 btrfs_return_cluster_to_free_space(NULL
, last_ptr
);
3573 spin_unlock(&last_ptr
->refill_lock
);
3578 * Return >0 to inform caller that we find nothing
3579 * Return 0 when we found an free extent and set ffe_ctrl->found_offset
3580 * Return -EAGAIN to inform caller that we need to re-search this block group
3582 static int find_free_extent_unclustered(struct btrfs_block_group_cache
*bg
,
3583 struct btrfs_free_cluster
*last_ptr
,
3584 struct find_free_extent_ctl
*ffe_ctl
)
3589 * We are doing an unclustered allocation, set the fragmented flag so
3590 * we don't bother trying to setup a cluster again until we get more
3593 if (unlikely(last_ptr
)) {
3594 spin_lock(&last_ptr
->lock
);
3595 last_ptr
->fragmented
= 1;
3596 spin_unlock(&last_ptr
->lock
);
3598 if (ffe_ctl
->cached
) {
3599 struct btrfs_free_space_ctl
*free_space_ctl
;
3601 free_space_ctl
= bg
->free_space_ctl
;
3602 spin_lock(&free_space_ctl
->tree_lock
);
3603 if (free_space_ctl
->free_space
<
3604 ffe_ctl
->num_bytes
+ ffe_ctl
->empty_cluster
+
3605 ffe_ctl
->empty_size
) {
3606 ffe_ctl
->total_free_space
= max_t(u64
,
3607 ffe_ctl
->total_free_space
,
3608 free_space_ctl
->free_space
);
3609 spin_unlock(&free_space_ctl
->tree_lock
);
3612 spin_unlock(&free_space_ctl
->tree_lock
);
3615 offset
= btrfs_find_space_for_alloc(bg
, ffe_ctl
->search_start
,
3616 ffe_ctl
->num_bytes
, ffe_ctl
->empty_size
,
3617 &ffe_ctl
->max_extent_size
);
3620 * If we didn't find a chunk, and we haven't failed on this block group
3621 * before, and this block group is in the middle of caching and we are
3622 * ok with waiting, then go ahead and wait for progress to be made, and
3623 * set @retry_unclustered to true.
3625 * If @retry_unclustered is true then we've already waited on this
3626 * block group once and should move on to the next block group.
3628 if (!offset
&& !ffe_ctl
->retry_unclustered
&& !ffe_ctl
->cached
&&
3629 ffe_ctl
->loop
> LOOP_CACHING_NOWAIT
) {
3630 btrfs_wait_block_group_cache_progress(bg
, ffe_ctl
->num_bytes
+
3631 ffe_ctl
->empty_size
);
3632 ffe_ctl
->retry_unclustered
= true;
3634 } else if (!offset
) {
3637 ffe_ctl
->found_offset
= offset
;
3642 * Return >0 means caller needs to re-search for free extent
3643 * Return 0 means we have the needed free extent.
3644 * Return <0 means we failed to locate any free extent.
3646 static int find_free_extent_update_loop(struct btrfs_fs_info
*fs_info
,
3647 struct btrfs_free_cluster
*last_ptr
,
3648 struct btrfs_key
*ins
,
3649 struct find_free_extent_ctl
*ffe_ctl
,
3650 int full_search
, bool use_cluster
)
3652 struct btrfs_root
*root
= fs_info
->extent_root
;
3655 if ((ffe_ctl
->loop
== LOOP_CACHING_NOWAIT
) &&
3656 ffe_ctl
->have_caching_bg
&& !ffe_ctl
->orig_have_caching_bg
)
3657 ffe_ctl
->orig_have_caching_bg
= true;
3659 if (!ins
->objectid
&& ffe_ctl
->loop
>= LOOP_CACHING_WAIT
&&
3660 ffe_ctl
->have_caching_bg
)
3663 if (!ins
->objectid
&& ++(ffe_ctl
->index
) < BTRFS_NR_RAID_TYPES
)
3666 if (ins
->objectid
) {
3667 if (!use_cluster
&& last_ptr
) {
3668 spin_lock(&last_ptr
->lock
);
3669 last_ptr
->window_start
= ins
->objectid
;
3670 spin_unlock(&last_ptr
->lock
);
3676 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
3677 * caching kthreads as we move along
3678 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
3679 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
3680 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
3683 if (ffe_ctl
->loop
< LOOP_NO_EMPTY_SIZE
) {
3685 if (ffe_ctl
->loop
== LOOP_CACHING_NOWAIT
) {
3687 * We want to skip the LOOP_CACHING_WAIT step if we
3688 * don't have any uncached bgs and we've already done a
3689 * full search through.
3691 if (ffe_ctl
->orig_have_caching_bg
|| !full_search
)
3692 ffe_ctl
->loop
= LOOP_CACHING_WAIT
;
3694 ffe_ctl
->loop
= LOOP_ALLOC_CHUNK
;
3699 if (ffe_ctl
->loop
== LOOP_ALLOC_CHUNK
) {
3700 struct btrfs_trans_handle
*trans
;
3703 trans
= current
->journal_info
;
3707 trans
= btrfs_join_transaction(root
);
3709 if (IS_ERR(trans
)) {
3710 ret
= PTR_ERR(trans
);
3714 ret
= btrfs_chunk_alloc(trans
, ffe_ctl
->flags
,
3718 * If we can't allocate a new chunk we've already looped
3719 * through at least once, move on to the NO_EMPTY_SIZE
3723 ffe_ctl
->loop
= LOOP_NO_EMPTY_SIZE
;
3725 /* Do not bail out on ENOSPC since we can do more. */
3726 if (ret
< 0 && ret
!= -ENOSPC
)
3727 btrfs_abort_transaction(trans
, ret
);
3731 btrfs_end_transaction(trans
);
3736 if (ffe_ctl
->loop
== LOOP_NO_EMPTY_SIZE
) {
3738 * Don't loop again if we already have no empty_size and
3741 if (ffe_ctl
->empty_size
== 0 &&
3742 ffe_ctl
->empty_cluster
== 0)
3744 ffe_ctl
->empty_size
= 0;
3745 ffe_ctl
->empty_cluster
= 0;
3753 * walks the btree of allocated extents and find a hole of a given size.
3754 * The key ins is changed to record the hole:
3755 * ins->objectid == start position
3756 * ins->flags = BTRFS_EXTENT_ITEM_KEY
3757 * ins->offset == the size of the hole.
3758 * Any available blocks before search_start are skipped.
3760 * If there is no suitable free space, we will record the max size of
3761 * the free space extent currently.
3763 * The overall logic and call chain:
3765 * find_free_extent()
3766 * |- Iterate through all block groups
3767 * | |- Get a valid block group
3768 * | |- Try to do clustered allocation in that block group
3769 * | |- Try to do unclustered allocation in that block group
3770 * | |- Check if the result is valid
3771 * | | |- If valid, then exit
3772 * | |- Jump to next block group
3774 * |- Push harder to find free extents
3775 * |- If not found, re-iterate all block groups
3777 static noinline
int find_free_extent(struct btrfs_fs_info
*fs_info
,
3778 u64 ram_bytes
, u64 num_bytes
, u64 empty_size
,
3779 u64 hint_byte
, struct btrfs_key
*ins
,
3780 u64 flags
, int delalloc
)
3783 struct btrfs_free_cluster
*last_ptr
= NULL
;
3784 struct btrfs_block_group_cache
*block_group
= NULL
;
3785 struct find_free_extent_ctl ffe_ctl
= {0};
3786 struct btrfs_space_info
*space_info
;
3787 bool use_cluster
= true;
3788 bool full_search
= false;
3790 WARN_ON(num_bytes
< fs_info
->sectorsize
);
3792 ffe_ctl
.ram_bytes
= ram_bytes
;
3793 ffe_ctl
.num_bytes
= num_bytes
;
3794 ffe_ctl
.empty_size
= empty_size
;
3795 ffe_ctl
.flags
= flags
;
3796 ffe_ctl
.search_start
= 0;
3797 ffe_ctl
.retry_clustered
= false;
3798 ffe_ctl
.retry_unclustered
= false;
3799 ffe_ctl
.delalloc
= delalloc
;
3800 ffe_ctl
.index
= btrfs_bg_flags_to_raid_index(flags
);
3801 ffe_ctl
.have_caching_bg
= false;
3802 ffe_ctl
.orig_have_caching_bg
= false;
3803 ffe_ctl
.found_offset
= 0;
3805 ins
->type
= BTRFS_EXTENT_ITEM_KEY
;
3809 trace_find_free_extent(fs_info
, num_bytes
, empty_size
, flags
);
3811 space_info
= btrfs_find_space_info(fs_info
, flags
);
3813 btrfs_err(fs_info
, "No space info for %llu", flags
);
3818 * If our free space is heavily fragmented we may not be able to make
3819 * big contiguous allocations, so instead of doing the expensive search
3820 * for free space, simply return ENOSPC with our max_extent_size so we
3821 * can go ahead and search for a more manageable chunk.
3823 * If our max_extent_size is large enough for our allocation simply
3824 * disable clustering since we will likely not be able to find enough
3825 * space to create a cluster and induce latency trying.
3827 if (unlikely(space_info
->max_extent_size
)) {
3828 spin_lock(&space_info
->lock
);
3829 if (space_info
->max_extent_size
&&
3830 num_bytes
> space_info
->max_extent_size
) {
3831 ins
->offset
= space_info
->max_extent_size
;
3832 spin_unlock(&space_info
->lock
);
3834 } else if (space_info
->max_extent_size
) {
3835 use_cluster
= false;
3837 spin_unlock(&space_info
->lock
);
3840 last_ptr
= fetch_cluster_info(fs_info
, space_info
,
3841 &ffe_ctl
.empty_cluster
);
3843 spin_lock(&last_ptr
->lock
);
3844 if (last_ptr
->block_group
)
3845 hint_byte
= last_ptr
->window_start
;
3846 if (last_ptr
->fragmented
) {
3848 * We still set window_start so we can keep track of the
3849 * last place we found an allocation to try and save
3852 hint_byte
= last_ptr
->window_start
;
3853 use_cluster
= false;
3855 spin_unlock(&last_ptr
->lock
);
3858 ffe_ctl
.search_start
= max(ffe_ctl
.search_start
,
3859 first_logical_byte(fs_info
, 0));
3860 ffe_ctl
.search_start
= max(ffe_ctl
.search_start
, hint_byte
);
3861 if (ffe_ctl
.search_start
== hint_byte
) {
3862 block_group
= btrfs_lookup_block_group(fs_info
,
3863 ffe_ctl
.search_start
);
3865 * we don't want to use the block group if it doesn't match our
3866 * allocation bits, or if its not cached.
3868 * However if we are re-searching with an ideal block group
3869 * picked out then we don't care that the block group is cached.
3871 if (block_group
&& block_group_bits(block_group
, flags
) &&
3872 block_group
->cached
!= BTRFS_CACHE_NO
) {
3873 down_read(&space_info
->groups_sem
);
3874 if (list_empty(&block_group
->list
) ||
3877 * someone is removing this block group,
3878 * we can't jump into the have_block_group
3879 * target because our list pointers are not
3882 btrfs_put_block_group(block_group
);
3883 up_read(&space_info
->groups_sem
);
3885 ffe_ctl
.index
= btrfs_bg_flags_to_raid_index(
3886 block_group
->flags
);
3887 btrfs_lock_block_group(block_group
, delalloc
);
3888 goto have_block_group
;
3890 } else if (block_group
) {
3891 btrfs_put_block_group(block_group
);
3895 ffe_ctl
.have_caching_bg
= false;
3896 if (ffe_ctl
.index
== btrfs_bg_flags_to_raid_index(flags
) ||
3899 down_read(&space_info
->groups_sem
);
3900 list_for_each_entry(block_group
,
3901 &space_info
->block_groups
[ffe_ctl
.index
], list
) {
3902 /* If the block group is read-only, we can skip it entirely. */
3903 if (unlikely(block_group
->ro
))
3906 btrfs_grab_block_group(block_group
, delalloc
);
3907 ffe_ctl
.search_start
= block_group
->key
.objectid
;
3910 * this can happen if we end up cycling through all the
3911 * raid types, but we want to make sure we only allocate
3912 * for the proper type.
3914 if (!block_group_bits(block_group
, flags
)) {
3915 u64 extra
= BTRFS_BLOCK_GROUP_DUP
|
3916 BTRFS_BLOCK_GROUP_RAID1_MASK
|
3917 BTRFS_BLOCK_GROUP_RAID56_MASK
|
3918 BTRFS_BLOCK_GROUP_RAID10
;
3921 * if they asked for extra copies and this block group
3922 * doesn't provide them, bail. This does allow us to
3923 * fill raid0 from raid1.
3925 if ((flags
& extra
) && !(block_group
->flags
& extra
))
3929 * This block group has different flags than we want.
3930 * It's possible that we have MIXED_GROUP flag but no
3931 * block group is mixed. Just skip such block group.
3933 btrfs_release_block_group(block_group
, delalloc
);
3938 ffe_ctl
.cached
= btrfs_block_group_cache_done(block_group
);
3939 if (unlikely(!ffe_ctl
.cached
)) {
3940 ffe_ctl
.have_caching_bg
= true;
3941 ret
= btrfs_cache_block_group(block_group
, 0);
3946 if (unlikely(block_group
->cached
== BTRFS_CACHE_ERROR
))
3950 * Ok we want to try and use the cluster allocator, so
3953 if (last_ptr
&& use_cluster
) {
3954 struct btrfs_block_group_cache
*cluster_bg
= NULL
;
3956 ret
= find_free_extent_clustered(block_group
, last_ptr
,
3957 &ffe_ctl
, &cluster_bg
);
3960 if (cluster_bg
&& cluster_bg
!= block_group
) {
3961 btrfs_release_block_group(block_group
,
3963 block_group
= cluster_bg
;
3966 } else if (ret
== -EAGAIN
) {
3967 goto have_block_group
;
3968 } else if (ret
> 0) {
3971 /* ret == -ENOENT case falls through */
3974 ret
= find_free_extent_unclustered(block_group
, last_ptr
,
3977 goto have_block_group
;
3980 /* ret == 0 case falls through */
3982 ffe_ctl
.search_start
= round_up(ffe_ctl
.found_offset
,
3983 fs_info
->stripesize
);
3985 /* move on to the next group */
3986 if (ffe_ctl
.search_start
+ num_bytes
>
3987 block_group
->key
.objectid
+ block_group
->key
.offset
) {
3988 btrfs_add_free_space(block_group
, ffe_ctl
.found_offset
,
3993 if (ffe_ctl
.found_offset
< ffe_ctl
.search_start
)
3994 btrfs_add_free_space(block_group
, ffe_ctl
.found_offset
,
3995 ffe_ctl
.search_start
- ffe_ctl
.found_offset
);
3997 ret
= btrfs_add_reserved_bytes(block_group
, ram_bytes
,
3998 num_bytes
, delalloc
);
3999 if (ret
== -EAGAIN
) {
4000 btrfs_add_free_space(block_group
, ffe_ctl
.found_offset
,
4004 btrfs_inc_block_group_reservations(block_group
);
4006 /* we are all good, lets return */
4007 ins
->objectid
= ffe_ctl
.search_start
;
4008 ins
->offset
= num_bytes
;
4010 trace_btrfs_reserve_extent(block_group
, ffe_ctl
.search_start
,
4012 btrfs_release_block_group(block_group
, delalloc
);
4015 ffe_ctl
.retry_clustered
= false;
4016 ffe_ctl
.retry_unclustered
= false;
4017 BUG_ON(btrfs_bg_flags_to_raid_index(block_group
->flags
) !=
4019 btrfs_release_block_group(block_group
, delalloc
);
4022 up_read(&space_info
->groups_sem
);
4024 ret
= find_free_extent_update_loop(fs_info
, last_ptr
, ins
, &ffe_ctl
,
4025 full_search
, use_cluster
);
4029 if (ret
== -ENOSPC
) {
4031 * Use ffe_ctl->total_free_space as fallback if we can't find
4032 * any contiguous hole.
4034 if (!ffe_ctl
.max_extent_size
)
4035 ffe_ctl
.max_extent_size
= ffe_ctl
.total_free_space
;
4036 spin_lock(&space_info
->lock
);
4037 space_info
->max_extent_size
= ffe_ctl
.max_extent_size
;
4038 spin_unlock(&space_info
->lock
);
4039 ins
->offset
= ffe_ctl
.max_extent_size
;
4045 * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a
4046 * hole that is at least as big as @num_bytes.
4048 * @root - The root that will contain this extent
4050 * @ram_bytes - The amount of space in ram that @num_bytes take. This
4051 * is used for accounting purposes. This value differs
4052 * from @num_bytes only in the case of compressed extents.
4054 * @num_bytes - Number of bytes to allocate on-disk.
4056 * @min_alloc_size - Indicates the minimum amount of space that the
4057 * allocator should try to satisfy. In some cases
4058 * @num_bytes may be larger than what is required and if
4059 * the filesystem is fragmented then allocation fails.
4060 * However, the presence of @min_alloc_size gives a
4061 * chance to try and satisfy the smaller allocation.
4063 * @empty_size - A hint that you plan on doing more COW. This is the
4064 * size in bytes the allocator should try to find free
4065 * next to the block it returns. This is just a hint and
4066 * may be ignored by the allocator.
4068 * @hint_byte - Hint to the allocator to start searching above the byte
4069 * address passed. It might be ignored.
4071 * @ins - This key is modified to record the found hole. It will
4072 * have the following values:
4073 * ins->objectid == start position
4074 * ins->flags = BTRFS_EXTENT_ITEM_KEY
4075 * ins->offset == the size of the hole.
4077 * @is_data - Boolean flag indicating whether an extent is
4078 * allocated for data (true) or metadata (false)
4080 * @delalloc - Boolean flag indicating whether this allocation is for
4081 * delalloc or not. If 'true' data_rwsem of block groups
4082 * is going to be acquired.
4085 * Returns 0 when an allocation succeeded or < 0 when an error occurred. In
4086 * case -ENOSPC is returned then @ins->offset will contain the size of the
4087 * largest available hole the allocator managed to find.
4089 int btrfs_reserve_extent(struct btrfs_root
*root
, u64 ram_bytes
,
4090 u64 num_bytes
, u64 min_alloc_size
,
4091 u64 empty_size
, u64 hint_byte
,
4092 struct btrfs_key
*ins
, int is_data
, int delalloc
)
4094 struct btrfs_fs_info
*fs_info
= root
->fs_info
;
4095 bool final_tried
= num_bytes
== min_alloc_size
;
4099 flags
= get_alloc_profile_by_root(root
, is_data
);
4101 WARN_ON(num_bytes
< fs_info
->sectorsize
);
4102 ret
= find_free_extent(fs_info
, ram_bytes
, num_bytes
, empty_size
,
4103 hint_byte
, ins
, flags
, delalloc
);
4104 if (!ret
&& !is_data
) {
4105 btrfs_dec_block_group_reservations(fs_info
, ins
->objectid
);
4106 } else if (ret
== -ENOSPC
) {
4107 if (!final_tried
&& ins
->offset
) {
4108 num_bytes
= min(num_bytes
>> 1, ins
->offset
);
4109 num_bytes
= round_down(num_bytes
,
4110 fs_info
->sectorsize
);
4111 num_bytes
= max(num_bytes
, min_alloc_size
);
4112 ram_bytes
= num_bytes
;
4113 if (num_bytes
== min_alloc_size
)
4116 } else if (btrfs_test_opt(fs_info
, ENOSPC_DEBUG
)) {
4117 struct btrfs_space_info
*sinfo
;
4119 sinfo
= btrfs_find_space_info(fs_info
, flags
);
4121 "allocation failed flags %llu, wanted %llu",
4124 btrfs_dump_space_info(fs_info
, sinfo
,
4132 static int __btrfs_free_reserved_extent(struct btrfs_fs_info
*fs_info
,
4134 int pin
, int delalloc
)
4136 struct btrfs_block_group_cache
*cache
;
4139 cache
= btrfs_lookup_block_group(fs_info
, start
);
4141 btrfs_err(fs_info
, "Unable to find block group for %llu",
4147 pin_down_extent(cache
, start
, len
, 1);
4149 if (btrfs_test_opt(fs_info
, DISCARD
))
4150 ret
= btrfs_discard_extent(fs_info
, start
, len
, NULL
);
4151 btrfs_add_free_space(cache
, start
, len
);
4152 btrfs_free_reserved_bytes(cache
, len
, delalloc
);
4153 trace_btrfs_reserved_extent_free(fs_info
, start
, len
);
4156 btrfs_put_block_group(cache
);
4160 int btrfs_free_reserved_extent(struct btrfs_fs_info
*fs_info
,
4161 u64 start
, u64 len
, int delalloc
)
4163 return __btrfs_free_reserved_extent(fs_info
, start
, len
, 0, delalloc
);
4166 int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info
*fs_info
,
4169 return __btrfs_free_reserved_extent(fs_info
, start
, len
, 1, 0);
4172 static int alloc_reserved_file_extent(struct btrfs_trans_handle
*trans
,
4173 u64 parent
, u64 root_objectid
,
4174 u64 flags
, u64 owner
, u64 offset
,
4175 struct btrfs_key
*ins
, int ref_mod
)
4177 struct btrfs_fs_info
*fs_info
= trans
->fs_info
;
4179 struct btrfs_extent_item
*extent_item
;
4180 struct btrfs_extent_inline_ref
*iref
;
4181 struct btrfs_path
*path
;
4182 struct extent_buffer
*leaf
;
4187 type
= BTRFS_SHARED_DATA_REF_KEY
;
4189 type
= BTRFS_EXTENT_DATA_REF_KEY
;
4191 size
= sizeof(*extent_item
) + btrfs_extent_inline_ref_size(type
);
4193 path
= btrfs_alloc_path();
4197 path
->leave_spinning
= 1;
4198 ret
= btrfs_insert_empty_item(trans
, fs_info
->extent_root
, path
,
4201 btrfs_free_path(path
);
4205 leaf
= path
->nodes
[0];
4206 extent_item
= btrfs_item_ptr(leaf
, path
->slots
[0],
4207 struct btrfs_extent_item
);
4208 btrfs_set_extent_refs(leaf
, extent_item
, ref_mod
);
4209 btrfs_set_extent_generation(leaf
, extent_item
, trans
->transid
);
4210 btrfs_set_extent_flags(leaf
, extent_item
,
4211 flags
| BTRFS_EXTENT_FLAG_DATA
);
4213 iref
= (struct btrfs_extent_inline_ref
*)(extent_item
+ 1);
4214 btrfs_set_extent_inline_ref_type(leaf
, iref
, type
);
4216 struct btrfs_shared_data_ref
*ref
;
4217 ref
= (struct btrfs_shared_data_ref
*)(iref
+ 1);
4218 btrfs_set_extent_inline_ref_offset(leaf
, iref
, parent
);
4219 btrfs_set_shared_data_ref_count(leaf
, ref
, ref_mod
);
4221 struct btrfs_extent_data_ref
*ref
;
4222 ref
= (struct btrfs_extent_data_ref
*)(&iref
->offset
);
4223 btrfs_set_extent_data_ref_root(leaf
, ref
, root_objectid
);
4224 btrfs_set_extent_data_ref_objectid(leaf
, ref
, owner
);
4225 btrfs_set_extent_data_ref_offset(leaf
, ref
, offset
);
4226 btrfs_set_extent_data_ref_count(leaf
, ref
, ref_mod
);
4229 btrfs_mark_buffer_dirty(path
->nodes
[0]);
4230 btrfs_free_path(path
);
4232 ret
= remove_from_free_space_tree(trans
, ins
->objectid
, ins
->offset
);
4236 ret
= btrfs_update_block_group(trans
, ins
->objectid
, ins
->offset
, 1);
4237 if (ret
) { /* -ENOENT, logic error */
4238 btrfs_err(fs_info
, "update block group failed for %llu %llu",
4239 ins
->objectid
, ins
->offset
);
4242 trace_btrfs_reserved_extent_alloc(fs_info
, ins
->objectid
, ins
->offset
);
4246 static int alloc_reserved_tree_block(struct btrfs_trans_handle
*trans
,
4247 struct btrfs_delayed_ref_node
*node
,
4248 struct btrfs_delayed_extent_op
*extent_op
)
4250 struct btrfs_fs_info
*fs_info
= trans
->fs_info
;
4252 struct btrfs_extent_item
*extent_item
;
4253 struct btrfs_key extent_key
;
4254 struct btrfs_tree_block_info
*block_info
;
4255 struct btrfs_extent_inline_ref
*iref
;
4256 struct btrfs_path
*path
;
4257 struct extent_buffer
*leaf
;
4258 struct btrfs_delayed_tree_ref
*ref
;
4259 u32 size
= sizeof(*extent_item
) + sizeof(*iref
);
4261 u64 flags
= extent_op
->flags_to_set
;
4262 bool skinny_metadata
= btrfs_fs_incompat(fs_info
, SKINNY_METADATA
);
4264 ref
= btrfs_delayed_node_to_tree_ref(node
);
4266 extent_key
.objectid
= node
->bytenr
;
4267 if (skinny_metadata
) {
4268 extent_key
.offset
= ref
->level
;
4269 extent_key
.type
= BTRFS_METADATA_ITEM_KEY
;
4270 num_bytes
= fs_info
->nodesize
;
4272 extent_key
.offset
= node
->num_bytes
;
4273 extent_key
.type
= BTRFS_EXTENT_ITEM_KEY
;
4274 size
+= sizeof(*block_info
);
4275 num_bytes
= node
->num_bytes
;
4278 path
= btrfs_alloc_path();
4282 path
->leave_spinning
= 1;
4283 ret
= btrfs_insert_empty_item(trans
, fs_info
->extent_root
, path
,
4286 btrfs_free_path(path
);
4290 leaf
= path
->nodes
[0];
4291 extent_item
= btrfs_item_ptr(leaf
, path
->slots
[0],
4292 struct btrfs_extent_item
);
4293 btrfs_set_extent_refs(leaf
, extent_item
, 1);
4294 btrfs_set_extent_generation(leaf
, extent_item
, trans
->transid
);
4295 btrfs_set_extent_flags(leaf
, extent_item
,
4296 flags
| BTRFS_EXTENT_FLAG_TREE_BLOCK
);
4298 if (skinny_metadata
) {
4299 iref
= (struct btrfs_extent_inline_ref
*)(extent_item
+ 1);
4301 block_info
= (struct btrfs_tree_block_info
*)(extent_item
+ 1);
4302 btrfs_set_tree_block_key(leaf
, block_info
, &extent_op
->key
);
4303 btrfs_set_tree_block_level(leaf
, block_info
, ref
->level
);
4304 iref
= (struct btrfs_extent_inline_ref
*)(block_info
+ 1);
4307 if (node
->type
== BTRFS_SHARED_BLOCK_REF_KEY
) {
4308 BUG_ON(!(flags
& BTRFS_BLOCK_FLAG_FULL_BACKREF
));
4309 btrfs_set_extent_inline_ref_type(leaf
, iref
,
4310 BTRFS_SHARED_BLOCK_REF_KEY
);
4311 btrfs_set_extent_inline_ref_offset(leaf
, iref
, ref
->parent
);
4313 btrfs_set_extent_inline_ref_type(leaf
, iref
,
4314 BTRFS_TREE_BLOCK_REF_KEY
);
4315 btrfs_set_extent_inline_ref_offset(leaf
, iref
, ref
->root
);
4318 btrfs_mark_buffer_dirty(leaf
);
4319 btrfs_free_path(path
);
4321 ret
= remove_from_free_space_tree(trans
, extent_key
.objectid
,
4326 ret
= btrfs_update_block_group(trans
, extent_key
.objectid
,
4327 fs_info
->nodesize
, 1);
4328 if (ret
) { /* -ENOENT, logic error */
4329 btrfs_err(fs_info
, "update block group failed for %llu %llu",
4330 extent_key
.objectid
, extent_key
.offset
);
4334 trace_btrfs_reserved_extent_alloc(fs_info
, extent_key
.objectid
,
4339 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle
*trans
,
4340 struct btrfs_root
*root
, u64 owner
,
4341 u64 offset
, u64 ram_bytes
,
4342 struct btrfs_key
*ins
)
4344 struct btrfs_ref generic_ref
= { 0 };
4347 BUG_ON(root
->root_key
.objectid
== BTRFS_TREE_LOG_OBJECTID
);
4349 btrfs_init_generic_ref(&generic_ref
, BTRFS_ADD_DELAYED_EXTENT
,
4350 ins
->objectid
, ins
->offset
, 0);
4351 btrfs_init_data_ref(&generic_ref
, root
->root_key
.objectid
, owner
, offset
);
4352 btrfs_ref_tree_mod(root
->fs_info
, &generic_ref
);
4353 ret
= btrfs_add_delayed_data_ref(trans
, &generic_ref
,
4354 ram_bytes
, NULL
, NULL
);
4359 * this is used by the tree logging recovery code. It records that
4360 * an extent has been allocated and makes sure to clear the free
4361 * space cache bits as well
4363 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle
*trans
,
4364 u64 root_objectid
, u64 owner
, u64 offset
,
4365 struct btrfs_key
*ins
)
4367 struct btrfs_fs_info
*fs_info
= trans
->fs_info
;
4369 struct btrfs_block_group_cache
*block_group
;
4370 struct btrfs_space_info
*space_info
;
4373 * Mixed block groups will exclude before processing the log so we only
4374 * need to do the exclude dance if this fs isn't mixed.
4376 if (!btrfs_fs_incompat(fs_info
, MIXED_GROUPS
)) {
4377 ret
= __exclude_logged_extent(fs_info
, ins
->objectid
,
4383 block_group
= btrfs_lookup_block_group(fs_info
, ins
->objectid
);
4387 space_info
= block_group
->space_info
;
4388 spin_lock(&space_info
->lock
);
4389 spin_lock(&block_group
->lock
);
4390 space_info
->bytes_reserved
+= ins
->offset
;
4391 block_group
->reserved
+= ins
->offset
;
4392 spin_unlock(&block_group
->lock
);
4393 spin_unlock(&space_info
->lock
);
4395 ret
= alloc_reserved_file_extent(trans
, 0, root_objectid
, 0, owner
,
4397 btrfs_put_block_group(block_group
);
4401 static struct extent_buffer
*
4402 btrfs_init_new_buffer(struct btrfs_trans_handle
*trans
, struct btrfs_root
*root
,
4403 u64 bytenr
, int level
, u64 owner
)
4405 struct btrfs_fs_info
*fs_info
= root
->fs_info
;
4406 struct extent_buffer
*buf
;
4408 buf
= btrfs_find_create_tree_block(fs_info
, bytenr
);
4413 * Extra safety check in case the extent tree is corrupted and extent
4414 * allocator chooses to use a tree block which is already used and
4417 if (buf
->lock_owner
== current
->pid
) {
4418 btrfs_err_rl(fs_info
,
4419 "tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
4420 buf
->start
, btrfs_header_owner(buf
), current
->pid
);
4421 free_extent_buffer(buf
);
4422 return ERR_PTR(-EUCLEAN
);
4425 btrfs_set_buffer_lockdep_class(root
->root_key
.objectid
, buf
, level
);
4426 btrfs_tree_lock(buf
);
4427 btrfs_clean_tree_block(buf
);
4428 clear_bit(EXTENT_BUFFER_STALE
, &buf
->bflags
);
4430 btrfs_set_lock_blocking_write(buf
);
4431 set_extent_buffer_uptodate(buf
);
4433 memzero_extent_buffer(buf
, 0, sizeof(struct btrfs_header
));
4434 btrfs_set_header_level(buf
, level
);
4435 btrfs_set_header_bytenr(buf
, buf
->start
);
4436 btrfs_set_header_generation(buf
, trans
->transid
);
4437 btrfs_set_header_backref_rev(buf
, BTRFS_MIXED_BACKREF_REV
);
4438 btrfs_set_header_owner(buf
, owner
);
4439 write_extent_buffer_fsid(buf
, fs_info
->fs_devices
->metadata_uuid
);
4440 write_extent_buffer_chunk_tree_uuid(buf
, fs_info
->chunk_tree_uuid
);
4441 if (root
->root_key
.objectid
== BTRFS_TREE_LOG_OBJECTID
) {
4442 buf
->log_index
= root
->log_transid
% 2;
4444 * we allow two log transactions at a time, use different
4445 * EXTENT bit to differentiate dirty pages.
4447 if (buf
->log_index
== 0)
4448 set_extent_dirty(&root
->dirty_log_pages
, buf
->start
,
4449 buf
->start
+ buf
->len
- 1, GFP_NOFS
);
4451 set_extent_new(&root
->dirty_log_pages
, buf
->start
,
4452 buf
->start
+ buf
->len
- 1);
4454 buf
->log_index
= -1;
4455 set_extent_dirty(&trans
->transaction
->dirty_pages
, buf
->start
,
4456 buf
->start
+ buf
->len
- 1, GFP_NOFS
);
4458 trans
->dirty
= true;
4459 /* this returns a buffer locked for blocking */
4464 * finds a free extent and does all the dirty work required for allocation
4465 * returns the tree buffer or an ERR_PTR on error.
4467 struct extent_buffer
*btrfs_alloc_tree_block(struct btrfs_trans_handle
*trans
,
4468 struct btrfs_root
*root
,
4469 u64 parent
, u64 root_objectid
,
4470 const struct btrfs_disk_key
*key
,
4471 int level
, u64 hint
,
4474 struct btrfs_fs_info
*fs_info
= root
->fs_info
;
4475 struct btrfs_key ins
;
4476 struct btrfs_block_rsv
*block_rsv
;
4477 struct extent_buffer
*buf
;
4478 struct btrfs_delayed_extent_op
*extent_op
;
4479 struct btrfs_ref generic_ref
= { 0 };
4482 u32 blocksize
= fs_info
->nodesize
;
4483 bool skinny_metadata
= btrfs_fs_incompat(fs_info
, SKINNY_METADATA
);
4485 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
4486 if (btrfs_is_testing(fs_info
)) {
4487 buf
= btrfs_init_new_buffer(trans
, root
, root
->alloc_bytenr
,
4488 level
, root_objectid
);
4490 root
->alloc_bytenr
+= blocksize
;
4495 block_rsv
= btrfs_use_block_rsv(trans
, root
, blocksize
);
4496 if (IS_ERR(block_rsv
))
4497 return ERR_CAST(block_rsv
);
4499 ret
= btrfs_reserve_extent(root
, blocksize
, blocksize
, blocksize
,
4500 empty_size
, hint
, &ins
, 0, 0);
4504 buf
= btrfs_init_new_buffer(trans
, root
, ins
.objectid
, level
,
4508 goto out_free_reserved
;
4511 if (root_objectid
== BTRFS_TREE_RELOC_OBJECTID
) {
4513 parent
= ins
.objectid
;
4514 flags
|= BTRFS_BLOCK_FLAG_FULL_BACKREF
;
4518 if (root_objectid
!= BTRFS_TREE_LOG_OBJECTID
) {
4519 extent_op
= btrfs_alloc_delayed_extent_op();
4525 memcpy(&extent_op
->key
, key
, sizeof(extent_op
->key
));
4527 memset(&extent_op
->key
, 0, sizeof(extent_op
->key
));
4528 extent_op
->flags_to_set
= flags
;
4529 extent_op
->update_key
= skinny_metadata
? false : true;
4530 extent_op
->update_flags
= true;
4531 extent_op
->is_data
= false;
4532 extent_op
->level
= level
;
4534 btrfs_init_generic_ref(&generic_ref
, BTRFS_ADD_DELAYED_EXTENT
,
4535 ins
.objectid
, ins
.offset
, parent
);
4536 generic_ref
.real_root
= root
->root_key
.objectid
;
4537 btrfs_init_tree_ref(&generic_ref
, level
, root_objectid
);
4538 btrfs_ref_tree_mod(fs_info
, &generic_ref
);
4539 ret
= btrfs_add_delayed_tree_ref(trans
, &generic_ref
,
4540 extent_op
, NULL
, NULL
);
4542 goto out_free_delayed
;
4547 btrfs_free_delayed_extent_op(extent_op
);
4549 free_extent_buffer(buf
);
4551 btrfs_free_reserved_extent(fs_info
, ins
.objectid
, ins
.offset
, 0);
4553 btrfs_unuse_block_rsv(fs_info
, block_rsv
, blocksize
);
4554 return ERR_PTR(ret
);
4557 struct walk_control
{
4558 u64 refs
[BTRFS_MAX_LEVEL
];
4559 u64 flags
[BTRFS_MAX_LEVEL
];
4560 struct btrfs_key update_progress
;
4561 struct btrfs_key drop_progress
;
4573 #define DROP_REFERENCE 1
4574 #define UPDATE_BACKREF 2
4576 static noinline
void reada_walk_down(struct btrfs_trans_handle
*trans
,
4577 struct btrfs_root
*root
,
4578 struct walk_control
*wc
,
4579 struct btrfs_path
*path
)
4581 struct btrfs_fs_info
*fs_info
= root
->fs_info
;
4587 struct btrfs_key key
;
4588 struct extent_buffer
*eb
;
4593 if (path
->slots
[wc
->level
] < wc
->reada_slot
) {
4594 wc
->reada_count
= wc
->reada_count
* 2 / 3;
4595 wc
->reada_count
= max(wc
->reada_count
, 2);
4597 wc
->reada_count
= wc
->reada_count
* 3 / 2;
4598 wc
->reada_count
= min_t(int, wc
->reada_count
,
4599 BTRFS_NODEPTRS_PER_BLOCK(fs_info
));
4602 eb
= path
->nodes
[wc
->level
];
4603 nritems
= btrfs_header_nritems(eb
);
4605 for (slot
= path
->slots
[wc
->level
]; slot
< nritems
; slot
++) {
4606 if (nread
>= wc
->reada_count
)
4610 bytenr
= btrfs_node_blockptr(eb
, slot
);
4611 generation
= btrfs_node_ptr_generation(eb
, slot
);
4613 if (slot
== path
->slots
[wc
->level
])
4616 if (wc
->stage
== UPDATE_BACKREF
&&
4617 generation
<= root
->root_key
.offset
)
4620 /* We don't lock the tree block, it's OK to be racy here */
4621 ret
= btrfs_lookup_extent_info(trans
, fs_info
, bytenr
,
4622 wc
->level
- 1, 1, &refs
,
4624 /* We don't care about errors in readahead. */
4629 if (wc
->stage
== DROP_REFERENCE
) {
4633 if (wc
->level
== 1 &&
4634 (flags
& BTRFS_BLOCK_FLAG_FULL_BACKREF
))
4636 if (!wc
->update_ref
||
4637 generation
<= root
->root_key
.offset
)
4639 btrfs_node_key_to_cpu(eb
, &key
, slot
);
4640 ret
= btrfs_comp_cpu_keys(&key
,
4641 &wc
->update_progress
);
4645 if (wc
->level
== 1 &&
4646 (flags
& BTRFS_BLOCK_FLAG_FULL_BACKREF
))
4650 readahead_tree_block(fs_info
, bytenr
);
4653 wc
->reada_slot
= slot
;
4657 * helper to process tree block while walking down the tree.
4659 * when wc->stage == UPDATE_BACKREF, this function updates
4660 * back refs for pointers in the block.
4662 * NOTE: return value 1 means we should stop walking down.
4664 static noinline
int walk_down_proc(struct btrfs_trans_handle
*trans
,
4665 struct btrfs_root
*root
,
4666 struct btrfs_path
*path
,
4667 struct walk_control
*wc
, int lookup_info
)
4669 struct btrfs_fs_info
*fs_info
= root
->fs_info
;
4670 int level
= wc
->level
;
4671 struct extent_buffer
*eb
= path
->nodes
[level
];
4672 u64 flag
= BTRFS_BLOCK_FLAG_FULL_BACKREF
;
4675 if (wc
->stage
== UPDATE_BACKREF
&&
4676 btrfs_header_owner(eb
) != root
->root_key
.objectid
)
4680 * when reference count of tree block is 1, it won't increase
4681 * again. once full backref flag is set, we never clear it.
4684 ((wc
->stage
== DROP_REFERENCE
&& wc
->refs
[level
] != 1) ||
4685 (wc
->stage
== UPDATE_BACKREF
&& !(wc
->flags
[level
] & flag
)))) {
4686 BUG_ON(!path
->locks
[level
]);
4687 ret
= btrfs_lookup_extent_info(trans
, fs_info
,
4688 eb
->start
, level
, 1,
4691 BUG_ON(ret
== -ENOMEM
);
4694 BUG_ON(wc
->refs
[level
] == 0);
4697 if (wc
->stage
== DROP_REFERENCE
) {
4698 if (wc
->refs
[level
] > 1)
4701 if (path
->locks
[level
] && !wc
->keep_locks
) {
4702 btrfs_tree_unlock_rw(eb
, path
->locks
[level
]);
4703 path
->locks
[level
] = 0;
4708 /* wc->stage == UPDATE_BACKREF */
4709 if (!(wc
->flags
[level
] & flag
)) {
4710 BUG_ON(!path
->locks
[level
]);
4711 ret
= btrfs_inc_ref(trans
, root
, eb
, 1);
4712 BUG_ON(ret
); /* -ENOMEM */
4713 ret
= btrfs_dec_ref(trans
, root
, eb
, 0);
4714 BUG_ON(ret
); /* -ENOMEM */
4715 ret
= btrfs_set_disk_extent_flags(trans
, eb
->start
,
4717 btrfs_header_level(eb
), 0);
4718 BUG_ON(ret
); /* -ENOMEM */
4719 wc
->flags
[level
] |= flag
;
4723 * the block is shared by multiple trees, so it's not good to
4724 * keep the tree lock
4726 if (path
->locks
[level
] && level
> 0) {
4727 btrfs_tree_unlock_rw(eb
, path
->locks
[level
]);
4728 path
->locks
[level
] = 0;
4734 * This is used to verify a ref exists for this root to deal with a bug where we
4735 * would have a drop_progress key that hadn't been updated properly.
4737 static int check_ref_exists(struct btrfs_trans_handle
*trans
,
4738 struct btrfs_root
*root
, u64 bytenr
, u64 parent
,
4741 struct btrfs_path
*path
;
4742 struct btrfs_extent_inline_ref
*iref
;
4745 path
= btrfs_alloc_path();
4749 ret
= lookup_extent_backref(trans
, path
, &iref
, bytenr
,
4750 root
->fs_info
->nodesize
, parent
,
4751 root
->root_key
.objectid
, level
, 0);
4752 btrfs_free_path(path
);
4761 * helper to process tree block pointer.
4763 * when wc->stage == DROP_REFERENCE, this function checks
4764 * reference count of the block pointed to. if the block
4765 * is shared and we need update back refs for the subtree
4766 * rooted at the block, this function changes wc->stage to
4767 * UPDATE_BACKREF. if the block is shared and there is no
4768 * need to update back, this function drops the reference
4771 * NOTE: return value 1 means we should stop walking down.
4773 static noinline
int do_walk_down(struct btrfs_trans_handle
*trans
,
4774 struct btrfs_root
*root
,
4775 struct btrfs_path
*path
,
4776 struct walk_control
*wc
, int *lookup_info
)
4778 struct btrfs_fs_info
*fs_info
= root
->fs_info
;
4782 struct btrfs_key key
;
4783 struct btrfs_key first_key
;
4784 struct btrfs_ref ref
= { 0 };
4785 struct extent_buffer
*next
;
4786 int level
= wc
->level
;
4789 bool need_account
= false;
4791 generation
= btrfs_node_ptr_generation(path
->nodes
[level
],
4792 path
->slots
[level
]);
4794 * if the lower level block was created before the snapshot
4795 * was created, we know there is no need to update back refs
4798 if (wc
->stage
== UPDATE_BACKREF
&&
4799 generation
<= root
->root_key
.offset
) {
4804 bytenr
= btrfs_node_blockptr(path
->nodes
[level
], path
->slots
[level
]);
4805 btrfs_node_key_to_cpu(path
->nodes
[level
], &first_key
,
4806 path
->slots
[level
]);
4808 next
= find_extent_buffer(fs_info
, bytenr
);
4810 next
= btrfs_find_create_tree_block(fs_info
, bytenr
);
4812 return PTR_ERR(next
);
4814 btrfs_set_buffer_lockdep_class(root
->root_key
.objectid
, next
,
4818 btrfs_tree_lock(next
);
4819 btrfs_set_lock_blocking_write(next
);
4821 ret
= btrfs_lookup_extent_info(trans
, fs_info
, bytenr
, level
- 1, 1,
4822 &wc
->refs
[level
- 1],
4823 &wc
->flags
[level
- 1]);
4827 if (unlikely(wc
->refs
[level
- 1] == 0)) {
4828 btrfs_err(fs_info
, "Missing references.");
4834 if (wc
->stage
== DROP_REFERENCE
) {
4835 if (wc
->refs
[level
- 1] > 1) {
4836 need_account
= true;
4838 (wc
->flags
[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF
))
4841 if (!wc
->update_ref
||
4842 generation
<= root
->root_key
.offset
)
4845 btrfs_node_key_to_cpu(path
->nodes
[level
], &key
,
4846 path
->slots
[level
]);
4847 ret
= btrfs_comp_cpu_keys(&key
, &wc
->update_progress
);
4851 wc
->stage
= UPDATE_BACKREF
;
4852 wc
->shared_level
= level
- 1;
4856 (wc
->flags
[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF
))
4860 if (!btrfs_buffer_uptodate(next
, generation
, 0)) {
4861 btrfs_tree_unlock(next
);
4862 free_extent_buffer(next
);
4868 if (reada
&& level
== 1)
4869 reada_walk_down(trans
, root
, wc
, path
);
4870 next
= read_tree_block(fs_info
, bytenr
, generation
, level
- 1,
4873 return PTR_ERR(next
);
4874 } else if (!extent_buffer_uptodate(next
)) {
4875 free_extent_buffer(next
);
4878 btrfs_tree_lock(next
);
4879 btrfs_set_lock_blocking_write(next
);
4883 ASSERT(level
== btrfs_header_level(next
));
4884 if (level
!= btrfs_header_level(next
)) {
4885 btrfs_err(root
->fs_info
, "mismatched level");
4889 path
->nodes
[level
] = next
;
4890 path
->slots
[level
] = 0;
4891 path
->locks
[level
] = BTRFS_WRITE_LOCK_BLOCKING
;
4897 wc
->refs
[level
- 1] = 0;
4898 wc
->flags
[level
- 1] = 0;
4899 if (wc
->stage
== DROP_REFERENCE
) {
4900 if (wc
->flags
[level
] & BTRFS_BLOCK_FLAG_FULL_BACKREF
) {
4901 parent
= path
->nodes
[level
]->start
;
4903 ASSERT(root
->root_key
.objectid
==
4904 btrfs_header_owner(path
->nodes
[level
]));
4905 if (root
->root_key
.objectid
!=
4906 btrfs_header_owner(path
->nodes
[level
])) {
4907 btrfs_err(root
->fs_info
,
4908 "mismatched block owner");
4916 * If we had a drop_progress we need to verify the refs are set
4917 * as expected. If we find our ref then we know that from here
4918 * on out everything should be correct, and we can clear the
4921 if (wc
->restarted
) {
4922 ret
= check_ref_exists(trans
, root
, bytenr
, parent
,
4933 * Reloc tree doesn't contribute to qgroup numbers, and we have
4934 * already accounted them at merge time (replace_path),
4935 * thus we could skip expensive subtree trace here.
4937 if (root
->root_key
.objectid
!= BTRFS_TREE_RELOC_OBJECTID
&&
4939 ret
= btrfs_qgroup_trace_subtree(trans
, next
,
4940 generation
, level
- 1);
4942 btrfs_err_rl(fs_info
,
4943 "Error %d accounting shared subtree. Quota is out of sync, rescan required.",
4949 * We need to update the next key in our walk control so we can
4950 * update the drop_progress key accordingly. We don't care if
4951 * find_next_key doesn't find a key because that means we're at
4952 * the end and are going to clean up now.
4954 wc
->drop_level
= level
;
4955 find_next_key(path
, level
, &wc
->drop_progress
);
4957 btrfs_init_generic_ref(&ref
, BTRFS_DROP_DELAYED_REF
, bytenr
,
4958 fs_info
->nodesize
, parent
);
4959 btrfs_init_tree_ref(&ref
, level
- 1, root
->root_key
.objectid
);
4960 ret
= btrfs_free_extent(trans
, &ref
);
4969 btrfs_tree_unlock(next
);
4970 free_extent_buffer(next
);
4976 * helper to process tree block while walking up the tree.
4978 * when wc->stage == DROP_REFERENCE, this function drops
4979 * reference count on the block.
4981 * when wc->stage == UPDATE_BACKREF, this function changes
4982 * wc->stage back to DROP_REFERENCE if we changed wc->stage
4983 * to UPDATE_BACKREF previously while processing the block.
4985 * NOTE: return value 1 means we should stop walking up.
4987 static noinline
int walk_up_proc(struct btrfs_trans_handle
*trans
,
4988 struct btrfs_root
*root
,
4989 struct btrfs_path
*path
,
4990 struct walk_control
*wc
)
4992 struct btrfs_fs_info
*fs_info
= root
->fs_info
;
4994 int level
= wc
->level
;
4995 struct extent_buffer
*eb
= path
->nodes
[level
];
4998 if (wc
->stage
== UPDATE_BACKREF
) {
4999 BUG_ON(wc
->shared_level
< level
);
5000 if (level
< wc
->shared_level
)
5003 ret
= find_next_key(path
, level
+ 1, &wc
->update_progress
);
5007 wc
->stage
= DROP_REFERENCE
;
5008 wc
->shared_level
= -1;
5009 path
->slots
[level
] = 0;
5012 * check reference count again if the block isn't locked.
5013 * we should start walking down the tree again if reference
5016 if (!path
->locks
[level
]) {
5018 btrfs_tree_lock(eb
);
5019 btrfs_set_lock_blocking_write(eb
);
5020 path
->locks
[level
] = BTRFS_WRITE_LOCK_BLOCKING
;
5022 ret
= btrfs_lookup_extent_info(trans
, fs_info
,
5023 eb
->start
, level
, 1,
5027 btrfs_tree_unlock_rw(eb
, path
->locks
[level
]);
5028 path
->locks
[level
] = 0;
5031 BUG_ON(wc
->refs
[level
] == 0);
5032 if (wc
->refs
[level
] == 1) {
5033 btrfs_tree_unlock_rw(eb
, path
->locks
[level
]);
5034 path
->locks
[level
] = 0;
5040 /* wc->stage == DROP_REFERENCE */
5041 BUG_ON(wc
->refs
[level
] > 1 && !path
->locks
[level
]);
5043 if (wc
->refs
[level
] == 1) {
5045 if (wc
->flags
[level
] & BTRFS_BLOCK_FLAG_FULL_BACKREF
)
5046 ret
= btrfs_dec_ref(trans
, root
, eb
, 1);
5048 ret
= btrfs_dec_ref(trans
, root
, eb
, 0);
5049 BUG_ON(ret
); /* -ENOMEM */
5050 if (is_fstree(root
->root_key
.objectid
)) {
5051 ret
= btrfs_qgroup_trace_leaf_items(trans
, eb
);
5053 btrfs_err_rl(fs_info
,
5054 "error %d accounting leaf items, quota is out of sync, rescan required",
5059 /* make block locked assertion in btrfs_clean_tree_block happy */
5060 if (!path
->locks
[level
] &&
5061 btrfs_header_generation(eb
) == trans
->transid
) {
5062 btrfs_tree_lock(eb
);
5063 btrfs_set_lock_blocking_write(eb
);
5064 path
->locks
[level
] = BTRFS_WRITE_LOCK_BLOCKING
;
5066 btrfs_clean_tree_block(eb
);
5069 if (eb
== root
->node
) {
5070 if (wc
->flags
[level
] & BTRFS_BLOCK_FLAG_FULL_BACKREF
)
5072 else if (root
->root_key
.objectid
!= btrfs_header_owner(eb
))
5073 goto owner_mismatch
;
5075 if (wc
->flags
[level
+ 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF
)
5076 parent
= path
->nodes
[level
+ 1]->start
;
5077 else if (root
->root_key
.objectid
!=
5078 btrfs_header_owner(path
->nodes
[level
+ 1]))
5079 goto owner_mismatch
;
5082 btrfs_free_tree_block(trans
, root
, eb
, parent
, wc
->refs
[level
] == 1);
5084 wc
->refs
[level
] = 0;
5085 wc
->flags
[level
] = 0;
5089 btrfs_err_rl(fs_info
, "unexpected tree owner, have %llu expect %llu",
5090 btrfs_header_owner(eb
), root
->root_key
.objectid
);
5094 static noinline
int walk_down_tree(struct btrfs_trans_handle
*trans
,
5095 struct btrfs_root
*root
,
5096 struct btrfs_path
*path
,
5097 struct walk_control
*wc
)
5099 int level
= wc
->level
;
5100 int lookup_info
= 1;
5103 while (level
>= 0) {
5104 ret
= walk_down_proc(trans
, root
, path
, wc
, lookup_info
);
5111 if (path
->slots
[level
] >=
5112 btrfs_header_nritems(path
->nodes
[level
]))
5115 ret
= do_walk_down(trans
, root
, path
, wc
, &lookup_info
);
5117 path
->slots
[level
]++;
5126 static noinline
int walk_up_tree(struct btrfs_trans_handle
*trans
,
5127 struct btrfs_root
*root
,
5128 struct btrfs_path
*path
,
5129 struct walk_control
*wc
, int max_level
)
5131 int level
= wc
->level
;
5134 path
->slots
[level
] = btrfs_header_nritems(path
->nodes
[level
]);
5135 while (level
< max_level
&& path
->nodes
[level
]) {
5137 if (path
->slots
[level
] + 1 <
5138 btrfs_header_nritems(path
->nodes
[level
])) {
5139 path
->slots
[level
]++;
5142 ret
= walk_up_proc(trans
, root
, path
, wc
);
5148 if (path
->locks
[level
]) {
5149 btrfs_tree_unlock_rw(path
->nodes
[level
],
5150 path
->locks
[level
]);
5151 path
->locks
[level
] = 0;
5153 free_extent_buffer(path
->nodes
[level
]);
5154 path
->nodes
[level
] = NULL
;
5162 * drop a subvolume tree.
5164 * this function traverses the tree freeing any blocks that only
5165 * referenced by the tree.
5167 * when a shared tree block is found. this function decreases its
5168 * reference count by one. if update_ref is true, this function
5169 * also make sure backrefs for the shared block and all lower level
5170 * blocks are properly updated.
5172 * If called with for_reloc == 0, may exit early with -EAGAIN
5174 int btrfs_drop_snapshot(struct btrfs_root
*root
,
5175 struct btrfs_block_rsv
*block_rsv
, int update_ref
,
5178 struct btrfs_fs_info
*fs_info
= root
->fs_info
;
5179 struct btrfs_path
*path
;
5180 struct btrfs_trans_handle
*trans
;
5181 struct btrfs_root
*tree_root
= fs_info
->tree_root
;
5182 struct btrfs_root_item
*root_item
= &root
->root_item
;
5183 struct walk_control
*wc
;
5184 struct btrfs_key key
;
5188 bool root_dropped
= false;
5190 btrfs_debug(fs_info
, "Drop subvolume %llu", root
->root_key
.objectid
);
5192 path
= btrfs_alloc_path();
5198 wc
= kzalloc(sizeof(*wc
), GFP_NOFS
);
5200 btrfs_free_path(path
);
5205 trans
= btrfs_start_transaction(tree_root
, 0);
5206 if (IS_ERR(trans
)) {
5207 err
= PTR_ERR(trans
);
5211 err
= btrfs_run_delayed_items(trans
);
5216 trans
->block_rsv
= block_rsv
;
5219 * This will help us catch people modifying the fs tree while we're
5220 * dropping it. It is unsafe to mess with the fs tree while it's being
5221 * dropped as we unlock the root node and parent nodes as we walk down
5222 * the tree, assuming nothing will change. If something does change
5223 * then we'll have stale information and drop references to blocks we've
5226 set_bit(BTRFS_ROOT_DELETING
, &root
->state
);
5227 if (btrfs_disk_key_objectid(&root_item
->drop_progress
) == 0) {
5228 level
= btrfs_header_level(root
->node
);
5229 path
->nodes
[level
] = btrfs_lock_root_node(root
);
5230 btrfs_set_lock_blocking_write(path
->nodes
[level
]);
5231 path
->slots
[level
] = 0;
5232 path
->locks
[level
] = BTRFS_WRITE_LOCK_BLOCKING
;
5233 memset(&wc
->update_progress
, 0,
5234 sizeof(wc
->update_progress
));
5236 btrfs_disk_key_to_cpu(&key
, &root_item
->drop_progress
);
5237 memcpy(&wc
->update_progress
, &key
,
5238 sizeof(wc
->update_progress
));
5240 level
= root_item
->drop_level
;
5242 path
->lowest_level
= level
;
5243 ret
= btrfs_search_slot(NULL
, root
, &key
, path
, 0, 0);
5244 path
->lowest_level
= 0;
5252 * unlock our path, this is safe because only this
5253 * function is allowed to delete this snapshot
5255 btrfs_unlock_up_safe(path
, 0);
5257 level
= btrfs_header_level(root
->node
);
5259 btrfs_tree_lock(path
->nodes
[level
]);
5260 btrfs_set_lock_blocking_write(path
->nodes
[level
]);
5261 path
->locks
[level
] = BTRFS_WRITE_LOCK_BLOCKING
;
5263 ret
= btrfs_lookup_extent_info(trans
, fs_info
,
5264 path
->nodes
[level
]->start
,
5265 level
, 1, &wc
->refs
[level
],
5271 BUG_ON(wc
->refs
[level
] == 0);
5273 if (level
== root_item
->drop_level
)
5276 btrfs_tree_unlock(path
->nodes
[level
]);
5277 path
->locks
[level
] = 0;
5278 WARN_ON(wc
->refs
[level
] != 1);
5283 wc
->restarted
= test_bit(BTRFS_ROOT_DEAD_TREE
, &root
->state
);
5285 wc
->shared_level
= -1;
5286 wc
->stage
= DROP_REFERENCE
;
5287 wc
->update_ref
= update_ref
;
5289 wc
->reada_count
= BTRFS_NODEPTRS_PER_BLOCK(fs_info
);
5293 ret
= walk_down_tree(trans
, root
, path
, wc
);
5299 ret
= walk_up_tree(trans
, root
, path
, wc
, BTRFS_MAX_LEVEL
);
5306 BUG_ON(wc
->stage
!= DROP_REFERENCE
);
5310 if (wc
->stage
== DROP_REFERENCE
) {
5311 wc
->drop_level
= wc
->level
;
5312 btrfs_node_key_to_cpu(path
->nodes
[wc
->drop_level
],
5314 path
->slots
[wc
->drop_level
]);
5316 btrfs_cpu_key_to_disk(&root_item
->drop_progress
,
5317 &wc
->drop_progress
);
5318 root_item
->drop_level
= wc
->drop_level
;
5320 BUG_ON(wc
->level
== 0);
5321 if (btrfs_should_end_transaction(trans
) ||
5322 (!for_reloc
&& btrfs_need_cleaner_sleep(fs_info
))) {
5323 ret
= btrfs_update_root(trans
, tree_root
,
5327 btrfs_abort_transaction(trans
, ret
);
5332 btrfs_end_transaction_throttle(trans
);
5333 if (!for_reloc
&& btrfs_need_cleaner_sleep(fs_info
)) {
5334 btrfs_debug(fs_info
,
5335 "drop snapshot early exit");
5340 trans
= btrfs_start_transaction(tree_root
, 0);
5341 if (IS_ERR(trans
)) {
5342 err
= PTR_ERR(trans
);
5346 trans
->block_rsv
= block_rsv
;
5349 btrfs_release_path(path
);
5353 ret
= btrfs_del_root(trans
, &root
->root_key
);
5355 btrfs_abort_transaction(trans
, ret
);
5360 if (root
->root_key
.objectid
!= BTRFS_TREE_RELOC_OBJECTID
) {
5361 ret
= btrfs_find_root(tree_root
, &root
->root_key
, path
,
5364 btrfs_abort_transaction(trans
, ret
);
5367 } else if (ret
> 0) {
5368 /* if we fail to delete the orphan item this time
5369 * around, it'll get picked up the next time.
5371 * The most common failure here is just -ENOENT.
5373 btrfs_del_orphan_item(trans
, tree_root
,
5374 root
->root_key
.objectid
);
5378 if (test_bit(BTRFS_ROOT_IN_RADIX
, &root
->state
)) {
5379 btrfs_add_dropped_root(trans
, root
);
5381 free_extent_buffer(root
->node
);
5382 free_extent_buffer(root
->commit_root
);
5383 btrfs_put_fs_root(root
);
5385 root_dropped
= true;
5387 btrfs_end_transaction_throttle(trans
);
5390 btrfs_free_path(path
);
5393 * So if we need to stop dropping the snapshot for whatever reason we
5394 * need to make sure to add it back to the dead root list so that we
5395 * keep trying to do the work later. This also cleans up roots if we
5396 * don't have it in the radix (like when we recover after a power fail
5397 * or unmount) so we don't leak memory.
5399 if (!for_reloc
&& !root_dropped
)
5400 btrfs_add_dead_root(root
);
5401 if (err
&& err
!= -EAGAIN
)
5402 btrfs_handle_fs_error(fs_info
, err
, NULL
);
5407 * drop subtree rooted at tree block 'node'.
5409 * NOTE: this function will unlock and release tree block 'node'
5410 * only used by relocation code
5412 int btrfs_drop_subtree(struct btrfs_trans_handle
*trans
,
5413 struct btrfs_root
*root
,
5414 struct extent_buffer
*node
,
5415 struct extent_buffer
*parent
)
5417 struct btrfs_fs_info
*fs_info
= root
->fs_info
;
5418 struct btrfs_path
*path
;
5419 struct walk_control
*wc
;
5425 BUG_ON(root
->root_key
.objectid
!= BTRFS_TREE_RELOC_OBJECTID
);
5427 path
= btrfs_alloc_path();
5431 wc
= kzalloc(sizeof(*wc
), GFP_NOFS
);
5433 btrfs_free_path(path
);
5437 btrfs_assert_tree_locked(parent
);
5438 parent_level
= btrfs_header_level(parent
);
5439 extent_buffer_get(parent
);
5440 path
->nodes
[parent_level
] = parent
;
5441 path
->slots
[parent_level
] = btrfs_header_nritems(parent
);
5443 btrfs_assert_tree_locked(node
);
5444 level
= btrfs_header_level(node
);
5445 path
->nodes
[level
] = node
;
5446 path
->slots
[level
] = 0;
5447 path
->locks
[level
] = BTRFS_WRITE_LOCK_BLOCKING
;
5449 wc
->refs
[parent_level
] = 1;
5450 wc
->flags
[parent_level
] = BTRFS_BLOCK_FLAG_FULL_BACKREF
;
5452 wc
->shared_level
= -1;
5453 wc
->stage
= DROP_REFERENCE
;
5456 wc
->reada_count
= BTRFS_NODEPTRS_PER_BLOCK(fs_info
);
5459 wret
= walk_down_tree(trans
, root
, path
, wc
);
5465 wret
= walk_up_tree(trans
, root
, path
, wc
, parent_level
);
5473 btrfs_free_path(path
);
5478 * helper to account the unused space of all the readonly block group in the
5479 * space_info. takes mirrors into account.
5481 u64
btrfs_account_ro_block_groups_free_space(struct btrfs_space_info
*sinfo
)
5483 struct btrfs_block_group_cache
*block_group
;
5487 /* It's df, we don't care if it's racy */
5488 if (list_empty(&sinfo
->ro_bgs
))
5491 spin_lock(&sinfo
->lock
);
5492 list_for_each_entry(block_group
, &sinfo
->ro_bgs
, ro_list
) {
5493 spin_lock(&block_group
->lock
);
5495 if (!block_group
->ro
) {
5496 spin_unlock(&block_group
->lock
);
5500 factor
= btrfs_bg_type_to_factor(block_group
->flags
);
5501 free_bytes
+= (block_group
->key
.offset
-
5502 btrfs_block_group_used(&block_group
->item
)) *
5505 spin_unlock(&block_group
->lock
);
5507 spin_unlock(&sinfo
->lock
);
5512 int btrfs_error_unpin_extent_range(struct btrfs_fs_info
*fs_info
,
5515 return unpin_extent_range(fs_info
, start
, end
, false);
5519 * It used to be that old block groups would be left around forever.
5520 * Iterating over them would be enough to trim unused space. Since we
5521 * now automatically remove them, we also need to iterate over unallocated
5524 * We don't want a transaction for this since the discard may take a
5525 * substantial amount of time. We don't require that a transaction be
5526 * running, but we do need to take a running transaction into account
5527 * to ensure that we're not discarding chunks that were released or
5528 * allocated in the current transaction.
5530 * Holding the chunks lock will prevent other threads from allocating
5531 * or releasing chunks, but it won't prevent a running transaction
5532 * from committing and releasing the memory that the pending chunks
5533 * list head uses. For that, we need to take a reference to the
5534 * transaction and hold the commit root sem. We only need to hold
5535 * it while performing the free space search since we have already
5536 * held back allocations.
5538 static int btrfs_trim_free_extents(struct btrfs_device
*device
, u64
*trimmed
)
5540 u64 start
= SZ_1M
, len
= 0, end
= 0;
5545 /* Discard not supported = nothing to do. */
5546 if (!blk_queue_discard(bdev_get_queue(device
->bdev
)))
5549 /* Not writable = nothing to do. */
5550 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE
, &device
->dev_state
))
5553 /* No free space = nothing to do. */
5554 if (device
->total_bytes
<= device
->bytes_used
)
5560 struct btrfs_fs_info
*fs_info
= device
->fs_info
;
5563 ret
= mutex_lock_interruptible(&fs_info
->chunk_mutex
);
5567 find_first_clear_extent_bit(&device
->alloc_state
, start
,
5569 CHUNK_TRIMMED
| CHUNK_ALLOCATED
);
5571 /* Ensure we skip the reserved area in the first 1M */
5572 start
= max_t(u64
, start
, SZ_1M
);
5575 * If find_first_clear_extent_bit find a range that spans the
5576 * end of the device it will set end to -1, in this case it's up
5577 * to the caller to trim the value to the size of the device.
5579 end
= min(end
, device
->total_bytes
- 1);
5581 len
= end
- start
+ 1;
5583 /* We didn't find any extents */
5585 mutex_unlock(&fs_info
->chunk_mutex
);
5590 ret
= btrfs_issue_discard(device
->bdev
, start
, len
,
5593 set_extent_bits(&device
->alloc_state
, start
,
5596 mutex_unlock(&fs_info
->chunk_mutex
);
5604 if (fatal_signal_pending(current
)) {
5616 * Trim the whole filesystem by:
5617 * 1) trimming the free space in each block group
5618 * 2) trimming the unallocated space on each device
5620 * This will also continue trimming even if a block group or device encounters
5621 * an error. The return value will be the last error, or 0 if nothing bad
5624 int btrfs_trim_fs(struct btrfs_fs_info
*fs_info
, struct fstrim_range
*range
)
5626 struct btrfs_block_group_cache
*cache
= NULL
;
5627 struct btrfs_device
*device
;
5628 struct list_head
*devices
;
5630 u64 range_end
= U64_MAX
;
5641 * Check range overflow if range->len is set.
5642 * The default range->len is U64_MAX.
5644 if (range
->len
!= U64_MAX
&&
5645 check_add_overflow(range
->start
, range
->len
, &range_end
))
5648 cache
= btrfs_lookup_first_block_group(fs_info
, range
->start
);
5649 for (; cache
; cache
= btrfs_next_block_group(cache
)) {
5650 if (cache
->key
.objectid
>= range_end
) {
5651 btrfs_put_block_group(cache
);
5655 start
= max(range
->start
, cache
->key
.objectid
);
5656 end
= min(range_end
, cache
->key
.objectid
+ cache
->key
.offset
);
5658 if (end
- start
>= range
->minlen
) {
5659 if (!btrfs_block_group_cache_done(cache
)) {
5660 ret
= btrfs_cache_block_group(cache
, 0);
5666 ret
= btrfs_wait_block_group_cache_done(cache
);
5673 ret
= btrfs_trim_block_group(cache
,
5679 trimmed
+= group_trimmed
;
5690 "failed to trim %llu block group(s), last error %d",
5692 mutex_lock(&fs_info
->fs_devices
->device_list_mutex
);
5693 devices
= &fs_info
->fs_devices
->devices
;
5694 list_for_each_entry(device
, devices
, dev_list
) {
5695 ret
= btrfs_trim_free_extents(device
, &group_trimmed
);
5702 trimmed
+= group_trimmed
;
5704 mutex_unlock(&fs_info
->fs_devices
->device_list_mutex
);
5708 "failed to trim %llu device(s), last error %d",
5709 dev_failed
, dev_ret
);
5710 range
->len
= trimmed
;
5717 * btrfs_{start,end}_write_no_snapshotting() are similar to
5718 * mnt_{want,drop}_write(), they are used to prevent some tasks from writing
5719 * data into the page cache through nocow before the subvolume is snapshoted,
5720 * but flush the data into disk after the snapshot creation, or to prevent
5721 * operations while snapshotting is ongoing and that cause the snapshot to be
5722 * inconsistent (writes followed by expanding truncates for example).
5724 void btrfs_end_write_no_snapshotting(struct btrfs_root
*root
)
5726 percpu_counter_dec(&root
->subv_writers
->counter
);
5727 cond_wake_up(&root
->subv_writers
->wait
);
5730 int btrfs_start_write_no_snapshotting(struct btrfs_root
*root
)
5732 if (atomic_read(&root
->will_be_snapshotted
))
5735 percpu_counter_inc(&root
->subv_writers
->counter
);
5737 * Make sure counter is updated before we check for snapshot creation.
5740 if (atomic_read(&root
->will_be_snapshotted
)) {
5741 btrfs_end_write_no_snapshotting(root
);
5747 void btrfs_wait_for_snapshot_creation(struct btrfs_root
*root
)
5752 ret
= btrfs_start_write_no_snapshotting(root
);
5755 wait_var_event(&root
->will_be_snapshotted
,
5756 !atomic_read(&root
->will_be_snapshotted
));