// SPDX-License-Identifier: GPL-2.0

#include <linux/err.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include "messages.h"
#include "ctree.h"
#include "extent_map.h"
#include "compression.h"
#include "btrfs_inode.h"
#include "disk-io.h"

static struct kmem_cache *extent_map_cache;
int __init extent_map_init(void)
{
        extent_map_cache = kmem_cache_create("btrfs_extent_map",
                                             sizeof(struct extent_map), 0, 0, NULL);
        if (!extent_map_cache)
                return -ENOMEM;

        return 0;
}
void __cold extent_map_exit(void)
{
        kmem_cache_destroy(extent_map_cache);
}
/*
 * Initialize the extent tree @tree.  Should be called for each new inode or
 * other user of the extent_map interface.
 */
void extent_map_tree_init(struct extent_map_tree *tree)
{
        tree->root = RB_ROOT;
        INIT_LIST_HEAD(&tree->modified_extents);
        rwlock_init(&tree->lock);
}
/*
 * Allocate a new extent_map structure.  The new structure is returned with a
 * reference count of one and needs to be freed using free_extent_map().
 */
struct extent_map *alloc_extent_map(void)
{
        struct extent_map *em;

        em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS);
        if (!em)
                return NULL;
        RB_CLEAR_NODE(&em->rb_node);
        refcount_set(&em->refs, 1);
        INIT_LIST_HEAD(&em->list);
        return em;
}
/*
 * Drop the reference on @em by one and free the structure if the reference
 * count hits zero.
 */
void free_extent_map(struct extent_map *em)
{
        if (!em)
                return;
        if (refcount_dec_and_test(&em->refs)) {
                WARN_ON(extent_map_in_tree(em));
                WARN_ON(!list_empty(&em->list));
                kmem_cache_free(extent_map_cache, em);
        }
}
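
/*
 * Illustrative lifecycle sketch (not code from this file, local variable
 * names are hypothetical): a typical caller allocates a map, fills in the
 * file range, inserts it under the tree's write lock and then drops its own
 * reference, leaving only the tree's reference behind:
 *
 *      struct extent_map *em = alloc_extent_map();
 *
 *      if (em) {
 *              em->start = file_offset;
 *              em->len = num_bytes;
 *              write_lock(&inode->extent_tree.lock);
 *              ret = btrfs_add_extent_mapping(inode, &em, file_offset, num_bytes);
 *              write_unlock(&inode->extent_tree.lock);
 *              free_extent_map(em);
 *      }
 */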
/* Do the math around the end of an extent, handling wrapping. */
static u64 range_end(u64 start, u64 len)
{
        if (start + len < start)
                return (u64)-1;
        return start + len;
}
static void remove_em(struct btrfs_inode *inode, struct extent_map *em)
{
        struct btrfs_fs_info *fs_info = inode->root->fs_info;

        rb_erase(&em->rb_node, &inode->extent_tree.root);
        RB_CLEAR_NODE(&em->rb_node);

        if (!btrfs_is_testing(fs_info) && is_fstree(btrfs_root_id(inode->root)))
                percpu_counter_dec(&fs_info->evictable_extent_maps);
}
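
/*
 * Insert @em into @root, keyed by em->start. Returns -EEXIST without linking
 * the node if any already inserted extent map overlaps the range
 * [em->start, em->start + em->len): besides the descent itself, both the
 * next and the previous neighbours of the insertion point are checked
 * against the new range.
 */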
static int tree_insert(struct rb_root *root, struct extent_map *em)
{
        struct rb_node **p = &root->rb_node;
        struct rb_node *parent = NULL;
        struct extent_map *entry = NULL;
        struct rb_node *orig_parent = NULL;
        u64 end = range_end(em->start, em->len);

        while (*p) {
                parent = *p;
                entry = rb_entry(parent, struct extent_map, rb_node);

                if (em->start < entry->start)
                        p = &(*p)->rb_left;
                else if (em->start >= extent_map_end(entry))
                        p = &(*p)->rb_right;
                else
                        return -EEXIST;
        }

        orig_parent = parent;
        while (parent && em->start >= extent_map_end(entry)) {
                parent = rb_next(parent);
                entry = rb_entry(parent, struct extent_map, rb_node);
        }
        if (parent)
                if (end > entry->start && em->start < extent_map_end(entry))
                        return -EEXIST;

        parent = orig_parent;
        entry = rb_entry(parent, struct extent_map, rb_node);
        while (parent && em->start < entry->start) {
                parent = rb_prev(parent);
                entry = rb_entry(parent, struct extent_map, rb_node);
        }
        if (parent)
                if (end > entry->start && em->start < extent_map_end(entry))
                        return -EEXIST;

        rb_link_node(&em->rb_node, orig_parent, p);
        rb_insert_color(&em->rb_node, root);
        return 0;
}
/*
 * Search through the tree for an extent_map with a given offset.  If it can't
 * be found, try to find some neighboring extents.
 */
static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
                                     struct rb_node **prev_or_next_ret)
{
        struct rb_node *n = root->rb_node;
        struct rb_node *prev = NULL;
        struct rb_node *orig_prev = NULL;
        struct extent_map *entry;
        struct extent_map *prev_entry = NULL;

        ASSERT(prev_or_next_ret);

        while (n) {
                entry = rb_entry(n, struct extent_map, rb_node);
                prev = n;
                prev_entry = entry;

                if (offset < entry->start)
                        n = n->rb_left;
                else if (offset >= extent_map_end(entry))
                        n = n->rb_right;
                else
                        return n;
        }

        orig_prev = prev;
        while (prev && offset >= extent_map_end(prev_entry)) {
                prev = rb_next(prev);
                prev_entry = rb_entry(prev, struct extent_map, rb_node);
        }

        /*
         * Previous extent map found, return as in this case the caller does not
         * care about the next one.
         */
        if (prev) {
                *prev_or_next_ret = prev;
                return NULL;
        }

        prev = orig_prev;
        prev_entry = rb_entry(prev, struct extent_map, rb_node);
        while (prev && offset < prev_entry->start) {
                prev = rb_prev(prev);
                prev_entry = rb_entry(prev, struct extent_map, rb_node);
        }
        *prev_or_next_ret = prev;

        return NULL;
}
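
/*
 * For compressed extents the on-disk size (disk_num_bytes) differs from the
 * logical length (em->len), so the helpers below use the compressed size
 * when computing how far the extent reaches on disk.
 */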
static inline u64 extent_map_block_len(const struct extent_map *em)
{
        if (extent_map_is_compressed(em))
                return em->disk_num_bytes;
        return em->len;
}
static inline u64 extent_map_block_end(const struct extent_map *em)
{
        const u64 block_start = extent_map_block_start(em);
        const u64 block_end = block_start + extent_map_block_len(em);

        if (block_end < block_start)
                return (u64)-1;

        return block_end;
}
static bool can_merge_extent_map(const struct extent_map *em)
{
        if (em->flags & EXTENT_FLAG_PINNED)
                return false;

        /* Don't merge compressed extents, we need to know their actual size. */
        if (extent_map_is_compressed(em))
                return false;

        if (em->flags & EXTENT_FLAG_LOGGING)
                return false;

        /*
         * We don't want to merge stuff that hasn't been written to the log yet
         * since it may not reflect exactly what is on disk, and that would be
         * bad.
         */
        if (!list_empty(&em->list))
                return false;

        return true;
}
/* Check to see if two extent_map structs are adjacent and safe to merge. */
static bool mergeable_maps(const struct extent_map *prev, const struct extent_map *next)
{
        if (extent_map_end(prev) != next->start)
                return false;

        /*
         * The merged flag is not an on-disk flag, it just indicates we had the
         * extent maps of 2 (or more) adjacent extents merged, so factor it out.
         */
        if ((prev->flags & ~EXTENT_FLAG_MERGED) !=
            (next->flags & ~EXTENT_FLAG_MERGED))
                return false;

        if (next->disk_bytenr < EXTENT_MAP_LAST_BYTE - 1)
                return extent_map_block_start(next) == extent_map_block_end(prev);

        /* HOLES and INLINE extents. */
        return next->disk_bytenr == prev->disk_bytenr;
}
/*
 * Handle the on-disk data extents merge for @prev and @next.
 *
 * @prev:    left extent to merge
 * @next:    right extent to merge
 * @merged:  the extent we will not discard after the merge; updated with new values
 *
 * After this, one of the two extents is the new merged extent and the other is
 * removed from the tree and likely freed.  Note that @merged is one of @prev/@next
 * so there is const/non-const aliasing occurring here.
 *
 * Only touches disk_bytenr/disk_num_bytes/offset/ram_bytes.
 * For now only uncompressed regular extents can be merged.
 */
static void merge_ondisk_extents(const struct extent_map *prev, const struct extent_map *next,
                                 struct extent_map *merged)
{
        u64 new_disk_bytenr;
        u64 new_disk_num_bytes;
        u64 new_offset;

        /* @prev and @next should not be compressed. */
        ASSERT(!extent_map_is_compressed(prev));
        ASSERT(!extent_map_is_compressed(next));

        /*
         * There are two different cases where @prev and @next can be merged.
         *
         * 1) They are referring to the same data extent:
         *
         * |<----- data extent A ----->|
         *    |<- prev ->|<- next ->|
         *
         * 2) They are referring to different data extents but still adjacent:
         *
         * |<-- data extent A -->|<-- data extent B -->|
         *            |<- prev ->|<- next ->|
         *
         * The calculation here always merges the data extents first, then updates
         * @offset using the new data extents.
         *
         * For case 1), the merged data extent would be the same.
         * For case 2), we just merge the two data extents into one.
         */
        new_disk_bytenr = min(prev->disk_bytenr, next->disk_bytenr);
        new_disk_num_bytes = max(prev->disk_bytenr + prev->disk_num_bytes,
                                 next->disk_bytenr + next->disk_num_bytes) -
                             new_disk_bytenr;
        new_offset = prev->disk_bytenr + prev->offset - new_disk_bytenr;

        merged->disk_bytenr = new_disk_bytenr;
        merged->disk_num_bytes = new_disk_num_bytes;
        merged->ram_bytes = new_disk_num_bytes;
        merged->offset = new_offset;
}
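
/*
 * Worked example for the offset math in merge_ondisk_extents() above, with
 * hypothetical numbers: if @prev has disk_bytenr=1M, offset=4K and @next has
 * disk_bytenr=1M+128K, offset=0, then new_disk_bytenr = 1M and
 * new_offset = 1M + 4K - 1M = 4K, so @merged keeps pointing at the same data
 * that @prev's file range referred to.
 */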
static void dump_extent_map(struct btrfs_fs_info *fs_info, const char *prefix,
                            struct extent_map *em)
{
        if (!IS_ENABLED(CONFIG_BTRFS_DEBUG))
                return;
        btrfs_crit(fs_info,
"%s, start=%llu len=%llu disk_bytenr=%llu disk_num_bytes=%llu ram_bytes=%llu offset=%llu flags=0x%x",
                   prefix, em->start, em->len, em->disk_bytenr, em->disk_num_bytes,
                   em->ram_bytes, em->offset, em->flags);
        ASSERT(0);
}
/* Internal sanity checks for btrfs debug builds. */
static void validate_extent_map(struct btrfs_fs_info *fs_info, struct extent_map *em)
{
        if (!IS_ENABLED(CONFIG_BTRFS_DEBUG))
                return;
        if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) {
                if (em->disk_num_bytes == 0)
                        dump_extent_map(fs_info, "zero disk_num_bytes", em);
                if (em->offset + em->len > em->ram_bytes)
                        dump_extent_map(fs_info, "ram_bytes too small", em);
                if (em->offset + em->len > em->disk_num_bytes &&
                    !extent_map_is_compressed(em))
                        dump_extent_map(fs_info, "disk_num_bytes too small", em);
                if (!extent_map_is_compressed(em) &&
                    em->ram_bytes != em->disk_num_bytes)
                        dump_extent_map(fs_info,
                "ram_bytes mismatch with disk_num_bytes for non-compressed em",
                                        em);
        } else if (em->offset) {
                dump_extent_map(fs_info, "non-zero offset for hole/inline", em);
        }
}
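
/*
 * Try to merge @em with its immediate neighbours in the tree. Merging is only
 * attempted when nobody else holds extra references on @em; on success the
 * absorbed neighbour is removed from the tree and freed while @em grows to
 * cover both ranges. This runs when an extent map is unpinned, when its
 * logging flag is cleared and when a map is inserted without being tracked
 * for fsync.
 */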
static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em)
{
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        struct extent_map *merge = NULL;
        struct rb_node *rb;

        /*
         * We can't modify an extent map that is in the tree and that is being
         * used by another task, as it can cause that other task to see it in
         * inconsistent state during the merging. We always have 1 reference for
         * the tree and 1 for this task (which is unpinning the extent map or
         * clearing the logging flag), so anything > 2 means it's being used by
         * other tasks too.
         */
        if (refcount_read(&em->refs) > 2)
                return;

        if (!can_merge_extent_map(em))
                return;

        if (em->start != 0) {
                rb = rb_prev(&em->rb_node);
                if (rb)
                        merge = rb_entry(rb, struct extent_map, rb_node);
                if (rb && can_merge_extent_map(merge) && mergeable_maps(merge, em)) {
                        em->start = merge->start;
                        em->len += merge->len;
                        em->generation = max(em->generation, merge->generation);

                        if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE)
                                merge_ondisk_extents(merge, em, em);
                        em->flags |= EXTENT_FLAG_MERGED;

                        validate_extent_map(fs_info, em);
                        remove_em(inode, merge);
                        free_extent_map(merge);
                }
        }

        rb = rb_next(&em->rb_node);
        if (rb)
                merge = rb_entry(rb, struct extent_map, rb_node);
        if (rb && can_merge_extent_map(merge) && mergeable_maps(em, merge)) {
                em->len += merge->len;
                if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE)
                        merge_ondisk_extents(em, merge, em);
                validate_extent_map(fs_info, em);
                em->generation = max(em->generation, merge->generation);
                em->flags |= EXTENT_FLAG_MERGED;
                remove_em(inode, merge);
                free_extent_map(merge);
        }
}
/*
 * Unpin an extent from the cache.
 *
 * @inode:      the inode from which we are unpinning an extent range
 * @start:      logical offset in the file
 * @len:        length of the extent
 * @gen:        generation that this extent has been modified in
 *
 * Called after an extent has been written to disk properly.  Set the generation
 * to the generation that actually added the file item to the inode so we know
 * we need to sync this extent when we call fsync().
 *
 * Returns: 0        on success
 *          -ENOENT  when the extent is not found in the tree
 *          -EUCLEAN if the found extent does not match the expected start
 */
int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen)
{
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        struct extent_map_tree *tree = &inode->extent_tree;
        int ret = 0;
        struct extent_map *em;

        write_lock(&tree->lock);
        em = lookup_extent_mapping(tree, start, len);

        if (WARN_ON(!em)) {
                btrfs_warn(fs_info,
"no extent map found for inode %llu (root %lld) when unpinning extent range [%llu, %llu), generation %llu",
                           btrfs_ino(inode), btrfs_root_id(inode->root),
                           start, start + len, gen);
                ret = -ENOENT;
                goto out;
        }

        if (WARN_ON(em->start != start)) {
                btrfs_warn(fs_info,
"found extent map for inode %llu (root %lld) with unexpected start offset %llu when unpinning extent range [%llu, %llu), generation %llu",
                           btrfs_ino(inode), btrfs_root_id(inode->root),
                           em->start, start, start + len, gen);
                ret = -EUCLEAN;
                goto out;
        }

        em->generation = gen;
        em->flags &= ~EXTENT_FLAG_PINNED;

        try_merge_map(inode, em);

out:
        write_unlock(&tree->lock);
        free_extent_map(em);

        return ret;
}
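
/*
 * Clear the logging flag on @em once the extent has been logged. With the
 * flag gone the map becomes eligible for merging again, so a merge with its
 * neighbours is attempted immediately if the map is still in the tree.
 */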
void clear_em_logging(struct btrfs_inode *inode, struct extent_map *em)
{
        lockdep_assert_held_write(&inode->extent_tree.lock);

        em->flags &= ~EXTENT_FLAG_LOGGING;
        if (extent_map_in_tree(em))
                try_merge_map(inode, em);
}
static inline void setup_extent_mapping(struct btrfs_inode *inode,
                                        struct extent_map *em,
                                        int modified)
{
        refcount_inc(&em->refs);

        ASSERT(list_empty(&em->list));

        if (modified)
                list_add(&em->list, &inode->extent_tree.modified_extents);
        else
                try_merge_map(inode, em);
}
/*
 * Add a new extent map to an inode's extent map tree.
 *
 * @inode:      the target inode
 * @em:         map to insert
 * @modified:   indicate whether the given @em should be added to the
 *              modified list, which indicates the extent needs to be logged
 *
 * Insert @em into the @inode's extent map tree or perform a simple
 * forward/backward merge with existing mappings.  The extent_map struct passed
 * in will be inserted into the tree directly, with an additional reference
 * taken, or a reference dropped if the merge attempt was successful.
 */
static int add_extent_mapping(struct btrfs_inode *inode,
                              struct extent_map *em, int modified)
{
        struct extent_map_tree *tree = &inode->extent_tree;
        struct btrfs_root *root = inode->root;
        struct btrfs_fs_info *fs_info = root->fs_info;
        int ret;

        lockdep_assert_held_write(&tree->lock);

        validate_extent_map(fs_info, em);
        ret = tree_insert(&tree->root, em);
        if (ret)
                return ret;

        setup_extent_mapping(inode, em, modified);

        if (!btrfs_is_testing(fs_info) && is_fstree(btrfs_root_id(root)))
                percpu_counter_inc(&fs_info->evictable_extent_maps);

        return 0;
}
static struct extent_map *
__lookup_extent_mapping(struct extent_map_tree *tree,
                        u64 start, u64 len, int strict)
{
        struct extent_map *em;
        struct rb_node *rb_node;
        struct rb_node *prev_or_next = NULL;
        u64 end = range_end(start, len);

        rb_node = __tree_search(&tree->root, start, &prev_or_next);
        if (!rb_node) {
                if (prev_or_next)
                        rb_node = prev_or_next;
                else
                        return NULL;
        }

        em = rb_entry(rb_node, struct extent_map, rb_node);

        if (strict && !(end > em->start && start < extent_map_end(em)))
                return NULL;

        refcount_inc(&em->refs);
        return em;
}
/*
 * Lookup extent_map that intersects @start + @len range.
 *
 * @tree:       tree to lookup in
 * @start:      byte offset to start the search
 * @len:        length of the lookup range
 *
 * Find and return the first extent_map struct in @tree that intersects the
 * [start, len] range.  There may be additional objects in the tree that
 * intersect, so check the object returned carefully to make sure that no
 * additional lookups are needed.
 */
struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
                                         u64 start, u64 len)
{
        return __lookup_extent_mapping(tree, start, len, 1);
}
/*
 * Find a nearby extent map intersecting @start + @len (not an exact search).
 *
 * @tree:       tree to lookup in
 * @start:      byte offset to start the search
 * @len:        length of the lookup range
 *
 * Find and return the first extent_map struct in @tree that intersects the
 * [start, len] range.
 *
 * If one can't be found, any nearby extent may be returned.
 */
struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
                                         u64 start, u64 len)
{
        return __lookup_extent_mapping(tree, start, len, 0);
}
/*
 * Remove an extent_map from its inode's extent tree.
 *
 * @inode:      the inode the extent map belongs to
 * @em:         extent map being removed
 *
 * Remove @em from the extent tree of @inode.  No reference counts are dropped,
 * and no checks are done to see if the range is in use.
 */
void remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em)
{
        struct extent_map_tree *tree = &inode->extent_tree;

        lockdep_assert_held_write(&tree->lock);

        WARN_ON(em->flags & EXTENT_FLAG_PINNED);
        if (!(em->flags & EXTENT_FLAG_LOGGING))
                list_del_init(&em->list);

        remove_em(inode, em);
}
static void replace_extent_mapping(struct btrfs_inode *inode,
                                   struct extent_map *cur,
                                   struct extent_map *new,
                                   int modified)
{
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        struct extent_map_tree *tree = &inode->extent_tree;

        lockdep_assert_held_write(&tree->lock);

        validate_extent_map(fs_info, new);

        WARN_ON(cur->flags & EXTENT_FLAG_PINNED);
        ASSERT(extent_map_in_tree(cur));
        if (!(cur->flags & EXTENT_FLAG_LOGGING))
                list_del_init(&cur->list);
        rb_replace_node(&cur->rb_node, &new->rb_node, &tree->root);
        RB_CLEAR_NODE(&cur->rb_node);

        setup_extent_mapping(inode, new, modified);
}
static struct extent_map *next_extent_map(const struct extent_map *em)
{
        struct rb_node *next;

        next = rb_next(&em->rb_node);
        if (!next)
                return NULL;
        return container_of(next, struct extent_map, rb_node);
}
static struct extent_map *prev_extent_map(struct extent_map *em)
{
        struct rb_node *prev;

        prev = rb_prev(&em->rb_node);
        if (!prev)
                return NULL;
        return container_of(prev, struct extent_map, rb_node);
}
/*
 * Helper for btrfs_get_extent().  Given an existing extent in the tree (the
 * nearest one to map_start) and a new extent that we want to insert, deal
 * with the overlap and insert the best fitting clipped extent into the tree.
 */
static noinline int merge_extent_mapping(struct btrfs_inode *inode,
                                         struct extent_map *existing,
                                         struct extent_map *em,
                                         u64 map_start)
{
        struct extent_map *prev;
        struct extent_map *next;
        u64 start;
        u64 end;
        u64 start_diff;

        if (map_start < em->start || map_start >= extent_map_end(em))
                return -EINVAL;

        if (existing->start > map_start) {
                next = existing;
                prev = prev_extent_map(next);
        } else {
                prev = existing;
                next = next_extent_map(prev);
        }

        start = prev ? extent_map_end(prev) : em->start;
        start = max_t(u64, start, em->start);
        end = next ? next->start : extent_map_end(em);
        end = min_t(u64, end, extent_map_end(em));
        start_diff = start - em->start;
        em->start = start;
        em->len = end - start;
        if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE)
                em->offset += start_diff;
        return add_extent_mapping(inode, em, 0);
}
/*
 * Add extent mapping into an inode's extent map tree.
 *
 * @inode:    target inode
 * @em_in:    extent we are inserting
 * @start:    start of the logical range btrfs_get_extent() is requesting
 * @len:      length of the logical range btrfs_get_extent() is requesting
 *
 * Note that @em_in's range may be different from [start, start+len),
 * but they must be overlapped.
 *
 * Insert @em_in into the inode's extent map tree. In case there is an
 * overlapping range, handle the -EEXIST by either:
 * a) Returning the existing extent in @em_in if @start is within the
 *    existing em.
 * b) Merge the existing extent with @em_in passed in.
 *
 * Return 0 on success, otherwise -EEXIST.
 */
int btrfs_add_extent_mapping(struct btrfs_inode *inode,
                             struct extent_map **em_in, u64 start, u64 len)
{
        int ret;
        struct extent_map *em = *em_in;
        struct btrfs_fs_info *fs_info = inode->root->fs_info;

        /*
         * Tree-checker should have rejected any inline extent with non-zero
         * file offset. Here just do a sanity check.
         */
        if (em->disk_bytenr == EXTENT_MAP_INLINE)
                ASSERT(em->start == 0);

        ret = add_extent_mapping(inode, em, 0);
        /* it is possible that someone inserted the extent into the tree
         * while we had the lock dropped.  It is also possible that
         * an overlapping map exists in the tree
         */
        if (ret == -EEXIST) {
                struct extent_map *existing;

                existing = search_extent_mapping(&inode->extent_tree, start, len);

                trace_btrfs_handle_em_exist(fs_info, existing, em, start, len);

                /*
                 * existing will always be non-NULL, since there must be an
                 * extent causing the -EEXIST.
                 */
                if (start >= existing->start &&
                    start < extent_map_end(existing)) {
                        free_extent_map(em);
                        *em_in = existing;
                        ret = 0;
                } else {
                        u64 orig_start = em->start;
                        u64 orig_len = em->len;

                        /*
                         * The existing extent map is the one nearest to
                         * the [start, start + len) range which overlaps.
                         */
                        ret = merge_extent_mapping(inode, existing, em, start);
                        if (WARN_ON(ret)) {
                                free_extent_map(em);
                                *em_in = NULL;
                                btrfs_warn(fs_info,
"extent map merge error existing [%llu, %llu) with em [%llu, %llu) start %llu",
                                           existing->start, extent_map_end(existing),
                                           orig_start, orig_start + orig_len, start);
                        }
                        free_extent_map(existing);
                }
        }

        ASSERT(ret == 0 || ret == -EEXIST);
        return ret;
}
/*
 * Drop all extent maps from a tree in the fastest possible way, rescheduling
 * if needed. This avoids searching the tree, from the root down to the first
 * extent map, before each deletion.
 */
static void drop_all_extent_maps_fast(struct btrfs_inode *inode)
{
        struct extent_map_tree *tree = &inode->extent_tree;
        struct rb_node *node;

        write_lock(&tree->lock);
        node = rb_first(&tree->root);
        while (node) {
                struct extent_map *em;
                struct rb_node *next = rb_next(node);

                em = rb_entry(node, struct extent_map, rb_node);
                em->flags &= ~(EXTENT_FLAG_PINNED | EXTENT_FLAG_LOGGING);
                remove_extent_mapping(inode, em);
                free_extent_map(em);

                if (cond_resched_rwlock_write(&tree->lock))
                        node = rb_first(&tree->root);
                else
                        node = next;
        }
        write_unlock(&tree->lock);
}
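
/*
 * Overview of the cases handled by btrfs_drop_extent_map_range() below
 * (illustrative diagram, @end drawn as exclusive for clarity):
 *
 *      drop range:              [ start ............ end )
 *   1) em inside the range:          [==== em ====]            -> removed
 *   2) em starts before:       [====== em ======]              -> part before @start kept
 *   3) em ends after:                        [====== em ======] -> part beyond @end kept
 *   4) em covers the range:    [=========== em ==============] -> both ends kept
 */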
/*
 * Drop all extent maps in a given range.
 *
 * @inode:       The target inode.
 * @start:       Start offset of the range.
 * @end:         End offset of the range (inclusive value).
 * @skip_pinned: Indicate if pinned extent maps should be ignored or not.
 *
 * This drops all the extent maps that intersect the given range [@start, @end].
 * Extent maps that partially overlap the range and extend behind or beyond it,
 * are split.
 * The caller should have locked an appropriate file range in the inode's io
 * tree before calling this function.
 */
void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
                                 bool skip_pinned)
{
        struct extent_map *split;
        struct extent_map *split2;
        struct extent_map *em;
        struct extent_map_tree *em_tree = &inode->extent_tree;
        u64 len = end - start + 1;

        WARN_ON(end < start);
        if (end == (u64)-1) {
                if (start == 0 && !skip_pinned) {
                        drop_all_extent_maps_fast(inode);
                        return;
                }
                len = (u64)-1;
        } else {
                /* Make end offset exclusive for use in the loop below. */
                end++;
        }

        /*
         * It's ok if we fail to allocate the extent maps, see the comment near
         * the bottom of the loop below. We only need two spare extent maps in
         * the worst case, where the first extent map that intersects our range
         * starts before the range and the last extent map that intersects our
         * range ends after our range (and they might be the same extent map),
         * because we need to split those two extent maps at the boundaries.
         */
        split = alloc_extent_map();
        split2 = alloc_extent_map();

        write_lock(&em_tree->lock);
        em = lookup_extent_mapping(em_tree, start, len);

        while (em) {
                /* extent_map_end() returns exclusive value (last byte + 1). */
                const u64 em_end = extent_map_end(em);
                struct extent_map *next_em = NULL;
                u64 gen;
                unsigned long flags;
                bool modified;

                if (em_end < end) {
                        next_em = next_extent_map(em);
                        if (next_em) {
                                if (next_em->start < end)
                                        refcount_inc(&next_em->refs);
                                else
                                        next_em = NULL;
                        }
                }

                if (skip_pinned && (em->flags & EXTENT_FLAG_PINNED)) {
                        start = em_end;
                        goto next;
                }

                flags = em->flags;
                /*
                 * In case we split the extent map, we want to preserve the
                 * EXTENT_FLAG_LOGGING flag on our extent map, but we don't want
                 * it on the new extent maps.
                 */
                em->flags &= ~(EXTENT_FLAG_PINNED | EXTENT_FLAG_LOGGING);
                modified = !list_empty(&em->list);

                /*
                 * The extent map does not cross our target range, so no need to
                 * split it, we can remove it directly.
                 */
                if (em->start >= start && em_end <= end)
                        goto remove_em;

                gen = em->generation;

                if (em->start < start) {
                        if (!split) {
                                split = split2;
                                split2 = NULL;
                                if (!split)
                                        goto remove_em;
                        }
                        split->start = em->start;
                        split->len = start - em->start;

                        if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) {
                                split->disk_bytenr = em->disk_bytenr;
                                split->disk_num_bytes = em->disk_num_bytes;
                                split->offset = em->offset;
                                split->ram_bytes = em->ram_bytes;
                        } else {
                                split->disk_bytenr = em->disk_bytenr;
                                split->disk_num_bytes = 0;
                                split->offset = 0;
                                split->ram_bytes = split->len;
                        }

                        split->generation = gen;
                        split->flags = flags;
                        replace_extent_mapping(inode, em, split, modified);
                        free_extent_map(split);
                        split = split2;
                        split2 = NULL;
                }
                if (em_end > end) {
                        if (!split) {
                                split = split2;
                                split2 = NULL;
                                if (!split)
                                        goto remove_em;
                        }
                        split->start = end;
                        split->len = em_end - end;
                        split->disk_bytenr = em->disk_bytenr;
                        split->flags = flags;
                        split->generation = gen;

                        if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) {
                                split->disk_num_bytes = em->disk_num_bytes;
                                split->offset = em->offset + end - em->start;
                                split->ram_bytes = em->ram_bytes;
                        } else {
                                split->disk_num_bytes = 0;
                                split->offset = 0;
                                split->ram_bytes = split->len;
                        }

                        if (extent_map_in_tree(em)) {
                                replace_extent_mapping(inode, em, split, modified);
                        } else {
                                int ret;

                                ret = add_extent_mapping(inode, split, modified);
                                /* Logic error, shouldn't happen. */
                                ASSERT(ret == 0);
                                if (WARN_ON(ret != 0) && modified)
                                        btrfs_set_inode_full_sync(inode);
                        }
                        free_extent_map(split);
                        split = NULL;
                }
remove_em:
                if (extent_map_in_tree(em)) {
                        /*
                         * If the extent map is still in the tree it means that
                         * either of the following is true:
                         *
                         * 1) It fits entirely in our range (doesn't end beyond
                         *    it or starts before it);
                         *
                         * 2) It starts before our range and/or ends after our
                         *    range, and we were not able to allocate the extent
                         *    maps for split operations, @split and @split2.
                         *
                         * If we are at case 2) then we just remove the entire
                         * extent map - this is fine since if anyone needs it to
                         * access the subranges outside our range, will just
                         * load it again from the subvolume tree's file extent
                         * item. However if the extent map was in the list of
                         * modified extents, then we must mark the inode for a
                         * full fsync, otherwise a fast fsync will miss this
                         * extent if it's new and needs to be logged.
                         */
                        if ((em->start < start || em_end > end) && modified) {
                                ASSERT(!split);
                                btrfs_set_inode_full_sync(inode);
                        }
                        remove_extent_mapping(inode, em);
                }

                /*
                 * Once for the tree reference (we replaced or removed the
                 * extent map from the tree).
                 */
                free_extent_map(em);
next:
                /* Once for us (for our lookup reference). */
                free_extent_map(em);

                em = next_em;
        }

        write_unlock(&em_tree->lock);

        free_extent_map(split);
        free_extent_map(split2);
}
/*
 * Replace a range in the inode's extent map tree with a new extent map.
 *
 * @inode:      The target inode.
 * @new_em:     The new extent map to add to the inode's extent map tree.
 * @modified:   Indicate if the new extent map should be added to the list of
 *              modified extents (for fast fsync tracking).
 *
 * Drops all the extent maps in the inode's extent map tree that intersect the
 * range of the new extent map and adds the new extent map to the tree.
 * The caller should have locked an appropriate file range in the inode's io
 * tree before calling this function.
 */
int btrfs_replace_extent_map_range(struct btrfs_inode *inode,
                                   struct extent_map *new_em,
                                   bool modified)
{
        const u64 end = new_em->start + new_em->len - 1;
        struct extent_map_tree *tree = &inode->extent_tree;
        int ret;

        ASSERT(!extent_map_in_tree(new_em));

        /*
         * The caller has locked an appropriate file range in the inode's io
         * tree, but getting -EEXIST when adding the new extent map can still
         * happen in case there are extents that partially cover the range, and
         * this is due to two tasks operating on different parts of the extent.
         * See commit 18e83ac75bfe67 ("Btrfs: fix unexpected EEXIST from
         * btrfs_get_extent") for an example and details.
         */
        do {
                btrfs_drop_extent_map_range(inode, new_em->start, end, false);
                write_lock(&tree->lock);
                ret = add_extent_mapping(inode, new_em, modified);
                write_unlock(&tree->lock);
        } while (ret == -EEXIST);

        return ret;
}
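
/*
 * Layout sketch for the split performed by split_extent_map() below
 * (illustrative):
 *
 *   before:  [ em: start ......................... start + len )
 *   after:   [ split_pre: pre bytes )[ split_mid: len - pre bytes )
 *
 * split_pre is relocated to @new_logical while split_mid keeps pointing at
 * the original on-disk location, shifted forward by @pre bytes.
 */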
/*
 * Split off the first pre bytes from the extent_map at [start, start + len],
 * and set the block start for it to new_logical.
 *
 * This function is used when an ordered_extent needs to be split.
 */
int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
                     u64 new_logical)
{
        struct extent_map_tree *em_tree = &inode->extent_tree;
        struct extent_map *em;
        struct extent_map *split_pre = NULL;
        struct extent_map *split_mid = NULL;
        int ret = 0;
        unsigned long flags;

        ASSERT(pre != 0);
        ASSERT(pre < len);

        split_pre = alloc_extent_map();
        if (!split_pre)
                return -ENOMEM;
        split_mid = alloc_extent_map();
        if (!split_mid) {
                ret = -ENOMEM;
                goto out_free_pre;
        }

        lock_extent(&inode->io_tree, start, start + len - 1, NULL);
        write_lock(&em_tree->lock);
        em = lookup_extent_mapping(em_tree, start, len);
        if (!em) {
                ret = -EIO;
                goto out_unlock;
        }

        ASSERT(em->len == len);
        ASSERT(!extent_map_is_compressed(em));
        ASSERT(em->disk_bytenr < EXTENT_MAP_LAST_BYTE);
        ASSERT(em->flags & EXTENT_FLAG_PINNED);
        ASSERT(!(em->flags & EXTENT_FLAG_LOGGING));
        ASSERT(!list_empty(&em->list));

        flags = em->flags;
        em->flags &= ~EXTENT_FLAG_PINNED;

        /* First, replace the em with a new extent_map starting from em->start. */
        split_pre->start = em->start;
        split_pre->len = pre;
        split_pre->disk_bytenr = new_logical;
        split_pre->disk_num_bytes = split_pre->len;
        split_pre->offset = 0;
        split_pre->ram_bytes = split_pre->len;
        split_pre->flags = flags;
        split_pre->generation = em->generation;

        replace_extent_mapping(inode, em, split_pre, 1);

        /*
         * Now we only have an extent_map at:
         *     [em->start, em->start + pre]
         */

        /* Insert the middle extent_map. */
        split_mid->start = em->start + pre;
        split_mid->len = em->len - pre;
        split_mid->disk_bytenr = extent_map_block_start(em) + pre;
        split_mid->disk_num_bytes = split_mid->len;
        split_mid->offset = 0;
        split_mid->ram_bytes = split_mid->len;
        split_mid->flags = flags;
        split_mid->generation = em->generation;
        add_extent_mapping(inode, split_mid, 1);

        /* Once for us */
        free_extent_map(em);
        /* Once for the tree */
        free_extent_map(em);

out_unlock:
        write_unlock(&em_tree->lock);
        unlock_extent(&inode->io_tree, start, start + len - 1, NULL);
        free_extent_map(split_mid);
out_free_pre:
        free_extent_map(split_pre);
        return ret;
}
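
/*
 * State passed around while the extent map shrinker walks roots and inodes:
 * nr_to_scan is the scan budget requested via btrfs_free_extent_maps() and
 * scanned counts how many extent maps have been examined so far.
 */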
struct btrfs_em_shrink_ctx {
        long nr_to_scan;
        long scanned;
};
static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_ctx *ctx)
{
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        const u64 cur_fs_gen = btrfs_get_fs_generation(fs_info);
        struct extent_map_tree *tree = &inode->extent_tree;
        long nr_dropped = 0;
        struct rb_node *node;

        /*
         * Take the mmap lock so that we serialize with the inode logging phase
         * of fsync because we may need to set the full sync flag on the inode,
         * in case we have to remove extent maps in the tree's list of modified
         * extents. If we set the full sync flag in the inode while an fsync is
         * in progress, we may risk missing new extents because before the flag
         * is set, fsync decides to only wait for writeback to complete and then
         * during inode logging it sees the flag set and uses the subvolume tree
         * to find new extents, which may not be there yet because ordered
         * extents haven't completed yet.
         *
         * We also do a try lock because otherwise we could deadlock. This is
         * because the shrinker for this filesystem may be invoked while we are
         * in a path that is holding the mmap lock in write mode. For example in
         * a reflink operation while COWing an extent buffer, when allocating
         * pages for a new extent buffer and under memory pressure, the shrinker
         * may be invoked, and therefore we would deadlock by attempting to read
         * lock the mmap lock while we are holding already a write lock on it.
         */
        if (!down_read_trylock(&inode->i_mmap_lock))
                return 0;

        /*
         * We want to be fast so if the lock is busy we don't want to spend time
         * waiting for it - either some task is about to do IO for the inode or
         * we may have another task shrinking extent maps, here in this code, so
         * it's not worth waiting for the lock.
         */
        if (!write_trylock(&tree->lock)) {
                up_read(&inode->i_mmap_lock);
                return 0;
        }

        node = rb_first(&tree->root);
        while (node) {
                struct rb_node *next = rb_next(node);
                struct extent_map *em;

                em = rb_entry(node, struct extent_map, rb_node);
                ctx->scanned++;

                if (em->flags & EXTENT_FLAG_PINNED)
                        goto next;

                /*
                 * If the extent map is in the list of modified extents (new) and its
                 * generation is the same (or is greater than) the current fs
                 * generation, it means it was not yet persisted so we have to
                 * set the full sync flag so that the next fsync will not miss
                 * it.
                 */
                if (!list_empty(&em->list) && em->generation >= cur_fs_gen)
                        btrfs_set_inode_full_sync(inode);

                remove_extent_mapping(inode, em);
                trace_btrfs_extent_map_shrinker_remove_em(inode, em);
                /* Drop the reference for the tree. */
                free_extent_map(em);
                nr_dropped++;
next:
                if (ctx->scanned >= ctx->nr_to_scan)
                        break;

                /*
                 * Stop if we need to reschedule or there's contention on the
                 * lock. This is to avoid slowing other tasks trying to take the
                 * lock.
                 */
                if (need_resched() || rwlock_needbreak(&tree->lock) ||
                    btrfs_fs_closing(fs_info))
                        break;
                node = next;
        }
        write_unlock(&tree->lock);
        up_read(&inode->i_mmap_lock);

        return nr_dropped;
}
static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_inode *inode;
        long nr_dropped = 0;
        u64 min_ino = fs_info->em_shrinker_last_ino + 1;

        inode = btrfs_find_first_inode(root, min_ino);
        while (inode) {
                nr_dropped += btrfs_scan_inode(inode, ctx);

                min_ino = btrfs_ino(inode) + 1;
                fs_info->em_shrinker_last_ino = btrfs_ino(inode);
                btrfs_add_delayed_iput(inode);

                if (ctx->scanned >= ctx->nr_to_scan ||
                    btrfs_fs_closing(inode->root->fs_info))
                        break;

                cond_resched();

                inode = btrfs_find_first_inode(root, min_ino);
        }

        if (inode) {
                /*
                 * There are still inodes in this root or we happened to process
                 * the last one and reached the scan limit. In either case set
                 * the current root to this one, so we'll resume from the next
                 * inode if there is one or we will find out this was the last
                 * one and move to the next root.
                 */
                fs_info->em_shrinker_last_root = btrfs_root_id(root);
        } else {
                /*
                 * No more inodes in this root, set em_shrinker_last_ino to 0 so
                 * that when processing the next root we start from its first inode.
                 */
                fs_info->em_shrinker_last_ino = 0;
                fs_info->em_shrinker_last_root = btrfs_root_id(root) + 1;
        }

        return nr_dropped;
}
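
/*
 * Work item for the extent map shrinker: resumes from the last scanned root
 * and inode, and if the scan started in the middle of the root id space it
 * wraps around to the first root once (tracked by the "cycled" flag) so that
 * every root gets a chance to be scanned.
 */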
static void btrfs_extent_map_shrinker_worker(struct work_struct *work)
{
        struct btrfs_fs_info *fs_info;
        struct btrfs_em_shrink_ctx ctx;
        u64 start_root_id;
        u64 next_root_id;
        bool cycled = false;
        long nr_dropped = 0;

        fs_info = container_of(work, struct btrfs_fs_info, em_shrinker_work);

        ctx.scanned = 0;
        ctx.nr_to_scan = atomic64_read(&fs_info->em_shrinker_nr_to_scan);

        start_root_id = fs_info->em_shrinker_last_root;
        next_root_id = fs_info->em_shrinker_last_root;

        if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) {
                s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);

                trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr);
        }

        while (ctx.scanned < ctx.nr_to_scan && !btrfs_fs_closing(fs_info)) {
                struct btrfs_root *root;
                unsigned long count;

                cond_resched();

                spin_lock(&fs_info->fs_roots_radix_lock);
                count = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
                                               (void **)&root,
                                               (unsigned long)next_root_id, 1);
                if (count == 0) {
                        spin_unlock(&fs_info->fs_roots_radix_lock);
                        if (start_root_id > 0 && !cycled) {
                                next_root_id = 0;
                                fs_info->em_shrinker_last_root = 0;
                                fs_info->em_shrinker_last_ino = 0;
                                cycled = true;
                                continue;
                        }
                        break;
                }
                next_root_id = btrfs_root_id(root) + 1;
                root = btrfs_grab_root(root);
                spin_unlock(&fs_info->fs_roots_radix_lock);

                if (!root)
                        continue;

                if (is_fstree(btrfs_root_id(root)))
                        nr_dropped += btrfs_scan_root(root, &ctx);

                btrfs_put_root(root);
        }

        if (trace_btrfs_extent_map_shrinker_scan_exit_enabled()) {
                s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);

                trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped, nr);
        }

        atomic64_set(&fs_info->em_shrinker_nr_to_scan, 0);
}
void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
{
        /*
         * Do nothing if the shrinker is already running. In case of high memory
         * pressure we can have a lot of tasks calling us and all passing the
         * same nr_to_scan value, but in reality we may need only to free
         * nr_to_scan extent maps (or less). In case we need to free more than
         * that, we will be called again by the fs shrinker, so no worries about
         * not doing enough work to reclaim memory from extent maps.
         * We can also be repeatedly called with the same nr_to_scan value
         * simply because the shrinker runs asynchronously and multiple calls
         * to this function are made before the shrinker does enough progress.
         *
         * That's why we set the atomic counter to nr_to_scan only if its
         * current value is zero, instead of incrementing the counter by
         * nr_to_scan.
         */
        if (atomic64_cmpxchg(&fs_info->em_shrinker_nr_to_scan, 0, nr_to_scan) != 0)
                return;

        queue_work(system_unbound_wq, &fs_info->em_shrinker_work);
}
void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info)
{
        atomic64_set(&fs_info->em_shrinker_nr_to_scan, 0);
        INIT_WORK(&fs_info->em_shrinker_work, btrfs_extent_map_shrinker_worker);
}
);