// SPDX-License-Identifier: GPL-2.0

#include "btrfs_inode.h"
struct btrfs_fiemap_entry {
        u64 offset;
        u64 phys;
        u64 len;
        u32 flags;
};
/*
 * Indicate to the caller of emit_fiemap_extent() that it needs to unlock the
 * file range from the inode's io tree, unlock the subvolume tree search path,
 * flush the fiemap cache, relock the file range and search the subvolume tree
 * again. The value here is something negative that can't be confused with a
 * valid errno value and different from 1 because that's also a return value
 * from fiemap_fill_next_extent() and also it's often used to mean some btree
 * search did not find a key, so make it some distinct negative value.
 */
#define BTRFS_FIEMAP_FLUSH_CACHE        (-(MAX_ERRNO + 1))
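
/*
 * Illustrative sketch of the expected caller pattern (hypothetical code,
 * mirroring what extent_fiemap() does further below):
 *
 *      ret = emit_fiemap_extent(fieinfo, cache, offset, phys, len, flags);
 *      if (ret == BTRFS_FIEMAP_FLUSH_CACHE) {
 *              unlock_extent(&inode->io_tree, range_start, range_end, &cached_state);
 *              btrfs_release_path(path);
 *              ret = flush_fiemap_cache(fieinfo, cache);
 *              ... then relock the file range and search the subvolume tree again
 *      }
 */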
/*
 * The fiemap cache is used to:
 *
 * - Cache the next entry to be emitted to the fiemap buffer, so that we can
 *   merge extents that are contiguous and can be grouped as a single one;
 *
 * - Store extents ready to be written to the fiemap buffer in an intermediary
 *   buffer. This intermediary buffer is to ensure that in case the fiemap
 *   buffer is memory mapped to the fiemap target file, we don't deadlock
 *   during btrfs_page_mkwrite(). This is because during fiemap we are locking
 *   an extent range in order to prevent races with delalloc flushing and
 *   ordered extent completion, which is needed in order to reliably detect
 *   delalloc in holes and prealloc extents. And this can lead to a deadlock
 *   if the fiemap buffer is memory mapped to the file we are running fiemap
 *   against (a silly, useless in practice scenario, but possible) because
 *   btrfs_page_mkwrite() will try to lock the same extent range.
 */
struct fiemap_cache {
        /* An array of ready fiemap entries. */
        struct btrfs_fiemap_entry *entries;
        /* Number of entries in the entries array. */
        int entries_size;
        /* Index of the next entry in the entries array to write to. */
        int entries_pos;
        /*
         * Once the entries array is full, this indicates what's the offset for
         * the next file extent item we must search for in the inode's subvolume
         * tree after unlocking the extent range in the inode's io tree and
         * releasing the search path.
         */
        u64 next_search_offset;
        /*
         * This matches struct fiemap_extent_info::fi_mapped_extents, we use it
         * to count the extents we have emitted ourselves and stop instead of
         * relying on fiemap_fill_next_extent(), because we buffer ready fiemap
         * entries in the @entries array and we want to stop as soon as we hit
         * the maximum amount of extents to map, not just to save time but also
         * to make the logic at extent_fiemap() simpler.
         */
        unsigned int extents_mapped;
        /* Fields for the cached extent (the one not yet submitted to @entries). */
        u64 offset;
        u64 phys;
        u64 len;
        u32 flags;
        bool cached;
};
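
/*
 * Sizing note (see extent_fiemap() below): the @entries array is allocated
 * with PAGE_SIZE / sizeof(struct btrfs_fiemap_entry) slots, so roughly one
 * page worth of ready entries is buffered before the cache has to be flushed
 * to the fiemap buffer with flush_fiemap_cache().
 */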
static int flush_fiemap_cache(struct fiemap_extent_info *fieinfo,
                              struct fiemap_cache *cache)
{
        for (int i = 0; i < cache->entries_pos; i++) {
                struct btrfs_fiemap_entry *entry = &cache->entries[i];
                int ret;

                ret = fiemap_fill_next_extent(fieinfo, entry->offset,
                                              entry->phys, entry->len,
                                              entry->flags);
                /*
                 * Ignore 1 (reached max entries) because we keep track of that
                 * ourselves in emit_fiemap_extent().
                 */
                if (ret < 0)
                        return ret;
        }

        cache->entries_pos = 0;

        return 0;
}
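
/*
 * Usage note: extent_fiemap() calls this only after unlocking the file range
 * and releasing the search path, both when emit_fiemap_extent() returns
 * BTRFS_FIEMAP_FLUSH_CACHE (the @entries array is full) and once more before
 * emitting the last cached extent.
 */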
/*
 * Helper to submit a fiemap extent.
 *
 * Will try to merge the current fiemap extent, specified by @offset, @phys,
 * @len and @flags, with the cached one. Only when merging fails is the cached
 * extent submitted as a fiemap extent.
 *
 * Return value is the same as fiemap_fill_next_extent().
 */
static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
                              struct fiemap_cache *cache,
                              u64 offset, u64 phys, u64 len, u32 flags)
{
        struct btrfs_fiemap_entry *entry;
        u64 cache_end;

        /* Set at the end of extent_fiemap(). */
        ASSERT((flags & FIEMAP_EXTENT_LAST) == 0);

        if (!cache->cached)
                goto assign;
        /*
         * When iterating the extents of the inode, at extent_fiemap(), we may
         * find an extent that starts at an offset behind the end offset of the
         * previous extent we processed. This happens if fiemap is called
         * without FIEMAP_FLAG_SYNC and there are ordered extents completing
         * after we had to unlock the file range, release the search path, emit
         * the fiemap extents stored in the buffer (cache->entries array) and
         * then lock the remainder of the range and re-search the btree.
         *
         * For example we are in leaf X processing its last item, which is the
         * file extent item for file range [512K, 1M[, and after
         * btrfs_next_leaf() releases the path, there's an ordered extent that
         * completes for the file range [768K, 2M[, and that results in trimming
         * the file extent item so that it now corresponds to the file range
         * [512K, 768K[ and a new file extent item is inserted for the file
         * range [768K, 2M[, which may end up as the last item of leaf X or as
         * the first item of the next leaf - in either case btrfs_next_leaf()
         * will leave us with a path pointing to the new extent item, for the
         * file range [768K, 2M[, since that's the first key that follows the
         * last one we processed. So in order not to report overlapping extents
         * to user space, we trim the length of the previously cached extent
         * and emit it.
         *
         * Upon calling btrfs_next_leaf() we may also find an extent with an
         * offset smaller than or equal to cache->offset, and this happens
         * when we had a hole or prealloc extent with several delalloc ranges
         * in it, but after btrfs_next_leaf() released the path, delalloc was
         * flushed and the resulting ordered extents were completed, so we can
         * now have found a file extent item for an offset that is smaller
         * than or equal to what we have in cache->offset. We deal with this
         * as described below.
         */
        cache_end = cache->offset + cache->len;
        if (cache_end > offset) {
                if (offset == cache->offset) {
                        /*
                         * We cached a delalloc range (found in the io tree) for
                         * a hole or prealloc extent and we have now found a
                         * file extent item for the same offset. What we have
                         * now is more recent and up to date, so discard what
                         * we had in the cache and use what we have just found.
                         */
                        goto assign;
                } else if (offset > cache->offset) {
                        /*
                         * The extent range we previously found ends after the
                         * offset of the file extent item we found and that
                         * offset falls somewhere in the middle of that previous
                         * extent range. So adjust the range we previously found
                         * to end at the offset of the file extent item we have
                         * just found, since this extent is more up to date.
                         * Emit that adjusted range and cache the file extent
                         * item we have just found. This corresponds to the case
                         * where a previously found file extent item was split
                         * due to an ordered extent completing.
                         */
                        cache->len = offset - cache->offset;
                        goto emit;
                } else {
                        const u64 range_end = offset + len;

                        /*
                         * The offset of the file extent item we have just found
                         * is behind the cached offset. This means we were
                         * processing a hole or prealloc extent for which we
                         * have found delalloc ranges (in the io tree), so what
                         * we have in the cache is the last delalloc range we
                         * found while the file extent item we found can be
                         * either for a whole delalloc range we previously
                         * emitted or only a part of that range.
                         *
                         * We have two cases here:
                         *
                         * 1) The file extent item's range ends at or behind the
                         *    cached extent's end. In this case just ignore the
                         *    current file extent item because we don't want to
                         *    overlap with previous ranges that may have been
                         *    emitted already;
                         *
                         * 2) The file extent item starts behind the currently
                         *    cached extent but its end offset goes beyond the
                         *    end offset of the cached extent. We don't want to
                         *    overlap with a previous range that may have been
                         *    emitted already, so we emit the currently cached
                         *    extent and then partially store the current file
                         *    extent item's range in the cache, for the subrange
                         *    going from the cached extent's end to the end of
                         *    the file extent item's range.
                         */
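                        /*
                         * Worked example with made-up numbers: the cache holds
                         * a delalloc range [256K, 512K) and the file extent
                         * item we just found covers [0, 384K). Its end (384K)
                         * is behind the cached end (512K), so it is ignored
                         * (case 1). If the item instead covered [0, 768K), we
                         * would emit the cached [256K, 512K) range and then
                         * cache only the trailing subrange [512K, 768K)
                         * (case 2).
                         */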
                        if (range_end <= cache_end)
                                return 0;

                        if (!(flags & (FIEMAP_EXTENT_ENCODED | FIEMAP_EXTENT_DELALLOC)))
                                phys += cache_end - offset;

                        offset = cache_end;
                        len = range_end - cache_end;
                        goto emit;
                }
        }

        /*
         * Only merge fiemap extents if:
         *
         * 1) Their logical addresses are continuous;
         *
         * 2) Their physical addresses are continuous, so truly compressed
         *    (physical size smaller than logical size) extents won't get
         *    merged with each other;
         *
         * 3) They share the same flags.
         */
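        /*
         * Example with made-up numbers: a cached extent for file range
         * [0, 128K) at physical offset 1M and a new extent for [128K, 256K)
         * at physical 1M + 128K with identical flags merge into a single
         * cached extent covering [0, 256K), so user space sees one fiemap
         * extent instead of two.
         */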
        if (cache->offset + cache->len == offset &&
            cache->phys + cache->len == phys &&
            cache->flags == flags) {
                cache->len += len;
                return 0;
        }

emit:
        /* Not mergeable, need to submit the cached one. */
        if (cache->entries_pos == cache->entries_size) {
                /*
                 * We will need to search again from the end offset of the last
                 * stored extent, and not from the current offset, because after
                 * unlocking the range and releasing the path, if there's a hole
                 * between that end offset and this current offset, a new extent
                 * may have been inserted due to a new write, so we don't want
                 * to miss it.
                 */
                entry = &cache->entries[cache->entries_size - 1];
                cache->next_search_offset = entry->offset + entry->len;
                cache->cached = false;

                return BTRFS_FIEMAP_FLUSH_CACHE;
        }

        entry = &cache->entries[cache->entries_pos];
        entry->offset = cache->offset;
        entry->phys = cache->phys;
        entry->len = cache->len;
        entry->flags = cache->flags;
        cache->entries_pos++;
        cache->extents_mapped++;

        if (cache->extents_mapped == fieinfo->fi_extents_max) {
                cache->cached = false;
                return 1;
        }

assign:
        cache->cached = true;
        cache->offset = offset;
        cache->phys = phys;
        cache->len = len;
        cache->flags = flags;

        return 0;
}
/*
 * Emit the last fiemap cache.
 *
 * The last fiemap cache may still be cached in the following case:
 *
 * |<- Fiemap range ->|
 * |<------------ First extent ----------->|
 *
 * In this case, the first extent range will be cached but not emitted.
 * So we must emit it before ending extent_fiemap().
 */
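
/*
 * Concrete example with made-up numbers: if the requested fiemap range is
 * [0, 4K) and the file's first extent covers [0, 8K), emit_fiemap_extent()
 * caches that extent but never submits it, because no later extent arrives to
 * push it out of the cache. This helper submits that leftover cached extent.
 */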
static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
                                  struct fiemap_cache *cache)
{
        int ret;

        if (!cache->cached)
                return 0;

        ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
                                      cache->len, cache->flags);
        cache->cached = false;
        if (ret > 0)
                ret = 0;
        return ret;
}
static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *path)
{
        struct extent_buffer *clone = path->nodes[0];
        struct btrfs_key key;
        int slot;
        int ret;

        path->slots[0]++;
        if (path->slots[0] < btrfs_header_nritems(path->nodes[0]))
                return 0;

        /*
         * Add a temporary extra ref to an already cloned extent buffer to
         * prevent btrfs_next_leaf() freeing it, we want to reuse it to avoid
         * the cost of allocating a new one.
         */
        ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED, &clone->bflags));
        atomic_inc(&clone->refs);

        ret = btrfs_next_leaf(inode->root, path);
        if (ret != 0)
                goto out;

        /*
         * Don't bother with cloning if there are no more file extent items for
         * our inode.
         */
        btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
        if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) {
                ret = 1;
                goto out;
        }

        /*
         * Important to preserve the start field, for the optimizations when
         * checking if extents are shared (see extent_fiemap()).
         *
         * We must set ->start before calling copy_extent_buffer_full(). If we
         * are on sub-pagesize blocksize, we use ->start to determine the offset
         * into the folio where our eb exists, and if we update ->start after
         * the fact then any subsequent reads of the eb may read from a
         * different offset in the folio than where we originally copied into.
         */
        clone->start = path->nodes[0]->start;
        /* See the comment at fiemap_search_slot() about why we clone. */
        copy_extent_buffer_full(clone, path->nodes[0]);

        slot = path->slots[0];
        btrfs_release_path(path);
        path->nodes[0] = clone;
        path->slots[0] = slot;
out:
        if (ret)
                free_extent_buffer(clone);

        return ret;
}
/*
 * Search for the first file extent item that starts at a given file offset or
 * the one that starts immediately before that offset.
 * Returns: 0 on success, < 0 on error, 1 if not found.
 */
static int fiemap_search_slot(struct btrfs_inode *inode, struct btrfs_path *path,
                              u64 file_offset)
{
        const u64 ino = btrfs_ino(inode);
        struct btrfs_root *root = inode->root;
        struct extent_buffer *clone;
        struct btrfs_key key;
        int slot;
        int ret;

        key.objectid = ino;
        key.type = BTRFS_EXTENT_DATA_KEY;
        key.offset = file_offset;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                return ret;

        if (ret > 0 && path->slots[0] > 0) {
                btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
                if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
                        path->slots[0]--;
        }

        if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
                ret = btrfs_next_leaf(root, path);
                if (ret != 0)
                        return ret;

                btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
                if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
                        return 1;
        }

        /*
         * We clone the leaf and use it during fiemap. This is because while
         * using the leaf we do expensive things like checking if an extent is
         * shared, which can take a long time. In order to prevent blocking
         * other tasks for too long, we use a clone of the leaf. We have locked
         * the file range in the inode's io tree, so we know none of our file
         * extent items can change. This way we avoid blocking other tasks that
         * want to insert items for other inodes in the same leaf or b+tree
         * rebalance operations (triggered for example when someone is trying
         * to push items into this leaf when trying to insert an item in a
         * neighbouring leaf).
         *
         * We also need the private clone because holding a read lock on an
         * extent buffer of the subvolume's b+tree will make lockdep unhappy
         * when we check if extents are shared, as backref walking may need to
         * lock the same leaf we are processing.
         */
        clone = btrfs_clone_extent_buffer(path->nodes[0]);
        if (!clone)
                return -ENOMEM;

        slot = path->slots[0];
        btrfs_release_path(path);
        path->nodes[0] = clone;
        path->slots[0] = slot;

        return 0;
}
/*
 * Process a range which is a hole or a prealloc extent in the inode's subvolume
 * btree. If @disk_bytenr is 0, we are dealing with a hole, otherwise a prealloc
 * extent. The end offset (@end) is inclusive.
 */
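
/*
 * Note (summary of the logic below): for a prealloc extent (@disk_bytenr != 0)
 * this reports, in file offset order, each delalloc subrange found in the io
 * tree (flagged as DELALLOC | UNKNOWN) interleaved with the remaining prealloc
 * subranges (flagged as UNWRITTEN). For example, a prealloc extent with a
 * single delalloc range in its middle is reported as three fiemap extents:
 * prealloc, delalloc, prealloc.
 */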
static int fiemap_process_hole(struct btrfs_inode *inode,
                               struct fiemap_extent_info *fieinfo,
                               struct fiemap_cache *cache,
                               struct extent_state **delalloc_cached_state,
                               struct btrfs_backref_share_check_ctx *backref_ctx,
                               u64 disk_bytenr, u64 extent_offset,
                               u64 extent_gen, u64 start, u64 end)
{
        const u64 i_size = i_size_read(&inode->vfs_inode);
        u64 cur_offset = start;
        u64 last_delalloc_end = 0;
        u32 prealloc_flags = FIEMAP_EXTENT_UNWRITTEN;
        bool checked_extent_shared = false;
        int ret;

        /*
         * There can be no delalloc past i_size, so don't waste time looking
         * for it beyond i_size.
         */
        while (cur_offset < end && cur_offset < i_size) {
                u64 delalloc_start;
                u64 delalloc_end;
                u64 prealloc_start;
                u64 prealloc_len = 0;
                bool delalloc;

                delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end,
                                                        delalloc_cached_state,
                                                        &delalloc_start,
                                                        &delalloc_end);
                if (!delalloc)
                        break;

                /*
                 * If this is a prealloc extent we have to report every section
                 * of it that has no delalloc.
                 */
                if (disk_bytenr != 0) {
                        if (last_delalloc_end == 0) {
                                prealloc_start = start;
                                prealloc_len = delalloc_start - start;
                        } else {
                                prealloc_start = last_delalloc_end + 1;
                                prealloc_len = delalloc_start - prealloc_start;
                        }
                }

                if (prealloc_len > 0) {
                        if (!checked_extent_shared && fieinfo->fi_extents_max) {
                                ret = btrfs_is_data_extent_shared(inode,
                                                                  disk_bytenr,
                                                                  extent_gen,
                                                                  backref_ctx);
                                if (ret < 0)
                                        return ret;
                                else if (ret > 0)
                                        prealloc_flags |= FIEMAP_EXTENT_SHARED;

                                checked_extent_shared = true;
                        }
                        ret = emit_fiemap_extent(fieinfo, cache, prealloc_start,
                                                 disk_bytenr + extent_offset,
                                                 prealloc_len, prealloc_flags);
                        if (ret)
                                return ret;
                        extent_offset += prealloc_len;
                }

                ret = emit_fiemap_extent(fieinfo, cache, delalloc_start, 0,
                                         delalloc_end + 1 - delalloc_start,
                                         FIEMAP_EXTENT_DELALLOC |
                                         FIEMAP_EXTENT_UNKNOWN);
                if (ret)
                        return ret;

                last_delalloc_end = delalloc_end;
                cur_offset = delalloc_end + 1;
                extent_offset += cur_offset - delalloc_start;
        }

        /*
         * Either we found no delalloc for the whole prealloc extent or we have
         * a prealloc extent that spans i_size or starts at or after i_size.
         */
        if (disk_bytenr != 0 && last_delalloc_end < end) {
                u64 prealloc_start;
                u64 prealloc_len;

                if (last_delalloc_end == 0) {
                        prealloc_start = start;
                        prealloc_len = end + 1 - start;
                } else {
                        prealloc_start = last_delalloc_end + 1;
                        prealloc_len = end + 1 - prealloc_start;
                }

                if (!checked_extent_shared && fieinfo->fi_extents_max) {
                        ret = btrfs_is_data_extent_shared(inode, disk_bytenr,
                                                          extent_gen,
                                                          backref_ctx);
                        if (ret < 0)
                                return ret;
                        else if (ret > 0)
                                prealloc_flags |= FIEMAP_EXTENT_SHARED;
                }
                ret = emit_fiemap_extent(fieinfo, cache, prealloc_start,
                                         disk_bytenr + extent_offset,
                                         prealloc_len, prealloc_flags);
                if (ret)
                        return ret;
        }

        return 0;
}
static int fiemap_find_last_extent_offset(struct btrfs_inode *inode,
                                          struct btrfs_path *path,
                                          u64 *last_extent_end_ret)
{
        const u64 ino = btrfs_ino(inode);
        struct btrfs_root *root = inode->root;
        struct extent_buffer *leaf;
        struct btrfs_file_extent_item *ei;
        struct btrfs_key key;
        u64 disk_bytenr;
        int ret;

        /*
         * Lookup the last file extent. We're not using i_size here because
         * there might be preallocation past i_size.
         */
        ret = btrfs_lookup_file_extent(NULL, root, path, ino, (u64)-1, 0);
        /* There can't be a file extent item at offset (u64)-1. */
        ASSERT(ret != 0);
        if (ret < 0)
                return ret;

        /*
         * For a non-existing key, btrfs_search_slot() always leaves us at a
         * slot > 0, except if the btree is empty, which is impossible because
         * at least it has the inode item for this inode and all the items for
         * the root inode 256.
         */
        ASSERT(path->slots[0] > 0);
        path->slots[0]--;
        leaf = path->nodes[0];
        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
        if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
                /* No file extent items in the subvolume tree. */
                *last_extent_end_ret = 0;
                return 0;
        }

        /*
         * For an inline extent, the disk_bytenr is where inline data starts at,
         * so first check if we have an inline extent item before checking if we
         * have an implicit hole (disk_bytenr == 0).
         */
        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
        if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) {
                *last_extent_end_ret = btrfs_file_extent_end(path);
                return 0;
        }

        /*
         * Find the last file extent item that is not a hole (when NO_HOLES is
         * not enabled). This should take at most 2 iterations in the worst
         * case: we have one hole file extent item at slot 0 of a leaf and
         * another hole file extent item as the last item in the previous leaf.
         * This is because we merge file extent items that represent holes.
         */
        disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
        while (disk_bytenr == 0) {
                ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
                if (ret < 0) {
                        return ret;
                } else if (ret > 0) {
                        /* No file extent items that are not holes. */
                        *last_extent_end_ret = 0;
                        return 0;
                }
                leaf = path->nodes[0];
                ei = btrfs_item_ptr(leaf, path->slots[0],
                                    struct btrfs_file_extent_item);
                disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
        }

        *last_extent_end_ret = btrfs_file_extent_end(path);
        return 0;
}
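
/*
 * Note (summary of how the result is used): extent_fiemap() compares the end
 * offset of its currently cached extent against the value returned here to
 * decide whether that cached extent is the last one in the file, and therefore
 * whether to set FIEMAP_EXTENT_LAST on it before emitting it.
 */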
static int extent_fiemap(struct btrfs_inode *inode,
                         struct fiemap_extent_info *fieinfo,
                         u64 start, u64 len)
{
        const u64 ino = btrfs_ino(inode);
        struct extent_state *cached_state = NULL;
        struct extent_state *delalloc_cached_state = NULL;
        struct btrfs_path *path;
        struct fiemap_cache cache = { 0 };
        struct btrfs_backref_share_check_ctx *backref_ctx;
        u64 last_extent_end = 0;
        u64 prev_extent_end;
        u64 range_start;
        u64 range_end;
        const u64 sectorsize = inode->root->fs_info->sectorsize;
        bool stopped = false;
        int ret;

        cache.entries_size = PAGE_SIZE / sizeof(struct btrfs_fiemap_entry);
        cache.entries = kmalloc_array(cache.entries_size,
                                      sizeof(struct btrfs_fiemap_entry),
                                      GFP_KERNEL);
        backref_ctx = btrfs_alloc_backref_share_check_ctx();
        path = btrfs_alloc_path();
        if (!cache.entries || !backref_ctx || !path) {
                ret = -ENOMEM;
                goto out;
        }
restart:
        range_start = round_down(start, sectorsize);
        range_end = round_up(start + len, sectorsize);
        prev_extent_end = range_start;

        lock_extent(&inode->io_tree, range_start, range_end, &cached_state);

        ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end);
        if (ret < 0)
                goto out_unlock;
        btrfs_release_path(path);

        path->reada = READA_FORWARD;
        ret = fiemap_search_slot(inode, path, range_start);
        if (ret < 0) {
                goto out_unlock;
        } else if (ret > 0) {
                /*
                 * No file extent item found, but we may have delalloc between
                 * the current offset and i_size. So check for that.
                 */
                ret = 0;
                goto check_eof_delalloc;
        }
        while (prev_extent_end < range_end) {
                struct extent_buffer *leaf = path->nodes[0];
                struct btrfs_file_extent_item *ei;
                struct btrfs_key key;
                u64 extent_end;
                u64 extent_len;
                u64 extent_offset = 0;
                u64 extent_gen;
                u64 disk_bytenr = 0;
                u64 flags = 0;
                int extent_type;
                u8 compression;

                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
                if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
                        break;

                extent_end = btrfs_file_extent_end(path);

                /*
                 * The first iteration can leave us at an extent item that ends
                 * before our range's start. Move to the next item.
                 */
                if (extent_end <= range_start)
                        goto next_item;

                backref_ctx->curr_leaf_bytenr = leaf->start;

                /* We have an implicit hole (NO_HOLES feature enabled). */
                if (prev_extent_end < key.offset) {
                        const u64 hole_end = min(key.offset, range_end) - 1;

                        ret = fiemap_process_hole(inode, fieinfo, &cache,
                                                  &delalloc_cached_state,
                                                  backref_ctx, 0, 0, 0,
                                                  prev_extent_end, hole_end);
                        if (ret < 0) {
                                goto out_unlock;
                        } else if (ret > 0) {
                                /* fiemap_fill_next_extent() told us to stop. */
                                stopped = true;
                                break;
                        }

                        /* We've reached the end of the fiemap range, stop. */
                        if (key.offset >= range_end) {
                                stopped = true;
                                break;
                        }
                }
                extent_len = extent_end - key.offset;
                ei = btrfs_item_ptr(leaf, path->slots[0],
                                    struct btrfs_file_extent_item);
                compression = btrfs_file_extent_compression(leaf, ei);
                extent_type = btrfs_file_extent_type(leaf, ei);
                extent_gen = btrfs_file_extent_generation(leaf, ei);

                if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
                        disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
                        if (compression == BTRFS_COMPRESS_NONE)
                                extent_offset = btrfs_file_extent_offset(leaf, ei);
                }

                if (compression != BTRFS_COMPRESS_NONE)
                        flags |= FIEMAP_EXTENT_ENCODED;

                if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
                        flags |= FIEMAP_EXTENT_DATA_INLINE;
                        flags |= FIEMAP_EXTENT_NOT_ALIGNED;
                        ret = emit_fiemap_extent(fieinfo, &cache, key.offset, 0,
                                                 extent_len, flags);
                } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
                        ret = fiemap_process_hole(inode, fieinfo, &cache,
                                                  &delalloc_cached_state,
                                                  backref_ctx,
                                                  disk_bytenr, extent_offset,
                                                  extent_gen, key.offset,
                                                  extent_end - 1);
                } else if (disk_bytenr == 0) {
                        /* We have an explicit hole. */
                        ret = fiemap_process_hole(inode, fieinfo, &cache,
                                                  &delalloc_cached_state,
                                                  backref_ctx, 0, 0, 0,
                                                  key.offset, extent_end - 1);
                } else {
                        /* We have a regular extent. */
                        if (fieinfo->fi_extents_max) {
                                ret = btrfs_is_data_extent_shared(inode,
                                                                  disk_bytenr,
                                                                  extent_gen,
                                                                  backref_ctx);
                                if (ret < 0)
                                        goto out_unlock;
                                else if (ret > 0)
                                        flags |= FIEMAP_EXTENT_SHARED;
                        }

                        ret = emit_fiemap_extent(fieinfo, &cache, key.offset,
                                                 disk_bytenr + extent_offset,
                                                 extent_len, flags);
                }

                if (ret < 0) {
                        goto out_unlock;
                } else if (ret > 0) {
                        /* emit_fiemap_extent() told us to stop. */
                        stopped = true;
                        break;
                }

                prev_extent_end = extent_end;
next_item:
                if (fatal_signal_pending(current)) {
                        ret = -EINTR;
                        goto out_unlock;
                }

                ret = fiemap_next_leaf_item(inode, path);
                if (ret < 0) {
                        goto out_unlock;
                } else if (ret > 0) {
                        /* No more file extent items for this inode. */
                        break;
                }
        }

check_eof_delalloc:
        if (!stopped && prev_extent_end < range_end) {
                ret = fiemap_process_hole(inode, fieinfo, &cache,
                                          &delalloc_cached_state, backref_ctx,
                                          0, 0, 0, prev_extent_end, range_end - 1);
                if (ret < 0)
                        goto out_unlock;
                prev_extent_end = range_end;
        }

        if (cache.cached && cache.offset + cache.len >= last_extent_end) {
                const u64 i_size = i_size_read(&inode->vfs_inode);

                if (prev_extent_end < i_size) {
                        u64 delalloc_start;
                        u64 delalloc_end;
                        bool delalloc;

                        delalloc = btrfs_find_delalloc_in_range(inode,
                                                                prev_extent_end,
                                                                i_size - 1,
                                                                &delalloc_cached_state,
                                                                &delalloc_start,
                                                                &delalloc_end);
                        if (!delalloc)
                                cache.flags |= FIEMAP_EXTENT_LAST;
                } else {
                        cache.flags |= FIEMAP_EXTENT_LAST;
                }
        }
out_unlock:
        unlock_extent(&inode->io_tree, range_start, range_end, &cached_state);

        if (ret == BTRFS_FIEMAP_FLUSH_CACHE) {
                btrfs_release_path(path);
                ret = flush_fiemap_cache(fieinfo, &cache);
                if (ret)
                        goto out;
                len -= cache.next_search_offset - start;
                start = cache.next_search_offset;
                goto restart;
        } else if (ret < 0) {
                goto out;
        }

        /*
         * Must free the path before emitting to the fiemap buffer because we
         * may have a non-cloned leaf and if the fiemap buffer is memory mapped
         * to a file, a write into it (through btrfs_page_mkwrite()) may trigger
         * waiting for an ordered extent that in order to complete needs to
         * modify that leaf, therefore leading to a deadlock.
         */
        btrfs_free_path(path);
        path = NULL;

        ret = flush_fiemap_cache(fieinfo, &cache);
        if (ret)
                goto out;

        ret = emit_last_fiemap_cache(fieinfo, &cache);
out:
        free_extent_state(delalloc_cached_state);
        kfree(cache.entries);
        btrfs_free_backref_share_ctx(backref_ctx);
        btrfs_free_path(path);
        return ret;
}
int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                 u64 start, u64 len)
{
        struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
        int ret;

        ret = fiemap_prep(inode, fieinfo, start, &len, 0);
        if (ret)
                return ret;

        /*
         * fiemap_prep() called filemap_write_and_wait() for the whole possible
         * file range (0 to LLONG_MAX), but that is not enough if we have
         * compression enabled. The first filemap_fdatawrite_range() only kicks
         * off the compression of data (in an async thread) and will return
         * before the compression is done and writeback is started. A second
         * filemap_fdatawrite_range() is needed to wait for the compression to
         * complete and writeback to start. We also need to wait for ordered
         * extents to complete, because our fiemap implementation uses mainly
         * file extent items to list the extents, searching for extent maps
         * only for file ranges with holes or prealloc extents to figure out
         * if we have delalloc in those ranges.
         */
        if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
                ret = btrfs_wait_ordered_range(btrfs_inode, 0, LLONG_MAX);
                if (ret)
                        return ret;
        }

        btrfs_inode_lock(btrfs_inode, BTRFS_ILOCK_SHARED);

        /*
         * We did an initial flush to avoid holding the inode's lock while
         * triggering writeback and waiting for the completion of IO and ordered
         * extents. Now after we locked the inode we do it again, because it's
         * possible a new write may have happened in between those two steps.
         */
        if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
                ret = btrfs_wait_ordered_range(btrfs_inode, 0, LLONG_MAX);
                if (ret) {
                        btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);
                        return ret;
                }
        }

        ret = extent_fiemap(btrfs_inode, fieinfo, start, len);
        btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);

        return ret;
}