// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/pagemap.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/falloc.h>
#include <linux/writeback.h>
#include <linux/compat.h>
#include <linux/slab.h>
#include <linux/btrfs.h>
#include <linux/uio.h>
#include <linux/iversion.h>
#include <linux/fsverity.h>
#include "direct-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "compression.h"
#include "delalloc-space.h"
#include "accessors.h"
#include "extent-tree.h"
#include "file-item.h"
/*
 * Helper to fault in page and copy.  This should go away and be replaced with
 * calls into generic code.
 */
static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
					 struct folio *folio, struct iov_iter *i)
{
	size_t total_copied = 0;
	int offset = offset_in_page(pos);

	while (write_bytes > 0) {
		size_t count = min_t(size_t, PAGE_SIZE - offset, write_bytes);

		/* Copy data from userspace to the current page. */
		copied = copy_folio_from_iter_atomic(folio, offset, count, i);

		/* Flush processor's dcache for this page */
		flush_dcache_folio(folio);

		/*
		 * If we get a partial write, we can end up with a partially
		 * up to date page.  These add a lot of complexity, so make
		 * sure they don't happen by forcing this copy to be retried.
		 *
		 * The rest of the btrfs_file_write code will fall back to
		 * page at a time copies after we return 0.
		 */
		if (unlikely(copied < count)) {
			if (!folio_test_uptodate(folio))
				iov_iter_revert(i, copied);
		}

		write_bytes -= copied;
		total_copied += copied;
	}
/*
 * Unlock folio after btrfs_file_write() is done with it.
 */
static void btrfs_drop_folio(struct btrfs_fs_info *fs_info, struct folio *folio,
{
	u64 block_start = round_down(pos, fs_info->sectorsize);
	u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start;

	ASSERT(block_len <= U32_MAX);
	/*
	 * Folio checked is some magic around finding folios that have been
	 * modified without going through btrfs_dirty_folio().  Clear it here.
	 * There should be no need to mark the pages accessed as
	 * prepare_one_folio() should have marked them accessed in
	 * prepare_one_folio() via find_or_create_page()
	 */
	btrfs_folio_clamp_clear_checked(fs_info, folio, block_start, block_len);
/*
 * After btrfs_copy_from_user(), update the following things for delalloc:
 * - Mark newly dirtied folio as DELALLOC in the io tree.
 *   Used to advise which range is to be written back.
 * - Mark modified folio as Uptodate/Dirty and not needing COW fixup
 * - Update inode size for past EOF write
 */
int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos,
		      size_t write_bytes, struct extent_state **cached, bool noreserve)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	u64 end_of_last_block;
	u64 end_pos = pos + write_bytes;
	loff_t isize = i_size_read(&inode->vfs_inode);
	unsigned int extra_bits = 0;

	if (write_bytes == 0)

	extra_bits |= EXTENT_NORESERVE;

	start_pos = round_down(pos, fs_info->sectorsize);
	num_bytes = round_up(write_bytes + pos - start_pos,
			     fs_info->sectorsize);
	ASSERT(num_bytes <= U32_MAX);
	ASSERT(folio_pos(folio) <= pos &&
	       folio_pos(folio) + folio_size(folio) >= pos + write_bytes);

	end_of_last_block = start_pos + num_bytes - 1;

	/*
	 * The pages may have already been dirty, clear out old accounting so
	 * we can set things up properly
	 */
	clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block,
			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,

	ret = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,

	btrfs_folio_clamp_set_uptodate(fs_info, folio, start_pos, num_bytes);
	btrfs_folio_clamp_clear_checked(fs_info, folio, start_pos, num_bytes);
	btrfs_folio_clamp_set_dirty(fs_info, folio, start_pos, num_bytes);

	/*
	 * we've only changed i_size in ram, and we haven't updated
	 * the disk i_size.  There is no need to log the inode
	 */
	i_size_write(&inode->vfs_inode, end_pos);
/*
 * this is very complex, but the basic idea is to drop all extents
 * in the range start - end.  hint_block is filled in with a block number
 * that would be a good hint to the block allocator for this file.
 *
 * If an extent intersects the range but is not entirely inside the range
 * it is either truncated or split.  Anything entirely inside the range
 * is deleted from the tree.
 *
 * Note: the VFS' inode number of bytes is not updated, it's up to the caller
 * to deal with that. We set the field 'bytes_found' of the arguments structure
 * with the number of allocated bytes found in the target range, so that the
 * caller can update the inode's number of bytes in an atomic way when
 * replacing extents in a range to avoid races with stat(2).
 */
int btrfs_drop_extents(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_inode *inode,
		       struct btrfs_drop_extents_args *args)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *fi;
	struct btrfs_key key;
	struct btrfs_key new_key;
	u64 ino = btrfs_ino(inode);
	u64 search_start = args->start;
	u64 extent_offset = 0;
	u64 last_end = args->start;
	int modify_tree = -1;
	struct btrfs_path *path = args->path;

	args->bytes_found = 0;
	args->extent_inserted = false;

	/* Must always have a path if ->replace_extent is true */
	ASSERT(!(args->replace_extent && !args->path));

	path = btrfs_alloc_path();

	if (args->drop_cache)
		btrfs_drop_extent_map_range(inode, args->start, args->end - 1, false);

	if (args->start >= inode->disk_i_size && !args->replace_extent)

	update_refs = (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID);

	ret = btrfs_lookup_file_extent(trans, root, path, ino,
				       search_start, modify_tree);
	if (ret > 0 && path->slots[0] > 0 && search_start == args->start) {
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
		if (key.objectid == ino &&
		    key.type == BTRFS_EXTENT_DATA_KEY)

	leaf = path->nodes[0];
	if (path->slots[0] >= btrfs_header_nritems(leaf)) {
		ret = btrfs_next_leaf(root, path);
		leaf = path->nodes[0];

	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	if (key.objectid > ino)
	if (WARN_ON_ONCE(key.objectid < ino) ||
	    key.type < BTRFS_EXTENT_DATA_KEY) {
	if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= args->end)

	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	extent_type = btrfs_file_extent_type(leaf, fi);

	if (extent_type == BTRFS_FILE_EXTENT_REG ||
	    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
		disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
		num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
		extent_offset = btrfs_file_extent_offset(leaf, fi);
		extent_end = key.offset +
			btrfs_file_extent_num_bytes(leaf, fi);
	} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
		extent_end = key.offset +
			btrfs_file_extent_ram_bytes(leaf, fi);

	/*
	 * Don't skip extent items representing 0 byte lengths. They
	 * used to be created (bug) if while punching holes we hit
	 * -ENOSPC condition. So if we find one here, just ensure we
	 * delete it, otherwise we would insert a new file extent item
	 * with the same key (offset) as that 0 bytes length file
	 * extent item in the call to setup_items_for_insert() later
	 */
	if (extent_end == key.offset && extent_end >= search_start) {
		last_end = extent_end;
		goto delete_extent_item;

	if (extent_end <= search_start) {

	search_start = max(key.offset, args->start);
	if (recow || !modify_tree) {
		btrfs_release_path(path);
	/*
	 *     | - range to drop - |
	 *  | -------- extent -------- |
	 */
	if (args->start > key.offset && args->end < extent_end) {
		if (extent_type == BTRFS_FILE_EXTENT_INLINE) {

		memcpy(&new_key, &key, sizeof(new_key));
		new_key.offset = args->start;
		ret = btrfs_duplicate_item(trans, root, path,
		if (ret == -EAGAIN) {
			btrfs_release_path(path);

		leaf = path->nodes[0];
		fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
				    struct btrfs_file_extent_item);
		btrfs_set_file_extent_num_bytes(leaf, fi,
						args->start - key.offset);

		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);

		extent_offset += args->start - key.offset;
		btrfs_set_file_extent_offset(leaf, fi, extent_offset);
		btrfs_set_file_extent_num_bytes(leaf, fi,
						extent_end - args->start);
		btrfs_mark_buffer_dirty(trans, leaf);

		if (update_refs && disk_bytenr > 0) {
			struct btrfs_ref ref = {
				.action = BTRFS_ADD_DELAYED_REF,
				.bytenr = disk_bytenr,
				.num_bytes = num_bytes,
				.owning_root = btrfs_root_id(root),
				.ref_root = btrfs_root_id(root),
			btrfs_init_data_ref(&ref, new_key.objectid,
					    args->start - extent_offset,
			ret = btrfs_inc_extent_ref(trans, &ref);
				btrfs_abort_transaction(trans, ret);
		key.offset = args->start;

	/*
	 * From here on out we will have actually dropped something, so
	 * last_end can be updated.
	 */
	last_end = extent_end;

	/*
	 *  | ---- range to drop ----- |
	 *      | -------- extent -------- |
	 */
	if (args->start <= key.offset && args->end < extent_end) {
		if (extent_type == BTRFS_FILE_EXTENT_INLINE) {

		memcpy(&new_key, &key, sizeof(new_key));
		new_key.offset = args->end;
		btrfs_set_item_key_safe(trans, path, &new_key);

		extent_offset += args->end - key.offset;
		btrfs_set_file_extent_offset(leaf, fi, extent_offset);
		btrfs_set_file_extent_num_bytes(leaf, fi,
						extent_end - args->end);
		btrfs_mark_buffer_dirty(trans, leaf);
		if (update_refs && disk_bytenr > 0)
			args->bytes_found += args->end - key.offset;

	search_start = extent_end;
	/*
	 *       | ---- range to drop ----- |
	 *  | -------- extent -------- |
	 */
	if (args->start > key.offset && args->end >= extent_end) {
		if (extent_type == BTRFS_FILE_EXTENT_INLINE) {

		btrfs_set_file_extent_num_bytes(leaf, fi,
						args->start - key.offset);
		btrfs_mark_buffer_dirty(trans, leaf);
		if (update_refs && disk_bytenr > 0)
			args->bytes_found += extent_end - args->start;
		if (args->end == extent_end)

	/*
	 *  | ---- range to drop ----- |
	 *    | ------ extent ------ |
	 */
	if (args->start <= key.offset && args->end >= extent_end) {
		del_slot = path->slots[0];
			BUG_ON(del_slot + del_nr != path->slots[0]);

		    extent_type == BTRFS_FILE_EXTENT_INLINE) {
			args->bytes_found += extent_end - key.offset;
			extent_end = ALIGN(extent_end,
					   fs_info->sectorsize);
		} else if (update_refs && disk_bytenr > 0) {
			struct btrfs_ref ref = {
				.action = BTRFS_DROP_DELAYED_REF,
				.bytenr = disk_bytenr,
				.num_bytes = num_bytes,
				.owning_root = btrfs_root_id(root),
				.ref_root = btrfs_root_id(root),
			btrfs_init_data_ref(&ref, key.objectid,
					    key.offset - extent_offset,
			ret = btrfs_free_extent(trans, &ref);
				btrfs_abort_transaction(trans, ret);
			args->bytes_found += extent_end - key.offset;

		if (args->end == extent_end)

		if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
			ret = btrfs_del_items(trans, root, path, del_slot,
				btrfs_abort_transaction(trans, ret);
			btrfs_release_path(path);

	if (!ret && del_nr > 0) {
		/*
		 * Set path->slots[0] to first slot, so that after the delete
		 * if items are move off from our leaf to its immediate left or
		 * right neighbor leafs, we end up with a correct and adjusted
		 * path->slots[0] for our insertion (if args->replace_extent).
		 */
		path->slots[0] = del_slot;
		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
			btrfs_abort_transaction(trans, ret);

	leaf = path->nodes[0];
	/*
	 * If btrfs_del_items() was called, it might have deleted a leaf, in
	 * which case it unlocked our path, so check path->locks[0] matches a
	 * write lock.
	 */
	if (!ret && args->replace_extent &&
	    path->locks[0] == BTRFS_WRITE_LOCK &&
	    btrfs_leaf_free_space(leaf) >=
	    sizeof(struct btrfs_item) + args->extent_item_size) {

		key.type = BTRFS_EXTENT_DATA_KEY;
		key.offset = args->start;
		if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
			struct btrfs_key slot_key;

			btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]);
			if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
		btrfs_setup_item_for_insert(trans, root, path, &key,
					    args->extent_item_size);
		args->extent_inserted = true;

		btrfs_free_path(path);
	else if (!args->extent_inserted)
		btrfs_release_path(path);

	args->drop_end = found ? min(args->end, last_end) : args->end;
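
/*
 * Helper for btrfs_mark_extent_written(): check whether the file extent item
 * at @slot references the same unencoded extent (same disk bytenr and
 * original offset, no compression/encryption), so the item is a candidate
 * for merging with the extent being marked as written.
 */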
static int extent_mergeable(struct extent_buffer *leaf, int slot,
			    u64 objectid, u64 bytenr, u64 orig_offset,
			    u64 *start, u64 *end)
{
	struct btrfs_file_extent_item *fi;
	struct btrfs_key key;

	if (slot < 0 || slot >= btrfs_header_nritems(leaf))

	btrfs_item_key_to_cpu(leaf, &key, slot);
	if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)

	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
	    btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
	    btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
	    btrfs_file_extent_compression(leaf, fi) ||
	    btrfs_file_extent_encryption(leaf, fi) ||
	    btrfs_file_extent_other_encoding(leaf, fi))

	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
	if ((*start && *start != key.offset) || (*end && *end != extent_end))
/*
 * Mark extent in the range start - end as written.
 *
 * This changes extent type from 'pre-allocated' to 'regular'. If only
 * part of extent is marked as written, the extent will be split into
 * two or three.
 */
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
			      struct btrfs_inode *inode, u64 start, u64 end)
{
	struct btrfs_root *root = inode->root;
	struct extent_buffer *leaf;
	struct btrfs_path *path;
	struct btrfs_file_extent_item *fi;
	struct btrfs_ref ref = { 0 };
	struct btrfs_key key;
	struct btrfs_key new_key;
	u64 ino = btrfs_ino(inode);

	path = btrfs_alloc_path();

	key.type = BTRFS_EXTENT_DATA_KEY;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0 && path->slots[0] > 0)

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	if (key.objectid != ino ||
	    key.type != BTRFS_EXTENT_DATA_KEY) {
		btrfs_abort_transaction(trans, ret);

	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC) {
		btrfs_abort_transaction(trans, ret);

	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
	if (key.offset > start || extent_end < end) {
		btrfs_abort_transaction(trans, ret);

	bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
	num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
	orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
	memcpy(&new_key, &key, sizeof(new_key));

	if (start == key.offset && end < extent_end) {
		if (extent_mergeable(leaf, path->slots[0] - 1,
				     ino, bytenr, orig_offset,
				     &other_start, &other_end)) {
			new_key.offset = end;
			btrfs_set_item_key_safe(trans, path, &new_key);
			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_generation(leaf, fi,
			btrfs_set_file_extent_num_bytes(leaf, fi,
			btrfs_set_file_extent_offset(leaf, fi,
			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_generation(leaf, fi,
			btrfs_set_file_extent_num_bytes(leaf, fi,
			btrfs_mark_buffer_dirty(trans, leaf);

	if (start > key.offset && end == extent_end) {
		if (extent_mergeable(leaf, path->slots[0] + 1,
				     ino, bytenr, orig_offset,
				     &other_start, &other_end)) {
			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_num_bytes(leaf, fi,
			btrfs_set_file_extent_generation(leaf, fi,
			new_key.offset = start;
			btrfs_set_item_key_safe(trans, path, &new_key);

			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_generation(leaf, fi,
			btrfs_set_file_extent_num_bytes(leaf, fi,
			btrfs_set_file_extent_offset(leaf, fi,
							start - orig_offset);
			btrfs_mark_buffer_dirty(trans, leaf);
	while (start > key.offset || end < extent_end) {
		if (key.offset == start)

		new_key.offset = split;
		ret = btrfs_duplicate_item(trans, root, path, &new_key);
		if (ret == -EAGAIN) {
			btrfs_release_path(path);
			btrfs_abort_transaction(trans, ret);

		leaf = path->nodes[0];
		fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
				    struct btrfs_file_extent_item);
		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
		btrfs_set_file_extent_num_bytes(leaf, fi,

		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);

		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
		btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
		btrfs_set_file_extent_num_bytes(leaf, fi,
		btrfs_mark_buffer_dirty(trans, leaf);

		ref.action = BTRFS_ADD_DELAYED_REF;
		ref.num_bytes = num_bytes;
		ref.owning_root = btrfs_root_id(root);
		ref.ref_root = btrfs_root_id(root);
		btrfs_init_data_ref(&ref, ino, orig_offset, 0, false);
		ret = btrfs_inc_extent_ref(trans, &ref);
			btrfs_abort_transaction(trans, ret);

		if (split == start) {
			if (start != key.offset) {
				btrfs_abort_transaction(trans, ret);

	ref.action = BTRFS_DROP_DELAYED_REF;
	ref.num_bytes = num_bytes;
	ref.owning_root = btrfs_root_id(root);
	ref.ref_root = btrfs_root_id(root);
	btrfs_init_data_ref(&ref, ino, orig_offset, 0, false);
	if (extent_mergeable(leaf, path->slots[0] + 1,
			     ino, bytenr, orig_offset,
			     &other_start, &other_end)) {
			btrfs_release_path(path);
		extent_end = other_end;
		del_slot = path->slots[0] + 1;
		ret = btrfs_free_extent(trans, &ref);
			btrfs_abort_transaction(trans, ret);

	if (extent_mergeable(leaf, path->slots[0] - 1,
			     ino, bytenr, orig_offset,
			     &other_start, &other_end)) {
			btrfs_release_path(path);
		key.offset = other_start;
		del_slot = path->slots[0];
		ret = btrfs_free_extent(trans, &ref);
			btrfs_abort_transaction(trans, ret);

	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	btrfs_set_file_extent_type(leaf, fi,
				   BTRFS_FILE_EXTENT_REG);
	btrfs_set_file_extent_generation(leaf, fi, trans->transid);
	btrfs_mark_buffer_dirty(trans, leaf);

	fi = btrfs_item_ptr(leaf, del_slot - 1,
			    struct btrfs_file_extent_item);
	btrfs_set_file_extent_type(leaf, fi,
				   BTRFS_FILE_EXTENT_REG);
	btrfs_set_file_extent_generation(leaf, fi, trans->transid);
	btrfs_set_file_extent_num_bytes(leaf, fi,
					extent_end - key.offset);
	btrfs_mark_buffer_dirty(trans, leaf);

	ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
		btrfs_abort_transaction(trans, ret);

	btrfs_free_path(path);
/*
 * On error return an unlocked folio and the error value
 * On success return a locked folio and 0
 */
static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64 pos,
				  u64 len, bool force_uptodate)
{
	u64 clamp_start = max_t(u64, pos, folio_pos(folio));
	u64 clamp_end = min_t(u64, pos + len, folio_pos(folio) + folio_size(folio));

	if (folio_test_uptodate(folio))

	if (!force_uptodate &&
	    IS_ALIGNED(clamp_start, PAGE_SIZE) &&
	    IS_ALIGNED(clamp_end, PAGE_SIZE))

	ret = btrfs_read_folio(NULL, folio);

	if (!folio_test_uptodate(folio)) {

	/*
	 * Since btrfs_read_folio() will unlock the folio before it returns,
	 * there is a window where btrfs_release_folio() can be called to
	 * release the page.  Here we check both inode mapping and page
	 * private to make sure the page was not released.
	 *
	 * The private flag check is essential for subpage as we need to store
	 * extra bitmap using folio private.
	 */
	if (folio->mapping != inode->i_mapping || !folio_test_private(folio)) {
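
/*
 * GFP mask used when allocating folios for buffered writes: start from the
 * mapping's allocation mask and, for NOWAIT writes, clear
 * __GFP_DIRECT_RECLAIM so the allocation cannot block.
 */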
static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait)
{
	gfp = btrfs_alloc_write_mask(inode->i_mapping);

		gfp &= ~__GFP_DIRECT_RECLAIM;
/*
 * Get folio into the page cache and lock it.
 */
static noinline int prepare_one_folio(struct inode *inode, struct folio **folio_ret,
				      loff_t pos, size_t write_bytes,
				      bool force_uptodate, bool nowait)
{
	unsigned long index = pos >> PAGE_SHIFT;
	gfp_t mask = get_prepare_gfp_flags(inode, nowait);
	fgf_t fgp_flags = (nowait ? FGP_WRITEBEGIN | FGP_NOWAIT : FGP_WRITEBEGIN);

	folio = __filemap_get_folio(inode->i_mapping, index, fgp_flags, mask);
		ret = PTR_ERR(folio);

	/* Only support page sized folio yet. */
	ASSERT(folio_order(folio) == 0);
	ret = set_folio_extent_mapped(folio);

	ret = prepare_uptodate_folio(inode, folio, pos, write_bytes, force_uptodate);
		/* The folio is already unlocked. */
		if (!nowait && ret == -EAGAIN) {
/*
 * Locks the extent and properly waits for data=ordered extents to finish
 * before allowing the folios to be modified if needed.
 *
 * 1 - the extent is locked
 * 0 - the extent is not locked, and everything is OK
 * -EAGAIN - need to prepare the folios again
 */
static noinline int
lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio,
				loff_t pos, size_t write_bytes,
				u64 *lockstart, u64 *lockend, bool nowait,
				struct extent_state **cached_state)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;

	start_pos = round_down(pos, fs_info->sectorsize);
	last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - 1;

	if (start_pos < inode->vfs_inode.i_size) {
		struct btrfs_ordered_extent *ordered;

		if (!try_lock_extent(&inode->io_tree, start_pos, last_pos,

		lock_extent(&inode->io_tree, start_pos, last_pos, cached_state);

		ordered = btrfs_lookup_ordered_range(inode, start_pos,
						     last_pos - start_pos + 1);
		    ordered->file_offset + ordered->num_bytes > start_pos &&
		    ordered->file_offset <= last_pos) {
			unlock_extent(&inode->io_tree, start_pos, last_pos,

			btrfs_start_ordered_extent(ordered);
			btrfs_put_ordered_extent(ordered);

		btrfs_put_ordered_extent(ordered);

		*lockstart = start_pos;

	/*
	 * We should be called after prepare_one_folio() which should have locked
	 * all pages in the range.
	 */
	WARN_ON(!folio_test_locked(folio));
/*
 * Check if we can do nocow write into the range [@pos, @pos + @write_bytes)
 *
 * @pos:         File offset.
 * @write_bytes: The length to write, will be updated to the nocow writeable
 *               range.
 *
 * This function will flush ordered extents in the range to ensure proper
 * nocow checks.
 *
 * > 0          If we can nocow, and updates @write_bytes.
 * 0            If we can't do a nocow write.
 * -EAGAIN      If we can't do a nocow write because snapshoting of the inode's
 *              root is in progress.
 * < 0          If an error happened.
 *
 * NOTE: Callers need to call btrfs_check_nocow_unlock() if we return > 0.
 */
int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
			   size_t *write_bytes, bool nowait)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_root *root = inode->root;
	struct extent_state *cached_state = NULL;
	u64 lockstart, lockend;

	if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))

	if (!btrfs_drew_try_write_lock(&root->snapshot_lock))

	lockstart = round_down(pos, fs_info->sectorsize);
	lockend = round_up(pos + *write_bytes,
			   fs_info->sectorsize) - 1;
	num_bytes = lockend - lockstart + 1;

	if (!btrfs_try_lock_ordered_range(inode, lockstart, lockend,
		btrfs_drew_write_unlock(&root->snapshot_lock);

	btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend,

	ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
			       NULL, nowait, false);
		btrfs_drew_write_unlock(&root->snapshot_lock);
		*write_bytes = min_t(size_t, *write_bytes,
				     num_bytes - pos + lockstart);
	unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
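
/* Release the snapshot (drew) lock taken by btrfs_check_nocow_lock(). */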
void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
{
	btrfs_drew_write_unlock(&inode->root->snapshot_lock);
}
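
/*
 * Common checks done before buffered, direct and encoded writes: strip
 * privileges, update mtime/ctime and the inode version, and expand a hole
 * with btrfs_cont_expand() when the write starts beyond the current i_size.
 */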
int btrfs_write_check(struct kiocb *iocb, size_t count)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	loff_t pos = iocb->ki_pos;

	/*
	 * Quickly bail out on NOWAIT writes if we don't have the nodatacow or
	 * prealloc flags, as without those flags we always have to COW. We will
	 * later check if we can really COW into the target range (using
	 * can_nocow_extent() at btrfs_get_blocks_direct_write()).
	 */
	if ((iocb->ki_flags & IOCB_NOWAIT) &&
	    !(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))

	ret = file_remove_privs(file);

	/*
	 * We reserve space for updating the inode when we reserve space for the
	 * extent we are going to write, so we will enospc out there.  We don't
	 * need to start yet another transaction to update the inode as we will
	 * update the inode when we finish writing whatever data we write.
	 */
	if (!IS_NOCMTIME(inode)) {
		inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
		inode_inc_iversion(inode);
	}

	start_pos = round_down(pos, fs_info->sectorsize);
	oldsize = i_size_read(inode);
	if (start_pos > oldsize) {
		/* Expand hole size to cover write data, preventing empty gap */
		loff_t end_pos = round_up(pos + count, fs_info->sectorsize);

		ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, end_pos);
btrfs_buffered_write(struct kiocb
*iocb
, struct iov_iter
*i
)
1121 struct file
*file
= iocb
->ki_filp
;
1123 struct inode
*inode
= file_inode(file
);
1124 struct btrfs_fs_info
*fs_info
= inode_to_fs_info(inode
);
1125 struct extent_changeset
*data_reserved
= NULL
;
1126 u64 release_bytes
= 0;
1129 size_t num_written
= 0;
1131 loff_t old_isize
= i_size_read(inode
);
1132 unsigned int ilock_flags
= 0;
1133 const bool nowait
= (iocb
->ki_flags
& IOCB_NOWAIT
);
1134 unsigned int bdp_flags
= (nowait
? BDP_ASYNC
: 0);
1135 bool only_release_metadata
= false;
1138 ilock_flags
|= BTRFS_ILOCK_TRY
;
1140 ret
= btrfs_inode_lock(BTRFS_I(inode
), ilock_flags
);
1144 ret
= generic_write_checks(iocb
, i
);
1148 ret
= btrfs_write_check(iocb
, ret
);
1153 while (iov_iter_count(i
) > 0) {
1154 struct extent_state
*cached_state
= NULL
;
1155 size_t offset
= offset_in_page(pos
);
1156 size_t sector_offset
;
1157 size_t write_bytes
= min(iov_iter_count(i
), PAGE_SIZE
- offset
);
1158 size_t reserve_bytes
;
1160 size_t dirty_sectors
;
1162 struct folio
*folio
= NULL
;
1164 bool force_page_uptodate
= false;
1167 * Fault pages before locking them in prepare_one_folio()
1168 * to avoid recursive lock
1170 if (unlikely(fault_in_iov_iter_readable(i
, write_bytes
))) {
1175 only_release_metadata
= false;
1176 sector_offset
= pos
& (fs_info
->sectorsize
- 1);
1178 extent_changeset_release(data_reserved
);
1179 ret
= btrfs_check_data_free_space(BTRFS_I(inode
),
1180 &data_reserved
, pos
,
1181 write_bytes
, nowait
);
1185 if (nowait
&& (ret
== -ENOSPC
|| ret
== -EAGAIN
)) {
1191 * If we don't have to COW at the offset, reserve
1192 * metadata only. write_bytes may get smaller than
1195 can_nocow
= btrfs_check_nocow_lock(BTRFS_I(inode
), pos
,
1196 &write_bytes
, nowait
);
1203 only_release_metadata
= true;
1206 reserve_bytes
= round_up(write_bytes
+ sector_offset
,
1207 fs_info
->sectorsize
);
1208 WARN_ON(reserve_bytes
== 0);
1209 ret
= btrfs_delalloc_reserve_metadata(BTRFS_I(inode
),
1211 reserve_bytes
, nowait
);
1213 if (!only_release_metadata
)
1214 btrfs_free_reserved_data_space(BTRFS_I(inode
),
1218 btrfs_check_nocow_unlock(BTRFS_I(inode
));
1220 if (nowait
&& ret
== -ENOSPC
)
1225 release_bytes
= reserve_bytes
;
1227 ret
= balance_dirty_pages_ratelimited_flags(inode
->i_mapping
, bdp_flags
);
1229 btrfs_delalloc_release_extents(BTRFS_I(inode
), reserve_bytes
);
1233 ret
= prepare_one_folio(inode
, &folio
, pos
, write_bytes
,
1234 force_page_uptodate
, false);
1236 btrfs_delalloc_release_extents(BTRFS_I(inode
),
1241 extents_locked
= lock_and_cleanup_extent_if_need(BTRFS_I(inode
),
1242 folio
, pos
, write_bytes
, &lockstart
,
1243 &lockend
, nowait
, &cached_state
);
1244 if (extents_locked
< 0) {
1245 if (!nowait
&& extents_locked
== -EAGAIN
)
1248 btrfs_delalloc_release_extents(BTRFS_I(inode
),
1250 ret
= extents_locked
;
1254 copied
= btrfs_copy_from_user(pos
, write_bytes
, folio
, i
);
1256 num_sectors
= BTRFS_BYTES_TO_BLKS(fs_info
, reserve_bytes
);
1257 dirty_sectors
= round_up(copied
+ sector_offset
,
1258 fs_info
->sectorsize
);
1259 dirty_sectors
= BTRFS_BYTES_TO_BLKS(fs_info
, dirty_sectors
);
1262 force_page_uptodate
= true;
1265 force_page_uptodate
= false;
1268 if (num_sectors
> dirty_sectors
) {
1269 /* release everything except the sectors we dirtied */
1270 release_bytes
-= dirty_sectors
<< fs_info
->sectorsize_bits
;
1271 if (only_release_metadata
) {
1272 btrfs_delalloc_release_metadata(BTRFS_I(inode
),
1273 release_bytes
, true);
1275 u64 release_start
= round_up(pos
+ copied
,
1276 fs_info
->sectorsize
);
1277 btrfs_delalloc_release_space(BTRFS_I(inode
),
1278 data_reserved
, release_start
,
1279 release_bytes
, true);
1283 release_bytes
= round_up(copied
+ sector_offset
,
1284 fs_info
->sectorsize
);
1286 ret
= btrfs_dirty_folio(BTRFS_I(inode
), folio
, pos
, copied
,
1287 &cached_state
, only_release_metadata
);
1290 * If we have not locked the extent range, because the range's
1291 * start offset is >= i_size, we might still have a non-NULL
1292 * cached extent state, acquired while marking the extent range
1293 * as delalloc through btrfs_dirty_page(). Therefore free any
1294 * possible cached extent state to avoid a memory leak.
1297 unlock_extent(&BTRFS_I(inode
)->io_tree
, lockstart
,
1298 lockend
, &cached_state
);
1300 free_extent_state(cached_state
);
1302 btrfs_delalloc_release_extents(BTRFS_I(inode
), reserve_bytes
);
1304 btrfs_drop_folio(fs_info
, folio
, pos
, copied
);
1309 if (only_release_metadata
)
1310 btrfs_check_nocow_unlock(BTRFS_I(inode
));
1312 btrfs_drop_folio(fs_info
, folio
, pos
, copied
);
1317 num_written
+= copied
;
1320 if (release_bytes
) {
1321 if (only_release_metadata
) {
1322 btrfs_check_nocow_unlock(BTRFS_I(inode
));
1323 btrfs_delalloc_release_metadata(BTRFS_I(inode
),
1324 release_bytes
, true);
1326 btrfs_delalloc_release_space(BTRFS_I(inode
),
1328 round_down(pos
, fs_info
->sectorsize
),
1329 release_bytes
, true);
1333 extent_changeset_free(data_reserved
);
1334 if (num_written
> 0) {
1335 pagecache_isize_extended(inode
, old_isize
, iocb
->ki_pos
);
1336 iocb
->ki_pos
+= num_written
;
1339 btrfs_inode_unlock(BTRFS_I(inode
), ilock_flags
);
1340 return num_written
? num_written
: ret
;
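
/*
 * Write data that is already encoded (e.g. compressed) as an on-disk extent:
 * take the inode lock, refuse writes that generic_write_checks_count() would
 * truncate (a partial encoded write is not possible) and hand the iovec over
 * to btrfs_do_encoded_write().
 */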
static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from,
				   const struct btrfs_ioctl_encoded_io_args *encoded)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);

	btrfs_inode_lock(BTRFS_I(inode), 0);
	count = encoded->len;
	ret = generic_write_checks_count(iocb, &count);
	if (ret == 0 && count != encoded->len) {
		/*
		 * The write got truncated by generic_write_checks_count(). We
		 * can't do a partial encoded write.
		 */
	if (ret || encoded->len == 0)

	ret = btrfs_write_check(iocb, encoded->len);

	ret = btrfs_do_encoded_write(iocb, from, encoded);

	btrfs_inode_unlock(BTRFS_I(inode), 0);
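
/*
 * Write entry point shared by the write_iter file operation and the encoded
 * write ioctl: dispatch to the encoded, direct or buffered implementation and,
 * when required, sync the written range via generic_write_sync().
 */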
ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
			    const struct btrfs_ioctl_encoded_io_args *encoded)
{
	struct file *file = iocb->ki_filp;
	struct btrfs_inode *inode = BTRFS_I(file_inode(file));
	ssize_t num_written, num_sync;

	/*
	 * If the fs flips readonly due to some impossible error, although we
	 * have opened a file as writable, we have to stop this write operation
	 * to ensure consistency.
	 */
	if (BTRFS_FS_ERROR(inode->root->fs_info))

	if (encoded && (iocb->ki_flags & IOCB_NOWAIT))

		num_written = btrfs_encoded_write(iocb, from, encoded);
		num_sync = encoded->len;
	} else if (iocb->ki_flags & IOCB_DIRECT) {
		num_written = btrfs_direct_write(iocb, from);
		num_sync = num_written;

		num_written = btrfs_buffered_write(iocb, from);
		num_sync = num_written;

	btrfs_set_inode_last_sub_trans(inode);

		num_sync = generic_write_sync(iocb, num_sync);
			num_written = num_sync;
static ssize_t btrfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	return btrfs_do_write_iter(iocb, from, NULL);
}
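
/*
 * Called on the last close of a file: free the readdir buffer and llseek
 * state kept in file->private_data, and optionally flush delalloc when the
 * inode was marked for flush-on-close by truncate (see below).
 */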
int btrfs_release_file(struct inode *inode, struct file *filp)
{
	struct btrfs_file_private *private = filp->private_data;

		kfree(private->filldir_buf);
		free_extent_state(private->llseek_cached_state);
	filp->private_data = NULL;

	/*
	 * Set by setattr when we are about to truncate a file from a non-zero
	 * size to a zero size.  This tries to flush down new bytes that may
	 * have been written if the application were using truncate to replace
	 * a file in place.
	 */
	if (test_and_clear_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
			       &BTRFS_I(inode)->runtime_flags))
		filemap_flush(inode->i_mapping);
static int start_ordered_ops(struct btrfs_inode *inode, loff_t start, loff_t end)
{
	struct blk_plug plug;

	/*
	 * This is only called in fsync, which would do synchronous writes, so
	 * a plug can merge adjacent IOs as much as possible.  Esp. in case of
	 * multiple disks using raid profile, a large IO can be split to
	 * several segments of stripe length (currently 64K).
	 */
	blk_start_plug(&plug);
	ret = btrfs_fdatawrite_range(inode, start, end);
	blk_finish_plug(&plug);
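
/*
 * Return true if fsync can skip logging the inode because everything it cares
 * about is already persisted in the current or a committed transaction.
 */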
static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
{
	struct btrfs_inode *inode = ctx->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;

	if (btrfs_inode_in_log(inode, btrfs_get_fs_generation(fs_info)) &&
	    list_empty(&ctx->ordered_extents))

	/*
	 * If we are doing a fast fsync we can not bail out if the inode's
	 * last_trans is <= than the last committed transaction, because we only
	 * update the last_trans of the inode during ordered extent completion,
	 * and for a fast fsync we don't wait for that, we only wait for the
	 * writeback to complete.
	 */
	if (inode->last_trans <= btrfs_get_last_trans_committed(fs_info) &&
	    (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) ||
	     list_empty(&ctx->ordered_extents)))
/*
 * fsync call for both files and directories.  This logs the inode into
 * the tree log instead of forcing full commits whenever possible.
 *
 * It needs to call filemap_fdatawait so that all ordered extent updates
 * in the metadata btree are up to date for copying to the log.
 *
 * It drops the inode mutex before doing the tree log commit.  This is an
 * important optimization for directories because holding the mutex prevents
 * new operations on the dir while we write to disk.
 */
int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
	struct dentry *dentry = file_dentry(file);
	struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_trans_handle *trans;
	struct btrfs_log_ctx ctx;
	bool skip_ilock = false;

	if (current->journal_info == BTRFS_TRANS_DIO_WRITE_STUB) {
		current->journal_info = NULL;
		btrfs_assert_inode_locked(inode);

	trace_btrfs_sync_file(file, datasync);

	btrfs_init_log_ctx(&ctx, inode);

	/*
	 * Always set the range to a full range, otherwise we can get into
	 * several problems, from missing file extent items to represent holes
	 * when not using the NO_HOLES feature, to log tree corruption due to
	 * races between hole detection during logging and completion of ordered
	 * extents outside the range, to missing checksums due to ordered extents
	 * for which we flushed only a subset of their pages.
	 */
	len = (u64)LLONG_MAX + 1;

	/*
	 * We write the dirty pages in the range and wait until they complete
	 * out of the ->i_mutex. If so, we can flush the dirty pages by
	 * multi-task, and make the performance up.  See
	 * btrfs_wait_ordered_range for an explanation of the ASYNC check.
	 */
	ret = start_ordered_ops(inode, start, end);

		down_write(&inode->i_mmap_lock);
		btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);

	atomic_inc(&root->log_batch);

	/*
	 * Before we acquired the inode's lock and the mmap lock, someone may
	 * have dirtied more pages in the target range. We need to make sure
	 * that writeback for any such pages does not start while we are logging
	 * the inode, because if it does, any of the following might happen when
	 * we are not doing a full inode sync:
	 *
	 * 1) We log an extent after its writeback finishes but before its
	 *    checksums are added to the csum tree, leading to -EIO errors
	 *    when attempting to read the extent after a log replay.
	 *
	 * 2) We can end up logging an extent before its writeback finishes.
	 *    Therefore after the log replay we will have a file extent item
	 *    pointing to an unwritten extent (and no data checksums as well).
	 *
	 * So trigger writeback for any eventual new dirty pages and then we
	 * wait for all ordered extents to complete below.
	 */
	ret = start_ordered_ops(inode, start, end);
		up_write(&inode->i_mmap_lock);
		btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);

	/*
	 * Always check for the full sync flag while holding the inode's lock,
	 * to avoid races with other tasks. The flag must be either set all the
	 * time during logging or always off all the time while logging.
	 * We check the flag here after starting delalloc above, because when
	 * running delalloc the full sync flag may be set if we need to drop
	 * extra extent map ranges due to temporary memory allocation failures.
	 */
	full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);

	/*
	 * We have to do this here to avoid the priority inversion of waiting on
	 * IO of a lower priority task while holding a transaction open.
	 *
	 * For a full fsync we wait for the ordered extents to complete while
	 * for a fast fsync we wait just for writeback to complete, and then
	 * attach the ordered extents to the transaction so that a transaction
	 * commit waits for their completion, to avoid data loss if we fsync,
	 * the current transaction commits before the ordered extents complete
	 * and a power failure happens right after that.
	 *
	 * For zoned filesystem, if a write IO uses a ZONE_APPEND command, the
	 * logical address recorded in the ordered extent may change. We need
	 * to wait for the IO to stabilize the logical address.
	 */
	if (full_sync || btrfs_is_zoned(fs_info)) {
		ret = btrfs_wait_ordered_range(inode, start, len);
		clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags);

		/*
		 * Get our ordered extents as soon as possible to avoid doing
		 * checksum lookups in the csum tree, and use instead the
		 * checksums attached to the ordered extents.
		 */
		btrfs_get_ordered_extents_for_logging(inode, &ctx.ordered_extents);
		ret = filemap_fdatawait_range(inode->vfs_inode.i_mapping, start, end);
			goto out_release_extents;

		/*
		 * Check and clear the BTRFS_INODE_COW_WRITE_ERROR now after
		 * starting and waiting for writeback, because for buffered IO
		 * it may have been set during the end IO callback
		 * (end_bbio_data_write() -> btrfs_finish_ordered_extent()) in
		 * case an error happened and we need to wait for ordered
		 * extents to complete so that any extent maps that point to
		 * unwritten locations are dropped and we don't log them.
		 */
		if (test_and_clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags))
			ret = btrfs_wait_ordered_range(inode, start, len);

		goto out_release_extents;

	atomic_inc(&root->log_batch);

	if (skip_inode_logging(&ctx)) {
		/*
		 * We've had everything committed since the last time we were
		 * modified so clear this flag in case it was set for whatever
		 * reason, it's no longer relevant.
		 */
		clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
		/*
		 * An ordered extent might have started before and completed
		 * already with io errors, in which case the inode was not
		 * updated and we end up here. So check the inode's mapping
		 * for any errors that might have happened since the last
		 * time fsync was called.
		 */
		ret = filemap_check_wb_err(inode->vfs_inode.i_mapping, file->f_wb_err);
		goto out_release_extents;

	btrfs_init_log_ctx_scratch_eb(&ctx);

	/*
	 * We use start here because we will need to wait on the IO to complete
	 * in btrfs_sync_log, which could require joining a transaction (for
	 * example checking cross references in the nocow path).  If we use join
	 * here we could get into a situation where we're waiting on IO to
	 * happen that is blocked on a transaction trying to commit.  With start
	 * we inc the extwriter counter, so we wait for all extwriters to exit
	 * before we start blocking joiners.  This comment is to keep somebody
	 * from thinking they are super smart and changing this to
	 * btrfs_join_transaction *cough*Josef*cough*.
	 */
	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_release_extents;
	trans->in_fsync = true;

	ret = btrfs_log_dentry_safe(trans, dentry, &ctx);
	/*
	 * Scratch eb no longer needed, release before syncing log or commit
	 * transaction, to avoid holding unnecessary memory during such long
	 * operation.
	 */
	if (ctx.scratch_eb) {
		free_extent_buffer(ctx.scratch_eb);
		ctx.scratch_eb = NULL;
	}
	btrfs_release_log_ctx_extents(&ctx);
		/* Fallthrough and commit/free transaction. */
		ret = BTRFS_LOG_FORCE_COMMIT;

	/* we've logged all the items and now have a consistent
	 * version of the file in the log.  It is possible that
	 * someone will come in and modify the file, but that's
	 * fine because the log is consistent on disk, and we
	 * have references to all of the file's extents
	 *
	 * It is possible that someone will come in and log the
	 * file again, but that will end up using the synchronization
	 * inside btrfs_sync_log to keep things safe.
	 */
		up_write(&inode->i_mmap_lock);
		btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);

	if (ret == BTRFS_NO_LOG_SYNC) {
		ret = btrfs_end_transaction(trans);

	/* We successfully logged the inode, attempt to sync the log. */
		ret = btrfs_sync_log(trans, root, &ctx);
			ret = btrfs_end_transaction(trans);

	/*
	 * At this point we need to commit the transaction because we had
	 * btrfs_need_log_full_commit() or some other error.
	 *
	 * If we didn't do a full sync we have to stop the trans handle, wait on
	 * the ordered extents, start it again and commit the transaction. If
	 * we attempt to wait on the ordered extents here we could deadlock with
	 * something like fallocate() that is holding the extent lock trying to
	 * start a transaction while some other thread is trying to commit the
	 * transaction while we (fsync) are currently holding the transaction
	 * open.
	 */
	ret = btrfs_end_transaction(trans);

	ret = btrfs_wait_ordered_range(inode, start, len);

	/*
	 * This is safe to use here because we're only interested in
	 * making sure the transaction that had the ordered extents is
	 * committed. We aren't waiting on anything past this point,
	 * we're purely getting the transaction and committing it.
	 */
	trans = btrfs_attach_transaction_barrier(root);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		/*
		 * We committed the transaction and there's no currently
		 * running transaction, this means everything we care
		 * about made it to disk and we are done.
		 */

	ret = btrfs_commit_transaction(trans);

	free_extent_buffer(ctx.scratch_eb);
	ASSERT(list_empty(&ctx.list));
	ASSERT(list_empty(&ctx.conflict_inodes));
	err = file_check_and_advance_wb_err(file);

	return ret > 0 ? -EIO : ret;

out_release_extents:
	btrfs_release_log_ctx_extents(&ctx);
		up_write(&inode->i_mmap_lock);
		btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
/*
 * btrfs_page_mkwrite() is not allowed to change the file size as it gets
 * called from a page fault handler when a page is first dirtied. Hence we must
 * be careful to check for EOF conditions here. We set the page up correctly
 * for a written page which means we get ENOSPC checking when writing into
 * holes and correct delalloc and unwritten extent mapping on filesystems that
 * support these features.
 *
 * We are not allowed to take the i_mutex here so we have to play games to
 * protect against truncate races as the page could now be beyond EOF.  Because
 * truncate_setsize() writes the inode size before removing pages, once we have
 * the page lock we can determine safely if the page is beyond EOF. If it is not
 * beyond EOF, then the page is guaranteed safe against truncation until we
 * unlock the page.
 */
static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
{
	struct page *page = vmf->page;
	struct folio *folio = page_folio(page);
	struct inode *inode = file_inode(vmf->vma->vm_file);
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct btrfs_ordered_extent *ordered;
	struct extent_state *cached_state = NULL;
	struct extent_changeset *data_reserved = NULL;
	unsigned long zero_start;

	ASSERT(folio_order(folio) == 0);

	reserved_space = PAGE_SIZE;

	sb_start_pagefault(inode->i_sb);
	page_start = folio_pos(folio);
	page_end = page_start + folio_size(folio) - 1;

	/*
	 * Reserving delalloc space after obtaining the page lock can lead to
	 * deadlock. For example, if a dirty page is locked by this function
	 * and the call to btrfs_delalloc_reserve_space() ends up triggering
	 * dirty page write out, then the btrfs_writepages() function could
	 * end up waiting indefinitely to get a lock on the page currently
	 * being processed by btrfs_page_mkwrite() function.
	 */
	ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
					    page_start, reserved_space);

	ret2 = file_update_time(vmf->vma->vm_file);

		ret = vmf_error(ret2);

	/* Make the VM retry the fault. */
	ret = VM_FAULT_NOPAGE;

	down_read(&BTRFS_I(inode)->i_mmap_lock);

	size = i_size_read(inode);

	if ((folio->mapping != inode->i_mapping) ||
	    (page_start >= size)) {
		/* Page got truncated out from underneath us. */

	folio_wait_writeback(folio);

	lock_extent(io_tree, page_start, page_end, &cached_state);
	ret2 = set_folio_extent_mapped(folio);
		ret = vmf_error(ret2);
		unlock_extent(io_tree, page_start, page_end, &cached_state);

	/*
	 * We can't set the delalloc bits if there are pending ordered
	 * extents.  Drop our locks and wait for them to finish.
	 */
	ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, PAGE_SIZE);
		unlock_extent(io_tree, page_start, page_end, &cached_state);
		folio_unlock(folio);
		up_read(&BTRFS_I(inode)->i_mmap_lock);
		btrfs_start_ordered_extent(ordered);
		btrfs_put_ordered_extent(ordered);

	if (folio->index == ((size - 1) >> PAGE_SHIFT)) {
		reserved_space = round_up(size - page_start, fs_info->sectorsize);
		if (reserved_space < PAGE_SIZE) {
			end = page_start + reserved_space - 1;
			btrfs_delalloc_release_space(BTRFS_I(inode),
					data_reserved, page_start,
					PAGE_SIZE - reserved_space, true);
	/*
	 * page_mkwrite gets called when the page is firstly dirtied after it's
	 * faulted in, but write(2) could also dirty a page and set delalloc
	 * bits, thus in this case for space account reason, we still need to
	 * clear any delalloc bits within this page range since we have to
	 * reserve data&meta space before lock_page() (see above comments).
	 */
	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
			 EXTENT_DEFRAG, &cached_state);

	ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0,
		unlock_extent(io_tree, page_start, page_end, &cached_state);
		ret = VM_FAULT_SIGBUS;

	/* Page is wholly or partially inside EOF. */
	if (page_start + folio_size(folio) > size)
		zero_start = offset_in_folio(folio, size);
	else
		zero_start = PAGE_SIZE;

	if (zero_start != PAGE_SIZE)
		folio_zero_range(folio, zero_start, folio_size(folio) - zero_start);

	btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE);
	btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start);
	btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start);

	btrfs_set_inode_last_sub_trans(BTRFS_I(inode));

	unlock_extent(io_tree, page_start, page_end, &cached_state);
	up_read(&BTRFS_I(inode)->i_mmap_lock);

	btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
	sb_end_pagefault(inode->i_sb);
	extent_changeset_free(data_reserved);
	return VM_FAULT_LOCKED;

	folio_unlock(folio);
	up_read(&BTRFS_I(inode)->i_mmap_lock);
	btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
	btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start,
				     reserved_space, (ret != 0));
	sb_end_pagefault(inode->i_sb);
	extent_changeset_free(data_reserved);
static const struct vm_operations_struct btrfs_file_vm_ops = {
	.fault		= filemap_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= btrfs_page_mkwrite,
};
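
/*
 * mmap uses the generic read-only fault handlers above; only page_mkwrite is
 * btrfs specific, so that writable faults go through delalloc accounting.
 */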
static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
{
	struct address_space *mapping = filp->f_mapping;

	if (!mapping->a_ops->read_folio)

	file_accessed(filp);
	vma->vm_ops = &btrfs_file_vm_ops;
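
/*
 * Check whether the file extent item at @slot is a hole (regular extent with
 * a zero disk bytenr) adjacent to the range [start, end), so it can be
 * extended instead of inserting a new hole item.
 */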
static int hole_mergeable(struct btrfs_inode *inode, struct extent_buffer *leaf,
			  int slot, u64 start, u64 end)
{
	struct btrfs_file_extent_item *fi;
	struct btrfs_key key;

	if (slot < 0 || slot >= btrfs_header_nritems(leaf))

	btrfs_item_key_to_cpu(leaf, &key, slot);
	if (key.objectid != btrfs_ino(inode) ||
	    key.type != BTRFS_EXTENT_DATA_KEY)

	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);

	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)

	if (btrfs_file_extent_disk_bytenr(leaf, fi))

	if (key.offset == end)
	if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
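
/*
 * Record a hole for the range [offset, end) in the file's metadata.  With the
 * NO_HOLES feature this is a no-op; otherwise an adjacent hole extent item is
 * extended when possible or a new hole extent is inserted, and the extent map
 * tree is updated to match.
 */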
static int fill_holes(struct btrfs_trans_handle *trans,
		      struct btrfs_inode *inode,
		      struct btrfs_path *path, u64 offset, u64 end)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *root = inode->root;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *fi;
	struct extent_map *hole_em;
	struct btrfs_key key;

	if (btrfs_fs_incompat(fs_info, NO_HOLES))

	key.objectid = btrfs_ino(inode);
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = offset;

	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
		/*
		 * We should have dropped this offset, so if we find it then
		 * something has gone horribly wrong.
		 */

	leaf = path->nodes[0];
	if (hole_mergeable(inode, leaf, path->slots[0] - 1, offset, end)) {
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
		btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
		btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
		btrfs_set_file_extent_offset(leaf, fi, 0);
		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
		btrfs_mark_buffer_dirty(trans, leaf);

	if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) {
		key.offset = offset;
		btrfs_set_item_key_safe(trans, path, &key);
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
		btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
		btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
		btrfs_set_file_extent_offset(leaf, fi, 0);
		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
		btrfs_mark_buffer_dirty(trans, leaf);

	btrfs_release_path(path);

	ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset,

	btrfs_release_path(path);

	hole_em = alloc_extent_map();
		btrfs_drop_extent_map_range(inode, offset, end - 1, false);
		btrfs_set_inode_full_sync(inode);

		hole_em->start = offset;
		hole_em->len = end - offset;
		hole_em->ram_bytes = hole_em->len;

		hole_em->disk_bytenr = EXTENT_MAP_HOLE;
		hole_em->disk_num_bytes = 0;
		hole_em->generation = trans->transid;

		ret = btrfs_replace_extent_map_range(inode, hole_em, true);
		free_extent_map(hole_em);
			btrfs_set_inode_full_sync(inode);
/*
 * Find a hole extent on given inode and change start/len to the end of hole
 * extent.(hole/vacuum extent whose em->start <= start &&
 *	   em->start + em->len > start)
 * When a hole extent is found, return 1 and modify start/len.
 */
static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct extent_map *em;

	em = btrfs_get_extent(inode, NULL,
			      round_down(*start, fs_info->sectorsize),
			      round_up(*len, fs_info->sectorsize));

	/* Hole or vacuum extent(only exists in no-hole mode) */
	if (em->disk_bytenr == EXTENT_MAP_HOLE) {
		*len = em->start + em->len > *start + *len ?
		       0 : *start + *len - em->start - em->len;
		*start = em->start + em->len;
	free_extent_map(em);
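
/*
 * Truncate the page cache for the range and lock it in the io tree, retrying
 * until no pages remain in the (page aligned) range, so that hole punching
 * and extent replacing cannot race with readers re-adding pages.
 */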
static void btrfs_punch_hole_lock_range(struct inode *inode,
					const u64 lockstart,
					const u64 lockend,
					struct extent_state **cached_state)
{
	/*
	 * For subpage case, if the range is not at page boundary, we could
	 * have pages at the leading/tailing part of the range.
	 * This could lead to dead loop since filemap_range_has_page()
	 * will always return true.
	 * So here we need to do extra page alignment for
	 * filemap_range_has_page().
	 */
	const u64 page_lockstart = round_up(lockstart, PAGE_SIZE);
	const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1;

		truncate_pagecache_range(inode, lockstart, lockend);

		lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,

		/*
		 * We can't have ordered extents in the range, nor dirty/writeback
		 * pages, because we have locked the inode's VFS lock in exclusive
		 * mode, we have locked the inode's i_mmap_lock in exclusive mode,
		 * we have flushed all delalloc in the range and we have waited
		 * for any ordered extents in the range to complete.
		 * We can race with anyone reading pages from this range, so after
		 * locking the range check if we have pages in the range, and if
		 * we do, unlock the range and retry.
		 */
		if (!filemap_range_has_page(inode->i_mapping, page_lockstart,

		unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,

	btrfs_assert_inode_range_clean(BTRFS_I(inode), lockstart, lockend);

static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
				       struct btrfs_inode *inode,
				       struct btrfs_path *path,
				       struct btrfs_replace_extent_info *extent_info,
				       const u64 replace_len,
				       const u64 bytes_to_drop)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *root = inode->root;
	struct btrfs_file_extent_item *extent;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	int slot;
	int ret;

	if (replace_len == 0)
		return 0;

	if (extent_info->disk_offset == 0 &&
	    btrfs_fs_incompat(fs_info, NO_HOLES)) {
		btrfs_update_inode_bytes(inode, 0, bytes_to_drop);
		return 0;
	}

	key.objectid = btrfs_ino(inode);
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = extent_info->file_offset;
	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(struct btrfs_file_extent_item));
	leaf = path->nodes[0];
	slot = path->slots[0];
	write_extent_buffer(leaf, extent_info->extent_buf,
			    btrfs_item_ptr_offset(leaf, slot),
			    sizeof(struct btrfs_file_extent_item));
	extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
	ASSERT(btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_INLINE);
	btrfs_set_file_extent_offset(leaf, extent, extent_info->data_offset);
	btrfs_set_file_extent_num_bytes(leaf, extent, replace_len);
	if (extent_info->is_new_extent)
		btrfs_set_file_extent_generation(leaf, extent, trans->transid);
	btrfs_mark_buffer_dirty(trans, leaf);
	btrfs_release_path(path);

	ret = btrfs_inode_set_file_extent_range(inode, extent_info->file_offset,
						replace_len);

	/* If it's a hole, nothing more needs to be done. */
	if (extent_info->disk_offset == 0) {
		btrfs_update_inode_bytes(inode, 0, bytes_to_drop);
		return 0;
	}

	btrfs_update_inode_bytes(inode, replace_len, bytes_to_drop);

	if (extent_info->is_new_extent && extent_info->insertions == 0) {
		key.objectid = extent_info->disk_offset;
		key.type = BTRFS_EXTENT_ITEM_KEY;
		key.offset = extent_info->disk_len;
		ret = btrfs_alloc_reserved_file_extent(trans, root,
						       btrfs_ino(inode),
						       extent_info->file_offset,
						       extent_info->qgroup_reserved,
						       &key);
	} else {
		struct btrfs_ref ref = {
			.action = BTRFS_ADD_DELAYED_REF,
			.bytenr = extent_info->disk_offset,
			.num_bytes = extent_info->disk_len,
			.owning_root = btrfs_root_id(root),
			.ref_root = btrfs_root_id(root),
		};
		u64 ref_offset;

		ref_offset = extent_info->file_offset - extent_info->data_offset;
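		/*
		 * Illustrative example (values are not from this code): when
		 * the new file extent item sits at file offset 1M but starts
		 * 64K into the on-disk extent (data_offset == 64K), the
		 * backref offset stored for the reference is 1M - 64K, i.e.
		 * the file offset the disk extent would have if it were
		 * referenced from its beginning.
		 */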
		btrfs_init_data_ref(&ref, btrfs_ino(inode), ref_offset, 0, false);
		ret = btrfs_inc_extent_ref(trans, &ref);
	}

	extent_info->insertions++;

	return ret;
}

/*
 * The respective range must have been previously locked, as well as the inode.
 * The end offset is inclusive (last byte of the range).
 * @extent_info is NULL for fallocate's hole punching and non-NULL when replacing
 * the file range with an extent.
 * When not punching a hole, we don't want to end up in a state where we dropped
 * extents without inserting a new one, so we must abort the transaction to avoid
 * a corruption.
 */
int btrfs_replace_file_extents(struct btrfs_inode *inode,
			       struct btrfs_path *path, const u64 start,
			       const u64 end,
			       struct btrfs_replace_extent_info *extent_info,
			       struct btrfs_trans_handle **trans_out)
{
	struct btrfs_drop_extents_args drop_args = { 0 };
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1);
	u64 ino_size = round_up(inode->vfs_inode.i_size, fs_info->sectorsize);
	struct btrfs_trans_handle *trans = NULL;
	struct btrfs_block_rsv *rsv;
	unsigned int rsv_count;
	u64 cur_offset;
	u64 len = end - start;
	int ret = 0;

	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
	rsv->size = btrfs_calc_insert_metadata_size(fs_info, 1);
	rsv->failfast = true;

	/*
	 * 1 - update the inode
	 * 1 - removing the extents in the range
	 * 1 - adding the hole extent if no_holes isn't set or if we are
	 *     replacing the range with a new extent
	 */
	if (!btrfs_fs_incompat(fs_info, NO_HOLES) || extent_info)
		rsv_count = 3;
	else
		rsv_count = 2;

	trans = btrfs_start_transaction(root, rsv_count);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_free;
	}

	ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
				      min_size, false);
	trans->block_rsv = rsv;

	cur_offset = start;
	drop_args.path = path;
	drop_args.end = end + 1;
	drop_args.drop_cache = true;
	while (cur_offset < end) {
		drop_args.start = cur_offset;
		ret = btrfs_drop_extents(trans, root, inode, &drop_args);
		/* If we are punching a hole decrement the inode's byte count */
		if (!extent_info)
			btrfs_update_inode_bytes(inode, 0,
						 drop_args.bytes_found);
		if (ret != -ENOSPC) {
			/*
			 * The only time we don't want to abort is if we are
			 * attempting to clone a partial inline extent, in which
			 * case we'll get EOPNOTSUPP. However, if we aren't
			 * cloning, we need to abort no matter what, because if
			 * we got EOPNOTSUPP via prealloc then we messed up and
			 * need to abort.
			 */
			if (ret &&
			    (ret != -EOPNOTSUPP ||
			     (extent_info && extent_info->is_new_extent)))
				btrfs_abort_transaction(trans, ret);
			break;
		}

		trans->block_rsv = &fs_info->trans_block_rsv;

		if (!extent_info && cur_offset < drop_args.drop_end &&
		    cur_offset < ino_size) {
			ret = fill_holes(trans, inode, path, cur_offset,
					 drop_args.drop_end);
			if (ret) {
				/*
				 * If we failed then we didn't insert our hole
				 * entries for the area we dropped, so now the
				 * fs is corrupted, so we must abort the
				 * transaction.
				 */
				btrfs_abort_transaction(trans, ret);
				break;
			}
		} else if (!extent_info && cur_offset < drop_args.drop_end) {
			/*
			 * We are past the i_size here, but since we didn't
			 * insert holes we need to clear the mapped area so we
			 * know to not set disk_i_size in this area until a new
			 * file extent is inserted here.
			 */
			ret = btrfs_inode_clear_file_extent_range(inode,
					cur_offset,
					drop_args.drop_end - cur_offset);
			if (ret) {
				/*
				 * We couldn't clear our area, so we could
				 * presumably adjust up and corrupt the fs, so
				 * we need to abort.
				 */
				btrfs_abort_transaction(trans, ret);
				break;
			}
		}

		if (extent_info &&
		    drop_args.drop_end > extent_info->file_offset) {
			u64 replace_len = drop_args.drop_end -
					  extent_info->file_offset;

			ret = btrfs_insert_replace_extent(trans, inode, path,
					extent_info, replace_len,
					drop_args.bytes_found);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				break;
			}
			extent_info->data_len -= replace_len;
			extent_info->data_offset += replace_len;
			extent_info->file_offset += replace_len;
		}

		/*
		 * We are releasing our handle on the transaction, balance the
		 * dirty pages of the btree inode and flush delayed items, and
		 * then get a new transaction handle, which may now point to a
		 * new transaction in case someone else may have committed the
		 * transaction we used to replace/drop file extent items. So
		 * bump the inode's iversion and update mtime and ctime except
		 * if we are called from a dedupe context. This is because a
		 * power failure/crash may happen after the transaction is
		 * committed and before we finish replacing/dropping all the
		 * file extent items we need.
		 */
		inode_inc_iversion(&inode->vfs_inode);

		if (!extent_info || extent_info->update_times)
			inode_set_mtime_to_ts(&inode->vfs_inode,
				inode_set_ctime_current(&inode->vfs_inode));

		ret = btrfs_update_inode(trans, inode);

		btrfs_end_transaction(trans);
		btrfs_btree_balance_dirty(fs_info);

		trans = btrfs_start_transaction(root, rsv_count);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}

		ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
					      rsv, min_size, false);
		trans->block_rsv = rsv;

		cur_offset = drop_args.drop_end;
		len = end - cur_offset;
		if (!extent_info && len) {
			ret = find_first_non_hole(inode, &cur_offset, &len);
			if (unlikely(ret < 0))
				break;
		}
	}

	/*
	 * If we were cloning, force the next fsync to be a full one since we
	 * replaced (or just dropped in the case of cloning holes when
	 * NO_HOLES is enabled) file extent items and did not setup new extent
	 * maps for the replacement extents (or holes).
	 */
	if (extent_info && !extent_info->is_new_extent)
		btrfs_set_inode_full_sync(inode);

	trans->block_rsv = &fs_info->trans_block_rsv;
	/*
	 * If we are using the NO_HOLES feature we might have already had a
	 * hole that overlaps a part of the region [lockstart, lockend] and
	 * ends at (or beyond) lockend. Since we have no file extent items to
	 * represent holes, drop_end can be less than lockend and so we must
	 * make sure we have an extent map representing the existing hole (the
	 * call to __btrfs_drop_extents() might have dropped the existing extent
	 * map representing the existing hole), otherwise the fast fsync path
	 * will not record the existence of the hole region
	 * [existing_hole_start, lockend].
	 */
	if (drop_args.drop_end <= end)
		drop_args.drop_end = end + 1;
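	/*
	 * For example (purely illustrative): punching the range [0, 1M - 1]
	 * on a NO_HOLES filesystem whose last file extent item ends at 512K
	 * can leave drop_end at 512K, because there are no items describing
	 * the implicit hole beyond it. Bumping drop_end to end + 1 makes the
	 * hole filling / extent map clearing below cover the whole punched
	 * range.
	 */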
	/*
	 * Don't insert a file hole extent item if it's for a range beyond eof
	 * (because it's useless) or if it represents a 0 bytes range (when
	 * cur_offset == drop_end).
	 */
	if (!extent_info && cur_offset < ino_size &&
	    cur_offset < drop_args.drop_end) {
		ret = fill_holes(trans, inode, path, cur_offset,
				 drop_args.drop_end);
		if (ret) {
			/* Same comment as above. */
			btrfs_abort_transaction(trans, ret);
		}
	} else if (!extent_info && cur_offset < drop_args.drop_end) {
		/* See the comment in the loop above for the reasoning here. */
		ret = btrfs_inode_clear_file_extent_range(inode, cur_offset,
					drop_args.drop_end - cur_offset);
		if (ret)
			btrfs_abort_transaction(trans, ret);
	}
	if (extent_info) {
		ret = btrfs_insert_replace_extent(trans, inode, path,
				extent_info, extent_info->data_len,
				drop_args.bytes_found);
		if (ret)
			btrfs_abort_transaction(trans, ret);
	}

	trans->block_rsv = &fs_info->trans_block_rsv;
	if (ret)
		btrfs_end_transaction(trans);
	else
		*trans_out = trans;
out_free:
	btrfs_free_block_rsv(fs_info, rsv);
	return ret;
}

static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
{
	struct inode *inode = file_inode(file);
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_state *cached_state = NULL;
	struct btrfs_path *path;
	struct btrfs_trans_handle *trans = NULL;
	u64 lockstart;
	u64 lockend;
	u64 tail_start;
	u64 tail_len;
	u64 orig_start = offset;
	u64 ino_size;
	bool same_block;
	bool truncated_block = false;
	bool updated_inode = false;
	int ret;

	btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);

	ret = btrfs_wait_ordered_range(BTRFS_I(inode), offset, len);
	if (ret)
		goto out_only_mutex;

	ino_size = round_up(inode->i_size, fs_info->sectorsize);
	ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
	if (ret < 0)
		goto out_only_mutex;
	if (ret && !len) {
		/* Already in a large hole */
		ret = 0;
		goto out_only_mutex;
	}

	ret = file_modified(file);
	if (ret)
		goto out_only_mutex;

	lockstart = round_up(offset, fs_info->sectorsize);
	lockend = round_down(offset + len, fs_info->sectorsize) - 1;
	same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset))
		== (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1));
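	/*
	 * Example with a 4K block size (illustrative only): offset == 5000 and
	 * len == 2000 give lockstart == 8192 and lockend == 4095, and both
	 * offset and offset + len - 1 fall into block 1, so same_block is true
	 * and the request can be served by zeroing within that single block
	 * instead of punching a full-block hole.
	 */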

	/*
	 * We needn't truncate any block which is beyond the end of the file
	 * because we are sure there is no data there.
	 *
	 * Only do this if we are in the same block and we aren't doing the
	 * entire block.
	 */
	if (same_block && len < fs_info->sectorsize) {
		if (offset < ino_size) {
			truncated_block = true;
			ret = btrfs_truncate_block(BTRFS_I(inode), offset, len,
						   0);
		}
		goto out_only_mutex;
	}

	/* zero back part of the first block */
	if (offset < ino_size) {
		truncated_block = true;
		ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
		if (ret) {
			btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
			return ret;
		}
	}

	/*
	 * Check the aligned pages after the first unaligned page. If
	 * offset != orig_start, the first unaligned page and several following
	 * pages are already in holes, so the extra check can be skipped.
	 */
	if (offset == orig_start) {
		/* after truncate page, check hole again */
		len = offset + len - lockstart;
		offset = lockstart;
		ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
		if (ret < 0)
			goto out_only_mutex;
		if (ret && !len) {
			ret = 0;
			goto out_only_mutex;
		}
		lockstart = offset;
	}

	/* Check the tail unaligned part is in a hole */
	tail_start = lockend + 1;
	tail_len = offset + len - tail_start;
	ret = find_first_non_hole(BTRFS_I(inode), &tail_start, &tail_len);
	if (unlikely(ret < 0))
		goto out_only_mutex;
	/* zero the front end of the last page */
	if (tail_start + tail_len < ino_size) {
		truncated_block = true;
		ret = btrfs_truncate_block(BTRFS_I(inode),
					   tail_start + tail_len,
					   0, 1);
		if (ret)
			goto out_only_mutex;
	}

	if (lockend < lockstart) {
		ret = 0;
		goto out_only_mutex;
	}

	btrfs_punch_hole_lock_range(inode, lockstart, lockend, &cached_state);

	path = btrfs_alloc_path();

	ret = btrfs_replace_file_extents(BTRFS_I(inode), path, lockstart,
					 lockend, NULL, &trans);
	btrfs_free_path(path);

	ASSERT(trans != NULL);
	inode_inc_iversion(inode);
	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
	ret = btrfs_update_inode(trans, BTRFS_I(inode));
	updated_inode = true;
	btrfs_end_transaction(trans);
	btrfs_btree_balance_dirty(fs_info);

	unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
		      &cached_state);
out_only_mutex:
	if (!updated_inode && truncated_block && !ret) {
		/*
		 * If we only end up zeroing part of a page, we still need to
		 * update the inode item, so that all the time fields are
		 * updated as well as the necessary btrfs inode in memory fields
		 * for detecting, at fsync time, if the inode isn't yet in the
		 * log tree or it's there but not up to date.
		 */
		struct timespec64 now = inode_set_ctime_current(inode);

		inode_inc_iversion(inode);
		inode_set_mtime_to_ts(inode, now);
		trans = btrfs_start_transaction(root, 1);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
		} else {
			int ret2;

			ret = btrfs_update_inode(trans, BTRFS_I(inode));
			ret2 = btrfs_end_transaction(trans);
			if (!ret)
				ret = ret2;
		}
	}
	btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
	return ret;
}

/* Helper structure to record which range is already reserved */
struct falloc_range {
	struct list_head list;
	u64 start;
	u64 len;
};

/*
 * Helper function to add a falloc range
 *
 * Caller should have locked the larger range of extent containing
 * [start, len).
 */
static int add_falloc_range(struct list_head *head, u64 start, u64 len)
{
	struct falloc_range *range = NULL;

	if (!list_empty(head)) {
		/*
		 * As fallocate iterates by bytenr order, we only need to check
		 * the last range.
		 */
		range = list_last_entry(head, struct falloc_range, list);
		if (range->start + range->len == start) {
			range->len += len;
			return 0;
		}
	}

	range = kmalloc(sizeof(*range), GFP_KERNEL);
	if (!range)
		return -ENOMEM;
	range->start = start;
	range->len = len;
	list_add_tail(&range->list, head);
	return 0;
}
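
/*
 * Illustrative behaviour of add_falloc_range() (example values only): if the
 * last recorded range is [0, 64K) and the next uncovered subrange found by
 * btrfs_fallocate() starts at 64K, the two are merged into a single entry
 * instead of allocating a new one; a subrange starting at, say, 1M gets its
 * own list entry because it is not contiguous with the previous tail.
 */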

static int btrfs_fallocate_update_isize(struct inode *inode,
					const u64 end,
					const int mode)
{
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int ret;
	int ret2;

	if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode))
		return 0;

	trans = btrfs_start_transaction(root, 1);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	inode_set_ctime_current(inode);
	i_size_write(inode, end);
	btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
	ret = btrfs_update_inode(trans, BTRFS_I(inode));
	ret2 = btrfs_end_transaction(trans);

	return ret ? ret : ret2;
}
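
/*
 * For example (illustrative only): an fallocate that extends a 10K file to 1M
 * without FALLOC_FL_KEEP_SIZE ends up here with end == 1M > i_size, so ctime
 * is refreshed, i_size becomes 1M and the inode item is updated; the same call
 * with FALLOC_FL_KEEP_SIZE returns early and leaves i_size at 10K.
 */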

enum {
	RANGE_BOUNDARY_WRITTEN_EXTENT,
	RANGE_BOUNDARY_PREALLOC_EXTENT,
	RANGE_BOUNDARY_HOLE,
};

static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode,
						 u64 offset)
{
	const u64 sectorsize = inode->root->fs_info->sectorsize;
	struct extent_map *em;
	int ret;

	offset = round_down(offset, sectorsize);
	em = btrfs_get_extent(inode, NULL, offset, sectorsize);
	if (IS_ERR(em))
		return PTR_ERR(em);

	if (em->disk_bytenr == EXTENT_MAP_HOLE)
		ret = RANGE_BOUNDARY_HOLE;
	else if (em->flags & EXTENT_FLAG_PREALLOC)
		ret = RANGE_BOUNDARY_PREALLOC_EXTENT;
	else
		ret = RANGE_BOUNDARY_WRITTEN_EXTENT;

	free_extent_map(em);
	return ret;
}

static int btrfs_zero_range(struct inode *inode,
			    u64 offset,
			    u64 len,
			    const int mode)
{
	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
	struct extent_map *em;
	struct extent_changeset *data_reserved = NULL;
	int ret;
	u64 alloc_hint = 0;
	const u64 sectorsize = fs_info->sectorsize;
	u64 alloc_start = round_down(offset, sectorsize);
	u64 alloc_end = round_up(offset + len, sectorsize);
	u64 bytes_to_reserve = 0;
	bool space_reserved = false;

	em = btrfs_get_extent(BTRFS_I(inode), NULL, alloc_start,
			      alloc_end - alloc_start);
	if (IS_ERR(em))
		return PTR_ERR(em);

	/*
	 * Avoid hole punching and extent allocation for some cases. More cases
	 * could be considered, but these are unlikely common and we keep things
	 * as simple as possible for now. Also, intentionally, if the target
	 * range contains one or more prealloc extents together with regular
	 * extents and holes, we drop all the existing extents and allocate a
	 * new prealloc extent, so that we get a larger contiguous disk extent.
	 */
	if (em->start <= alloc_start && (em->flags & EXTENT_FLAG_PREALLOC)) {
		const u64 em_end = em->start + em->len;

		if (em_end >= offset + len) {
			/*
			 * The whole range is already a prealloc extent,
			 * do nothing except updating the inode's i_size if
			 * needed.
			 */
			free_extent_map(em);
			ret = btrfs_fallocate_update_isize(inode, offset + len,
							   mode);
			return ret;
		}
		/*
		 * Part of the range is already a prealloc extent, so operate
		 * only on the remaining part of the range.
		 */
		alloc_start = em_end;
		ASSERT(IS_ALIGNED(alloc_start, sectorsize));
		len = offset + len - alloc_start;
		offset = alloc_start;
		alloc_hint = extent_map_block_start(em) + em->len;
	}
	free_extent_map(em);

	if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
	    BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
		em = btrfs_get_extent(BTRFS_I(inode), NULL, alloc_start, sectorsize);
		if (IS_ERR(em))
			return PTR_ERR(em);

		if (em->flags & EXTENT_FLAG_PREALLOC) {
			free_extent_map(em);
			ret = btrfs_fallocate_update_isize(inode, offset + len,
							   mode);
			return ret;
		}
		if (len < sectorsize && em->disk_bytenr != EXTENT_MAP_HOLE) {
			free_extent_map(em);
			ret = btrfs_truncate_block(BTRFS_I(inode), offset, len,
						   0);
			if (!ret)
				ret = btrfs_fallocate_update_isize(inode,
								   offset + len,
								   mode);
			return ret;
		}
		free_extent_map(em);
		alloc_start = round_down(offset, sectorsize);
		alloc_end = alloc_start + sectorsize;
		goto reserve_space;
	}

	alloc_start = round_up(offset, sectorsize);
	alloc_end = round_down(offset + len, sectorsize);
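	/*
	 * Example with a 4K sector size (illustrative numbers): offset == 7000
	 * and len == 10000 give alloc_start == 8192 and alloc_end == 16384.
	 * The unaligned head [7000, 8192) and tail [16384, 17000) are handled
	 * by the boundary checks below (zeroed in place or folded into the
	 * allocation when they sit over a hole), while only the aligned middle
	 * is considered for preallocation.
	 */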

	/*
	 * For unaligned ranges, check the pages at the boundaries, they might
	 * map to an extent, in which case we need to partially zero them, or
	 * they might map to a hole, in which case we need our allocation range
	 * to cover them.
	 */
	if (!IS_ALIGNED(offset, sectorsize)) {
		ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
							    offset);
		if (ret < 0)
			goto out;
		if (ret == RANGE_BOUNDARY_HOLE) {
			alloc_start = round_down(offset, sectorsize);
			ret = 0;
		} else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
			ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
			if (ret)
				goto out;
		} else {
			ret = 0;
		}
	}

	if (!IS_ALIGNED(offset + len, sectorsize)) {
		ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
							    offset + len);
		if (ret < 0)
			goto out;
		if (ret == RANGE_BOUNDARY_HOLE) {
			alloc_end = round_up(offset + len, sectorsize);
			ret = 0;
		} else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
			ret = btrfs_truncate_block(BTRFS_I(inode), offset + len,
						   0, 1);
			if (ret)
				goto out;
		} else {
			ret = 0;
		}
	}

reserve_space:
	if (alloc_start < alloc_end) {
		struct extent_state *cached_state = NULL;
		const u64 lockstart = alloc_start;
		const u64 lockend = alloc_end - 1;

		bytes_to_reserve = alloc_end - alloc_start;
		ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
						      bytes_to_reserve);
		if (ret < 0)
			goto out;
		space_reserved = true;
		btrfs_punch_hole_lock_range(inode, lockstart, lockend,
					    &cached_state);
		ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved,
						alloc_start, bytes_to_reserve);
		if (ret) {
			unlock_extent(&BTRFS_I(inode)->io_tree, lockstart,
				      lockend, &cached_state);
			goto out;
		}
		ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
						alloc_end - alloc_start,
						fs_info->sectorsize,
						offset + len, &alloc_hint);
		unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
			      &cached_state);
		/* btrfs_prealloc_file_range releases reserved space on error */
		if (ret) {
			space_reserved = false;
			goto out;
		}
	}
	ret = btrfs_fallocate_update_isize(inode, offset + len, mode);
out:
	if (ret && space_reserved)
		btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
					       alloc_start, bytes_to_reserve);
	extent_changeset_free(data_reserved);

	return ret;
}

static long btrfs_fallocate(struct file *file, int mode,
			    loff_t offset, loff_t len)
{
	struct inode *inode = file_inode(file);
	struct extent_state *cached_state = NULL;
	struct extent_changeset *data_reserved = NULL;
	struct falloc_range *range;
	struct falloc_range *tmp;
	LIST_HEAD(reserve_list);
	u64 cur_offset;
	u64 last_byte;
	u64 alloc_start;
	u64 alloc_end;
	u64 alloc_hint = 0;
	u64 locked_end;
	u64 actual_end = 0;
	u64 data_space_needed = 0;
	u64 data_space_reserved = 0;
	u64 qgroup_reserved = 0;
	struct extent_map *em;
	int blocksize = BTRFS_I(inode)->root->fs_info->sectorsize;
	int ret;

	/* Do not allow fallocate in ZONED mode */
	if (btrfs_is_zoned(inode_to_fs_info(inode)))
		return -EOPNOTSUPP;

	alloc_start = round_down(offset, blocksize);
	alloc_end = round_up(offset + len, blocksize);
	cur_offset = alloc_start;

	/* Make sure we aren't being given some crap mode */
	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
		     FALLOC_FL_ZERO_RANGE))
		return -EOPNOTSUPP;

	if (mode & FALLOC_FL_PUNCH_HOLE)
		return btrfs_punch_hole(file, offset, len);

	btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);

	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) {
		ret = inode_newsize_ok(inode, offset + len);
		if (ret)
			goto out;
	}

	ret = file_modified(file);
	if (ret)
		goto out;

	/*
	 * TODO: Move these two operations after we have checked
	 * accurate reserved space, or fallocate can still fail but
	 * with page truncated or size expanded.
	 *
	 * But that's a minor problem and won't do much harm BTW.
	 */
	if (alloc_start > inode->i_size) {
		ret = btrfs_cont_expand(BTRFS_I(inode), i_size_read(inode),
					alloc_start);
		if (ret)
			goto out;
	} else if (offset + len > inode->i_size) {
		/*
		 * If we are fallocating from the end of the file onward we
		 * need to zero out the end of the block if i_size lands in the
		 * middle of a block.
		 */
		ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0);
		if (ret)
			goto out;
	}

	/*
	 * We have locked the inode at the VFS level (in exclusive mode) and we
	 * have locked the i_mmap_lock lock (in exclusive mode). Now before
	 * locking the file range, flush all delalloc in the range and wait for
	 * all ordered extents in the range to complete. After this we can lock
	 * the file range and, due to the previous locking we did, we know there
	 * can't be more delalloc or ordered extents in the range.
	 */
	ret = btrfs_wait_ordered_range(BTRFS_I(inode), alloc_start,
				       alloc_end - alloc_start);
	if (ret)
		goto out;

	if (mode & FALLOC_FL_ZERO_RANGE) {
		ret = btrfs_zero_range(inode, offset, len, mode);
		btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
		return ret;
	}

	locked_end = alloc_end - 1;
	lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
		    &cached_state);

	btrfs_assert_inode_range_clean(BTRFS_I(inode), alloc_start, locked_end);

	/* First, check if we exceed the qgroup limit */
	while (cur_offset < alloc_end) {
		em = btrfs_get_extent(BTRFS_I(inode), NULL, cur_offset,
				      alloc_end - cur_offset);
		last_byte = min(extent_map_end(em), alloc_end);
		actual_end = min_t(u64, extent_map_end(em), offset + len);
		last_byte = ALIGN(last_byte, blocksize);
		if (em->disk_bytenr == EXTENT_MAP_HOLE ||
		    (cur_offset >= inode->i_size &&
		     !(em->flags & EXTENT_FLAG_PREALLOC))) {
			const u64 range_len = last_byte - cur_offset;

			ret = add_falloc_range(&reserve_list, cur_offset, range_len);
			if (ret < 0) {
				free_extent_map(em);
				break;
			}
			ret = btrfs_qgroup_reserve_data(BTRFS_I(inode),
					&data_reserved, cur_offset, range_len);
			if (ret < 0) {
				free_extent_map(em);
				break;
			}
			qgroup_reserved += range_len;
			data_space_needed += range_len;
		}
		free_extent_map(em);
		cur_offset = last_byte;
	}

	if (!ret && data_space_needed > 0) {
		/*
		 * We are safe to reserve space here as we can't have delalloc
		 * in the range, see above.
		 */
		ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
						      data_space_needed);
		if (!ret)
			data_space_reserved = data_space_needed;
	}

	/*
	 * If ret is still 0, means we're OK to fallocate.
	 * Or just cleanup the list and exit.
	 */
	list_for_each_entry_safe(range, tmp, &reserve_list, list) {
		if (!ret) {
			ret = btrfs_prealloc_file_range(inode, mode,
					range->start,
					range->len, blocksize,
					offset + len, &alloc_hint);
			/*
			 * btrfs_prealloc_file_range() releases space even
			 * if it returns an error.
			 */
			data_space_reserved -= range->len;
			qgroup_reserved -= range->len;
		} else if (data_space_reserved > 0) {
			btrfs_free_reserved_data_space(BTRFS_I(inode),
					data_reserved, range->start,
					range->len);
			data_space_reserved -= range->len;
			qgroup_reserved -= range->len;
		} else if (qgroup_reserved > 0) {
			btrfs_qgroup_free_data(BTRFS_I(inode), data_reserved,
					range->start, range->len, NULL);
			qgroup_reserved -= range->len;
		}
		list_del(&range->list);
		kfree(range);
	}

	/*
	 * We didn't need to allocate any more space, but we still extended the
	 * size of the file so we need to update i_size and the inode item.
	 */
	ret = btrfs_fallocate_update_isize(inode, actual_end, mode);

	unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
		      &cached_state);
out:
	btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
	extent_changeset_free(data_reserved);
	return ret;
}

/*
 * Helper for btrfs_find_delalloc_in_range(). Find a subrange in a given range
 * that has unflushed and/or flushing delalloc. There might be other adjacent
 * subranges after the one it found, so btrfs_find_delalloc_in_range() keeps
 * looping while it gets adjacent subranges, merging them together.
 */
static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end,
				   struct extent_state **cached_state,
				   bool *search_io_tree,
				   u64 *delalloc_start_ret, u64 *delalloc_end_ret)
{
	u64 len = end + 1 - start;
	u64 delalloc_len = 0;
	struct btrfs_ordered_extent *oe;
	u64 oe_start;
	u64 oe_end;

	/*
	 * Search the io tree first for EXTENT_DELALLOC. If we find any, it
	 * means we have delalloc (dirty pages) for which writeback has not
	 * started yet.
	 */
	if (*search_io_tree) {
		spin_lock(&inode->lock);
		if (inode->delalloc_bytes > 0) {
			spin_unlock(&inode->lock);
			*delalloc_start_ret = start;
			delalloc_len = count_range_bits(&inode->io_tree,
							delalloc_start_ret, end,
							len, EXTENT_DELALLOC, 1,
							cached_state);
		} else {
			spin_unlock(&inode->lock);
		}
	}

	if (delalloc_len > 0) {
		/*
		 * If delalloc was found then *delalloc_start_ret has a sector size
		 * aligned value (rounded down).
		 */
		*delalloc_end_ret = *delalloc_start_ret + delalloc_len - 1;

		if (*delalloc_start_ret == start) {
			/* Delalloc for the whole range, nothing more to do. */
			if (*delalloc_end_ret == end)
				return true;
			/* Else trim our search range for ordered extents. */
			start = *delalloc_end_ret + 1;
			len = end + 1 - start;
		}
	} else {
		/* No delalloc, future calls don't need to search again. */
		*search_io_tree = false;
	}

	/*
	 * Now also check if there's any ordered extent in the range.
	 * We do this because:
	 *
	 * 1) When delalloc is flushed, the file range is locked, we clear the
	 *    EXTENT_DELALLOC bit from the io tree and create an extent map and
	 *    an ordered extent for the write. So we might just have been called
	 *    after delalloc is flushed and before the ordered extent completes
	 *    and inserts the new file extent item in the subvolume's btree;
	 *
	 * 2) We may have an ordered extent created by flushing delalloc for a
	 *    subrange that starts before the subrange we found marked with
	 *    EXTENT_DELALLOC in the io tree.
	 *
	 * We could also use the extent map tree to find such delalloc that is
	 * being flushed, but using the ordered extents tree is more efficient
	 * because it's usually much smaller as ordered extents are removed from
	 * the tree once they complete. With the extent maps, we may have them
	 * in the extent map tree for a very long time, and they were either
	 * created by previous writes or loaded by read operations.
	 */
	oe = btrfs_lookup_first_ordered_range(inode, start, len);
	if (!oe)
		return (delalloc_len > 0);

	/* The ordered extent may span beyond our search range. */
	oe_start = max(oe->file_offset, start);
	oe_end = min(oe->file_offset + oe->num_bytes - 1, end);

	btrfs_put_ordered_extent(oe);

	/* Don't have unflushed delalloc, return the ordered extent range. */
	if (delalloc_len == 0) {
		*delalloc_start_ret = oe_start;
		*delalloc_end_ret = oe_end;
		return true;
	}

	/*
	 * We have both unflushed delalloc (io_tree) and an ordered extent.
	 * If the ranges are adjacent, return a combined range, otherwise
	 * return the leftmost range.
	 */
	if (oe_start < *delalloc_start_ret) {
		if (oe_end < *delalloc_start_ret)
			*delalloc_end_ret = oe_end;
		*delalloc_start_ret = oe_start;
	} else if (*delalloc_end_ret + 1 == oe_start) {
		*delalloc_end_ret = oe_end;
	}

	return true;
}

/*
 * Check if there's delalloc in a given range.
 *
 * @inode:               The inode.
 * @start:               The start offset of the range. It does not need to be
 *                       sector size aligned.
 * @end:                 The end offset (inclusive value) of the search range.
 *                       It does not need to be sector size aligned.
 * @cached_state:        Extent state record used for speeding up delalloc
 *                       searches in the inode's io_tree. Can be NULL.
 * @delalloc_start_ret:  Output argument, set to the start offset of the
 *                       subrange found with delalloc (may not be sector size
 *                       aligned).
 * @delalloc_end_ret:    Output argument, set to the end offset (inclusive value)
 *                       of the subrange found with delalloc.
 *
 * Returns true if a subrange with delalloc is found within the given range, and
 * if so it sets @delalloc_start_ret and @delalloc_end_ret with the start and
 * end offsets of the subrange.
 */
bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end,
				  struct extent_state **cached_state,
				  u64 *delalloc_start_ret, u64 *delalloc_end_ret)
{
	u64 cur_offset = round_down(start, inode->root->fs_info->sectorsize);
	u64 prev_delalloc_end = 0;
	bool search_io_tree = true;
	bool ret = false;

	while (cur_offset <= end) {
		u64 delalloc_start;
		u64 delalloc_end;
		bool delalloc;

		delalloc = find_delalloc_subrange(inode, cur_offset, end,
						  cached_state, &search_io_tree,
						  &delalloc_start,
						  &delalloc_end);
		if (!delalloc)
			break;

		if (prev_delalloc_end == 0) {
			/* First subrange found. */
			*delalloc_start_ret = max(delalloc_start, start);
			*delalloc_end_ret = delalloc_end;
			ret = true;
		} else if (delalloc_start == prev_delalloc_end + 1) {
			/* Subrange adjacent to the previous one, merge them. */
			*delalloc_end_ret = delalloc_end;
		} else {
			/* Subrange not adjacent to the previous one, exit. */
			break;
		}

		prev_delalloc_end = delalloc_end;
		cur_offset = delalloc_end + 1;
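		/*
		 * Example of the merging above (illustrative offsets): if the
		 * helper reports delalloc at [0, 64K - 1] and then at
		 * [64K, 96K - 1], the two are adjacent and the reported range
		 * grows to [0, 96K - 1]; a third subrange starting at 1M is
		 * not adjacent, so the search stops and [0, 96K - 1] is what
		 * the caller sees.
		 */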
	}

	return ret;
}

/*
 * Check if there's a hole or delalloc range in a range representing a hole (or
 * prealloc extent) found in the inode's subvolume btree.
 *
 * @inode:      The inode.
 * @whence:     Seek mode (SEEK_DATA or SEEK_HOLE).
 * @start:      Start offset of the hole region. It does not need to be sector
 *              size aligned.
 * @end:        End offset (inclusive value) of the hole region. It does not
 *              need to be sector size aligned.
 * @start_ret:  Return parameter, used to set the start of the subrange in the
 *              hole that matches the search criteria (seek mode), if such
 *              subrange is found (return value of the function is true).
 *              The value returned here may not be sector size aligned.
 *
 * Returns true if a subrange matching the given seek mode is found, and if one
 * is found, it updates @start_ret with the start of the subrange.
 */
static bool find_desired_extent_in_hole(struct btrfs_inode *inode, int whence,
					struct extent_state **cached_state,
					u64 start, u64 end, u64 *start_ret)
{
	u64 delalloc_start;
	u64 delalloc_end;
	bool delalloc;

	delalloc = btrfs_find_delalloc_in_range(inode, start, end, cached_state,
						&delalloc_start, &delalloc_end);
	if (delalloc && whence == SEEK_DATA) {
		*start_ret = delalloc_start;
		return true;
	}

	if (delalloc && whence == SEEK_HOLE) {
		/*
		 * We found delalloc but it starts after our start offset. So we
		 * have a hole between our start offset and the delalloc start.
		 */
		if (start < delalloc_start) {
			*start_ret = start;
			return true;
		}
		/*
		 * Delalloc range starts at our start offset.
		 * If the delalloc range's length is smaller than our range,
		 * then it means we have a hole that starts where the delalloc
		 * ends.
		 */
		if (delalloc_end < end) {
			*start_ret = delalloc_end + 1;
			return true;
		}

		/* There's delalloc for the whole range. */
		return false;
	}

	if (!delalloc && whence == SEEK_HOLE) {
		*start_ret = start;
		return true;
	}

	/*
	 * No delalloc in the range and we are seeking for data. The caller has
	 * to iterate to the next extent item in the subvolume btree.
	 */
	return false;
}

static loff_t find_desired_extent(struct file *file, loff_t offset, int whence)
{
	struct btrfs_inode *inode = BTRFS_I(file->f_mapping->host);
	struct btrfs_file_private *private;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct extent_state *cached_state = NULL;
	struct extent_state **delalloc_cached_state;
	const loff_t i_size = i_size_read(&inode->vfs_inode);
	const u64 ino = btrfs_ino(inode);
	struct btrfs_root *root = inode->root;
	struct btrfs_path *path;
	struct btrfs_key key;
	u64 last_extent_end;
	u64 lockstart;
	u64 lockend;
	u64 start;
	bool found = false;
	int ret = 0;

	if (i_size == 0 || offset >= i_size)
		return -ENXIO;

	/*
	 * Quick path. If the inode has no prealloc extents and its number of
	 * bytes used matches its i_size, then it can not have holes.
	 */
	if (whence == SEEK_HOLE &&
	    !(inode->flags & BTRFS_INODE_PREALLOC) &&
	    inode_get_bytes(&inode->vfs_inode) == i_size)
		return i_size;

	spin_lock(&inode->lock);
	private = file->private_data;
	spin_unlock(&inode->lock);

	if (private && private->owner_task != current) {
		/*
		 * Not allocated by us, don't use it as its cached state is used
		 * by the task that allocated it and we want neither to mess
		 * with it nor to get incorrect results because it reflects an
		 * invalid state for the current task.
		 */
		private = NULL;
	} else if (!private) {
		private = kzalloc(sizeof(*private), GFP_KERNEL);
		/*
		 * No worries if memory allocation failed.
		 * The private structure is used only for speeding up multiple
		 * lseek SEEK_HOLE/DATA calls to a file when there's delalloc,
		 * so everything will still be correct.
		 */
		if (private) {
			bool free = false;

			private->owner_task = current;

			spin_lock(&inode->lock);
			if (file->private_data)
				free = true;
			else
				file->private_data = private;
			spin_unlock(&inode->lock);

			if (free) {
				kfree(private);
				private = file->private_data;
			}
		}
	}

	if (private)
		delalloc_cached_state = &private->llseek_cached_state;
	else
		delalloc_cached_state = NULL;

	/*
	 * offset can be negative, in this case we start finding DATA/HOLE from
	 * the very start of the file.
	 */
	start = max_t(loff_t, 0, offset);

	lockstart = round_down(start, fs_info->sectorsize);
	lockend = round_up(i_size, fs_info->sectorsize);
	if (lockend <= lockstart)
		lockend = lockstart + fs_info->sectorsize;
	lockend--;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = READA_FORWARD;

	key.objectid = ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = start;

	last_extent_end = lockstart;

	lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0) {
		goto out;
	} else if (ret > 0 && path->slots[0] > 0) {
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
		if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
			path->slots[0]--;
	}

	while (start < i_size) {
		struct extent_buffer *leaf = path->nodes[0];
		struct btrfs_file_extent_item *extent;
		u64 extent_end;
		u8 type;

		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto out;
			else if (ret > 0)
				break;

			leaf = path->nodes[0];
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
			break;

		extent_end = btrfs_file_extent_end(path);

		/*
		 * In the first iteration we may have a slot that points to an
		 * extent that ends before our start offset, so skip it.
		 */
		if (extent_end <= start) {
			path->slots[0]++;
			continue;
		}

		/* We have an implicit hole, NO_HOLES feature is likely set. */
		if (last_extent_end < key.offset) {
			u64 search_start = last_extent_end;
			u64 found_start;

			/*
			 * First iteration, @start matches @offset and it's
			 * within the hole.
			 */
			if (start == offset)
				search_start = offset;

			found = find_desired_extent_in_hole(inode, whence,
							    delalloc_cached_state,
							    search_start,
							    key.offset - 1,
							    &found_start);
			if (found) {
				start = found_start;
				break;
			}
			/*
			 * Didn't find data or a hole (due to delalloc) in the
			 * implicit hole range, so need to analyze the extent.
			 */
		}

		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_file_extent_item);
		type = btrfs_file_extent_type(leaf, extent);

		/*
		 * Can't access the extent's disk_bytenr field if this is an
		 * inline extent, since at that offset, it's where the extent
		 * data starts.
		 */
		if (type == BTRFS_FILE_EXTENT_PREALLOC ||
		    (type == BTRFS_FILE_EXTENT_REG &&
		     btrfs_file_extent_disk_bytenr(leaf, extent) == 0)) {
			/*
			 * Explicit hole or prealloc extent, search for delalloc.
			 * A prealloc extent is treated like a hole.
			 */
			u64 search_start = key.offset;
			u64 found_start;

			/*
			 * First iteration, @start matches @offset and it's
			 * within the hole.
			 */
			if (start == offset)
				search_start = offset;

			found = find_desired_extent_in_hole(inode, whence,
							    delalloc_cached_state,
							    search_start,
							    extent_end - 1,
							    &found_start);
			if (found) {
				start = found_start;
				break;
			}
			/*
			 * Didn't find data or a hole (due to delalloc) in the
			 * implicit hole range, so need to analyze the next
			 * file extent item.
			 */
		} else {
			/*
			 * Found a regular or inline extent.
			 * If we are seeking for data, adjust the start offset
			 * and stop, we're done.
			 */
			if (whence == SEEK_DATA) {
				start = max_t(u64, key.offset, offset);
				found = true;
				break;
			}
			/*
			 * Else, we are seeking for a hole, check the next file
			 * extent item.
			 */
		}

		start = extent_end;
		last_extent_end = extent_end;
		path->slots[0]++;
		if (fatal_signal_pending(current)) {
			ret = -EINTR;
			goto out;
		}
		cond_resched();
	}

	/* We have an implicit hole from the last extent found up to i_size. */
	if (!found && start < i_size) {
		found = find_desired_extent_in_hole(inode, whence,
						    delalloc_cached_state, start,
						    i_size - 1, &start);
	}

out:
	unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
	btrfs_free_path(path);

	if (ret < 0)
		return ret;

	if (whence == SEEK_DATA && start >= i_size)
		return -ENXIO;

	return min_t(loff_t, start, i_size);
}

static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file->f_mapping->host;

	switch (whence) {
	default:
		return generic_file_llseek(file, offset, whence);
	case SEEK_DATA:
	case SEEK_HOLE:
		btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
		offset = find_desired_extent(file, offset, whence);
		btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
		break;
	}

	if (offset < 0)
		return offset;

	return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
}

static int btrfs_file_open(struct inode *inode, struct file *filp)
{
	int ret;

	filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;

	ret = fsverity_file_open(inode, filp);
	if (ret)
		return ret;
	return generic_file_open(inode, filp);
}

static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	ssize_t ret = 0;

	if (iocb->ki_flags & IOCB_DIRECT) {
		ret = btrfs_direct_read(iocb, to);
		if (ret < 0 || !iov_iter_count(to) ||
		    iocb->ki_pos >= i_size_read(file_inode(iocb->ki_filp)))
			return ret;
	}

	return filemap_read(iocb, to, ret);
}

const struct file_operations btrfs_file_operations = {
	.llseek		= btrfs_file_llseek,
	.read_iter	= btrfs_file_read_iter,
	.splice_read	= filemap_splice_read,
	.write_iter	= btrfs_file_write_iter,
	.splice_write	= iter_file_splice_write,
	.mmap		= btrfs_file_mmap,
	.open		= btrfs_file_open,
	.release	= btrfs_release_file,
	.get_unmapped_area = thp_get_unmapped_area,
	.fsync		= btrfs_sync_file,
	.fallocate	= btrfs_fallocate,
	.unlocked_ioctl	= btrfs_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= btrfs_compat_ioctl,
#endif
	.remap_file_range = btrfs_remap_file_range,
	.uring_cmd	= btrfs_uring_cmd,
	.fop_flags	= FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC,
};

int btrfs_fdatawrite_range(struct btrfs_inode *inode, loff_t start, loff_t end)
{
	struct address_space *mapping = inode->vfs_inode.i_mapping;
	int ret;

	/*
	 * So with compression we will find and lock a dirty page and clear the
	 * first one as dirty, setup an async extent, and immediately return
	 * with the entire range locked but with nobody actually marked with
	 * writeback. So we can't just filemap_write_and_wait_range() and
	 * expect it to work since it will just kick off a thread to do the
	 * actual work. So we need to call filemap_fdatawrite_range _again_
	 * since it will wait on the page lock, which won't be unlocked until
	 * after the pages have been marked as writeback and so we're good to go
	 * from there. We have to do this otherwise we'll miss the ordered
	 * extents and that results in badness. Please Josef, do not think you
	 * know better and pull this out at some point in the future, it is
	 * right and you are wrong.
	 */
	ret = filemap_fdatawrite_range(mapping, start, end);
	if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
			     &inode->runtime_flags))
		ret = filemap_fdatawrite_range(mapping, start, end);

	return ret;
}