// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 */

#include <linux/pagemap.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/falloc.h>
#include <linux/writeback.h>
#include <linux/compat.h>
#include <linux/slab.h>
#include <linux/btrfs.h>
#include <linux/uio.h>
#include <linux/iversion.h>
#include <linux/fsverity.h>
#include "direct-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "compression.h"
#include "delalloc-space.h"
#include "accessors.h"
#include "extent-tree.h"
#include "file-item.h"
/*
 * Helper to fault in page and copy. This should go away and be replaced with
 * calls into generic code.
 */
static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
					 struct folio *folio, struct iov_iter *i)
{
	size_t copied = 0;
	size_t total_copied = 0;
	int offset = offset_in_page(pos);

	while (write_bytes > 0) {
		size_t count = min_t(size_t, PAGE_SIZE - offset, write_bytes);

		/*
		 * Copy data from userspace to the current page
		 */
		copied = copy_folio_from_iter_atomic(folio, offset, count, i);

		/* Flush processor's dcache for this page */
		flush_dcache_folio(folio);

		/*
		 * if we get a partial write, we can end up with
		 * partially up to date page. These add
		 * a lot of complexity, so make sure they don't
		 * happen by forcing this copy to be retried.
		 *
		 * The rest of the btrfs_file_write code will fall
		 * back to page at a time copies after we return 0.
		 */
		if (unlikely(copied < count)) {
			if (!folio_test_uptodate(folio)) {
				iov_iter_revert(i, copied);
				copied = 0;
			}
			if (!copied)
				break;
		}

		write_bytes -= copied;
		total_copied += copied;
		offset += copied;
	}
	return total_copied;
}
/*
 * Unlock folio after btrfs_file_write() is done with it.
 */
static void btrfs_drop_folio(struct btrfs_fs_info *fs_info, struct folio *folio,
			     loff_t pos, u64 copied)
{
	u64 block_start = round_down(pos, fs_info->sectorsize);
	u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start;

	ASSERT(block_len <= U32_MAX);
	/*
	 * Folio checked is some magic around finding folios that have been
	 * modified without going through btrfs_dirty_folio(). Clear it here.
	 * There should be no need to mark the pages accessed as
	 * prepare_one_folio() should have marked them accessed in
	 * prepare_one_folio() via find_or_create_page()
	 */
	btrfs_folio_clamp_clear_checked(fs_info, folio, block_start, block_len);
	folio_unlock(folio);
	folio_put(folio);
}
/*
 * After btrfs_copy_from_user(), update the following things for delalloc:
 * - Mark newly dirtied folio as DELALLOC in the io tree.
 *   Used to advise which range is to be written back.
 * - Mark modified folio as Uptodate/Dirty and not needing COW fixup
 * - Update inode size for past EOF write
 */
int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos,
		      size_t write_bytes, struct extent_state **cached, bool noreserve)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	int ret = 0;
	u64 start_pos;
	u64 num_bytes;
	u64 end_of_last_block;
	u64 end_pos = pos + write_bytes;
	loff_t isize = i_size_read(&inode->vfs_inode);
	unsigned int extra_bits = 0;

	if (write_bytes == 0)
		return 0;

	if (noreserve)
		extra_bits |= EXTENT_NORESERVE;

	start_pos = round_down(pos, fs_info->sectorsize);
	num_bytes = round_up(write_bytes + pos - start_pos,
			     fs_info->sectorsize);
	ASSERT(num_bytes <= U32_MAX);
	ASSERT(folio_pos(folio) <= pos &&
	       folio_pos(folio) + folio_size(folio) >= pos + write_bytes);

	end_of_last_block = start_pos + num_bytes - 1;

	/*
	 * The pages may have already been dirty, clear out old accounting so
	 * we can set things up properly
	 */
	clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block,
			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
			 cached);

	ret = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
					extra_bits, cached);
	if (ret)
		return ret;

	btrfs_folio_clamp_set_uptodate(fs_info, folio, start_pos, num_bytes);
	btrfs_folio_clamp_clear_checked(fs_info, folio, start_pos, num_bytes);
	btrfs_folio_clamp_set_dirty(fs_info, folio, start_pos, num_bytes);

	/*
	 * We've only changed i_size in ram, and we haven't updated the disk
	 * i_size. There is no need to log the inode at this time.
	 */
	if (end_pos > isize)
		i_size_write(&inode->vfs_inode, end_pos);
	return 0;
}
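/*
 * Illustrative sketch, not part of the original file: the basic
 * copy-then-dirty sequence that the buffered write path builds on.
 * The helper name and local variables are hypothetical; space
 * reservation, extent locking and error handling are omitted.
 */
static void example_copy_and_dirty(struct btrfs_inode *inode,
				   struct folio *folio, loff_t pos,
				   size_t write_bytes, struct iov_iter *iter)
{
	struct extent_state *cached = NULL;
	size_t copied;

	/* Copy user data into the locked folio. */
	copied = btrfs_copy_from_user(pos, write_bytes, folio, iter);
	if (copied > 0) {
		/* Mark the dirtied range delalloc/uptodate/dirty. */
		btrfs_dirty_folio(inode, folio, pos, copied, &cached, false);
	}
	/* Unlock and release the folio once the write is done with it. */
	btrfs_drop_folio(inode->root->fs_info, folio, pos, copied);
}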
/*
 * this is very complex, but the basic idea is to drop all extents
 * in the range start - end. hint_block is filled in with a block number
 * that would be a good hint to the block allocator for this file.
 *
 * If an extent intersects the range but is not entirely inside the range
 * it is either truncated or split. Anything entirely inside the range
 * is deleted from the tree.
 *
 * Note: the VFS' inode number of bytes is not updated, it's up to the caller
 * to deal with that. We set the field 'bytes_found' of the arguments structure
 * with the number of allocated bytes found in the target range, so that the
 * caller can update the inode's number of bytes in an atomic way when
 * replacing extents in a range to avoid races with stat(2).
 */
int btrfs_drop_extents(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_inode *inode,
		       struct btrfs_drop_extents_args *args)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *fi;
	struct btrfs_key key;
	struct btrfs_key new_key;
	u64 ino = btrfs_ino(inode);
	u64 search_start = args->start;
	u64 extent_offset = 0;
	u64 last_end = args->start;
	int modify_tree = -1;
	struct btrfs_path *path = args->path;

	args->bytes_found = 0;
	args->extent_inserted = false;

	/* Must always have a path if ->replace_extent is true */
	ASSERT(!(args->replace_extent && !args->path));
217 path
= btrfs_alloc_path();
224 if (args
->drop_cache
)
225 btrfs_drop_extent_map_range(inode
, args
->start
, args
->end
- 1, false);
227 if (args
->start
>= inode
->disk_i_size
&& !args
->replace_extent
)
230 update_refs
= (btrfs_root_id(root
) != BTRFS_TREE_LOG_OBJECTID
);
233 ret
= btrfs_lookup_file_extent(trans
, root
, path
, ino
,
234 search_start
, modify_tree
);
237 if (ret
> 0 && path
->slots
[0] > 0 && search_start
== args
->start
) {
238 leaf
= path
->nodes
[0];
239 btrfs_item_key_to_cpu(leaf
, &key
, path
->slots
[0] - 1);
240 if (key
.objectid
== ino
&&
241 key
.type
== BTRFS_EXTENT_DATA_KEY
)
246 leaf
= path
->nodes
[0];
247 if (path
->slots
[0] >= btrfs_header_nritems(leaf
)) {
249 ret
= btrfs_next_leaf(root
, path
);
256 leaf
= path
->nodes
[0];
260 btrfs_item_key_to_cpu(leaf
, &key
, path
->slots
[0]);
262 if (key
.objectid
> ino
)
264 if (WARN_ON_ONCE(key
.objectid
< ino
) ||
265 key
.type
< BTRFS_EXTENT_DATA_KEY
) {
270 if (key
.type
> BTRFS_EXTENT_DATA_KEY
|| key
.offset
>= args
->end
)
273 fi
= btrfs_item_ptr(leaf
, path
->slots
[0],
274 struct btrfs_file_extent_item
);
275 extent_type
= btrfs_file_extent_type(leaf
, fi
);
277 if (extent_type
== BTRFS_FILE_EXTENT_REG
||
278 extent_type
== BTRFS_FILE_EXTENT_PREALLOC
) {
279 disk_bytenr
= btrfs_file_extent_disk_bytenr(leaf
, fi
);
280 num_bytes
= btrfs_file_extent_disk_num_bytes(leaf
, fi
);
281 extent_offset
= btrfs_file_extent_offset(leaf
, fi
);
282 extent_end
= key
.offset
+
283 btrfs_file_extent_num_bytes(leaf
, fi
);
284 } else if (extent_type
== BTRFS_FILE_EXTENT_INLINE
) {
285 extent_end
= key
.offset
+
286 btrfs_file_extent_ram_bytes(leaf
, fi
);
293 * Don't skip extent items representing 0 byte lengths. They
294 * used to be created (bug) if while punching holes we hit
295 * -ENOSPC condition. So if we find one here, just ensure we
296 * delete it, otherwise we would insert a new file extent item
297 * with the same key (offset) as that 0 bytes length file
298 * extent item in the call to setup_items_for_insert() later
301 if (extent_end
== key
.offset
&& extent_end
>= search_start
) {
302 last_end
= extent_end
;
303 goto delete_extent_item
;
306 if (extent_end
<= search_start
) {
312 search_start
= max(key
.offset
, args
->start
);
313 if (recow
|| !modify_tree
) {
315 btrfs_release_path(path
);
320 * | - range to drop - |
321 * | -------- extent -------- |
323 if (args
->start
> key
.offset
&& args
->end
< extent_end
) {
325 if (extent_type
== BTRFS_FILE_EXTENT_INLINE
) {
330 memcpy(&new_key
, &key
, sizeof(new_key
));
331 new_key
.offset
= args
->start
;
332 ret
= btrfs_duplicate_item(trans
, root
, path
,
334 if (ret
== -EAGAIN
) {
335 btrfs_release_path(path
);
341 leaf
= path
->nodes
[0];
342 fi
= btrfs_item_ptr(leaf
, path
->slots
[0] - 1,
343 struct btrfs_file_extent_item
);
344 btrfs_set_file_extent_num_bytes(leaf
, fi
,
345 args
->start
- key
.offset
);
347 fi
= btrfs_item_ptr(leaf
, path
->slots
[0],
348 struct btrfs_file_extent_item
);
350 extent_offset
+= args
->start
- key
.offset
;
351 btrfs_set_file_extent_offset(leaf
, fi
, extent_offset
);
352 btrfs_set_file_extent_num_bytes(leaf
, fi
,
353 extent_end
- args
->start
);
354 btrfs_mark_buffer_dirty(trans
, leaf
);
356 if (update_refs
&& disk_bytenr
> 0) {
357 struct btrfs_ref ref
= {
358 .action
= BTRFS_ADD_DELAYED_REF
,
359 .bytenr
= disk_bytenr
,
360 .num_bytes
= num_bytes
,
362 .owning_root
= btrfs_root_id(root
),
363 .ref_root
= btrfs_root_id(root
),
365 btrfs_init_data_ref(&ref
, new_key
.objectid
,
366 args
->start
- extent_offset
,
368 ret
= btrfs_inc_extent_ref(trans
, &ref
);
370 btrfs_abort_transaction(trans
, ret
);
374 key
.offset
= args
->start
;
377 * From here on out we will have actually dropped something, so
378 * last_end can be updated.
380 last_end
= extent_end
;
383 * | ---- range to drop ----- |
384 * | -------- extent -------- |
386 if (args
->start
<= key
.offset
&& args
->end
< extent_end
) {
387 if (extent_type
== BTRFS_FILE_EXTENT_INLINE
) {
392 memcpy(&new_key
, &key
, sizeof(new_key
));
393 new_key
.offset
= args
->end
;
394 btrfs_set_item_key_safe(trans
, path
, &new_key
);
396 extent_offset
+= args
->end
- key
.offset
;
397 btrfs_set_file_extent_offset(leaf
, fi
, extent_offset
);
398 btrfs_set_file_extent_num_bytes(leaf
, fi
,
399 extent_end
- args
->end
);
400 btrfs_mark_buffer_dirty(trans
, leaf
);
401 if (update_refs
&& disk_bytenr
> 0)
402 args
->bytes_found
+= args
->end
- key
.offset
;
406 search_start
= extent_end
;
408 * | ---- range to drop ----- |
409 * | -------- extent -------- |
411 if (args
->start
> key
.offset
&& args
->end
>= extent_end
) {
413 if (extent_type
== BTRFS_FILE_EXTENT_INLINE
) {
418 btrfs_set_file_extent_num_bytes(leaf
, fi
,
419 args
->start
- key
.offset
);
420 btrfs_mark_buffer_dirty(trans
, leaf
);
421 if (update_refs
&& disk_bytenr
> 0)
422 args
->bytes_found
+= extent_end
- args
->start
;
423 if (args
->end
== extent_end
)
431 * | ---- range to drop ----- |
432 * | ------ extent ------ |
434 if (args
->start
<= key
.offset
&& args
->end
>= extent_end
) {
437 del_slot
= path
->slots
[0];
440 BUG_ON(del_slot
+ del_nr
!= path
->slots
[0]);
445 extent_type
== BTRFS_FILE_EXTENT_INLINE
) {
446 args
->bytes_found
+= extent_end
- key
.offset
;
447 extent_end
= ALIGN(extent_end
,
448 fs_info
->sectorsize
);
449 } else if (update_refs
&& disk_bytenr
> 0) {
450 struct btrfs_ref ref
= {
451 .action
= BTRFS_DROP_DELAYED_REF
,
452 .bytenr
= disk_bytenr
,
453 .num_bytes
= num_bytes
,
455 .owning_root
= btrfs_root_id(root
),
456 .ref_root
= btrfs_root_id(root
),
458 btrfs_init_data_ref(&ref
, key
.objectid
,
459 key
.offset
- extent_offset
,
461 ret
= btrfs_free_extent(trans
, &ref
);
463 btrfs_abort_transaction(trans
, ret
);
466 args
->bytes_found
+= extent_end
- key
.offset
;
469 if (args
->end
== extent_end
)
472 if (path
->slots
[0] + 1 < btrfs_header_nritems(leaf
)) {
477 ret
= btrfs_del_items(trans
, root
, path
, del_slot
,
480 btrfs_abort_transaction(trans
, ret
);
487 btrfs_release_path(path
);
494 if (!ret
&& del_nr
> 0) {
496 * Set path->slots[0] to first slot, so that after the delete
497 * if items are move off from our leaf to its immediate left or
498 * right neighbor leafs, we end up with a correct and adjusted
499 * path->slots[0] for our insertion (if args->replace_extent).
501 path
->slots
[0] = del_slot
;
502 ret
= btrfs_del_items(trans
, root
, path
, del_slot
, del_nr
);
504 btrfs_abort_transaction(trans
, ret
);
507 leaf
= path
->nodes
[0];
509 * If btrfs_del_items() was called, it might have deleted a leaf, in
510 * which case it unlocked our path, so check path->locks[0] matches a
513 if (!ret
&& args
->replace_extent
&&
514 path
->locks
[0] == BTRFS_WRITE_LOCK
&&
515 btrfs_leaf_free_space(leaf
) >=
516 sizeof(struct btrfs_item
) + args
->extent_item_size
) {
519 key
.type
= BTRFS_EXTENT_DATA_KEY
;
520 key
.offset
= args
->start
;
521 if (!del_nr
&& path
->slots
[0] < btrfs_header_nritems(leaf
)) {
522 struct btrfs_key slot_key
;
524 btrfs_item_key_to_cpu(leaf
, &slot_key
, path
->slots
[0]);
525 if (btrfs_comp_cpu_keys(&key
, &slot_key
) > 0)
528 btrfs_setup_item_for_insert(trans
, root
, path
, &key
,
529 args
->extent_item_size
);
530 args
->extent_inserted
= true;
534 btrfs_free_path(path
);
535 else if (!args
->extent_inserted
)
536 btrfs_release_path(path
);
538 args
->drop_end
= found
? min(args
->end
, last_end
) : args
->end
;
543 static int extent_mergeable(struct extent_buffer
*leaf
, int slot
,
544 u64 objectid
, u64 bytenr
, u64 orig_offset
,
545 u64
*start
, u64
*end
)
547 struct btrfs_file_extent_item
*fi
;
548 struct btrfs_key key
;
551 if (slot
< 0 || slot
>= btrfs_header_nritems(leaf
))
554 btrfs_item_key_to_cpu(leaf
, &key
, slot
);
555 if (key
.objectid
!= objectid
|| key
.type
!= BTRFS_EXTENT_DATA_KEY
)
558 fi
= btrfs_item_ptr(leaf
, slot
, struct btrfs_file_extent_item
);
559 if (btrfs_file_extent_type(leaf
, fi
) != BTRFS_FILE_EXTENT_REG
||
560 btrfs_file_extent_disk_bytenr(leaf
, fi
) != bytenr
||
561 btrfs_file_extent_offset(leaf
, fi
) != key
.offset
- orig_offset
||
562 btrfs_file_extent_compression(leaf
, fi
) ||
563 btrfs_file_extent_encryption(leaf
, fi
) ||
564 btrfs_file_extent_other_encoding(leaf
, fi
))
567 extent_end
= key
.offset
+ btrfs_file_extent_num_bytes(leaf
, fi
);
568 if ((*start
&& *start
!= key
.offset
) || (*end
&& *end
!= extent_end
))
577 * Mark extent in the range start - end as written.
579 * This changes extent type from 'pre-allocated' to 'regular'. If only
580 * part of extent is marked as written, the extent will be split into
583 int btrfs_mark_extent_written(struct btrfs_trans_handle
*trans
,
584 struct btrfs_inode
*inode
, u64 start
, u64 end
)
586 struct btrfs_root
*root
= inode
->root
;
587 struct extent_buffer
*leaf
;
588 struct btrfs_path
*path
;
589 struct btrfs_file_extent_item
*fi
;
590 struct btrfs_ref ref
= { 0 };
591 struct btrfs_key key
;
592 struct btrfs_key new_key
;
604 u64 ino
= btrfs_ino(inode
);
606 path
= btrfs_alloc_path();
613 key
.type
= BTRFS_EXTENT_DATA_KEY
;
616 ret
= btrfs_search_slot(trans
, root
, &key
, path
, -1, 1);
619 if (ret
> 0 && path
->slots
[0] > 0)
622 leaf
= path
->nodes
[0];
623 btrfs_item_key_to_cpu(leaf
, &key
, path
->slots
[0]);
624 if (key
.objectid
!= ino
||
625 key
.type
!= BTRFS_EXTENT_DATA_KEY
) {
627 btrfs_abort_transaction(trans
, ret
);
630 fi
= btrfs_item_ptr(leaf
, path
->slots
[0],
631 struct btrfs_file_extent_item
);
632 if (btrfs_file_extent_type(leaf
, fi
) != BTRFS_FILE_EXTENT_PREALLOC
) {
634 btrfs_abort_transaction(trans
, ret
);
637 extent_end
= key
.offset
+ btrfs_file_extent_num_bytes(leaf
, fi
);
638 if (key
.offset
> start
|| extent_end
< end
) {
640 btrfs_abort_transaction(trans
, ret
);
644 bytenr
= btrfs_file_extent_disk_bytenr(leaf
, fi
);
645 num_bytes
= btrfs_file_extent_disk_num_bytes(leaf
, fi
);
646 orig_offset
= key
.offset
- btrfs_file_extent_offset(leaf
, fi
);
647 memcpy(&new_key
, &key
, sizeof(new_key
));
649 if (start
== key
.offset
&& end
< extent_end
) {
652 if (extent_mergeable(leaf
, path
->slots
[0] - 1,
653 ino
, bytenr
, orig_offset
,
654 &other_start
, &other_end
)) {
655 new_key
.offset
= end
;
656 btrfs_set_item_key_safe(trans
, path
, &new_key
);
657 fi
= btrfs_item_ptr(leaf
, path
->slots
[0],
658 struct btrfs_file_extent_item
);
659 btrfs_set_file_extent_generation(leaf
, fi
,
661 btrfs_set_file_extent_num_bytes(leaf
, fi
,
663 btrfs_set_file_extent_offset(leaf
, fi
,
665 fi
= btrfs_item_ptr(leaf
, path
->slots
[0] - 1,
666 struct btrfs_file_extent_item
);
667 btrfs_set_file_extent_generation(leaf
, fi
,
669 btrfs_set_file_extent_num_bytes(leaf
, fi
,
671 btrfs_mark_buffer_dirty(trans
, leaf
);
676 if (start
> key
.offset
&& end
== extent_end
) {
679 if (extent_mergeable(leaf
, path
->slots
[0] + 1,
680 ino
, bytenr
, orig_offset
,
681 &other_start
, &other_end
)) {
682 fi
= btrfs_item_ptr(leaf
, path
->slots
[0],
683 struct btrfs_file_extent_item
);
684 btrfs_set_file_extent_num_bytes(leaf
, fi
,
686 btrfs_set_file_extent_generation(leaf
, fi
,
689 new_key
.offset
= start
;
690 btrfs_set_item_key_safe(trans
, path
, &new_key
);
692 fi
= btrfs_item_ptr(leaf
, path
->slots
[0],
693 struct btrfs_file_extent_item
);
694 btrfs_set_file_extent_generation(leaf
, fi
,
696 btrfs_set_file_extent_num_bytes(leaf
, fi
,
698 btrfs_set_file_extent_offset(leaf
, fi
,
699 start
- orig_offset
);
700 btrfs_mark_buffer_dirty(trans
, leaf
);
705 while (start
> key
.offset
|| end
< extent_end
) {
706 if (key
.offset
== start
)
709 new_key
.offset
= split
;
710 ret
= btrfs_duplicate_item(trans
, root
, path
, &new_key
);
711 if (ret
== -EAGAIN
) {
712 btrfs_release_path(path
);
716 btrfs_abort_transaction(trans
, ret
);
720 leaf
= path
->nodes
[0];
721 fi
= btrfs_item_ptr(leaf
, path
->slots
[0] - 1,
722 struct btrfs_file_extent_item
);
723 btrfs_set_file_extent_generation(leaf
, fi
, trans
->transid
);
724 btrfs_set_file_extent_num_bytes(leaf
, fi
,
727 fi
= btrfs_item_ptr(leaf
, path
->slots
[0],
728 struct btrfs_file_extent_item
);
730 btrfs_set_file_extent_generation(leaf
, fi
, trans
->transid
);
731 btrfs_set_file_extent_offset(leaf
, fi
, split
- orig_offset
);
732 btrfs_set_file_extent_num_bytes(leaf
, fi
,
734 btrfs_mark_buffer_dirty(trans
, leaf
);
736 ref
.action
= BTRFS_ADD_DELAYED_REF
;
738 ref
.num_bytes
= num_bytes
;
740 ref
.owning_root
= btrfs_root_id(root
);
741 ref
.ref_root
= btrfs_root_id(root
);
742 btrfs_init_data_ref(&ref
, ino
, orig_offset
, 0, false);
743 ret
= btrfs_inc_extent_ref(trans
, &ref
);
745 btrfs_abort_transaction(trans
, ret
);
749 if (split
== start
) {
752 if (start
!= key
.offset
) {
754 btrfs_abort_transaction(trans
, ret
);
766 ref
.action
= BTRFS_DROP_DELAYED_REF
;
768 ref
.num_bytes
= num_bytes
;
770 ref
.owning_root
= btrfs_root_id(root
);
771 ref
.ref_root
= btrfs_root_id(root
);
772 btrfs_init_data_ref(&ref
, ino
, orig_offset
, 0, false);
773 if (extent_mergeable(leaf
, path
->slots
[0] + 1,
774 ino
, bytenr
, orig_offset
,
775 &other_start
, &other_end
)) {
777 btrfs_release_path(path
);
780 extent_end
= other_end
;
781 del_slot
= path
->slots
[0] + 1;
783 ret
= btrfs_free_extent(trans
, &ref
);
785 btrfs_abort_transaction(trans
, ret
);
791 if (extent_mergeable(leaf
, path
->slots
[0] - 1,
792 ino
, bytenr
, orig_offset
,
793 &other_start
, &other_end
)) {
795 btrfs_release_path(path
);
798 key
.offset
= other_start
;
799 del_slot
= path
->slots
[0];
801 ret
= btrfs_free_extent(trans
, &ref
);
803 btrfs_abort_transaction(trans
, ret
);
808 fi
= btrfs_item_ptr(leaf
, path
->slots
[0],
809 struct btrfs_file_extent_item
);
810 btrfs_set_file_extent_type(leaf
, fi
,
811 BTRFS_FILE_EXTENT_REG
);
812 btrfs_set_file_extent_generation(leaf
, fi
, trans
->transid
);
813 btrfs_mark_buffer_dirty(trans
, leaf
);
815 fi
= btrfs_item_ptr(leaf
, del_slot
- 1,
816 struct btrfs_file_extent_item
);
817 btrfs_set_file_extent_type(leaf
, fi
,
818 BTRFS_FILE_EXTENT_REG
);
819 btrfs_set_file_extent_generation(leaf
, fi
, trans
->transid
);
820 btrfs_set_file_extent_num_bytes(leaf
, fi
,
821 extent_end
- key
.offset
);
822 btrfs_mark_buffer_dirty(trans
, leaf
);
824 ret
= btrfs_del_items(trans
, root
, path
, del_slot
, del_nr
);
826 btrfs_abort_transaction(trans
, ret
);
831 btrfs_free_path(path
);
/*
 * On error return an unlocked folio and the error value
 * On success return a locked folio and 0
 */
static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64 pos,
				  u64 len, bool force_uptodate)
{
	u64 clamp_start = max_t(u64, pos, folio_pos(folio));
	u64 clamp_end = min_t(u64, pos + len, folio_pos(folio) + folio_size(folio));
	int ret;

	if (folio_test_uptodate(folio))
		return 0;

	if (!force_uptodate &&
	    IS_ALIGNED(clamp_start, PAGE_SIZE) &&
	    IS_ALIGNED(clamp_end, PAGE_SIZE))
		return 0;

	ret = btrfs_read_folio(NULL, folio);
	if (ret)
		return ret;
	folio_lock(folio);
	if (!folio_test_uptodate(folio)) {
		folio_unlock(folio);
		return -EIO;
	}

	/*
	 * Since btrfs_read_folio() will unlock the folio before it returns,
	 * there is a window where btrfs_release_folio() can be called to
	 * release the page. Here we check both inode mapping and page
	 * private to make sure the page was not released.
	 *
	 * The private flag check is essential for subpage as we need to store
	 * extra bitmap using folio private.
	 */
	if (folio->mapping != inode->i_mapping || !folio_test_private(folio)) {
		folio_unlock(folio);
		return -EAGAIN;
	}
	return 0;
}
static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait)
{
	gfp_t gfp;

	gfp = btrfs_alloc_write_mask(inode->i_mapping);
	if (nowait) {
		gfp &= ~__GFP_DIRECT_RECLAIM;
		gfp |= GFP_NOWAIT;
	}
	return gfp;
}
/*
 * Get folio into the page cache and lock it.
 */
static noinline int prepare_one_folio(struct inode *inode, struct folio **folio_ret,
				      loff_t pos, size_t write_bytes,
				      bool force_uptodate, bool nowait)
{
	unsigned long index = pos >> PAGE_SHIFT;
	gfp_t mask = get_prepare_gfp_flags(inode, nowait);
	fgf_t fgp_flags = (nowait ? FGP_WRITEBEGIN | FGP_NOWAIT : FGP_WRITEBEGIN);
	struct folio *folio;
	int ret = 0;

again:
	folio = __filemap_get_folio(inode->i_mapping, index, fgp_flags, mask);
	if (IS_ERR(folio)) {
		if (nowait)
			ret = -EAGAIN;
		else
			ret = PTR_ERR(folio);
		return ret;
	}
	folio_wait_writeback(folio);
	/* Only support page sized folio yet. */
	ASSERT(folio_order(folio) == 0);
	ret = set_folio_extent_mapped(folio);
	if (ret < 0) {
		folio_unlock(folio);
		folio_put(folio);
		return ret;
	}
	ret = prepare_uptodate_folio(inode, folio, pos, write_bytes, force_uptodate);
	if (ret) {
		/* The folio is already unlocked. */
		folio_put(folio);
		if (!nowait && ret == -EAGAIN) {
			ret = 0;
			goto again;
		}
		return ret;
	}
	*folio_ret = folio;
	return 0;
}
/*
 * Locks the extent and properly waits for data=ordered extents to finish
 * before allowing the folios to be modified if need.
 *
 * 1 - the extent is locked
 * 0 - the extent is not locked, and everything is OK
 * -EAGAIN - need to prepare the folios again
 */
static noinline int
lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio,
				loff_t pos, size_t write_bytes,
				u64 *lockstart, u64 *lockend, bool nowait,
				struct extent_state **cached_state)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	u64 start_pos;
	u64 last_pos;
	int ret = 0;

	start_pos = round_down(pos, fs_info->sectorsize);
	last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - 1;

	if (start_pos < inode->vfs_inode.i_size) {
		struct btrfs_ordered_extent *ordered;

		if (nowait) {
			if (!try_lock_extent(&inode->io_tree, start_pos, last_pos,
					     cached_state))
				return -EAGAIN;
		} else {
			lock_extent(&inode->io_tree, start_pos, last_pos, cached_state);
		}

		ordered = btrfs_lookup_ordered_range(inode, start_pos,
						     last_pos - start_pos + 1);
		if (ordered &&
		    ordered->file_offset + ordered->num_bytes > start_pos &&
		    ordered->file_offset <= last_pos) {
			unlock_extent(&inode->io_tree, start_pos, last_pos,
				      cached_state);
			folio_unlock(folio);
			folio_put(folio);
			btrfs_start_ordered_extent(ordered);
			btrfs_put_ordered_extent(ordered);
			return -EAGAIN;
		}
		if (ordered)
			btrfs_put_ordered_extent(ordered);

		*lockstart = start_pos;
		*lockend = last_pos;
		ret = 1;
	}

	/*
	 * We should be called after prepare_one_folio() which should have locked
	 * all pages in the range.
	 */
	WARN_ON(!folio_test_locked(folio));

	return ret;
}
/*
 * Check if we can do nocow write into the range [@pos, @pos + @write_bytes)
 *
 * @pos:         File offset.
 * @write_bytes: The length to write, will be updated to the nocow writeable
 *               range.
 *
 * This function will flush ordered extents in the range to ensure proper
 * nocow checks.
 *
 * Return:
 * > 0          If we can nocow, and updates @write_bytes.
 * 0            If we can't do a nocow write.
 * -EAGAIN      If we can't do a nocow write because snapshoting of the inode's
 *              root is in progress.
 * < 0          If an error happened.
 *
 * NOTE: Callers need to call btrfs_check_nocow_unlock() if we return > 0.
 */
int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
			   size_t *write_bytes, bool nowait)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_root *root = inode->root;
	struct extent_state *cached_state = NULL;
	u64 lockstart, lockend;
	u64 num_bytes;
	int ret;

	if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
		return 0;

	if (!btrfs_drew_try_write_lock(&root->snapshot_lock))
		return -EAGAIN;

	lockstart = round_down(pos, fs_info->sectorsize);
	lockend = round_up(pos + *write_bytes,
			   fs_info->sectorsize) - 1;
	num_bytes = lockend - lockstart + 1;

	if (nowait) {
		if (!btrfs_try_lock_ordered_range(inode, lockstart, lockend,
						  &cached_state)) {
			btrfs_drew_write_unlock(&root->snapshot_lock);
			return -EAGAIN;
		}
	} else {
		btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend,
						   &cached_state);
	}
	ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
			       NULL, nowait, false);
	if (ret <= 0)
		btrfs_drew_write_unlock(&root->snapshot_lock);
	else
		*write_bytes = min_t(size_t, *write_bytes,
				     num_bytes - pos + lockstart);
	unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);

	return ret;
}

void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
{
	btrfs_drew_write_unlock(&inode->root->snapshot_lock);
}
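/*
 * Illustrative sketch, not part of the original file: the expected pairing of
 * btrfs_check_nocow_lock() and btrfs_check_nocow_unlock(). Only a return
 * value > 0 means the nocow lock is held and must be released; in that case
 * write_bytes may have been shrunk to the nocow writeable length. The helper
 * name is hypothetical.
 */
static void example_nocow_pairing(struct btrfs_inode *inode, loff_t pos,
				  size_t write_bytes)
{
	int can_nocow;

	can_nocow = btrfs_check_nocow_lock(inode, pos, &write_bytes, false);
	if (can_nocow > 0) {
		/* Safe to write up to write_bytes without COW reservation. */
		btrfs_check_nocow_unlock(inode);
	}
	/* can_nocow == 0, -EAGAIN or < 0: no nocow lock is held. */
}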
int btrfs_write_check(struct kiocb *iocb, size_t count)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	loff_t pos = iocb->ki_pos;
	int ret;
	loff_t oldsize;
	loff_t start_pos;

	/*
	 * Quickly bail out on NOWAIT writes if we don't have the nodatacow or
	 * prealloc flags, as without those flags we always have to COW. We will
	 * later check if we can really COW into the target range (using
	 * can_nocow_extent() at btrfs_get_blocks_direct_write()).
	 */
	if ((iocb->ki_flags & IOCB_NOWAIT) &&
	    !(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
		return -EAGAIN;

	ret = file_remove_privs(file);
	if (ret)
		return ret;

	/*
	 * We reserve space for updating the inode when we reserve space for the
	 * extent we are going to write, so we will enospc out there. We don't
	 * need to start yet another transaction to update the inode as we will
	 * update the inode when we finish writing whatever data we write.
	 */
	if (!IS_NOCMTIME(inode)) {
		inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
		inode_inc_iversion(inode);
	}

	start_pos = round_down(pos, fs_info->sectorsize);
	oldsize = i_size_read(inode);
	if (start_pos > oldsize) {
		/* Expand hole size to cover write data, preventing empty gap */
		loff_t end_pos = round_up(pos + count, fs_info->sectorsize);

		ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, end_pos);
		if (ret)
			return ret;
	}

	return 0;
}
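/*
 * Illustrative sketch, not part of the original file: the order of checks a
 * write path performs before copying any data, mirroring what
 * btrfs_buffered_write() below does. The helper name is hypothetical.
 */
static ssize_t example_write_checks(struct kiocb *iocb, struct iov_iter *from)
{
	ssize_t count;
	int ret;

	/* Clamp the write against limits and validate iocb state. */
	count = generic_write_checks(iocb, from);
	if (count <= 0)
		return count;

	/* NOWAIT bail-out, privilege stripping, c/mtime update, EOF hole expansion. */
	ret = btrfs_write_check(iocb, count);
	if (ret < 0)
		return ret;

	return count;
}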
1120 ssize_t
btrfs_buffered_write(struct kiocb
*iocb
, struct iov_iter
*i
)
1122 struct file
*file
= iocb
->ki_filp
;
1124 struct inode
*inode
= file_inode(file
);
1125 struct btrfs_fs_info
*fs_info
= inode_to_fs_info(inode
);
1126 struct extent_changeset
*data_reserved
= NULL
;
1127 u64 release_bytes
= 0;
1130 size_t num_written
= 0;
1132 loff_t old_isize
= i_size_read(inode
);
1133 unsigned int ilock_flags
= 0;
1134 const bool nowait
= (iocb
->ki_flags
& IOCB_NOWAIT
);
1135 unsigned int bdp_flags
= (nowait
? BDP_ASYNC
: 0);
1136 bool only_release_metadata
= false;
1139 ilock_flags
|= BTRFS_ILOCK_TRY
;
1141 ret
= btrfs_inode_lock(BTRFS_I(inode
), ilock_flags
);
1145 ret
= generic_write_checks(iocb
, i
);
1149 ret
= btrfs_write_check(iocb
, ret
);
1154 while (iov_iter_count(i
) > 0) {
1155 struct extent_state
*cached_state
= NULL
;
1156 size_t offset
= offset_in_page(pos
);
1157 size_t sector_offset
;
1158 size_t write_bytes
= min(iov_iter_count(i
), PAGE_SIZE
- offset
);
1159 size_t reserve_bytes
;
1161 size_t dirty_sectors
;
1163 struct folio
*folio
= NULL
;
1165 bool force_page_uptodate
= false;
1168 * Fault pages before locking them in prepare_one_folio()
1169 * to avoid recursive lock
1171 if (unlikely(fault_in_iov_iter_readable(i
, write_bytes
))) {
1176 only_release_metadata
= false;
1177 sector_offset
= pos
& (fs_info
->sectorsize
- 1);
1179 extent_changeset_release(data_reserved
);
1180 ret
= btrfs_check_data_free_space(BTRFS_I(inode
),
1181 &data_reserved
, pos
,
1182 write_bytes
, nowait
);
1186 if (nowait
&& (ret
== -ENOSPC
|| ret
== -EAGAIN
)) {
1192 * If we don't have to COW at the offset, reserve
1193 * metadata only. write_bytes may get smaller than
1196 can_nocow
= btrfs_check_nocow_lock(BTRFS_I(inode
), pos
,
1197 &write_bytes
, nowait
);
1204 only_release_metadata
= true;
1207 reserve_bytes
= round_up(write_bytes
+ sector_offset
,
1208 fs_info
->sectorsize
);
1209 WARN_ON(reserve_bytes
== 0);
1210 ret
= btrfs_delalloc_reserve_metadata(BTRFS_I(inode
),
1212 reserve_bytes
, nowait
);
1214 if (!only_release_metadata
)
1215 btrfs_free_reserved_data_space(BTRFS_I(inode
),
1219 btrfs_check_nocow_unlock(BTRFS_I(inode
));
1221 if (nowait
&& ret
== -ENOSPC
)
1226 release_bytes
= reserve_bytes
;
1228 ret
= balance_dirty_pages_ratelimited_flags(inode
->i_mapping
, bdp_flags
);
1230 btrfs_delalloc_release_extents(BTRFS_I(inode
), reserve_bytes
);
1234 ret
= prepare_one_folio(inode
, &folio
, pos
, write_bytes
,
1235 force_page_uptodate
, false);
1237 btrfs_delalloc_release_extents(BTRFS_I(inode
),
1242 extents_locked
= lock_and_cleanup_extent_if_need(BTRFS_I(inode
),
1243 folio
, pos
, write_bytes
, &lockstart
,
1244 &lockend
, nowait
, &cached_state
);
1245 if (extents_locked
< 0) {
1246 if (!nowait
&& extents_locked
== -EAGAIN
)
1249 btrfs_delalloc_release_extents(BTRFS_I(inode
),
1251 ret
= extents_locked
;
1255 copied
= btrfs_copy_from_user(pos
, write_bytes
, folio
, i
);
1257 num_sectors
= BTRFS_BYTES_TO_BLKS(fs_info
, reserve_bytes
);
1258 dirty_sectors
= round_up(copied
+ sector_offset
,
1259 fs_info
->sectorsize
);
1260 dirty_sectors
= BTRFS_BYTES_TO_BLKS(fs_info
, dirty_sectors
);
1263 force_page_uptodate
= true;
1266 force_page_uptodate
= false;
1269 if (num_sectors
> dirty_sectors
) {
1270 /* release everything except the sectors we dirtied */
1271 release_bytes
-= dirty_sectors
<< fs_info
->sectorsize_bits
;
1272 if (only_release_metadata
) {
1273 btrfs_delalloc_release_metadata(BTRFS_I(inode
),
1274 release_bytes
, true);
1276 u64 release_start
= round_up(pos
+ copied
,
1277 fs_info
->sectorsize
);
1278 btrfs_delalloc_release_space(BTRFS_I(inode
),
1279 data_reserved
, release_start
,
1280 release_bytes
, true);
1284 release_bytes
= round_up(copied
+ sector_offset
,
1285 fs_info
->sectorsize
);
1287 ret
= btrfs_dirty_folio(BTRFS_I(inode
), folio
, pos
, copied
,
1288 &cached_state
, only_release_metadata
);
1291 * If we have not locked the extent range, because the range's
1292 * start offset is >= i_size, we might still have a non-NULL
1293 * cached extent state, acquired while marking the extent range
1294 * as delalloc through btrfs_dirty_page(). Therefore free any
1295 * possible cached extent state to avoid a memory leak.
1298 unlock_extent(&BTRFS_I(inode
)->io_tree
, lockstart
,
1299 lockend
, &cached_state
);
1301 free_extent_state(cached_state
);
1303 btrfs_delalloc_release_extents(BTRFS_I(inode
), reserve_bytes
);
1305 btrfs_drop_folio(fs_info
, folio
, pos
, copied
);
1310 if (only_release_metadata
)
1311 btrfs_check_nocow_unlock(BTRFS_I(inode
));
1313 btrfs_drop_folio(fs_info
, folio
, pos
, copied
);
1318 num_written
+= copied
;
1321 if (release_bytes
) {
1322 if (only_release_metadata
) {
1323 btrfs_check_nocow_unlock(BTRFS_I(inode
));
1324 btrfs_delalloc_release_metadata(BTRFS_I(inode
),
1325 release_bytes
, true);
1327 btrfs_delalloc_release_space(BTRFS_I(inode
),
1329 round_down(pos
, fs_info
->sectorsize
),
1330 release_bytes
, true);
1334 extent_changeset_free(data_reserved
);
1335 if (num_written
> 0) {
1336 pagecache_isize_extended(inode
, old_isize
, iocb
->ki_pos
);
1337 iocb
->ki_pos
+= num_written
;
1340 btrfs_inode_unlock(BTRFS_I(inode
), ilock_flags
);
1341 return num_written
? num_written
: ret
;
1344 static ssize_t
btrfs_encoded_write(struct kiocb
*iocb
, struct iov_iter
*from
,
1345 const struct btrfs_ioctl_encoded_io_args
*encoded
)
1347 struct file
*file
= iocb
->ki_filp
;
1348 struct inode
*inode
= file_inode(file
);
1352 btrfs_inode_lock(BTRFS_I(inode
), 0);
1353 count
= encoded
->len
;
1354 ret
= generic_write_checks_count(iocb
, &count
);
1355 if (ret
== 0 && count
!= encoded
->len
) {
1357 * The write got truncated by generic_write_checks_count(). We
1358 * can't do a partial encoded write.
1362 if (ret
|| encoded
->len
== 0)
1365 ret
= btrfs_write_check(iocb
, encoded
->len
);
1369 ret
= btrfs_do_encoded_write(iocb
, from
, encoded
);
1371 btrfs_inode_unlock(BTRFS_I(inode
), 0);
1375 ssize_t
btrfs_do_write_iter(struct kiocb
*iocb
, struct iov_iter
*from
,
1376 const struct btrfs_ioctl_encoded_io_args
*encoded
)
1378 struct file
*file
= iocb
->ki_filp
;
1379 struct btrfs_inode
*inode
= BTRFS_I(file_inode(file
));
1380 ssize_t num_written
, num_sync
;
1383 * If the fs flips readonly due to some impossible error, although we
1384 * have opened a file as writable, we have to stop this write operation
1385 * to ensure consistency.
1387 if (BTRFS_FS_ERROR(inode
->root
->fs_info
))
1390 if (encoded
&& (iocb
->ki_flags
& IOCB_NOWAIT
))
1394 num_written
= btrfs_encoded_write(iocb
, from
, encoded
);
1395 num_sync
= encoded
->len
;
1396 } else if (iocb
->ki_flags
& IOCB_DIRECT
) {
1397 num_written
= btrfs_direct_write(iocb
, from
);
1398 num_sync
= num_written
;
1400 num_written
= btrfs_buffered_write(iocb
, from
);
1401 num_sync
= num_written
;
1404 btrfs_set_inode_last_sub_trans(inode
);
1407 num_sync
= generic_write_sync(iocb
, num_sync
);
1409 num_written
= num_sync
;
1415 static ssize_t
btrfs_file_write_iter(struct kiocb
*iocb
, struct iov_iter
*from
)
1417 return btrfs_do_write_iter(iocb
, from
, NULL
);
1420 int btrfs_release_file(struct inode
*inode
, struct file
*filp
)
1422 struct btrfs_file_private
*private = filp
->private_data
;
1425 kfree(private->filldir_buf
);
1426 free_extent_state(private->llseek_cached_state
);
1428 filp
->private_data
= NULL
;
1432 * Set by setattr when we are about to truncate a file from a non-zero
1433 * size to a zero size. This tries to flush down new bytes that may
1434 * have been written if the application were using truncate to replace
1437 if (test_and_clear_bit(BTRFS_INODE_FLUSH_ON_CLOSE
,
1438 &BTRFS_I(inode
)->runtime_flags
))
1439 filemap_flush(inode
->i_mapping
);
1443 static int start_ordered_ops(struct btrfs_inode
*inode
, loff_t start
, loff_t end
)
1446 struct blk_plug plug
;
1449 * This is only called in fsync, which would do synchronous writes, so
1450 * a plug can merge adjacent IOs as much as possible. Esp. in case of
1451 * multiple disks using raid profile, a large IO can be split to
1452 * several segments of stripe length (currently 64K).
1454 blk_start_plug(&plug
);
1455 ret
= btrfs_fdatawrite_range(inode
, start
, end
);
1456 blk_finish_plug(&plug
);
1461 static inline bool skip_inode_logging(const struct btrfs_log_ctx
*ctx
)
1463 struct btrfs_inode
*inode
= ctx
->inode
;
1464 struct btrfs_fs_info
*fs_info
= inode
->root
->fs_info
;
1466 if (btrfs_inode_in_log(inode
, btrfs_get_fs_generation(fs_info
)) &&
1467 list_empty(&ctx
->ordered_extents
))
1471 * If we are doing a fast fsync we can not bail out if the inode's
1472 * last_trans is <= then the last committed transaction, because we only
1473 * update the last_trans of the inode during ordered extent completion,
1474 * and for a fast fsync we don't wait for that, we only wait for the
1475 * writeback to complete.
1477 if (inode
->last_trans
<= btrfs_get_last_trans_committed(fs_info
) &&
1478 (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC
, &inode
->runtime_flags
) ||
1479 list_empty(&ctx
->ordered_extents
)))
1486 * fsync call for both files and directories. This logs the inode into
1487 * the tree log instead of forcing full commits whenever possible.
1489 * It needs to call filemap_fdatawait so that all ordered extent updates are
1490 * in the metadata btree are up to date for copying to the log.
1492 * It drops the inode mutex before doing the tree log commit. This is an
1493 * important optimization for directories because holding the mutex prevents
1494 * new operations on the dir while we write to disk.
1496 int btrfs_sync_file(struct file
*file
, loff_t start
, loff_t end
, int datasync
)
1498 struct dentry
*dentry
= file_dentry(file
);
1499 struct btrfs_inode
*inode
= BTRFS_I(d_inode(dentry
));
1500 struct btrfs_root
*root
= inode
->root
;
1501 struct btrfs_fs_info
*fs_info
= root
->fs_info
;
1502 struct btrfs_trans_handle
*trans
;
1503 struct btrfs_log_ctx ctx
;
1507 bool skip_ilock
= false;
1509 if (current
->journal_info
== BTRFS_TRANS_DIO_WRITE_STUB
) {
1511 current
->journal_info
= NULL
;
1512 btrfs_assert_inode_locked(inode
);
1515 trace_btrfs_sync_file(file
, datasync
);
1517 btrfs_init_log_ctx(&ctx
, inode
);
1520 * Always set the range to a full range, otherwise we can get into
1521 * several problems, from missing file extent items to represent holes
1522 * when not using the NO_HOLES feature, to log tree corruption due to
1523 * races between hole detection during logging and completion of ordered
1524 * extents outside the range, to missing checksums due to ordered extents
1525 * for which we flushed only a subset of their pages.
1529 len
= (u64
)LLONG_MAX
+ 1;
1532 * We write the dirty pages in the range and wait until they complete
1533 * out of the ->i_mutex. If so, we can flush the dirty pages by
1534 * multi-task, and make the performance up. See
1535 * btrfs_wait_ordered_range for an explanation of the ASYNC check.
1537 ret
= start_ordered_ops(inode
, start
, end
);
1542 down_write(&inode
->i_mmap_lock
);
1544 btrfs_inode_lock(inode
, BTRFS_ILOCK_MMAP
);
1546 atomic_inc(&root
->log_batch
);
1549 * Before we acquired the inode's lock and the mmap lock, someone may
1550 * have dirtied more pages in the target range. We need to make sure
1551 * that writeback for any such pages does not start while we are logging
1552 * the inode, because if it does, any of the following might happen when
1553 * we are not doing a full inode sync:
1555 * 1) We log an extent after its writeback finishes but before its
1556 * checksums are added to the csum tree, leading to -EIO errors
1557 * when attempting to read the extent after a log replay.
1559 * 2) We can end up logging an extent before its writeback finishes.
1560 * Therefore after the log replay we will have a file extent item
1561 * pointing to an unwritten extent (and no data checksums as well).
1563 * So trigger writeback for any eventual new dirty pages and then we
1564 * wait for all ordered extents to complete below.
1566 ret
= start_ordered_ops(inode
, start
, end
);
1569 up_write(&inode
->i_mmap_lock
);
1571 btrfs_inode_unlock(inode
, BTRFS_ILOCK_MMAP
);
1576 * Always check for the full sync flag while holding the inode's lock,
1577 * to avoid races with other tasks. The flag must be either set all the
1578 * time during logging or always off all the time while logging.
1579 * We check the flag here after starting delalloc above, because when
1580 * running delalloc the full sync flag may be set if we need to drop
1581 * extra extent map ranges due to temporary memory allocation failures.
1583 full_sync
= test_bit(BTRFS_INODE_NEEDS_FULL_SYNC
, &inode
->runtime_flags
);
1586 * We have to do this here to avoid the priority inversion of waiting on
1587 * IO of a lower priority task while holding a transaction open.
1589 * For a full fsync we wait for the ordered extents to complete while
1590 * for a fast fsync we wait just for writeback to complete, and then
1591 * attach the ordered extents to the transaction so that a transaction
1592 * commit waits for their completion, to avoid data loss if we fsync,
1593 * the current transaction commits before the ordered extents complete
1594 * and a power failure happens right after that.
1596 * For zoned filesystem, if a write IO uses a ZONE_APPEND command, the
1597 * logical address recorded in the ordered extent may change. We need
1598 * to wait for the IO to stabilize the logical address.
1600 if (full_sync
|| btrfs_is_zoned(fs_info
)) {
1601 ret
= btrfs_wait_ordered_range(inode
, start
, len
);
1602 clear_bit(BTRFS_INODE_COW_WRITE_ERROR
, &inode
->runtime_flags
);
1605 * Get our ordered extents as soon as possible to avoid doing
1606 * checksum lookups in the csum tree, and use instead the
1607 * checksums attached to the ordered extents.
1609 btrfs_get_ordered_extents_for_logging(inode
, &ctx
.ordered_extents
);
1610 ret
= filemap_fdatawait_range(inode
->vfs_inode
.i_mapping
, start
, end
);
1612 goto out_release_extents
;
1615 * Check and clear the BTRFS_INODE_COW_WRITE_ERROR now after
1616 * starting and waiting for writeback, because for buffered IO
1617 * it may have been set during the end IO callback
1618 * (end_bbio_data_write() -> btrfs_finish_ordered_extent()) in
1619 * case an error happened and we need to wait for ordered
1620 * extents to complete so that any extent maps that point to
1621 * unwritten locations are dropped and we don't log them.
1623 if (test_and_clear_bit(BTRFS_INODE_COW_WRITE_ERROR
, &inode
->runtime_flags
))
1624 ret
= btrfs_wait_ordered_range(inode
, start
, len
);
1628 goto out_release_extents
;
1630 atomic_inc(&root
->log_batch
);
1632 if (skip_inode_logging(&ctx
)) {
1634 * We've had everything committed since the last time we were
1635 * modified so clear this flag in case it was set for whatever
1636 * reason, it's no longer relevant.
1638 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC
, &inode
->runtime_flags
);
1640 * An ordered extent might have started before and completed
1641 * already with io errors, in which case the inode was not
1642 * updated and we end up here. So check the inode's mapping
1643 * for any errors that might have happened since we last
1644 * checked called fsync.
1646 ret
= filemap_check_wb_err(inode
->vfs_inode
.i_mapping
, file
->f_wb_err
);
1647 goto out_release_extents
;
1650 btrfs_init_log_ctx_scratch_eb(&ctx
);
1653 * We use start here because we will need to wait on the IO to complete
1654 * in btrfs_sync_log, which could require joining a transaction (for
1655 * example checking cross references in the nocow path). If we use join
1656 * here we could get into a situation where we're waiting on IO to
1657 * happen that is blocked on a transaction trying to commit. With start
1658 * we inc the extwriter counter, so we wait for all extwriters to exit
1659 * before we start blocking joiners. This comment is to keep somebody
1660 * from thinking they are super smart and changing this to
1661 * btrfs_join_transaction *cough*Josef*cough*.
1663 trans
= btrfs_start_transaction(root
, 0);
1664 if (IS_ERR(trans
)) {
1665 ret
= PTR_ERR(trans
);
1666 goto out_release_extents
;
1668 trans
->in_fsync
= true;
1670 ret
= btrfs_log_dentry_safe(trans
, dentry
, &ctx
);
1672 * Scratch eb no longer needed, release before syncing log or commit
1673 * transaction, to avoid holding unnecessary memory during such long
1676 if (ctx
.scratch_eb
) {
1677 free_extent_buffer(ctx
.scratch_eb
);
1678 ctx
.scratch_eb
= NULL
;
1680 btrfs_release_log_ctx_extents(&ctx
);
1682 /* Fallthrough and commit/free transaction. */
1683 ret
= BTRFS_LOG_FORCE_COMMIT
;
1686 /* we've logged all the items and now have a consistent
1687 * version of the file in the log. It is possible that
1688 * someone will come in and modify the file, but that's
1689 * fine because the log is consistent on disk, and we
1690 * have references to all of the file's extents
1692 * It is possible that someone will come in and log the
1693 * file again, but that will end up using the synchronization
1694 * inside btrfs_sync_log to keep things safe.
1697 up_write(&inode
->i_mmap_lock
);
1699 btrfs_inode_unlock(inode
, BTRFS_ILOCK_MMAP
);
1701 if (ret
== BTRFS_NO_LOG_SYNC
) {
1702 ret
= btrfs_end_transaction(trans
);
1706 /* We successfully logged the inode, attempt to sync the log. */
1708 ret
= btrfs_sync_log(trans
, root
, &ctx
);
1710 ret
= btrfs_end_transaction(trans
);
1716 * At this point we need to commit the transaction because we had
1717 * btrfs_need_log_full_commit() or some other error.
1719 * If we didn't do a full sync we have to stop the trans handle, wait on
1720 * the ordered extents, start it again and commit the transaction. If
1721 * we attempt to wait on the ordered extents here we could deadlock with
1722 * something like fallocate() that is holding the extent lock trying to
1723 * start a transaction while some other thread is trying to commit the
1724 * transaction while we (fsync) are currently holding the transaction
1728 ret
= btrfs_end_transaction(trans
);
1731 ret
= btrfs_wait_ordered_range(inode
, start
, len
);
1736 * This is safe to use here because we're only interested in
1737 * making sure the transaction that had the ordered extents is
1738 * committed. We aren't waiting on anything past this point,
1739 * we're purely getting the transaction and committing it.
1741 trans
= btrfs_attach_transaction_barrier(root
);
1742 if (IS_ERR(trans
)) {
1743 ret
= PTR_ERR(trans
);
1746 * We committed the transaction and there's no currently
1747 * running transaction, this means everything we care
1748 * about made it to disk and we are done.
1756 ret
= btrfs_commit_transaction(trans
);
1758 free_extent_buffer(ctx
.scratch_eb
);
1759 ASSERT(list_empty(&ctx
.list
));
1760 ASSERT(list_empty(&ctx
.conflict_inodes
));
1761 err
= file_check_and_advance_wb_err(file
);
1764 return ret
> 0 ? -EIO
: ret
;
1766 out_release_extents
:
1767 btrfs_release_log_ctx_extents(&ctx
);
1769 up_write(&inode
->i_mmap_lock
);
1771 btrfs_inode_unlock(inode
, BTRFS_ILOCK_MMAP
);
1776 * btrfs_page_mkwrite() is not allowed to change the file size as it gets
1777 * called from a page fault handler when a page is first dirtied. Hence we must
1778 * be careful to check for EOF conditions here. We set the page up correctly
1779 * for a written page which means we get ENOSPC checking when writing into
1780 * holes and correct delalloc and unwritten extent mapping on filesystems that
1781 * support these features.
1783 * We are not allowed to take the i_mutex here so we have to play games to
1784 * protect against truncate races as the page could now be beyond EOF. Because
1785 * truncate_setsize() writes the inode size before removing pages, once we have
1786 * the page lock we can determine safely if the page is beyond EOF. If it is not
1787 * beyond EOF, then the page is guaranteed safe against truncation until we
1790 static vm_fault_t
btrfs_page_mkwrite(struct vm_fault
*vmf
)
1792 struct page
*page
= vmf
->page
;
1793 struct folio
*folio
= page_folio(page
);
1794 struct inode
*inode
= file_inode(vmf
->vma
->vm_file
);
1795 struct btrfs_fs_info
*fs_info
= inode_to_fs_info(inode
);
1796 struct extent_io_tree
*io_tree
= &BTRFS_I(inode
)->io_tree
;
1797 struct btrfs_ordered_extent
*ordered
;
1798 struct extent_state
*cached_state
= NULL
;
1799 struct extent_changeset
*data_reserved
= NULL
;
1800 unsigned long zero_start
;
1810 ASSERT(folio_order(folio
) == 0);
1812 reserved_space
= PAGE_SIZE
;
1814 sb_start_pagefault(inode
->i_sb
);
1815 page_start
= folio_pos(folio
);
1816 page_end
= page_start
+ folio_size(folio
) - 1;
1820 * Reserving delalloc space after obtaining the page lock can lead to
1821 * deadlock. For example, if a dirty page is locked by this function
1822 * and the call to btrfs_delalloc_reserve_space() ends up triggering
1823 * dirty page write out, then the btrfs_writepages() function could
1824 * end up waiting indefinitely to get a lock on the page currently
1825 * being processed by btrfs_page_mkwrite() function.
1827 ret2
= btrfs_delalloc_reserve_space(BTRFS_I(inode
), &data_reserved
,
1828 page_start
, reserved_space
);
1830 ret2
= file_update_time(vmf
->vma
->vm_file
);
1834 ret
= vmf_error(ret2
);
1840 /* Make the VM retry the fault. */
1841 ret
= VM_FAULT_NOPAGE
;
1843 down_read(&BTRFS_I(inode
)->i_mmap_lock
);
1845 size
= i_size_read(inode
);
1847 if ((folio
->mapping
!= inode
->i_mapping
) ||
1848 (page_start
>= size
)) {
1849 /* Page got truncated out from underneath us. */
1852 folio_wait_writeback(folio
);
1854 lock_extent(io_tree
, page_start
, page_end
, &cached_state
);
1855 ret2
= set_folio_extent_mapped(folio
);
1857 ret
= vmf_error(ret2
);
1858 unlock_extent(io_tree
, page_start
, page_end
, &cached_state
);
1863 * We can't set the delalloc bits if there are pending ordered
1864 * extents. Drop our locks and wait for them to finish.
1866 ordered
= btrfs_lookup_ordered_range(BTRFS_I(inode
), page_start
, PAGE_SIZE
);
1868 unlock_extent(io_tree
, page_start
, page_end
, &cached_state
);
1869 folio_unlock(folio
);
1870 up_read(&BTRFS_I(inode
)->i_mmap_lock
);
1871 btrfs_start_ordered_extent(ordered
);
1872 btrfs_put_ordered_extent(ordered
);
1876 if (folio
->index
== ((size
- 1) >> PAGE_SHIFT
)) {
1877 reserved_space
= round_up(size
- page_start
, fs_info
->sectorsize
);
1878 if (reserved_space
< PAGE_SIZE
) {
1879 end
= page_start
+ reserved_space
- 1;
1880 btrfs_delalloc_release_space(BTRFS_I(inode
),
1881 data_reserved
, page_start
,
1882 PAGE_SIZE
- reserved_space
, true);
1887 * page_mkwrite gets called when the page is firstly dirtied after it's
1888 * faulted in, but write(2) could also dirty a page and set delalloc
1889 * bits, thus in this case for space account reason, we still need to
1890 * clear any delalloc bits within this page range since we have to
1891 * reserve data&meta space before lock_page() (see above comments).
1893 clear_extent_bit(&BTRFS_I(inode
)->io_tree
, page_start
, end
,
1894 EXTENT_DELALLOC
| EXTENT_DO_ACCOUNTING
|
1895 EXTENT_DEFRAG
, &cached_state
);
1897 ret2
= btrfs_set_extent_delalloc(BTRFS_I(inode
), page_start
, end
, 0,
1900 unlock_extent(io_tree
, page_start
, page_end
, &cached_state
);
1901 ret
= VM_FAULT_SIGBUS
;
1905 /* Page is wholly or partially inside EOF. */
1906 if (page_start
+ folio_size(folio
) > size
)
1907 zero_start
= offset_in_folio(folio
, size
);
1909 zero_start
= PAGE_SIZE
;
1911 if (zero_start
!= PAGE_SIZE
)
1912 folio_zero_range(folio
, zero_start
, folio_size(folio
) - zero_start
);
1914 btrfs_folio_clear_checked(fs_info
, folio
, page_start
, PAGE_SIZE
);
1915 btrfs_folio_set_dirty(fs_info
, folio
, page_start
, end
+ 1 - page_start
);
1916 btrfs_folio_set_uptodate(fs_info
, folio
, page_start
, end
+ 1 - page_start
);
1918 btrfs_set_inode_last_sub_trans(BTRFS_I(inode
));
1920 unlock_extent(io_tree
, page_start
, page_end
, &cached_state
);
1921 up_read(&BTRFS_I(inode
)->i_mmap_lock
);
1923 btrfs_delalloc_release_extents(BTRFS_I(inode
), PAGE_SIZE
);
1924 sb_end_pagefault(inode
->i_sb
);
1925 extent_changeset_free(data_reserved
);
1926 return VM_FAULT_LOCKED
;
1929 folio_unlock(folio
);
1930 up_read(&BTRFS_I(inode
)->i_mmap_lock
);
1932 btrfs_delalloc_release_extents(BTRFS_I(inode
), PAGE_SIZE
);
1933 btrfs_delalloc_release_space(BTRFS_I(inode
), data_reserved
, page_start
,
1934 reserved_space
, (ret
!= 0));
1936 sb_end_pagefault(inode
->i_sb
);
1937 extent_changeset_free(data_reserved
);
1941 static const struct vm_operations_struct btrfs_file_vm_ops
= {
1942 .fault
= filemap_fault
,
1943 .map_pages
= filemap_map_pages
,
1944 .page_mkwrite
= btrfs_page_mkwrite
,
1947 static int btrfs_file_mmap(struct file
*filp
, struct vm_area_struct
*vma
)
1949 struct address_space
*mapping
= filp
->f_mapping
;
1951 if (!mapping
->a_ops
->read_folio
)
1954 file_accessed(filp
);
1955 vma
->vm_ops
= &btrfs_file_vm_ops
;
1960 static int hole_mergeable(struct btrfs_inode
*inode
, struct extent_buffer
*leaf
,
1961 int slot
, u64 start
, u64 end
)
1963 struct btrfs_file_extent_item
*fi
;
1964 struct btrfs_key key
;
1966 if (slot
< 0 || slot
>= btrfs_header_nritems(leaf
))
1969 btrfs_item_key_to_cpu(leaf
, &key
, slot
);
1970 if (key
.objectid
!= btrfs_ino(inode
) ||
1971 key
.type
!= BTRFS_EXTENT_DATA_KEY
)
1974 fi
= btrfs_item_ptr(leaf
, slot
, struct btrfs_file_extent_item
);
1976 if (btrfs_file_extent_type(leaf
, fi
) != BTRFS_FILE_EXTENT_REG
)
1979 if (btrfs_file_extent_disk_bytenr(leaf
, fi
))
1982 if (key
.offset
== end
)
1984 if (key
.offset
+ btrfs_file_extent_num_bytes(leaf
, fi
) == start
)
1989 static int fill_holes(struct btrfs_trans_handle
*trans
,
1990 struct btrfs_inode
*inode
,
1991 struct btrfs_path
*path
, u64 offset
, u64 end
)
1993 struct btrfs_fs_info
*fs_info
= trans
->fs_info
;
1994 struct btrfs_root
*root
= inode
->root
;
1995 struct extent_buffer
*leaf
;
1996 struct btrfs_file_extent_item
*fi
;
1997 struct extent_map
*hole_em
;
1998 struct btrfs_key key
;
2001 if (btrfs_fs_incompat(fs_info
, NO_HOLES
))
2004 key
.objectid
= btrfs_ino(inode
);
2005 key
.type
= BTRFS_EXTENT_DATA_KEY
;
2006 key
.offset
= offset
;
2008 ret
= btrfs_search_slot(trans
, root
, &key
, path
, 0, 1);
2011 * We should have dropped this offset, so if we find it then
2012 * something has gone horribly wrong.
2019 leaf
= path
->nodes
[0];
2020 if (hole_mergeable(inode
, leaf
, path
->slots
[0] - 1, offset
, end
)) {
2024 fi
= btrfs_item_ptr(leaf
, path
->slots
[0],
2025 struct btrfs_file_extent_item
);
2026 num_bytes
= btrfs_file_extent_num_bytes(leaf
, fi
) +
2028 btrfs_set_file_extent_num_bytes(leaf
, fi
, num_bytes
);
2029 btrfs_set_file_extent_ram_bytes(leaf
, fi
, num_bytes
);
2030 btrfs_set_file_extent_offset(leaf
, fi
, 0);
2031 btrfs_set_file_extent_generation(leaf
, fi
, trans
->transid
);
2032 btrfs_mark_buffer_dirty(trans
, leaf
);
2036 if (hole_mergeable(inode
, leaf
, path
->slots
[0], offset
, end
)) {
2039 key
.offset
= offset
;
2040 btrfs_set_item_key_safe(trans
, path
, &key
);
2041 fi
= btrfs_item_ptr(leaf
, path
->slots
[0],
2042 struct btrfs_file_extent_item
);
2043 num_bytes
= btrfs_file_extent_num_bytes(leaf
, fi
) + end
-
2045 btrfs_set_file_extent_num_bytes(leaf
, fi
, num_bytes
);
2046 btrfs_set_file_extent_ram_bytes(leaf
, fi
, num_bytes
);
2047 btrfs_set_file_extent_offset(leaf
, fi
, 0);
2048 btrfs_set_file_extent_generation(leaf
, fi
, trans
->transid
);
2049 btrfs_mark_buffer_dirty(trans
, leaf
);
2052 btrfs_release_path(path
);
2054 ret
= btrfs_insert_hole_extent(trans
, root
, btrfs_ino(inode
), offset
,
2060 btrfs_release_path(path
);
2062 hole_em
= alloc_extent_map();
2064 btrfs_drop_extent_map_range(inode
, offset
, end
- 1, false);
2065 btrfs_set_inode_full_sync(inode
);
2067 hole_em
->start
= offset
;
2068 hole_em
->len
= end
- offset
;
2069 hole_em
->ram_bytes
= hole_em
->len
;
2071 hole_em
->disk_bytenr
= EXTENT_MAP_HOLE
;
2072 hole_em
->disk_num_bytes
= 0;
2073 hole_em
->generation
= trans
->transid
;
2075 ret
= btrfs_replace_extent_map_range(inode
, hole_em
, true);
2076 free_extent_map(hole_em
);
2078 btrfs_set_inode_full_sync(inode
);
2085 * Find a hole extent on given inode and change start/len to the end of hole
2086 * extent.(hole/vacuum extent whose em->start <= start &&
2087 * em->start + em->len > start)
2088 * When a hole extent is found, return 1 and modify start/len.
2090 static int find_first_non_hole(struct btrfs_inode
*inode
, u64
*start
, u64
*len
)
2092 struct btrfs_fs_info
*fs_info
= inode
->root
->fs_info
;
2093 struct extent_map
*em
;
2096 em
= btrfs_get_extent(inode
, NULL
,
2097 round_down(*start
, fs_info
->sectorsize
),
2098 round_up(*len
, fs_info
->sectorsize
));
2102 /* Hole or vacuum extent(only exists in no-hole mode) */
2103 if (em
->disk_bytenr
== EXTENT_MAP_HOLE
) {
2105 *len
= em
->start
+ em
->len
> *start
+ *len
?
2106 0 : *start
+ *len
- em
->start
- em
->len
;
2107 *start
= em
->start
+ em
->len
;
2109 free_extent_map(em
);
static void btrfs_punch_hole_lock_range(struct inode *inode,
					const u64 lockstart,
					const u64 lockend,
					struct extent_state **cached_state)
{
	/*
	 * For subpage case, if the range is not at page boundary, we could
	 * have pages at the leading/tailing part of the range.
	 * This could lead to dead loop since filemap_range_has_page()
	 * will always return true.
	 * So here we need to do extra page alignment for
	 * filemap_range_has_page().
	 */
	const u64 page_lockstart = round_up(lockstart, PAGE_SIZE);
	const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1;

	while (1) {
		truncate_pagecache_range(inode, lockstart, lockend);

		lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
			    cached_state);
		/*
		 * We can't have ordered extents in the range, nor dirty/writeback
		 * pages, because we have locked the inode's VFS lock in exclusive
		 * mode, we have locked the inode's i_mmap_lock in exclusive mode,
		 * we have flushed all delalloc in the range and we have waited
		 * for any ordered extents in the range to complete.
		 * We can race with anyone reading pages from this range, so after
		 * locking the range check if we have pages in the range, and if
		 * we do, unlock the range and retry.
		 */
		if (!filemap_range_has_page(inode->i_mapping, page_lockstart,
					    page_lockend))
			break;

		unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
			      cached_state);
	}

	btrfs_assert_inode_range_clean(BTRFS_I(inode), lockstart, lockend);
}
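
/*
 * Illustrative example for the page alignment done in
 * btrfs_punch_hole_lock_range() above (made-up numbers, assuming a 4K sector
 * size on a 64K page system, i.e. the subpage case): for lockstart = 4096 and
 * lockend = 131071, page_lockstart = round_up(4096, 65536) = 65536 and
 * page_lockend = round_down(131072, 65536) - 1 = 131071.  Only the fully
 * covered page at [65536, 131071] is checked by filemap_range_has_page(),
 * while the partially covered first page, which may legitimately stay in the
 * page cache, is excluded, avoiding the endless retry described above.
 */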
static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
				       struct btrfs_inode *inode,
				       struct btrfs_path *path,
				       struct btrfs_replace_extent_info *extent_info,
				       const u64 replace_len,
				       const u64 bytes_to_drop)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *root = inode->root;
	struct btrfs_file_extent_item *extent;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	int slot;
	int ret;

	if (replace_len == 0)
		return 0;

	if (extent_info->disk_offset == 0 &&
	    btrfs_fs_incompat(fs_info, NO_HOLES)) {
		btrfs_update_inode_bytes(inode, 0, bytes_to_drop);
		return 0;
	}

	key.objectid = btrfs_ino(inode);
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = extent_info->file_offset;
	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(struct btrfs_file_extent_item));
	if (ret)
		return ret;
	leaf = path->nodes[0];
	slot = path->slots[0];
	write_extent_buffer(leaf, extent_info->extent_buf,
			    btrfs_item_ptr_offset(leaf, slot),
			    sizeof(struct btrfs_file_extent_item));
	extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
	ASSERT(btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_INLINE);
	btrfs_set_file_extent_offset(leaf, extent, extent_info->data_offset);
	btrfs_set_file_extent_num_bytes(leaf, extent, replace_len);
	if (extent_info->is_new_extent)
		btrfs_set_file_extent_generation(leaf, extent, trans->transid);
	btrfs_mark_buffer_dirty(trans, leaf);
	btrfs_release_path(path);

	ret = btrfs_inode_set_file_extent_range(inode, extent_info->file_offset,
						replace_len);
	if (ret)
		return ret;

	/* If it's a hole, nothing more needs to be done. */
	if (extent_info->disk_offset == 0) {
		btrfs_update_inode_bytes(inode, 0, bytes_to_drop);
		return 0;
	}

	btrfs_update_inode_bytes(inode, replace_len, bytes_to_drop);

	if (extent_info->is_new_extent && extent_info->insertions == 0) {
		key.objectid = extent_info->disk_offset;
		key.type = BTRFS_EXTENT_ITEM_KEY;
		key.offset = extent_info->disk_len;
		ret = btrfs_alloc_reserved_file_extent(trans, root,
						       btrfs_ino(inode),
						       extent_info->file_offset,
						       extent_info->qgroup_reserved,
						       &key);
	} else {
		struct btrfs_ref ref = {
			.action = BTRFS_ADD_DELAYED_REF,
			.bytenr = extent_info->disk_offset,
			.num_bytes = extent_info->disk_len,
			.owning_root = btrfs_root_id(root),
			.ref_root = btrfs_root_id(root),
		};
		u64 ref_offset;

		ref_offset = extent_info->file_offset - extent_info->data_offset;
		btrfs_init_data_ref(&ref, btrfs_ino(inode), ref_offset, 0, false);
		ret = btrfs_inc_extent_ref(trans, &ref);
	}

	extent_info->insertions++;

	return ret;
}
/*
 * The respective range must have been previously locked, as well as the inode.
 * The end offset is inclusive (last byte of the range).
 * @extent_info is NULL for fallocate's hole punching and non-NULL when replacing
 * the file range with an extent.
 * When not punching a hole, we don't want to end up in a state where we dropped
 * extents without inserting a new one, so we must abort the transaction to avoid
 * a corruption.
 */
int btrfs_replace_file_extents(struct btrfs_inode *inode,
			       struct btrfs_path *path, const u64 start,
			       const u64 end,
			       struct btrfs_replace_extent_info *extent_info,
			       struct btrfs_trans_handle **trans_out)
{
	struct btrfs_drop_extents_args drop_args = { 0 };
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1);
	u64 ino_size = round_up(inode->vfs_inode.i_size, fs_info->sectorsize);
	struct btrfs_trans_handle *trans = NULL;
	struct btrfs_block_rsv *rsv;
	unsigned int rsv_count;
	u64 cur_offset;
	u64 len = end - start;
	int ret = 0;

	if (end <= start)
		return -EINVAL;

	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
	if (!rsv) {
		ret = -ENOMEM;
		goto out;
	}
	rsv->size = btrfs_calc_insert_metadata_size(fs_info, 1);
	rsv->failfast = true;

	/*
	 * 1 - update the inode
	 * 1 - removing the extents in the range
	 * 1 - adding the hole extent if no_holes isn't set or if we are
	 *     replacing the range with a new extent
	 */
	if (!btrfs_fs_incompat(fs_info, NO_HOLES) || extent_info)
		rsv_count = 3;
	else
		rsv_count = 2;

	trans = btrfs_start_transaction(root, rsv_count);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		trans = NULL;
		goto out_free;
	}

	ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
				      min_size, false);
	if (WARN_ON(ret))
		goto out_trans;
	trans->block_rsv = rsv;

	cur_offset = start;
	drop_args.path = path;
	drop_args.end = end + 1;
	drop_args.drop_cache = true;
	while (cur_offset < end) {
		drop_args.start = cur_offset;
		ret = btrfs_drop_extents(trans, root, inode, &drop_args);
		/* If we are punching a hole decrement the inode's byte count */
		if (!extent_info)
			btrfs_update_inode_bytes(inode, 0,
						 drop_args.bytes_found);
		if (ret != -ENOSPC) {
			/*
			 * The only time we don't want to abort is if we are
			 * attempting to clone a partial inline extent, in which
			 * case we'll get EOPNOTSUPP.  However if we aren't
			 * clone we need to abort no matter what, because if we
			 * got EOPNOTSUPP via prealloc then we messed up and
			 * need to abort.
			 */
			if (ret &&
			    (ret != -EOPNOTSUPP ||
			     (extent_info && extent_info->is_new_extent)))
				btrfs_abort_transaction(trans, ret);
			break;
		}

		trans->block_rsv = &fs_info->trans_block_rsv;

		if (!extent_info && cur_offset < drop_args.drop_end &&
		    cur_offset < ino_size) {
			ret = fill_holes(trans, inode, path, cur_offset,
					 drop_args.drop_end);
			if (ret) {
				/*
				 * If we failed then we didn't insert our hole
				 * entries for the area we dropped, so now the
				 * fs is corrupted, so we must abort the
				 * transaction.
				 */
				btrfs_abort_transaction(trans, ret);
				break;
			}
		} else if (!extent_info && cur_offset < drop_args.drop_end) {
			/*
			 * We are past the i_size here, but since we didn't
			 * insert holes we need to clear the mapped area so we
			 * know to not set disk_i_size in this area until a new
			 * file extent is inserted here.
			 */
			ret = btrfs_inode_clear_file_extent_range(inode,
					cur_offset,
					drop_args.drop_end - cur_offset);
			if (ret) {
				/*
				 * We couldn't clear our area, so we could
				 * presumably adjust up and corrupt the fs, so
				 * we need to abort.
				 */
				btrfs_abort_transaction(trans, ret);
				break;
			}
		}

		if (extent_info &&
		    drop_args.drop_end > extent_info->file_offset) {
			u64 replace_len = drop_args.drop_end -
					  extent_info->file_offset;

			ret = btrfs_insert_replace_extent(trans, inode, path,
							  extent_info, replace_len,
							  drop_args.bytes_found);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				break;
			}
			extent_info->data_len -= replace_len;
			extent_info->data_offset += replace_len;
			extent_info->file_offset += replace_len;
		}

		/*
		 * We are releasing our handle on the transaction, balance the
		 * dirty pages of the btree inode and flush delayed items, and
		 * then get a new transaction handle, which may now point to a
		 * new transaction in case someone else may have committed the
		 * transaction we used to replace/drop file extent items. So
		 * bump the inode's iversion and update mtime and ctime except
		 * if we are called from a dedupe context. This is because a
		 * power failure/crash may happen after the transaction is
		 * committed and before we finish replacing/dropping all the
		 * file extent items we need.
		 */
		inode_inc_iversion(&inode->vfs_inode);

		if (!extent_info || extent_info->update_times)
			inode_set_mtime_to_ts(&inode->vfs_inode,
					      inode_set_ctime_current(&inode->vfs_inode));

		ret = btrfs_update_inode(trans, inode);
		if (ret)
			break;

		btrfs_end_transaction(trans);
		btrfs_btree_balance_dirty(fs_info);

		trans = btrfs_start_transaction(root, rsv_count);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			trans = NULL;
			break;
		}

		ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
					      rsv, min_size, false);
		if (WARN_ON(ret))
			break;
		trans->block_rsv = rsv;

		cur_offset = drop_args.drop_end;
		len = end - cur_offset;
		if (!extent_info && len) {
			ret = find_first_non_hole(inode, &cur_offset, &len);
			if (unlikely(ret < 0))
				break;
			if (ret && !len) {
				ret = 0;
				break;
			}
		}
	}

	/*
	 * If we were cloning, force the next fsync to be a full one since we
	 * replaced (or just dropped in the case of cloning holes when
	 * NO_HOLES is enabled) file extent items and did not setup new extent
	 * maps for the replacement extents (or holes).
	 */
	if (extent_info && !extent_info->is_new_extent)
		btrfs_set_inode_full_sync(inode);

	if (ret)
		goto out_trans;

	trans->block_rsv = &fs_info->trans_block_rsv;
	/*
	 * If we are using the NO_HOLES feature we might have had already an
	 * hole that overlaps a part of the region [lockstart, lockend] and
	 * ends at (or beyond) lockend. Since we have no file extent items to
	 * represent holes, drop_end can be less than lockend and so we must
	 * make sure we have an extent map representing the existing hole (the
	 * call to __btrfs_drop_extents() might have dropped the existing extent
	 * map representing the existing hole), otherwise the fast fsync path
	 * will not record the existence of the hole region
	 * [existing_hole_start, lockend].
	 */
	if (drop_args.drop_end <= end)
		drop_args.drop_end = end + 1;
	/*
	 * Don't insert file hole extent item if it's for a range beyond eof
	 * (because it's useless) or if it represents a 0 bytes range (when
	 * cur_offset == drop_end).
	 */
	if (!extent_info && cur_offset < ino_size &&
	    cur_offset < drop_args.drop_end) {
		ret = fill_holes(trans, inode, path, cur_offset,
				 drop_args.drop_end);
		if (ret) {
			/* Same comment as above. */
			btrfs_abort_transaction(trans, ret);
			goto out_trans;
		}
	} else if (!extent_info && cur_offset < drop_args.drop_end) {
		/* See the comment in the loop above for the reasoning here. */
		ret = btrfs_inode_clear_file_extent_range(inode, cur_offset,
					drop_args.drop_end - cur_offset);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out_trans;
		}
	}
	if (extent_info) {
		ret = btrfs_insert_replace_extent(trans, inode, path,
						  extent_info, extent_info->data_len,
						  drop_args.bytes_found);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out_trans;
		}
	}

out_trans:
	if (!trans)
		goto out_free;

	trans->block_rsv = &fs_info->trans_block_rsv;
	if (ret)
		btrfs_end_transaction(trans);
	else
		*trans_out = trans;
out_free:
	btrfs_free_block_rsv(fs_info, rsv);
out:
	return ret;
}
static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
{
	struct inode *inode = file_inode(file);
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_state *cached_state = NULL;
	struct btrfs_path *path;
	struct btrfs_trans_handle *trans = NULL;
	u64 lockstart;
	u64 lockend;
	u64 tail_start;
	u64 tail_len;
	u64 orig_start = offset;
	int ret = 0;
	bool same_block;
	u64 ino_size;
	bool truncated_block = false;
	bool updated_inode = false;

	btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);

	ret = btrfs_wait_ordered_range(BTRFS_I(inode), offset, len);
	if (ret)
		goto out_only_mutex;

	ino_size = round_up(inode->i_size, fs_info->sectorsize);
	ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
	if (ret < 0)
		goto out_only_mutex;
	if (ret && !len) {
		/* Already in a large hole */
		ret = 0;
		goto out_only_mutex;
	}

	ret = file_modified(file);
	if (ret)
		goto out_only_mutex;

	lockstart = round_up(offset, fs_info->sectorsize);
	lockend = round_down(offset + len, fs_info->sectorsize) - 1;
	same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset))
		== (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1));
	/*
	 * We needn't truncate any block which is beyond the end of the file
	 * because we are sure there is no data there.
	 */
	/*
	 * Only do this if we are in the same block and we aren't doing the
	 * entire block.
	 */
	if (same_block && len < fs_info->sectorsize) {
		if (offset < ino_size) {
			truncated_block = true;
			ret = btrfs_truncate_block(BTRFS_I(inode), offset, len,
						   0);
		} else {
			ret = 0;
		}
		goto out_only_mutex;
	}

	/* zero back part of the first block */
	if (offset < ino_size) {
		truncated_block = true;
		ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
		if (ret) {
			btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
			return ret;
		}
	}

	/* Check the aligned pages after the first unaligned page,
	 * if offset != orig_start, which means the first unaligned page
	 * including several following pages are already in holes,
	 * the extra check can be skipped */
	if (offset == orig_start) {
		/* after truncate page, check hole again */
		len = offset + len - lockstart;
		offset = lockstart;
		ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
		if (ret < 0)
			goto out_only_mutex;
		if (ret && !len) {
			ret = 0;
			goto out_only_mutex;
		}
		lockstart = offset;
	}

	/* Check the tail unaligned part is in a hole */
	tail_start = lockend + 1;
	tail_len = offset + len - tail_start;
	if (tail_len > 0) {
		ret = find_first_non_hole(BTRFS_I(inode), &tail_start, &tail_len);
		if (unlikely(ret < 0))
			goto out_only_mutex;
		if (!ret) {
			/* zero the front end of the last page */
			if (tail_start + tail_len < ino_size) {
				truncated_block = true;
				ret = btrfs_truncate_block(BTRFS_I(inode),
							   tail_start + tail_len,
							   0, 1);
				if (ret)
					goto out_only_mutex;
			}
		}
	}

	if (lockend < lockstart) {
		ret = 0;
		goto out_only_mutex;
	}

	btrfs_punch_hole_lock_range(inode, lockstart, lockend, &cached_state);

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	ret = btrfs_replace_file_extents(BTRFS_I(inode), path, lockstart,
					 lockend, NULL, &trans);
	btrfs_free_path(path);
	if (ret)
		goto out;

	ASSERT(trans != NULL);
	inode_inc_iversion(inode);
	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
	ret = btrfs_update_inode(trans, BTRFS_I(inode));
	updated_inode = true;
	btrfs_end_transaction(trans);
	btrfs_btree_balance_dirty(fs_info);
out:
	unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
		      &cached_state);
out_only_mutex:
	if (!updated_inode && truncated_block && !ret) {
		/*
		 * If we only end up zeroing part of a page, we still need to
		 * update the inode item, so that all the time fields are
		 * updated as well as the necessary btrfs inode in memory fields
		 * for detecting, at fsync time, if the inode isn't yet in the
		 * log tree or it's there but not up to date.
		 */
		struct timespec64 now = inode_set_ctime_current(inode);

		inode_inc_iversion(inode);
		inode_set_mtime_to_ts(inode, now);
		trans = btrfs_start_transaction(root, 1);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
		} else {
			int ret2;

			ret = btrfs_update_inode(trans, BTRFS_I(inode));
			ret2 = btrfs_end_transaction(trans);
			if (!ret)
				ret = ret2;
		}
	}
	btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
	return ret;
}
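
/*
 * Illustrative example for the rounding done in btrfs_punch_hole() above
 * (made-up numbers, assuming a 4K sector size): punching offset = 3000 and
 * len = 10000 gives lockstart = round_up(3000, 4096) = 4096 and
 * lockend = round_down(13000, 4096) - 1 = 12287.  The unaligned head
 * [3000, 4095] and tail [12288, 12999] are zeroed in place with
 * btrfs_truncate_block(), while only the aligned middle [4096, 12287] has
 * its file extent items dropped and replaced with hole extents.
 */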
/* Helper structure to record which range is already reserved */
struct falloc_range {
	struct list_head list;
	u64 start;
	u64 len;
};

/*
 * Helper function to add falloc range
 *
 * Caller should have locked the larger range of extent containing
 * [start, len)
 */
static int add_falloc_range(struct list_head *head, u64 start, u64 len)
{
	struct falloc_range *range = NULL;

	if (!list_empty(head)) {
		/*
		 * As fallocate iterates by bytenr order, we only need to check
		 * the last range.
		 */
		range = list_last_entry(head, struct falloc_range, list);
		if (range->start + range->len == start) {
			range->len += len;
			return 0;
		}
	}

	range = kmalloc(sizeof(*range), GFP_KERNEL);
	if (!range)
		return -ENOMEM;
	range->start = start;
	range->len = len;
	list_add_tail(&range->list, head);
	return 0;
}
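
/*
 * Illustrative example for add_falloc_range() above (made-up offsets):
 * adding [0, 64K) and then [64K, 96K) extends the last list entry to
 * [0, 96K) instead of allocating a second node, because fallocate walks the
 * file in increasing offset order.  Adding [128K, 160K) afterwards starts a
 * new entry since it is not contiguous with [0, 96K).
 */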
static int btrfs_fallocate_update_isize(struct inode *inode,
					const u64 end,
					const int mode)
{
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int ret;
	int ret2;

	if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode))
		return 0;

	trans = btrfs_start_transaction(root, 1);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	inode_set_ctime_current(inode);
	i_size_write(inode, end);
	btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
	ret = btrfs_update_inode(trans, BTRFS_I(inode));
	ret2 = btrfs_end_transaction(trans);

	return ret ? ret : ret2;
}
enum {
	RANGE_BOUNDARY_WRITTEN_EXTENT,
	RANGE_BOUNDARY_PREALLOC_EXTENT,
	RANGE_BOUNDARY_HOLE,
};

static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode,
						 u64 offset)
{
	const u64 sectorsize = inode->root->fs_info->sectorsize;
	struct extent_map *em;
	int ret;

	offset = round_down(offset, sectorsize);
	em = btrfs_get_extent(inode, NULL, offset, sectorsize);
	if (IS_ERR(em))
		return PTR_ERR(em);

	if (em->disk_bytenr == EXTENT_MAP_HOLE)
		ret = RANGE_BOUNDARY_HOLE;
	else if (em->flags & EXTENT_FLAG_PREALLOC)
		ret = RANGE_BOUNDARY_PREALLOC_EXTENT;
	else
		ret = RANGE_BOUNDARY_WRITTEN_EXTENT;

	free_extent_map(em);
	return ret;
}
static int btrfs_zero_range(struct inode *inode,
			    loff_t offset,
			    loff_t len,
			    const int mode)
{
	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
	struct extent_map *em;
	struct extent_changeset *data_reserved = NULL;
	int ret;
	u64 alloc_hint = 0;
	const u64 sectorsize = fs_info->sectorsize;
	u64 alloc_start = round_down(offset, sectorsize);
	u64 alloc_end = round_up(offset + len, sectorsize);
	u64 bytes_to_reserve = 0;
	bool space_reserved = false;

	em = btrfs_get_extent(BTRFS_I(inode), NULL, alloc_start,
			      alloc_end - alloc_start);
	if (IS_ERR(em)) {
		ret = PTR_ERR(em);
		goto out;
	}

	/*
	 * Avoid hole punching and extent allocation for some cases. More cases
	 * could be considered, but these are unlikely common and we keep things
	 * as simple as possible for now. Also, intentionally, if the target
	 * range contains one or more prealloc extents together with regular
	 * extents and holes, we drop all the existing extents and allocate a
	 * new prealloc extent, so that we get a larger contiguous disk extent.
	 */
	if (em->start <= alloc_start && (em->flags & EXTENT_FLAG_PREALLOC)) {
		const u64 em_end = em->start + em->len;

		if (em_end >= offset + len) {
			/*
			 * The whole range is already a prealloc extent,
			 * do nothing except updating the inode's i_size if
			 * needed.
			 */
			free_extent_map(em);
			ret = btrfs_fallocate_update_isize(inode, offset + len,
							   mode);
			goto out;
		}
		/*
		 * Part of the range is already a prealloc extent, so operate
		 * only on the remaining part of the range.
		 */
		alloc_start = em_end;
		ASSERT(IS_ALIGNED(alloc_start, sectorsize));
		len = offset + len - alloc_start;
		offset = alloc_start;
		alloc_hint = extent_map_block_start(em) + em->len;
	}
	free_extent_map(em);

	if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
	    BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
		em = btrfs_get_extent(BTRFS_I(inode), NULL, alloc_start, sectorsize);
		if (IS_ERR(em)) {
			ret = PTR_ERR(em);
			goto out;
		}

		if (em->flags & EXTENT_FLAG_PREALLOC) {
			free_extent_map(em);
			ret = btrfs_fallocate_update_isize(inode, offset + len,
							   mode);
			goto out;
		}
		if (len < sectorsize && em->disk_bytenr != EXTENT_MAP_HOLE) {
			free_extent_map(em);
			ret = btrfs_truncate_block(BTRFS_I(inode), offset, len,
						   0);
			if (!ret)
				ret = btrfs_fallocate_update_isize(inode,
								   offset + len,
								   mode);
			return ret;
		}
		free_extent_map(em);
		alloc_start = round_down(offset, sectorsize);
		alloc_end = alloc_start + sectorsize;
		goto reserve_space;
	}

	alloc_start = round_up(offset, sectorsize);
	alloc_end = round_down(offset + len, sectorsize);

	/*
	 * For unaligned ranges, check the pages at the boundaries, they might
	 * map to an extent, in which case we need to partially zero them, or
	 * they might map to a hole, in which case we need our allocation range
	 * to cover them.
	 */
	if (!IS_ALIGNED(offset, sectorsize)) {
		ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
							    offset);
		if (ret < 0)
			goto out;
		if (ret == RANGE_BOUNDARY_HOLE) {
			alloc_start = round_down(offset, sectorsize);
			ret = 0;
		} else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
			ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
			if (ret)
				goto out;
		} else {
			ret = 0;
		}
	}

	if (!IS_ALIGNED(offset + len, sectorsize)) {
		ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
							    offset + len);
		if (ret < 0)
			goto out;
		if (ret == RANGE_BOUNDARY_HOLE) {
			alloc_end = round_up(offset + len, sectorsize);
			ret = 0;
		} else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
			ret = btrfs_truncate_block(BTRFS_I(inode), offset + len,
						   0, 1);
			if (ret)
				goto out;
		} else {
			ret = 0;
		}
	}

reserve_space:
	if (alloc_start < alloc_end) {
		struct extent_state *cached_state = NULL;
		const u64 lockstart = alloc_start;
		const u64 lockend = alloc_end - 1;

		bytes_to_reserve = alloc_end - alloc_start;
		ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
						      bytes_to_reserve);
		if (ret < 0)
			goto out;
		space_reserved = true;
		btrfs_punch_hole_lock_range(inode, lockstart, lockend,
					    &cached_state);
		ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved,
						alloc_start, bytes_to_reserve);
		if (ret) {
			unlock_extent(&BTRFS_I(inode)->io_tree, lockstart,
				      lockend, &cached_state);
			goto out;
		}
		ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
						alloc_end - alloc_start,
						fs_info->sectorsize,
						offset + len, &alloc_hint);
		unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
			      &cached_state);
		/* btrfs_prealloc_file_range releases reserved space on error */
		if (ret) {
			space_reserved = false;
			goto out;
		}
	}
	ret = btrfs_fallocate_update_isize(inode, offset + len, mode);
out:
	if (ret && space_reserved)
		btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
					       alloc_start, bytes_to_reserve);
	extent_changeset_free(data_reserved);

	return ret;
}
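
/*
 * Illustrative example for the boundary handling in btrfs_zero_range() above
 * (made-up numbers, assuming a 4K sector size): zeroing offset = 6000 and
 * len = 10000 gives alloc_start = round_up(6000, 4096) = 8192 and
 * alloc_end = round_down(16000, 4096) = 12288.  The unaligned head
 * [6000, 8191] and tail [12288, 15999] are then classified with
 * btrfs_zero_range_check_range_boundary(): a hole boundary grows the
 * allocation range to cover it, a written extent boundary is zeroed in place
 * with btrfs_truncate_block(), and a prealloc boundary needs no extra work.
 */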
static long btrfs_fallocate(struct file *file, int mode,
			    loff_t offset, loff_t len)
{
	struct inode *inode = file_inode(file);
	struct extent_state *cached_state = NULL;
	struct extent_changeset *data_reserved = NULL;
	struct falloc_range *range;
	struct falloc_range *tmp;
	LIST_HEAD(reserve_list);
	u64 cur_offset;
	u64 last_byte;
	u64 alloc_start;
	u64 alloc_end;
	u64 alloc_hint = 0;
	u64 locked_end;
	u64 actual_end = 0;
	u64 data_space_needed = 0;
	u64 data_space_reserved = 0;
	u64 qgroup_reserved = 0;
	struct extent_map *em;
	int blocksize = BTRFS_I(inode)->root->fs_info->sectorsize;
	int ret;

	/* Do not allow fallocate in ZONED mode */
	if (btrfs_is_zoned(inode_to_fs_info(inode)))
		return -EOPNOTSUPP;

	alloc_start = round_down(offset, blocksize);
	alloc_end = round_up(offset + len, blocksize);
	cur_offset = alloc_start;

	/* Make sure we aren't being given some crap mode */
	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
		     FALLOC_FL_ZERO_RANGE))
		return -EOPNOTSUPP;

	if (mode & FALLOC_FL_PUNCH_HOLE)
		return btrfs_punch_hole(file, offset, len);

	btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);

	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) {
		ret = inode_newsize_ok(inode, offset + len);
		if (ret)
			goto out;
	}

	ret = file_modified(file);
	if (ret)
		goto out;

	/*
	 * TODO: Move these two operations after we have checked
	 * accurate reserved space, or fallocate can still fail but
	 * with page truncated or size expanded.
	 *
	 * But that's a minor problem and won't do much harm BTW.
	 */
	if (alloc_start > inode->i_size) {
		ret = btrfs_cont_expand(BTRFS_I(inode), i_size_read(inode),
					alloc_start);
		if (ret)
			goto out;
	} else if (offset + len > inode->i_size) {
		/*
		 * If we are fallocating from the end of the file onward we
		 * need to zero out the end of the block if i_size lands in the
		 * middle of a block.
		 */
		ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0);
		if (ret)
			goto out;
	}

	/*
	 * We have locked the inode at the VFS level (in exclusive mode) and we
	 * have locked the i_mmap_lock lock (in exclusive mode). Now before
	 * locking the file range, flush all delalloc in the range and wait for
	 * all ordered extents in the range to complete. After this we can lock
	 * the file range and, due to the previous locking we did, we know there
	 * can't be more delalloc or ordered extents in the range.
	 */
	ret = btrfs_wait_ordered_range(BTRFS_I(inode), alloc_start,
				       alloc_end - alloc_start);
	if (ret)
		goto out;

	if (mode & FALLOC_FL_ZERO_RANGE) {
		ret = btrfs_zero_range(inode, offset, len, mode);
		btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
		return ret;
	}

	locked_end = alloc_end - 1;
	lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
		    &cached_state);

	btrfs_assert_inode_range_clean(BTRFS_I(inode), alloc_start, locked_end);

	/* First, check if we exceed the qgroup limit */
	while (cur_offset < alloc_end) {
		em = btrfs_get_extent(BTRFS_I(inode), NULL, cur_offset,
				      alloc_end - cur_offset);
		if (IS_ERR(em)) {
			ret = PTR_ERR(em);
			break;
		}
		last_byte = min(extent_map_end(em), alloc_end);
		actual_end = min_t(u64, extent_map_end(em), offset + len);
		last_byte = ALIGN(last_byte, blocksize);
		if (em->disk_bytenr == EXTENT_MAP_HOLE ||
		    (cur_offset >= inode->i_size &&
		     !(em->flags & EXTENT_FLAG_PREALLOC))) {
			const u64 range_len = last_byte - cur_offset;

			ret = add_falloc_range(&reserve_list, cur_offset, range_len);
			if (ret < 0) {
				free_extent_map(em);
				break;
			}
			ret = btrfs_qgroup_reserve_data(BTRFS_I(inode),
					&data_reserved, cur_offset, range_len);
			if (ret < 0) {
				free_extent_map(em);
				break;
			}
			qgroup_reserved += range_len;
			data_space_needed += range_len;
		}
		free_extent_map(em);
		cur_offset = last_byte;
	}

	if (!ret && data_space_needed > 0) {
		/*
		 * We are safe to reserve space here as we can't have delalloc
		 * in the range, see above.
		 */
		ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
						      data_space_needed);
		if (!ret)
			data_space_reserved = data_space_needed;
	}

	/*
	 * If ret is still 0, means we're OK to fallocate.
	 * Or just cleanup the list and exit.
	 */
	list_for_each_entry_safe(range, tmp, &reserve_list, list) {
		if (!ret) {
			ret = btrfs_prealloc_file_range(inode, mode,
					range->start,
					range->len, blocksize,
					offset + len, &alloc_hint);
			/*
			 * btrfs_prealloc_file_range() releases space even
			 * if it returns an error.
			 */
			data_space_reserved -= range->len;
			qgroup_reserved -= range->len;
		} else if (data_space_reserved > 0) {
			btrfs_free_reserved_data_space(BTRFS_I(inode),
					data_reserved, range->start,
					range->len);
			data_space_reserved -= range->len;
			qgroup_reserved -= range->len;
		} else if (qgroup_reserved > 0) {
			btrfs_qgroup_free_data(BTRFS_I(inode), data_reserved,
					       range->start, range->len, NULL);
			qgroup_reserved -= range->len;
		}
		list_del(&range->list);
		kfree(range);
	}
	if (ret < 0)
		goto out_unlock;

	/*
	 * We didn't need to allocate any more space, but we still extended the
	 * size of the file so we need to update i_size and the inode item.
	 */
	ret = btrfs_fallocate_update_isize(inode, actual_end, mode);
out_unlock:
	unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
		      &cached_state);
out:
	btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
	extent_changeset_free(data_reserved);
	return ret;
}
/*
 * Helper for btrfs_find_delalloc_in_range(). Find a subrange in a given range
 * that has unflushed and/or flushing delalloc. There might be other adjacent
 * subranges after the one it found, so btrfs_find_delalloc_in_range() keeps
 * looping while it gets adjacent subranges, and merging them together.
 */
static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end,
				   struct extent_state **cached_state,
				   bool *search_io_tree,
				   u64 *delalloc_start_ret, u64 *delalloc_end_ret)
{
	u64 len = end + 1 - start;
	u64 delalloc_len = 0;
	struct btrfs_ordered_extent *oe;
	u64 oe_start;
	u64 oe_end;

	/*
	 * Search the io tree first for EXTENT_DELALLOC. If we find any, it
	 * means we have delalloc (dirty pages) for which writeback has not
	 * started yet.
	 */
	if (*search_io_tree) {
		spin_lock(&inode->lock);
		if (inode->delalloc_bytes > 0) {
			spin_unlock(&inode->lock);
			*delalloc_start_ret = start;
			delalloc_len = count_range_bits(&inode->io_tree,
							delalloc_start_ret, end,
							len, EXTENT_DELALLOC, 1,
							cached_state);
		} else {
			spin_unlock(&inode->lock);
		}
	}

	if (delalloc_len > 0) {
		/*
		 * If delalloc was found then *delalloc_start_ret has a sector size
		 * aligned value (rounded down).
		 */
		*delalloc_end_ret = *delalloc_start_ret + delalloc_len - 1;

		if (*delalloc_start_ret == start) {
			/* Delalloc for the whole range, nothing more to do. */
			if (*delalloc_end_ret == end)
				return true;
			/* Else trim our search range for ordered extents. */
			start = *delalloc_end_ret + 1;
			len = end + 1 - start;
		}
	} else {
		/* No delalloc, future calls don't need to search again. */
		*search_io_tree = false;
	}

	/*
	 * Now also check if there's any ordered extent in the range.
	 * We do this because:
	 *
	 * 1) When delalloc is flushed, the file range is locked, we clear the
	 *    EXTENT_DELALLOC bit from the io tree and create an extent map and
	 *    an ordered extent for the write. So we might just have been called
	 *    after delalloc is flushed and before the ordered extent completes
	 *    and inserts the new file extent item in the subvolume's btree;
	 *
	 * 2) We may have an ordered extent created by flushing delalloc for a
	 *    subrange that starts before the subrange we found marked with
	 *    EXTENT_DELALLOC in the io tree.
	 *
	 * We could also use the extent map tree to find such delalloc that is
	 * being flushed, but using the ordered extents tree is more efficient
	 * because it's usually much smaller as ordered extents are removed from
	 * the tree once they complete. With the extent maps, we may have them
	 * in the extent map tree for a very long time, and they were either
	 * created by previous writes or loaded by read operations.
	 */
	oe = btrfs_lookup_first_ordered_range(inode, start, len);
	if (!oe)
		return (delalloc_len > 0);

	/* The ordered extent may span beyond our search range. */
	oe_start = max(oe->file_offset, start);
	oe_end = min(oe->file_offset + oe->num_bytes - 1, end);

	btrfs_put_ordered_extent(oe);

	/* Don't have unflushed delalloc, return the ordered extent range. */
	if (delalloc_len == 0) {
		*delalloc_start_ret = oe_start;
		*delalloc_end_ret = oe_end;
		return true;
	}

	/*
	 * We have both unflushed delalloc (io_tree) and an ordered extent.
	 * If the ranges are adjacent return a combined range, otherwise
	 * return the leftmost range.
	 */
	if (oe_start < *delalloc_start_ret) {
		if (oe_end < *delalloc_start_ret)
			*delalloc_end_ret = oe_end;
		*delalloc_start_ret = oe_start;
	} else if (*delalloc_end_ret + 1 == oe_start) {
		/* The two ranges are adjacent, return a combined range. */
		*delalloc_end_ret = oe_end;
	}

	return true;
}
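
/*
 * Illustrative example for find_delalloc_subrange() above (made-up offsets):
 * if the io tree reports delalloc for [64K, 128K - 1] and an ordered extent
 * covers [128K, 192K - 1], the two ranges are adjacent and a combined range
 * [64K, 192K - 1] is returned.  If instead the ordered extent covered
 * [0, 32K - 1], only the leftmost range [0, 32K - 1] would be returned and
 * the delalloc subrange would be picked up by a later call.
 */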
/*
 * Check if there's delalloc in a given range.
 *
 * @inode:               The inode.
 * @start:               The start offset of the range. It does not need to be
 *                       sector size aligned.
 * @end:                 The end offset (inclusive value) of the search range.
 *                       It does not need to be sector size aligned.
 * @cached_state:        Extent state record used for speeding up delalloc
 *                       searches in the inode's io_tree. Can be NULL.
 * @delalloc_start_ret:  Output argument, set to the start offset of the
 *                       subrange found with delalloc (may not be sector size
 *                       aligned).
 * @delalloc_end_ret:    Output argument, set to the end offset (inclusive value)
 *                       of the subrange found with delalloc.
 *
 * Returns true if a subrange with delalloc is found within the given range, and
 * if so it sets @delalloc_start_ret and @delalloc_end_ret with the start and
 * end offsets of the subrange.
 */
bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end,
				  struct extent_state **cached_state,
				  u64 *delalloc_start_ret, u64 *delalloc_end_ret)
{
	u64 cur_offset = round_down(start, inode->root->fs_info->sectorsize);
	u64 prev_delalloc_end = 0;
	bool search_io_tree = true;
	bool ret = false;

	while (cur_offset <= end) {
		u64 delalloc_start;
		u64 delalloc_end;
		bool delalloc;

		delalloc = find_delalloc_subrange(inode, cur_offset, end,
						  cached_state, &search_io_tree,
						  &delalloc_start,
						  &delalloc_end);
		if (!delalloc)
			break;

		if (prev_delalloc_end == 0) {
			/* First subrange found. */
			*delalloc_start_ret = max(delalloc_start, start);
			*delalloc_end_ret = delalloc_end;
			ret = true;
		} else if (delalloc_start == prev_delalloc_end + 1) {
			/* Subrange adjacent to the previous one, merge them. */
			*delalloc_end_ret = delalloc_end;
		} else {
			/* Subrange not adjacent to the previous one, exit. */
			break;
		}

		prev_delalloc_end = delalloc_end;
		cur_offset = delalloc_end + 1;
		cond_resched();
	}

	return ret;
}
/*
 * Check if there's a hole or delalloc range in a range representing a hole (or
 * prealloc extent) found in the inode's subvolume btree.
 *
 * @inode:      The inode.
 * @whence:     Seek mode (SEEK_DATA or SEEK_HOLE).
 * @start:      Start offset of the hole region. It does not need to be sector
 *              size aligned.
 * @end:        End offset (inclusive value) of the hole region. It does not
 *              need to be sector size aligned.
 * @start_ret:  Return parameter, used to set the start of the subrange in the
 *              hole that matches the search criteria (seek mode), if such
 *              subrange is found (return value of the function is true).
 *              The value returned here may not be sector size aligned.
 *
 * Returns true if a subrange matching the given seek mode is found, and if one
 * is found, it updates @start_ret with the start of the subrange.
 */
static bool find_desired_extent_in_hole(struct btrfs_inode *inode, int whence,
					struct extent_state **cached_state,
					u64 start, u64 end, u64 *start_ret)
{
	u64 delalloc_start;
	u64 delalloc_end;
	bool delalloc;

	delalloc = btrfs_find_delalloc_in_range(inode, start, end, cached_state,
						&delalloc_start, &delalloc_end);
	if (delalloc && whence == SEEK_DATA) {
		*start_ret = delalloc_start;
		return true;
	}

	if (delalloc && whence == SEEK_HOLE) {
		/*
		 * We found delalloc but it starts after our start offset. So we
		 * have a hole between our start offset and the delalloc start.
		 */
		if (start < delalloc_start) {
			*start_ret = start;
			return true;
		}
		/*
		 * Delalloc range starts at our start offset.
		 * If the delalloc range's length is smaller than our range,
		 * then it means we have a hole that starts where the delalloc
		 * subrange ends.
		 */
		if (delalloc_end < end) {
			*start_ret = delalloc_end + 1;
			return true;
		}

		/* There's delalloc for the whole range. */
		return false;
	}

	if (!delalloc && whence == SEEK_HOLE) {
		*start_ret = start;
		return true;
	}

	/*
	 * No delalloc in the range and we are seeking for data. The caller has
	 * to iterate to the next extent item in the subvolume btree.
	 */
	return false;
}
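
/*
 * Illustrative example for find_desired_extent_in_hole() above (made-up
 * offsets): for a hole region [0, 1M - 1] with delalloc at [0, 256K - 1],
 * SEEK_DATA reports 0 (the delalloc start) while SEEK_HOLE reports 256K,
 * the first offset past the delalloc subrange.  With no delalloc at all,
 * SEEK_HOLE reports the region start and SEEK_DATA makes the caller move
 * on to the next file extent item.
 */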
static loff_t find_desired_extent(struct file *file, loff_t offset, int whence)
{
	struct btrfs_inode *inode = BTRFS_I(file->f_mapping->host);
	struct btrfs_file_private *private;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct extent_state *cached_state = NULL;
	struct extent_state **delalloc_cached_state;
	const loff_t i_size = i_size_read(&inode->vfs_inode);
	const u64 ino = btrfs_ino(inode);
	struct btrfs_root *root = inode->root;
	struct btrfs_path *path;
	struct btrfs_key key;
	u64 last_extent_end;
	u64 lockstart;
	u64 lockend;
	u64 start;
	int ret;
	bool found = false;

	if (i_size == 0 || offset >= i_size)
		return -ENXIO;

	/*
	 * Quick path. If the inode has no prealloc extents and its number of
	 * bytes used matches its i_size, then it can not have holes.
	 */
	if (whence == SEEK_HOLE &&
	    !(inode->flags & BTRFS_INODE_PREALLOC) &&
	    inode_get_bytes(&inode->vfs_inode) == i_size)
		return i_size;

	spin_lock(&inode->lock);
	private = file->private_data;
	spin_unlock(&inode->lock);

	if (private && private->owner_task != current) {
		/*
		 * Not allocated by us, don't use it as its cached state is used
		 * by the task that allocated it and we don't want neither to
		 * mess with it nor get incorrect results because it reflects an
		 * invalid state for the current task.
		 */
		private = NULL;
	} else if (!private) {
		private = kzalloc(sizeof(*private), GFP_KERNEL);
		/*
		 * No worries if memory allocation failed.
		 * The private structure is used only for speeding up multiple
		 * lseek SEEK_HOLE/DATA calls to a file when there's delalloc,
		 * so everything will still be correct.
		 */
		if (private) {
			bool free = false;

			private->owner_task = current;

			spin_lock(&inode->lock);
			if (file->private_data)
				free = true;
			else
				file->private_data = private;
			spin_unlock(&inode->lock);

			if (free) {
				kfree(private);
				private = NULL;
			}
		}
	}

	if (private)
		delalloc_cached_state = &private->llseek_cached_state;
	else
		delalloc_cached_state = NULL;

	/*
	 * offset can be negative, in this case we start finding DATA/HOLE from
	 * the very start of the file.
	 */
	start = max_t(loff_t, 0, offset);

	lockstart = round_down(start, fs_info->sectorsize);
	lockend = round_up(i_size, fs_info->sectorsize);
	if (lockend <= lockstart)
		lockend = lockstart + fs_info->sectorsize;
	lockend--;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = READA_FORWARD;

	key.objectid = ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = start;

	last_extent_end = lockstart;

	lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0) {
		goto out;
	} else if (ret > 0 && path->slots[0] > 0) {
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
		if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
			path->slots[0]--;
	}

	while (start < i_size) {
		struct extent_buffer *leaf = path->nodes[0];
		struct btrfs_file_extent_item *extent;
		u64 extent_end;
		u8 type;

		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto out;
			else if (ret > 0)
				break;

			leaf = path->nodes[0];
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
			break;

		extent_end = btrfs_file_extent_end(path);

		/*
		 * In the first iteration we may have a slot that points to an
		 * extent that ends before our start offset, so skip it.
		 */
		if (extent_end <= start) {
			path->slots[0]++;
			continue;
		}

		/* We have an implicit hole, NO_HOLES feature is likely set. */
		if (last_extent_end < key.offset) {
			u64 search_start = last_extent_end;
			u64 found_start;

			/*
			 * First iteration, @start matches @offset and it's
			 * within the hole.
			 */
			if (start == offset)
				search_start = offset;

			found = find_desired_extent_in_hole(inode, whence,
							    delalloc_cached_state,
							    search_start,
							    key.offset - 1,
							    &found_start);
			if (found) {
				start = found_start;
				break;
			}
			/*
			 * Didn't find data or a hole (due to delalloc) in the
			 * implicit hole range, so need to analyze the extent.
			 */
		}

		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_file_extent_item);
		type = btrfs_file_extent_type(leaf, extent);

		/*
		 * Can't access the extent's disk_bytenr field if this is an
		 * inline extent, since at that offset, it's where the extent
		 * data starts.
		 */
		if (type == BTRFS_FILE_EXTENT_PREALLOC ||
		    (type == BTRFS_FILE_EXTENT_REG &&
		     btrfs_file_extent_disk_bytenr(leaf, extent) == 0)) {
			/*
			 * Explicit hole or prealloc extent, search for delalloc.
			 * A prealloc extent is treated like a hole.
			 */
			u64 search_start = key.offset;
			u64 found_start;

			/*
			 * First iteration, @start matches @offset and it's
			 * within the hole.
			 */
			if (start == offset)
				search_start = offset;

			found = find_desired_extent_in_hole(inode, whence,
							    delalloc_cached_state,
							    search_start,
							    extent_end - 1,
							    &found_start);
			if (found) {
				start = found_start;
				break;
			}
			/*
			 * Didn't find data or a hole (due to delalloc) in the
			 * implicit hole range, so need to analyze the next
			 * file extent item.
			 */
		} else {
			/*
			 * Found a regular or inline extent.
			 * If we are seeking for data, adjust the start offset
			 * and stop, we're done.
			 */
			if (whence == SEEK_DATA) {
				start = max_t(u64, key.offset, offset);
				found = true;
				break;
			}
			/*
			 * Else, we are seeking for a hole, check the next file
			 * extent item.
			 */
		}

		start = extent_end;
		last_extent_end = extent_end;
		path->slots[0]++;
		if (fatal_signal_pending(current)) {
			ret = -EINTR;
			goto out;
		}
		cond_resched();
	}

	/* We have an implicit hole from the last extent found up to i_size. */
	if (!found && start < i_size) {
		found = find_desired_extent_in_hole(inode, whence,
						    delalloc_cached_state, start,
						    i_size - 1, &start);
		if (!found)
			start = i_size;
	}

out:
	unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
	btrfs_free_path(path);

	if (ret < 0)
		return ret;

	if (whence == SEEK_DATA && start >= i_size)
		return -ENXIO;

	return min_t(loff_t, start, i_size);
}
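
/*
 * Illustrative example for the lock range used by find_desired_extent()
 * above (made-up numbers, assuming a 4K sector size): an lseek at offset
 * 5000 on a 10000 byte file locks [round_down(5000, 4096),
 * round_up(10000, 4096) - 1] = [4096, 12287] in the io tree, keeping that
 * tail of the file stable while the subvolume tree and the delalloc/ordered
 * extent state are inspected.
 */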
static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file->f_mapping->host;

	switch (whence) {
	default:
		return generic_file_llseek(file, offset, whence);
	case SEEK_DATA:
	case SEEK_HOLE:
		btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
		offset = find_desired_extent(file, offset, whence);
		btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
		break;
	}

	if (offset < 0)
		return offset;

	return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
}
static int btrfs_file_open(struct inode *inode, struct file *filp)
{
	int ret;

	filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;

	ret = fsverity_file_open(inode, filp);
	if (ret)
		return ret;
	return generic_file_open(inode, filp);
}
static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	ssize_t ret = 0;

	if (iocb->ki_flags & IOCB_DIRECT) {
		ret = btrfs_direct_read(iocb, to);
		if (ret < 0 || !iov_iter_count(to) ||
		    iocb->ki_pos >= i_size_read(file_inode(iocb->ki_filp)))
			return ret;
	}

	return filemap_read(iocb, to, ret);
}
const struct file_operations btrfs_file_operations = {
	.llseek		= btrfs_file_llseek,
	.read_iter	= btrfs_file_read_iter,
	.splice_read	= filemap_splice_read,
	.write_iter	= btrfs_file_write_iter,
	.splice_write	= iter_file_splice_write,
	.mmap		= btrfs_file_mmap,
	.open		= btrfs_file_open,
	.release	= btrfs_release_file,
	.get_unmapped_area = thp_get_unmapped_area,
	.fsync		= btrfs_sync_file,
	.fallocate	= btrfs_fallocate,
	.unlocked_ioctl	= btrfs_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= btrfs_compat_ioctl,
#endif
	.remap_file_range = btrfs_remap_file_range,
	.uring_cmd	= btrfs_uring_cmd,
	.fop_flags	= FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC,
};
int btrfs_fdatawrite_range(struct btrfs_inode *inode, loff_t start, loff_t end)
{
	struct address_space *mapping = inode->vfs_inode.i_mapping;
	int ret;

	/*
	 * So with compression we will find and lock a dirty page and clear the
	 * first one as dirty, setup an async extent, and immediately return
	 * with the entire range locked but with nobody actually marked with
	 * writeback. So we can't just filemap_write_and_wait_range() and
	 * expect it to work since it will just kick off a thread to do the
	 * actual work. So we need to call filemap_fdatawrite_range _again_
	 * since it will wait on the page lock, which won't be unlocked until
	 * after the pages have been marked as writeback and so we're good to go
	 * from there. We have to do this otherwise we'll miss the ordered
	 * extents and that results in badness. Please Josef, do not think you
	 * know better and pull this out at some point in the future, it is
	 * right and you are wrong.
	 */
	ret = filemap_fdatawrite_range(mapping, start, end);
	if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags))
		ret = filemap_fdatawrite_range(mapping, start, end);

	return ret;
}