// SPDX-License-Identifier: GPL-2.0
#ifndef NO_BCACHEFS_FS

#include "alloc_foreground.h"
#include "btree_update.h"
#include "extent_update.h"
#include "fs-io-buffered.h"
#include "fs-io-pagecache.h"

#include <linux/aio.h>
#include <linux/backing-dev.h>
#include <linux/falloc.h>
#include <linux/migrate.h>
#include <linux/mmu_context.h>
#include <linux/pagevec.h>
#include <linux/rmap.h>
#include <linux/sched/signal.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/uio.h>

#include <trace/events/writeback.h>

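/*
 * nocow write flushing:
 *
 * Nocow writes don't go through the journal's data flush path, so each device
 * that received a nocow write for this inode is recorded in
 * ei_devs_need_flush and gets an explicit cache flush before fsync completes.
 */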
struct nocow_flush {
        struct closure  *cl;
        struct bch_dev  *ca;
        struct bio      bio;
};

static void nocow_flush_endio(struct bio *_bio)
{
        struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio);

        closure_put(bio->cl);
        percpu_ref_put(&bio->ca->io_ref);
        bio_put(&bio->bio);
}

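/*
 * Submit a REQ_PREFLUSH bio to every device recorded in ei_devs_need_flush,
 * clearing the mask; the bios are chained onto @cl so the caller can wait for
 * completion with closure_sync().
 */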
void bch2_inode_flush_nocow_writes_async(struct bch_fs *c,
                                         struct bch_inode_info *inode,
                                         struct closure *cl)
{
        struct nocow_flush *bio;
        struct bch_dev *ca;
        struct bch_devs_mask devs;
        unsigned dev;

        dev = find_first_bit(inode->ei_devs_need_flush.d, BCH_SB_MEMBERS_MAX);
        if (dev == BCH_SB_MEMBERS_MAX)
                return;

        devs = inode->ei_devs_need_flush;
        memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush));

        for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) {
                rcu_read_lock();
                ca = rcu_dereference(c->devs[dev]);
                if (ca && !percpu_ref_tryget(&ca->io_ref))
                        ca = NULL;
                rcu_read_unlock();

                if (!ca)
                        continue;

                bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, 0,
                                                    REQ_OP_WRITE|REQ_PREFLUSH,
                                                    GFP_KERNEL,
                                                    &c->nocow_flush_bioset),
                                   struct nocow_flush, bio);
                bio->cl                 = cl;
                bio->ca                 = ca;
                bio->bio.bi_end_io      = nocow_flush_endio;
                closure_bio_submit(&bio->bio, cl);
        }
}

static int bch2_inode_flush_nocow_writes(struct bch_fs *c,
                                         struct bch_inode_info *inode)
{
        struct closure cl;

        closure_init_stack(&cl);
        bch2_inode_flush_nocow_writes_async(c, inode, &cl);
        closure_sync(&cl);

        return 0;
}

/* i_size updates: */

struct inode_new_size {
        loff_t          new_size;
        u64             now;
        unsigned        fields;
};

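/* bch2_write_inode() callback: apply the new size (and requested cmtimes) to the btree inode */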
static int inode_set_size(struct btree_trans *trans,
                          struct bch_inode_info *inode,
                          struct bch_inode_unpacked *bi,
                          void *p)
{
        struct inode_new_size *s = p;

        bi->bi_size = s->new_size;
        if (s->fields & ATTR_ATIME)
                bi->bi_atime = s->now;
        if (s->fields & ATTR_MTIME)
                bi->bi_mtime = s->now;
        if (s->fields & ATTR_CTIME)
                bi->bi_ctime = s->now;

        return 0;
}

int __must_check bch2_write_inode_size(struct bch_fs *c,
                                       struct bch_inode_info *inode,
                                       loff_t new_size, unsigned fields)
{
        struct inode_new_size s = {
                .new_size       = new_size,
                .now            = bch2_current_time(c),
                .fields         = fields,
        };

        return bch2_write_inode(c, inode, inode_set_size, &s, fields);
}

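/*
 * Adjust the in-memory i_blocks by @sectors; when a quota reservation is
 * supplied the sectors are taken out of it, otherwise they're accounted
 * directly with bch2_quota_acct().
 */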
void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
                           struct quota_res *quota_res, s64 sectors)
{
        bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c,
                                "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)",
                                inode->v.i_ino, (u64) inode->v.i_blocks, sectors,
                                inode->ei_inode.bi_sectors);
        inode->v.i_blocks += sectors;

#ifdef CONFIG_BCACHEFS_QUOTA
        if (quota_res &&
            !test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags) &&
            sectors > 0) {
                BUG_ON(sectors > quota_res->sectors);
                BUG_ON(sectors > inode->ei_quota_reserved);

                quota_res->sectors -= sectors;
                inode->ei_quota_reserved -= sectors;
        } else {
                bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN);
        }
#endif
}

/*
 * inode->ei_inode.bi_journal_seq won't be up to date since it's set in an
 * insert trigger: look up the btree inode instead
 */
static int bch2_flush_inode(struct bch_fs *c,
                            struct bch_inode_info *inode)
{
        if (c->opts.journal_flush_disabled)
                return 0;

        if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync))
                return -EROFS;

        struct bch_inode_unpacked u;
        int ret = bch2_inode_find_by_inum(c, inode_inum(inode), &u) ?:
                  bch2_journal_flush_seq(&c->journal, u.bi_journal_seq, TASK_INTERRUPTIBLE) ?:
                  bch2_inode_flush_nocow_writes(c, inode);
        bch2_write_ref_put(c, BCH_WRITE_REF_fsync);
        return ret;
}

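/*
 * fsync: write out dirty pagecache and inode metadata, then flush the journal
 * up to this inode's last update (and any outstanding nocow writes).
 */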
int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
        struct bch_inode_info *inode = file_bch_inode(file);
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        int ret, err;

        trace_bch2_fsync(file, datasync);

        ret = file_write_and_wait_range(file, start, end);
        if (ret)
                goto out;
        ret = sync_inode_metadata(&inode->v, 1);
        if (ret)
                goto out;
        ret = bch2_flush_inode(c, inode);
out:
        ret = bch2_err_class(ret);

        err = file_check_and_advance_wb_err(file);
        if (!ret)
                ret = err;

        return ret;
}

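/* Does the given range contain any extents with data, i.e. neither holes nor unwritten extents? */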
static inline int range_has_data(struct bch_fs *c, u32 subvol,
                                 struct bpos start,
                                 struct bpos end)
{
        return bch2_trans_run(c,
                for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents, start, end,
                                                     subvol, 0, k, ({
                        bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k);
                })));
}

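/*
 * Zero out the part of a folio that's being truncated away: affected sectors
 * are marked unallocated and i_blocks is adjusted for any that were dirty.
 * Returns > 0 if the folio straddling the new EOF is dirty (writeback will do
 * the i_size update), 0 if the caller is responsible for it, or a negative
 * error code.
 */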
static int __bch2_truncate_folio(struct bch_inode_info *inode,
                                 pgoff_t index, loff_t start, loff_t end)
{
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct address_space *mapping = inode->v.i_mapping;
        struct bch_folio *s;
        unsigned start_offset;
        unsigned end_offset;
        unsigned i;
        struct folio *folio;
        s64 i_sectors_delta = 0;
        int ret = 0;
        u64 end_pos;

        folio = filemap_lock_folio(mapping, index);
        if (IS_ERR_OR_NULL(folio)) {
                /*
                 * XXX: we're doing two index lookups when we end up reading the
                 * folio
                 */
                ret = range_has_data(c, inode->ei_inum.subvol,
                                POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT)),
                                POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT) + PAGE_SECTORS));
                if (ret <= 0)
                        return ret;

                folio = __filemap_get_folio(mapping, index,
                                            FGP_LOCK|FGP_CREAT, GFP_KERNEL);
                if (IS_ERR_OR_NULL(folio)) {
                        ret = -ENOMEM;
                        goto out;
                }
        }

        BUG_ON(start    >= folio_end_pos(folio));
        BUG_ON(end      <= folio_pos(folio));

        start_offset    = max(start, folio_pos(folio)) - folio_pos(folio);
        end_offset      = min_t(u64, end, folio_end_pos(folio)) - folio_pos(folio);

        /* Folio boundary? Nothing to do */
        if (start_offset == 0 &&
            end_offset == folio_size(folio)) {
                ret = 0;
                goto unlock;
        }

        s = bch2_folio_create(folio, 0);
        if (!s) {
                ret = -ENOMEM;
                goto unlock;
        }

        if (!folio_test_uptodate(folio)) {
                ret = bch2_read_single_folio(folio, mapping);
                if (ret)
                        goto unlock;
        }

        ret = bch2_folio_set(c, inode_inum(inode), &folio, 1);
        if (ret)
                goto unlock;

        for (i = round_up(start_offset, block_bytes(c)) >> 9;
             i < round_down(end_offset, block_bytes(c)) >> 9;
             i++) {
                s->s[i].nr_replicas = 0;

                i_sectors_delta -= s->s[i].state == SECTOR_dirty;
                bch2_folio_sector_set(folio, s, i, SECTOR_unallocated);
        }

        bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);

        /*
         * Caller needs to know whether this folio will be written out by
         * writeback - doing an i_size update if necessary - or whether it will
         * be responsible for the i_size update.
         *
         * Note that we shouldn't ever see a folio beyond EOF, but check and
         * warn if so. This has been observed with failures to clean up folios
         * after a short write, and there's still a chance reclaim will fix
         * things up.
         */
        WARN_ON_ONCE(folio_pos(folio) >= inode->v.i_size);
        end_pos = folio_end_pos(folio);
        if (inode->v.i_size > folio_pos(folio))
                end_pos = min_t(u64, inode->v.i_size, end_pos);
        ret = s->s[folio_pos_to_s(folio, end_pos - 1)].state >= SECTOR_dirty;

        folio_zero_segment(folio, start_offset, end_offset);

        /*
         * Bit of a hack - we don't want truncate to fail due to -ENOSPC.
         *
         * XXX: because we aren't currently tracking whether the folio has
         * actual data in it (vs. just 0s, or only partially written) this is
         * wrong. ick.
         */
        BUG_ON(bch2_get_folio_disk_reservation(c, inode, folio, false));

        /*
         * This removes any writeable userspace mappings; we need to force
         * .page_mkwrite to be called again before any mmapped writes, to
         * redirty the full page:
         */
        folio_mkclean(folio);
        filemap_dirty_folio(mapping, folio);
unlock:
        folio_unlock(folio);
        folio_put(folio);
out:
        return ret;
}

static int bch2_truncate_folio(struct bch_inode_info *inode, loff_t from)
{
        return __bch2_truncate_folio(inode, from >> PAGE_SHIFT,
                                     from, ANYSINT_MAX(loff_t));
}

static int bch2_truncate_folios(struct bch_inode_info *inode,
                                loff_t start, loff_t end)
{
        int ret = __bch2_truncate_folio(inode, start >> PAGE_SHIFT,
                                        start, end);

        if (ret >= 0 &&
            start >> PAGE_SHIFT != end >> PAGE_SHIFT)
                ret = __bch2_truncate_folio(inode,
                                        (end - 1) >> PAGE_SHIFT,
                                        start, end);
        return ret;
}

static int bch2_extend(struct mnt_idmap *idmap,
                       struct bch_inode_info *inode,
                       struct bch_inode_unpacked *inode_u,
                       struct iattr *iattr)
{
        struct address_space *mapping = inode->v.i_mapping;
        int ret;

        /* this has to be done _before_ extending i_size: */
        ret = filemap_write_and_wait_range(mapping, inode_u->bi_size, S64_MAX);
        if (ret)
                return ret;

        truncate_setsize(&inode->v, iattr->ia_size);

        return bch2_setattr_nonsize(idmap, inode, iattr);
}

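/*
 * truncate(): extending is handed off to bch2_extend(); shrinking truncates
 * the pagecache, zeroes the partial folio at the new EOF, and then drops
 * extents past the new size via bch2_truncate().
 */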
int bchfs_truncate(struct mnt_idmap *idmap,
                   struct bch_inode_info *inode, struct iattr *iattr)
{
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct address_space *mapping = inode->v.i_mapping;
        struct bch_inode_unpacked inode_u;
        s64 i_sectors_delta = 0;
        int ret = 0;

        /*
         * If the truncate call will change the size of the file, the
         * cmtimes should be updated. If the size will not change, we
         * do not need to update the cmtimes.
         */
        if (iattr->ia_size != inode->v.i_size) {
                if (!(iattr->ia_valid & ATTR_MTIME))
                        ktime_get_coarse_real_ts64(&iattr->ia_mtime);
                if (!(iattr->ia_valid & ATTR_CTIME))
                        ktime_get_coarse_real_ts64(&iattr->ia_ctime);
                iattr->ia_valid |= ATTR_MTIME|ATTR_CTIME;
        }

        inode_dio_wait(&inode->v);
        bch2_pagecache_block_get(inode);

        ret = bch2_inode_find_by_inum(c, inode_inum(inode), &inode_u);
        if (ret)
                goto err;

        /*
         * check this before next assertion; on filesystem error our normal
         * invariants are a bit broken (truncate has to truncate the page cache
         * before the inode).
         */
        ret = bch2_journal_error(&c->journal);
        if (ret)
                goto err;

        WARN_ONCE(!test_bit(EI_INODE_ERROR, &inode->ei_flags) &&
                  inode->v.i_size < inode_u.bi_size,
                  "truncate spotted in mem i_size < btree i_size: %llu < %llu\n",
                  (u64) inode->v.i_size, inode_u.bi_size);

        if (iattr->ia_size > inode->v.i_size) {
                ret = bch2_extend(idmap, inode, &inode_u, iattr);
                goto err;
        }

        iattr->ia_valid &= ~ATTR_SIZE;

        ret = bch2_truncate_folio(inode, iattr->ia_size);
        if (unlikely(ret < 0))
                goto err;

        truncate_setsize(&inode->v, iattr->ia_size);

        /*
         * When extending, we're going to write the new i_size to disk
         * immediately so we need to flush anything above the current on disk
         * i_size first:
         *
         * Also, when extending we need to flush the page that i_size currently
         * straddles - if it's mapped to userspace, we need to ensure that
         * userspace has to redirty it and call .mkwrite -> set_page_dirty
         * again to allocate the part of the page that was extended.
         */
        if (iattr->ia_size > inode_u.bi_size)
                ret = filemap_write_and_wait_range(mapping,
                                inode_u.bi_size, S64_MAX);
        else if (iattr->ia_size & (PAGE_SIZE - 1))
                ret = filemap_write_and_wait_range(mapping,
                                round_down(iattr->ia_size, PAGE_SIZE),
                                iattr->ia_size - 1);
        if (ret)
                goto err;

        ret = bch2_truncate(c, inode_inum(inode), iattr->ia_size, &i_sectors_delta);
        bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);

        if (unlikely(ret)) {
                /*
                 * If we error here, VFS caches are now inconsistent with btree
                 */
                set_bit(EI_INODE_ERROR, &inode->ei_flags);
                goto err;
        }

        bch2_fs_inconsistent_on(!inode->v.i_size && inode->v.i_blocks &&
                                !bch2_journal_error(&c->journal), c,
                                "inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)",
                                inode->v.i_ino, (u64) inode->v.i_blocks,
                                inode->ei_inode.bi_sectors);

        ret = bch2_setattr_nonsize(idmap, inode, iattr);
err:
        bch2_pagecache_block_put(inode);
        return bch2_err_class(ret);
}

static int inode_update_times_fn(struct btree_trans *trans,
                                 struct bch_inode_info *inode,
                                 struct bch_inode_unpacked *bi, void *p)
{
        struct bch_fs *c = inode->v.i_sb->s_fs_info;

        bi->bi_mtime = bi->bi_ctime = bch2_current_time(c);
        return 0;
}

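/*
 * FALLOC_FL_PUNCH_HOLE: zero the partial folios at each end of the range,
 * drop the pagecache, punch out whole blocks with bch2_fpunch(), and update
 * mtime/ctime.
 */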
static noinline long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len)
{
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        u64 end         = offset + len;
        u64 block_start = round_up(offset, block_bytes(c));
        u64 block_end   = round_down(end, block_bytes(c));
        bool truncated_last_page;
        int ret = 0;

        ret = bch2_truncate_folios(inode, offset, end);
        if (unlikely(ret < 0))
                goto err;

        truncated_last_page = ret;

        truncate_pagecache_range(&inode->v, offset, end - 1);

        if (block_start < block_end) {
                s64 i_sectors_delta = 0;

                ret = bch2_fpunch(c, inode_inum(inode),
                                  block_start >> 9, block_end >> 9,
                                  &i_sectors_delta);
                bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
        }

        mutex_lock(&inode->ei_update_lock);
        if (end >= inode->v.i_size && !truncated_last_page) {
                ret = bch2_write_inode_size(c, inode, inode->v.i_size,
                                            ATTR_MTIME|ATTR_CTIME);
        } else {
                ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
                                       ATTR_MTIME|ATTR_CTIME);
        }
        mutex_unlock(&inode->ei_update_lock);
err:
        return ret;
}

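/*
 * FALLOC_FL_COLLAPSE_RANGE/FALLOC_FL_INSERT_RANGE: validate alignment,
 * invalidate the pagecache from @offset onwards, then shift extents with
 * bch2_fcollapse_finsert() and adjust i_size.
 */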
static noinline long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
                                   loff_t offset, loff_t len,
                                   bool insert)
{
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct address_space *mapping = inode->v.i_mapping;
        s64 i_sectors_delta = 0;
        int ret = 0;

        if ((offset | len) & (block_bytes(c) - 1))
                return -EINVAL;

        if (insert) {
                if (offset >= inode->v.i_size)
                        return -EINVAL;
        } else {
                if (offset + len >= inode->v.i_size)
                        return -EINVAL;
        }

        ret = bch2_write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX);
        if (ret)
                return ret;

        if (insert)
                i_size_write(&inode->v, inode->v.i_size + len);

        ret = bch2_fcollapse_finsert(c, inode_inum(inode), offset >> 9, len >> 9,
                                     insert, &i_sectors_delta);
        if (!ret && !insert)
                i_size_write(&inode->v, inode->v.i_size - len);
        bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);

        return ret;
}

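/*
 * Core of fallocate/zero range: walk extent slots over the target range and
 * allocate or reserve space for each hole, taking quota reservations as
 * needed and retrying on transaction restart.
 */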
static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
                                      u64 start_sector, u64 end_sector)
{
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bpos end_pos = POS(inode->v.i_ino, end_sector);
        struct bch_io_opts opts;
        int ret = 0;

        bch2_inode_opts_get(&opts, c, &inode->ei_inode);

        bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
                        POS(inode->v.i_ino, start_sector),
                        BTREE_ITER_slots|BTREE_ITER_intent);

        while (!ret) {
                s64 i_sectors_delta = 0;
                struct quota_res quota_res = { 0 };
                struct bkey_s_c k;
                unsigned sectors;
                bool is_allocation;
                u64 hole_start, hole_end;
                u32 snapshot;

                bch2_trans_begin(trans);

                if (bkey_ge(iter.pos, end_pos))
                        break;

                ret = bch2_subvolume_get_snapshot(trans,
                                        inode->ei_inum.subvol, &snapshot);
                if (ret)
                        goto bkey_err;

                bch2_btree_iter_set_snapshot(&iter, snapshot);

                k = bch2_btree_iter_peek_slot(&iter);
                if ((ret = bkey_err(k)))
                        goto bkey_err;

                hole_start      = iter.pos.offset;
                hole_end        = bpos_min(k.k->p, end_pos).offset;
                is_allocation   = bkey_extent_is_allocation(k.k);

                /* already reserved */
                if (bkey_extent_is_reservation(k) &&
                    bch2_bkey_nr_ptrs_fully_allocated(k) >= opts.data_replicas) {
                        bch2_btree_iter_advance(&iter);
                        continue;
                }

                if (bkey_extent_is_data(k.k) &&
                    !(mode & FALLOC_FL_ZERO_RANGE)) {
                        bch2_btree_iter_advance(&iter);
                        continue;
                }

                if (!(mode & FALLOC_FL_ZERO_RANGE)) {
                        /*
                         * Lock ordering - can't be holding btree locks while
                         * blocking on a folio lock:
                         */
                        if (bch2_clamp_data_hole(&inode->v,
                                                 &hole_start,
                                                 &hole_end,
                                                 opts.data_replicas, true)) {
                                ret = drop_locks_do(trans,
                                        (bch2_clamp_data_hole(&inode->v,
                                                              &hole_start,
                                                              &hole_end,
                                                              opts.data_replicas, false), 0));
                                if (ret)
                                        goto bkey_err;
                        }
                        bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, hole_start));

                        if (ret)
                                goto bkey_err;

                        if (hole_start == hole_end)
                                continue;
                }

                sectors = hole_end - hole_start;

                if (!is_allocation) {
                        ret = bch2_quota_reservation_add(c, inode,
                                        &quota_res, sectors, true);
                        if (unlikely(ret))
                                goto bkey_err;
                }

                ret = bch2_extent_fallocate(trans, inode_inum(inode), &iter,
                                            sectors, opts, &i_sectors_delta,
                                            writepoint_hashed((unsigned long) current));
                if (ret)
                        goto bkey_err;

                bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta);

                if (bch2_mark_pagecache_reserved(inode, &hole_start,
                                                 iter.pos.offset, true)) {
                        ret = drop_locks_do(trans,
                                bch2_mark_pagecache_reserved(inode, &hole_start,
                                                             iter.pos.offset, false));
                        if (ret)
                                goto bkey_err;
                }
bkey_err:
                bch2_quota_reservation_put(c, inode, &quota_res);
                if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        ret = 0;
        }

        if (bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE)) {
                struct quota_res quota_res = { 0 };
                s64 i_sectors_delta = 0;

                bch2_fpunch_at(trans, &iter, inode_inum(inode),
                               end_sector, &i_sectors_delta);
                bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
                bch2_quota_reservation_put(c, inode, &quota_res);
        }

        bch2_trans_iter_exit(trans, &iter);
        bch2_trans_put(trans);
        return ret;
}

static noinline long bchfs_fallocate(struct bch_inode_info *inode, int mode,
                                     loff_t offset, loff_t len)
{
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        u64 end         = offset + len;
        u64 block_start = round_down(offset, block_bytes(c));
        u64 block_end   = round_up(end, block_bytes(c));
        bool truncated_last_page = false;
        int ret, ret2 = 0;

        if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) {
                ret = inode_newsize_ok(&inode->v, end);
                if (ret)
                        return ret;
        }

        if (mode & FALLOC_FL_ZERO_RANGE) {
                ret = bch2_truncate_folios(inode, offset, end);
                if (unlikely(ret < 0))
                        return ret;

                truncated_last_page = ret;

                truncate_pagecache_range(&inode->v, offset, end - 1);

                block_start     = round_up(offset, block_bytes(c));
                block_end       = round_down(end, block_bytes(c));
        }

        ret = __bchfs_fallocate(inode, mode, block_start >> 9, block_end >> 9);

        /*
         * On -ENOSPC in ZERO_RANGE mode, we still want to do the inode update,
         * so that the VFS cache i_size is consistent with the btree i_size:
         */
        if (ret &&
            !(bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE)))
                return ret;

        if (mode & FALLOC_FL_KEEP_SIZE && end > inode->v.i_size)
                end = inode->v.i_size;

        if (end >= inode->v.i_size &&
            (((mode & FALLOC_FL_ZERO_RANGE) && !truncated_last_page) ||
             !(mode & FALLOC_FL_KEEP_SIZE))) {
                spin_lock(&inode->v.i_lock);
                i_size_write(&inode->v, end);
                spin_unlock(&inode->v.i_lock);

                mutex_lock(&inode->ei_update_lock);
                ret2 = bch2_write_inode_size(c, inode, end, 0);
                mutex_unlock(&inode->ei_update_lock);
        }

        return ret ?: ret2;
}

long bch2_fallocate_dispatch(struct file *file, int mode,
                             loff_t offset, loff_t len)
{
        struct bch_inode_info *inode = file_bch_inode(file);
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        long ret;

        if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fallocate))
                return -EROFS;

        inode_lock(&inode->v);
        inode_dio_wait(&inode->v);
        bch2_pagecache_block_get(inode);

        ret = file_modified(file);
        if (ret)
                goto err;

        if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE)))
                ret = bchfs_fallocate(inode, mode, offset, len);
        else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE))
                ret = bchfs_fpunch(inode, offset, len);
        else if (mode == FALLOC_FL_INSERT_RANGE)
                ret = bchfs_fcollapse_finsert(inode, offset, len, true);
        else if (mode == FALLOC_FL_COLLAPSE_RANGE)
                ret = bchfs_fcollapse_finsert(inode, offset, len, false);
        else
                ret = -EOPNOTSUPP;
err:
        bch2_pagecache_block_put(inode);
        inode_unlock(&inode->v);
        bch2_write_ref_put(c, BCH_WRITE_REF_fallocate);

        return bch2_err_class(ret);
}

/*
 * Take a quota reservation for unallocated blocks in a given file range.
 * Does not check pagecache.
 */
static int quota_reserve_range(struct bch_inode_info *inode,
                               struct quota_res *res,
                               u64 start, u64 end)
{
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        u64 sectors = end - start;

        int ret = bch2_trans_run(c,
                for_each_btree_key_in_subvolume_upto(trans, iter,
                                BTREE_ID_extents,
                                POS(inode->v.i_ino, start),
                                POS(inode->v.i_ino, end - 1),
                                inode->ei_inum.subvol, 0, k, ({
                        if (bkey_extent_is_allocation(k.k)) {
                                u64 s = min(end, k.k->p.offset) -
                                        max(start, bkey_start_offset(k.k));
                                BUG_ON(s > sectors);
                                sectors -= s;
                        }

                        0;
                })));

        return ret ?: bch2_quota_reservation_add(c, inode, res, sectors, true);
}

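/*
 * reflink/dedupe: remap a block-aligned range from @file_src into @file_dst
 * after invalidating the destination pagecache and reserving quota for the
 * destination range.
 */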
loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
                             struct file *file_dst, loff_t pos_dst,
                             loff_t len, unsigned remap_flags)
{
        struct bch_inode_info *src = file_bch_inode(file_src);
        struct bch_inode_info *dst = file_bch_inode(file_dst);
        struct bch_fs *c = src->v.i_sb->s_fs_info;
        struct quota_res quota_res = { 0 };
        s64 i_sectors_delta = 0;
        u64 aligned_len;
        loff_t ret = 0;

        if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY))
                return -EINVAL;

        if ((pos_src & (block_bytes(c) - 1)) ||
            (pos_dst & (block_bytes(c) - 1)))
                return -EINVAL;

        if (src == dst &&
            abs(pos_src - pos_dst) < len)
                return -EINVAL;

        lock_two_nondirectories(&src->v, &dst->v);
        bch2_lock_inodes(INODE_PAGECACHE_BLOCK, src, dst);

        inode_dio_wait(&src->v);
        inode_dio_wait(&dst->v);

        ret = generic_remap_file_range_prep(file_src, pos_src,
                                            file_dst, pos_dst,
                                            &len, remap_flags);
        if (ret < 0 || len == 0)
                goto err;

        aligned_len = round_up((u64) len, block_bytes(c));

        ret = bch2_write_invalidate_inode_pages_range(dst->v.i_mapping,
                                pos_dst, pos_dst + len - 1);
        if (ret)
                goto err;

        ret = quota_reserve_range(dst, &quota_res, pos_dst >> 9,
                                  (pos_dst + aligned_len) >> 9);
        if (ret)
                goto err;

        if (!(remap_flags & REMAP_FILE_DEDUP))
                file_update_time(file_dst);

        bch2_mark_pagecache_unallocated(src, pos_src >> 9,
                                        (pos_src + aligned_len) >> 9);

        ret = bch2_remap_range(c,
                               inode_inum(dst), pos_dst >> 9,
                               inode_inum(src), pos_src >> 9,
                               aligned_len >> 9,
                               pos_dst + len, &i_sectors_delta);
        if (ret < 0)
                goto err;

        /*
         * due to alignment, we might have remapped slightly more than requested
         */
        ret = min((u64) ret << 9, (u64) len);

        bch2_i_sectors_acct(c, dst, &quota_res, i_sectors_delta);

        spin_lock(&dst->v.i_lock);
        if (pos_dst + ret > dst->v.i_size)
                i_size_write(&dst->v, pos_dst + ret);
        spin_unlock(&dst->v.i_lock);

        if ((file_dst->f_flags & (__O_SYNC | O_DSYNC)) ||
            IS_SYNC(file_inode(file_dst)))
                ret = bch2_flush_inode(c, dst);
err:
        bch2_quota_reservation_put(c, dst, &quota_res);
        bch2_unlock_inodes(INODE_PAGECACHE_BLOCK, src, dst);
        unlock_two_nondirectories(&src->v, &dst->v);

        return bch2_err_class(ret);
}

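/*
 * SEEK_DATA: find the first data extent at or after @offset, then check the
 * pagecache for dirty data in front of it, since dirty folios may not have
 * extents yet.
 */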
static loff_t bch2_seek_data(struct file *file, u64 offset)
{
        struct bch_inode_info *inode = file_bch_inode(file);
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        subvol_inum inum = inode_inum(inode);
        u64 isize, next_data = MAX_LFS_FILESIZE;

        isize = i_size_read(&inode->v);
        if (offset >= isize)
                return -ENXIO;

        int ret = bch2_trans_run(c,
                for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents,
                                   POS(inode->v.i_ino, offset >> 9),
                                   POS(inode->v.i_ino, U64_MAX),
                                   inum.subvol, 0, k, ({
                        if (bkey_extent_is_data(k.k)) {
                                next_data = max(offset, bkey_start_offset(k.k) << 9);
                                break;
                        } else if (k.k->p.offset >> 9 > isize)
                                break;
                        0;
                })));
        if (ret)
                return ret;

        if (next_data > offset)
                next_data = bch2_seek_pagecache_data(&inode->v,
                                        offset, next_data, 0, false);

        if (next_data >= isize)
                return -ENXIO;

        return vfs_setpos(file, next_data, MAX_LFS_FILESIZE);
}

static loff_t bch2_seek_hole(struct file *file, u64 offset)
{
        struct bch_inode_info *inode = file_bch_inode(file);
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        subvol_inum inum = inode_inum(inode);
        u64 isize, next_hole = MAX_LFS_FILESIZE;

        isize = i_size_read(&inode->v);
        if (offset >= isize)
                return -ENXIO;

        int ret = bch2_trans_run(c,
                for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents,
                                   POS(inode->v.i_ino, offset >> 9),
                                   POS(inode->v.i_ino, U64_MAX),
                                   inum.subvol, BTREE_ITER_slots, k, ({
                        if (k.k->p.inode != inode->v.i_ino) {
                                next_hole = bch2_seek_pagecache_hole(&inode->v,
                                                offset, MAX_LFS_FILESIZE, 0, false);
                                break;
                        } else if (!bkey_extent_is_data(k.k)) {
                                next_hole = bch2_seek_pagecache_hole(&inode->v,
                                                max(offset, bkey_start_offset(k.k) << 9),
                                                k.k->p.offset << 9, 0, false);

                                if (next_hole < k.k->p.offset << 9)
                                        break;
                        } else {
                                offset = max(offset, bkey_start_offset(k.k) << 9);
                        }
                        0;
                })));
        if (ret)
                return ret;

        if (next_hole > isize)
                next_hole = isize;

        return vfs_setpos(file, next_hole, MAX_LFS_FILESIZE);
}

loff_t bch2_llseek(struct file *file, loff_t offset, int whence)
{
        loff_t ret;

        switch (whence) {
        case SEEK_SET:
        case SEEK_CUR:
        case SEEK_END:
                ret = generic_file_llseek(file, offset, whence);
                break;
        case SEEK_DATA:
                ret = bch2_seek_data(file, offset);
                break;
        case SEEK_HOLE:
                ret = bch2_seek_hole(file, offset);
                break;
        default:
                ret = -EINVAL;
                break;
        }

        return bch2_err_class(ret);
}

void bch2_fs_fsio_exit(struct bch_fs *c)
{
        bioset_exit(&c->nocow_flush_bioset);
}

int bch2_fs_fsio_init(struct bch_fs *c)
{
        if (bioset_init(&c->nocow_flush_bioset,
                        1, offsetof(struct nocow_flush, bio), 0))
                return -BCH_ERR_ENOMEM_nocow_flush_bioset_init;

        return 0;
}

#endif /* NO_BCACHEFS_FS */