// SPDX-License-Identifier: GPL-2.0
/*
 * io_misc.c - fallocate, fpunch, truncate:
 */

#include "bcachefs.h"
#include "alloc_foreground.h"
#include "bkey_buf.h"
#include "btree_update.h"
#include "buckets.h"
#include "clock.h"
#include "error.h"
#include "extents.h"
#include "extent_update.h"
#include "inode.h"
#include "io_misc.h"
#include "io_write.h"
#include "logged_ops.h"
#include "rebalance.h"
#include "subvolume.h"

/* Overwrites whatever was present with zeroes: */
int bch2_extent_fallocate(struct btree_trans *trans,
			  subvol_inum inum,
			  struct btree_iter *iter,
			  u64 sectors,
			  struct bch_io_opts opts,
			  s64 *i_sectors_delta,
			  struct write_point_specifier write_point)
{
	struct bch_fs *c = trans->c;
	struct disk_reservation disk_res = { 0 };
	struct closure cl;
	struct open_buckets open_buckets = { 0 };
	struct bkey_s_c k;
	struct bkey_buf old, new;
	unsigned sectors_allocated = 0, new_replicas;
	bool unwritten = opts.nocow &&
	    c->sb.version >= bcachefs_metadata_version_unwritten_extents;
	int ret;

	bch2_bkey_buf_init(&old);
	bch2_bkey_buf_init(&new);
	closure_init_stack(&cl);
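
	/*
	 * Fallocate works one btree key at a time: we clamp the request to the
	 * extent at the iterator position, and the caller loops as needed.
	 */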
	k = bch2_btree_iter_peek_slot(iter);
	ret = bkey_err(k);
	if (ret)
		return ret;

	sectors = min_t(u64, sectors, k.k->p.offset - iter->pos.offset);
	new_replicas = max(0, (int) opts.data_replicas -
			   (int) bch2_bkey_nr_ptrs_fully_allocated(k));

	/*
	 * Get a disk reservation before (in the nocow case) calling
	 * into the allocator:
	 */
	ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0);
	if (unlikely(ret))
		goto err_noprint;

	bch2_bkey_buf_reassemble(&old, c, k);
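
	/*
	 * On filesystems too old for unwritten extents (or outside nocow
	 * mode), a reservation key reserves space without allocating it;
	 * otherwise we allocate space now and mark the pointers unwritten:
	 */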
	if (!unwritten) {
		struct bkey_i_reservation *reservation;

		bch2_bkey_buf_realloc(&new, c, sizeof(*reservation) / sizeof(u64));
		reservation = bkey_reservation_init(new.k);
		reservation->k.p = iter->pos;
		bch2_key_resize(&reservation->k, sectors);
		reservation->v.nr_replicas = opts.data_replicas;
	} else {
		struct bkey_i_extent *e;
		struct bch_devs_list devs_have;
		struct write_point *wp;

		devs_have.nr = 0;

		bch2_bkey_buf_realloc(&new, c, BKEY_EXTENT_U64s_MAX);

		e = bkey_extent_init(new.k);
		e->k.p = iter->pos;

		ret = bch2_alloc_sectors_start_trans(trans,
				opts.foreground_target,
				false,
				write_point,
				&devs_have,
				opts.data_replicas,
				opts.data_replicas,
				BCH_WATERMARK_normal, 0, &cl, &wp);
		if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
			ret = -BCH_ERR_transaction_restart_nested;
		if (ret)
			goto err;

		sectors = min_t(u64, sectors, wp->sectors_free);
		sectors_allocated = sectors;

		bch2_key_resize(&e->k, sectors);

		bch2_open_bucket_get(c, wp, &open_buckets);
		bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false);
		bch2_alloc_sectors_done(c, wp);

		extent_for_each_ptr(extent_i_to_s(e), ptr)
			ptr->unwritten = true;
	}
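
	/* The btree update; i_sectors accounting happens via i_sectors_delta: */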
	ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res,
				 0, i_sectors_delta, true);
err:
	if (!ret && sectors_allocated)
		bch2_increment_clock(c, sectors_allocated, WRITE);
	if (should_print_err(ret))
		bch_err_inum_offset_ratelimited(c,
			inum.inum,
			iter->pos.offset << 9,
			"%s(): error: %s", __func__, bch2_err_str(ret));
err_noprint:
	bch2_open_buckets_put(c, &open_buckets);
	bch2_disk_reservation_put(c, &disk_res);
	bch2_bkey_buf_exit(&new, c);
	bch2_bkey_buf_exit(&old, c);
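
	/*
	 * If we had to block on the allocator, finish waiting with btree locks
	 * dropped:
	 */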
	if (closure_nr_remaining(&cl) != 1) {
		bch2_trans_unlock_long(trans);
		bch2_wait_on_allocator(c, &cl);
	}

	return ret;
}

/*
 * Returns -BCH_ERR_transaction_restart if we had to drop locks:
 */
int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
		   subvol_inum inum, u64 end,
		   s64 *i_sectors_delta)
{
	struct bch_fs *c	= trans->c;
	unsigned max_sectors	= KEY_SIZE_MAX & (~0 << c->block_bits);
	struct bpos end_pos	= POS(inum.inum, end);
	struct bkey_s_c k;
	int ret = 0, ret2 = 0;
	u32 snapshot;
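
	/*
	 * Delete as much as possible per transaction; ret2 remembers whether
	 * we ever restarted, so the caller can tell locks were dropped:
	 */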
	while (!ret ||
	       bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
		struct disk_reservation disk_res =
			bch2_disk_reservation_init(c, 0);
		struct bkey_i delete;

		if (ret)
			ret2 = ret;

		bch2_trans_begin(trans);

		ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
		if (ret)
			continue;

		bch2_btree_iter_set_snapshot(iter, snapshot);

		/*
		 * peek_upto() doesn't have ideal semantics for extents:
		 */
		k = bch2_btree_iter_peek_upto(iter, end_pos);
		ret = bkey_err(k);
		if (ret)
			continue;

		if (!k.k)
			break;

		bkey_init(&delete.k);
		delete.k.p = iter->pos;

		/* create the biggest key we can */
		bch2_key_resize(&delete.k, max_sectors);
		bch2_cut_back(end_pos, &delete);

		ret = bch2_extent_update(trans, inum, iter, &delete,
				&disk_res, 0, i_sectors_delta, false);
		bch2_disk_reservation_put(c, &disk_res);
	}

	return ret ?: ret2;
}
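
/*
 * Standalone wrapper for bch2_fpunch_at(): runs it in its own btree
 * transaction, and doesn't report transaction restarts to the caller:
 */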
int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end,
		s64 *i_sectors_delta)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
			     POS(inum.inum, start),
			     BTREE_ITER_intent);

	ret = bch2_fpunch_at(trans, &iter, inum, end, i_sectors_delta);

	bch2_trans_iter_exit(trans, &iter);
	bch2_trans_put(trans);

	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		ret = 0;

	return ret;
}

/* truncate: */

void bch2_logged_op_truncate_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
{
	struct bkey_s_c_logged_op_truncate op = bkey_s_c_to_logged_op_truncate(k);

	prt_printf(out, "subvol=%u", le32_to_cpu(op.v->subvol));
	prt_printf(out, " inum=%llu", le64_to_cpu(op.v->inum));
	prt_printf(out, " new_i_size=%llu", le64_to_cpu(op.v->new_i_size));
}

static int truncate_set_isize(struct btree_trans *trans,
			      subvol_inum inum,
			      u64 new_i_size,
			      bool warn)
{
	struct btree_iter iter = { NULL };
	struct bch_inode_unpacked inode_u;
	int ret;

	ret   = __bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_intent, warn) ?:
		(inode_u.bi_size = new_i_size, 0) ?:
		bch2_inode_write(trans, &iter, &inode_u);

	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
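
/*
 * Truncate is a logged operation: the op key is journalled first, so an
 * interrupted truncate is finished when the filesystem is next mounted:
 */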
static int __bch2_resume_logged_op_truncate(struct btree_trans *trans,
					    struct bkey_i *op_k,
					    u64 *i_sectors_delta)
{
	struct bch_fs *c = trans->c;
	struct btree_iter fpunch_iter;
	struct bkey_i_logged_op_truncate *op = bkey_i_to_logged_op_truncate(op_k);
	subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) };
	u64 new_i_size = le64_to_cpu(op->v.new_i_size);
	bool warn_errors = i_sectors_delta != NULL;
	int ret;

	ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			truncate_set_isize(trans, inum, new_i_size, i_sectors_delta != NULL));
	if (ret)
		goto err;

	bch2_trans_iter_init(trans, &fpunch_iter, BTREE_ID_extents,
			     POS(inum.inum, round_up(new_i_size, block_bytes(c)) >> 9),
			     BTREE_ITER_intent);
	ret = bch2_fpunch_at(trans, &fpunch_iter, inum, U64_MAX, i_sectors_delta);
	bch2_trans_iter_exit(trans, &fpunch_iter);

	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		ret = 0;
err:
	if (warn_errors)
		bch_err_fn(c, ret);
	return ret;
}

int bch2_resume_logged_op_truncate(struct btree_trans *trans, struct bkey_i *op_k)
{
	return __bch2_resume_logged_op_truncate(trans, op_k, NULL);
}

int bch2_truncate(struct bch_fs *c, subvol_inum inum, u64 new_i_size, u64 *i_sectors_delta)
{
	struct bkey_i_logged_op_truncate op;

	bkey_logged_op_truncate_init(&op.k_i);
	op.v.subvol	= cpu_to_le32(inum.subvol);
	op.v.inum	= cpu_to_le64(inum.inum);
	op.v.new_i_size	= cpu_to_le64(new_i_size);

	/*
	 * Logged ops aren't atomic w.r.t. snapshot creation: creating a
	 * snapshot while they're in progress, then crashing, will result in the
	 * resume only proceeding in one of the snapshots
	 */
	down_read(&c->snapshot_create_lock);
	struct btree_trans *trans = bch2_trans_get(c);
	int ret = bch2_logged_op_start(trans, &op.k_i);
	if (ret)
		goto out;
	ret = __bch2_resume_logged_op_truncate(trans, &op.k_i, i_sectors_delta);
	ret = bch2_logged_op_finish(trans, &op.k_i) ?: ret;
out:
	bch2_trans_put(trans);
	up_read(&c->snapshot_create_lock);

	return ret;
}

/* finsert/fcollapse: */

void bch2_logged_op_finsert_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
{
	struct bkey_s_c_logged_op_finsert op = bkey_s_c_to_logged_op_finsert(k);

	prt_printf(out, "subvol=%u", le32_to_cpu(op.v->subvol));
	prt_printf(out, " inum=%llu", le64_to_cpu(op.v->inum));
	prt_printf(out, " dst_offset=%lli", le64_to_cpu(op.v->dst_offset));
	prt_printf(out, " src_offset=%llu", le64_to_cpu(op.v->src_offset));
}
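
/* Adjust i_size by len sectors, updating [mc]time; used by finsert/fcollapse: */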
static int adjust_i_size(struct btree_trans *trans, subvol_inum inum,
			 u64 offset, s64 len, bool warn)
{
	struct btree_iter iter;
	struct bch_inode_unpacked inode_u;
	int ret;

	offset	<<= 9;
	len	<<= 9;

	ret = __bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_intent, warn);
	if (ret)
		return ret;

	if (len > 0) {
		if (MAX_LFS_FILESIZE - inode_u.bi_size < len) {
			ret = -EFBIG;
			goto err;
		}

		if (offset >= inode_u.bi_size) {
			ret = -EINVAL;
			goto err;
		}
	}

	inode_u.bi_size += len;
	inode_u.bi_mtime = inode_u.bi_ctime = bch2_current_time(trans->c);

	ret = bch2_inode_write(trans, &iter, &inode_u);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static int __bch2_resume_logged_op_finsert(struct btree_trans *trans,
					   struct bkey_i *op_k,
					   u64 *i_sectors_delta)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_i_logged_op_finsert *op = bkey_i_to_logged_op_finsert(op_k);
	subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) };
	struct bch_io_opts opts;
	u64 dst_offset = le64_to_cpu(op->v.dst_offset);
	u64 src_offset = le64_to_cpu(op->v.src_offset);
	s64 shift = dst_offset - src_offset;
	u64 len = abs(shift);
	u64 pos = le64_to_cpu(op->v.pos);
	bool insert = shift > 0;
	u32 snapshot;
	bool warn_errors = i_sectors_delta != NULL;
	int ret = 0;

	ret = bch2_inum_opts_get(trans, inum, &opts);
	if (ret)
		return ret;

	/*
	 * check for missing subvolume before fpunch, as in resume we don't want
	 * it to be a fatal error
	 */
	ret = lockrestart_do(trans, __bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot, warn_errors));
	if (ret)
		return ret;
	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
			     POS(inum.inum, 0),
			     BTREE_ITER_intent);

	switch (op->v.state) {
	case LOGGED_OP_FINSERT_start:
		op->v.state = LOGGED_OP_FINSERT_shift_extents;

		if (insert) {
			ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
					adjust_i_size(trans, inum, src_offset, len, warn_errors) ?:
					bch2_logged_op_update(trans, &op->k_i));
			if (ret)
				goto err;
		} else {
			bch2_btree_iter_set_pos(&iter, POS(inum.inum, src_offset));

			ret = bch2_fpunch_at(trans, &iter, inum, src_offset + len, i_sectors_delta);
			if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
				goto err;

			ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
					bch2_logged_op_update(trans, &op->k_i));
		}

		fallthrough;
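	/*
	 * Shift one extent per transaction commit: walking backwards from the
	 * end for insert, forwards for collapse:
	 */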
	case LOGGED_OP_FINSERT_shift_extents:
		while (1) {
			struct disk_reservation disk_res =
				bch2_disk_reservation_init(c, 0);
			struct bkey_i delete, *copy;
			struct bkey_s_c k;
			struct bpos src_pos = POS(inum.inum, src_offset);

			bch2_trans_begin(trans);

			ret = __bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot,
							    warn_errors);
			if (ret)
				goto btree_err;

			bch2_btree_iter_set_snapshot(&iter, snapshot);
			bch2_btree_iter_set_pos(&iter, SPOS(inum.inum, pos, snapshot));

			k = insert
				? bch2_btree_iter_peek_prev(&iter)
				: bch2_btree_iter_peek_upto(&iter, POS(inum.inum, U64_MAX));
			if ((ret = bkey_err(k)))
				goto btree_err;

			if (!k.k ||
			    k.k->p.inode != inum.inum ||
			    bkey_le(k.k->p, POS(inum.inum, src_offset)))
				break;

			copy = bch2_bkey_make_mut_noupdate(trans, k);
			if ((ret = PTR_ERR_OR_ZERO(copy)))
				goto btree_err;

			if (insert &&
			    bkey_lt(bkey_start_pos(k.k), src_pos)) {
				bch2_cut_front(src_pos, copy);

				/* Splitting compressed extent? */
				bch2_disk_reservation_add(c, &disk_res,
							  copy->k.size *
							  bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy)),
							  BCH_DISK_RESERVATION_NOFAIL);
			}

			bkey_init(&delete.k);
			delete.k.p = copy->k.p;
			delete.k.p.snapshot = snapshot;
			delete.k.size = copy->k.size;

			copy->k.p.offset += shift;
			copy->k.p.snapshot = snapshot;

			op->v.pos = cpu_to_le64(insert ? bkey_start_offset(&delete.k) : delete.k.p.offset);

			ret =   bch2_bkey_set_needs_rebalance(c, copy, &opts) ?:
				bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?:
				bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?:
				bch2_logged_op_update(trans, &op->k_i) ?:
				bch2_trans_commit(trans, &disk_res, NULL, BCH_TRANS_COMMIT_no_enospc);
btree_err:
			bch2_disk_reservation_put(c, &disk_res);

			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
				continue;
			if (ret)
				goto err;

			pos = le64_to_cpu(op->v.pos);
		}

		op->v.state = LOGGED_OP_FINSERT_finish;

		if (!insert) {
			ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
					adjust_i_size(trans, inum, src_offset, shift, warn_errors) ?:
					bch2_logged_op_update(trans, &op->k_i));
		} else {
			/* We need an inode update to update bi_journal_seq for fsync: */
			ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
					adjust_i_size(trans, inum, 0, 0, warn_errors) ?:
					bch2_logged_op_update(trans, &op->k_i));
		}

		break;
	case LOGGED_OP_FINSERT_finish:
		break;
	}
err:
	bch2_trans_iter_exit(trans, &iter);
	if (warn_errors)
		bch_err_fn(c, ret);
	return ret;
}

int bch2_resume_logged_op_finsert(struct btree_trans *trans, struct bkey_i *op_k)
{
	return __bch2_resume_logged_op_finsert(trans, op_k, NULL);
}

int bch2_fcollapse_finsert(struct bch_fs *c, subvol_inum inum,
			   u64 offset, u64 len, bool insert,
			   s64 *i_sectors_delta)
{
	struct bkey_i_logged_op_finsert op;
	s64 shift = insert ? len : -len;

	bkey_logged_op_finsert_init(&op.k_i);
	op.v.subvol	= cpu_to_le32(inum.subvol);
	op.v.inum	= cpu_to_le64(inum.inum);
	op.v.dst_offset	= cpu_to_le64(offset + shift);
	op.v.src_offset	= cpu_to_le64(offset);
	op.v.pos	= cpu_to_le64(insert ? U64_MAX : offset);

	/*
	 * Logged ops aren't atomic w.r.t. snapshot creation: creating a
	 * snapshot while they're in progress, then crashing, will result in the
	 * resume only proceeding in one of the snapshots
	 */
	down_read(&c->snapshot_create_lock);
	struct btree_trans *trans = bch2_trans_get(c);
	int ret = bch2_logged_op_start(trans, &op.k_i);
	if (ret)
		goto out;
	ret = __bch2_resume_logged_op_finsert(trans, &op.k_i, i_sectors_delta);
	ret = bch2_logged_op_finish(trans, &op.k_i) ?: ret;
out:
	bch2_trans_put(trans);
	up_read(&c->snapshot_create_lock);

	return ret;
}