1 // SPDX-License-Identifier: GPL-2.0
3 #include "alloc_background.h"
4 #include "alloc_foreground.h"
5 #include "backpointers.h"
7 #include "btree_cache.h"
9 #include "btree_key_cache.h"
10 #include "btree_update.h"
11 #include "btree_update_interior.h"
13 #include "btree_write_buffer.h"
15 #include "buckets_waiting_for_journal.h"
18 #include "disk_accounting.h"
26 #include <linux/kthread.h>
27 #include <linux/math64.h>
28 #include <linux/random.h>
29 #include <linux/rculist.h>
30 #include <linux/rcupdate.h>
31 #include <linux/sched/task.h>
32 #include <linux/sort.h>
33 #include <linux/jiffies.h>
/*
 * Forward declaration: bch2_discard_one_bucket_fast() is called from the alloc
 * trigger further down (with a device and a bucket offset); its definition is
 * not part of this view.
 */
35 static void bch2_discard_one_bucket_fast(struct bch_dev
*, u64
);
37 /* Persistent alloc info: */
/*
 * Width in bytes of each v1 alloc field, indexed by BCH_ALLOC_FIELD_V1_<name>.
 * The x() macro turns each field's bit width into bytes (bits / 8).
 * NOTE(review): the field list that expands x() is not visible in this view.
 */
39 static const unsigned BCH_ALLOC_V1_FIELD_BYTES
[] = {
40 #define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8,
/*
 * Unpacked, in-memory form of the pre-v4 alloc key versions; filled in by the
 * bch2_alloc_unpack_v*() helpers below. The x() macro declares one unsigned
 * member of the given bit width (u8/u16/u32/u64) per listed field.
 * NOTE(review): the fixed members and the field list are not visible here.
 */
45 struct bkey_alloc_unpacked
{
52 #define x(_name, _bits) u##_bits _name;
/*
 * Fetch one v1 alloc field from the packed data cursor *p.
 *
 * @a:     v1 alloc value; a->fields is a bitmask of which fields are present
 * @p:     in/out cursor into the packed field data
 * @field: field index, also the index into BCH_ALLOC_V1_FIELD_BYTES
 *
 * The field's width is looked up in BCH_ALLOC_V1_FIELD_BYTES[field]; a field
 * whose bit is clear in a->fields takes the early-out branch.
 * NOTE(review): only the one-byte read is visible in this view; the other
 * widths, the cursor advance and the return statement are presumably on the
 * lines that are missing - confirm against the full source.
 */
57 static inline u64
alloc_field_v1_get(const struct bch_alloc
*a
,
58 const void **p
, unsigned field
)
60 unsigned bytes
= BCH_ALLOC_V1_FIELD_BYTES
[field
];
63 if (!(a
->fields
& (1 << field
)))
68 v
= *((const u8
*) *p
);
/*
 * Unpack a v1 alloc key (struct bch_alloc) into *out.
 *
 * Starting at in->data, each field is pulled in declaration order with
 * alloc_field_v1_get(), which is handed a running field index (idx++) and the
 * data cursor d.
 */
87 static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked
*out
,
90 const struct bch_alloc
*in
= bkey_s_c_to_alloc(k
).v
;
91 const void *d
= in
->data
;
96 #define x(_name, _bits) out->_name = alloc_field_v1_get(in, &d, idx++);
/*
 * Unpack a v2 alloc key into *out.
 *
 * oldest_gen and data_type are copied straight from the value header. The
 * remaining fields (BCH_ALLOC_FIELDS_V2) are varint-encoded in a.v->data and
 * are decoded one by one with bch2_varint_decode_fast(), bounded by the end of
 * the value; only the first a.v->nr_fields fields are decoded.
 *
 * The int result is used as an error indication by bch2_alloc_v2_validate().
 * The decoded value is compared with what was stored in out->_name.
 * NOTE(review): presumably that comparison detects truncation to the member's
 * width; the handling lines are not visible here - confirm.
 */
101 static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked
*out
,
104 struct bkey_s_c_alloc_v2 a
= bkey_s_c_to_alloc_v2(k
);
105 const u8
*in
= a
.v
->data
;
106 const u8
*end
= bkey_val_end(a
);
107 unsigned fieldnr
= 0;
112 out
->oldest_gen
= a
.v
->oldest_gen
;
113 out
->data_type
= a
.v
->data_type
;
115 #define x(_name, _bits) \
116 if (fieldnr < a.v->nr_fields) { \
117 ret = bch2_varint_decode_fast(in, end, &v); \
125 if (v != out->_name) \
129 BCH_ALLOC_FIELDS_V2()
/*
 * Unpack a v3 alloc key into *out.
 *
 * Same scheme as the v2 unpacker: header members are copied directly and the
 * BCH_ALLOC_FIELDS_V2 fields are varint-decoded from a.v->data up to
 * a.v->nr_fields. In addition, v3 carries need_discard / need_inc_gen flags
 * (read via the BCH_ALLOC_V3_* accessors) and a little-endian journal_seq.
 *
 * The int result is used as an error indication by bch2_alloc_v3_validate().
 */
134 static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked
*out
,
137 struct bkey_s_c_alloc_v3 a
= bkey_s_c_to_alloc_v3(k
);
138 const u8
*in
= a
.v
->data
;
139 const u8
*end
= bkey_val_end(a
);
140 unsigned fieldnr
= 0;
145 out
->oldest_gen
= a
.v
->oldest_gen
;
146 out
->data_type
= a
.v
->data_type
;
147 out
->need_discard
= BCH_ALLOC_V3_NEED_DISCARD(a
.v
);
148 out
->need_inc_gen
= BCH_ALLOC_V3_NEED_INC_GEN(a
.v
);
149 out
->journal_seq
= le64_to_cpu(a
.v
->journal_seq
);
151 #define x(_name, _bits) \
152 if (fieldnr < a.v->nr_fields) { \
153 ret = bch2_varint_decode_fast(in, end, &v); \
161 if (v != out->_name) \
165 BCH_ALLOC_FIELDS_V2()
/*
 * Unpack any pre-v4 alloc key into a struct bkey_alloc_unpacked, returned by
 * value. Dispatches to the v1/v2/v3 unpacker; KEY_TYPE_alloc_v2 and
 * KEY_TYPE_alloc_v3 are the visible case labels.
 * The return codes of the v2/v3 unpackers are not used here.
 */
170 static struct bkey_alloc_unpacked
bch2_alloc_unpack(struct bkey_s_c k
)
172 struct bkey_alloc_unpacked ret
= { .gen
= 0 };
176 bch2_alloc_unpack_v1(&ret
, k
);
178 case KEY_TYPE_alloc_v2
:
179 bch2_alloc_unpack_v2(&ret
, k
);
181 case KEY_TYPE_alloc_v3
:
182 bch2_alloc_unpack_v3(&ret
, k
);
/*
 * Minimum value size, in u64s, of a v1 alloc key given its field bitmask.
 *
 * Starts from the offset of the data[] member and adds the byte width of every
 * field whose bit is set in a->fields, then rounds up to whole u64s.
 */
189 static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc
*a
)
191 unsigned i
, bytes
= offsetof(struct bch_alloc
, data
);
193 for (i
= 0; i
< ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES
); i
++)
194 if (a
->fields
& (1 << i
))
195 bytes
+= BCH_ALLOC_V1_FIELD_BYTES
[i
];
197 return DIV_ROUND_UP(bytes
, sizeof(u64
));
/*
 * Validate a v1 alloc key.
 *
 * Raises the alloc_v1_val_size_bad fsck error when the value is smaller than
 * the size implied by its field bitmask (bch_alloc_v1_val_u64s()). A value
 * that is larger than needed is accepted, to allow for unknown fields.
 */
200 int bch2_alloc_v1_validate(struct bch_fs
*c
, struct bkey_s_c k
,
201 enum bch_validate_flags flags
)
203 struct bkey_s_c_alloc a
= bkey_s_c_to_alloc(k
);
206 /* allow for unknown fields */
207 bkey_fsck_err_on(bkey_val_u64s(a
.k
) < bch_alloc_v1_val_u64s(a
.v
),
208 c
, alloc_v1_val_size_bad
,
209 "incorrect value size (%zu < %u)",
210 bkey_val_u64s(a
.k
), bch_alloc_v1_val_u64s(a
.v
));
/*
 * Validate a v2 alloc key: the key is valid if bch2_alloc_unpack_v2() can
 * decode it; a non-zero unpack result raises alloc_v2_unpack_error.
 * The unpacked copy in u is scratch space only.
 */
215 int bch2_alloc_v2_validate(struct bch_fs
*c
, struct bkey_s_c k
,
216 enum bch_validate_flags flags
)
218 struct bkey_alloc_unpacked u
;
221 bkey_fsck_err_on(bch2_alloc_unpack_v2(&u
, k
),
222 c
, alloc_v2_unpack_error
,
/*
 * Validate a v3 alloc key: the key is valid if bch2_alloc_unpack_v3() can
 * decode it. The unpacked copy in u is scratch space only.
 *
 * NOTE(review): the failure is reported under alloc_v2_unpack_error, the same
 * error id the v2 validator uses - confirm whether a v3-specific id was
 * intended.
 */
228 int bch2_alloc_v3_validate(struct bch_fs
*c
, struct bkey_s_c k
,
229 enum bch_validate_flags flags
)
231 struct bkey_alloc_unpacked u
;
234 bkey_fsck_err_on(bch2_alloc_unpack_v3(&u
, k
),
235 c
, alloc_v2_unpack_error
,
/*
 * Validate a v4 alloc key.
 *
 * The value is first copied into a local struct bch_alloc_v4 (bkey_val_copy)
 * and all checks run on that copy. Visible checks, each raising its own fsck
 * error id:
 *  - the value must be at least alloc_v4_u64s_noerror() u64s long;
 *  - backpointers may not be present when BACKPOINTERS_START is zero;
 *  - data_type must equal what alloc_data_type() derives from the key;
 *  - io_time[READ] and io_time[WRITE] must not exceed LRU_TIME_MAX;
 *  - per-data_type consistency of the sector counts (the switch below).
 *
 * NOTE(review): several operands of the per-type conditions are on lines not
 * visible in this view; the notes below only describe what can be seen.
 */
241 int bch2_alloc_v4_validate(struct bch_fs
*c
, struct bkey_s_c k
,
242 enum bch_validate_flags flags
)
244 struct bch_alloc_v4 a
;
247 bkey_val_copy(&a
, bkey_s_c_to_alloc_v4(k
));
249 bkey_fsck_err_on(alloc_v4_u64s_noerror(&a
) > bkey_val_u64s(k
.k
),
250 c
, alloc_v4_val_size_bad
,
251 "bad val size (%u > %zu)",
252 alloc_v4_u64s_noerror(&a
), bkey_val_u64s(k
.k
));
254 bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(&a
) &&
255 BCH_ALLOC_V4_NR_BACKPOINTERS(&a
),
256 c
, alloc_v4_backpointers_start_bad
,
257 "invalid backpointers_start");
259 bkey_fsck_err_on(alloc_data_type(a
, a
.data_type
) != a
.data_type
,
260 c
, alloc_key_data_type_bad
,
261 "invalid data type (got %u should be %u)",
262 a
.data_type
, alloc_data_type(a
, a
.data_type
));
/* Index 0/1 of io_time[] are named via READ/WRITE in the message below. */
264 for (unsigned i
= 0; i
< 2; i
++)
265 bkey_fsck_err_on(a
.io_time
[i
] > LRU_TIME_MAX
,
266 c
, alloc_key_io_time_bad
,
267 "invalid io_time[%s]: %llu, max %llu",
268 i
== READ
? "read" : "write",
269 a
.io_time
[i
], LRU_TIME_MAX
);
/*
 * stripe_sectors is only read when the fixed part of the value (as given by
 * BACKPOINTERS_START, in u64s) extends past that member's offset.
 */
271 unsigned stripe_sectors
= BCH_ALLOC_V4_BACKPOINTERS_START(&a
) * sizeof(u64
) >
272 offsetof(struct bch_alloc_v4
, stripe_sectors
)
276 switch (a
.data_type
) {
278 case BCH_DATA_need_gc_gens
:
279 case BCH_DATA_need_discard
:
280 bkey_fsck_err_on(stripe_sectors
||
284 c
, alloc_key_empty_but_have_data
,
285 "empty data type free but have data %u.%u.%u %u",
292 case BCH_DATA_journal
:
295 case BCH_DATA_parity
:
296 bkey_fsck_err_on(!a
.dirty_sectors
&&
298 c
, alloc_key_dirty_sectors_0
,
299 "data_type %s but dirty_sectors==0",
300 bch2_data_type_str(a
.data_type
));
302 case BCH_DATA_cached
:
303 bkey_fsck_err_on(!a
.cached_sectors
||
307 c
, alloc_key_cached_inconsistency
,
308 "data type inconsistency");
/*
 * A zero read time on a cached bucket is only an error once recovery has
 * gone past the check_alloc_to_lru_refs pass.
 */
310 bkey_fsck_err_on(!a
.io_time
[READ
] &&
311 c
->curr_recovery_pass
> BCH_RECOVERY_PASS_check_alloc_to_lru_refs
,
312 c
, alloc_key_cached_but_read_time_zero
,
313 "cached bucket with read_time == 0");
315 case BCH_DATA_stripe
:
/*
 * Byte-swap a v4 alloc value in place.
 *
 * Every multi-byte header member is swapped with the swab helper matching its
 * width, then each of the BCH_ALLOC_V4_NR_BACKPOINTERS() inline backpointers
 * has its bucket_offset (40-bit swab), bucket_len and pos swapped.
 */
322 void bch2_alloc_v4_swab(struct bkey_s k
)
324 struct bch_alloc_v4
*a
= bkey_s_to_alloc_v4(k
).v
;
325 struct bch_backpointer
*bp
, *bps
;
327 a
->journal_seq
= swab64(a
->journal_seq
);
328 a
->flags
= swab32(a
->flags
);
329 a
->dirty_sectors
= swab32(a
->dirty_sectors
);
330 a
->cached_sectors
= swab32(a
->cached_sectors
);
331 a
->io_time
[0] = swab64(a
->io_time
[0]);
332 a
->io_time
[1] = swab64(a
->io_time
[1]);
333 a
->stripe
= swab32(a
->stripe
);
334 a
->nr_external_backpointers
= swab32(a
->nr_external_backpointers
);
335 a
->stripe_sectors
= swab32(a
->stripe_sectors
);
/* Inline backpointers follow the fixed part of the value. */
337 bps
= alloc_v4_backpointers(a
);
338 for (bp
= bps
; bp
< bps
+ BCH_ALLOC_V4_NR_BACKPOINTERS(a
); bp
++) {
339 bp
->bucket_offset
= swab40(bp
->bucket_offset
);
340 bp
->bucket_len
= swab32(bp
->bucket_len
);
341 bch2_bpos_swab(&bp
->pos
);
/*
 * Print an alloc key (any version) in human-readable form to @out.
 *
 * The key is converted to v4 layout first (bch2_alloc_to_v4(), using _a as
 * scratch space), then one attribute is printed per line inside a 2-column
 * indent. The device is only looked up when @c is non-NULL; it is needed for
 * the fragmentation line.
 *
 * NOTE(review): ca may be NULL here; the guard around the fragmentation line
 * and the release of the device reference are presumably on lines not visible
 * in this view - confirm.
 */
345 void bch2_alloc_to_text(struct printbuf
*out
, struct bch_fs
*c
, struct bkey_s_c k
)
347 struct bch_alloc_v4 _a
;
348 const struct bch_alloc_v4
*a
= bch2_alloc_to_v4(k
, &_a
);
349 struct bch_dev
*ca
= c
? bch2_dev_bucket_tryget_noerror(c
, k
.k
->p
) : NULL
;
352 printbuf_indent_add(out
, 2);
354 prt_printf(out
, "gen %u oldest_gen %u data_type ", a
->gen
, a
->oldest_gen
);
355 bch2_prt_data_type(out
, a
->data_type
);
357 prt_printf(out
, "journal_seq %llu\n", a
->journal_seq
);
358 prt_printf(out
, "need_discard %llu\n", BCH_ALLOC_V4_NEED_DISCARD(a
));
359 prt_printf(out
, "need_inc_gen %llu\n", BCH_ALLOC_V4_NEED_INC_GEN(a
));
360 prt_printf(out
, "dirty_sectors %u\n", a
->dirty_sectors
);
361 prt_printf(out
, "stripe_sectors %u\n", a
->stripe_sectors
);
362 prt_printf(out
, "cached_sectors %u\n", a
->cached_sectors
);
363 prt_printf(out
, "stripe %u\n", a
->stripe
);
364 prt_printf(out
, "stripe_redundancy %u\n", a
->stripe_redundancy
);
365 prt_printf(out
, "io_time[READ] %llu\n", a
->io_time
[READ
]);
366 prt_printf(out
, "io_time[WRITE] %llu\n", a
->io_time
[WRITE
]);
369 prt_printf(out
, "fragmentation %llu\n", alloc_lru_idx_fragmentation(*a
, ca
));
370 prt_printf(out
, "bp_start %llu\n", BCH_ALLOC_V4_BACKPOINTERS_START(a
));
371 printbuf_indent_sub(out
, 2);
/*
 * Convert an alloc key of any version into v4 layout in *out.
 *
 * v4 input: the value is copied, BACKPOINTERS_START is moved to the current
 * BCH_ALLOC_V4_U64s, the gap between the old and new backpointer start is
 * zeroed, and the backpointer count is cleared.
 *
 * Older input: the key is unpacked with bch2_alloc_unpack() and a fresh
 * struct bch_alloc_v4 is built from the unpacked members; need_discard is
 * stored directly into .flags. BACKPOINTERS_START is again set to
 * BCH_ALLOC_V4_U64s.
 */
376 void __bch2_alloc_to_v4(struct bkey_s_c k
, struct bch_alloc_v4
*out
)
378 if (k
.k
->type
== KEY_TYPE_alloc_v4
) {
381 *out
= *bkey_s_c_to_alloc_v4(k
).v
;
/* src: backpointer start as stored; dst: start for the current layout. */
383 src
= alloc_v4_backpointers(out
);
384 SET_BCH_ALLOC_V4_BACKPOINTERS_START(out
, BCH_ALLOC_V4_U64s
);
385 dst
= alloc_v4_backpointers(out
);
388 memset(src
, 0, dst
- src
);
390 SET_BCH_ALLOC_V4_NR_BACKPOINTERS(out
, 0);
392 struct bkey_alloc_unpacked u
= bch2_alloc_unpack(k
);
394 *out
= (struct bch_alloc_v4
) {
395 .journal_seq
= u
.journal_seq
,
396 .flags
= u
.need_discard
,
398 .oldest_gen
= u
.oldest_gen
,
399 .data_type
= u
.data_type
,
400 .stripe_redundancy
= u
.stripe_redundancy
,
401 .dirty_sectors
= u
.dirty_sectors
,
402 .cached_sectors
= u
.cached_sectors
,
403 .io_time
[READ
] = u
.read_time
,
404 .io_time
[WRITE
] = u
.write_time
,
408 SET_BCH_ALLOC_V4_BACKPOINTERS_START(out
, BCH_ALLOC_V4_U64s
);
/*
 * Slow path of bch2_alloc_to_v4_mut_inlined(): build a mutable v4 copy of @k
 * in transaction memory.
 *
 * The allocation is sized to hold whichever is larger, the existing key or a
 * full struct bkey_i_alloc_v4. For v4 input the key is reassembled and then
 * normalised the same way as in __bch2_alloc_to_v4() (backpointer start moved
 * to BCH_ALLOC_V4_U64s, gap zeroed, backpointer count cleared) and the key's
 * u64s is recomputed. For older versions a fresh v4 key is initialised and
 * filled via bch2_alloc_to_v4().
 *
 * NOTE(review): the error check on the allocation and the return statement are
 * not visible in this view.
 */
412 static noinline
struct bkey_i_alloc_v4
*
413 __bch2_alloc_to_v4_mut(struct btree_trans
*trans
, struct bkey_s_c k
)
415 struct bkey_i_alloc_v4
*ret
;
417 ret
= bch2_trans_kmalloc(trans
, max(bkey_bytes(k
.k
), sizeof(struct bkey_i_alloc_v4
)));
421 if (k
.k
->type
== KEY_TYPE_alloc_v4
) {
424 bkey_reassemble(&ret
->k_i
, k
);
426 src
= alloc_v4_backpointers(&ret
->v
);
427 SET_BCH_ALLOC_V4_BACKPOINTERS_START(&ret
->v
, BCH_ALLOC_V4_U64s
);
428 dst
= alloc_v4_backpointers(&ret
->v
);
431 memset(src
, 0, dst
- src
);
433 SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&ret
->v
, 0);
434 set_alloc_v4_u64s(ret
);
436 bkey_alloc_v4_init(&ret
->k_i
);
438 bch2_alloc_to_v4(k
, &ret
->v
);
/*
 * Return a mutable v4 copy of alloc key @k.
 *
 * Fast path: when @k is already a v4 key with no inline backpointers it is
 * duplicated as-is with bch2_bkey_make_mut_noupdate_typed(). Anything else
 * goes through the out-of-line conversion in __bch2_alloc_to_v4_mut().
 */
443 static inline struct bkey_i_alloc_v4
*bch2_alloc_to_v4_mut_inlined(struct btree_trans
*trans
, struct bkey_s_c k
)
445 struct bkey_s_c_alloc_v4 a
;
/* The comma expression assigns a only after the type check has passed. */
447 if (likely(k
.k
->type
== KEY_TYPE_alloc_v4
) &&
448 ((a
= bkey_s_c_to_alloc_v4(k
), true) &&
449 BCH_ALLOC_V4_NR_BACKPOINTERS(a
.v
) == 0))
450 return bch2_bkey_make_mut_noupdate_typed(trans
, k
, alloc_v4
);
452 return __bch2_alloc_to_v4_mut(trans
, k
);
/* Out-of-line wrapper around bch2_alloc_to_v4_mut_inlined(). */
455 struct bkey_i_alloc_v4
*bch2_alloc_to_v4_mut(struct btree_trans
*trans
, struct bkey_s_c k
)
457 return bch2_alloc_to_v4_mut_inlined(trans
, k
);
/*
 * Look up the alloc key at @pos and return a mutable v4 copy of it, without
 * queueing an update.
 *
 * @iter is initialised here on BTREE_ID_alloc (with BTREE_ITER_with_updates
 * among its flags, so pending updates in this transaction are seen).
 * bch2_trans_iter_exit() is called on @iter in this function.
 * NOTE(review): presumably that exit is the error path and on success the
 * caller owns the iterator (bch2_trans_start_alloc_update() exits it itself);
 * the branch structure is not visible here - confirm.
 */
460 struct bkey_i_alloc_v4
*
461 bch2_trans_start_alloc_update_noupdate(struct btree_trans
*trans
, struct btree_iter
*iter
,
464 struct bkey_s_c k
= bch2_bkey_get_iter(trans
, iter
, BTREE_ID_alloc
, pos
,
465 BTREE_ITER_with_updates
|
468 int ret
= bkey_err(k
);
472 struct bkey_i_alloc_v4
*a
= bch2_alloc_to_v4_mut_inlined(trans
, k
);
473 ret
= PTR_ERR_OR_ZERO(a
);
478 bch2_trans_iter_exit(trans
, iter
);
/*
 * Start an update of the alloc key at @pos.
 *
 * Fetches a mutable v4 copy via bch2_trans_start_alloc_update_noupdate(),
 * queues it with bch2_trans_update() using @flags, and releases the iterator.
 *
 * Returns the mutable key, or ERR_PTR(ret) if queueing the update failed.
 * NOTE(review): the early return for a failed lookup is not visible here.
 */
483 struct bkey_i_alloc_v4
*bch2_trans_start_alloc_update(struct btree_trans
*trans
, struct bpos pos
,
484 enum btree_iter_update_trigger_flags flags
)
486 struct btree_iter iter
;
487 struct bkey_i_alloc_v4
*a
= bch2_trans_start_alloc_update_noupdate(trans
, &iter
, pos
);
488 int ret
= PTR_ERR_OR_ZERO(a
);
492 ret
= bch2_trans_update(trans
, &iter
, &a
->k_i
, flags
);
493 bch2_trans_iter_exit(trans
, &iter
);
494 return unlikely(ret
) ? ERR_PTR(ret
) : a
;
/*
 * Map an alloc btree position to the bucket_gens key that covers it.
 *
 * The low bits of pos.offset (KEY_TYPE_BUCKET_GENS_MASK) are stored in
 * *offset as the index inside that key, and pos.offset is shifted right by
 * KEY_TYPE_BUCKET_GENS_BITS.
 * NOTE(review): the return statement is not visible in this view; presumably
 * the shifted pos is returned.
 */
497 static struct bpos
alloc_gens_pos(struct bpos pos
, unsigned *offset
)
499 *offset
= pos
.offset
& KEY_TYPE_BUCKET_GENS_MASK
;
501 pos
.offset
>>= KEY_TYPE_BUCKET_GENS_BITS
;
/*
 * Inverse of alloc_gens_pos(): shift pos.offset left by
 * KEY_TYPE_BUCKET_GENS_BITS and add the in-key index @offset.
 * NOTE(review): the return statement is not visible in this view; presumably
 * the adjusted pos is returned.
 */
505 static struct bpos
bucket_gens_pos_to_alloc(struct bpos pos
, unsigned offset
)
507 pos
.offset
<<= KEY_TYPE_BUCKET_GENS_BITS
;
508 pos
.offset
+= offset
;
/*
 * Read the generation at index @offset from a bucket_gens key.
 * A key of any other type takes the alternative branch of the conditional.
 * NOTE(review): that branch's value is not visible here; presumably 0.
 */
512 static unsigned alloc_gen(struct bkey_s_c k
, unsigned offset
)
514 return k
.k
->type
== KEY_TYPE_bucket_gens
515 ? bkey_s_c_to_bucket_gens(k
).v
->gens
[offset
]
/*
 * Validate a bucket_gens key: the value must be exactly
 * sizeof(struct bch_bucket_gens) bytes, otherwise the
 * bucket_gens_val_size_bad fsck error is raised.
 */
519 int bch2_bucket_gens_validate(struct bch_fs
*c
, struct bkey_s_c k
,
520 enum bch_validate_flags flags
)
524 bkey_fsck_err_on(bkey_val_bytes(k
.k
) != sizeof(struct bch_bucket_gens
),
525 c
, bucket_gens_val_size_bad
,
526 "bad val size (%zu != %zu)",
527 bkey_val_bytes(k
.k
), sizeof(struct bch_bucket_gens
));
/*
 * Print every generation number stored in a bucket_gens key to @out, in
 * array order, each formatted with "%u".
 */
532 void bch2_bucket_gens_to_text(struct printbuf
*out
, struct bch_fs
*c
, struct bkey_s_c k
)
534 struct bkey_s_c_bucket_gens g
= bkey_s_c_to_bucket_gens(k
);
537 for (i
= 0; i
< ARRAY_SIZE(g
.v
->gens
); i
++) {
540 prt_printf(out
, "%u", g
.v
->gens
[i
]);
/*
 * Build the bucket_gens btree from the alloc btree.
 *
 * Walks every key in BTREE_ID_alloc, takes its generation (after conversion to
 * v4) and accumulates it into an on-stack bucket_gens key g at the index given
 * by alloc_gens_pos(). Whenever the walk moves on to a different bucket_gens
 * position, the accumulated key is inserted and committed and a new one is
 * started. The last partially filled key is inserted after the loop, if the
 * walk itself succeeded.
 *
 * Alloc keys for buckets that do not exist are tested for with
 * bch2_dev_bucket_exists().
 * NOTE(review): presumably such keys are skipped; the statement under that
 * test is not visible here - confirm.
 */
544 int bch2_bucket_gens_init(struct bch_fs
*c
)
546 struct btree_trans
*trans
= bch2_trans_get(c
);
547 struct bkey_i_bucket_gens g
;
548 bool have_bucket_gens_key
= false;
551 ret
= for_each_btree_key(trans
, iter
, BTREE_ID_alloc
, POS_MIN
,
552 BTREE_ITER_prefetch
, k
, ({
554 * Not a fsck error because this is checked/repaired by
555 * bch2_check_alloc_key() which runs later:
557 if (!bch2_dev_bucket_exists(c
, k
.k
->p
))
560 struct bch_alloc_v4 a
;
561 u8 gen
= bch2_alloc_to_v4(k
, &a
)->gen
;
563 struct bpos pos
= alloc_gens_pos(iter
.pos
, &offset
);
566 if (have_bucket_gens_key
&& !bkey_eq(g
.k
.p
, pos
)) {
567 ret2
= bch2_btree_insert_trans(trans
, BTREE_ID_bucket_gens
, &g
.k_i
, 0) ?:
568 bch2_trans_commit(trans
, NULL
, NULL
, BCH_TRANS_COMMIT_no_enospc
);
571 have_bucket_gens_key
= false;
574 if (!have_bucket_gens_key
) {
575 bkey_bucket_gens_init(&g
.k_i
);
577 have_bucket_gens_key
= true;
580 g
.v
.gens
[offset
] = gen
;
/* Flush the final accumulated key. */
585 if (have_bucket_gens_key
&& !ret
)
586 ret
= commit_do(trans
, NULL
, NULL
,
587 BCH_TRANS_COMMIT_no_enospc
,
588 bch2_btree_insert_trans(trans
, BTREE_ID_bucket_gens
, &g
.k_i
, 0));
590 bch2_trans_put(trans
);
/*
 * Load bucket generation numbers from the btree into each device's in-memory
 * generation array (accessed through bucket_gen()).
 *
 * If the superblock says the upgrade to the bucket_gens metadata version is
 * complete, the bucket_gens btree is walked and each key's gens[] is copied
 * for the buckets it covers, clamped to [first_bucket, nbuckets) of the
 * device. The second walk reads the alloc btree key by key.
 * NOTE(review): presumably the second walk is the else branch of the version
 * check; the branch keyword is not visible in this view.
 *
 * In both walks the device is looked up with bch2_dev_iterate() and the
 * iterator is repositioned to skip ranges that cannot belong to a valid
 * bucket (next device, or the device's first_bucket).
 */
596 int bch2_alloc_read(struct bch_fs
*c
)
598 struct btree_trans
*trans
= bch2_trans_get(c
);
599 struct bch_dev
*ca
= NULL
;
602 if (c
->sb
.version_upgrade_complete
>= bcachefs_metadata_version_bucket_gens
) {
603 ret
= for_each_btree_key(trans
, iter
, BTREE_ID_bucket_gens
, POS_MIN
,
604 BTREE_ITER_prefetch
, k
, ({
/* [start, end) is the alloc-offset range covered by this bucket_gens key. */
605 u64 start
= bucket_gens_pos_to_alloc(k
.k
->p
, 0).offset
;
606 u64 end
= bucket_gens_pos_to_alloc(bpos_nosnap_successor(k
.k
->p
), 0).offset
;
608 if (k
.k
->type
!= KEY_TYPE_bucket_gens
)
611 ca
= bch2_dev_iterate(c
, ca
, k
.k
->p
.inode
);
613 * Not a fsck error because this is checked/repaired by
614 * bch2_check_alloc_key() which runs later:
617 bch2_btree_iter_set_pos(&iter
, POS(k
.k
->p
.inode
+ 1, 0));
621 const struct bch_bucket_gens
*g
= bkey_s_c_to_bucket_gens(k
).v
;
623 for (u64 b
= max_t(u64
, ca
->mi
.first_bucket
, start
);
624 b
< min_t(u64
, ca
->mi
.nbuckets
, end
);
626 *bucket_gen(ca
, b
) = g
->gens
[b
& KEY_TYPE_BUCKET_GENS_MASK
];
630 ret
= for_each_btree_key(trans
, iter
, BTREE_ID_alloc
, POS_MIN
,
631 BTREE_ITER_prefetch
, k
, ({
632 ca
= bch2_dev_iterate(c
, ca
, k
.k
->p
.inode
);
634 * Not a fsck error because this is checked/repaired by
635 * bch2_check_alloc_key() which runs later:
638 bch2_btree_iter_set_pos(&iter
, POS(k
.k
->p
.inode
+ 1, 0));
/* Before the device's first bucket: jump forward to it. */
642 if (k
.k
->p
.offset
< ca
->mi
.first_bucket
) {
643 bch2_btree_iter_set_pos(&iter
, POS(k
.k
->p
.inode
, ca
->mi
.first_bucket
));
/* Past the device's last bucket: jump to the next device. */
647 if (k
.k
->p
.offset
>= ca
->mi
.nbuckets
) {
648 bch2_btree_iter_set_pos(&iter
, POS(k
.k
->p
.inode
+ 1, 0));
652 struct bch_alloc_v4 a
;
653 *bucket_gen(ca
, k
.k
->p
.offset
) = bch2_alloc_to_v4(k
, &a
)->gen
;
659 bch2_trans_put(trans
);
665 /* Free space/discard btree: */
/*
 * Set or clear the index entry for a bucket in the freespace or need_discard
 * btree, according to the bucket's data_type in @a.
 *
 * Only BCH_DATA_free and BCH_DATA_need_discard buckets have an index entry.
 * The entry is a valueless key of type KEY_TYPE_set (when setting) or
 * KEY_TYPE_deleted (when clearing):
 *  - need_discard entries sit at the alloc key's own position;
 *  - freespace entries sit at alloc_freespace_pos() and are resized to 1.
 *
 * Before updating, the existing key is read; once the device's freespace is
 * initialised and recovery is past check_alloc_info, finding a key of the
 * wrong type there is reported as a transaction inconsistency.
 *
 * NOTE(review): some parameters (the device and the set flag, judging by the
 * uses of ca and set below) are declared on lines not visible in this view.
 */
667 static int bch2_bucket_do_index(struct btree_trans
*trans
,
669 struct bkey_s_c alloc_k
,
670 const struct bch_alloc_v4
*a
,
673 struct bch_fs
*c
= trans
->c
;
674 struct btree_iter iter
;
/* old_type: what should be there now; new_type: what is written. */
678 enum bch_bkey_type old_type
= !set
? KEY_TYPE_set
: KEY_TYPE_deleted
;
679 enum bch_bkey_type new_type
= set
? KEY_TYPE_set
: KEY_TYPE_deleted
;
680 struct printbuf buf
= PRINTBUF
;
683 if (a
->data_type
!= BCH_DATA_free
&&
684 a
->data_type
!= BCH_DATA_need_discard
)
687 k
= bch2_trans_kmalloc_nomemzero(trans
, sizeof(*k
));
692 k
->k
.type
= new_type
;
694 switch (a
->data_type
) {
696 btree
= BTREE_ID_freespace
;
697 k
->k
.p
= alloc_freespace_pos(alloc_k
.k
->p
, *a
);
698 bch2_key_resize(&k
->k
, 1);
700 case BCH_DATA_need_discard
:
701 btree
= BTREE_ID_need_discard
;
702 k
->k
.p
= alloc_k
.k
->p
;
708 old
= bch2_bkey_get_iter(trans
, &iter
, btree
,
709 bkey_start_pos(&k
->k
),
715 if (ca
->mi
.freespace_initialized
&&
716 c
->curr_recovery_pass
> BCH_RECOVERY_PASS_check_alloc_info
&&
717 bch2_trans_inconsistent_on(old
.k
->type
!= old_type
, trans
,
718 "incorrect key when %s %s:%llu:%llu:0 (got %s should be %s)\n"
720 set
? "setting" : "clearing",
721 bch2_btree_id_str(btree
),
724 bch2_bkey_types
[old
.k
->type
],
725 bch2_bkey_types
[old_type
],
726 (bch2_bkey_val_to_text(&buf
, c
, alloc_k
), buf
.buf
))) {
731 ret
= bch2_trans_update(trans
, &iter
, k
, 0);
733 bch2_trans_iter_exit(trans
, &iter
);
/*
 * Record a new generation number for @bucket in the bucket_gens btree.
 *
 * The bucket_gens key covering the bucket is located with alloc_gens_pos().
 * A copy is made in transaction memory: a fresh key if none of the right type
 * exists there yet, otherwise a copy of the existing key. The slot for
 * this bucket is then set to @gen and the update is queued.
 */
738 static noinline
int bch2_bucket_gen_update(struct btree_trans
*trans
,
739 struct bpos bucket
, u8 gen
)
741 struct btree_iter iter
;
743 struct bpos pos
= alloc_gens_pos(bucket
, &offset
);
744 struct bkey_i_bucket_gens
*g
;
748 g
= bch2_trans_kmalloc(trans
, sizeof(*g
));
749 ret
= PTR_ERR_OR_ZERO(g
);
753 k
= bch2_bkey_get_iter(trans
, &iter
, BTREE_ID_bucket_gens
, pos
,
755 BTREE_ITER_with_updates
);
760 if (k
.k
->type
!= KEY_TYPE_bucket_gens
) {
761 bkey_bucket_gens_init(&g
->k_i
);
764 bkey_reassemble(&g
->k_i
, k
);
767 g
->v
.gens
[offset
] = gen
;
769 ret
= bch2_trans_update(trans
, &iter
, &g
->k_i
, 0);
770 bch2_trans_iter_exit(trans
, &iter
);
/*
 * Apply a delta to the per-device, per-data-type disk accounting counters.
 *
 * Builds a BCH_DISK_ACCOUNTING_dev_data_type accounting key for (@ca,
 * @data_type) and adds the three deltas {buckets, sectors, fragmented} to it.
 * Only the BTREE_TRIGGER_gc bit of @flags is passed on to
 * bch2_disk_accounting_mod().
 */
774 static inline int bch2_dev_data_type_accounting_mod(struct btree_trans
*trans
, struct bch_dev
*ca
,
775 enum bch_data_type data_type
,
778 s64 delta_fragmented
, unsigned flags
)
780 struct disk_accounting_pos acc
= {
781 .type
= BCH_DISK_ACCOUNTING_dev_data_type
,
782 .dev_data_type
.dev
= ca
->dev_idx
,
783 .dev_data_type
.data_type
= data_type
,
785 s64 d
[3] = { delta_buckets
, delta_sectors
, delta_fragmented
};
787 return bch2_disk_accounting_mod(trans
, &acc
, d
, 3, flags
& BTREE_TRIGGER_gc
);
/*
 * Translate the change from alloc state @old to @new into device accounting
 * deltas.
 *
 *  - data_type changed: one bucket with its sectors/fragmentation is added to
 *    the new type and removed from the old type;
 *  - same data_type but sector count changed: only the sector and
 *    fragmentation differences are applied to that type;
 *  - unstriped sector count changed: BCH_DATA_unstriped is adjusted, with the
 *    bucket delta being the change in "has any unstriped sectors".
 */
790 int bch2_alloc_key_to_dev_counters(struct btree_trans
*trans
, struct bch_dev
*ca
,
791 const struct bch_alloc_v4
*old
,
792 const struct bch_alloc_v4
*new,
795 s64 old_sectors
= bch2_bucket_sectors(*old
);
796 s64 new_sectors
= bch2_bucket_sectors(*new);
797 if (old
->data_type
!= new->data_type
) {
798 int ret
= bch2_dev_data_type_accounting_mod(trans
, ca
, new->data_type
,
799 1, new_sectors
, bch2_bucket_sectors_fragmented(ca
, *new), flags
) ?:
800 bch2_dev_data_type_accounting_mod(trans
, ca
, old
->data_type
,
801 -1, -old_sectors
, -bch2_bucket_sectors_fragmented(ca
, *old
), flags
);
804 } else if (old_sectors
!= new_sectors
) {
805 int ret
= bch2_dev_data_type_accounting_mod(trans
, ca
, new->data_type
,
807 new_sectors
- old_sectors
,
808 bch2_bucket_sectors_fragmented(ca
, *new) -
809 bch2_bucket_sectors_fragmented(ca
, *old
), flags
);
814 s64 old_unstriped
= bch2_bucket_sectors_unstriped(*old
);
815 s64 new_unstriped
= bch2_bucket_sectors_unstriped(*new);
816 if (old_unstriped
!= new_unstriped
) {
817 int ret
= bch2_dev_data_type_accounting_mod(trans
, ca
, BCH_DATA_unstriped
,
818 !!new_unstriped
- !!old_unstriped
,
819 new_unstriped
- old_unstriped
,
/*
 * Btree trigger for alloc keys; runs when an alloc key changes from @old to
 * @new. What it does depends on the trigger phase encoded in @flags:
 *
 *  - BTREE_TRIGGER_transactional: normalise the new key (data_type, io_time,
 *    NEED_INC_GEN / NEED_DISCARD), then queue the dependent btree updates:
 *    freespace / need_discard index, read and fragmentation LRU entries,
 *    bucket_gens, cached-sector accounting and device counters.
 *  - BTREE_TRIGGER_atomic with BTREE_TRIGGER_insert: settle journal_seq,
 *    register the bucket as waiting for a journal flush if needed, update the
 *    in-memory generation, and kick waiters / background work on state
 *    transitions.
 *  - BTREE_TRIGGER_gc with BTREE_TRIGGER_insert: mirror the key into the gc
 *    bucket array.
 *
 * A key that refers to a bucket that cannot be resolved is reported through
 * bch2_fs_inconsistent() ("reference to invalid bucket").
 *
 * NOTE(review): error checks, labels and several statements are on lines not
 * visible in this view; comments below describe only what can be seen.
 */
829 int bch2_trigger_alloc(struct btree_trans
*trans
,
830 enum btree_id btree
, unsigned level
,
831 struct bkey_s_c old
, struct bkey_s
new,
832 enum btree_iter_update_trigger_flags flags
)
834 struct bch_fs
*c
= trans
->c
;
835 struct printbuf buf
= PRINTBUF
;
838 struct bch_dev
*ca
= bch2_dev_bucket_tryget(c
, new.k
->p
);
842 struct bch_alloc_v4 old_a_convert
;
843 const struct bch_alloc_v4
*old_a
= bch2_alloc_to_v4(old
, &old_a_convert
);
/*
 * new_a must be writable: a v4 key is used in place, anything older is
 * converted to a mutable v4 copy (asserted to happen only for gc / check_repair
 * triggers).
 */
845 struct bch_alloc_v4
*new_a
;
846 if (likely(new.k
->type
== KEY_TYPE_alloc_v4
)) {
847 new_a
= bkey_s_to_alloc_v4(new).v
;
849 BUG_ON(!(flags
& (BTREE_TRIGGER_gc
|BTREE_TRIGGER_check_repair
)));
851 struct bkey_i_alloc_v4
*new_ka
= bch2_alloc_to_v4_mut_inlined(trans
, new.s_c
);
852 ret
= PTR_ERR_OR_ZERO(new_ka
);
/* Transactional phase. */
858 if (flags
& BTREE_TRIGGER_transactional
) {
859 alloc_data_type_set(new_a
, new_a
->data_type
);
/* Bucket gained sectors: refresh both io_times and flag it for gen bump + discard. */
861 if (bch2_bucket_sectors_total(*new_a
) > bch2_bucket_sectors_total(*old_a
)) {
862 new_a
->io_time
[READ
] = bch2_current_io_time(c
, READ
);
863 new_a
->io_time
[WRITE
]= bch2_current_io_time(c
, WRITE
);
864 SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a
, true);
865 SET_BCH_ALLOC_V4_NEED_DISCARD(new_a
, true);
/* Bucket is empty, flagged NEED_INC_GEN and not open: the flag can be consumed. */
868 if (data_type_is_empty(new_a
->data_type
) &&
869 BCH_ALLOC_V4_NEED_INC_GEN(new_a
) &&
870 !bch2_bucket_is_open_safe(c
, new.k
->p
.inode
, new.k
->p
.offset
)) {
872 SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a
, false);
873 alloc_data_type_set(new_a
, new_a
->data_type
);
/*
 * Index entry must move when the data_type changes, or when a free bucket's
 * freespace genbits change: clear the old entry, set the new one.
 */
876 if (old_a
->data_type
!= new_a
->data_type
||
877 (new_a
->data_type
== BCH_DATA_free
&&
878 alloc_freespace_genbits(*old_a
) != alloc_freespace_genbits(*new_a
))) {
879 ret
= bch2_bucket_do_index(trans
, ca
, old
, old_a
, false) ?:
880 bch2_bucket_do_index(trans
, ca
, new.s_c
, new_a
, true);
/* Cached buckets always get a non-zero read time. */
885 if (new_a
->data_type
== BCH_DATA_cached
&&
886 !new_a
->io_time
[READ
])
887 new_a
->io_time
[READ
] = bch2_current_io_time(c
, READ
);
889 u64 old_lru
= alloc_lru_idx_read(*old_a
);
890 u64 new_lru
= alloc_lru_idx_read(*new_a
);
891 if (old_lru
!= new_lru
) {
892 ret
= bch2_lru_change(trans
, new.k
->p
.inode
,
893 bucket_to_u64(new.k
->p
),
899 old_lru
= alloc_lru_idx_fragmentation(*old_a
, ca
);
900 new_lru
= alloc_lru_idx_fragmentation(*new_a
, ca
);
901 if (old_lru
!= new_lru
) {
902 ret
= bch2_lru_change(trans
,
903 BCH_LRU_FRAGMENTATION_START
,
904 bucket_to_u64(new.k
->p
),
910 if (old_a
->gen
!= new_a
->gen
) {
911 ret
= bch2_bucket_gen_update(trans
, new.k
->p
, new_a
->gen
);
/* Invalidating a bucket drops its cached sectors from the device totals. */
916 if ((flags
& BTREE_TRIGGER_bucket_invalidate
) &&
917 old_a
->cached_sectors
) {
918 ret
= bch2_mod_dev_cached_sectors(trans
, ca
->dev_idx
,
919 -((s64
) old_a
->cached_sectors
),
920 flags
& BTREE_TRIGGER_gc
);
925 ret
= bch2_alloc_key_to_dev_counters(trans
, ca
, old_a
, new_a
, flags
);
/* Atomic insert phase. */
930 if ((flags
& BTREE_TRIGGER_atomic
) && (flags
& BTREE_TRIGGER_insert
)) {
931 u64 journal_seq
= trans
->journal_res
.seq
;
932 u64 bucket_journal_seq
= new_a
->journal_seq
;
/*
 * NOTE(review): BTREE_TRIGGER_insert is tested again here although the
 * enclosing condition above already includes it - looks redundant; confirm.
 */
934 if ((flags
& BTREE_TRIGGER_insert
) &&
935 data_type_is_empty(old_a
->data_type
) !=
936 data_type_is_empty(new_a
->data_type
) &&
937 new.k
->type
== KEY_TYPE_alloc_v4
) {
938 struct bch_alloc_v4
*v
= bkey_s_to_alloc_v4(new).v
;
941 * If the btree updates referring to a bucket weren't flushed
942 * before the bucket became empty again, then the we don't have
943 * to wait on a journal flush before we can reuse the bucket:
945 v
->journal_seq
= bucket_journal_seq
=
946 data_type_is_empty(new_a
->data_type
) &&
947 (journal_seq
== v
->journal_seq
||
948 bch2_journal_noflush_seq(&c
->journal
, v
->journal_seq
))
/* Bucket just became empty and still has a journal_seq: it must wait for a flush. */
952 if (!data_type_is_empty(old_a
->data_type
) &&
953 data_type_is_empty(new_a
->data_type
) &&
954 bucket_journal_seq
) {
955 ret
= bch2_set_bucket_needs_journal_commit(&c
->buckets_waiting_for_journal
,
956 c
->journal
.flushed_seq_ondisk
,
957 new.k
->p
.inode
, new.k
->p
.offset
,
959 if (bch2_fs_fatal_err_on(ret
, c
,
960 "setting bucket_needs_journal_commit: %s", bch2_err_str(ret
)))
964 if (new_a
->gen
!= old_a
->gen
) {
966 u8
*gen
= bucket_gen(ca
, new.k
->p
.offset
);
967 if (unlikely(!gen
)) {
/*
 * statechange(expr): expr is false for the old key and true for the new one.
 * bucket_flushed(a): a has no journal_seq, or it is already flushed on disk.
 */
975 #define eval_state(_a, expr) ({ const struct bch_alloc_v4 *a = _a; expr; })
976 #define statechange(expr) !eval_state(old_a, expr) && eval_state(new_a, expr)
977 #define bucket_flushed(a) (!a->journal_seq || a->journal_seq <= c->journal.flushed_seq_ondisk)
979 if (statechange(a
->data_type
== BCH_DATA_free
) &&
980 bucket_flushed(new_a
))
981 closure_wake_up(&c
->freelist_wait
);
983 if (statechange(a
->data_type
== BCH_DATA_need_discard
) &&
984 !bch2_bucket_is_open_safe(c
, new.k
->p
.inode
, new.k
->p
.offset
) &&
985 bucket_flushed(new_a
))
986 bch2_discard_one_bucket_fast(ca
, new.k
->p
.offset
);
988 if (statechange(a
->data_type
== BCH_DATA_cached
) &&
989 !bch2_bucket_is_open(c
, new.k
->p
.inode
, new.k
->p
.offset
) &&
990 should_invalidate_buckets(ca
, bch2_dev_usage_read(ca
)))
991 bch2_dev_do_invalidates(ca
);
993 if (statechange(a
->data_type
== BCH_DATA_need_gc_gens
))
994 bch2_gc_gens_async(c
);
/* GC insert phase: update the gc copy of the bucket. */
997 if ((flags
& BTREE_TRIGGER_gc
) && (flags
& BTREE_TRIGGER_insert
)) {
999 struct bucket
*g
= gc_bucket(ca
, new.k
->p
.offset
);
1002 goto invalid_bucket
;
1005 g
->gen
= new_a
->gen
;
1009 printbuf_exit(&buf
);
1013 bch2_fs_inconsistent(c
, "reference to invalid bucket\n %s",
1014 (bch2_bkey_val_to_text(&buf
, c
, new.s_c
), buf
.buf
));
1020 * This synthesizes deleted extents for holes, similar to BTREE_ITER_slots for
1021 * extents style btrees, but works on non-extents btrees:
/*
 * Return the key at the iterator's current position, or a synthesized hole.
 *
 * The slot at iter->pos is peeked first. For the hole case a copy of the
 * iterator is used to find the next real key up to @end, where @end is first
 * clamped to the successor of the current leaf node's max key (unless that is
 * SPOS_MAX) and to iter->pos.offset + U32_MAX - 1. The hole key is written
 * to @hole: it starts at iter->pos and is sized to reach the next key's
 * offset.
 * NOTE(review): presumably the U32_MAX clamp keeps the hole size within
 * 32 bits, as the BUG_ON below suggests; the branch that selects the hole
 * case is not visible here - confirm.
 *
 * The returned bkey_s_c for a hole has a NULL value pointer.
 */
1023 static struct bkey_s_c
bch2_get_key_or_hole(struct btree_iter
*iter
, struct bpos end
, struct bkey
*hole
)
1025 struct bkey_s_c k
= bch2_btree_iter_peek_slot(iter
);
1033 struct btree_iter iter2
;
1036 bch2_trans_copy_iter(&iter2
, iter
);
1038 struct btree_path
*path
= btree_iter_path(iter
->trans
, iter
);
1039 if (!bpos_eq(path
->l
[0].b
->key
.k
.p
, SPOS_MAX
))
1040 end
= bkey_min(end
, bpos_nosnap_successor(path
->l
[0].b
->key
.k
.p
));
1042 end
= bkey_min(end
, POS(iter
->pos
.inode
, iter
->pos
.offset
+ U32_MAX
- 1));
1045 * btree node min/max is a closed interval, upto takes a half
1048 k
= bch2_btree_iter_peek_upto(&iter2
, end
);
1050 bch2_trans_iter_exit(iter
->trans
, &iter2
);
1052 BUG_ON(next
.offset
>= iter
->pos
.offset
+ U32_MAX
);
1058 hole
->p
= iter
->pos
;
1060 bch2_key_resize(hole
, next
.offset
- iter
->pos
.offset
);
1061 return (struct bkey_s_c
) { hole
, NULL
};
/*
 * Advance *bucket to the next valid bucket position.
 *
 * Within the current device, an offset below first_bucket is bumped up to
 * first_bucket, and any offset below nbuckets is accepted. Otherwise the next
 * device after bucket->inode is looked up and *bucket is set to that device's
 * first bucket; *ca is updated to match.
 *
 * NOTE(review): the return statements and the handling of a NULL *ca are on
 * lines not visible in this view.
 */
1065 static bool next_bucket(struct bch_fs
*c
, struct bch_dev
**ca
, struct bpos
*bucket
)
1068 if (bucket
->offset
< (*ca
)->mi
.first_bucket
)
1069 bucket
->offset
= (*ca
)->mi
.first_bucket
;
1071 if (bucket
->offset
< (*ca
)->mi
.nbuckets
)
1081 *ca
= __bch2_next_dev_idx(c
, bucket
->inode
, NULL
);
1083 *bucket
= POS((*ca
)->dev_idx
, (*ca
)->mi
.first_bucket
);
/*
 * Like bch2_get_key_or_hole(), but holes are restricted to positions that
 * correspond to real buckets.
 *
 * After fetching a key or hole, the device for its inode is looked up. If the
 * hole starts at a position that is not a valid bucket, the iterator is moved
 * to the next valid bucket (next_bucket()); bkey_s_c_null is returned when
 * there is none. A hole that extends past the device's nbuckets is trimmed to
 * end there.
 */
1091 static struct bkey_s_c
bch2_get_key_or_real_bucket_hole(struct btree_iter
*iter
,
1092 struct bch_dev
**ca
, struct bkey
*hole
)
1094 struct bch_fs
*c
= iter
->trans
->c
;
1097 k
= bch2_get_key_or_hole(iter
, POS_MAX
, hole
);
1101 *ca
= bch2_dev_iterate_noerror(c
, *ca
, k
.k
->p
.inode
);
1104 struct bpos hole_start
= bkey_start_pos(k
.k
);
1106 if (!*ca
|| !bucket_valid(*ca
, hole_start
.offset
)) {
1107 if (!next_bucket(c
, ca
, &hole_start
))
1108 return bkey_s_c_null
;
1110 bch2_btree_iter_set_pos(iter
, hole_start
);
1114 if (k
.k
->p
.offset
> (*ca
)->mi
.nbuckets
)
1115 bch2_key_resize(hole
, (*ca
)->mi
.nbuckets
- hole_start
.offset
);
/*
 * fsck: check one alloc key against the need_discard, freespace and
 * bucket_gens btrees, repairing those btrees where they disagree.
 *
 *  1. An alloc key for a device:bucket that does not exist is deleted.
 *  2. need_discard must hold a KEY_TYPE_set key at the bucket's position iff
 *     the bucket's data_type is BCH_DATA_need_discard.
 *  3. freespace must hold a KEY_TYPE_set key at alloc_freespace_pos() iff the
 *     data_type is BCH_DATA_free.
 *  4. The generation recorded in bucket_gens must equal the alloc key's gen.
 *
 * Each mismatch is reported via fsck_err_on(); when that returns true a
 * corrected key is allocated from the transaction and queued on the matching
 * iterator.
 * NOTE(review): the statement under the freespace_initialized test is not
 * visible; presumably the remaining checks are skipped for such devices.
 */
1121 static noinline_for_stack
1122 int bch2_check_alloc_key(struct btree_trans
*trans
,
1123 struct bkey_s_c alloc_k
,
1124 struct btree_iter
*alloc_iter
,
1125 struct btree_iter
*discard_iter
,
1126 struct btree_iter
*freespace_iter
,
1127 struct btree_iter
*bucket_gens_iter
)
1129 struct bch_fs
*c
= trans
->c
;
1130 struct bch_alloc_v4 a_convert
;
1131 const struct bch_alloc_v4
*a
;
1132 unsigned discard_key_type
, freespace_key_type
;
1133 unsigned gens_offset
;
1135 struct printbuf buf
= PRINTBUF
;
1138 struct bch_dev
*ca
= bch2_dev_bucket_tryget_noerror(c
, alloc_k
.k
->p
);
1139 if (fsck_err_on(!ca
,
1140 trans
, alloc_key_to_missing_dev_bucket
,
1141 "alloc key for invalid device:bucket %llu:%llu",
1142 alloc_k
.k
->p
.inode
, alloc_k
.k
->p
.offset
))
1143 ret
= bch2_btree_delete_at(trans
, alloc_iter
, 0);
1147 if (!ca
->mi
.freespace_initialized
)
1150 a
= bch2_alloc_to_v4(alloc_k
, &a_convert
);
/* need_discard btree: expected key type is KEY_TYPE_set or 0 (no key). */
1152 discard_key_type
= a
->data_type
== BCH_DATA_need_discard
? KEY_TYPE_set
: 0;
1153 bch2_btree_iter_set_pos(discard_iter
, alloc_k
.k
->p
);
1154 k
= bch2_btree_iter_peek_slot(discard_iter
);
1159 if (fsck_err_on(k
.k
->type
!= discard_key_type
,
1160 trans
, need_discard_key_wrong
,
1161 "incorrect key in need_discard btree (got %s should be %s)\n"
1163 bch2_bkey_types
[k
.k
->type
],
1164 bch2_bkey_types
[discard_key_type
],
1165 (bch2_bkey_val_to_text(&buf
, c
, alloc_k
), buf
.buf
))) {
1166 struct bkey_i
*update
=
1167 bch2_trans_kmalloc(trans
, sizeof(*update
));
1169 ret
= PTR_ERR_OR_ZERO(update
);
1173 bkey_init(&update
->k
);
1174 update
->k
.type
= discard_key_type
;
1175 update
->k
.p
= discard_iter
->pos
;
1177 ret
= bch2_trans_update(trans
, discard_iter
, update
, 0);
/* freespace btree: same scheme, keyed by alloc_freespace_pos(). */
1182 freespace_key_type
= a
->data_type
== BCH_DATA_free
? KEY_TYPE_set
: 0;
1183 bch2_btree_iter_set_pos(freespace_iter
, alloc_freespace_pos(alloc_k
.k
->p
, *a
));
1184 k
= bch2_btree_iter_peek_slot(freespace_iter
);
1189 if (fsck_err_on(k
.k
->type
!= freespace_key_type
,
1190 trans
, freespace_key_wrong
,
1191 "incorrect key in freespace btree (got %s should be %s)\n"
1193 bch2_bkey_types
[k
.k
->type
],
1194 bch2_bkey_types
[freespace_key_type
],
1195 (printbuf_reset(&buf
),
1196 bch2_bkey_val_to_text(&buf
, c
, alloc_k
), buf
.buf
))) {
1197 struct bkey_i
*update
=
1198 bch2_trans_kmalloc(trans
, sizeof(*update
));
1200 ret
= PTR_ERR_OR_ZERO(update
);
1204 bkey_init(&update
->k
);
1205 update
->k
.type
= freespace_key_type
;
1206 update
->k
.p
= freespace_iter
->pos
;
1207 bch2_key_resize(&update
->k
, 1);
1209 ret
= bch2_trans_update(trans
, freespace_iter
, update
, 0);
/* bucket_gens btree: the slot for this bucket must hold a->gen. */
1214 bch2_btree_iter_set_pos(bucket_gens_iter
, alloc_gens_pos(alloc_k
.k
->p
, &gens_offset
));
1215 k
= bch2_btree_iter_peek_slot(bucket_gens_iter
);
1220 if (fsck_err_on(a
->gen
!= alloc_gen(k
, gens_offset
),
1221 trans
, bucket_gens_key_wrong
,
1222 "incorrect gen in bucket_gens btree (got %u should be %u)\n"
1224 alloc_gen(k
, gens_offset
), a
->gen
,
1225 (printbuf_reset(&buf
),
1226 bch2_bkey_val_to_text(&buf
, c
, alloc_k
), buf
.buf
))) {
1227 struct bkey_i_bucket_gens
*g
=
1228 bch2_trans_kmalloc(trans
, sizeof(*g
));
1230 ret
= PTR_ERR_OR_ZERO(g
);
/* Start from the existing key if there is one, else from an empty key. */
1234 if (k
.k
->type
== KEY_TYPE_bucket_gens
) {
1235 bkey_reassemble(&g
->k_i
, k
);
1237 bkey_bucket_gens_init(&g
->k_i
);
1238 g
->k
.p
= alloc_gens_pos(alloc_k
.k
->p
, &gens_offset
);
1241 g
->v
.gens
[gens_offset
] = a
->gen
;
1243 ret
= bch2_trans_update(trans
, bucket_gens_iter
, &g
->k_i
, 0);
1251 printbuf_exit(&buf
);
/*
 * fsck: a hole in the alloc btree must be marked free in the freespace btree.
 *
 * The freespace slot at @start is inspected and *end is pulled in to the end
 * of that freespace key, so the caller handles the hole piecewise. If the
 * slot is not a KEY_TYPE_set key, a KEY_TYPE_set key is queued covering the
 * range from the iterator position to *end, capped at U32_MAX.
 *
 * NOTE(review): the ca, start and end parameters are declared on lines not
 * visible here; the statement under the freespace_initialized test is not
 * visible either, and presumably returns early.
 */
1255 static noinline_for_stack
1256 int bch2_check_alloc_hole_freespace(struct btree_trans
*trans
,
1260 struct btree_iter
*freespace_iter
)
1263 struct printbuf buf
= PRINTBUF
;
1266 if (!ca
->mi
.freespace_initialized
)
1269 bch2_btree_iter_set_pos(freespace_iter
, start
);
1271 k
= bch2_btree_iter_peek_slot(freespace_iter
);
1276 *end
= bkey_min(k
.k
->p
, *end
);
1278 if (fsck_err_on(k
.k
->type
!= KEY_TYPE_set
,
1279 trans
, freespace_hole_missing
,
1280 "hole in alloc btree missing in freespace btree\n"
1281 " device %llu buckets %llu-%llu",
1282 freespace_iter
->pos
.inode
,
1283 freespace_iter
->pos
.offset
,
1285 struct bkey_i
*update
=
1286 bch2_trans_kmalloc(trans
, sizeof(*update
));
1288 ret
= PTR_ERR_OR_ZERO(update
);
1292 bkey_init(&update
->k
);
1293 update
->k
.type
= KEY_TYPE_set
;
1294 update
->k
.p
= freespace_iter
->pos
;
1295 bch2_key_resize(&update
->k
,
1296 min_t(u64
, U32_MAX
, end
->offset
-
1297 freespace_iter
->pos
.offset
));
1299 ret
= bch2_trans_update(trans
, freespace_iter
, update
, 0);
1305 printbuf_exit(&buf
);
/*
 * fsck: buckets in an alloc-btree hole must have generation 0 in the
 * bucket_gens btree.
 *
 * Only the bucket_gens key that covers @start is examined per call. The
 * index range checked within it runs from the start's slot up to the end's
 * slot, or to the end of the key (KEY_TYPE_BUCKET_GENS_NR) when start and
 * *end fall into different bucket_gens keys. Each non-zero gen in that range
 * is reported and a corrected copy of the key is queued if needed.
 * NOTE(review): presumably the repair zeroes the gen; those lines are not
 * visible in this view.
 *
 * Finally *end is clamped to the first alloc position covered by the next
 * bucket_gens key, so the caller continues from there.
 */
1309 static noinline_for_stack
1310 int bch2_check_alloc_hole_bucket_gens(struct btree_trans
*trans
,
1313 struct btree_iter
*bucket_gens_iter
)
1316 struct printbuf buf
= PRINTBUF
;
1317 unsigned i
, gens_offset
, gens_end_offset
;
1320 bch2_btree_iter_set_pos(bucket_gens_iter
, alloc_gens_pos(start
, &gens_offset
));
1322 k
= bch2_btree_iter_peek_slot(bucket_gens_iter
);
1327 if (bkey_cmp(alloc_gens_pos(start
, &gens_offset
),
1328 alloc_gens_pos(*end
, &gens_end_offset
)))
1329 gens_end_offset
= KEY_TYPE_BUCKET_GENS_NR
;
1331 if (k
.k
->type
== KEY_TYPE_bucket_gens
) {
1332 struct bkey_i_bucket_gens g
;
1333 bool need_update
= false;
1335 bkey_reassemble(&g
.k_i
, k
);
1337 for (i
= gens_offset
; i
< gens_end_offset
; i
++) {
1338 if (fsck_err_on(g
.v
.gens
[i
], trans
,
1339 bucket_gens_hole_wrong
,
1340 "hole in alloc btree at %llu:%llu with nonzero gen in bucket_gens btree (%u)",
1341 bucket_gens_pos_to_alloc(k
.k
->p
, i
).inode
,
1342 bucket_gens_pos_to_alloc(k
.k
->p
, i
).offset
,
/* The on-stack key is copied into transaction memory before being queued. */
1350 struct bkey_i
*u
= bch2_trans_kmalloc(trans
, sizeof(g
));
1352 ret
= PTR_ERR_OR_ZERO(u
);
1356 memcpy(u
, &g
, sizeof(g
));
1358 ret
= bch2_trans_update(trans
, bucket_gens_iter
, u
, 0);
1364 *end
= bkey_min(*end
, bucket_gens_pos_to_alloc(bpos_nosnap_successor(k
.k
->p
), 0));
1367 printbuf_exit(&buf
);
/*
 * fsck: check one key of the need_discard or freespace btree against the
 * alloc btree, deleting it when it should not exist.
 *
 * The expected bucket state follows from which btree @iter is on
 * (need_discard -> BCH_DATA_need_discard). The position's offset is split:
 * the low 56 bits are the bucket offset, the top 8 bits are the genbits.
 *
 * The key is bad when
 *  - its device:bucket does not exist, or
 *  - the alloc key's data_type differs from the expected state, or
 *  - (freespace only) the genbits differ from alloc_freespace_genbits().
 *
 * Bad keys are removed with bch2_btree_delete_extent_at() and the transaction
 * is committed.
 * NOTE(review): the labels and gotos connecting the checks to the delete path
 * are not visible in this view.
 */
1371 static noinline_for_stack
int bch2_check_discard_freespace_key(struct btree_trans
*trans
,
1372 struct btree_iter
*iter
)
1374 struct bch_fs
*c
= trans
->c
;
1375 struct btree_iter alloc_iter
;
1376 struct bkey_s_c alloc_k
;
1377 struct bch_alloc_v4 a_convert
;
1378 const struct bch_alloc_v4
*a
;
1381 enum bch_data_type state
= iter
->btree_id
== BTREE_ID_need_discard
1382 ? BCH_DATA_need_discard
1384 struct printbuf buf
= PRINTBUF
;
1388 pos
.offset
&= ~(~0ULL << 56);
1389 genbits
= iter
->pos
.offset
& (~0ULL << 56);
1391 alloc_k
= bch2_bkey_get_iter(trans
, &alloc_iter
, BTREE_ID_alloc
, pos
, 0);
1392 ret
= bkey_err(alloc_k
);
1396 if (fsck_err_on(!bch2_dev_bucket_exists(c
, pos
),
1397 trans
, need_discard_freespace_key_to_invalid_dev_bucket
,
1398 "entry in %s btree for nonexistant dev:bucket %llu:%llu",
1399 bch2_btree_id_str(iter
->btree_id
), pos
.inode
, pos
.offset
))
1402 a
= bch2_alloc_to_v4(alloc_k
, &a_convert
);
1404 if (fsck_err_on(a
->data_type
!= state
||
1405 (state
== BCH_DATA_free
&&
1406 genbits
!= alloc_freespace_genbits(*a
)),
1407 trans
, need_discard_freespace_key_bad
,
1408 "%s\n incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)",
1409 (bch2_bkey_val_to_text(&buf
, c
, alloc_k
), buf
.buf
),
1410 bch2_btree_id_str(iter
->btree_id
),
1413 a
->data_type
== state
,
1414 genbits
>> 56, alloc_freespace_genbits(*a
) >> 56))
1418 bch2_set_btree_iter_dontneed(&alloc_iter
);
1419 bch2_trans_iter_exit(trans
, &alloc_iter
);
1420 printbuf_exit(&buf
);
/* freespace keys are extents of size 1, need_discard keys have size 0. */
1423 ret
= bch2_btree_delete_extent_at(trans
, iter
,
1424 iter
->btree_id
== BTREE_ID_freespace
? 1 : 0, 0) ?:
1425 bch2_trans_commit(trans
, NULL
, NULL
,
1426 BCH_TRANS_COMMIT_no_enospc
);
1431 * We've already checked that generation numbers in the bucket_gens btree are
1432 * valid for buckets that exist; this just checks for keys for nonexistent
1435 static noinline_for_stack
1436 int bch2_check_bucket_gens_key(struct btree_trans
*trans
,
1437 struct btree_iter
*iter
,
1440 struct bch_fs
*c
= trans
->c
;
1441 struct bkey_i_bucket_gens g
;
1442 u64 start
= bucket_gens_pos_to_alloc(k
.k
->p
, 0).offset
;
1443 u64 end
= bucket_gens_pos_to_alloc(bpos_nosnap_successor(k
.k
->p
), 0).offset
;
1445 bool need_update
= false;
1446 struct printbuf buf
= PRINTBUF
;
1449 BUG_ON(k
.k
->type
!= KEY_TYPE_bucket_gens
);
1450 bkey_reassemble(&g
.k_i
, k
);
1452 struct bch_dev
*ca
= bch2_dev_tryget_noerror(c
, k
.k
->p
.inode
);
1454 if (fsck_err(trans
, bucket_gens_to_invalid_dev
,
1455 "bucket_gens key for invalid device:\n %s",
1456 (bch2_bkey_val_to_text(&buf
, c
, k
), buf
.buf
)))
1457 ret
= bch2_btree_delete_at(trans
, iter
, 0);
1461 if (fsck_err_on(end
<= ca
->mi
.first_bucket
||
1462 start
>= ca
->mi
.nbuckets
,
1463 trans
, bucket_gens_to_invalid_buckets
,
1464 "bucket_gens key for invalid buckets:\n %s",
1465 (bch2_bkey_val_to_text(&buf
, c
, k
), buf
.buf
))) {
1466 ret
= bch2_btree_delete_at(trans
, iter
, 0);
1470 for (b
= start
; b
< ca
->mi
.first_bucket
; b
++)
1471 if (fsck_err_on(g
.v
.gens
[b
& KEY_TYPE_BUCKET_GENS_MASK
],
1472 trans
, bucket_gens_nonzero_for_invalid_buckets
,
1473 "bucket_gens key has nonzero gen for invalid bucket")) {
1474 g
.v
.gens
[b
& KEY_TYPE_BUCKET_GENS_MASK
] = 0;
1478 for (b
= ca
->mi
.nbuckets
; b
< end
; b
++)
1479 if (fsck_err_on(g
.v
.gens
[b
& KEY_TYPE_BUCKET_GENS_MASK
],
1480 trans
, bucket_gens_nonzero_for_invalid_buckets
,
1481 "bucket_gens key has nonzero gen for invalid bucket")) {
1482 g
.v
.gens
[b
& KEY_TYPE_BUCKET_GENS_MASK
] = 0;
1487 struct bkey_i
*u
= bch2_trans_kmalloc(trans
, sizeof(g
));
1489 ret
= PTR_ERR_OR_ZERO(u
);
1493 memcpy(u
, &g
, sizeof(g
));
1494 ret
= bch2_trans_update(trans
, iter
, u
, 0);
1499 printbuf_exit(&buf
);
1503 int bch2_check_alloc_info(struct bch_fs
*c
)
1505 struct btree_trans
*trans
= bch2_trans_get(c
);
1506 struct btree_iter iter
, discard_iter
, freespace_iter
, bucket_gens_iter
;
1507 struct bch_dev
*ca
= NULL
;
1512 bch2_trans_iter_init(trans
, &iter
, BTREE_ID_alloc
, POS_MIN
,
1513 BTREE_ITER_prefetch
);
1514 bch2_trans_iter_init(trans
, &discard_iter
, BTREE_ID_need_discard
, POS_MIN
,
1515 BTREE_ITER_prefetch
);
1516 bch2_trans_iter_init(trans
, &freespace_iter
, BTREE_ID_freespace
, POS_MIN
,
1517 BTREE_ITER_prefetch
);
1518 bch2_trans_iter_init(trans
, &bucket_gens_iter
, BTREE_ID_bucket_gens
, POS_MIN
,
1519 BTREE_ITER_prefetch
);
1524 bch2_trans_begin(trans
);
1526 k
= bch2_get_key_or_real_bucket_hole(&iter
, &ca
, &hole
);
1535 next
= bpos_nosnap_successor(k
.k
->p
);
1537 ret
= bch2_check_alloc_key(trans
,
1547 ret
= bch2_check_alloc_hole_freespace(trans
, ca
,
1548 bkey_start_pos(k
.k
),
1551 bch2_check_alloc_hole_bucket_gens(trans
,
1552 bkey_start_pos(k
.k
),
1559 ret
= bch2_trans_commit(trans
, NULL
, NULL
,
1560 BCH_TRANS_COMMIT_no_enospc
);
1564 bch2_btree_iter_set_pos(&iter
, next
);
1566 if (bch2_err_matches(ret
, BCH_ERR_transaction_restart
))
1571 bch2_trans_iter_exit(trans
, &bucket_gens_iter
);
1572 bch2_trans_iter_exit(trans
, &freespace_iter
);
1573 bch2_trans_iter_exit(trans
, &discard_iter
);
1574 bch2_trans_iter_exit(trans
, &iter
);
1581 ret
= for_each_btree_key(trans
, iter
,
1582 BTREE_ID_need_discard
, POS_MIN
,
1583 BTREE_ITER_prefetch
, k
,
1584 bch2_check_discard_freespace_key(trans
, &iter
));
1588 bch2_trans_iter_init(trans
, &iter
, BTREE_ID_freespace
, POS_MIN
,
1589 BTREE_ITER_prefetch
);
1591 bch2_trans_begin(trans
);
1592 k
= bch2_btree_iter_peek(&iter
);
1596 ret
= bkey_err(k
) ?:
1597 bch2_check_discard_freespace_key(trans
, &iter
);
1598 if (bch2_err_matches(ret
, BCH_ERR_transaction_restart
)) {
1603 struct printbuf buf
= PRINTBUF
;
1604 bch2_bkey_val_to_text(&buf
, c
, k
);
1606 bch_err(c
, "while checking %s", buf
.buf
);
1607 printbuf_exit(&buf
);
1611 bch2_btree_iter_set_pos(&iter
, bpos_nosnap_successor(iter
.pos
));
1613 bch2_trans_iter_exit(trans
, &iter
);
1617 ret
= for_each_btree_key_commit(trans
, iter
,
1618 BTREE_ID_bucket_gens
, POS_MIN
,
1619 BTREE_ITER_prefetch
, k
,
1620 NULL
, NULL
, BCH_TRANS_COMMIT_no_enospc
,
1621 bch2_check_bucket_gens_key(trans
, &iter
, k
));
1623 bch2_trans_put(trans
);
1628 static int bch2_check_alloc_to_lru_ref(struct btree_trans
*trans
,
1629 struct btree_iter
*alloc_iter
,
1630 struct bkey_buf
*last_flushed
)
1632 struct bch_fs
*c
= trans
->c
;
1633 struct bch_alloc_v4 a_convert
;
1634 const struct bch_alloc_v4
*a
;
1635 struct bkey_s_c alloc_k
;
1636 struct printbuf buf
= PRINTBUF
;
1639 alloc_k
= bch2_btree_iter_peek(alloc_iter
);
1643 ret
= bkey_err(alloc_k
);
1647 struct bch_dev
*ca
= bch2_dev_tryget_noerror(c
, alloc_k
.k
->p
.inode
);
1651 a
= bch2_alloc_to_v4(alloc_k
, &a_convert
);
1653 u64 lru_idx
= alloc_lru_idx_fragmentation(*a
, ca
);
1655 ret
= bch2_lru_check_set(trans
, BCH_LRU_FRAGMENTATION_START
,
1656 lru_idx
, alloc_k
, last_flushed
);
1661 if (a
->data_type
!= BCH_DATA_cached
)
1664 if (fsck_err_on(!a
->io_time
[READ
],
1665 trans
, alloc_key_cached_but_read_time_zero
,
1666 "cached bucket with read_time 0\n"
1668 (printbuf_reset(&buf
),
1669 bch2_bkey_val_to_text(&buf
, c
, alloc_k
), buf
.buf
))) {
1670 struct bkey_i_alloc_v4
*a_mut
=
1671 bch2_alloc_to_v4_mut(trans
, alloc_k
);
1672 ret
= PTR_ERR_OR_ZERO(a_mut
);
1676 a_mut
->v
.io_time
[READ
] = bch2_current_io_time(c
, READ
);
1677 ret
= bch2_trans_update(trans
, alloc_iter
,
1678 &a_mut
->k_i
, BTREE_TRIGGER_norun
);
1685 ret
= bch2_lru_check_set(trans
, alloc_k
.k
->p
.inode
, a
->io_time
[READ
],
1686 alloc_k
, last_flushed
);
1692 printbuf_exit(&buf
);
1696 int bch2_check_alloc_to_lru_refs(struct bch_fs
*c
)
1698 struct bkey_buf last_flushed
;
1700 bch2_bkey_buf_init(&last_flushed
);
1701 bkey_init(&last_flushed
.k
->k
);
1703 int ret
= bch2_trans_run(c
,
1704 for_each_btree_key_commit(trans
, iter
, BTREE_ID_alloc
,
1705 POS_MIN
, BTREE_ITER_prefetch
, k
,
1706 NULL
, NULL
, BCH_TRANS_COMMIT_no_enospc
,
1707 bch2_check_alloc_to_lru_ref(trans
, &iter
, &last_flushed
)));
1709 bch2_bkey_buf_exit(&last_flushed
, c
);
1714 static int discard_in_flight_add(struct bch_dev
*ca
, u64 bucket
, bool in_progress
)
1718 mutex_lock(&ca
->discard_buckets_in_flight_lock
);
1719 darray_for_each(ca
->discard_buckets_in_flight
, i
)
1720 if (i
->bucket
== bucket
) {
1721 ret
= -BCH_ERR_EEXIST_discard_in_flight_add
;
1725 ret
= darray_push(&ca
->discard_buckets_in_flight
, ((struct discard_in_flight
) {
1726 .in_progress
= in_progress
,
1730 mutex_unlock(&ca
->discard_buckets_in_flight_lock
);
1734 static void discard_in_flight_remove(struct bch_dev
*ca
, u64 bucket
)
1736 mutex_lock(&ca
->discard_buckets_in_flight_lock
);
1737 darray_for_each(ca
->discard_buckets_in_flight
, i
)
1738 if (i
->bucket
== bucket
) {
1739 BUG_ON(!i
->in_progress
);
1740 darray_remove_item(&ca
->discard_buckets_in_flight
, i
);
1745 mutex_unlock(&ca
->discard_buckets_in_flight_lock
);
1748 struct discard_buckets_state
{
1751 u64 need_journal_commit
;
1753 u64 need_journal_commit_this_dev
;
1756 static int bch2_discard_one_bucket(struct btree_trans
*trans
,
1758 struct btree_iter
*need_discard_iter
,
1759 struct bpos
*discard_pos_done
,
1760 struct discard_buckets_state
*s
)
1762 struct bch_fs
*c
= trans
->c
;
1763 struct bpos pos
= need_discard_iter
->pos
;
1764 struct btree_iter iter
= { NULL
};
1766 struct bkey_i_alloc_v4
*a
;
1767 struct printbuf buf
= PRINTBUF
;
1768 bool discard_locked
= false;
1771 if (bch2_bucket_is_open_safe(c
, pos
.inode
, pos
.offset
)) {
1776 if (bch2_bucket_needs_journal_commit(&c
->buckets_waiting_for_journal
,
1777 c
->journal
.flushed_seq_ondisk
,
1778 pos
.inode
, pos
.offset
)) {
1779 s
->need_journal_commit
++;
1780 s
->need_journal_commit_this_dev
++;
1784 k
= bch2_bkey_get_iter(trans
, &iter
, BTREE_ID_alloc
,
1785 need_discard_iter
->pos
,
1791 a
= bch2_alloc_to_v4_mut(trans
, k
);
1792 ret
= PTR_ERR_OR_ZERO(a
);
1796 if (bch2_bucket_sectors_total(a
->v
)) {
1797 if (bch2_trans_inconsistent_on(c
->curr_recovery_pass
> BCH_RECOVERY_PASS_check_alloc_info
,
1798 trans
, "attempting to discard bucket with dirty data\n%s",
1799 (bch2_bkey_val_to_text(&buf
, c
, k
), buf
.buf
)))
1804 if (a
->v
.data_type
!= BCH_DATA_need_discard
) {
1805 if (data_type_is_empty(a
->v
.data_type
) &&
1806 BCH_ALLOC_V4_NEED_INC_GEN(&a
->v
)) {
1808 SET_BCH_ALLOC_V4_NEED_INC_GEN(&a
->v
, false);
1812 if (bch2_trans_inconsistent_on(c
->curr_recovery_pass
> BCH_RECOVERY_PASS_check_alloc_info
,
1813 trans
, "bucket incorrectly set in need_discard btree\n"
1815 (bch2_bkey_val_to_text(&buf
, c
, k
), buf
.buf
)))
1820 if (a
->v
.journal_seq
> c
->journal
.flushed_seq_ondisk
) {
1821 if (bch2_trans_inconsistent_on(c
->curr_recovery_pass
> BCH_RECOVERY_PASS_check_alloc_info
,
1822 trans
, "clearing need_discard but journal_seq %llu > flushed_seq %llu\n%s",
1824 c
->journal
.flushed_seq_ondisk
,
1825 (bch2_bkey_val_to_text(&buf
, c
, k
), buf
.buf
)))
1830 if (discard_in_flight_add(ca
, iter
.pos
.offset
, true))
1833 discard_locked
= true;
1835 if (!bkey_eq(*discard_pos_done
, iter
.pos
) &&
1836 ca
->mi
.discard
&& !c
->opts
.nochanges
) {
1838 * This works without any other locks because this is the only
1839 * thread that removes items from the need_discard tree
1841 bch2_trans_unlock_long(trans
);
1842 blkdev_issue_discard(ca
->disk_sb
.bdev
,
1843 k
.k
->p
.offset
* ca
->mi
.bucket_size
,
1846 *discard_pos_done
= iter
.pos
;
1848 ret
= bch2_trans_relock_notrace(trans
);
1853 SET_BCH_ALLOC_V4_NEED_DISCARD(&a
->v
, false);
1855 alloc_data_type_set(&a
->v
, a
->v
.data_type
);
1857 ret
= bch2_trans_update(trans
, &iter
, &a
->k_i
, 0) ?:
1858 bch2_trans_commit(trans
, NULL
, NULL
,
1859 BCH_WATERMARK_btree
|
1860 BCH_TRANS_COMMIT_no_enospc
);
1864 count_event(c
, bucket_discard
);
1868 discard_in_flight_remove(ca
, iter
.pos
.offset
);
1870 bch2_trans_iter_exit(trans
, &iter
);
1871 printbuf_exit(&buf
);
1875 static void bch2_do_discards_work(struct work_struct
*work
)
1877 struct bch_dev
*ca
= container_of(work
, struct bch_dev
, discard_work
);
1878 struct bch_fs
*c
= ca
->fs
;
1879 struct discard_buckets_state s
= {};
1880 struct bpos discard_pos_done
= POS_MAX
;
1884 * We're doing the commit in bch2_discard_one_bucket instead of using
1885 * for_each_btree_key_commit() so that we can increment counters after
1886 * successful commit:
1888 ret
= bch2_trans_run(c
,
1889 for_each_btree_key_upto(trans
, iter
,
1890 BTREE_ID_need_discard
,
1891 POS(ca
->dev_idx
, 0),
1892 POS(ca
->dev_idx
, U64_MAX
), 0, k
,
1893 bch2_discard_one_bucket(trans
, ca
, &iter
, &discard_pos_done
, &s
)));
1895 trace_discard_buckets(c
, s
.seen
, s
.open
, s
.need_journal_commit
, s
.discarded
,
1898 percpu_ref_put(&ca
->io_ref
);
1899 bch2_write_ref_put(c
, BCH_WRITE_REF_discard
);
1902 void bch2_dev_do_discards(struct bch_dev
*ca
)
1904 struct bch_fs
*c
= ca
->fs
;
1906 if (!bch2_write_ref_tryget(c
, BCH_WRITE_REF_discard
))
1909 if (!bch2_dev_get_ioref(c
, ca
->dev_idx
, WRITE
))
1912 if (queue_work(c
->write_ref_wq
, &ca
->discard_work
))
1915 percpu_ref_put(&ca
->io_ref
);
1917 bch2_write_ref_put(c
, BCH_WRITE_REF_discard
);
1920 void bch2_do_discards(struct bch_fs
*c
)
1922 for_each_member_device(c
, ca
)
1923 bch2_dev_do_discards(ca
);
1926 static int bch2_clear_bucket_needs_discard(struct btree_trans
*trans
, struct bpos bucket
)
1928 struct btree_iter iter
;
1929 bch2_trans_iter_init(trans
, &iter
, BTREE_ID_alloc
, bucket
, BTREE_ITER_intent
);
1930 struct bkey_s_c k
= bch2_btree_iter_peek_slot(&iter
);
1931 int ret
= bkey_err(k
);
1935 struct bkey_i_alloc_v4
*a
= bch2_alloc_to_v4_mut(trans
, k
);
1936 ret
= PTR_ERR_OR_ZERO(a
);
1940 BUG_ON(a
->v
.dirty_sectors
);
1941 SET_BCH_ALLOC_V4_NEED_DISCARD(&a
->v
, false);
1942 alloc_data_type_set(&a
->v
, a
->v
.data_type
);
1944 ret
= bch2_trans_update(trans
, &iter
, &a
->k_i
, 0);
1946 bch2_trans_iter_exit(trans
, &iter
);
1950 static void bch2_do_discards_fast_work(struct work_struct
*work
)
1952 struct bch_dev
*ca
= container_of(work
, struct bch_dev
, discard_fast_work
);
1953 struct bch_fs
*c
= ca
->fs
;
1956 bool got_bucket
= false;
1959 mutex_lock(&ca
->discard_buckets_in_flight_lock
);
1960 darray_for_each(ca
->discard_buckets_in_flight
, i
) {
1966 i
->in_progress
= true;
1969 mutex_unlock(&ca
->discard_buckets_in_flight_lock
);
1974 if (ca
->mi
.discard
&& !c
->opts
.nochanges
)
1975 blkdev_issue_discard(ca
->disk_sb
.bdev
,
1976 bucket_to_sector(ca
, bucket
),
1980 int ret
= bch2_trans_commit_do(c
, NULL
, NULL
,
1981 BCH_WATERMARK_btree
|
1982 BCH_TRANS_COMMIT_no_enospc
,
1983 bch2_clear_bucket_needs_discard(trans
, POS(ca
->dev_idx
, bucket
)));
1986 discard_in_flight_remove(ca
, bucket
);
1992 percpu_ref_put(&ca
->io_ref
);
1993 bch2_write_ref_put(c
, BCH_WRITE_REF_discard_fast
);
1996 static void bch2_discard_one_bucket_fast(struct bch_dev
*ca
, u64 bucket
)
1998 struct bch_fs
*c
= ca
->fs
;
2000 if (discard_in_flight_add(ca
, bucket
, false))
2003 if (!bch2_write_ref_tryget(c
, BCH_WRITE_REF_discard_fast
))
2006 if (!bch2_dev_get_ioref(c
, ca
->dev_idx
, WRITE
))
2009 if (queue_work(c
->write_ref_wq
, &ca
->discard_fast_work
))
2012 percpu_ref_put(&ca
->io_ref
);
2014 bch2_write_ref_put(c
, BCH_WRITE_REF_discard_fast
);
2017 static int invalidate_one_bucket(struct btree_trans
*trans
,
2018 struct btree_iter
*lru_iter
,
2019 struct bkey_s_c lru_k
,
2020 s64
*nr_to_invalidate
)
2022 struct bch_fs
*c
= trans
->c
;
2023 struct bkey_i_alloc_v4
*a
= NULL
;
2024 struct printbuf buf
= PRINTBUF
;
2025 struct bpos bucket
= u64_to_bucket(lru_k
.k
->p
.offset
);
2026 unsigned cached_sectors
;
2029 if (*nr_to_invalidate
<= 0)
2032 if (!bch2_dev_bucket_exists(c
, bucket
)) {
2033 prt_str(&buf
, "lru entry points to invalid bucket");
2037 if (bch2_bucket_is_open_safe(c
, bucket
.inode
, bucket
.offset
))
2040 a
= bch2_trans_start_alloc_update(trans
, bucket
, BTREE_TRIGGER_bucket_invalidate
);
2041 ret
= PTR_ERR_OR_ZERO(a
);
2045 /* We expect harmless races here due to the btree write buffer: */
2046 if (lru_pos_time(lru_iter
->pos
) != alloc_lru_idx_read(a
->v
))
2049 BUG_ON(a
->v
.data_type
!= BCH_DATA_cached
);
2050 BUG_ON(a
->v
.dirty_sectors
);
2052 if (!a
->v
.cached_sectors
)
2053 bch_err(c
, "invalidating empty bucket, confused");
2055 cached_sectors
= a
->v
.cached_sectors
;
2057 SET_BCH_ALLOC_V4_NEED_INC_GEN(&a
->v
, false);
2060 a
->v
.dirty_sectors
= 0;
2061 a
->v
.stripe_sectors
= 0;
2062 a
->v
.cached_sectors
= 0;
2063 a
->v
.io_time
[READ
] = bch2_current_io_time(c
, READ
);
2064 a
->v
.io_time
[WRITE
] = bch2_current_io_time(c
, WRITE
);
2066 ret
= bch2_trans_commit(trans
, NULL
, NULL
,
2067 BCH_WATERMARK_btree
|
2068 BCH_TRANS_COMMIT_no_enospc
);
2072 trace_and_count(c
, bucket_invalidate
, c
, bucket
.inode
, bucket
.offset
, cached_sectors
);
2073 --*nr_to_invalidate
;
2075 printbuf_exit(&buf
);
2078 prt_str(&buf
, "\n lru key: ");
2079 bch2_bkey_val_to_text(&buf
, c
, lru_k
);
2081 prt_str(&buf
, "\n lru entry: ");
2082 bch2_lru_pos_to_text(&buf
, lru_iter
->pos
);
2084 prt_str(&buf
, "\n alloc key: ");
2086 bch2_bpos_to_text(&buf
, bucket
);
2088 bch2_bkey_val_to_text(&buf
, c
, bkey_i_to_s_c(&a
->k_i
));
2090 bch_err(c
, "%s", buf
.buf
);
2091 if (c
->curr_recovery_pass
> BCH_RECOVERY_PASS_check_lrus
) {
2092 bch2_inconsistent_error(c
);
2099 static struct bkey_s_c
next_lru_key(struct btree_trans
*trans
, struct btree_iter
*iter
,
2100 struct bch_dev
*ca
, bool *wrapped
)
2104 k
= bch2_btree_iter_peek_upto(iter
, lru_pos(ca
->dev_idx
, U64_MAX
, LRU_TIME_MAX
));
2105 if (!k
.k
&& !*wrapped
) {
2106 bch2_btree_iter_set_pos(iter
, lru_pos(ca
->dev_idx
, 0, 0));
2114 static void bch2_do_invalidates_work(struct work_struct
*work
)
2116 struct bch_dev
*ca
= container_of(work
, struct bch_dev
, invalidate_work
);
2117 struct bch_fs
*c
= ca
->fs
;
2118 struct btree_trans
*trans
= bch2_trans_get(c
);
2121 ret
= bch2_btree_write_buffer_tryflush(trans
);
2125 s64 nr_to_invalidate
=
2126 should_invalidate_buckets(ca
, bch2_dev_usage_read(ca
));
2127 struct btree_iter iter
;
2128 bool wrapped
= false;
2130 bch2_trans_iter_init(trans
, &iter
, BTREE_ID_lru
,
2131 lru_pos(ca
->dev_idx
, 0,
2132 ((bch2_current_io_time(c
, READ
) + U32_MAX
) &
2136 bch2_trans_begin(trans
);
2138 struct bkey_s_c k
= next_lru_key(trans
, &iter
, ca
, &wrapped
);
2145 ret
= invalidate_one_bucket(trans
, &iter
, k
, &nr_to_invalidate
);
2147 if (bch2_err_matches(ret
, BCH_ERR_transaction_restart
))
2152 bch2_btree_iter_advance(&iter
);
2154 bch2_trans_iter_exit(trans
, &iter
);
2156 bch2_trans_put(trans
);
2157 percpu_ref_put(&ca
->io_ref
);
2158 bch2_write_ref_put(c
, BCH_WRITE_REF_invalidate
);
2161 void bch2_dev_do_invalidates(struct bch_dev
*ca
)
2163 struct bch_fs
*c
= ca
->fs
;
2165 if (!bch2_write_ref_tryget(c
, BCH_WRITE_REF_invalidate
))
2168 if (!bch2_dev_get_ioref(c
, ca
->dev_idx
, WRITE
))
2171 if (queue_work(c
->write_ref_wq
, &ca
->invalidate_work
))
2174 percpu_ref_put(&ca
->io_ref
);
2176 bch2_write_ref_put(c
, BCH_WRITE_REF_invalidate
);
2179 void bch2_do_invalidates(struct bch_fs
*c
)
2181 for_each_member_device(c
, ca
)
2182 bch2_dev_do_invalidates(ca
);
2185 int bch2_dev_freespace_init(struct bch_fs
*c
, struct bch_dev
*ca
,
2186 u64 bucket_start
, u64 bucket_end
)
2188 struct btree_trans
*trans
= bch2_trans_get(c
);
2189 struct btree_iter iter
;
2192 struct bpos end
= POS(ca
->dev_idx
, bucket_end
);
2193 struct bch_member
*m
;
2194 unsigned long last_updated
= jiffies
;
2197 BUG_ON(bucket_start
> bucket_end
);
2198 BUG_ON(bucket_end
> ca
->mi
.nbuckets
);
2200 bch2_trans_iter_init(trans
, &iter
, BTREE_ID_alloc
,
2201 POS(ca
->dev_idx
, max_t(u64
, ca
->mi
.first_bucket
, bucket_start
)),
2202 BTREE_ITER_prefetch
);
2204 * Scan the alloc btree for every bucket on @ca, and add buckets to the
2205 * freespace/need_discard/need_gc_gens btrees as needed:
2208 if (time_after(jiffies
, last_updated
+ HZ
* 10)) {
2209 bch_info(ca
, "%s: currently at %llu/%llu",
2210 __func__
, iter
.pos
.offset
, ca
->mi
.nbuckets
);
2211 last_updated
= jiffies
;
2214 bch2_trans_begin(trans
);
2216 if (bkey_ge(iter
.pos
, end
)) {
2221 k
= bch2_get_key_or_hole(&iter
, end
, &hole
);
2228 * We process live keys in the alloc btree one at a
2231 struct bch_alloc_v4 a_convert
;
2232 const struct bch_alloc_v4
*a
= bch2_alloc_to_v4(k
, &a_convert
);
2234 ret
= bch2_bucket_do_index(trans
, ca
, k
, a
, true) ?:
2235 bch2_trans_commit(trans
, NULL
, NULL
,
2236 BCH_TRANS_COMMIT_no_enospc
);
2240 bch2_btree_iter_advance(&iter
);
2242 struct bkey_i
*freespace
;
2244 freespace
= bch2_trans_kmalloc(trans
, sizeof(*freespace
));
2245 ret
= PTR_ERR_OR_ZERO(freespace
);
2249 bkey_init(&freespace
->k
);
2250 freespace
->k
.type
= KEY_TYPE_set
;
2251 freespace
->k
.p
= k
.k
->p
;
2252 freespace
->k
.size
= k
.k
->size
;
2254 ret
= bch2_btree_insert_trans(trans
, BTREE_ID_freespace
, freespace
, 0) ?:
2255 bch2_trans_commit(trans
, NULL
, NULL
,
2256 BCH_TRANS_COMMIT_no_enospc
);
2260 bch2_btree_iter_set_pos(&iter
, k
.k
->p
);
2263 if (bch2_err_matches(ret
, BCH_ERR_transaction_restart
))
2269 bch2_trans_iter_exit(trans
, &iter
);
2270 bch2_trans_put(trans
);
2273 bch_err_msg(ca
, ret
, "initializing free space");
2277 mutex_lock(&c
->sb_lock
);
2278 m
= bch2_members_v2_get_mut(c
->disk_sb
.sb
, ca
->dev_idx
);
2279 SET_BCH_MEMBER_FREESPACE_INITIALIZED(m
, true);
2280 mutex_unlock(&c
->sb_lock
);
2285 int bch2_fs_freespace_init(struct bch_fs
*c
)
2288 bool doing_init
= false;
2291 * We can crash during the device add path, so we need to check this on
2295 for_each_member_device(c
, ca
) {
2296 if (ca
->mi
.freespace_initialized
)
2300 bch_info(c
, "initializing freespace");
2304 ret
= bch2_dev_freespace_init(c
, ca
, 0, ca
->mi
.nbuckets
);
2313 mutex_lock(&c
->sb_lock
);
2314 bch2_write_super(c
);
2315 mutex_unlock(&c
->sb_lock
);
2316 bch_verbose(c
, "done initializing freespace");
2322 /* device removal */
2324 int bch2_dev_remove_alloc(struct bch_fs
*c
, struct bch_dev
*ca
)
2326 struct bpos start
= POS(ca
->dev_idx
, 0);
2327 struct bpos end
= POS(ca
->dev_idx
, U64_MAX
);
2331 * We clear the LRU and need_discard btrees first so that we don't race
2332 * with bch2_do_invalidates() and bch2_do_discards()
2334 ret
= bch2_dev_remove_stripes(c
, ca
->dev_idx
) ?:
2335 bch2_btree_delete_range(c
, BTREE_ID_lru
, start
, end
,
2336 BTREE_TRIGGER_norun
, NULL
) ?:
2337 bch2_btree_delete_range(c
, BTREE_ID_need_discard
, start
, end
,
2338 BTREE_TRIGGER_norun
, NULL
) ?:
2339 bch2_btree_delete_range(c
, BTREE_ID_freespace
, start
, end
,
2340 BTREE_TRIGGER_norun
, NULL
) ?:
2341 bch2_btree_delete_range(c
, BTREE_ID_backpointers
, start
, end
,
2342 BTREE_TRIGGER_norun
, NULL
) ?:
2343 bch2_btree_delete_range(c
, BTREE_ID_bucket_gens
, start
, end
,
2344 BTREE_TRIGGER_norun
, NULL
) ?:
2345 bch2_btree_delete_range(c
, BTREE_ID_alloc
, start
, end
,
2346 BTREE_TRIGGER_norun
, NULL
) ?:
2347 bch2_dev_usage_remove(c
, ca
->dev_idx
);
2348 bch_err_msg(ca
, ret
, "removing dev alloc info");
2352 /* Bucket IO clocks: */
2354 static int __bch2_bucket_io_time_reset(struct btree_trans
*trans
, unsigned dev
,
2355 size_t bucket_nr
, int rw
)
2357 struct bch_fs
*c
= trans
->c
;
2359 struct btree_iter iter
;
2360 struct bkey_i_alloc_v4
*a
=
2361 bch2_trans_start_alloc_update_noupdate(trans
, &iter
, POS(dev
, bucket_nr
));
2362 int ret
= PTR_ERR_OR_ZERO(a
);
2366 u64 now
= bch2_current_io_time(c
, rw
);
2367 if (a
->v
.io_time
[rw
] == now
)
2370 a
->v
.io_time
[rw
] = now
;
2372 ret
= bch2_trans_update(trans
, &iter
, &a
->k_i
, 0) ?:
2373 bch2_trans_commit(trans
, NULL
, NULL
, 0);
2375 bch2_trans_iter_exit(trans
, &iter
);
2379 int bch2_bucket_io_time_reset(struct btree_trans
*trans
, unsigned dev
,
2380 size_t bucket_nr
, int rw
)
2382 if (bch2_trans_relock(trans
))
2383 bch2_trans_begin(trans
);
2385 return nested_lockrestart_do(trans
, __bch2_bucket_io_time_reset(trans
, dev
, bucket_nr
, rw
));
2388 /* Startup/shutdown (ro/rw): */
2390 void bch2_recalc_capacity(struct bch_fs
*c
)
2392 u64 capacity
= 0, reserved_sectors
= 0, gc_reserve
;
2393 unsigned bucket_size_max
= 0;
2394 unsigned long ra_pages
= 0;
2396 lockdep_assert_held(&c
->state_lock
);
2398 for_each_online_member(c
, ca
) {
2399 struct backing_dev_info
*bdi
= ca
->disk_sb
.bdev
->bd_disk
->bdi
;
2401 ra_pages
+= bdi
->ra_pages
;
2404 bch2_set_ra_pages(c
, ra_pages
);
2406 for_each_rw_member(c
, ca
) {
2407 u64 dev_reserve
= 0;
2410 * We need to reserve buckets (from the number
2411 * of currently available buckets) against
2412 * foreground writes so that mainly copygc can
2413 * make forward progress.
2415 * We need enough to refill the various reserves
2416 * from scratch - copygc will use its entire
2417 * reserve all at once, then run against when
2418 * its reserve is refilled (from the formerly
2419 * available buckets).
2421 * This reserve is just used when considering if
2422 * allocations for foreground writes must wait -
2423 * not -ENOSPC calculations.
2426 dev_reserve
+= ca
->nr_btree_reserve
* 2;
2427 dev_reserve
+= ca
->mi
.nbuckets
>> 6; /* copygc reserve */
2429 dev_reserve
+= 1; /* btree write point */
2430 dev_reserve
+= 1; /* copygc write point */
2431 dev_reserve
+= 1; /* rebalance write point */
2433 dev_reserve
*= ca
->mi
.bucket_size
;
2435 capacity
+= bucket_to_sector(ca
, ca
->mi
.nbuckets
-
2436 ca
->mi
.first_bucket
);
2438 reserved_sectors
+= dev_reserve
* 2;
2440 bucket_size_max
= max_t(unsigned, bucket_size_max
,
2441 ca
->mi
.bucket_size
);
2444 gc_reserve
= c
->opts
.gc_reserve_bytes
2445 ? c
->opts
.gc_reserve_bytes
>> 9
2446 : div64_u64(capacity
* c
->opts
.gc_reserve_percent
, 100);
2448 reserved_sectors
= max(gc_reserve
, reserved_sectors
);
2450 reserved_sectors
= min(reserved_sectors
, capacity
);
2452 c
->reserved
= reserved_sectors
;
2453 c
->capacity
= capacity
- reserved_sectors
;
2455 c
->bucket_size_max
= bucket_size_max
;
2457 /* Wake up case someone was waiting for buckets */
2458 closure_wake_up(&c
->freelist_wait
);
2461 u64
bch2_min_rw_member_capacity(struct bch_fs
*c
)
2465 for_each_rw_member(c
, ca
)
2466 ret
= min(ret
, ca
->mi
.nbuckets
* ca
->mi
.bucket_size
);
2470 static bool bch2_dev_has_open_write_point(struct bch_fs
*c
, struct bch_dev
*ca
)
2472 struct open_bucket
*ob
;
2475 for (ob
= c
->open_buckets
;
2476 ob
< c
->open_buckets
+ ARRAY_SIZE(c
->open_buckets
);
2478 spin_lock(&ob
->lock
);
2479 if (ob
->valid
&& !ob
->on_partial_list
&&
2480 ob
->dev
== ca
->dev_idx
)
2482 spin_unlock(&ob
->lock
);
2488 /* device goes ro: */
2489 void bch2_dev_allocator_remove(struct bch_fs
*c
, struct bch_dev
*ca
)
2491 lockdep_assert_held(&c
->state_lock
);
2493 /* First, remove device from allocation groups: */
2495 for (unsigned i
= 0; i
< ARRAY_SIZE(c
->rw_devs
); i
++)
2496 clear_bit(ca
->dev_idx
, c
->rw_devs
[i
].d
);
2498 c
->rw_devs_change_count
++;
2501 * Capacity is calculated based off of devices in allocation groups:
2503 bch2_recalc_capacity(c
);
2505 bch2_open_buckets_stop(c
, ca
, false);
2508 * Wake up threads that were blocked on allocation, so they can notice
2509 * the device can no longer be removed and the capacity has changed:
2511 closure_wake_up(&c
->freelist_wait
);
2514 * journal_res_get() can block waiting for free space in the journal -
2515 * it needs to notice there may not be devices to allocate from anymore:
2517 wake_up(&c
->journal
.wait
);
2519 /* Now wait for any in flight writes: */
2521 closure_wait_event(&c
->open_buckets_wait
,
2522 !bch2_dev_has_open_write_point(c
, ca
));
2525 /* device goes rw: */
2526 void bch2_dev_allocator_add(struct bch_fs
*c
, struct bch_dev
*ca
)
2528 lockdep_assert_held(&c
->state_lock
);
2530 for (unsigned i
= 0; i
< ARRAY_SIZE(c
->rw_devs
); i
++)
2531 if (ca
->mi
.data_allowed
& (1 << i
))
2532 set_bit(ca
->dev_idx
, c
->rw_devs
[i
].d
);
2534 c
->rw_devs_change_count
++;
2537 void bch2_dev_allocator_background_exit(struct bch_dev
*ca
)
2539 darray_exit(&ca
->discard_buckets_in_flight
);
2542 void bch2_dev_allocator_background_init(struct bch_dev
*ca
)
2544 mutex_init(&ca
->discard_buckets_in_flight_lock
);
2545 INIT_WORK(&ca
->discard_work
, bch2_do_discards_work
);
2546 INIT_WORK(&ca
->discard_fast_work
, bch2_do_discards_fast_work
);
2547 INIT_WORK(&ca
->invalidate_work
, bch2_do_invalidates_work
);
2550 void bch2_fs_allocator_background_init(struct bch_fs
*c
)
2552 spin_lock_init(&c
->freelist_lock
);