// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
 *
 * Code for managing the extent btree and dynamically updating the writeback
 * dirty sector count.
 */

#include "bcachefs.h"
#include "bkey_methods.h"
#include "btree_cache.h"
#include "btree_iter.h"
#include "checksum.h"
#include "compress.h"
#include "disk_groups.h"
#include "error.h"
#include "extents.h"
#include "util.h"
static unsigned bch2_crc_field_size_max[] = {
	[BCH_EXTENT_ENTRY_crc32]	= CRC32_SIZE_MAX,
	[BCH_EXTENT_ENTRY_crc64]	= CRC64_SIZE_MAX,
	[BCH_EXTENT_ENTRY_crc128]	= CRC128_SIZE_MAX,
};
static void bch2_extent_crc_pack(union bch_extent_crc *,
				 struct bch_extent_crc_unpacked,
				 enum bch_extent_entry_type);
struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *f,
						 unsigned dev)
{
	struct bch_dev_io_failures *i;

	for (i = f->devs; i < f->devs + f->nr; i++)
		if (i->dev == dev)
			return i;

	return NULL;
}
void bch2_mark_io_failure(struct bch_io_failures *failed,
			  struct extent_ptr_decoded *p)
{
	struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, p->ptr.dev);

	if (!f) {
		BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs));

		f = &failed->devs[failed->nr++];
		f->dev		= p->ptr.dev;
		f->idx		= p->idx;
		f->nr_failed	= 1;
		f->nr_retries	= 0;
	} else if (p->idx != f->idx) {
		f->idx		= p->idx;
		f->nr_failed	= 1;
		f->nr_retries	= 0;
	} else {
		f->nr_failed++;
	}
}
static inline u64 dev_latency(struct bch_fs *c, unsigned dev)
{
	struct bch_dev *ca = bch2_dev_rcu(c, dev);

	return ca ? atomic64_read(&ca->cur_latency[READ]) : S64_MAX;
}
/*
 * returns true if p1 is better than p2:
 */
static inline bool ptr_better(struct bch_fs *c,
			      const struct extent_ptr_decoded p1,
			      const struct extent_ptr_decoded p2)
{
	if (likely(!p1.idx && !p2.idx)) {
		u64 l1 = dev_latency(c, p1.ptr.dev);
		u64 l2 = dev_latency(c, p2.ptr.dev);

		/* Pick at random, biased in favor of the faster device: */

		return bch2_rand_range(l1 + l2) > l1;
	}

	if (bch2_force_reconstruct_read)
		return p1.idx > p2.idx;

	return p1.idx < p2.idx;
}
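/*
 * Illustrative example of the biased pick above: with measured latencies
 * l1 = 1ms and l2 = 3ms, bch2_rand_range(l1 + l2) is uniform below 4ms and
 * exceeds l1 roughly 75% of the time, so the faster device (p1) wins in
 * proportion to how much faster it is.
 */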
/*
 * This picks a non-stale pointer, preferably from a device other than @avoid.
 * Avoid can be NULL, meaning pick any. If there are no non-stale pointers to
 * other devices, it will still pick a pointer from avoid.
 */
int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
			       struct bch_io_failures *failed,
			       struct extent_ptr_decoded *pick)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	struct bch_dev_io_failures *f;
	int ret = 0;

	if (k.k->type == KEY_TYPE_error)
		return -BCH_ERR_key_type_error;

	rcu_read_lock();
	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
		/*
		 * Unwritten extent: no need to actually read, treat it as a
		 * hole and return 0s:
		 */
		if (p.ptr.unwritten) {
			ret = 0;
			break;
		}

		/*
		 * If there are any dirty pointers it's an error if we can't
		 * read:
		 */
		if (!ret && !p.ptr.cached)
			ret = -BCH_ERR_no_device_to_read_from;

		struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);

		if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr)))
			continue;

		f = failed ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL;
		if (f)
			p.idx = f->nr_failed < f->nr_retries
				? f->idx
				: f->idx + 1;

		if (!p.idx && (!ca || !bch2_dev_is_readable(ca)))
			p.idx++;

		if (!p.idx && p.has_ec && bch2_force_reconstruct_read)
			p.idx++;

		if (p.idx > (unsigned) p.has_ec)
			continue;

		if (ret > 0 && !ptr_better(c, p, *pick))
			continue;

		*pick = p;
		ret = 1;
	}
	rcu_read_unlock();

	return ret;
}
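/*
 * Note on p.idx above: idx selects how a replica is read - 0 means read the
 * pointer directly, and (when p.has_ec) idx == 1 means fall back to erasure
 * coded reconstruction, which is why pointers with idx > p.has_ec are
 * skipped entirely.
 */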
/* KEY_TYPE_btree_ptr: */

int bch2_btree_ptr_validate(struct bch_fs *c, struct bkey_s_c k,
			    enum bch_validate_flags flags)
{
	int ret = 0;

	bkey_fsck_err_on(bkey_val_u64s(k.k) > BCH_REPLICAS_MAX,
			 c, btree_ptr_val_too_big,
			 "value too big (%zu > %u)", bkey_val_u64s(k.k), BCH_REPLICAS_MAX);

	ret = bch2_bkey_ptrs_validate(c, k, flags);
fsck_err:
	return ret;
}

void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
			    struct bkey_s_c k)
{
	bch2_bkey_ptrs_to_text(out, c, k);
}
int bch2_btree_ptr_v2_validate(struct bch_fs *c, struct bkey_s_c k,
			       enum bch_validate_flags flags)
{
	struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
	int ret = 0;

	bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX,
			 c, btree_ptr_v2_val_too_big,
			 "value too big (%zu > %zu)",
			 bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX);

	bkey_fsck_err_on(bpos_ge(bp.v->min_key, bp.k->p),
			 c, btree_ptr_v2_min_key_bad,
			 "min_key > key");

	if (flags & BCH_VALIDATE_write)
		bkey_fsck_err_on(!bp.v->sectors_written,
				 c, btree_ptr_v2_written_0,
				 "sectors_written == 0");

	ret = bch2_bkey_ptrs_validate(c, k, flags);
fsck_err:
	return ret;
}

void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c,
			       struct bkey_s_c k)
{
	struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);

	prt_printf(out, "seq %llx written %u min_key %s",
		   le64_to_cpu(bp.v->seq),
		   le16_to_cpu(bp.v->sectors_written),
		   BTREE_PTR_RANGE_UPDATED(bp.v) ? "R " : "");

	bch2_bpos_to_text(out, bp.v->min_key);
	prt_printf(out, " ");
	bch2_bkey_ptrs_to_text(out, c, k);
}
void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version,
			      unsigned big_endian, int write,
			      struct bkey_s k)
{
	struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(k);

	compat_bpos(0, btree_id, version, big_endian, write, &bp.v->min_key);

	if (version < bcachefs_metadata_version_inode_btree_change &&
	    btree_id_is_extents(btree_id) &&
	    !bkey_eq(bp.v->min_key, POS_MIN))
		bp.v->min_key = write
			? bpos_nosnap_predecessor(bp.v->min_key)
			: bpos_nosnap_successor(bp.v->min_key);
}
/* KEY_TYPE_extent: */

bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
{
	struct bkey_ptrs   l_ptrs = bch2_bkey_ptrs(l);
	struct bkey_ptrs_c r_ptrs = bch2_bkey_ptrs_c(r);
	union bch_extent_entry *en_l;
	const union bch_extent_entry *en_r;
	struct extent_ptr_decoded lp, rp;
	bool use_right_ptr;

	en_l = l_ptrs.start;
	en_r = r_ptrs.start;
	while (en_l < l_ptrs.end && en_r < r_ptrs.end) {
		if (extent_entry_type(en_l) != extent_entry_type(en_r))
			return false;

		en_l = extent_entry_next(en_l);
		en_r = extent_entry_next(en_r);
	}

	if (en_l < l_ptrs.end || en_r < r_ptrs.end)
		return false;

	en_l = l_ptrs.start;
	en_r = r_ptrs.start;
	lp.crc = bch2_extent_crc_unpack(l.k, NULL);
	rp.crc = bch2_extent_crc_unpack(r.k, NULL);

	while (__bkey_ptr_next_decode(l.k, l_ptrs.end, lp, en_l) &&
	       __bkey_ptr_next_decode(r.k, r_ptrs.end, rp, en_r)) {
		if (lp.ptr.offset + lp.crc.offset + lp.crc.live_size !=
		    rp.ptr.offset + rp.crc.offset ||
		    lp.ptr.dev			!= rp.ptr.dev ||
		    lp.ptr.gen			!= rp.ptr.gen ||
		    lp.ptr.unwritten		!= rp.ptr.unwritten ||
		    lp.has_ec			!= rp.has_ec)
			return false;

		/* Extents may not straddle buckets: */
		rcu_read_lock();
		struct bch_dev *ca = bch2_dev_rcu(c, lp.ptr.dev);
		bool same_bucket = ca && PTR_BUCKET_NR(ca, &lp.ptr) == PTR_BUCKET_NR(ca, &rp.ptr);
		rcu_read_unlock();

		if (!same_bucket)
			return false;

		if (lp.has_ec			!= rp.has_ec ||
		    (lp.has_ec &&
		     (lp.ec.block		!= rp.ec.block ||
		      lp.ec.redundancy		!= rp.ec.redundancy ||
		      lp.ec.idx			!= rp.ec.idx)))
			return false;

		if (lp.crc.compression_type	!= rp.crc.compression_type ||
		    lp.crc.nonce		!= rp.crc.nonce)
			return false;

		if (lp.crc.offset + lp.crc.live_size + rp.crc.live_size <=
		    lp.crc.uncompressed_size) {
			/* can use left extent's crc entry */
		} else if (lp.crc.live_size <= rp.crc.offset) {
			/* can use right extent's crc entry */
		} else {
			/* check if checksums can be merged: */
			if (lp.crc.csum_type		!= rp.crc.csum_type ||
			    lp.crc.nonce		!= rp.crc.nonce ||
			    crc_is_compressed(lp.crc) ||
			    !bch2_checksum_mergeable(lp.crc.csum_type))
				return false;

			if (lp.crc.offset + lp.crc.live_size != lp.crc.compressed_size ||
			    rp.crc.offset)
				return false;

			if (lp.crc.csum_type &&
			    lp.crc.uncompressed_size +
			    rp.crc.uncompressed_size > (c->opts.encoded_extent_max >> 9))
				return false;
		}

		en_l = extent_entry_next(en_l);
		en_r = extent_entry_next(en_r);
	}

	en_l = l_ptrs.start;
	en_r = r_ptrs.start;
	while (en_l < l_ptrs.end && en_r < r_ptrs.end) {
		if (extent_entry_is_crc(en_l)) {
			struct bch_extent_crc_unpacked crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l));
			struct bch_extent_crc_unpacked crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r));

			if (crc_l.uncompressed_size + crc_r.uncompressed_size >
			    bch2_crc_field_size_max[extent_entry_type(en_l)])
				return false;
		}

		en_l = extent_entry_next(en_l);
		en_r = extent_entry_next(en_r);
	}

	use_right_ptr = false;
	en_l = l_ptrs.start;
	en_r = r_ptrs.start;
	while (en_l < l_ptrs.end) {
		if (extent_entry_type(en_l) == BCH_EXTENT_ENTRY_ptr &&
		    use_right_ptr)
			en_l->ptr = en_r->ptr;

		if (extent_entry_is_crc(en_l)) {
			struct bch_extent_crc_unpacked crc_l =
				bch2_extent_crc_unpack(l.k, entry_to_crc(en_l));
			struct bch_extent_crc_unpacked crc_r =
				bch2_extent_crc_unpack(r.k, entry_to_crc(en_r));

			use_right_ptr = false;

			if (crc_l.offset + crc_l.live_size + crc_r.live_size <=
			    crc_l.uncompressed_size) {
				/* can use left extent's crc entry */
			} else if (crc_l.live_size <= crc_r.offset) {
				/* can use right extent's crc entry */
				crc_r.offset -= crc_l.live_size;
				bch2_extent_crc_pack(entry_to_crc(en_l), crc_r,
						     extent_entry_type(en_l));
				use_right_ptr = true;
			} else {
				crc_l.csum = bch2_checksum_merge(crc_l.csum_type,
								 crc_l.csum,
								 crc_r.csum,
								 crc_r.uncompressed_size << 9);

				crc_l.uncompressed_size	+= crc_r.uncompressed_size;
				crc_l.compressed_size	+= crc_r.compressed_size;
				bch2_extent_crc_pack(entry_to_crc(en_l), crc_l,
						     extent_entry_type(en_l));
			}
		}

		en_l = extent_entry_next(en_l);
		en_r = extent_entry_next(en_r);
	}

	bch2_key_resize(l.k, l.k->size + r.k->size);
	return true;
}
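/*
 * Summary of the merge requirements checked above: both keys must carry the
 * same sequence of entry types; each pointer pair must be contiguous on the
 * same device, generation and bucket; stripe and compression/nonce settings
 * must agree; and merged checksum entries may not overflow the packed size
 * fields (bch2_crc_field_size_max) or encoded_extent_max.
 */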
/* KEY_TYPE_reservation: */

int bch2_reservation_validate(struct bch_fs *c, struct bkey_s_c k,
			      enum bch_validate_flags flags)
{
	struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
	int ret = 0;

	bkey_fsck_err_on(!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX,
			 c, reservation_key_nr_replicas_invalid,
			 "invalid nr_replicas (%u)", r.v->nr_replicas);
fsck_err:
	return ret;
}

void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c,
			      struct bkey_s_c k)
{
	struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);

	prt_printf(out, "generation %u replicas %u",
		   le32_to_cpu(r.v->generation),
		   r.v->nr_replicas);
}

bool bch2_reservation_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r)
{
	struct bkey_s_reservation l = bkey_s_to_reservation(_l);
	struct bkey_s_c_reservation r = bkey_s_c_to_reservation(_r);

	if (l.v->generation != r.v->generation ||
	    l.v->nr_replicas != r.v->nr_replicas)
		return false;

	bch2_key_resize(l.k, l.k->size + r.k->size);
	return true;
}
/* Extent checksum entries: */

/* returns true if not equal */
static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l,
					 struct bch_extent_crc_unpacked r)
{
	return (l.csum_type		!= r.csum_type ||
		l.compression_type	!= r.compression_type ||
		l.compressed_size	!= r.compressed_size ||
		l.uncompressed_size	!= r.uncompressed_size ||
		l.offset		!= r.offset ||
		l.live_size		!= r.live_size ||
		l.nonce			!= r.nonce ||
		bch2_crc_cmp(l.csum, r.csum));
}
static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u,
				  struct bch_extent_crc_unpacked n)
{
	return !crc_is_compressed(u) &&
		u.csum_type &&
		u.uncompressed_size > u.live_size &&
		bch2_csum_type_is_encryption(u.csum_type) ==
		bch2_csum_type_is_encryption(n.csum_type);
}
bool bch2_can_narrow_extent_crcs(struct bkey_s_c k,
				 struct bch_extent_crc_unpacked n)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	struct bch_extent_crc_unpacked crc;
	const union bch_extent_entry *i;

	if (!n.csum_type)
		return false;

	bkey_for_each_crc(k.k, ptrs, crc, i)
		if (can_narrow_crc(crc, n))
			return true;

	return false;
}
/*
 * We're writing another replica for this extent, so while we've got the data in
 * memory we'll be computing a new checksum for the currently live data.
 *
 * If there are other replicas we aren't moving, and they are checksummed but
 * not compressed, we can modify them to point to only the data that is
 * currently live (so that readers won't have to bounce) while we've got the
 * checksum we need:
 */
bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n)
{
	struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
	struct bch_extent_crc_unpacked u;
	struct extent_ptr_decoded p;
	union bch_extent_entry *i;
	bool ret = false;

	/* Find a checksum entry that covers only live data: */
	if (!n.live_size) {
		bkey_for_each_crc(&k->k, ptrs, u, i)
			if (!crc_is_compressed(u) &&
			    u.csum_type &&
			    u.live_size == u.uncompressed_size) {
				n = u;
				goto found;
			}
		return false;
	}
found:
	BUG_ON(crc_is_compressed(n));
	BUG_ON(n.offset);
	BUG_ON(n.live_size != k->k.size);

restart_narrow_pointers:
	ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));

	bkey_for_each_ptr_decode(&k->k, ptrs, p, i)
		if (can_narrow_crc(p.crc, n)) {
			bch2_bkey_drop_ptr_noerror(bkey_i_to_s(k), &i->ptr);
			p.ptr.offset += p.crc.offset;
			p.crc = n;
			bch2_extent_ptr_decoded_append(k, &p);
			ret = true;
			goto restart_narrow_pointers;
		}

	return ret;
}
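/*
 * Illustrative example (numbers made up): a pointer whose crc entry covers an
 * uncompressed_size of 128 sectors but only live_size == 8 sectors forces
 * readers to checksum all 128 sectors and then bounce. If we already hold a
 * checksum @n covering just those 8 live sectors, the pointer can be
 * re-appended with crc @n and its offset advanced, so future reads touch only
 * the live data.
 */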
static void bch2_extent_crc_pack(union bch_extent_crc *dst,
				 struct bch_extent_crc_unpacked src,
				 enum bch_extent_entry_type type)
{
#define set_common_fields(_dst, _src)					\
		_dst.type		= 1 << type;			\
		_dst.csum_type		= _src.csum_type,		\
		_dst.compression_type	= _src.compression_type,	\
		_dst._compressed_size	= _src.compressed_size - 1,	\
		_dst._uncompressed_size	= _src.uncompressed_size - 1,	\
		_dst.offset		= _src.offset

	switch (type) {
	case BCH_EXTENT_ENTRY_crc32:
		set_common_fields(dst->crc32, src);
		dst->crc32.csum		= (u32 __force) *((__le32 *) &src.csum.lo);
		break;
	case BCH_EXTENT_ENTRY_crc64:
		set_common_fields(dst->crc64, src);
		dst->crc64.nonce	= src.nonce;
		dst->crc64.csum_lo	= (u64 __force) src.csum.lo;
		dst->crc64.csum_hi	= (u64 __force) *((__le16 *) &src.csum.hi);
		break;
	case BCH_EXTENT_ENTRY_crc128:
		set_common_fields(dst->crc128, src);
		dst->crc128.nonce	= src.nonce;
		dst->crc128.csum	= src.csum;
		break;
	default:
		BUG();
	}
#undef set_common_fields
}
void bch2_extent_crc_append(struct bkey_i *k,
			    struct bch_extent_crc_unpacked new)
{
	struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
	union bch_extent_crc *crc = (void *) ptrs.end;
	enum bch_extent_entry_type type;

	if (bch_crc_bytes[new.csum_type]	<= 4 &&
	    new.uncompressed_size		<= CRC32_SIZE_MAX &&
	    new.nonce				<= CRC32_NONCE_MAX)
		type = BCH_EXTENT_ENTRY_crc32;
	else if (bch_crc_bytes[new.csum_type]	<= 10 &&
		 new.uncompressed_size		<= CRC64_SIZE_MAX &&
		 new.nonce			<= CRC64_NONCE_MAX)
		type = BCH_EXTENT_ENTRY_crc64;
	else if (bch_crc_bytes[new.csum_type]	<= 16 &&
		 new.uncompressed_size		<= CRC128_SIZE_MAX &&
		 new.nonce			<= CRC128_NONCE_MAX)
		type = BCH_EXTENT_ENTRY_crc128;
	else
		BUG();

	bch2_extent_crc_pack(crc, new, type);

	k->k.u64s += extent_entry_u64s(ptrs.end);

	EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX);
}
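/*
 * The smallest packed representation that fits is chosen above: e.g. a 4 byte
 * checksum over an extent small enough for the crc32 entry's size/nonce
 * fields packs into BCH_EXTENT_ENTRY_crc32; larger checksums, sizes or nonces
 * fall back to the crc64 and crc128 layouts.
 */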
/* Generic code for keys with pointers: */

unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k)
{
	return bch2_bkey_devs(k).nr;
}

unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k)
{
	return k.k->type == KEY_TYPE_reservation
		? bkey_s_c_to_reservation(k).v->nr_replicas
		: bch2_bkey_dirty_devs(k).nr;
}
unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c k)
{
	unsigned ret = 0;

	if (k.k->type == KEY_TYPE_reservation) {
		ret = bkey_s_c_to_reservation(k).v->nr_replicas;
	} else {
		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
		const union bch_extent_entry *entry;
		struct extent_ptr_decoded p;

		bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
			ret += !p.ptr.cached && !crc_is_compressed(p.crc);
	}

	return ret;
}
unsigned bch2_bkey_sectors_compressed(struct bkey_s_c k)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	unsigned ret = 0;

	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
		if (!p.ptr.cached && crc_is_compressed(p.crc))
			ret += p.crc.compressed_size;

	return ret;
}
bool bch2_bkey_is_incompressible(struct bkey_s_c k)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct bch_extent_crc_unpacked crc;

	bkey_for_each_crc(k.k, ptrs, crc, entry)
		if (crc.compression_type == BCH_COMPRESSION_TYPE_incompressible)
			return true;
	return false;
}
unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p = { 0 };
	unsigned replicas = 0;

	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
		if (p.ptr.cached)
			continue;

		if (p.has_ec)
			replicas += p.ec.redundancy;

		replicas++;
	}

	return replicas;
}
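/*
 * Example with the reconstruction above: an extent with two non-cached
 * pointers, one of them in a stripe with ec.redundancy == 1, reports
 * 2 + 1 = 3 replicas.
 */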
static inline unsigned __extent_ptr_durability(struct bch_dev *ca, struct extent_ptr_decoded *p)
{
	if (p->ptr.cached)
		return 0;

	return p->has_ec
		? p->ec.redundancy + 1
		: ca->mi.durability;
}
unsigned bch2_extent_ptr_desired_durability(struct bch_fs *c, struct extent_ptr_decoded *p)
{
	struct bch_dev *ca = bch2_dev_rcu(c, p->ptr.dev);

	return ca ? __extent_ptr_durability(ca, p) : 0;
}
unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p)
{
	struct bch_dev *ca = bch2_dev_rcu(c, p->ptr.dev);

	if (!ca || ca->mi.state == BCH_MEMBER_STATE_failed)
		return 0;

	return __extent_ptr_durability(ca, p);
}
unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	unsigned durability = 0;

	rcu_read_lock();
	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
		durability += bch2_extent_ptr_durability(c, &p);
	rcu_read_unlock();

	return durability;
}
static unsigned bch2_bkey_durability_safe(struct bch_fs *c, struct bkey_s_c k)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	unsigned durability = 0;

	rcu_read_lock();
	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
		if (p.ptr.dev < c->sb.nr_devices && c->devs[p.ptr.dev])
			durability += bch2_extent_ptr_durability(c, &p);
	rcu_read_unlock();

	return durability;
}
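/*
 * Unlike bch2_bkey_durability(), the _safe variant skips pointers whose
 * device index is out of range or not currently present, so it can be used
 * from the to_text path on possibly-invalid keys.
 */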
void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry)
{
	union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k));
	union bch_extent_entry *next = extent_entry_next(entry);

	memmove_u64s(entry, next, (u64 *) end - (u64 *) next);
	k->k.u64s -= extent_entry_u64s(entry);
}
void bch2_extent_ptr_decoded_append(struct bkey_i *k,
				    struct extent_ptr_decoded *p)
{
	struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
	struct bch_extent_crc_unpacked crc =
		bch2_extent_crc_unpack(&k->k, NULL);
	union bch_extent_entry *pos;

	if (!bch2_crc_unpacked_cmp(crc, p->crc)) {
		pos = ptrs.start;
		goto found;
	}

	bkey_for_each_crc(&k->k, ptrs, crc, pos)
		if (!bch2_crc_unpacked_cmp(crc, p->crc)) {
			pos = extent_entry_next(pos);
			goto found;
		}

	bch2_extent_crc_append(k, p->crc);
	pos = bkey_val_end(bkey_i_to_s(k));
found:
	p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
	__extent_entry_insert(k, pos, to_entry(&p->ptr));

	if (p->has_ec) {
		p->ec.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr;
		__extent_entry_insert(k, pos, to_entry(&p->ec));
	}
}
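/*
 * Note on the append above: the entry list layout keeps each checksum entry
 * ahead of the pointer(s) it covers, so the decoded pointer is inserted right
 * after a matching crc entry when one exists; otherwise a new crc entry is
 * appended first and the pointer (and its stripe pointer, if any) is placed
 * after it.
 */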
static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs,
					  union bch_extent_entry *entry)
{
	union bch_extent_entry *i = ptrs.start;

	if (i == entry)
		return NULL;

	while (extent_entry_next(i) != entry)
		i = extent_entry_next(i);
	return i;
}
/*
 * Returns pointer to the next entry after the one being dropped:
 */
void bch2_bkey_drop_ptr_noerror(struct bkey_s k, struct bch_extent_ptr *ptr)
{
	struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
	union bch_extent_entry *entry = to_entry(ptr), *next;
	bool drop_crc = true;

	if (k.k->type == KEY_TYPE_stripe) {
		ptr->dev = BCH_SB_MEMBER_INVALID;
		return;
	}

	EBUG_ON(ptr < &ptrs.start->ptr ||
		ptr >= &ptrs.end->ptr);
	EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr);

	for (next = extent_entry_next(entry);
	     next != ptrs.end;
	     next = extent_entry_next(next)) {
		if (extent_entry_is_crc(next)) {
			break;
		} else if (extent_entry_is_ptr(next)) {
			drop_crc = false;
			break;
		}
	}

	extent_entry_drop(k, entry);

	while ((entry = extent_entry_prev(ptrs, entry))) {
		if (extent_entry_is_ptr(entry))
			break;

		if ((extent_entry_is_crc(entry) && drop_crc) ||
		    extent_entry_is_stripe_ptr(entry))
			extent_entry_drop(k, entry);
	}
}
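/*
 * After dropping the pointer itself, the loop above also drops a preceding
 * crc entry that no longer covers any pointer (and any stripe pointer
 * belonging to the dropped pointer), so the entry list stays well formed.
 */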
void bch2_bkey_drop_ptr(struct bkey_s k, struct bch_extent_ptr *ptr)
{
	if (k.k->type != KEY_TYPE_stripe) {
		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k.s_c);
		const union bch_extent_entry *entry;
		struct extent_ptr_decoded p;

		bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
			if (p.ptr.dev == ptr->dev && p.has_ec) {
				ptr->dev = BCH_SB_MEMBER_INVALID;
				return;
			}
	}

	bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr;

	bch2_bkey_drop_ptr_noerror(k, ptr);

	/*
	 * If we deleted all the dirty pointers and there's still cached
	 * pointers, we could set the cached pointers to dirty if they're not
	 * stale - but to do that correctly we'd need to grab an open_bucket
	 * reference so that we don't race with bucket reuse:
	 */
	if (have_dirty &&
	    !bch2_bkey_dirty_devs(k.s_c).nr) {
		k.k->type = KEY_TYPE_error;
		set_bkey_val_u64s(k.k, 0);
	} else if (!bch2_bkey_nr_ptrs(k.s_c)) {
		k.k->type = KEY_TYPE_deleted;
		set_bkey_val_u64s(k.k, 0);
	}
}
void bch2_bkey_drop_device(struct bkey_s k, unsigned dev)
{
	bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev);
}

void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev)
{
	bch2_bkey_drop_ptrs_noerror(k, ptr, ptr->dev == dev);
}
const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned dev)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);

	bkey_for_each_ptr(ptrs, ptr)
		if (ptr->dev == dev)
			return ptr;

	return NULL;
}
bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	struct bch_dev *ca;
	bool ret = false;

	rcu_read_lock();
	bkey_for_each_ptr(ptrs, ptr)
		if (bch2_dev_in_target(c, ptr->dev, target) &&
		    (ca = bch2_dev_rcu(c, ptr->dev)) &&
		    (!ptr->cached ||
		     !dev_ptr_stale_rcu(ca, ptr))) {
			ret = true;
			break;
		}
	rcu_read_unlock();

	return ret;
}
bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k,
			   struct bch_extent_ptr m, u64 offset)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;

	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
		if (p.ptr.dev	== m.dev &&
		    p.ptr.gen	== m.gen &&
		    (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) ==
		    (s64) m.offset - offset)
			return true;

	return false;
}
/*
 * Returns true if two extents refer to the same data:
 */
bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2)
{
	if (k1.k->type != k2.k->type)
		return false;

	if (bkey_extent_is_direct_data(k1.k)) {
		struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(k1);
		struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2);
		const union bch_extent_entry *entry1, *entry2;
		struct extent_ptr_decoded p1, p2;

		if (bkey_extent_is_unwritten(k1) != bkey_extent_is_unwritten(k2))
			return false;

		bkey_for_each_ptr_decode(k1.k, ptrs1, p1, entry1)
			bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
				if (p1.ptr.dev		== p2.ptr.dev &&
				    p1.ptr.gen		== p2.ptr.gen &&

				    /*
				     * This checks that the two pointers point
				     * to the same region on disk - adjusting
				     * for the difference in where the extents
				     * start, since one may have been trimmed:
				     */
				    (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
				    (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k) &&

				    /*
				     * This additionally checks that the
				     * extents overlap on disk, since the
				     * previous check may trigger spuriously
				     * when one extent is immediately partially
				     * overwritten with another extent (so that
				     * on disk they are adjacent) and
				     * compression is in use:
				     */
				    ((p1.ptr.offset >= p2.ptr.offset &&
				      p1.ptr.offset  < p2.ptr.offset + p2.crc.compressed_size) ||
				     (p2.ptr.offset >= p1.ptr.offset &&
				      p2.ptr.offset  < p1.ptr.offset + p1.crc.compressed_size)))
					return true;

		return false;
	} else {
		/* KEY_TYPE_deleted, etc. */
		return bkey_val_bytes(k1.k) == bkey_val_bytes(k2.k) &&
			!memcmp(k1.v, k2.v, bkey_val_bytes(k1.k));
	}
}
struct bch_extent_ptr *
bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, struct bkey_s k2)
{
	struct bkey_ptrs ptrs2 = bch2_bkey_ptrs(k2);
	union bch_extent_entry *entry2;
	struct extent_ptr_decoded p2;

	bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
		if (p1.ptr.dev		== p2.ptr.dev &&
		    p1.ptr.gen		== p2.ptr.gen &&
		    (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
		    (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
			return &entry2->ptr;

	return NULL;
}
static bool want_cached_ptr(struct bch_fs *c, struct bch_io_opts *opts,
			    struct bch_extent_ptr *ptr)
{
	if (!opts->promote_target ||
	    !bch2_dev_in_target(c, ptr->dev, opts->promote_target))
		return false;

	struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);

	return ca && bch2_dev_is_readable(ca) && !dev_ptr_stale_rcu(ca, ptr);
}
void bch2_extent_ptr_set_cached(struct bch_fs *c,
				struct bch_io_opts *opts,
				struct bkey_s k,
				struct bch_extent_ptr *ptr)
{
	struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
	union bch_extent_entry *entry;
	struct extent_ptr_decoded p;

	rcu_read_lock();
	if (!want_cached_ptr(c, opts, ptr)) {
		bch2_bkey_drop_ptr_noerror(k, ptr);
		goto out;
	}

	/*
	 * Stripes can't contain cached data, for - reasons.
	 *
	 * Possibly something we can fix in the future?
	 */
	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
		if (&entry->ptr == ptr) {
			if (p.has_ec)
				bch2_bkey_drop_ptr_noerror(k, ptr);
			else
				ptr->cached = true;
			goto out;
		}

	BUG();
out:
	rcu_read_unlock();
}
/*
 * bch2_extent_normalize - clean up an extent, dropping stale pointers etc.
 *
 * Returns true if @k should be dropped entirely
 *
 * For existing keys, only called when btree nodes are being rewritten, not when
 * they're merely being compacted/resorted in memory.
 */
bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
{
	struct bch_dev *ca;

	rcu_read_lock();
	bch2_bkey_drop_ptrs(k, ptr,
		ptr->cached &&
		(!(ca = bch2_dev_rcu(c, ptr->dev)) ||
		 dev_ptr_stale_rcu(ca, ptr) > 0));
	rcu_read_unlock();

	return bkey_deleted(k.k);
}
/*
 * bch2_extent_normalize_by_opts - clean up an extent, dropping stale pointers etc.
 *
 * Like bch2_extent_normalize(), but also only keeps a single cached pointer on
 * the promote target.
 */
bool bch2_extent_normalize_by_opts(struct bch_fs *c,
				   struct bch_io_opts *opts,
				   struct bkey_s k)
{
	struct bkey_ptrs ptrs;
	bool have_cached_ptr;

	rcu_read_lock();
restart_drop_ptrs:
	ptrs = bch2_bkey_ptrs(k);
	have_cached_ptr = false;

	bkey_for_each_ptr(ptrs, ptr)
		if (ptr->cached) {
			if (have_cached_ptr || !want_cached_ptr(c, opts, ptr)) {
				bch2_bkey_drop_ptr(k, ptr);
				goto restart_drop_ptrs;
			}

			have_cached_ptr = true;
		}
	rcu_read_unlock();

	return bkey_deleted(k.k);
}
void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struct bch_extent_ptr *ptr)
{
	out->atomic++;
	rcu_read_lock();
	struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
	if (!ca) {
		prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev,
			   (u64) ptr->offset, ptr->gen,
			   ptr->cached ? " cached" : "");
	} else {
		u32 offset;
		u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset);

		prt_printf(out, "ptr: %u:%llu:%u gen %u",
			   ptr->dev, b, offset, ptr->gen);
		if (ca->mi.durability != 1)
			prt_printf(out, " d=%u", ca->mi.durability);
		if (ptr->cached)
			prt_str(out, " cached");
		if (ptr->unwritten)
			prt_str(out, " unwritten");
		int stale = dev_ptr_stale_rcu(ca, ptr);
		if (stale > 0)
			prt_printf(out, " stale");
		else if (stale)
			prt_printf(out, " invalid");
	}
	rcu_read_unlock();
	--out->atomic;
}
void bch2_extent_crc_unpacked_to_text(struct printbuf *out, struct bch_extent_crc_unpacked *crc)
{
	prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum ",
		   crc->compressed_size,
		   crc->uncompressed_size,
		   crc->offset, crc->nonce);
	bch2_prt_csum_type(out, crc->csum_type);
	prt_printf(out, " %0llx:%0llx ", crc->csum.hi, crc->csum.lo);
	prt_str(out, " compress ");
	bch2_prt_compression_type(out, crc->compression_type);
}
void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
			    struct bkey_s_c k)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	bool first = true;

	if (c)
		prt_printf(out, "durability: %u ", bch2_bkey_durability_safe(c, k));

	bkey_extent_entry_for_each(ptrs, entry) {
		if (!first)
			prt_printf(out, " ");

		switch (__extent_entry_type(entry)) {
		case BCH_EXTENT_ENTRY_ptr:
			bch2_extent_ptr_to_text(out, c, entry_to_ptr(entry));
			break;

		case BCH_EXTENT_ENTRY_crc32:
		case BCH_EXTENT_ENTRY_crc64:
		case BCH_EXTENT_ENTRY_crc128: {
			struct bch_extent_crc_unpacked crc =
				bch2_extent_crc_unpack(k.k, entry_to_crc(entry));

			bch2_extent_crc_unpacked_to_text(out, &crc);
			break;
		}
		case BCH_EXTENT_ENTRY_stripe_ptr: {
			const struct bch_extent_stripe_ptr *ec = &entry->stripe_ptr;

			prt_printf(out, "ec: idx %llu block %u",
				   (u64) ec->idx, ec->block);
			break;
		}
		case BCH_EXTENT_ENTRY_rebalance: {
			const struct bch_extent_rebalance *r = &entry->rebalance;

			prt_str(out, "rebalance: target ");
			if (c)
				bch2_target_to_text(out, c, r->target);
			else
				prt_printf(out, "%u", r->target);
			prt_str(out, " compression ");
			bch2_compression_opt_to_text(out, r->compression);
			break;
		}
		default:
			prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
			return;
		}

		first = false;
	}
}
static int extent_ptr_validate(struct bch_fs *c,
			       struct bkey_s_c k,
			       enum bch_validate_flags flags,
			       const struct bch_extent_ptr *ptr,
			       unsigned size_ondisk,
			       bool metadata)
{
	int ret = 0;

	/* bad pointers are repaired by check_fix_ptrs(): */
	rcu_read_lock();
	struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
	if (!ca) {
		rcu_read_unlock();
		return 0;
	}
	u32 bucket_offset;
	u64 bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset);
	unsigned first_bucket	= ca->mi.first_bucket;
	u64 nbuckets		= ca->mi.nbuckets;
	unsigned bucket_size	= ca->mi.bucket_size;
	rcu_read_unlock();

	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	bkey_for_each_ptr(ptrs, ptr2)
		bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev,
				 c, ptr_to_duplicate_device,
				 "multiple pointers to same device (%u)", ptr->dev);

	bkey_fsck_err_on(bucket >= nbuckets,
			 c, ptr_after_last_bucket,
			 "pointer past last bucket (%llu > %llu)", bucket, nbuckets);
	bkey_fsck_err_on(bucket < first_bucket,
			 c, ptr_before_first_bucket,
			 "pointer before first bucket (%llu < %u)", bucket, first_bucket);
	bkey_fsck_err_on(bucket_offset + size_ondisk > bucket_size,
			 c, ptr_spans_multiple_buckets,
			 "pointer spans multiple buckets (%u + %u > %u)",
			 bucket_offset, size_ondisk, bucket_size);
fsck_err:
	return ret;
}
int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k,
			    enum bch_validate_flags flags)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct bch_extent_crc_unpacked crc;
	unsigned size_ondisk = k.k->size;
	unsigned nonce = UINT_MAX;
	unsigned nr_ptrs = 0;
	bool have_written = false, have_unwritten = false, have_ec = false, crc_since_last_ptr = false;
	int ret = 0;

	if (bkey_is_btree_ptr(k.k))
		size_ondisk = btree_sectors(c);

	bkey_extent_entry_for_each(ptrs, entry) {
		bkey_fsck_err_on(__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX,
				 c, extent_ptrs_invalid_entry,
				 "invalid extent entry type (got %u, max %u)",
				 __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX);

		bkey_fsck_err_on(bkey_is_btree_ptr(k.k) &&
				 !extent_entry_is_ptr(entry),
				 c, btree_ptr_has_non_ptr,
				 "has non ptr field");

		switch (extent_entry_type(entry)) {
		case BCH_EXTENT_ENTRY_ptr:
			ret = extent_ptr_validate(c, k, flags, &entry->ptr, size_ondisk, false);
			if (ret)
				return ret;

			bkey_fsck_err_on(entry->ptr.cached && have_ec,
					 c, ptr_cached_and_erasure_coded,
					 "cached, erasure coded ptr");

			if (!entry->ptr.unwritten)
				have_written = true;
			else
				have_unwritten = true;

			have_ec = false;
			crc_since_last_ptr = false;
			nr_ptrs++;
			break;
		case BCH_EXTENT_ENTRY_crc32:
		case BCH_EXTENT_ENTRY_crc64:
		case BCH_EXTENT_ENTRY_crc128:
			crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));

			bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size,
					 c, ptr_crc_uncompressed_size_too_small,
					 "checksum offset + key size > uncompressed size");
			bkey_fsck_err_on(!bch2_checksum_type_valid(c, crc.csum_type),
					 c, ptr_crc_csum_type_unknown,
					 "invalid checksum type");
			bkey_fsck_err_on(crc.compression_type >= BCH_COMPRESSION_TYPE_NR,
					 c, ptr_crc_compression_type_unknown,
					 "invalid compression type");

			if (bch2_csum_type_is_encryption(crc.csum_type)) {
				if (nonce == UINT_MAX)
					nonce = crc.offset + crc.nonce;
				else if (nonce != crc.offset + crc.nonce)
					bkey_fsck_err(c, ptr_crc_nonce_mismatch,
						      "incorrect nonce");
			}

			bkey_fsck_err_on(crc_since_last_ptr,
					 c, ptr_crc_redundant,
					 "redundant crc entry");
			crc_since_last_ptr = true;

			bkey_fsck_err_on(crc_is_encoded(crc) &&
					 (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) &&
					 (flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)),
					 c, ptr_crc_uncompressed_size_too_big,
					 "too large encoded extent");

			size_ondisk = crc.compressed_size;
			break;
		case BCH_EXTENT_ENTRY_stripe_ptr:
			bkey_fsck_err_on(have_ec,
					 c, ptr_stripe_redundant,
					 "redundant stripe entry");
			have_ec = true;
			break;
		case BCH_EXTENT_ENTRY_rebalance: {
			/*
			 * this shouldn't be a fsck error, for forward
			 * compatibility; the rebalance code should just refetch
			 * the compression opt if it's unknown
			 */
#if 0
			const struct bch_extent_rebalance *r = &entry->rebalance;

			if (!bch2_compression_opt_valid(r->compression)) {
				struct bch_compression_opt opt = __bch2_compression_decode(r->compression);
				prt_printf(err, "invalid compression opt %u:%u",
					   opt.type, opt.level);
				return -BCH_ERR_invalid_bkey;
			}
#endif
			break;
		}
		}
	}

	bkey_fsck_err_on(!nr_ptrs,
			 c, extent_ptrs_no_ptrs,
			 "no ptrs");
	bkey_fsck_err_on(nr_ptrs > BCH_BKEY_PTRS_MAX,
			 c, extent_ptrs_too_many_ptrs,
			 "too many ptrs: %u > %u", nr_ptrs, BCH_BKEY_PTRS_MAX);
	bkey_fsck_err_on(have_written && have_unwritten,
			 c, extent_ptrs_written_and_unwritten,
			 "extent with unwritten and written ptrs");
	bkey_fsck_err_on(k.k->type != KEY_TYPE_extent && have_unwritten,
			 c, extent_ptrs_unwritten,
			 "has unwritten ptrs");
	bkey_fsck_err_on(crc_since_last_ptr,
			 c, extent_ptrs_redundant_crc,
			 "redundant crc entry");
	bkey_fsck_err_on(have_ec,
			 c, extent_ptrs_redundant_stripe,
			 "redundant stripe entry");
fsck_err:
	return ret;
}
void bch2_ptr_swab(struct bkey_s k)
{
	struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
	union bch_extent_entry *entry;
	u64 *d;

	for (d =  (u64 *) ptrs.start;
	     d != (u64 *) ptrs.end;
	     d++)
		*d = swab64(*d);

	for (entry = ptrs.start;
	     entry < ptrs.end;
	     entry = extent_entry_next(entry)) {
		switch (__extent_entry_type(entry)) {
		case BCH_EXTENT_ENTRY_ptr:
			break;
		case BCH_EXTENT_ENTRY_crc32:
			entry->crc32.csum = swab32(entry->crc32.csum);
			break;
		case BCH_EXTENT_ENTRY_crc64:
			entry->crc64.csum_hi = swab16(entry->crc64.csum_hi);
			entry->crc64.csum_lo = swab64(entry->crc64.csum_lo);
			break;
		case BCH_EXTENT_ENTRY_crc128:
			entry->crc128.csum.hi = (__force __le64)
				swab64((__force u64) entry->crc128.csum.hi);
			entry->crc128.csum.lo = (__force __le64)
				swab64((__force u64) entry->crc128.csum.lo);
			break;
		case BCH_EXTENT_ENTRY_stripe_ptr:
			break;
		case BCH_EXTENT_ENTRY_rebalance:
			break;
		default:
			/* Bad entry type: will be caught by validate() */
			return;
		}
	}
}
const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;

	bkey_extent_entry_for_each(ptrs, entry)
		if (__extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance)
			return &entry->rebalance;

	return NULL;
}
unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, struct bkey_s_c k,
				       unsigned target, unsigned compression)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	unsigned rewrite_ptrs = 0;

	if (compression) {
		unsigned compression_type = bch2_compression_opt_to_type(compression);
		const union bch_extent_entry *entry;
		struct extent_ptr_decoded p;
		unsigned i = 0;

		bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
			if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible ||
			    p.ptr.unwritten) {
				rewrite_ptrs = 0;
				goto incompressible;
			}

			if (!p.ptr.cached && p.crc.compression_type != compression_type)
				rewrite_ptrs |= 1U << i;
			i++;
		}
	}
incompressible:
	if (target && bch2_target_accepts_data(c, BCH_DATA_user, target)) {
		unsigned i = 0;

		bkey_for_each_ptr(ptrs, ptr) {
			if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, target))
				rewrite_ptrs |= 1U << i;
			i++;
		}
	}

	return rewrite_ptrs;
}
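/*
 * The return value above is a bitmap over the key's pointers: bit i set means
 * pointer i needs to be rewritten, either because it isn't compressed with
 * the requested compression type or because it doesn't live on the background
 * target. Incompressible or unwritten data clears the compression-related
 * bits.
 */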
bool bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k)
{
	const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k);

	/*
	 * If it's an indirect extent, we don't delete the rebalance entry when
	 * done so that we know what options were applied - check if it still
	 * needs work done:
	 */
	if (r &&
	    k.k->type == KEY_TYPE_reflink_v &&
	    !bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression))
		r = NULL;

	return r != NULL;
}
static u64 __bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k,
					      unsigned target, unsigned compression)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	u64 sectors = 0;

	if (compression) {
		unsigned compression_type = bch2_compression_opt_to_type(compression);

		bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
			if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible ||
			    p.ptr.unwritten) {
				sectors = 0;
				goto incompressible;
			}

			if (!p.ptr.cached && p.crc.compression_type != compression_type)
				sectors += p.crc.compressed_size;
		}
	}
incompressible:
	if (target && bch2_target_accepts_data(c, BCH_DATA_user, target)) {
		bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
			if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, target))
				sectors += p.crc.compressed_size;
	}

	return sectors;
}
bch2_bkey_sectors_need_rebalance(struct bch_fs
*c
, struct bkey_s_c k
)
1495 const struct bch_extent_rebalance
*r
= bch2_bkey_rebalance_opts(k
);
1497 return r
? __bch2_bkey_sectors_need_rebalance(c
, k
, r
->target
, r
->compression
) : 0;
int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k,
				  struct bch_io_opts *opts)
{
	struct bkey_s k = bkey_i_to_s(_k);
	struct bch_extent_rebalance *r;
	unsigned target = opts->background_target;
	unsigned compression = background_compression(*opts);
	bool needs_rebalance;

	if (!bkey_extent_is_direct_data(k.k))
		return 0;

	/* get existing rebalance entry: */
	r = (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c);
	if (r) {
		if (k.k->type == KEY_TYPE_reflink_v) {
			/*
			 * indirect extents: existing options take precedence,
			 * so that we don't move extents back and forth if
			 * they're referenced by different inodes with different
			 * options:
			 */
			if (r->target)
				target = r->target;
			if (r->compression)
				compression = r->compression;
		}

		r->target	= target;
		r->compression	= compression;
	}

	needs_rebalance = bch2_bkey_ptrs_need_rebalance(c, k.s_c, target, compression);

	if (needs_rebalance && !r) {
		union bch_extent_entry *new = bkey_val_end(k);

		new->rebalance.type		= 1U << BCH_EXTENT_ENTRY_rebalance;
		new->rebalance.compression	= compression;
		new->rebalance.target		= target;
		new->rebalance.unused		= 0;
		k.k->u64s += extent_entry_u64s(new);
	} else if (!needs_rebalance && r && k.k->type != KEY_TYPE_reflink_v) {
		/*
		 * For indirect extents, don't delete the rebalance entry when
		 * we're finished so that we know we specifically moved it or
		 * compressed it to its current location/compression type
		 */
		extent_entry_drop(k, (union bch_extent_entry *) r);
	}

	return 0;
}
/* Generic extent code: */

int bch2_cut_front_s(struct bpos where, struct bkey_s k)
{
	unsigned new_val_u64s = bkey_val_u64s(k.k);
	int val_u64s_delta;
	u64 sub;

	if (bkey_le(where, bkey_start_pos(k.k)))
		return 0;

	EBUG_ON(bkey_gt(where, k.k->p));

	sub = where.offset - bkey_start_offset(k.k);

	k.k->size -= sub;

	if (!k.k->size) {
		k.k->type = KEY_TYPE_deleted;
		new_val_u64s = 0;
	}

	switch (k.k->type) {
	case KEY_TYPE_extent:
	case KEY_TYPE_reflink_v: {
		struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
		union bch_extent_entry *entry;
		bool seen_crc = false;

		bkey_extent_entry_for_each(ptrs, entry) {
			switch (extent_entry_type(entry)) {
			case BCH_EXTENT_ENTRY_ptr:
				if (!seen_crc)
					entry->ptr.offset += sub;
				break;
			case BCH_EXTENT_ENTRY_crc32:
				entry->crc32.offset += sub;
				break;
			case BCH_EXTENT_ENTRY_crc64:
				entry->crc64.offset += sub;
				break;
			case BCH_EXTENT_ENTRY_crc128:
				entry->crc128.offset += sub;
				break;
			case BCH_EXTENT_ENTRY_stripe_ptr:
				break;
			case BCH_EXTENT_ENTRY_rebalance:
				break;
			}

			if (extent_entry_is_crc(entry))
				seen_crc = true;
		}

		break;
	}
	case KEY_TYPE_reflink_p: {
		struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k);

		le64_add_cpu(&p.v->idx, sub);
		break;
	}
	case KEY_TYPE_inline_data:
	case KEY_TYPE_indirect_inline_data: {
		void *p = bkey_inline_data_p(k);
		unsigned bytes = bkey_inline_data_bytes(k.k);

		sub = min_t(u64, sub << 9, bytes);

		memmove(p, p + sub, bytes - sub);

		new_val_u64s -= sub >> 3;
		break;
	}
	}

	val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s;
	BUG_ON(val_u64s_delta < 0);

	set_bkey_val_u64s(k.k, new_val_u64s);
	memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64));
	return -val_u64s_delta;
}
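/*
 * Both bch2_cut_front_s() and bch2_cut_back_s() return the (non-positive)
 * change in the value's size in u64s, i.e. -val_u64s_delta, and zero the
 * freed tail of the value.
 */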
int bch2_cut_back_s(struct bpos where, struct bkey_s k)
{
	unsigned new_val_u64s = bkey_val_u64s(k.k);
	int val_u64s_delta;
	u64 len = 0;

	if (bkey_ge(where, k.k->p))
		return 0;

	EBUG_ON(bkey_lt(where, bkey_start_pos(k.k)));

	len = where.offset - bkey_start_offset(k.k);

	k.k->p.offset = where.offset;
	k.k->size = len;

	if (!len) {
		k.k->type = KEY_TYPE_deleted;
		new_val_u64s = 0;
	}

	switch (k.k->type) {
	case KEY_TYPE_inline_data:
	case KEY_TYPE_indirect_inline_data:
		new_val_u64s = (bkey_inline_data_offset(k.k) +
				min(bkey_inline_data_bytes(k.k), k.k->size << 9)) >> 3;
		break;
	}

	val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s;
	BUG_ON(val_u64s_delta < 0);

	set_bkey_val_u64s(k.k, new_val_u64s);
	memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64));
	return -val_u64s_delta;
}