1 // SPDX-License-Identifier: GPL-2.0+
3 * Copyright (C) 2016 Oracle. All Rights Reserved.
4 * Author: Darrick J. Wong <darrick.wong@oracle.com>
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_log_format.h"
11 #include "xfs_trans_resv.h"
12 #include "xfs_mount.h"
13 #include "xfs_defer.h"
14 #include "xfs_btree.h"
16 #include "xfs_refcount_btree.h"
17 #include "xfs_alloc.h"
18 #include "xfs_errortag.h"
19 #include "xfs_error.h"
20 #include "xfs_trace.h"
21 #include "xfs_trans.h"
23 #include "xfs_refcount.h"
26 #include "xfs_health.h"
27 #include "xfs_refcount_item.h"
/* Slab cache for struct xfs_refcount_intent objects (see kmem_cache_alloc below). */
struct kmem_cache	*xfs_refcount_intent_cache;
/* Allowable refcount adjustment amounts. */
enum xfs_refc_adjust_op {
	XFS_REFCOUNT_ADJUST_INCREASE	= 1,
	XFS_REFCOUNT_ADJUST_DECREASE	= -1,
	XFS_REFCOUNT_ADJUST_COW_ALLOC	= 0,
	XFS_REFCOUNT_ADJUST_COW_FREE	= -1,
};
39 STATIC
int __xfs_refcount_cow_alloc(struct xfs_btree_cur
*rcur
,
40 xfs_agblock_t agbno
, xfs_extlen_t aglen
);
41 STATIC
int __xfs_refcount_cow_free(struct xfs_btree_cur
*rcur
,
42 xfs_agblock_t agbno
, xfs_extlen_t aglen
);
45 * Look up the first record less than or equal to [bno, len] in the btree
49 xfs_refcount_lookup_le(
50 struct xfs_btree_cur
*cur
,
51 enum xfs_refc_domain domain
,
55 trace_xfs_refcount_lookup(cur
,
56 xfs_refcount_encode_startblock(bno
, domain
),
58 cur
->bc_rec
.rc
.rc_startblock
= bno
;
59 cur
->bc_rec
.rc
.rc_blockcount
= 0;
60 cur
->bc_rec
.rc
.rc_domain
= domain
;
61 return xfs_btree_lookup(cur
, XFS_LOOKUP_LE
, stat
);
65 * Look up the first record greater than or equal to [bno, len] in the btree
69 xfs_refcount_lookup_ge(
70 struct xfs_btree_cur
*cur
,
71 enum xfs_refc_domain domain
,
75 trace_xfs_refcount_lookup(cur
,
76 xfs_refcount_encode_startblock(bno
, domain
),
78 cur
->bc_rec
.rc
.rc_startblock
= bno
;
79 cur
->bc_rec
.rc
.rc_blockcount
= 0;
80 cur
->bc_rec
.rc
.rc_domain
= domain
;
81 return xfs_btree_lookup(cur
, XFS_LOOKUP_GE
, stat
);
85 * Look up the first record equal to [bno, len] in the btree
89 xfs_refcount_lookup_eq(
90 struct xfs_btree_cur
*cur
,
91 enum xfs_refc_domain domain
,
95 trace_xfs_refcount_lookup(cur
,
96 xfs_refcount_encode_startblock(bno
, domain
),
98 cur
->bc_rec
.rc
.rc_startblock
= bno
;
99 cur
->bc_rec
.rc
.rc_blockcount
= 0;
100 cur
->bc_rec
.rc
.rc_domain
= domain
;
101 return xfs_btree_lookup(cur
, XFS_LOOKUP_EQ
, stat
);
104 /* Convert on-disk record to in-core format. */
106 xfs_refcount_btrec_to_irec(
107 const union xfs_btree_rec
*rec
,
108 struct xfs_refcount_irec
*irec
)
112 start
= be32_to_cpu(rec
->refc
.rc_startblock
);
113 if (start
& XFS_REFC_COWFLAG
) {
114 start
&= ~XFS_REFC_COWFLAG
;
115 irec
->rc_domain
= XFS_REFC_DOMAIN_COW
;
117 irec
->rc_domain
= XFS_REFC_DOMAIN_SHARED
;
120 irec
->rc_startblock
= start
;
121 irec
->rc_blockcount
= be32_to_cpu(rec
->refc
.rc_blockcount
);
122 irec
->rc_refcount
= be32_to_cpu(rec
->refc
.rc_refcount
);
125 /* Simple checks for refcount records. */
127 xfs_refcount_check_irec(
128 struct xfs_perag
*pag
,
129 const struct xfs_refcount_irec
*irec
)
131 if (irec
->rc_blockcount
== 0 || irec
->rc_blockcount
> MAXREFCEXTLEN
)
132 return __this_address
;
134 if (!xfs_refcount_check_domain(irec
))
135 return __this_address
;
137 /* check for valid extent range, including overflow */
138 if (!xfs_verify_agbext(pag
, irec
->rc_startblock
, irec
->rc_blockcount
))
139 return __this_address
;
141 if (irec
->rc_refcount
== 0 || irec
->rc_refcount
> MAXREFCOUNT
)
142 return __this_address
;
148 xfs_refcount_complain_bad_rec(
149 struct xfs_btree_cur
*cur
,
151 const struct xfs_refcount_irec
*irec
)
153 struct xfs_mount
*mp
= cur
->bc_mp
;
156 "Refcount BTree record corruption in AG %d detected at %pS!",
157 cur
->bc_group
->xg_gno
, fa
);
159 "Start block 0x%x, block count 0x%x, references 0x%x",
160 irec
->rc_startblock
, irec
->rc_blockcount
, irec
->rc_refcount
);
161 xfs_btree_mark_sick(cur
);
162 return -EFSCORRUPTED
;
166 * Get the data from the pointed-to record.
169 xfs_refcount_get_rec(
170 struct xfs_btree_cur
*cur
,
171 struct xfs_refcount_irec
*irec
,
174 union xfs_btree_rec
*rec
;
178 error
= xfs_btree_get_rec(cur
, &rec
, stat
);
182 xfs_refcount_btrec_to_irec(rec
, irec
);
183 fa
= xfs_refcount_check_irec(to_perag(cur
->bc_group
), irec
);
185 return xfs_refcount_complain_bad_rec(cur
, fa
, irec
);
187 trace_xfs_refcount_get(cur
, irec
);
192 * Update the record referred to by cur to the value given
193 * by [bno, len, refcount].
194 * This either works (return 0) or gets an EFSCORRUPTED error.
198 struct xfs_btree_cur
*cur
,
199 struct xfs_refcount_irec
*irec
)
201 union xfs_btree_rec rec
;
205 trace_xfs_refcount_update(cur
, irec
);
207 start
= xfs_refcount_encode_startblock(irec
->rc_startblock
,
209 rec
.refc
.rc_startblock
= cpu_to_be32(start
);
210 rec
.refc
.rc_blockcount
= cpu_to_be32(irec
->rc_blockcount
);
211 rec
.refc
.rc_refcount
= cpu_to_be32(irec
->rc_refcount
);
213 error
= xfs_btree_update(cur
, &rec
);
215 trace_xfs_refcount_update_error(cur
, error
, _RET_IP_
);
220 * Insert the record referred to by cur to the value given
221 * by [bno, len, refcount].
222 * This either works (return 0) or gets an EFSCORRUPTED error.
226 struct xfs_btree_cur
*cur
,
227 struct xfs_refcount_irec
*irec
,
232 trace_xfs_refcount_insert(cur
, irec
);
234 cur
->bc_rec
.rc
.rc_startblock
= irec
->rc_startblock
;
235 cur
->bc_rec
.rc
.rc_blockcount
= irec
->rc_blockcount
;
236 cur
->bc_rec
.rc
.rc_refcount
= irec
->rc_refcount
;
237 cur
->bc_rec
.rc
.rc_domain
= irec
->rc_domain
;
239 error
= xfs_btree_insert(cur
, i
);
242 if (XFS_IS_CORRUPT(cur
->bc_mp
, *i
!= 1)) {
243 xfs_btree_mark_sick(cur
);
244 error
= -EFSCORRUPTED
;
250 trace_xfs_refcount_insert_error(cur
, error
, _RET_IP_
);
255 * Remove the record referred to by cur, then set the pointer to the spot
256 * where the record could be re-inserted, in case we want to increment or
257 * decrement the cursor.
258 * This either works (return 0) or gets an EFSCORRUPTED error.
262 struct xfs_btree_cur
*cur
,
265 struct xfs_refcount_irec irec
;
269 error
= xfs_refcount_get_rec(cur
, &irec
, &found_rec
);
272 if (XFS_IS_CORRUPT(cur
->bc_mp
, found_rec
!= 1)) {
273 xfs_btree_mark_sick(cur
);
274 error
= -EFSCORRUPTED
;
277 trace_xfs_refcount_delete(cur
, &irec
);
278 error
= xfs_btree_delete(cur
, i
);
279 if (XFS_IS_CORRUPT(cur
->bc_mp
, *i
!= 1)) {
280 xfs_btree_mark_sick(cur
);
281 error
= -EFSCORRUPTED
;
286 error
= xfs_refcount_lookup_ge(cur
, irec
.rc_domain
, irec
.rc_startblock
,
290 trace_xfs_refcount_delete_error(cur
, error
, _RET_IP_
);
/*
 * Adjusting the Reference Count
 *
 * As stated elsewhere, the reference count btree (refcbt) stores
 * >1 reference counts for extents of physical blocks.  In this
 * operation, we're either raising or lowering the reference count of
 * some subrange stored in the tree:
 *
 *      <------ adjustment range ------>
 * ----+   +---+-----+ +--+--------+---------
 *  2  |   | 3 |  4  | |17|   55   |   10
 * ----+   +---+-----+ +--+--------+---------
 * X axis is physical blocks number;
 * reference counts are the numbers inside the rectangles
 *
 * The first thing we need to do is to ensure that there are no
 * refcount extents crossing either boundary of the range to be
 * adjusted.  For any extent that does cross a boundary, split it into
 * two extents so that we can increment the refcount of one of the
 * pieces later:
 *
 *      <------ adjustment range ------>
 * ----+   +---+-----+ +--+--------+----+----
 *  2  |   | 3 |  2  | |17|   55   | 10 | 10
 * ----+   +---+-----+ +--+--------+----+----
 *
 * For this next step, let's assume that all the physical blocks in
 * the adjustment range are mapped to a file and are therefore in use
 * at least once.  Therefore, we can infer that any gap in the
 * refcount tree within the adjustment range represents a physical
 * extent with refcount == 1:
 *
 *      <------ adjustment range ------>
 * ----+---+---+-----+-+--+--------+----+----
 *  2  |"1"| 3 |  2  |1|17|   55   | 10 | 10
 * ----+---+---+-----+-+--+--------+----+----
 *
 * For each extent that falls within the interval range, figure out
 * which extent is to the left or the right of that extent.  Now we
 * have a left, current, and right extent.  If the new reference count
 * of the center extent enables us to merge left, center, and right
 * into one record covering all three, do so.  If the center extent is
 * at the left end of the range, abuts the left extent, and its new
 * reference count matches the left extent's record, then merge them.
 * If the center extent is at the right end of the range, abuts the
 * right extent, and the reference counts match, merge those.  In the
 * example, we can left merge (assuming an increment operation):
 *
 *      <------ adjustment range ------>
 * --------+---+-----+-+--+--------+----+----
 *    2    | 3 |  2  |1|17|   55   | 10 | 10
 * --------+---+-----+-+--+--------+----+----
 *
 * For all other extents within the range, adjust the reference count
 * or delete it if the refcount falls below 2.  If we were
 * incrementing, the end result looks like this:
 *
 *      <------ adjustment range ------>
 * --------+---+-----+-+--+--------+----+----
 *    2    | 4 |  3  |2|18|   56   | 11 | 10
 * --------+---+-----+-+--+--------+----+----
 *
 * The result of a decrement operation looks as such:
 *
 *      <------ adjustment range ------>
 * ----+   +---+       +--+--------+----+----
 *  2  |   | 2 |       |16|   54   |  9 | 10
 * ----+   +---+       +--+--------+----+----
 *      DDDD    111111DD
 *
 * The blocks marked "D" are freed; the blocks marked "1" are only
 * referenced once and therefore the record is removed from the
 * refcount btree.
 */
371 /* Next block after this extent. */
372 static inline xfs_agblock_t
374 struct xfs_refcount_irec
*rc
)
376 return rc
->rc_startblock
+ rc
->rc_blockcount
;
380 * Split a refcount extent that crosses agbno.
383 xfs_refcount_split_extent(
384 struct xfs_btree_cur
*cur
,
385 enum xfs_refc_domain domain
,
389 struct xfs_refcount_irec rcext
, tmp
;
393 *shape_changed
= false;
394 error
= xfs_refcount_lookup_le(cur
, domain
, agbno
, &found_rec
);
400 error
= xfs_refcount_get_rec(cur
, &rcext
, &found_rec
);
403 if (XFS_IS_CORRUPT(cur
->bc_mp
, found_rec
!= 1)) {
404 xfs_btree_mark_sick(cur
);
405 error
= -EFSCORRUPTED
;
408 if (rcext
.rc_domain
!= domain
)
410 if (rcext
.rc_startblock
== agbno
|| xfs_refc_next(&rcext
) <= agbno
)
413 *shape_changed
= true;
414 trace_xfs_refcount_split_extent(cur
, &rcext
, agbno
);
416 /* Establish the right extent. */
418 tmp
.rc_startblock
= agbno
;
419 tmp
.rc_blockcount
-= (agbno
- rcext
.rc_startblock
);
420 error
= xfs_refcount_update(cur
, &tmp
);
424 /* Insert the left extent. */
426 tmp
.rc_blockcount
= agbno
- rcext
.rc_startblock
;
427 error
= xfs_refcount_insert(cur
, &tmp
, &found_rec
);
430 if (XFS_IS_CORRUPT(cur
->bc_mp
, found_rec
!= 1)) {
431 xfs_btree_mark_sick(cur
);
432 error
= -EFSCORRUPTED
;
438 trace_xfs_refcount_split_extent_error(cur
, error
, _RET_IP_
);
443 * Merge the left, center, and right extents.
446 xfs_refcount_merge_center_extents(
447 struct xfs_btree_cur
*cur
,
448 struct xfs_refcount_irec
*left
,
449 struct xfs_refcount_irec
*center
,
450 struct xfs_refcount_irec
*right
,
451 unsigned long long extlen
,
457 trace_xfs_refcount_merge_center_extents(cur
, left
, center
, right
);
459 ASSERT(left
->rc_domain
== center
->rc_domain
);
460 ASSERT(right
->rc_domain
== center
->rc_domain
);
463 * Make sure the center and right extents are not in the btree.
464 * If the center extent was synthesized, the first delete call
465 * removes the right extent and we skip the second deletion.
466 * If center and right were in the btree, then the first delete
467 * call removes the center and the second one removes the right
470 error
= xfs_refcount_lookup_ge(cur
, center
->rc_domain
,
471 center
->rc_startblock
, &found_rec
);
474 if (XFS_IS_CORRUPT(cur
->bc_mp
, found_rec
!= 1)) {
475 xfs_btree_mark_sick(cur
);
476 error
= -EFSCORRUPTED
;
480 error
= xfs_refcount_delete(cur
, &found_rec
);
483 if (XFS_IS_CORRUPT(cur
->bc_mp
, found_rec
!= 1)) {
484 xfs_btree_mark_sick(cur
);
485 error
= -EFSCORRUPTED
;
489 if (center
->rc_refcount
> 1) {
490 error
= xfs_refcount_delete(cur
, &found_rec
);
493 if (XFS_IS_CORRUPT(cur
->bc_mp
, found_rec
!= 1)) {
494 xfs_btree_mark_sick(cur
);
495 error
= -EFSCORRUPTED
;
500 /* Enlarge the left extent. */
501 error
= xfs_refcount_lookup_le(cur
, left
->rc_domain
,
502 left
->rc_startblock
, &found_rec
);
505 if (XFS_IS_CORRUPT(cur
->bc_mp
, found_rec
!= 1)) {
506 xfs_btree_mark_sick(cur
);
507 error
= -EFSCORRUPTED
;
511 left
->rc_blockcount
= extlen
;
512 error
= xfs_refcount_update(cur
, left
);
520 trace_xfs_refcount_merge_center_extents_error(cur
, error
, _RET_IP_
);
525 * Merge with the left extent.
528 xfs_refcount_merge_left_extent(
529 struct xfs_btree_cur
*cur
,
530 struct xfs_refcount_irec
*left
,
531 struct xfs_refcount_irec
*cleft
,
532 xfs_agblock_t
*agbno
,
538 trace_xfs_refcount_merge_left_extent(cur
, left
, cleft
);
540 ASSERT(left
->rc_domain
== cleft
->rc_domain
);
542 /* If the extent at agbno (cleft) wasn't synthesized, remove it. */
543 if (cleft
->rc_refcount
> 1) {
544 error
= xfs_refcount_lookup_le(cur
, cleft
->rc_domain
,
545 cleft
->rc_startblock
, &found_rec
);
548 if (XFS_IS_CORRUPT(cur
->bc_mp
, found_rec
!= 1)) {
549 xfs_btree_mark_sick(cur
);
550 error
= -EFSCORRUPTED
;
554 error
= xfs_refcount_delete(cur
, &found_rec
);
557 if (XFS_IS_CORRUPT(cur
->bc_mp
, found_rec
!= 1)) {
558 xfs_btree_mark_sick(cur
);
559 error
= -EFSCORRUPTED
;
564 /* Enlarge the left extent. */
565 error
= xfs_refcount_lookup_le(cur
, left
->rc_domain
,
566 left
->rc_startblock
, &found_rec
);
569 if (XFS_IS_CORRUPT(cur
->bc_mp
, found_rec
!= 1)) {
570 xfs_btree_mark_sick(cur
);
571 error
= -EFSCORRUPTED
;
575 left
->rc_blockcount
+= cleft
->rc_blockcount
;
576 error
= xfs_refcount_update(cur
, left
);
580 *agbno
+= cleft
->rc_blockcount
;
581 *aglen
-= cleft
->rc_blockcount
;
585 trace_xfs_refcount_merge_left_extent_error(cur
, error
, _RET_IP_
);
590 * Merge with the right extent.
593 xfs_refcount_merge_right_extent(
594 struct xfs_btree_cur
*cur
,
595 struct xfs_refcount_irec
*right
,
596 struct xfs_refcount_irec
*cright
,
602 trace_xfs_refcount_merge_right_extent(cur
, cright
, right
);
604 ASSERT(right
->rc_domain
== cright
->rc_domain
);
607 * If the extent ending at agbno+aglen (cright) wasn't synthesized,
610 if (cright
->rc_refcount
> 1) {
611 error
= xfs_refcount_lookup_le(cur
, cright
->rc_domain
,
612 cright
->rc_startblock
, &found_rec
);
615 if (XFS_IS_CORRUPT(cur
->bc_mp
, found_rec
!= 1)) {
616 xfs_btree_mark_sick(cur
);
617 error
= -EFSCORRUPTED
;
621 error
= xfs_refcount_delete(cur
, &found_rec
);
624 if (XFS_IS_CORRUPT(cur
->bc_mp
, found_rec
!= 1)) {
625 xfs_btree_mark_sick(cur
);
626 error
= -EFSCORRUPTED
;
631 /* Enlarge the right extent. */
632 error
= xfs_refcount_lookup_le(cur
, right
->rc_domain
,
633 right
->rc_startblock
, &found_rec
);
636 if (XFS_IS_CORRUPT(cur
->bc_mp
, found_rec
!= 1)) {
637 xfs_btree_mark_sick(cur
);
638 error
= -EFSCORRUPTED
;
642 right
->rc_startblock
-= cright
->rc_blockcount
;
643 right
->rc_blockcount
+= cright
->rc_blockcount
;
644 error
= xfs_refcount_update(cur
, right
);
648 *aglen
-= cright
->rc_blockcount
;
652 trace_xfs_refcount_merge_right_extent_error(cur
, error
, _RET_IP_
);
657 * Find the left extent and the one after it (cleft). This function assumes
658 * that we've already split any extent crossing agbno.
661 xfs_refcount_find_left_extents(
662 struct xfs_btree_cur
*cur
,
663 struct xfs_refcount_irec
*left
,
664 struct xfs_refcount_irec
*cleft
,
665 enum xfs_refc_domain domain
,
669 struct xfs_refcount_irec tmp
;
673 left
->rc_startblock
= cleft
->rc_startblock
= NULLAGBLOCK
;
674 error
= xfs_refcount_lookup_le(cur
, domain
, agbno
- 1, &found_rec
);
680 error
= xfs_refcount_get_rec(cur
, &tmp
, &found_rec
);
683 if (XFS_IS_CORRUPT(cur
->bc_mp
, found_rec
!= 1)) {
684 xfs_btree_mark_sick(cur
);
685 error
= -EFSCORRUPTED
;
689 if (tmp
.rc_domain
!= domain
)
691 if (xfs_refc_next(&tmp
) != agbno
)
693 /* We have a left extent; retrieve (or invent) the next right one */
696 error
= xfs_btree_increment(cur
, 0, &found_rec
);
700 error
= xfs_refcount_get_rec(cur
, &tmp
, &found_rec
);
703 if (XFS_IS_CORRUPT(cur
->bc_mp
, found_rec
!= 1)) {
704 xfs_btree_mark_sick(cur
);
705 error
= -EFSCORRUPTED
;
709 if (tmp
.rc_domain
!= domain
)
712 /* if tmp starts at the end of our range, just use that */
713 if (tmp
.rc_startblock
== agbno
)
717 * There's a gap in the refcntbt at the start of the
718 * range we're interested in (refcount == 1) so
719 * synthesize the implied extent and pass it back.
720 * We assume here that the agbno/aglen range was
721 * passed in from a data fork extent mapping and
722 * therefore is allocated to exactly one owner.
724 cleft
->rc_startblock
= agbno
;
725 cleft
->rc_blockcount
= min(aglen
,
726 tmp
.rc_startblock
- agbno
);
727 cleft
->rc_refcount
= 1;
728 cleft
->rc_domain
= domain
;
733 * No extents, so pretend that there's one covering the whole
736 cleft
->rc_startblock
= agbno
;
737 cleft
->rc_blockcount
= aglen
;
738 cleft
->rc_refcount
= 1;
739 cleft
->rc_domain
= domain
;
741 trace_xfs_refcount_find_left_extent(cur
, left
, cleft
, agbno
);
745 trace_xfs_refcount_find_left_extent_error(cur
, error
, _RET_IP_
);
750 * Find the right extent and the one before it (cright). This function
751 * assumes that we've already split any extents crossing agbno + aglen.
754 xfs_refcount_find_right_extents(
755 struct xfs_btree_cur
*cur
,
756 struct xfs_refcount_irec
*right
,
757 struct xfs_refcount_irec
*cright
,
758 enum xfs_refc_domain domain
,
762 struct xfs_refcount_irec tmp
;
766 right
->rc_startblock
= cright
->rc_startblock
= NULLAGBLOCK
;
767 error
= xfs_refcount_lookup_ge(cur
, domain
, agbno
+ aglen
, &found_rec
);
773 error
= xfs_refcount_get_rec(cur
, &tmp
, &found_rec
);
776 if (XFS_IS_CORRUPT(cur
->bc_mp
, found_rec
!= 1)) {
777 xfs_btree_mark_sick(cur
);
778 error
= -EFSCORRUPTED
;
782 if (tmp
.rc_domain
!= domain
)
784 if (tmp
.rc_startblock
!= agbno
+ aglen
)
786 /* We have a right extent; retrieve (or invent) the next left one */
789 error
= xfs_btree_decrement(cur
, 0, &found_rec
);
793 error
= xfs_refcount_get_rec(cur
, &tmp
, &found_rec
);
796 if (XFS_IS_CORRUPT(cur
->bc_mp
, found_rec
!= 1)) {
797 xfs_btree_mark_sick(cur
);
798 error
= -EFSCORRUPTED
;
802 if (tmp
.rc_domain
!= domain
)
805 /* if tmp ends at the end of our range, just use that */
806 if (xfs_refc_next(&tmp
) == agbno
+ aglen
)
810 * There's a gap in the refcntbt at the end of the
811 * range we're interested in (refcount == 1) so
812 * create the implied extent and pass it back.
813 * We assume here that the agbno/aglen range was
814 * passed in from a data fork extent mapping and
815 * therefore is allocated to exactly one owner.
817 cright
->rc_startblock
= max(agbno
, xfs_refc_next(&tmp
));
818 cright
->rc_blockcount
= right
->rc_startblock
-
819 cright
->rc_startblock
;
820 cright
->rc_refcount
= 1;
821 cright
->rc_domain
= domain
;
826 * No extents, so pretend that there's one covering the whole
829 cright
->rc_startblock
= agbno
;
830 cright
->rc_blockcount
= aglen
;
831 cright
->rc_refcount
= 1;
832 cright
->rc_domain
= domain
;
834 trace_xfs_refcount_find_right_extent(cur
, cright
, right
,
839 trace_xfs_refcount_find_right_extent_error(cur
, error
, _RET_IP_
);
843 /* Is this extent valid? */
846 const struct xfs_refcount_irec
*rc
)
848 return rc
->rc_startblock
!= NULLAGBLOCK
;
851 static inline xfs_nlink_t
852 xfs_refc_merge_refcount(
853 const struct xfs_refcount_irec
*irec
,
854 enum xfs_refc_adjust_op adjust
)
856 /* Once a record hits MAXREFCOUNT, it is pinned there forever */
857 if (irec
->rc_refcount
== MAXREFCOUNT
)
859 return irec
->rc_refcount
+ adjust
;
863 xfs_refc_want_merge_center(
864 const struct xfs_refcount_irec
*left
,
865 const struct xfs_refcount_irec
*cleft
,
866 const struct xfs_refcount_irec
*cright
,
867 const struct xfs_refcount_irec
*right
,
868 bool cleft_is_cright
,
869 enum xfs_refc_adjust_op adjust
,
870 unsigned long long *ulenp
)
872 unsigned long long ulen
= left
->rc_blockcount
;
873 xfs_nlink_t new_refcount
;
876 * To merge with a center record, both shoulder records must be
877 * adjacent to the record we want to adjust. This is only true if
878 * find_left and find_right made all four records valid.
880 if (!xfs_refc_valid(left
) || !xfs_refc_valid(right
) ||
881 !xfs_refc_valid(cleft
) || !xfs_refc_valid(cright
))
884 /* There must only be one record for the entire range. */
885 if (!cleft_is_cright
)
888 /* The shoulder record refcounts must match the new refcount. */
889 new_refcount
= xfs_refc_merge_refcount(cleft
, adjust
);
890 if (left
->rc_refcount
!= new_refcount
)
892 if (right
->rc_refcount
!= new_refcount
)
896 * The new record cannot exceed the max length. ulen is a ULL as the
897 * individual record block counts can be up to (u32 - 1) in length
898 * hence we need to catch u32 addition overflows here.
900 ulen
+= cleft
->rc_blockcount
+ right
->rc_blockcount
;
901 if (ulen
>= MAXREFCEXTLEN
)
909 xfs_refc_want_merge_left(
910 const struct xfs_refcount_irec
*left
,
911 const struct xfs_refcount_irec
*cleft
,
912 enum xfs_refc_adjust_op adjust
)
914 unsigned long long ulen
= left
->rc_blockcount
;
915 xfs_nlink_t new_refcount
;
918 * For a left merge, the left shoulder record must be adjacent to the
919 * start of the range. If this is true, find_left made left and cleft
920 * contain valid contents.
922 if (!xfs_refc_valid(left
) || !xfs_refc_valid(cleft
))
925 /* Left shoulder record refcount must match the new refcount. */
926 new_refcount
= xfs_refc_merge_refcount(cleft
, adjust
);
927 if (left
->rc_refcount
!= new_refcount
)
931 * The new record cannot exceed the max length. ulen is a ULL as the
932 * individual record block counts can be up to (u32 - 1) in length
933 * hence we need to catch u32 addition overflows here.
935 ulen
+= cleft
->rc_blockcount
;
936 if (ulen
>= MAXREFCEXTLEN
)
943 xfs_refc_want_merge_right(
944 const struct xfs_refcount_irec
*cright
,
945 const struct xfs_refcount_irec
*right
,
946 enum xfs_refc_adjust_op adjust
)
948 unsigned long long ulen
= right
->rc_blockcount
;
949 xfs_nlink_t new_refcount
;
952 * For a right merge, the right shoulder record must be adjacent to the
953 * end of the range. If this is true, find_right made cright and right
954 * contain valid contents.
956 if (!xfs_refc_valid(right
) || !xfs_refc_valid(cright
))
959 /* Right shoulder record refcount must match the new refcount. */
960 new_refcount
= xfs_refc_merge_refcount(cright
, adjust
);
961 if (right
->rc_refcount
!= new_refcount
)
965 * The new record cannot exceed the max length. ulen is a ULL as the
966 * individual record block counts can be up to (u32 - 1) in length
967 * hence we need to catch u32 addition overflows here.
969 ulen
+= cright
->rc_blockcount
;
970 if (ulen
>= MAXREFCEXTLEN
)
977 * Try to merge with any extents on the boundaries of the adjustment range.
980 xfs_refcount_merge_extents(
981 struct xfs_btree_cur
*cur
,
982 enum xfs_refc_domain domain
,
983 xfs_agblock_t
*agbno
,
985 enum xfs_refc_adjust_op adjust
,
988 struct xfs_refcount_irec left
= {0}, cleft
= {0};
989 struct xfs_refcount_irec cright
= {0}, right
= {0};
991 unsigned long long ulen
;
994 *shape_changed
= false;
996 * Find the extent just below agbno [left], just above agbno [cleft],
997 * just below (agbno + aglen) [cright], and just above (agbno + aglen)
1000 error
= xfs_refcount_find_left_extents(cur
, &left
, &cleft
, domain
,
1004 error
= xfs_refcount_find_right_extents(cur
, &right
, &cright
, domain
,
1009 /* No left or right extent to merge; exit. */
1010 if (!xfs_refc_valid(&left
) && !xfs_refc_valid(&right
))
1013 cequal
= (cleft
.rc_startblock
== cright
.rc_startblock
) &&
1014 (cleft
.rc_blockcount
== cright
.rc_blockcount
);
1016 /* Try to merge left, cleft, and right. cleft must == cright. */
1017 if (xfs_refc_want_merge_center(&left
, &cleft
, &cright
, &right
, cequal
,
1019 *shape_changed
= true;
1020 return xfs_refcount_merge_center_extents(cur
, &left
, &cleft
,
1021 &right
, ulen
, aglen
);
1024 /* Try to merge left and cleft. */
1025 if (xfs_refc_want_merge_left(&left
, &cleft
, adjust
)) {
1026 *shape_changed
= true;
1027 error
= xfs_refcount_merge_left_extent(cur
, &left
, &cleft
,
1033 * If we just merged left + cleft and cleft == cright,
1034 * we no longer have a cright to merge with right. We're done.
1040 /* Try to merge cright and right. */
1041 if (xfs_refc_want_merge_right(&cright
, &right
, adjust
)) {
1042 *shape_changed
= true;
1043 return xfs_refcount_merge_right_extent(cur
, &right
, &cright
,
1051 * XXX: This is a pretty hand-wavy estimate. The penalty for guessing
1052 * true incorrectly is a shutdown FS; the penalty for guessing false
1053 * incorrectly is more transaction rolls than might be necessary.
1054 * Be conservative here.
1057 xfs_refcount_still_have_space(
1058 struct xfs_btree_cur
*cur
)
1060 unsigned long overhead
;
1063 * Worst case estimate: full splits of the free space and rmap btrees
1064 * to handle each of the shape changes to the refcount btree.
1066 overhead
= xfs_allocfree_block_count(cur
->bc_mp
,
1067 cur
->bc_refc
.shape_changes
);
1068 overhead
+= cur
->bc_mp
->m_refc_maxlevels
;
1069 overhead
*= cur
->bc_mp
->m_sb
.sb_blocksize
;
1072 * Only allow 2 refcount extent updates per transaction if the
1073 * refcount continue update "error" has been injected.
1075 if (cur
->bc_refc
.nr_ops
> 2 &&
1076 XFS_TEST_ERROR(false, cur
->bc_mp
,
1077 XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE
))
1080 if (cur
->bc_refc
.nr_ops
== 0)
1082 else if (overhead
> cur
->bc_tp
->t_log_res
)
1084 return cur
->bc_tp
->t_log_res
- overhead
>
1085 cur
->bc_refc
.nr_ops
* XFS_REFCOUNT_ITEM_OVERHEAD
;
1089 * Adjust the refcounts of middle extents. At this point we should have
1090 * split extents that crossed the adjustment range; merged with adjacent
1091 * extents; and updated agbno/aglen to reflect the merges. Therefore,
1092 * all we have to do is update the extents inside [agbno, agbno + aglen].
1095 xfs_refcount_adjust_extents(
1096 struct xfs_btree_cur
*cur
,
1097 xfs_agblock_t
*agbno
,
1098 xfs_extlen_t
*aglen
,
1099 enum xfs_refc_adjust_op adj
)
1101 struct xfs_refcount_irec ext
, tmp
;
1103 int found_rec
, found_tmp
;
1104 xfs_fsblock_t fsbno
;
1106 /* Merging did all the work already. */
1110 error
= xfs_refcount_lookup_ge(cur
, XFS_REFC_DOMAIN_SHARED
, *agbno
,
1115 while (*aglen
> 0 && xfs_refcount_still_have_space(cur
)) {
1116 error
= xfs_refcount_get_rec(cur
, &ext
, &found_rec
);
1119 if (!found_rec
|| ext
.rc_domain
!= XFS_REFC_DOMAIN_SHARED
) {
1120 ext
.rc_startblock
= cur
->bc_mp
->m_sb
.sb_agblocks
;
1121 ext
.rc_blockcount
= 0;
1122 ext
.rc_refcount
= 0;
1123 ext
.rc_domain
= XFS_REFC_DOMAIN_SHARED
;
1127 * Deal with a hole in the refcount tree; if a file maps to
1128 * these blocks and there's no refcountbt record, pretend that
1129 * there is one with refcount == 1.
1131 if (ext
.rc_startblock
!= *agbno
) {
1132 tmp
.rc_startblock
= *agbno
;
1133 tmp
.rc_blockcount
= min(*aglen
,
1134 ext
.rc_startblock
- *agbno
);
1135 tmp
.rc_refcount
= 1 + adj
;
1136 tmp
.rc_domain
= XFS_REFC_DOMAIN_SHARED
;
1138 trace_xfs_refcount_modify_extent(cur
, &tmp
);
1141 * Either cover the hole (increment) or
1142 * delete the range (decrement).
1144 cur
->bc_refc
.nr_ops
++;
1145 if (tmp
.rc_refcount
) {
1146 error
= xfs_refcount_insert(cur
, &tmp
,
1150 if (XFS_IS_CORRUPT(cur
->bc_mp
,
1152 xfs_btree_mark_sick(cur
);
1153 error
= -EFSCORRUPTED
;
1157 fsbno
= xfs_agbno_to_fsb(to_perag(cur
->bc_group
),
1159 error
= xfs_free_extent_later(cur
->bc_tp
, fsbno
,
1160 tmp
.rc_blockcount
, NULL
,
1161 XFS_AG_RESV_NONE
, 0);
1166 (*agbno
) += tmp
.rc_blockcount
;
1167 (*aglen
) -= tmp
.rc_blockcount
;
1169 /* Stop if there's nothing left to modify */
1170 if (*aglen
== 0 || !xfs_refcount_still_have_space(cur
))
1173 /* Move the cursor to the start of ext. */
1174 error
= xfs_refcount_lookup_ge(cur
,
1175 XFS_REFC_DOMAIN_SHARED
, *agbno
,
1182 * A previous step trimmed agbno/aglen such that the end of the
1183 * range would not be in the middle of the record. If this is
1184 * no longer the case, something is seriously wrong with the
1185 * btree. Make sure we never feed the synthesized record into
1186 * the processing loop below.
1188 if (XFS_IS_CORRUPT(cur
->bc_mp
, ext
.rc_blockcount
== 0) ||
1189 XFS_IS_CORRUPT(cur
->bc_mp
, ext
.rc_blockcount
> *aglen
)) {
1190 xfs_btree_mark_sick(cur
);
1191 error
= -EFSCORRUPTED
;
1196 * Adjust the reference count and either update the tree
1197 * (incr) or free the blocks (decr).
1199 if (ext
.rc_refcount
== MAXREFCOUNT
)
1201 ext
.rc_refcount
+= adj
;
1202 trace_xfs_refcount_modify_extent(cur
, &ext
);
1203 cur
->bc_refc
.nr_ops
++;
1204 if (ext
.rc_refcount
> 1) {
1205 error
= xfs_refcount_update(cur
, &ext
);
1208 } else if (ext
.rc_refcount
== 1) {
1209 error
= xfs_refcount_delete(cur
, &found_rec
);
1212 if (XFS_IS_CORRUPT(cur
->bc_mp
, found_rec
!= 1)) {
1213 xfs_btree_mark_sick(cur
);
1214 error
= -EFSCORRUPTED
;
1219 fsbno
= xfs_agbno_to_fsb(to_perag(cur
->bc_group
),
1221 error
= xfs_free_extent_later(cur
->bc_tp
, fsbno
,
1222 ext
.rc_blockcount
, NULL
,
1223 XFS_AG_RESV_NONE
, 0);
1229 error
= xfs_btree_increment(cur
, 0, &found_rec
);
1234 (*agbno
) += ext
.rc_blockcount
;
1235 (*aglen
) -= ext
.rc_blockcount
;
1240 trace_xfs_refcount_modify_extent_error(cur
, error
, _RET_IP_
);
1244 /* Adjust the reference count of a range of AG blocks. */
1246 xfs_refcount_adjust(
1247 struct xfs_btree_cur
*cur
,
1248 xfs_agblock_t
*agbno
,
1249 xfs_extlen_t
*aglen
,
1250 enum xfs_refc_adjust_op adj
)
1253 int shape_changes
= 0;
1256 if (adj
== XFS_REFCOUNT_ADJUST_INCREASE
)
1257 trace_xfs_refcount_increase(cur
, *agbno
, *aglen
);
1259 trace_xfs_refcount_decrease(cur
, *agbno
, *aglen
);
1262 * Ensure that no rcextents cross the boundary of the adjustment range.
1264 error
= xfs_refcount_split_extent(cur
, XFS_REFC_DOMAIN_SHARED
,
1265 *agbno
, &shape_changed
);
1271 error
= xfs_refcount_split_extent(cur
, XFS_REFC_DOMAIN_SHARED
,
1272 *agbno
+ *aglen
, &shape_changed
);
1279 * Try to merge with the left or right extents of the range.
1281 error
= xfs_refcount_merge_extents(cur
, XFS_REFC_DOMAIN_SHARED
,
1282 agbno
, aglen
, adj
, &shape_changed
);
1288 cur
->bc_refc
.shape_changes
++;
1290 /* Now that we've taken care of the ends, adjust the middle extents */
1291 error
= xfs_refcount_adjust_extents(cur
, agbno
, aglen
, adj
);
1298 trace_xfs_refcount_adjust_error(cur
, error
, _RET_IP_
);
1303 * Set up a continuation a deferred refcount operation by updating the intent.
1304 * Checks to make sure we're not going to run off the end of the AG.
1307 xfs_refcount_continue_op(
1308 struct xfs_btree_cur
*cur
,
1309 struct xfs_refcount_intent
*ri
,
1310 xfs_agblock_t new_agbno
)
1312 struct xfs_mount
*mp
= cur
->bc_mp
;
1313 struct xfs_perag
*pag
= to_perag(cur
->bc_group
);
1315 if (XFS_IS_CORRUPT(mp
, !xfs_verify_agbext(pag
, new_agbno
,
1316 ri
->ri_blockcount
))) {
1317 xfs_btree_mark_sick(cur
);
1318 return -EFSCORRUPTED
;
1321 ri
->ri_startblock
= xfs_agbno_to_fsb(pag
, new_agbno
);
1323 ASSERT(xfs_verify_fsbext(mp
, ri
->ri_startblock
, ri
->ri_blockcount
));
1324 ASSERT(pag_agno(pag
) == XFS_FSB_TO_AGNO(mp
, ri
->ri_startblock
));
1330 * Process one of the deferred refcount operations. We pass back the
1331 * btree cursor to maintain our lock on the btree between calls.
1332 * This saves time and eliminates a buffer deadlock between the
1333 * superblock and the AGF because we'll always grab them in the same
1337 xfs_refcount_finish_one(
1338 struct xfs_trans
*tp
,
1339 struct xfs_refcount_intent
*ri
,
1340 struct xfs_btree_cur
**pcur
)
1342 struct xfs_mount
*mp
= tp
->t_mountp
;
1343 struct xfs_btree_cur
*rcur
= *pcur
;
1344 struct xfs_buf
*agbp
= NULL
;
1347 unsigned long nr_ops
= 0;
1348 int shape_changes
= 0;
1350 bno
= XFS_FSB_TO_AGBNO(mp
, ri
->ri_startblock
);
1352 trace_xfs_refcount_deferred(mp
, ri
);
1354 if (XFS_TEST_ERROR(false, mp
, XFS_ERRTAG_REFCOUNT_FINISH_ONE
))
1358 * If we haven't gotten a cursor or the cursor AG doesn't match
1359 * the startblock, get one now.
1361 if (rcur
!= NULL
&& rcur
->bc_group
!= ri
->ri_group
) {
1362 nr_ops
= rcur
->bc_refc
.nr_ops
;
1363 shape_changes
= rcur
->bc_refc
.shape_changes
;
1364 xfs_btree_del_cursor(rcur
, 0);
1369 struct xfs_perag
*pag
= to_perag(ri
->ri_group
);
1371 error
= xfs_alloc_read_agf(pag
, tp
,
1372 XFS_ALLOC_FLAG_FREEING
, &agbp
);
1376 *pcur
= rcur
= xfs_refcountbt_init_cursor(mp
, tp
, agbp
, pag
);
1377 rcur
->bc_refc
.nr_ops
= nr_ops
;
1378 rcur
->bc_refc
.shape_changes
= shape_changes
;
1381 switch (ri
->ri_type
) {
1382 case XFS_REFCOUNT_INCREASE
:
1383 error
= xfs_refcount_adjust(rcur
, &bno
, &ri
->ri_blockcount
,
1384 XFS_REFCOUNT_ADJUST_INCREASE
);
1387 if (ri
->ri_blockcount
> 0)
1388 error
= xfs_refcount_continue_op(rcur
, ri
, bno
);
1390 case XFS_REFCOUNT_DECREASE
:
1391 error
= xfs_refcount_adjust(rcur
, &bno
, &ri
->ri_blockcount
,
1392 XFS_REFCOUNT_ADJUST_DECREASE
);
1395 if (ri
->ri_blockcount
> 0)
1396 error
= xfs_refcount_continue_op(rcur
, ri
, bno
);
1398 case XFS_REFCOUNT_ALLOC_COW
:
1399 error
= __xfs_refcount_cow_alloc(rcur
, bno
, ri
->ri_blockcount
);
1402 ri
->ri_blockcount
= 0;
1404 case XFS_REFCOUNT_FREE_COW
:
1405 error
= __xfs_refcount_cow_free(rcur
, bno
, ri
->ri_blockcount
);
1408 ri
->ri_blockcount
= 0;
1412 return -EFSCORRUPTED
;
1414 if (!error
&& ri
->ri_blockcount
> 0)
1415 trace_xfs_refcount_finish_one_leftover(mp
, ri
);
1420 * Record a refcount intent for later processing.
1424 struct xfs_trans
*tp
,
1425 enum xfs_refcount_intent_type type
,
1426 xfs_fsblock_t startblock
,
1427 xfs_extlen_t blockcount
)
1429 struct xfs_refcount_intent
*ri
;
1431 ri
= kmem_cache_alloc(xfs_refcount_intent_cache
,
1432 GFP_KERNEL
| __GFP_NOFAIL
);
1433 INIT_LIST_HEAD(&ri
->ri_list
);
1435 ri
->ri_startblock
= startblock
;
1436 ri
->ri_blockcount
= blockcount
;
1438 xfs_refcount_defer_add(tp
, ri
);
1442 * Increase the reference count of the blocks backing a file's extent.
1445 xfs_refcount_increase_extent(
1446 struct xfs_trans
*tp
,
1447 struct xfs_bmbt_irec
*PREV
)
1449 if (!xfs_has_reflink(tp
->t_mountp
))
1452 __xfs_refcount_add(tp
, XFS_REFCOUNT_INCREASE
, PREV
->br_startblock
,
1453 PREV
->br_blockcount
);
1457 * Decrease the reference count of the blocks backing a file's extent.
1460 xfs_refcount_decrease_extent(
1461 struct xfs_trans
*tp
,
1462 struct xfs_bmbt_irec
*PREV
)
1464 if (!xfs_has_reflink(tp
->t_mountp
))
1467 __xfs_refcount_add(tp
, XFS_REFCOUNT_DECREASE
, PREV
->br_startblock
,
1468 PREV
->br_blockcount
);
1472 * Given an AG extent, find the lowest-numbered run of shared blocks
1473 * within that range and return the range in fbno/flen. If
1474 * find_end_of_shared is set, return the longest contiguous extent of
1475 * shared blocks; if not, just return the first extent we find. If no
1476 * shared blocks are found, fbno and flen will be set to NULLAGBLOCK
1477 * and 0, respectively.
1480 xfs_refcount_find_shared(
1481 struct xfs_btree_cur
*cur
,
1482 xfs_agblock_t agbno
,
1484 xfs_agblock_t
*fbno
,
1486 bool find_end_of_shared
)
1488 struct xfs_refcount_irec tmp
;
1493 trace_xfs_refcount_find_shared(cur
, agbno
, aglen
);
1495 /* By default, skip the whole range */
1496 *fbno
= NULLAGBLOCK
;
1499 /* Try to find a refcount extent that crosses the start */
1500 error
= xfs_refcount_lookup_le(cur
, XFS_REFC_DOMAIN_SHARED
, agbno
,
1505 /* No left extent, look at the next one */
1506 error
= xfs_btree_increment(cur
, 0, &have
);
1512 error
= xfs_refcount_get_rec(cur
, &tmp
, &i
);
1515 if (XFS_IS_CORRUPT(cur
->bc_mp
, i
!= 1)) {
1516 xfs_btree_mark_sick(cur
);
1517 error
= -EFSCORRUPTED
;
1520 if (tmp
.rc_domain
!= XFS_REFC_DOMAIN_SHARED
)
1523 /* If the extent ends before the start, look at the next one */
1524 if (tmp
.rc_startblock
+ tmp
.rc_blockcount
<= agbno
) {
1525 error
= xfs_btree_increment(cur
, 0, &have
);
1530 error
= xfs_refcount_get_rec(cur
, &tmp
, &i
);
1533 if (XFS_IS_CORRUPT(cur
->bc_mp
, i
!= 1)) {
1534 xfs_btree_mark_sick(cur
);
1535 error
= -EFSCORRUPTED
;
1538 if (tmp
.rc_domain
!= XFS_REFC_DOMAIN_SHARED
)
1542 /* If the extent starts after the range we want, bail out */
1543 if (tmp
.rc_startblock
>= agbno
+ aglen
)
1546 /* We found the start of a shared extent! */
1547 if (tmp
.rc_startblock
< agbno
) {
1548 tmp
.rc_blockcount
-= (agbno
- tmp
.rc_startblock
);
1549 tmp
.rc_startblock
= agbno
;
1552 *fbno
= tmp
.rc_startblock
;
1553 *flen
= min(tmp
.rc_blockcount
, agbno
+ aglen
- *fbno
);
1554 if (!find_end_of_shared
)
1557 /* Otherwise, find the end of this shared extent */
1558 while (*fbno
+ *flen
< agbno
+ aglen
) {
1559 error
= xfs_btree_increment(cur
, 0, &have
);
1564 error
= xfs_refcount_get_rec(cur
, &tmp
, &i
);
1567 if (XFS_IS_CORRUPT(cur
->bc_mp
, i
!= 1)) {
1568 xfs_btree_mark_sick(cur
);
1569 error
= -EFSCORRUPTED
;
1572 if (tmp
.rc_domain
!= XFS_REFC_DOMAIN_SHARED
||
1573 tmp
.rc_startblock
>= agbno
+ aglen
||
1574 tmp
.rc_startblock
!= *fbno
+ *flen
)
1576 *flen
= min(*flen
+ tmp
.rc_blockcount
, agbno
+ aglen
- *fbno
);
1580 trace_xfs_refcount_find_shared_result(cur
, *fbno
, *flen
);
1584 trace_xfs_refcount_find_shared_error(cur
, error
, _RET_IP_
);
1589 * Recovering CoW Blocks After a Crash
1591 * Due to the way that the copy on write mechanism works, there's a window of
1592 * opportunity in which we can lose track of allocated blocks during a crash.
1593 * Because CoW uses delayed allocation in the in-core CoW fork, writeback
1594 * causes blocks to be allocated and stored in the CoW fork. The blocks are
1595 * no longer in the free space btree but are not otherwise recorded anywhere
1596 * until the write completes and the blocks are mapped into the file. A crash
1597 * in between allocation and remapping results in the replacement blocks being
1598 * lost. This situation is exacerbated by the CoW extent size hint because
1599 * allocations can hang around for long time.
1601 * However, there is a place where we can record these allocations before they
1602 * become mappings -- the reference count btree. The btree does not record
1603 * extents with refcount == 1, so we can record allocations with a refcount of
1604 * 1. Blocks being used for CoW writeout cannot be shared, so there should be
1605 * no conflict with shared block records. These mappings should be created
1606 * when we allocate blocks to the CoW fork and deleted when they're removed
1607 * from the CoW fork.
1609 * Minor nit: records for in-progress CoW allocations and records for shared
1610 * extents must never be merged, to preserve the property that (except for CoW
1611 * allocations) there are no refcount btree entries with refcount == 1. The
1612 * only time this could potentially happen is when unsharing a block that's
1613 * adjacent to CoW allocations, so we must be careful to avoid this.
1615 * At mount time we recover lost CoW allocations by searching the refcount
1616 * btree for these refcount == 1 mappings. These represent CoW allocations
1617 * that were in progress at the time the filesystem went down, so we can free
1618 * them to get the space back.
1620 * This mechanism is superior to creating EFIs for unmapped CoW extents for
1621 * several reasons -- first, EFIs pin the tail of the log and would have to be
1622 * periodically relogged to avoid filling up the log. Second, CoW completions
1623 * will have to file an EFD and create new EFIs for whatever remains in the
1624 * CoW fork; this partially takes care of (1) but extent-size reservations
1625 * will have to periodically relog even if there's no writeout in progress.
1626 * This can happen if the CoW extent size hint is set, which you really want.
1627 * Third, EFIs cannot currently be automatically relogged into newer
1628 * transactions to advance the log tail. Fourth, stuffing the log full of
1629 * EFIs places an upper bound on the number of CoW allocations that can be
1630 * held filesystem-wide at any given time. Recording them in the refcount
1631 * btree doesn't require us to maintain any state in memory and doesn't pin
1635 * Adjust the refcounts of CoW allocations. These allocations are "magic"
1636 * in that they're not referenced anywhere else in the filesystem, so we
1637 * stash them in the refcount btree with a refcount of 1 until either file
1638 * remapping (or CoW cancellation) happens.
1641 xfs_refcount_adjust_cow_extents(
1642 struct xfs_btree_cur
*cur
,
1643 xfs_agblock_t agbno
,
1645 enum xfs_refc_adjust_op adj
)
1647 struct xfs_refcount_irec ext
, tmp
;
1649 int found_rec
, found_tmp
;
1654 /* Find any overlapping refcount records */
1655 error
= xfs_refcount_lookup_ge(cur
, XFS_REFC_DOMAIN_COW
, agbno
,
1659 error
= xfs_refcount_get_rec(cur
, &ext
, &found_rec
);
1662 if (XFS_IS_CORRUPT(cur
->bc_mp
, found_rec
&&
1663 ext
.rc_domain
!= XFS_REFC_DOMAIN_COW
)) {
1664 xfs_btree_mark_sick(cur
);
1665 error
= -EFSCORRUPTED
;
1669 ext
.rc_startblock
= cur
->bc_mp
->m_sb
.sb_agblocks
;
1670 ext
.rc_blockcount
= 0;
1671 ext
.rc_refcount
= 0;
1672 ext
.rc_domain
= XFS_REFC_DOMAIN_COW
;
1676 case XFS_REFCOUNT_ADJUST_COW_ALLOC
:
1677 /* Adding a CoW reservation, there should be nothing here. */
1678 if (XFS_IS_CORRUPT(cur
->bc_mp
,
1679 agbno
+ aglen
> ext
.rc_startblock
)) {
1680 xfs_btree_mark_sick(cur
);
1681 error
= -EFSCORRUPTED
;
1685 tmp
.rc_startblock
= agbno
;
1686 tmp
.rc_blockcount
= aglen
;
1687 tmp
.rc_refcount
= 1;
1688 tmp
.rc_domain
= XFS_REFC_DOMAIN_COW
;
1690 trace_xfs_refcount_modify_extent(cur
, &tmp
);
1692 error
= xfs_refcount_insert(cur
, &tmp
,
1696 if (XFS_IS_CORRUPT(cur
->bc_mp
, found_tmp
!= 1)) {
1697 xfs_btree_mark_sick(cur
);
1698 error
= -EFSCORRUPTED
;
1702 case XFS_REFCOUNT_ADJUST_COW_FREE
:
1703 /* Removing a CoW reservation, there should be one extent. */
1704 if (XFS_IS_CORRUPT(cur
->bc_mp
, ext
.rc_startblock
!= agbno
)) {
1705 xfs_btree_mark_sick(cur
);
1706 error
= -EFSCORRUPTED
;
1709 if (XFS_IS_CORRUPT(cur
->bc_mp
, ext
.rc_blockcount
!= aglen
)) {
1710 xfs_btree_mark_sick(cur
);
1711 error
= -EFSCORRUPTED
;
1714 if (XFS_IS_CORRUPT(cur
->bc_mp
, ext
.rc_refcount
!= 1)) {
1715 xfs_btree_mark_sick(cur
);
1716 error
= -EFSCORRUPTED
;
1720 ext
.rc_refcount
= 0;
1721 trace_xfs_refcount_modify_extent(cur
, &ext
);
1722 error
= xfs_refcount_delete(cur
, &found_rec
);
1725 if (XFS_IS_CORRUPT(cur
->bc_mp
, found_rec
!= 1)) {
1726 xfs_btree_mark_sick(cur
);
1727 error
= -EFSCORRUPTED
;
1737 trace_xfs_refcount_modify_extent_error(cur
, error
, _RET_IP_
);
1742 * Add or remove refcount btree entries for CoW reservations.
1745 xfs_refcount_adjust_cow(
1746 struct xfs_btree_cur
*cur
,
1747 xfs_agblock_t agbno
,
1749 enum xfs_refc_adjust_op adj
)
1755 * Ensure that no rcextents cross the boundary of the adjustment range.
1757 error
= xfs_refcount_split_extent(cur
, XFS_REFC_DOMAIN_COW
,
1758 agbno
, &shape_changed
);
1762 error
= xfs_refcount_split_extent(cur
, XFS_REFC_DOMAIN_COW
,
1763 agbno
+ aglen
, &shape_changed
);
1768 * Try to merge with the left or right extents of the range.
1770 error
= xfs_refcount_merge_extents(cur
, XFS_REFC_DOMAIN_COW
, &agbno
,
1771 &aglen
, adj
, &shape_changed
);
1775 /* Now that we've taken care of the ends, adjust the middle extents */
1776 error
= xfs_refcount_adjust_cow_extents(cur
, agbno
, aglen
, adj
);
1783 trace_xfs_refcount_adjust_cow_error(cur
, error
, _RET_IP_
);
1788 * Record a CoW allocation in the refcount btree.
1791 __xfs_refcount_cow_alloc(
1792 struct xfs_btree_cur
*rcur
,
1793 xfs_agblock_t agbno
,
1796 trace_xfs_refcount_cow_increase(rcur
, agbno
, aglen
);
1798 /* Add refcount btree reservation */
1799 return xfs_refcount_adjust_cow(rcur
, agbno
, aglen
,
1800 XFS_REFCOUNT_ADJUST_COW_ALLOC
);
1804 * Remove a CoW allocation from the refcount btree.
1807 __xfs_refcount_cow_free(
1808 struct xfs_btree_cur
*rcur
,
1809 xfs_agblock_t agbno
,
1812 trace_xfs_refcount_cow_decrease(rcur
, agbno
, aglen
);
1814 /* Remove refcount btree reservation */
1815 return xfs_refcount_adjust_cow(rcur
, agbno
, aglen
,
1816 XFS_REFCOUNT_ADJUST_COW_FREE
);
1819 /* Record a CoW staging extent in the refcount btree. */
1821 xfs_refcount_alloc_cow_extent(
1822 struct xfs_trans
*tp
,
1826 struct xfs_mount
*mp
= tp
->t_mountp
;
1828 if (!xfs_has_reflink(mp
))
1831 __xfs_refcount_add(tp
, XFS_REFCOUNT_ALLOC_COW
, fsb
, len
);
1833 /* Add rmap entry */
1834 xfs_rmap_alloc_extent(tp
, XFS_FSB_TO_AGNO(mp
, fsb
),
1835 XFS_FSB_TO_AGBNO(mp
, fsb
), len
, XFS_RMAP_OWN_COW
);
1838 /* Forget a CoW staging event in the refcount btree. */
1840 xfs_refcount_free_cow_extent(
1841 struct xfs_trans
*tp
,
1845 struct xfs_mount
*mp
= tp
->t_mountp
;
1847 if (!xfs_has_reflink(mp
))
1850 /* Remove rmap entry */
1851 xfs_rmap_free_extent(tp
, XFS_FSB_TO_AGNO(mp
, fsb
),
1852 XFS_FSB_TO_AGBNO(mp
, fsb
), len
, XFS_RMAP_OWN_COW
);
1853 __xfs_refcount_add(tp
, XFS_REFCOUNT_FREE_COW
, fsb
, len
);
1856 struct xfs_refcount_recovery
{
1857 struct list_head rr_list
;
1858 struct xfs_refcount_irec rr_rrec
;
1861 /* Stuff an extent on the recovery list. */
1863 xfs_refcount_recover_extent(
1864 struct xfs_btree_cur
*cur
,
1865 const union xfs_btree_rec
*rec
,
1868 struct list_head
*debris
= priv
;
1869 struct xfs_refcount_recovery
*rr
;
1871 if (XFS_IS_CORRUPT(cur
->bc_mp
,
1872 be32_to_cpu(rec
->refc
.rc_refcount
) != 1)) {
1873 xfs_btree_mark_sick(cur
);
1874 return -EFSCORRUPTED
;
1877 rr
= kmalloc(sizeof(struct xfs_refcount_recovery
),
1878 GFP_KERNEL
| __GFP_NOFAIL
);
1879 INIT_LIST_HEAD(&rr
->rr_list
);
1880 xfs_refcount_btrec_to_irec(rec
, &rr
->rr_rrec
);
1882 if (xfs_refcount_check_irec(to_perag(cur
->bc_group
), &rr
->rr_rrec
) !=
1884 XFS_IS_CORRUPT(cur
->bc_mp
,
1885 rr
->rr_rrec
.rc_domain
!= XFS_REFC_DOMAIN_COW
)) {
1886 xfs_btree_mark_sick(cur
);
1888 return -EFSCORRUPTED
;
1891 list_add_tail(&rr
->rr_list
, debris
);
1895 /* Find and remove leftover CoW reservations. */
1897 xfs_refcount_recover_cow_leftovers(
1898 struct xfs_mount
*mp
,
1899 struct xfs_perag
*pag
)
1901 struct xfs_trans
*tp
;
1902 struct xfs_btree_cur
*cur
;
1903 struct xfs_buf
*agbp
;
1904 struct xfs_refcount_recovery
*rr
, *n
;
1905 struct list_head debris
;
1906 union xfs_btree_irec low
= {
1907 .rc
.rc_domain
= XFS_REFC_DOMAIN_COW
,
1909 union xfs_btree_irec high
= {
1910 .rc
.rc_domain
= XFS_REFC_DOMAIN_COW
,
1911 .rc
.rc_startblock
= -1U,
1916 /* reflink filesystems mustn't have AGs larger than 2^31-1 blocks */
1917 BUILD_BUG_ON(XFS_MAX_CRC_AG_BLOCKS
>= XFS_REFC_COWFLAG
);
1918 if (mp
->m_sb
.sb_agblocks
> XFS_MAX_CRC_AG_BLOCKS
)
1921 INIT_LIST_HEAD(&debris
);
1924 * In this first part, we use an empty transaction to gather up
1925 * all the leftover CoW extents so that we can subsequently
1926 * delete them. The empty transaction is used to avoid
1927 * a buffer lock deadlock if there happens to be a loop in the
1928 * refcountbt because we're allowed to re-grab a buffer that is
1929 * already attached to our transaction. When we're done
1930 * recording the CoW debris we cancel the (empty) transaction
1931 * and everything goes away cleanly.
1933 error
= xfs_trans_alloc_empty(mp
, &tp
);
1937 error
= xfs_alloc_read_agf(pag
, tp
, 0, &agbp
);
1940 cur
= xfs_refcountbt_init_cursor(mp
, tp
, agbp
, pag
);
1942 /* Find all the leftover CoW staging extents. */
1943 error
= xfs_btree_query_range(cur
, &low
, &high
,
1944 xfs_refcount_recover_extent
, &debris
);
1945 xfs_btree_del_cursor(cur
, error
);
1946 xfs_trans_brelse(tp
, agbp
);
1947 xfs_trans_cancel(tp
);
1951 /* Now iterate the list to free the leftovers */
1952 list_for_each_entry_safe(rr
, n
, &debris
, rr_list
) {
1953 /* Set up transaction. */
1954 error
= xfs_trans_alloc(mp
, &M_RES(mp
)->tr_write
, 0, 0, 0, &tp
);
1958 /* Free the orphan record */
1959 fsb
= xfs_agbno_to_fsb(pag
, rr
->rr_rrec
.rc_startblock
);
1960 xfs_refcount_free_cow_extent(tp
, fsb
,
1961 rr
->rr_rrec
.rc_blockcount
);
1963 /* Free the block. */
1964 error
= xfs_free_extent_later(tp
, fsb
,
1965 rr
->rr_rrec
.rc_blockcount
, NULL
,
1966 XFS_AG_RESV_NONE
, 0);
1970 error
= xfs_trans_commit(tp
);
1974 list_del(&rr
->rr_list
);
1980 xfs_trans_cancel(tp
);
1982 /* Free the leftover list */
1983 list_for_each_entry_safe(rr
, n
, &debris
, rr_list
) {
1984 list_del(&rr
->rr_list
);
1991 * Scan part of the keyspace of the refcount records and tell us if the area
1992 * has no records, is fully mapped by records, or is partially filled.
1995 xfs_refcount_has_records(
1996 struct xfs_btree_cur
*cur
,
1997 enum xfs_refc_domain domain
,
2000 enum xbtree_recpacking
*outcome
)
2002 union xfs_btree_irec low
;
2003 union xfs_btree_irec high
;
2005 memset(&low
, 0, sizeof(low
));
2006 low
.rc
.rc_startblock
= bno
;
2007 memset(&high
, 0xFF, sizeof(high
));
2008 high
.rc
.rc_startblock
= bno
+ len
- 1;
2009 low
.rc
.rc_domain
= high
.rc
.rc_domain
= domain
;
2011 return xfs_btree_has_records(cur
, &low
, &high
, NULL
, outcome
);
2014 struct xfs_refcount_query_range_info
{
2015 xfs_refcount_query_range_fn fn
;
2019 /* Format btree record and pass to our callback. */
2021 xfs_refcount_query_range_helper(
2022 struct xfs_btree_cur
*cur
,
2023 const union xfs_btree_rec
*rec
,
2026 struct xfs_refcount_query_range_info
*query
= priv
;
2027 struct xfs_refcount_irec irec
;
2030 xfs_refcount_btrec_to_irec(rec
, &irec
);
2031 fa
= xfs_refcount_check_irec(to_perag(cur
->bc_group
), &irec
);
2033 return xfs_refcount_complain_bad_rec(cur
, fa
, &irec
);
2035 return query
->fn(cur
, &irec
, query
->priv
);
2038 /* Find all refcount records between two keys. */
2040 xfs_refcount_query_range(
2041 struct xfs_btree_cur
*cur
,
2042 const struct xfs_refcount_irec
*low_rec
,
2043 const struct xfs_refcount_irec
*high_rec
,
2044 xfs_refcount_query_range_fn fn
,
2047 union xfs_btree_irec low_brec
= { .rc
= *low_rec
};
2048 union xfs_btree_irec high_brec
= { .rc
= *high_rec
};
2049 struct xfs_refcount_query_range_info query
= { .priv
= priv
, .fn
= fn
};
2051 return xfs_btree_query_range(cur
, &low_brec
, &high_brec
,
2052 xfs_refcount_query_range_helper
, &query
);
2056 xfs_refcount_intent_init_cache(void)
2058 xfs_refcount_intent_cache
= kmem_cache_create("xfs_refc_intent",
2059 sizeof(struct xfs_refcount_intent
),
2062 return xfs_refcount_intent_cache
!= NULL
? 0 : -ENOMEM
;
2066 xfs_refcount_intent_destroy_cache(void)
2068 kmem_cache_destroy(xfs_refcount_intent_cache
);
2069 xfs_refcount_intent_cache
= NULL
;