// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2022-2023 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
#include "xfs_btree_staging.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_inode.h"
#include "xfs_alloc.h"
#include "xfs_defer.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/newbt.h"
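
/*
 * Rough order of operations, going by the functions below: a repair
 * function stages a new btree by setting up a context with
 * xrep_newbt_init_ag(), xrep_newbt_init_inode(), or xrep_newbt_init_bare();
 * reserving disk space with xrep_newbt_alloc_blocks() or
 * xrep_newbt_add_extent(); feeding reserved blocks to the btree bulk loader
 * through xrep_newbt_claim_block(); and finally calling xrep_newbt_commit()
 * or xrep_newbt_cancel() to dispose of anything left over.
 */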

/*
 * Estimate proper slack values for a btree that's being reloaded.
 *
 * Under most circumstances, we'll take whatever default loading value the
 * btree bulk loading code calculates for us.  However, there are some
 * exceptions to this rule:
 *
 * (0) If someone turned one of the debug knobs.
 * (1) If this is a per-AG btree and the AG has less than 10% space free.
 * (2) If this is an inode btree and the FS has less than 10% space free.
 *
 * In any of these cases, format the new btree blocks almost completely full
 * to minimize space usage.
 */
static void
xrep_newbt_estimate_slack(
	struct xrep_newbt	*xnr)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xfs_btree_bload	*bload = &xnr->bload;
	uint64_t		free;
	uint64_t		sz;

	/*
	 * The xfs_globals values are set to -1 (i.e. take the bload defaults)
	 * unless someone has set them otherwise, so we just pull the values
	 * here.
	 */
	bload->leaf_slack = xfs_globals.bload_leaf_slack;
	bload->node_slack = xfs_globals.bload_node_slack;

	if (sc->ops->type == ST_PERAG) {
		free = sc->sa.pag->pagf_freeblks;
		sz = xfs_ag_block_count(sc->mp, pag_agno(sc->sa.pag));
	} else {
		free = percpu_counter_sum(&sc->mp->m_fdblocks);
		sz = sc->mp->m_sb.sb_dblocks;
	}

	/* No further changes if there's more than 10% free space left. */
	if (free >= div_u64(sz, 10))
		return;

	/*
	 * We're low on space; load the btrees as tightly as possible.  Leave
	 * a couple of open slots in each btree block so that we don't end up
	 * splitting the btrees like crazy after a mount.
	 */
	if (bload->leaf_slack < 0)
		bload->leaf_slack = 2;
	if (bload->node_slack < 0)
		bload->node_slack = 2;
}
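
/*
 * Worked example for the function above: on a hypothetical 100G AG with
 * only 8G free, free < div_u64(sz, 10), so any slack value still at -1
 * drops to two free slots per new btree block.
 */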

/* Initialize accounting resources for staging a new AG btree. */
void
xrep_newbt_init_ag(
	struct xrep_newbt		*xnr,
	struct xfs_scrub		*sc,
	const struct xfs_owner_info	*oinfo,
	xfs_fsblock_t			alloc_hint,
	enum xfs_ag_resv_type		resv)
{
	memset(xnr, 0, sizeof(struct xrep_newbt));
	xnr->sc = sc;
	xnr->oinfo = *oinfo; /* structure copy */
	xnr->alloc_hint = alloc_hint;
	xnr->resv = resv;
	INIT_LIST_HEAD(&xnr->resv_list);
	xnr->bload.max_dirty = XFS_B_TO_FSBT(sc->mp, 256U << 10); /* 256K */
	xrep_newbt_estimate_slack(xnr);
}
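
/*
 * Hypothetical example (not a caller in this file): staging a new per-AG
 * rmap btree might look roughly like this, assuming sc->sa is set up:
 *
 *	struct xrep_newbt	xnr;
 *
 *	xrep_newbt_init_ag(&xnr, sc, &XFS_RMAP_OINFO_AG, hint,
 *			XFS_AG_RESV_RMAPBT);
 *	error = xrep_newbt_alloc_blocks(&xnr, nr_blocks);
 *	if (error)
 *		xrep_newbt_cancel(&xnr);
 */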

/* Initialize accounting resources for staging a new inode fork btree. */
int
xrep_newbt_init_inode(
	struct xrep_newbt		*xnr,
	struct xfs_scrub		*sc,
	int				whichfork,
	const struct xfs_owner_info	*oinfo)
{
	struct xfs_ifork		*ifp;

	ifp = kmem_cache_zalloc(xfs_ifork_cache, XCHK_GFP_FLAGS);
	if (!ifp)
		return -ENOMEM;

	xrep_newbt_init_ag(xnr, sc, oinfo,
			XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino),
			XFS_AG_RESV_NONE);
	xnr->ifake.if_fork = ifp;
	xnr->ifake.if_fork_size = xfs_inode_fork_size(sc->ip, whichfork);
	return 0;
}
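
/*
 * Note: once the function above returns zero, the fake inode fork belongs
 * to the staging context; xrep_newbt_free() releases it when the btree is
 * committed or cancelled.
 */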

/*
 * Initialize accounting resources for staging a new btree.  Callers are
 * expected to add their own reservations (and clean them up) manually.
 */
void
xrep_newbt_init_bare(
	struct xrep_newbt	*xnr,
	struct xfs_scrub	*sc)
{
	xrep_newbt_init_ag(xnr, sc, &XFS_RMAP_OINFO_ANY_OWNER, NULLFSBLOCK,
			XFS_AG_RESV_NONE);
}

/*
 * Designate specific blocks to be used to build our new btree.  @pag must be
 * a passive reference.
 */
STATIC int
xrep_newbt_add_blocks(
	struct xrep_newbt		*xnr,
	struct xfs_perag		*pag,
	const struct xfs_alloc_arg	*args)
{
	struct xfs_mount		*mp = xnr->sc->mp;
	struct xrep_newbt_resv		*resv;
	int				error;

	resv = kmalloc(sizeof(struct xrep_newbt_resv), XCHK_GFP_FLAGS);
	if (!resv)
		return -ENOMEM;

	INIT_LIST_HEAD(&resv->list);
	resv->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
	resv->len = args->len;
	resv->used = 0;
	resv->pag = xfs_perag_hold(pag);

	if (args->tp) {
		ASSERT(xnr->oinfo.oi_offset == 0);

		error = xfs_alloc_schedule_autoreap(args,
				XFS_FREE_EXTENT_SKIP_DISCARD, &resv->autoreap);
		if (error)
			goto out_pag;
	}

	list_add_tail(&resv->list, &xnr->resv_list);
	return 0;
out_pag:
	xfs_perag_put(resv->pag);
	kfree(resv);
	return error;
}
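
/*
 * The autoreap scheduled above arms a deferred-free intent (EFI) covering
 * the new reservation, so the blocks should come back automatically if the
 * repair never completes.  xrep_newbt_free_extent() below either lets that
 * intent run (xfs_alloc_commit_autoreap) or disarms it
 * (xfs_alloc_cancel_autoreap) once we know the fate of the blocks.
 */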

/*
 * Add an extent to the new btree reservation pool.  Callers are required to
 * reap this reservation manually if the repair is cancelled.  @pag must be a
 * passive reference.
 */
int
xrep_newbt_add_extent(
	struct xrep_newbt	*xnr,
	struct xfs_perag	*pag,
	xfs_agblock_t		agbno,
	xfs_extlen_t		len)
{
	struct xfs_alloc_arg	args = {
		.tp		= NULL, /* no autoreap */
		.oinfo		= xnr->oinfo,
		.fsbno		= xfs_agbno_to_fsb(pag, agbno),
		.len		= len,
		.resv		= xnr->resv,
	};

	return xrep_newbt_add_blocks(xnr, pag, &args);
}
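
/*
 * Because args.tp is NULL above, no autoreap intent is created for this
 * extent; as the comment says, callers must reap the reservation
 * themselves if the repair is cancelled.
 */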

/* Don't let our allocation hint take us beyond this AG */
STATIC void
xrep_newbt_validate_ag_alloc_hint(
	struct xrep_newbt	*xnr)
{
	struct xfs_scrub	*sc = xnr->sc;
	xfs_agnumber_t		agno = XFS_FSB_TO_AGNO(sc->mp, xnr->alloc_hint);

	if (agno == pag_agno(sc->sa.pag) &&
	    xfs_verify_fsbno(sc->mp, xnr->alloc_hint))
		return;

	xnr->alloc_hint =
		xfs_agbno_to_fsb(sc->sa.pag, XFS_AGFL_BLOCK(sc->mp) + 1);
}

/* Allocate disk space for a new per-AG btree. */
STATIC int
xrep_newbt_alloc_ag_blocks(
	struct xrep_newbt	*xnr,
	uint64_t		nr_blocks)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xfs_mount	*mp = sc->mp;
	int			error = 0;

	ASSERT(sc->sa.pag != NULL);

	while (nr_blocks > 0) {
		struct xfs_alloc_arg	args = {
			.tp		= sc->tp,
			.mp		= mp,
			.oinfo		= xnr->oinfo,
			.minlen		= 1,
			.maxlen		= nr_blocks,
			.prod		= 1,
			.resv		= xnr->resv,
		};
		xfs_agnumber_t		agno;

		xrep_newbt_validate_ag_alloc_hint(xnr);

		if (xnr->alloc_vextent)
			error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint);
		else
			error = xfs_alloc_vextent_near_bno(&args,
					xnr->alloc_hint);
		if (error)
			return error;
		if (args.fsbno == NULLFSBLOCK)
			return -ENOSPC;

		agno = XFS_FSB_TO_AGNO(mp, args.fsbno);
		if (agno != pag_agno(sc->sa.pag)) {
			ASSERT(agno == pag_agno(sc->sa.pag));
			return -EFSCORRUPTED;
		}

		trace_xrep_newbt_alloc_ag_blocks(sc->sa.pag,
				XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
				xnr->oinfo.oi_owner);

		error = xrep_newbt_add_blocks(xnr, sc->sa.pag, &args);
		if (error)
			return error;

		nr_blocks -= args.len;
		xnr->alloc_hint = args.fsbno + args.len;

		error = xrep_defer_finish(sc);
		if (error)
			return error;
	}

	return 0;
}
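
/*
 * Note that every pass through the loop above finishes deferred work with
 * xrep_defer_finish(), presumably so that a large allocation split across
 * many small extents cannot pin an oversized transaction.
 */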

/* Don't let our allocation hint take us beyond EOFS */
STATIC void
xrep_newbt_validate_file_alloc_hint(
	struct xrep_newbt	*xnr)
{
	struct xfs_scrub	*sc = xnr->sc;

	if (xfs_verify_fsbno(sc->mp, xnr->alloc_hint))
		return;

	xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, 0, XFS_AGFL_BLOCK(sc->mp) + 1);
}

/* Allocate disk space for our new file-based btree. */
STATIC int
xrep_newbt_alloc_file_blocks(
	struct xrep_newbt	*xnr,
	uint64_t		nr_blocks)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xfs_mount	*mp = sc->mp;
	int			error = 0;

	while (nr_blocks > 0) {
		struct xfs_alloc_arg	args = {
			.tp		= sc->tp,
			.mp		= mp,
			.oinfo		= xnr->oinfo,
			.minlen		= 1,
			.maxlen		= nr_blocks,
			.prod		= 1,
			.resv		= xnr->resv,
		};
		struct xfs_perag	*pag;
		xfs_agnumber_t		agno;

		xrep_newbt_validate_file_alloc_hint(xnr);

		if (xnr->alloc_vextent)
			error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint);
		else
			error = xfs_alloc_vextent_start_ag(&args,
					xnr->alloc_hint);
		if (error)
			return error;
		if (args.fsbno == NULLFSBLOCK)
			return -ENOSPC;

		agno = XFS_FSB_TO_AGNO(mp, args.fsbno);

		pag = xfs_perag_get(mp, agno);
		if (!pag) {
			ASSERT(0);
			return -EFSCORRUPTED;
		}

		trace_xrep_newbt_alloc_file_blocks(pag,
				XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
				xnr->oinfo.oi_owner);

		error = xrep_newbt_add_blocks(xnr, pag, &args);
		xfs_perag_put(pag);
		if (error)
			return error;

		nr_blocks -= args.len;
		xnr->alloc_hint = args.fsbno + args.len;

		error = xrep_defer_finish(sc);
		if (error)
			return error;
	}

	return 0;
}

/* Allocate disk space for our new btree. */
int
xrep_newbt_alloc_blocks(
	struct xrep_newbt	*xnr,
	uint64_t		nr_blocks)
{
	if (xnr->sc->ip)
		return xrep_newbt_alloc_file_blocks(xnr, nr_blocks);
	return xrep_newbt_alloc_ag_blocks(xnr, nr_blocks);
}
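
/*
 * sc->ip is nonnull only when we're repairing a file-based (inode fork)
 * btree, so its presence presumably selects the file allocator here.
 */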

/*
 * Free the unused part of a space extent that was reserved for a new ondisk
 * structure.  Returns the number of EFIs logged or a negative errno.
 */
STATIC int
xrep_newbt_free_extent(
	struct xrep_newbt	*xnr,
	struct xrep_newbt_resv	*resv,
	bool			btree_committed)
{
	struct xfs_scrub	*sc = xnr->sc;
	xfs_agblock_t		free_agbno = resv->agbno;
	xfs_extlen_t		free_aglen = resv->len;
	int			error;

	if (!btree_committed || resv->used == 0) {
		/*
		 * If we're not committing a new btree or we didn't use the
		 * space reservation, let the existing EFI free the entire
		 * space extent.
		 */
		trace_xrep_newbt_free_blocks(resv->pag, free_agbno, free_aglen,
				xnr->oinfo.oi_owner);
		xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap);
		return 1;
	}

	/*
	 * We used space and committed the btree.  Cancel the autoreap, remove
	 * the written blocks from the reservation, and possibly log a new EFI
	 * to free any unused reservation space.
	 */
	xfs_alloc_cancel_autoreap(sc->tp, &resv->autoreap);
	free_agbno += resv->used;
	free_aglen -= resv->used;

	if (free_aglen == 0)
		return 0;

	trace_xrep_newbt_free_blocks(resv->pag, free_agbno, free_aglen,
			xnr->oinfo.oi_owner);

	ASSERT(xnr->resv != XFS_AG_RESV_AGFL);
	ASSERT(xnr->resv != XFS_AG_RESV_IGNORE);

	/*
	 * Use EFIs to free the reservations.  This reduces the chance
	 * that we leak blocks if the system goes down.
	 */
	error = xfs_free_extent_later(sc->tp,
			xfs_agbno_to_fsb(resv->pag, free_agbno), free_aglen,
			&xnr->oinfo, xnr->resv, XFS_FREE_EXTENT_SKIP_DISCARD);
	if (error)
		return error;

	return 1;
}

/* Free all the accounting info and disk space we reserved for a new btree. */
STATIC int
xrep_newbt_free(
	struct xrep_newbt	*xnr,
	bool			btree_committed)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xrep_newbt_resv	*resv, *n;
	unsigned int		freed = 0;
	int			error = 0;

	/*
	 * If the filesystem already went down, we can't free the blocks.  Skip
	 * ahead to freeing the incore metadata because we can't fix anything.
	 */
	if (xfs_is_shutdown(sc->mp))
		goto junkit;

	list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
		int ret;

		ret = xrep_newbt_free_extent(xnr, resv, btree_committed);
		list_del(&resv->list);
		xfs_perag_put(resv->pag);
		kfree(resv);

		if (ret < 0) {
			error = ret;
			goto junkit;
		}

		freed += ret;
		if (freed >= XREP_MAX_ITRUNCATE_EFIS) {
			error = xrep_defer_finish(sc);
			if (error)
				goto junkit;
			freed = 0;
		}
	}

	if (freed)
		error = xrep_defer_finish(sc);

junkit:
	/*
	 * If we still have reservations attached to @newbt, cleanup must have
	 * failed and the filesystem is about to go down.  Clean up the incore
	 * reservations and try to commit to freeing the space we used.
	 */
	list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
		xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap);
		list_del(&resv->list);
		xfs_perag_put(resv->pag);
		kfree(resv);
	}

	if (sc->ip) {
		kmem_cache_free(xfs_ifork_cache, xnr->ifake.if_fork);
		xnr->ifake.if_fork = NULL;
	}

	return error;
}
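
/*
 * The XREP_MAX_ITRUNCATE_EFIS batching above bounds how many free-extent
 * intents can pile up before xrep_defer_finish() flushes them; judging by
 * the name, this is the same cap applied when truncating files.
 */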

/*
 * Free all the accounting info and unused disk space allocations after
 * committing a new btree.
 */
int
xrep_newbt_commit(
	struct xrep_newbt	*xnr)
{
	return xrep_newbt_free(xnr, true);
}

/*
 * Free all the accounting info and all of the disk space we reserved for a new
 * btree that we're not going to commit.  We want to try to roll things back
 * cleanly for things like ENOSPC midway through allocation.
 */
void
xrep_newbt_cancel(
	struct xrep_newbt	*xnr)
{
	xrep_newbt_free(xnr, false);
}

/* Feed one of the reserved btree blocks to the bulk loader. */
int
xrep_newbt_claim_block(
	struct xfs_btree_cur	*cur,
	struct xrep_newbt	*xnr,
	union xfs_btree_ptr	*ptr)
{
	struct xrep_newbt_resv	*resv;
	xfs_agblock_t		agbno;

	/*
	 * The first item in the list should always have a free block unless
	 * we're completely out.
	 */
	resv = list_first_entry(&xnr->resv_list, struct xrep_newbt_resv, list);
	if (resv->used == resv->len)
		return -ENOSPC;

	/*
	 * Peel off a block from the start of the reservation.  We allocate
	 * blocks in order to place blocks on disk in increasing record or key
	 * order.  The block reservations tend to end up on the list in
	 * decreasing order, which hopefully results in leaf blocks ending up
	 * together.
	 */
	agbno = resv->agbno + resv->used;
	resv->used++;

	/* If we used all the blocks in this reservation, move it to the end. */
	if (resv->used == resv->len)
		list_move_tail(&resv->list, &xnr->resv_list);

	trace_xrep_newbt_claim_block(resv->pag, agbno, 1, xnr->oinfo.oi_owner);

	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
		ptr->l = cpu_to_be64(xfs_agbno_to_fsb(resv->pag, agbno));
	else
		ptr->s = cpu_to_be32(agbno);

	/* Relog all the EFIs. */
	return xrep_defer_finish(xnr->sc);
}

/* How many reserved blocks are unused? */
unsigned int
xrep_newbt_unused_blocks(
	struct xrep_newbt	*xnr)
{
	struct xrep_newbt_resv	*resv;
	unsigned int		unused = 0;

	list_for_each_entry(resv, &xnr->resv_list, list)
		unused += resv->len - resv->used;
	return unused;
}
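
/*
 * Hypothetical usage sketch (not a caller in this file): a repair function
 * could record how much of its reservation went unused before committing:
 *
 *	unused = xrep_newbt_unused_blocks(&xnr);
 *	error = xrep_newbt_commit(&xnr);
 */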