// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2022-2023 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
#include "xfs_btree_staging.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_inode.h"
#include "xfs_alloc.h"
#include "xfs_rmap.h"
#include "xfs_ag.h"
#include "xfs_defer.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/newbt.h"
/*
 * Estimate proper slack values for a btree that's being reloaded.
 *
 * Under most circumstances, we'll take whatever default loading value the
 * btree bulk loading code calculates for us.  However, there are some
 * exceptions to this rule:
 *
 * (1) If someone turned one of the debug knobs.
 * (2) If this is a per-AG btree and the AG has less than 10% space free.
 * (3) If this is an inode btree and the FS has less than 10% space free.
 *
 * In any of these cases, format the new btree blocks almost completely full
 * to minimize space usage.
 */
static void
xrep_newbt_estimate_slack(
	struct xrep_newbt	*xnr)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xfs_btree_bload	*bload = &xnr->bload;
	uint64_t		free;
	uint64_t		sz;

	/*
	 * The xfs_globals values are set to -1 (i.e. take the bload defaults)
	 * unless someone has set them otherwise, so we just pull the values
	 * here.
	 */
	bload->leaf_slack = xfs_globals.bload_leaf_slack;
	bload->node_slack = xfs_globals.bload_node_slack;

	if (sc->ops->type == ST_PERAG) {
		free = sc->sa.pag->pagf_freeblks;
		sz = xfs_ag_block_count(sc->mp, sc->sa.pag->pag_agno);
	} else {
		free = percpu_counter_sum(&sc->mp->m_fdblocks);
		sz = sc->mp->m_sb.sb_dblocks;
	}

	/* No further changes if there's more than 10% free space left. */
	if (free >= div_u64(sz, 10))
		return;

	/*
	 * We're low on space; load the btrees as tightly as possible.  Leave
	 * a couple of open slots in each btree block so that we don't end up
	 * splitting the btrees like crazy after a mount.
	 */
	if (bload->leaf_slack < 0)
		bload->leaf_slack = 2;
	if (bload->node_slack < 0)
		bload->node_slack = 2;
}
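
/*
 * Worked example with hypothetical numbers: for a per-AG btree in an AG of
 * sz = 524288 blocks with only free = 40000 blocks free, free is less than
 * div_u64(sz, 10) = 52428, so any slack knob still at its -1 default is
 * forced to 2: each new btree block is packed full except for two open
 * slots, trading a little post-mount split churn for minimal space usage.
 */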
/* Initialize accounting resources for staging a new AG btree. */
void
xrep_newbt_init_ag(
	struct xrep_newbt		*xnr,
	struct xfs_scrub		*sc,
	const struct xfs_owner_info	*oinfo,
	xfs_fsblock_t			alloc_hint,
	enum xfs_ag_resv_type		resv)
{
	memset(xnr, 0, sizeof(struct xrep_newbt));
	xnr->sc = sc;
	xnr->oinfo = *oinfo; /* structure copy */
	xnr->alloc_hint = alloc_hint;
	xnr->resv = resv;
	INIT_LIST_HEAD(&xnr->resv_list);
	xnr->bload.max_dirty = XFS_B_TO_FSBT(sc->mp, 256U << 10); /* 256K */
	xrep_newbt_estimate_slack(xnr);
}
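
/*
 * Illustrative sketch (not a caller in this file): a per-AG btree repair
 * might stage its new btree with AG metadata as the block owner and an
 * allocation hint just past the AGFL; the owner and reservation choices
 * below are hypothetical:
 *
 *	struct xrep_newbt	xnr;
 *
 *	xrep_newbt_init_ag(&xnr, sc, &XFS_RMAP_OINFO_AG,
 *			XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno,
 *				       XFS_AGFL_BLOCK(sc->mp) + 1),
 *			XFS_AG_RESV_NONE);
 */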
/* Initialize accounting resources for staging a new inode fork btree. */
int
xrep_newbt_init_inode(
	struct xrep_newbt		*xnr,
	struct xfs_scrub		*sc,
	int				whichfork,
	const struct xfs_owner_info	*oinfo)
{
	struct xfs_ifork		*ifp;

	ifp = kmem_cache_zalloc(xfs_ifork_cache, XCHK_GFP_FLAGS);
	if (!ifp)
		return -ENOMEM;

	xrep_newbt_init_ag(xnr, sc, oinfo,
			XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino),
			XFS_AG_RESV_NONE);
	xnr->ifake.if_fork = ifp;
	xnr->ifake.if_fork_size = xfs_inode_fork_size(sc->ip, whichfork);
	return 0;
}
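
/*
 * Illustrative sketch: a hypothetical data fork mapping btree repair could
 * stage its new btree with the inode itself as the block owner:
 *
 *	struct xfs_owner_info	oinfo;
 *	int			error;
 *
 *	xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, XFS_DATA_FORK);
 *	error = xrep_newbt_init_inode(&xnr, sc, XFS_DATA_FORK, &oinfo);
 */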
/*
 * Initialize accounting resources for staging a new btree.  Callers are
 * expected to add their own reservations (and clean them up) manually.
 */
void
xrep_newbt_init_bare(
	struct xrep_newbt		*xnr,
	struct xfs_scrub		*sc)
{
	xrep_newbt_init_ag(xnr, sc, &XFS_RMAP_OINFO_ANY_OWNER, NULLFSBLOCK,
			XFS_AG_RESV_NONE);
}
/*
 * Designate specific blocks to be used to build our new btree.  @pag must be
 * a passive reference.
 */
STATIC int
xrep_newbt_add_blocks(
	struct xrep_newbt		*xnr,
	struct xfs_perag		*pag,
	const struct xfs_alloc_arg	*args)
{
	struct xfs_mount		*mp = xnr->sc->mp;
	struct xrep_newbt_resv		*resv;
	int				error;

	resv = kmalloc(sizeof(struct xrep_newbt_resv), XCHK_GFP_FLAGS);
	if (!resv)
		return -ENOMEM;

	INIT_LIST_HEAD(&resv->list);
	resv->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
	resv->len = args->len;
	resv->used = 0;
	resv->pag = xfs_perag_hold(pag);

	if (args->tp) {
		ASSERT(xnr->oinfo.oi_offset == 0);

		error = xfs_alloc_schedule_autoreap(args,
				XFS_FREE_EXTENT_SKIP_DISCARD, &resv->autoreap);
		if (error)
			goto out_pag;
	}

	list_add_tail(&resv->list, &xnr->resv_list);
	return 0;
out_pag:
	xfs_perag_put(resv->pag);
	kfree(resv);
	return error;
}
/*
 * Add an extent to the new btree reservation pool.  Callers are required to
 * reap this reservation manually if the repair is cancelled.  @pag must be a
 * passive reference.
 */
int
xrep_newbt_add_extent(
	struct xrep_newbt	*xnr,
	struct xfs_perag	*pag,
	xfs_agblock_t		agbno,
	xfs_extlen_t		len)
{
	struct xfs_mount	*mp = xnr->sc->mp;
	struct xfs_alloc_arg	args = {
		.tp		= NULL, /* no autoreap */
		.oinfo		= xnr->oinfo,
		.fsbno		= XFS_AGB_TO_FSB(mp, pag->pag_agno, agbno),
		.len		= len,
		.resv		= xnr->resv,
	};

	return xrep_newbt_add_blocks(xnr, pag, &args);
}
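
/*
 * Illustrative sketch: a caller that found its blocks by some other means
 * (say, a hypothetical repair reusing blocks from the structure it is
 * replacing) could combine the bare initializer with a manual reservation:
 *
 *	xrep_newbt_init_bare(&xnr, sc);
 *	error = xrep_newbt_add_extent(&xnr, sc->sa.pag, agbno, len);
 *
 * Because no transaction is attached to such a reservation, no autoreap is
 * scheduled, and the caller must reap the blocks itself if the repair is
 * cancelled.
 */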
/* Don't let our allocation hint take us beyond this AG */
static inline void
xrep_newbt_validate_ag_alloc_hint(
	struct xrep_newbt	*xnr)
{
	struct xfs_scrub	*sc = xnr->sc;
	xfs_agnumber_t		agno = XFS_FSB_TO_AGNO(sc->mp, xnr->alloc_hint);

	if (agno == sc->sa.pag->pag_agno &&
	    xfs_verify_fsbno(sc->mp, xnr->alloc_hint))
		return;

	xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno,
					 XFS_AGFL_BLOCK(sc->mp) + 1);
}
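
/*
 * For example, an alloc_hint of NULLFSBLOCK (as set up by
 * xrep_newbt_init_bare) fails xfs_verify_fsbno() and is reset to the first
 * block past the AGFL of the AG being repaired, so allocations always start
 * from a valid block inside this AG.
 */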
/* Allocate disk space for a new per-AG btree. */
STATIC int
xrep_newbt_alloc_ag_blocks(
	struct xrep_newbt	*xnr,
	uint64_t		nr_blocks)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xfs_mount	*mp = sc->mp;
	int			error = 0;

	ASSERT(sc->sa.pag != NULL);

	while (nr_blocks > 0) {
		struct xfs_alloc_arg	args = {
			.tp		= sc->tp,
			.mp		= mp,
			.oinfo		= xnr->oinfo,
			.minlen		= 1,
			.maxlen		= nr_blocks,
			.prod		= 1,
			.resv		= xnr->resv,
		};
		xfs_agnumber_t		agno;

		xrep_newbt_validate_ag_alloc_hint(xnr);

		if (xnr->alloc_vextent)
			error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint);
		else
			error = xfs_alloc_vextent_near_bno(&args,
					xnr->alloc_hint);
		if (error)
			return error;
		if (args.fsbno == NULLFSBLOCK)
			return -ENOSPC;

		agno = XFS_FSB_TO_AGNO(mp, args.fsbno);

		trace_xrep_newbt_alloc_ag_blocks(mp, agno,
				XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
				xnr->oinfo.oi_owner);

		if (agno != sc->sa.pag->pag_agno) {
			ASSERT(agno == sc->sa.pag->pag_agno);
			return -EFSCORRUPTED;
		}

		error = xrep_newbt_add_blocks(xnr, sc->sa.pag, &args);
		if (error)
			return error;

		nr_blocks -= args.len;
		xnr->alloc_hint = args.fsbno + args.len;

		error = xrep_defer_finish(sc);
		if (error)
			return error;
	}

	return 0;
}
/* Don't let our allocation hint take us beyond EOFS */
static inline void
xrep_newbt_validate_file_alloc_hint(
	struct xrep_newbt	*xnr)
{
	struct xfs_scrub	*sc = xnr->sc;

	if (xfs_verify_fsbno(sc->mp, xnr->alloc_hint))
		return;

	xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, 0, XFS_AGFL_BLOCK(sc->mp) + 1);
}
/* Allocate disk space for our new file-based btree. */
STATIC int
xrep_newbt_alloc_file_blocks(
	struct xrep_newbt	*xnr,
	uint64_t		nr_blocks)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xfs_mount	*mp = sc->mp;
	int			error = 0;

	while (nr_blocks > 0) {
		struct xfs_alloc_arg	args = {
			.tp		= sc->tp,
			.mp		= mp,
			.oinfo		= xnr->oinfo,
			.minlen		= 1,
			.maxlen		= nr_blocks,
			.prod		= 1,
			.resv		= xnr->resv,
		};
		struct xfs_perag	*pag;
		xfs_agnumber_t		agno;

		xrep_newbt_validate_file_alloc_hint(xnr);

		if (xnr->alloc_vextent)
			error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint);
		else
			error = xfs_alloc_vextent_start_ag(&args,
					xnr->alloc_hint);
		if (error)
			return error;
		if (args.fsbno == NULLFSBLOCK)
			return -ENOSPC;

		agno = XFS_FSB_TO_AGNO(mp, args.fsbno);

		trace_xrep_newbt_alloc_file_blocks(mp, agno,
				XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
				xnr->oinfo.oi_owner);

		pag = xfs_perag_get(mp, agno);
		if (!pag) {
			ASSERT(0);
			return -EFSCORRUPTED;
		}

		error = xrep_newbt_add_blocks(xnr, pag, &args);
		xfs_perag_put(pag);
		if (error)
			return error;

		nr_blocks -= args.len;
		xnr->alloc_hint = args.fsbno + args.len;

		error = xrep_defer_finish(sc);
		if (error)
			return error;
	}

	return 0;
}
/* Allocate disk space for our new btree. */
int
xrep_newbt_alloc_blocks(
	struct xrep_newbt	*xnr,
	uint64_t		nr_blocks)
{
	if (xnr->sc->ip)
		return xrep_newbt_alloc_file_blocks(xnr, nr_blocks);
	return xrep_newbt_alloc_ag_blocks(xnr, nr_blocks);
}
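
/*
 * Illustrative sketch of the overall flow, with hypothetical names: a
 * repair reserves space, bulk-loads the staged btree (claiming reserved
 * blocks as it goes), then commits or cancels:
 *
 *	xrep_newbt_init_ag(&xnr, sc, &oinfo, alloc_hint, resv);
 *	error = xrep_newbt_alloc_blocks(&xnr, nr_blocks);
 *	if (error)
 *		goto out_cancel;
 *	(format and commit the new btree via the bulk loader)
 *	error = xrep_newbt_commit(&xnr);
 *	return error;
 * out_cancel:
 *	xrep_newbt_cancel(&xnr);
 *	return error;
 */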
/*
 * Free the unused part of a space extent that was reserved for a new ondisk
 * structure.  Returns the number of EFIs logged or a negative errno.
 */
STATIC int
xrep_newbt_free_extent(
	struct xrep_newbt	*xnr,
	struct xrep_newbt_resv	*resv,
	bool			btree_committed)
{
	struct xfs_scrub	*sc = xnr->sc;
	xfs_agblock_t		free_agbno = resv->agbno;
	xfs_extlen_t		free_aglen = resv->len;
	xfs_fsblock_t		fsbno;
	int			error;

	if (!btree_committed || resv->used == 0) {
		/*
		 * If we're not committing a new btree or we didn't use the
		 * space reservation, let the existing EFI free the entire
		 * space extent.
		 */
		trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno,
				free_agbno, free_aglen, xnr->oinfo.oi_owner);
		xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap);
		return 1;
	}

	/*
	 * We used space and committed the btree.  Cancel the autoreap, remove
	 * the written blocks from the reservation, and possibly log a new EFI
	 * to free any unused reservation space.
	 */
	xfs_alloc_cancel_autoreap(sc->tp, &resv->autoreap);
	free_agbno += resv->used;
	free_aglen -= resv->used;

	if (free_aglen == 0)
		return 0;

	trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, free_agbno,
			free_aglen, xnr->oinfo.oi_owner);

	ASSERT(xnr->resv != XFS_AG_RESV_AGFL);
	ASSERT(xnr->resv != XFS_AG_RESV_IGNORE);

	/*
	 * Use EFIs to free the reservations.  This reduces the chance
	 * that we leak blocks if the system goes down.
	 */
	fsbno = XFS_AGB_TO_FSB(sc->mp, resv->pag->pag_agno, free_agbno);
	error = xfs_free_extent_later(sc->tp, fsbno, free_aglen, &xnr->oinfo,
			xnr->resv, XFS_FREE_EXTENT_SKIP_DISCARD);
	if (error)
		return error;

	return 1;
}
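
/*
 * Worked example with hypothetical numbers: if a committed reservation
 * covers agbno 100 with len 10 and the bulk loader consumed used = 6
 * blocks, the autoreap is cancelled and a new EFI is logged for the unused
 * tail at free_agbno = 106, free_aglen = 4; the six written blocks remain
 * part of the new btree.
 */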
/* Free all the accounting info and disk space we reserved for a new btree. */
STATIC int
xrep_newbt_free(
	struct xrep_newbt	*xnr,
	bool			btree_committed)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xrep_newbt_resv	*resv, *n;
	unsigned int		freed = 0;
	int			error = 0;

	/*
	 * If the filesystem already went down, we can't free the blocks.  Skip
	 * ahead to freeing the incore metadata because we can't fix anything.
	 */
	if (xfs_is_shutdown(sc->mp))
		goto junkit;

	list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
		int		ret;

		ret = xrep_newbt_free_extent(xnr, resv, btree_committed);
		list_del(&resv->list);
		xfs_perag_put(resv->pag);
		kfree(resv);
		if (ret < 0) {
			error = ret;
			goto junkit;
		}

		freed += ret;
		if (freed >= XREP_MAX_ITRUNCATE_EFIS) {
			error = xrep_defer_finish(sc);
			if (error)
				goto junkit;
			freed = 0;
		}
	}

	if (freed)
		error = xrep_defer_finish(sc);

junkit:
	/*
	 * If we still have reservations attached to @xnr, cleanup must have
	 * failed and the filesystem is about to go down.  Clean up the incore
	 * reservations and try to commit to freeing the space we used.
	 */
	list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
		xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap);
		list_del(&resv->list);
		xfs_perag_put(resv->pag);
		kfree(resv);
	}

	if (sc->ip) {
		kmem_cache_free(xfs_ifork_cache, xnr->ifake.if_fork);
		xnr->ifake.if_fork = NULL;
	}

	return error;
}
/*
 * Free all the accounting info and unused disk space allocations after
 * committing a new btree.
 */
int
xrep_newbt_commit(
	struct xrep_newbt	*xnr)
{
	return xrep_newbt_free(xnr, true);
}
/*
 * Free all the accounting info and all of the disk space we reserved for a new
 * btree that we're not going to commit.  We want to try to roll things back
 * cleanly for things like ENOSPC midway through allocation.
 */
void
xrep_newbt_cancel(
	struct xrep_newbt	*xnr)
{
	xrep_newbt_free(xnr, false);
}
/* Feed one of the reserved btree blocks to the bulk loader. */
int
xrep_newbt_claim_block(
	struct xfs_btree_cur	*cur,
	struct xrep_newbt	*xnr,
	union xfs_btree_ptr	*ptr)
{
	struct xrep_newbt_resv	*resv;
	struct xfs_mount	*mp = cur->bc_mp;
	xfs_agblock_t		agbno;

	/*
	 * The first item in the list should always have a free block unless
	 * we're completely out.
	 */
	resv = list_first_entry(&xnr->resv_list, struct xrep_newbt_resv, list);
	if (resv->used == resv->len)
		return -ENOSPC;

	/*
	 * Peel off a block from the start of the reservation.  We allocate
	 * blocks in order to place blocks on disk in increasing record or key
	 * order.  The block reservations tend to end up on the list in
	 * decreasing order, which hopefully results in leaf blocks ending up
	 * together.
	 */
	agbno = resv->agbno + resv->used;
	resv->used++;

	/* If we used all the blocks in this reservation, move it to the end. */
	if (resv->used == resv->len)
		list_move_tail(&resv->list, &xnr->resv_list);

	trace_xrep_newbt_claim_block(mp, resv->pag->pag_agno, agbno, 1,
			xnr->oinfo.oi_owner);

	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
		ptr->l = cpu_to_be64(XFS_AGB_TO_FSB(mp, resv->pag->pag_agno,
								agbno));
	else
		ptr->s = cpu_to_be32(agbno);

	/* Relog all the EFIs. */
	return xrep_defer_finish(xnr->sc);
}
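
/*
 * Illustrative sketch: the btree bulk loader reaches this function through
 * the claim_block callback in struct xfs_btree_bload.  A hypothetical
 * repair might wire it up like this:
 *
 *	static int
 *	xrep_mybt_claim_block(
 *		struct xfs_btree_cur	*cur,
 *		union xfs_btree_ptr	*ptr,
 *		void			*priv)
 *	{
 *		struct xrep_newbt	*xnr = priv;
 *
 *		return xrep_newbt_claim_block(cur, xnr, ptr);
 *	}
 *
 *	xnr.bload.claim_block = xrep_mybt_claim_block;
 */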
/* How many reserved blocks are unused? */
unsigned int
xrep_newbt_unused_blocks(
	struct xrep_newbt	*xnr)
{
	struct xrep_newbt_resv	*resv;
	unsigned int		unused = 0;

	list_for_each_entry(resv, &xnr->resv_list, list)
		unused += resv->len - resv->used;
	return unused;
}
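
/*
 * Illustrative sketch: a hypothetical repair staging two btrees from one
 * reservation pool might check the remainder before loading the second:
 *
 *	if (xrep_newbt_unused_blocks(&xnr) < second_bload.nr_blocks)
 *		return -ENOSPC;
 */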