// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_shared.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_btree.h"
#include "xfs_rmap.h"
#include "xfs_alloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_extent_busy.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_log.h"
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_bmap.h"
#include "xfs_health.h"
#include "xfs_extfree_item.h"
struct kmem_cache	*xfs_extfree_item_cache;

struct workqueue_struct *xfs_alloc_wq;

#define XFS_ABSDIFF(a,b)	(((a) <= (b)) ? ((b) - (a)) : ((a) - (b)))

#define	XFSA_FIXUP_BNO_OK	1
#define	XFSA_FIXUP_CNT_OK	2
/*
 * Size of the AGFL.  For CRC-enabled filesystems we steal a couple of slots in
 * the beginning of the block for a proper header with the location information
 * and CRC.
 */
unsigned int
xfs_agfl_size(
	struct xfs_mount	*mp)
{
	unsigned int		size = mp->m_sb.sb_sectsize;

	if (xfs_has_crc(mp))
		size -= sizeof(struct xfs_agfl);

	return size / sizeof(xfs_agblock_t);
}
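
/*
 * Illustrative worked example (not part of the original file): on a v5 (CRC)
 * filesystem with 512-byte sectors, a 36-byte struct xfs_agfl header is
 * carved off the sector first, so the free list holds roughly
 * (512 - 36) / sizeof(xfs_agblock_t) = 119 entries.  The exact count depends
 * on the geometry; treat this as a sketch of the arithmetic above.
 */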
/* Compute the AG block number where the refcount btree root block lives. */
unsigned int
xfs_refc_block(
	struct xfs_mount	*mp)
{
	if (xfs_has_rmapbt(mp))
		return XFS_RMAP_BLOCK(mp) + 1;
	if (xfs_has_finobt(mp))
		return XFS_FIBT_BLOCK(mp) + 1;
	return XFS_IBT_BLOCK(mp) + 1;
}

/* Return the first block in the AG available for general allocation. */
xfs_agblock_t
xfs_prealloc_blocks(
	struct xfs_mount	*mp)
{
	if (xfs_has_reflink(mp))
		return xfs_refc_block(mp) + 1;
	if (xfs_has_rmapbt(mp))
		return XFS_RMAP_BLOCK(mp) + 1;
	if (xfs_has_finobt(mp))
		return XFS_FIBT_BLOCK(mp) + 1;
	return XFS_IBT_BLOCK(mp) + 1;
}
/*
 * The number of blocks per AG that we withhold from xfs_dec_fdblocks to
 * guarantee that we can refill the AGFL prior to allocating space in a nearly
 * full AG.  Although the space described by the free space btrees, the
 * blocks used by the freesp btrees themselves, and the blocks owned by the
 * AGFL are counted in the ondisk fdblocks, it's a mistake to let the ondisk
 * free space in the AG drop so low that the free space btrees cannot refill an
 * empty AGFL up to the minimum level.  Rather than grind through empty AGs
 * until the fs goes down, we subtract this many AG blocks from the incore
 * fdblocks to ensure user allocation does not overcommit the space the
 * filesystem needs for the AGFLs.  The rmap btree uses a per-AG reservation to
 * withhold space from xfs_dec_fdblocks, so we do not account for that here.
 */
#define XFS_ALLOCBT_AGFL_RESERVE	4

/*
 * Compute the number of blocks that we set aside to guarantee the ability to
 * refill the AGFL and handle a full bmap btree split.
 *
 * In order to avoid ENOSPC-related deadlock caused by out-of-order locking of
 * AGF buffer (PV 947395), we place constraints on the relationship among
 * actual allocations for data blocks, freelist blocks, and potential file data
 * bmap btree blocks.  However, these restrictions may result in no actual space
 * allocated for a delayed extent, for example, a data block in a certain AG is
 * allocated but there is no additional block for the additional bmap btree
 * block due to a split of the bmap btree of the file.  The result of this may
 * lead to an infinite loop when the file gets flushed to disk and all delayed
 * extents need to be actually allocated.  To get around this, we explicitly set
 * aside a few blocks which will not be reserved in delayed allocation.
 *
 * For each AG, we need to reserve enough blocks to replenish a totally empty
 * AGFL and 4 more to handle a potential split of the file's bmap btree.
 */
unsigned int
xfs_alloc_set_aside(
	struct xfs_mount	*mp)
{
	return mp->m_sb.sb_agcount * (XFS_ALLOCBT_AGFL_RESERVE + 4);
}
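
/*
 * Illustrative worked example (not part of the original file): on a
 * filesystem with sb_agcount = 4, the set-aside is
 * 4 * (XFS_ALLOCBT_AGFL_RESERVE + 4) = 4 * 8 = 32 blocks withheld from the
 * incore free block count.
 */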
/*
 * When deciding how much space to allocate out of an AG, we limit the
 * allocation maximum size to the size of the AG. However, we cannot use all
 * the blocks in the AG - some are permanently used by metadata. These
 * blocks are generally:
 *	- the AG superblock, AGF, AGI and AGFL
 *	- the AGF (bno and cnt) and AGI btree root blocks, and optionally
 *	  the AGI free inode and rmap btree root blocks.
 *	- blocks on the AGFL according to xfs_alloc_set_aside() limits
 *	- the rmapbt root block
 *
 * The AG headers are sector sized, so the amount of space they take up is
 * dependent on filesystem geometry. The others are all single blocks.
 */
unsigned int
xfs_alloc_ag_max_usable(
	struct xfs_mount	*mp)
{
	unsigned int		blocks;

	blocks = XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)); /* ag headers */
	blocks += XFS_ALLOCBT_AGFL_RESERVE;
	blocks += 3;			/* AGF, AGI btree root blocks */
	if (xfs_has_finobt(mp))
		blocks++;		/* finobt root block */
	if (xfs_has_rmapbt(mp))
		blocks++;		/* rmap root block */
	if (xfs_has_reflink(mp))
		blocks++;		/* refcount root block */

	return mp->m_sb.sb_agblocks - blocks;
}
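
/*
 * Illustrative worked example (not part of the original file): with 4k blocks
 * and 512-byte sectors, the four sector-sized AG headers fit in one
 * filesystem block, so an AG on a filesystem with the finobt, rmapbt and
 * reflink features enabled loses 1 + 4 + 3 + 1 + 1 + 1 = 11 blocks from its
 * usable maximum.  Treat the numbers as a sketch of the accounting above.
 */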
/* Look up [bno, len] in the given direction and track cursor active state. */
static int				/* error */
xfs_alloc_lookup(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	xfs_lookup_t		dir,	/* lookup direction */
	xfs_agblock_t		bno,	/* starting block of extent */
	xfs_extlen_t		len,	/* length of extent */
	int			*stat)	/* success/failure */
{
	int			error;

	cur->bc_rec.a.ar_startblock = bno;
	cur->bc_rec.a.ar_blockcount = len;
	error = xfs_btree_lookup(cur, dir, stat);
	if (*stat == 1)
		cur->bc_flags |= XFS_BTREE_ALLOCBT_ACTIVE;
	else
		cur->bc_flags &= ~XFS_BTREE_ALLOCBT_ACTIVE;
	return error;
}
/*
 * Lookup the record equal to [bno, len] in the btree given by cur.
 */
static inline int			/* error */
xfs_alloc_lookup_eq(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	xfs_agblock_t		bno,	/* starting block of extent */
	xfs_extlen_t		len,	/* length of extent */
	int			*stat)	/* success/failure */
{
	return xfs_alloc_lookup(cur, XFS_LOOKUP_EQ, bno, len, stat);
}

/*
 * Lookup the first record greater than or equal to [bno, len]
 * in the btree given by cur.
 */
static inline int			/* error */
xfs_alloc_lookup_ge(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	xfs_agblock_t		bno,	/* starting block of extent */
	xfs_extlen_t		len,	/* length of extent */
	int			*stat)	/* success/failure */
{
	return xfs_alloc_lookup(cur, XFS_LOOKUP_GE, bno, len, stat);
}

/*
 * Lookup the first record less than or equal to [bno, len]
 * in the btree given by cur.
 */
int					/* error */
xfs_alloc_lookup_le(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	xfs_agblock_t		bno,	/* starting block of extent */
	xfs_extlen_t		len,	/* length of extent */
	int			*stat)	/* success/failure */
{
	return xfs_alloc_lookup(cur, XFS_LOOKUP_LE, bno, len, stat);
}

static inline bool
xfs_alloc_cur_active(
	struct xfs_btree_cur	*cur)
{
	return cur && (cur->bc_flags & XFS_BTREE_ALLOCBT_ACTIVE);
}
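
/*
 * Illustrative sketch (not part of the original file): a typical caller
 * pattern for the lookup wrappers above, assuming an already-initialized
 * bnobt cursor.  The lookup positions the cursor and sets *stat to 1 only
 * when a matching record exists, so callers check both the error and stat.
 * The -ENOSPC policy for "no record" is a hypothetical choice for the sketch.
 */
static inline int
xfs_alloc_example_find_le(
	struct xfs_btree_cur	*bno_cur,
	xfs_agblock_t		target)
{
	int			stat = 0;
	int			error;

	/* Find the closest free extent starting at or before "target". */
	error = xfs_alloc_lookup_le(bno_cur, target, 0, &stat);
	if (error)
		return error;
	if (!stat)
		return -ENOSPC;
	return 0;
}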
/*
 * Update the record referred to by cur to the value given
 * by [bno, len].
 * This either works (return 0) or gets an EFSCORRUPTED error.
 */
STATIC int				/* error */
xfs_alloc_update(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	xfs_agblock_t		bno,	/* starting block of extent */
	xfs_extlen_t		len)	/* length of extent */
{
	union xfs_btree_rec	rec;

	rec.alloc.ar_startblock = cpu_to_be32(bno);
	rec.alloc.ar_blockcount = cpu_to_be32(len);
	return xfs_btree_update(cur, &rec);
}
/* Convert the ondisk btree record to its incore representation. */
void
xfs_alloc_btrec_to_irec(
	const union xfs_btree_rec	*rec,
	struct xfs_alloc_rec_incore	*irec)
{
	irec->ar_startblock = be32_to_cpu(rec->alloc.ar_startblock);
	irec->ar_blockcount = be32_to_cpu(rec->alloc.ar_blockcount);
}
/* Simple checks for free space records. */
xfs_failaddr_t
xfs_alloc_check_irec(
	struct xfs_perag			*pag,
	const struct xfs_alloc_rec_incore	*irec)
{
	if (irec->ar_blockcount == 0)
		return __this_address;

	/* check for valid extent range, including overflow */
	if (!xfs_verify_agbext(pag, irec->ar_startblock, irec->ar_blockcount))
		return __this_address;

	return NULL;
}
static inline int
xfs_alloc_complain_bad_rec(
	struct xfs_btree_cur		*cur,
	xfs_failaddr_t			fa,
	const struct xfs_alloc_rec_incore *irec)
{
	struct xfs_mount		*mp = cur->bc_mp;

	xfs_warn(mp,
		"%sbt record corruption in AG %d detected at %pS!",
		cur->bc_ops->name, cur->bc_group->xg_gno, fa);
	xfs_warn(mp,
		"start block 0x%x block count 0x%x", irec->ar_startblock,
		irec->ar_blockcount);
	xfs_btree_mark_sick(cur);
	return -EFSCORRUPTED;
}
/*
 * Get the data from the pointed-to record.
 */
int					/* error */
xfs_alloc_get_rec(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	xfs_agblock_t		*bno,	/* output: starting block of extent */
	xfs_extlen_t		*len,	/* output: length of extent */
	int			*stat)	/* output: success/failure */
{
	struct xfs_alloc_rec_incore irec;
	union xfs_btree_rec	*rec;
	xfs_failaddr_t		fa;
	int			error;

	error = xfs_btree_get_rec(cur, &rec, stat);
	if (error || !(*stat))
		return error;

	xfs_alloc_btrec_to_irec(rec, &irec);
	fa = xfs_alloc_check_irec(to_perag(cur->bc_group), &irec);
	if (fa)
		return xfs_alloc_complain_bad_rec(cur, fa, &irec);

	*bno = irec.ar_startblock;
	*len = irec.ar_blockcount;
	return 0;
}
/*
 * Compute aligned version of the found extent.
 * Takes alignment and min length into account.
 */
STATIC bool
xfs_alloc_compute_aligned(
	xfs_alloc_arg_t	*args,		/* allocation argument structure */
	xfs_agblock_t	foundbno,	/* starting block in found extent */
	xfs_extlen_t	foundlen,	/* length in found extent */
	xfs_agblock_t	*resbno,	/* result block number */
	xfs_extlen_t	*reslen,	/* result length */
	unsigned	*busy_gen)
{
	xfs_agblock_t	bno = foundbno;
	xfs_extlen_t	len = foundlen;
	xfs_extlen_t	diff;
	bool		busy;

	/* Trim busy sections out of found extent */
	busy = xfs_extent_busy_trim(pag_group(args->pag), args->minlen,
			args->maxlen, &bno, &len, busy_gen);

	/*
	 * If we have a largish extent that happens to start before min_agbno,
	 * see if we can shift it into range...
	 */
	if (bno < args->min_agbno && bno + len > args->min_agbno) {
		diff = args->min_agbno - bno;
		if (len > diff) {
			bno += diff;
			len -= diff;
		}
	}

	if (args->alignment > 1 && len >= args->minlen) {
		xfs_agblock_t	aligned_bno = roundup(bno, args->alignment);

		diff = aligned_bno - bno;

		*resbno = aligned_bno;
		*reslen = diff >= len ? 0 : len - diff;
	} else {
		*resbno = bno;
		*reslen = len;
	}

	return busy;
}
/*
 * Compute best start block and diff for "near" allocations.
 * freelen >= wantlen already checked by caller.
 */
STATIC xfs_extlen_t			/* difference value (absolute) */
xfs_alloc_compute_diff(
	xfs_agblock_t	wantbno,	/* target starting block */
	xfs_extlen_t	wantlen,	/* target length */
	xfs_extlen_t	alignment,	/* target alignment */
	int		datatype,	/* are we allocating data? */
	xfs_agblock_t	freebno,	/* freespace's starting block */
	xfs_extlen_t	freelen,	/* freespace's length */
	xfs_agblock_t	*newbnop)	/* result: best start block from free */
{
	xfs_agblock_t	freeend;	/* end of freespace extent */
	xfs_agblock_t	newbno1;	/* return block number */
	xfs_agblock_t	newbno2;	/* other new block number */
	xfs_extlen_t	newlen1 = 0;	/* length with newbno1 */
	xfs_extlen_t	newlen2 = 0;	/* length with newbno2 */
	xfs_agblock_t	wantend;	/* end of target extent */
	bool		userdata = datatype & XFS_ALLOC_USERDATA;

	ASSERT(freelen >= wantlen);
	freeend = freebno + freelen;
	wantend = wantbno + wantlen;
	/*
	 * We want to allocate from the start of a free extent if it is past
	 * the desired block or if we are allocating user data and the free
	 * extent is before desired block. The second case is there to allow
	 * for contiguous allocation from the remaining free space if the file
	 * grows in the short term.
	 */
	if (freebno >= wantbno || (userdata && freeend < wantend)) {
		if ((newbno1 = roundup(freebno, alignment)) >= freeend)
			newbno1 = NULLAGBLOCK;
	} else if (freeend >= wantend && alignment > 1) {
		newbno1 = roundup(wantbno, alignment);
		newbno2 = newbno1 - alignment;
		if (newbno1 >= freeend)
			newbno1 = NULLAGBLOCK;
		else
			newlen1 = XFS_EXTLEN_MIN(wantlen, freeend - newbno1);
		if (newbno2 < freebno)
			newbno2 = NULLAGBLOCK;
		else
			newlen2 = XFS_EXTLEN_MIN(wantlen, freeend - newbno2);
		if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) {
			if (newlen1 < newlen2 ||
			    (newlen1 == newlen2 &&
			     XFS_ABSDIFF(newbno1, wantbno) >
			     XFS_ABSDIFF(newbno2, wantbno)))
				newbno1 = newbno2;
		} else if (newbno2 != NULLAGBLOCK)
			newbno1 = newbno2;
	} else if (freeend >= wantend) {
		newbno1 = wantbno;
	} else if (alignment > 1) {
		newbno1 = roundup(freeend - wantlen, alignment);
		if (newbno1 > freeend - wantlen &&
		    newbno1 - alignment >= freebno)
			newbno1 -= alignment;
		else if (newbno1 >= freeend)
			newbno1 = NULLAGBLOCK;
	} else
		newbno1 = freeend - wantlen;
	*newbnop = newbno1;
	return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno);
}
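
/*
 * Illustrative worked example (not part of the original file): for a near
 * allocation with wantbno = 100, wantlen = 8, alignment = 4, non-userdata,
 * and a free extent [90, 40): freebno (90) is below wantbno and the free
 * extent covers the target end, so newbno1 = roundup(100, 4) = 100 and
 * newbno2 = 96.  Both candidates fit 8 blocks, newbno1 is closer to wantbno,
 * so the chosen start block is 100 with a diff of 0.
 */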
/*
 * Fix up the length, based on mod and prod.
 * len should be k * prod + mod for some k.
 * If len is too small it is returned unchanged.
 * If len hits maxlen it is left alone.
 */
STATIC void
xfs_alloc_fix_len(
	xfs_alloc_arg_t	*args)		/* allocation argument structure */
{
	xfs_extlen_t	k;
	xfs_extlen_t	rlen;

	ASSERT(args->mod < args->prod);
	rlen = args->len;
	ASSERT(rlen >= args->minlen);
	ASSERT(rlen <= args->maxlen);
	if (args->prod <= 1 || rlen < args->mod || rlen == args->maxlen ||
	    (args->mod == 0 && rlen < args->prod))
		return;
	k = rlen % args->prod;
	if (k == args->mod)
		return;
	if (k > args->mod)
		rlen = rlen - (k - args->mod);
	else
		rlen = rlen - args->prod + (args->mod - k);
	/* casts to (int) catch length underflows */
	if ((int)rlen < (int)args->minlen)
		return;
	ASSERT(rlen >= args->minlen && rlen <= args->maxlen);
	ASSERT(rlen % args->prod == args->mod);
	ASSERT(args->pag->pagf_freeblks + args->pag->pagf_flcount >=
		rlen + args->minleft);
	args->len = rlen;
}
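
/*
 * Illustrative worked example (not part of the original file): with prod = 4,
 * mod = 1 and a candidate length of 14, k = 14 % 4 = 2.  Since k > mod, the
 * length is trimmed to 14 - (2 - 1) = 13, which satisfies 13 = 3 * 4 + 1.
 */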
/*
 * Determine if the cursor points to the block that contains the right-most
 * block of records in the by-count btree. This block contains the largest
 * contiguous free extent in the AG, so if we modify a record in this block we
 * need to call xfs_alloc_fixup_longest() once the modifications are done to
 * ensure the agf->agf_longest field is kept up to date with the longest free
 * extent tracked by the by-count btree.
 */
static bool
xfs_alloc_cursor_at_lastrec(
	struct xfs_btree_cur	*cnt_cur)
{
	struct xfs_btree_block	*block;
	union xfs_btree_ptr	ptr;
	struct xfs_buf		*bp;

	block = xfs_btree_get_block(cnt_cur, 0, &bp);

	xfs_btree_get_sibling(cnt_cur, block, &ptr, XFS_BB_RIGHTSIB);
	return xfs_btree_ptr_is_null(cnt_cur, &ptr);
}
/*
 * Find the rightmost record of the cntbt, and return the longest free space
 * recorded in it. Simply set both the block number and the length to their
 * maximum values before searching.
 */
static int
xfs_cntbt_longest(
	struct xfs_btree_cur	*cnt_cur,
	xfs_extlen_t		*longest)
{
	struct xfs_alloc_rec_incore irec;
	union xfs_btree_rec	*rec;
	int			stat = 0;
	int			error;

	memset(&cnt_cur->bc_rec, 0xFF, sizeof(cnt_cur->bc_rec));
	error = xfs_btree_lookup(cnt_cur, XFS_LOOKUP_LE, &stat);
	if (error)
		return error;
	if (!stat) {
		/* totally empty tree */
		*longest = 0;
		return 0;
	}

	error = xfs_btree_get_rec(cnt_cur, &rec, &stat);
	if (error)
		return error;
	if (XFS_IS_CORRUPT(cnt_cur->bc_mp, !stat)) {
		xfs_btree_mark_sick(cnt_cur);
		return -EFSCORRUPTED;
	}

	xfs_alloc_btrec_to_irec(rec, &irec);
	*longest = irec.ar_blockcount;
	return 0;
}
/*
 * Update the longest contiguous free extent in the AG from the by-count cursor
 * that is passed to us. This should be done at the end of any allocation or
 * freeing operation that touches the longest extent in the btree.
 *
 * Needing to update the longest extent can be determined by calling
 * xfs_alloc_cursor_at_lastrec() after the cursor is positioned for record
 * modification but before the modification begins.
 */
static int
xfs_alloc_fixup_longest(
	struct xfs_btree_cur	*cnt_cur)
{
	struct xfs_perag	*pag = to_perag(cnt_cur->bc_group);
	struct xfs_buf		*bp = cnt_cur->bc_ag.agbp;
	struct xfs_agf		*agf = bp->b_addr;
	xfs_extlen_t		longest = 0;
	int			error;

	/* Lookup last rec in order to update AGF. */
	error = xfs_cntbt_longest(cnt_cur, &longest);
	if (error)
		return error;

	pag->pagf_longest = longest;
	agf->agf_longest = cpu_to_be32(pag->pagf_longest);
	xfs_alloc_log_agf(cnt_cur->bc_tp, bp, XFS_AGF_LONGEST);

	return 0;
}
/*
 * Update the two btrees, logically removing from freespace the extent
 * starting at rbno, rlen blocks.  The extent is contained within the
 * actual (current) free extent fbno for flen blocks.
 * Flags are passed in indicating whether the cursors are set to the
 * relevant records.
 */
STATIC int				/* error code */
xfs_alloc_fixup_trees(
	struct xfs_btree_cur *cnt_cur,	/* cursor for by-size btree */
	struct xfs_btree_cur *bno_cur,	/* cursor for by-block btree */
	xfs_agblock_t	fbno,		/* starting block of free extent */
	xfs_extlen_t	flen,		/* length of free extent */
	xfs_agblock_t	rbno,		/* starting block of returned extent */
	xfs_extlen_t	rlen,		/* length of returned extent */
	int		flags)		/* flags, XFSA_FIXUP_... */
{
	int		error;		/* error code */
	int		i;		/* operation results */
	xfs_agblock_t	nfbno1;		/* first new free startblock */
	xfs_agblock_t	nfbno2;		/* second new free startblock */
	xfs_extlen_t	nflen1 = 0;	/* first new free length */
	xfs_extlen_t	nflen2 = 0;	/* second new free length */
	struct xfs_mount *mp;
	bool		fixup_longest = false;

	mp = cnt_cur->bc_mp;

	/*
	 * Look up the record in the by-size tree if necessary.
	 */
	if (flags & XFSA_FIXUP_CNT_OK) {
#ifdef DEBUG
		if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp,
				   i != 1 ||
				   nfbno1 != fbno ||
				   nflen1 != flen)) {
			xfs_btree_mark_sick(cnt_cur);
			return -EFSCORRUPTED;
		}
#endif
	} else {
		if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(cnt_cur);
			return -EFSCORRUPTED;
		}
	}
	/*
	 * Look up the record in the by-block tree if necessary.
	 */
	if (flags & XFSA_FIXUP_BNO_OK) {
#ifdef DEBUG
		if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp,
				   i != 1 ||
				   nfbno1 != fbno ||
				   nflen1 != flen)) {
			xfs_btree_mark_sick(bno_cur);
			return -EFSCORRUPTED;
		}
#endif
	} else {
		if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(bno_cur);
			return -EFSCORRUPTED;
		}
	}

#ifdef DEBUG
	if (bno_cur->bc_nlevels == 1 && cnt_cur->bc_nlevels == 1) {
		struct xfs_btree_block	*bnoblock;
		struct xfs_btree_block	*cntblock;

		bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_levels[0].bp);
		cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_levels[0].bp);

		if (XFS_IS_CORRUPT(mp,
				   bnoblock->bb_numrecs !=
				   cntblock->bb_numrecs)) {
			xfs_btree_mark_sick(bno_cur);
			return -EFSCORRUPTED;
		}
	}
#endif

	/*
	 * Deal with all four cases: the allocated record is contained
	 * within the freespace record, so we can have new freespace
	 * at either (or both) end, or no freespace remaining.
	 */
	if (rbno == fbno && rlen == flen)
		nfbno1 = nfbno2 = NULLAGBLOCK;
	else if (rbno == fbno) {
		nfbno1 = rbno + rlen;
		nflen1 = flen - rlen;
		nfbno2 = NULLAGBLOCK;
	} else if (rbno + rlen == fbno + flen) {
		nfbno1 = fbno;
		nflen1 = flen - rlen;
		nfbno2 = NULLAGBLOCK;
	} else {
		nfbno1 = fbno;
		nflen1 = rbno - fbno;
		nfbno2 = rbno + rlen;
		nflen2 = (fbno + flen) - nfbno2;
	}

	if (xfs_alloc_cursor_at_lastrec(cnt_cur))
		fixup_longest = true;

	/*
	 * Delete the entry from the by-size btree.
	 */
	if ((error = xfs_btree_delete(cnt_cur, &i)))
		return error;
	if (XFS_IS_CORRUPT(mp, i != 1)) {
		xfs_btree_mark_sick(cnt_cur);
		return -EFSCORRUPTED;
	}
	/*
	 * Add new by-size btree entry(s).
	 */
	if (nfbno1 != NULLAGBLOCK) {
		if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 0)) {
			xfs_btree_mark_sick(cnt_cur);
			return -EFSCORRUPTED;
		}
		if ((error = xfs_btree_insert(cnt_cur, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(cnt_cur);
			return -EFSCORRUPTED;
		}
	}
	if (nfbno2 != NULLAGBLOCK) {
		if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 0)) {
			xfs_btree_mark_sick(cnt_cur);
			return -EFSCORRUPTED;
		}
		if ((error = xfs_btree_insert(cnt_cur, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(cnt_cur);
			return -EFSCORRUPTED;
		}
	}
	/*
	 * Fix up the by-block btree entry(s).
	 */
	if (nfbno1 == NULLAGBLOCK) {
		/*
		 * No remaining freespace, just delete the by-block tree entry.
		 */
		if ((error = xfs_btree_delete(bno_cur, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(bno_cur);
			return -EFSCORRUPTED;
		}
	} else {
		/*
		 * Update the by-block entry to start later|be shorter.
		 */
		if ((error = xfs_alloc_update(bno_cur, nfbno1, nflen1)))
			return error;
	}
	if (nfbno2 != NULLAGBLOCK) {
		/*
		 * 2 resulting free entries, need to add one.
		 */
		if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 0)) {
			xfs_btree_mark_sick(bno_cur);
			return -EFSCORRUPTED;
		}
		if ((error = xfs_btree_insert(bno_cur, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(bno_cur);
			return -EFSCORRUPTED;
		}
	}

	if (fixup_longest)
		return xfs_alloc_fixup_longest(cnt_cur);

	return 0;
}
/*
 * We do not verify the AGFL contents against AGF-based index counters here,
 * even though we may have access to the perag that contains shadow copies. We
 * don't know if the AGF based counters have been checked, and if they have
 * they still may be inconsistent because they haven't yet been reset on the
 * first allocation after the AGF has been read in.
 *
 * This means we can only check that all agfl entries contain valid or null
 * values because we can't reliably determine the active range to exclude
 * NULLAGBNO as a valid value.
 *
 * However, we can't even do that for v4 format filesystems because there are
 * old versions of mkfs out there that do not initialise the AGFL to known,
 * verifiable values. Hence we can't tell the difference between an AGFL block
 * allocated by mkfs and a corrupted AGFL block here on v4 filesystems.
 *
 * As a result, we can only fully validate AGFL block numbers when we pull them
 * from the freelist in xfs_alloc_get_freelist().
 */
static xfs_failaddr_t
xfs_agfl_verify(
	struct xfs_buf	*bp)
{
	struct xfs_mount *mp = bp->b_mount;
	struct xfs_agfl	*agfl = XFS_BUF_TO_AGFL(bp);
	__be32		*agfl_bno = xfs_buf_to_agfl_bno(bp);
	int		i;

	if (!xfs_has_crc(mp))
		return NULL;

	if (!xfs_verify_magic(bp, agfl->agfl_magicnum))
		return __this_address;
	if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid))
		return __this_address;
	/*
	 * during growfs operations, the perag is not fully initialised,
	 * so we can't use it for any useful checking. growfs ensures we can't
	 * use it by using uncached buffers that don't have the perag attached
	 * so we can detect and avoid this problem.
	 */
	if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != pag_agno((bp->b_pag)))
		return __this_address;

	for (i = 0; i < xfs_agfl_size(mp); i++) {
		if (be32_to_cpu(agfl_bno[i]) != NULLAGBLOCK &&
		    be32_to_cpu(agfl_bno[i]) >= mp->m_sb.sb_agblocks)
			return __this_address;
	}

	if (!xfs_log_check_lsn(mp, be64_to_cpu(XFS_BUF_TO_AGFL(bp)->agfl_lsn)))
		return __this_address;
	return NULL;
}
static void
xfs_agfl_read_verify(
	struct xfs_buf	*bp)
{
	struct xfs_mount *mp = bp->b_mount;
	xfs_failaddr_t	fa;

	/*
	 * There is no verification of non-crc AGFLs because mkfs does not
	 * initialise the AGFL to zero or NULL. Hence the only valid part of
	 * the AGFL is what the AGF says is active. We can't get to the AGF,
	 * so we can't verify just those entries are valid.
	 */
	if (!xfs_has_crc(mp))
		return;

	if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF))
		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
	else {
		fa = xfs_agfl_verify(bp);
		if (fa)
			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
	}
}
static void
xfs_agfl_write_verify(
	struct xfs_buf	*bp)
{
	struct xfs_mount	*mp = bp->b_mount;
	struct xfs_buf_log_item	*bip = bp->b_log_item;
	xfs_failaddr_t		fa;

	/* no verification of non-crc AGFLs */
	if (!xfs_has_crc(mp))
		return;

	fa = xfs_agfl_verify(bp);
	if (fa) {
		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
		return;
	}

	if (bip)
		XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn);

	xfs_buf_update_cksum(bp, XFS_AGFL_CRC_OFF);
}
const struct xfs_buf_ops xfs_agfl_buf_ops = {
	.name = "xfs_agfl",
	.magic = { cpu_to_be32(XFS_AGFL_MAGIC), cpu_to_be32(XFS_AGFL_MAGIC) },
	.verify_read = xfs_agfl_read_verify,
	.verify_write = xfs_agfl_write_verify,
	.verify_struct = xfs_agfl_verify,
};
/*
 * Read in the allocation group free block array.
 */
int
xfs_alloc_read_agfl(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	struct xfs_buf		**bpp)
{
	struct xfs_mount	*mp = pag_mount(pag);
	struct xfs_buf		*bp;
	int			error;

	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
			XFS_AG_DADDR(mp, pag_agno(pag), XFS_AGFL_DADDR(mp)),
			XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
	if (xfs_metadata_is_sick(error))
		xfs_ag_mark_sick(pag, XFS_SICK_AG_AGFL);
	if (error)
		return error;
	xfs_buf_set_ref(bp, XFS_AGFL_REF);
	*bpp = bp;
	return 0;
}
static int
xfs_alloc_update_counters(
	struct xfs_trans	*tp,
	struct xfs_buf		*agbp,
	long			len)
{
	struct xfs_agf		*agf = agbp->b_addr;

	agbp->b_pag->pagf_freeblks += len;
	be32_add_cpu(&agf->agf_freeblks, len);

	if (unlikely(be32_to_cpu(agf->agf_freeblks) >
		     be32_to_cpu(agf->agf_length))) {
		xfs_buf_mark_corrupt(agbp);
		xfs_ag_mark_sick(agbp->b_pag, XFS_SICK_AG_AGF);
		return -EFSCORRUPTED;
	}

	xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
	return 0;
}
/*
 * Block allocation algorithm and data structures.
 */
struct xfs_alloc_cur {
	struct xfs_btree_cur	*cnt;	/* btree cursors */
	struct xfs_btree_cur	*bnolt;
	struct xfs_btree_cur	*bnogt;
	xfs_extlen_t		cur_len;/* current search length */
	xfs_agblock_t		rec_bno;/* extent startblock */
	xfs_extlen_t		rec_len;/* extent length */
	xfs_agblock_t		bno;	/* alloc bno */
	xfs_extlen_t		len;	/* alloc len */
	xfs_extlen_t		diff;	/* diff from search bno */
	unsigned int		busy_gen;/* busy state */
	bool			busy;
};
/*
 * Set up cursors, etc. in the extent allocation cursor. This function can be
 * called multiple times to reset an initialized structure without having to
 * reallocate cursors.
 */
static int
xfs_alloc_cur_setup(
	struct xfs_alloc_arg	*args,
	struct xfs_alloc_cur	*acur)
{
	int			error;
	int			i;

	acur->cur_len = args->maxlen;
	acur->rec_bno = 0;
	acur->rec_len = 0;
	acur->bno = 0;
	acur->len = 0;
	acur->diff = -1;
	acur->busy = false;
	acur->busy_gen = 0;

	/*
	 * Perform an initial cntbt lookup to check for availability of maxlen
	 * extents. If this fails, we'll return -ENOSPC to signal the caller to
	 * attempt a small allocation.
	 */
	if (!acur->cnt)
		acur->cnt = xfs_cntbt_init_cursor(args->mp, args->tp,
					args->agbp, args->pag);
	error = xfs_alloc_lookup_ge(acur->cnt, 0, args->maxlen, &i);
	if (error)
		return error;

	/*
	 * Allocate the bnobt left and right search cursors.
	 */
	if (!acur->bnolt)
		acur->bnolt = xfs_bnobt_init_cursor(args->mp, args->tp,
					args->agbp, args->pag);
	if (!acur->bnogt)
		acur->bnogt = xfs_bnobt_init_cursor(args->mp, args->tp,
					args->agbp, args->pag);
	return i == 1 ? 0 : -ENOSPC;
}
static void
xfs_alloc_cur_close(
	struct xfs_alloc_cur	*acur,
	bool			error)
{
	int			cur_error = XFS_BTREE_NOERROR;

	if (error)
		cur_error = XFS_BTREE_ERROR;

	if (acur->cnt)
		xfs_btree_del_cursor(acur->cnt, cur_error);
	if (acur->bnolt)
		xfs_btree_del_cursor(acur->bnolt, cur_error);
	if (acur->bnogt)
		xfs_btree_del_cursor(acur->bnogt, cur_error);
	acur->cnt = acur->bnolt = acur->bnogt = NULL;
}
/*
 * Check an extent for allocation and track the best available candidate in the
 * allocation structure. The cursor is deactivated if it has entered an out of
 * range state based on allocation arguments. Optionally return the extent
 * geometry and allocation status if requested by the caller.
 */
static int
xfs_alloc_cur_check(
	struct xfs_alloc_arg	*args,
	struct xfs_alloc_cur	*acur,
	struct xfs_btree_cur	*cur,
	int			*new)
{
	int			error, i;
	xfs_agblock_t		bno, bnoa, bnew;
	xfs_extlen_t		len, lena, diff = -1;
	bool			busy;
	unsigned		busy_gen = 0;
	bool			deactivate = false;
	bool			isbnobt = xfs_btree_is_bno(cur->bc_ops);

	*new = 0;

	error = xfs_alloc_get_rec(cur, &bno, &len, &i);
	if (error)
		return error;
	if (XFS_IS_CORRUPT(args->mp, i != 1)) {
		xfs_btree_mark_sick(cur);
		return -EFSCORRUPTED;
	}

	/*
	 * Check minlen and deactivate a cntbt cursor if out of acceptable size
	 * range (i.e., walking backwards looking for a minlen extent).
	 */
	if (len < args->minlen) {
		deactivate = !isbnobt;
		goto out;
	}

	busy = xfs_alloc_compute_aligned(args, bno, len, &bnoa, &lena,
					 &busy_gen);
	acur->busy |= busy;
	if (busy)
		acur->busy_gen = busy_gen;
	/* deactivate a bnobt cursor outside of locality range */
	if (bnoa < args->min_agbno || bnoa > args->max_agbno) {
		deactivate = isbnobt;
		goto out;
	}
	if (lena < args->minlen)
		goto out;

	args->len = XFS_EXTLEN_MIN(lena, args->maxlen);
	xfs_alloc_fix_len(args);
	ASSERT(args->len >= args->minlen);
	if (args->len < acur->len)
		goto out;

	/*
	 * We have an aligned record that satisfies minlen and beats or matches
	 * the candidate extent size. Compare locality for near allocation mode.
	 */
	diff = xfs_alloc_compute_diff(args->agbno, args->len,
				      args->alignment, args->datatype,
				      bnoa, lena, &bnew);
	if (bnew == NULLAGBLOCK)
		goto out;

	/*
	 * Deactivate a bnobt cursor with worse locality than the current best.
	 */
	if (diff > acur->diff) {
		deactivate = isbnobt;
		goto out;
	}

	ASSERT(args->len > acur->len ||
	       (args->len == acur->len && diff <= acur->diff));
	acur->rec_bno = bno;
	acur->rec_len = len;
	acur->bno = bnew;
	acur->len = args->len;
	acur->diff = diff;
	*new = 1;

	/*
	 * We're done if we found a perfect allocation. This only deactivates
	 * the current cursor, but this is just an optimization to terminate a
	 * cntbt search that otherwise runs to the edge of the tree.
	 */
	if (acur->diff == 0 && acur->len == args->maxlen)
		deactivate = true;
out:
	if (deactivate)
		cur->bc_flags &= ~XFS_BTREE_ALLOCBT_ACTIVE;
	trace_xfs_alloc_cur_check(cur, bno, len, diff, *new);
	return 0;
}
/*
 * Complete an allocation of a candidate extent. Remove the extent from both
 * trees and update the args structure.
 */
STATIC int
xfs_alloc_cur_finish(
	struct xfs_alloc_arg	*args,
	struct xfs_alloc_cur	*acur)
{
	int			error;

	ASSERT(acur->cnt && acur->bnolt);
	ASSERT(acur->bno >= acur->rec_bno);
	ASSERT(acur->bno + acur->len <= acur->rec_bno + acur->rec_len);
	ASSERT(xfs_verify_agbext(args->pag, acur->rec_bno, acur->rec_len));

	error = xfs_alloc_fixup_trees(acur->cnt, acur->bnolt, acur->rec_bno,
				      acur->rec_len, acur->bno, acur->len, 0);
	if (error)
		return error;

	args->agbno = acur->bno;
	args->len = acur->len;
	args->wasfromfl = 0;

	trace_xfs_alloc_cur(args);
	return 0;
}
/*
 * Locality allocation lookup algorithm. This expects a cntbt cursor and uses
 * bno optimized lookup to search for extents with ideal size and locality.
 */
STATIC int
xfs_alloc_cntbt_iter(
	struct xfs_alloc_arg		*args,
	struct xfs_alloc_cur		*acur)
{
	struct xfs_btree_cur	*cur = acur->cnt;
	xfs_agblock_t		bno;
	xfs_extlen_t		len, cur_len;
	int			error;
	int			i;

	if (!xfs_alloc_cur_active(cur))
		return 0;

	/* locality optimized lookup */
	cur_len = acur->cur_len;
	error = xfs_alloc_lookup_ge(cur, args->agbno, cur_len, &i);
	if (error)
		return error;
	if (i == 0)
		return 0;
	error = xfs_alloc_get_rec(cur, &bno, &len, &i);
	if (error)
		return error;

	/* check the current record and update search length from it */
	error = xfs_alloc_cur_check(args, acur, cur, &i);
	if (error)
		return error;
	ASSERT(len >= acur->cur_len);
	acur->cur_len = len;

	/*
	 * We looked up the first record >= [agbno, len] above. The agbno is a
	 * secondary key and so the current record may lie just before or after
	 * agbno. If it is past agbno, check the previous record too so long as
	 * the length matches as it may be closer. Don't check a smaller record
	 * because that could deactivate our cursor.
	 */
	if (bno > args->agbno) {
		error = xfs_btree_decrement(cur, 0, &i);
		if (!error && i) {
			error = xfs_alloc_get_rec(cur, &bno, &len, &i);
			if (!error && i && len == acur->cur_len)
				error = xfs_alloc_cur_check(args, acur, cur,
							    &i);
		}
		if (error)
			return error;
	}

	/*
	 * Increment the search key until we find at least one allocation
	 * candidate or if the extent we found was larger. Otherwise, double the
	 * search key to optimize the search. Efficiency is more important here
	 * than absolute best locality.
	 */
	cur_len <<= 1;
	if (!acur->len || acur->cur_len >= cur_len)
		acur->cur_len++;
	else
		acur->cur_len = cur_len;
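
	/*
	 * Illustrative note (not part of the original file): starting from a
	 * search key of 1 with no candidate yet found, the doubling rule above
	 * grows the key 1, 2, 4, 8, ..., so a cntbt whose largest extent is
	 * 1024 blocks is covered in roughly ten lookups rather than a
	 * record-by-record walk.
	 */

	return error;
}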
/*
 * Deal with the case where only small freespaces remain. Either return the
 * contents of the last freespace record, or allocate space from the freelist
 * if there is nothing in the tree.
 */
STATIC int			/* error */
xfs_alloc_ag_vextent_small(
	struct xfs_alloc_arg	*args,	/* allocation argument structure */
	struct xfs_btree_cur	*ccur,	/* optional by-size cursor */
	xfs_agblock_t		*fbnop,	/* result block number */
	xfs_extlen_t		*flenp,	/* result length */
	int			*stat)	/* status: 0-freelist, 1-normal/none */
{
	struct xfs_agf		*agf = args->agbp->b_addr;
	int			error = 0;
	xfs_agblock_t		fbno = NULLAGBLOCK;
	xfs_extlen_t		flen = 0;
	int			i = 0;

	/*
	 * If a cntbt cursor is provided, try to allocate the largest record in
	 * the tree. Try the AGFL if the cntbt is empty, otherwise fail the
	 * allocation. Make sure to respect minleft even when pulling from the
	 * freelist.
	 */
	if (ccur)
		error = xfs_btree_decrement(ccur, 0, &i);
	if (error)
		goto error;
	if (i) {
		error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i);
		if (error)
			goto error;
		if (XFS_IS_CORRUPT(args->mp, i != 1)) {
			xfs_btree_mark_sick(ccur);
			error = -EFSCORRUPTED;
			goto error;
		}
		goto out;
	}

	if (args->minlen != 1 || args->alignment != 1 ||
	    args->resv == XFS_AG_RESV_AGFL ||
	    be32_to_cpu(agf->agf_flcount) <= args->minleft)
		goto out;

	error = xfs_alloc_get_freelist(args->pag, args->tp, args->agbp,
			&fbno, 0);
	if (error)
		goto error;
	if (fbno == NULLAGBLOCK)
		goto out;

	xfs_extent_busy_reuse(pag_group(args->pag), fbno, 1,
			      (args->datatype & XFS_ALLOC_NOBUSY));

	if (args->datatype & XFS_ALLOC_USERDATA) {
		struct xfs_buf	*bp;

		error = xfs_trans_get_buf(args->tp, args->mp->m_ddev_targp,
				xfs_agbno_to_daddr(args->pag, fbno),
				args->mp->m_bsize, 0, &bp);
		if (error)
			goto error;
		xfs_trans_binval(args->tp, bp);
	}
	*fbnop = args->agbno = fbno;
	*flenp = args->len = 1;
	if (XFS_IS_CORRUPT(args->mp, fbno >= be32_to_cpu(agf->agf_length))) {
		xfs_btree_mark_sick(ccur);
		error = -EFSCORRUPTED;
		goto error;
	}
	args->wasfromfl = 1;
	trace_xfs_alloc_small_freelist(args);

	/*
	 * If we're feeding an AGFL block to something that doesn't live in the
	 * free space, we need to clear out the OWN_AG rmap.
	 */
	error = xfs_rmap_free(args->tp, args->agbp, args->pag, fbno, 1,
			      &XFS_RMAP_OINFO_AG);
	if (error)
		goto error;

	*stat = 0;
	return 0;

out:
	/*
	 * Can't do the allocation, give up.
	 */
	if (flen < args->minlen) {
		args->agbno = NULLAGBLOCK;
		trace_xfs_alloc_small_notenough(args);
		flen = 0;
	}
	*fbnop = fbno;
	*flenp = flen;
	*stat = 1;
	trace_xfs_alloc_small_done(args);
	return 0;

error:
	trace_xfs_alloc_small_error(args);
	return error;
}
/*
 * Allocate a variable extent at exactly agno/bno.
 * Extent's length (returned in *len) will be between minlen and maxlen,
 * and of the form k * prod + mod unless there's nothing that large.
 * Return the starting a.g. block (bno), or NULLAGBLOCK if we can't do it.
 */
STATIC int			/* error */
xfs_alloc_ag_vextent_exact(
	xfs_alloc_arg_t	*args)	/* allocation argument structure */
{
	struct xfs_btree_cur *bno_cur;/* by block-number btree cursor */
	struct xfs_btree_cur *cnt_cur;/* by count btree cursor */
	int		error;
	xfs_agblock_t	fbno;	/* start block of found extent */
	xfs_extlen_t	flen;	/* length of found extent */
	xfs_agblock_t	tbno;	/* start block of busy extent */
	xfs_extlen_t	tlen;	/* length of busy extent */
	xfs_agblock_t	tend;	/* end block of busy extent */
	int		i;	/* success/failure of operation */
	unsigned	busy_gen;

	ASSERT(args->alignment == 1);

	/*
	 * Allocate/initialize a cursor for the by-number freespace btree.
	 */
	bno_cur = xfs_bnobt_init_cursor(args->mp, args->tp, args->agbp,
					args->pag);

	/*
	 * Lookup bno and minlen in the btree (minlen is irrelevant, really).
	 * Look for the closest free block <= bno, it must contain bno
	 * if any free block does.
	 */
	error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i);
	if (error)
		goto error0;
	if (!i)
		goto not_found;

	/*
	 * Grab the freespace record.
	 */
	error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
	if (error)
		goto error0;
	if (XFS_IS_CORRUPT(args->mp, i != 1)) {
		xfs_btree_mark_sick(bno_cur);
		error = -EFSCORRUPTED;
		goto error0;
	}
	ASSERT(fbno <= args->agbno);

	/*
	 * Check for overlapping busy extents.
	 */
	tbno = fbno;
	tlen = flen;
	xfs_extent_busy_trim(pag_group(args->pag), args->minlen, args->maxlen,
			&tbno, &tlen, &busy_gen);

	/*
	 * Give up if the start of the extent is busy, or the freespace isn't
	 * long enough for the minimum request.
	 */
	if (tbno > args->agbno)
		goto not_found;
	if (tlen < args->minlen)
		goto not_found;
	tend = tbno + tlen;
	if (tend < args->agbno + args->minlen)
		goto not_found;

	/*
	 * End of extent will be smaller of the freespace end and the
	 * maximal requested end.
	 *
	 * Fix the length according to mod and prod if given.
	 */
	args->len = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen)
						- args->agbno;
	xfs_alloc_fix_len(args);
	ASSERT(args->agbno + args->len <= tend);

	/*
	 * We are allocating agbno for args->len
	 * Allocate/initialize a cursor for the by-size btree.
	 */
	cnt_cur = xfs_cntbt_init_cursor(args->mp, args->tp, args->agbp,
					args->pag);
	ASSERT(xfs_verify_agbext(args->pag, args->agbno, args->len));
	error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
				      args->len, XFSA_FIXUP_BNO_OK);
	if (error) {
		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
		goto error0;
	}

	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);

	args->wasfromfl = 0;
	trace_xfs_alloc_exact_done(args);
	return 0;

not_found:
	/* Didn't find it, return null. */
	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
	args->agbno = NULLAGBLOCK;
	trace_xfs_alloc_exact_notfound(args);
	return 0;

error0:
	xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
	trace_xfs_alloc_exact_error(args);
	return error;
}
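
/*
 * Illustrative sketch (not part of the original file): the minimal argument
 * setup a caller might use to request exactly one block at @agbno via the
 * exact allocator above, assuming the higher-level allocator has already
 * filled in the transaction, mount, AGF buffer and perag fields of @args.
 */
static inline int
xfs_alloc_example_exact_one_block(
	struct xfs_alloc_arg	*args,
	xfs_agblock_t		agbno)
{
	args->agbno = agbno;		/* the only acceptable start block */
	args->minlen = 1;
	args->maxlen = 1;
	args->alignment = 1;		/* required by the exact allocator */
	args->prod = 1;
	args->mod = 0;
	return xfs_alloc_ag_vextent_exact(args);
}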
/*
 * Search a given number of btree records in a given direction. Check each
 * record against the good extent we've already found.
 */
STATIC int
xfs_alloc_walk_iter(
	struct xfs_alloc_arg	*args,
	struct xfs_alloc_cur	*acur,
	struct xfs_btree_cur	*cur,
	bool			increment,
	bool			find_one, /* quit on first candidate */
	int			count,	  /* rec count (-1 for infinite) */
	int			*stat)
{
	int			error;
	int			i;

	*stat = 0;

	/*
	 * Search so long as the cursor is active or we find a better extent.
	 * The cursor is deactivated if it extends beyond the range of the
	 * current allocation candidate.
	 */
	while (xfs_alloc_cur_active(cur) && count) {
		error = xfs_alloc_cur_check(args, acur, cur, &i);
		if (error)
			return error;
		if (i == 1) {
			*stat = 1;
			if (find_one)
				break;
		}
		if (!xfs_alloc_cur_active(cur))
			break;

		if (increment)
			error = xfs_btree_increment(cur, 0, &i);
		else
			error = xfs_btree_decrement(cur, 0, &i);
		if (error)
			return error;
		if (i == 0)
			cur->bc_flags &= ~XFS_BTREE_ALLOCBT_ACTIVE;

		if (count > 0)
			count--;
	}

	return 0;
}
/*
 * Search the by-bno and by-size btrees in parallel in search of an extent with
 * ideal locality based on the NEAR mode ->agbno locality hint.
 */
STATIC int
xfs_alloc_ag_vextent_locality(
	struct xfs_alloc_arg	*args,
	struct xfs_alloc_cur	*acur,
	int			*stat)
{
	struct xfs_btree_cur	*fbcur = NULL;
	int			error;
	int			i;
	bool			fbinc;

	ASSERT(acur->len == 0);

	*stat = 0;

	error = xfs_alloc_lookup_ge(acur->cnt, args->agbno, acur->cur_len, &i);
	if (error)
		return error;
	error = xfs_alloc_lookup_le(acur->bnolt, args->agbno, 0, &i);
	if (error)
		return error;
	error = xfs_alloc_lookup_ge(acur->bnogt, args->agbno, 0, &i);
	if (error)
		return error;

	/*
	 * Search the bnobt and cntbt in parallel. Search the bnobt left and
	 * right and lookup the closest extent to the locality hint for each
	 * extent size key in the cntbt. The entire search terminates
	 * immediately on a bnobt hit because that means we've found best case
	 * locality. Otherwise the search continues until the cntbt cursor runs
	 * off the end of the tree. If no allocation candidate is found at this
	 * point, give up on locality, walk backwards from the end of the cntbt
	 * and take the first available extent.
	 *
	 * The parallel tree searches balance each other out to provide fairly
	 * consistent performance for various situations. The bnobt search can
	 * have pathological behavior in the worst case scenario of larger
	 * allocation requests and fragmented free space. On the other hand, the
	 * bnobt is able to satisfy most smaller allocation requests much more
	 * quickly than the cntbt. The cntbt search can sift through fragmented
	 * free space and sets of free extents for larger allocation requests
	 * more quickly than the bnobt. Since the locality hint is just a hint
	 * and we don't want to scan the entire bnobt for perfect locality, the
	 * cntbt search essentially bounds the bnobt search such that we can
	 * find good enough locality at reasonable performance in most cases.
	 */
	while (xfs_alloc_cur_active(acur->bnolt) ||
	       xfs_alloc_cur_active(acur->bnogt) ||
	       xfs_alloc_cur_active(acur->cnt)) {

		trace_xfs_alloc_cur_lookup(args);

		/*
		 * Search the bnobt left and right. In the case of a hit, finish
		 * the search in the opposite direction and we're done.
		 */
		error = xfs_alloc_walk_iter(args, acur, acur->bnolt, false,
					    true, 1, &i);
		if (error)
			return error;
		if (i == 1) {
			trace_xfs_alloc_cur_left(args);
			fbcur = acur->bnogt;
			fbinc = true;
			break;
		}
		error = xfs_alloc_walk_iter(args, acur, acur->bnogt, true, true,
					    1, &i);
		if (error)
			return error;
		if (i == 1) {
			trace_xfs_alloc_cur_right(args);
			fbcur = acur->bnolt;
			fbinc = false;
			break;
		}

		/*
		 * Check the extent with best locality based on the current
		 * extent size search key and keep track of the best candidate.
		 */
		error = xfs_alloc_cntbt_iter(args, acur);
		if (error)
			return error;
		if (!xfs_alloc_cur_active(acur->cnt)) {
			trace_xfs_alloc_cur_lookup_done(args);
			break;
		}
	}

	/*
	 * If we failed to find anything due to busy extents, return empty
	 * handed so the caller can flush and retry. If no busy extents were
	 * found, walk backwards from the end of the cntbt as a last resort.
	 */
	if (!xfs_alloc_cur_active(acur->cnt) && !acur->len && !acur->busy) {
		error = xfs_btree_decrement(acur->cnt, 0, &i);
		if (error)
			return error;
		if (i) {
			acur->cnt->bc_flags |= XFS_BTREE_ALLOCBT_ACTIVE;
			fbcur = acur->cnt;
			fbinc = false;
		}
	}

	/*
	 * Search in the opposite direction for a better entry in the case of
	 * a bnobt hit or walk backwards from the end of the cntbt.
	 */
	if (fbcur) {
		error = xfs_alloc_walk_iter(args, acur, fbcur, fbinc, true, -1,
					    &i);
		if (error)
			return error;
	}

	if (acur->len)
		*stat = 1;

	return 0;
}
/* Check the last block of the cnt btree for allocations. */
static int
xfs_alloc_ag_vextent_lastblock(
	struct xfs_alloc_arg	*args,
	struct xfs_alloc_cur	*acur,
	xfs_agblock_t		*bno,
	xfs_extlen_t		*len,
	bool			*allocated)
{
	int			error;
	int			i;

#ifdef DEBUG
	/* Randomly don't execute the first algorithm. */
	if (get_random_u32_below(2))
		return 0;
#endif

	/*
	 * Start from the entry that lookup found, sequence through all larger
	 * free blocks.  If we're actually pointing at a record smaller than
	 * maxlen, go to the start of this block, and skip all those smaller
	 * than minlen.
	 */
	if (*len || args->alignment > 1) {
		acur->cnt->bc_levels[0].ptr = 1;
		do {
			error = xfs_alloc_get_rec(acur->cnt, bno, len, &i);
			if (error)
				return error;
			if (XFS_IS_CORRUPT(args->mp, i != 1)) {
				xfs_btree_mark_sick(acur->cnt);
				return -EFSCORRUPTED;
			}
			if (*len >= args->minlen)
				break;
			error = xfs_btree_increment(acur->cnt, 0, &i);
			if (error)
				return error;
		} while (i);
		ASSERT(*len >= args->minlen);
		if (!i)
			return 0;
	}

	error = xfs_alloc_walk_iter(args, acur, acur->cnt, true, false, -1, &i);
	if (error)
		return error;

	/*
	 * It didn't work. We COULD be in a case where there's a good record
	 * somewhere, so try again.
	 */
	if (acur->len == 0)
		return 0;

	trace_xfs_alloc_near_first(args);
	*allocated = true;
	return 0;
}
/*
 * Allocate a variable extent near bno in the allocation group agno.
 * Extent's length (returned in len) will be between minlen and maxlen,
 * and of the form k * prod + mod unless there's nothing that large.
 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
 */
STATIC int
xfs_alloc_ag_vextent_near(
	struct xfs_alloc_arg	*args,
	uint32_t		alloc_flags)
{
	struct xfs_alloc_cur	acur = {};
	int			error;		/* error code */
	int			i;		/* result code, temporary */
	xfs_agblock_t		bno;
	xfs_extlen_t		len;

	/* handle uninitialized agbno range so caller doesn't have to */
	if (!args->min_agbno && !args->max_agbno)
		args->max_agbno = args->mp->m_sb.sb_agblocks - 1;
	ASSERT(args->min_agbno <= args->max_agbno);

	/* clamp agbno to the range if it's outside */
	if (args->agbno < args->min_agbno)
		args->agbno = args->min_agbno;
	if (args->agbno > args->max_agbno)
		args->agbno = args->max_agbno;

	/* Retry once quickly if we find busy extents before blocking. */
	alloc_flags |= XFS_ALLOC_FLAG_TRYFLUSH;
restart:
	len = 0;

	/*
	 * Set up cursors and see if there are any free extents as big as
	 * maxlen. If not, pick the last entry in the tree unless the tree is
	 * empty.
	 */
	error = xfs_alloc_cur_setup(args, &acur);
	if (error == -ENOSPC) {
		error = xfs_alloc_ag_vextent_small(args, acur.cnt, &bno,
				&len, &i);
		if (error)
			goto out;
		if (i == 0 || len == 0) {
			trace_xfs_alloc_near_noentry(args);
			goto out;
		}
		ASSERT(i == 1);
	} else if (error) {
		goto out;
	}

	/*
	 * If the requested extent is large wrt the freespaces available
	 * in this a.g., then the cursor will be pointing to a btree entry
	 * near the right edge of the tree.  If it's in the last btree leaf
	 * block, then we just examine all the entries in that block
	 * that are big enough, and pick the best one.
	 */
	if (xfs_btree_islastblock(acur.cnt, 0)) {
		bool		allocated = false;

		error = xfs_alloc_ag_vextent_lastblock(args, &acur, &bno, &len,
				&allocated);
		if (error)
			goto out;
		if (allocated)
			goto alloc_finish;
	}

	/*
	 * Second algorithm. Combined cntbt and bnobt search to find ideal
	 * locality.
	 */
	error = xfs_alloc_ag_vextent_locality(args, &acur, &i);
	if (error)
		goto out;

	/*
	 * If we couldn't get anything, give up.
	 */
	if (!acur.len) {
		if (acur.busy) {
			/*
			 * Our only valid extents must have been busy. Flush and
			 * retry the allocation again. If we get an -EAGAIN
			 * error, we're being told that a deadlock was avoided
			 * and the current transaction needs committing before
			 * the allocation can be retried.
			 */
			trace_xfs_alloc_near_busy(args);
			error = xfs_extent_busy_flush(args->tp,
					pag_group(args->pag), acur.busy_gen,
					alloc_flags);
			if (error)
				goto out;

			alloc_flags &= ~XFS_ALLOC_FLAG_TRYFLUSH;
			goto restart;
		}
		trace_xfs_alloc_size_neither(args);
		args->agbno = NULLAGBLOCK;
		goto out;
	}

alloc_finish:
	/* fix up btrees on a successful allocation */
	error = xfs_alloc_cur_finish(args, &acur);

out:
	xfs_alloc_cur_close(&acur, error);
	return error;
}
/*
 * Allocate a variable extent anywhere in the allocation group agno.
 * Extent's length (returned in len) will be between minlen and maxlen,
 * and of the form k * prod + mod unless there's nothing that large.
 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
 */
STATIC int
xfs_alloc_ag_vextent_size(
	struct xfs_alloc_arg	*args,
	uint32_t		alloc_flags)
{
	struct xfs_agf		*agf = args->agbp->b_addr;
	struct xfs_btree_cur	*bno_cur;
	struct xfs_btree_cur	*cnt_cur;
	xfs_agblock_t		fbno;		/* start of found freespace */
	xfs_extlen_t		flen;		/* length of found freespace */
	xfs_agblock_t		rbno;		/* returned block number */
	xfs_extlen_t		rlen;		/* length of returned extent */
	bool			busy;
	unsigned		busy_gen;
	int			error;
	int			i;

	/* Retry once quickly if we find busy extents before blocking. */
	alloc_flags |= XFS_ALLOC_FLAG_TRYFLUSH;
restart:
	/*
	 * Allocate and initialize a cursor for the by-size btree.
	 */
	cnt_cur = xfs_cntbt_init_cursor(args->mp, args->tp, args->agbp,
					args->pag);
	bno_cur = NULL;

	/*
	 * Look for an entry >= maxlen+alignment-1 blocks.
	 */
	if ((error = xfs_alloc_lookup_ge(cnt_cur, 0,
			args->maxlen + args->alignment - 1, &i)))
		goto error0;

	/*
	 * If none then we have to settle for a smaller extent. In the case that
	 * there are no large extents, this will return the last entry in the
	 * tree unless the tree is empty. In the case that there are only busy
	 * large extents, this will return the largest small extent unless there
	 * are no smaller extents available.
	 */
	if (!i) {
		error = xfs_alloc_ag_vextent_small(args, cnt_cur,
						   &fbno, &flen, &i);
		if (error)
			goto error0;
		if (i == 0 || flen == 0) {
			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
			trace_xfs_alloc_size_noentry(args);
			return 0;
		}
		ASSERT(i == 1);
		busy = xfs_alloc_compute_aligned(args, fbno, flen, &rbno,
				&rlen, &busy_gen);
	} else {
		/*
		 * Search for a non-busy extent that is large enough.
		 */
		for (;;) {
			error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
			if (error)
				goto error0;
			if (XFS_IS_CORRUPT(args->mp, i != 1)) {
				xfs_btree_mark_sick(cnt_cur);
				error = -EFSCORRUPTED;
				goto error0;
			}

			busy = xfs_alloc_compute_aligned(args, fbno, flen,
					&rbno, &rlen, &busy_gen);

			if (rlen >= args->maxlen)
				break;

			error = xfs_btree_increment(cnt_cur, 0, &i);
			if (error)
				goto error0;
			if (i)
				continue;

			/*
			 * Our only valid extents must have been busy. Flush and
			 * retry the allocation again. If we get an -EAGAIN
			 * error, we're being told that a deadlock was avoided
			 * and the current transaction needs committing before
			 * the allocation can be retried.
			 */
			trace_xfs_alloc_size_busy(args);
			error = xfs_extent_busy_flush(args->tp,
					pag_group(args->pag), busy_gen,
					alloc_flags);
			if (error)
				goto error0;

			alloc_flags &= ~XFS_ALLOC_FLAG_TRYFLUSH;
			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
			goto restart;
		}
	}

	/*
	 * In the first case above, we got the last entry in the
	 * by-size btree.  Now we check to see if the space hits maxlen
	 * once aligned; if not, we search left for something better.
	 * This can't happen in the second case above.
	 */
	rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
	if (XFS_IS_CORRUPT(args->mp,
			   rlen != 0 &&
			   (rlen > flen ||
			    rbno + rlen > fbno + flen))) {
		xfs_btree_mark_sick(cnt_cur);
		error = -EFSCORRUPTED;
		goto error0;
	}
	if (rlen < args->maxlen) {
		xfs_agblock_t	bestfbno;
		xfs_extlen_t	bestflen;
		xfs_agblock_t	bestrbno;
		xfs_extlen_t	bestrlen;

		bestrlen = rlen;
		bestrbno = rbno;
		bestflen = flen;
		bestfbno = fbno;
		for (;;) {
			if ((error = xfs_btree_decrement(cnt_cur, 0, &i)))
				goto error0;
			if (i == 0)
				break;
			if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen,
					&i)))
				goto error0;
			if (XFS_IS_CORRUPT(args->mp, i != 1)) {
				xfs_btree_mark_sick(cnt_cur);
				error = -EFSCORRUPTED;
				goto error0;
			}
			if (flen <= bestrlen)
				break;
			busy = xfs_alloc_compute_aligned(args, fbno, flen,
					&rbno, &rlen, &busy_gen);
			rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
			if (XFS_IS_CORRUPT(args->mp,
					   rlen != 0 &&
					   (rlen > flen ||
					    rbno + rlen > fbno + flen))) {
				xfs_btree_mark_sick(cnt_cur);
				error = -EFSCORRUPTED;
				goto error0;
			}
			if (rlen > bestrlen) {
				bestrlen = rlen;
				bestrbno = rbno;
				bestflen = flen;
				bestfbno = fbno;
				if (rlen == args->maxlen)
					break;
			}
		}
		if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen,
				&i)))
			goto error0;
		if (XFS_IS_CORRUPT(args->mp, i != 1)) {
			xfs_btree_mark_sick(cnt_cur);
			error = -EFSCORRUPTED;
			goto error0;
		}
		rlen = bestrlen;
		rbno = bestrbno;
		flen = bestflen;
		fbno = bestfbno;
	}
	args->wasfromfl = 0;
	/*
	 * Fix up the length.
	 */
	args->len = rlen;
	if (rlen < args->minlen) {
		if (busy) {
			/*
			 * Our only valid extents must have been busy. Flush and
			 * retry the allocation again. If we get an -EAGAIN
			 * error, we're being told that a deadlock was avoided
			 * and the current transaction needs committing before
			 * the allocation can be retried.
			 */
			trace_xfs_alloc_size_busy(args);
			error = xfs_extent_busy_flush(args->tp,
					pag_group(args->pag), busy_gen,
					alloc_flags);
			if (error)
				goto error0;

			alloc_flags &= ~XFS_ALLOC_FLAG_TRYFLUSH;
			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
			goto restart;
		}
		goto out_nominleft;
	}
	xfs_alloc_fix_len(args);

	rlen = args->len;
	if (XFS_IS_CORRUPT(args->mp, rlen > flen)) {
		xfs_btree_mark_sick(cnt_cur);
		error = -EFSCORRUPTED;
		goto error0;
	}
	/*
	 * Allocate and initialize a cursor for the by-block tree.
	 */
	bno_cur = xfs_bnobt_init_cursor(args->mp, args->tp, args->agbp,
					args->pag);
	if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
			rbno, rlen, XFSA_FIXUP_CNT_OK)))
		goto error0;
	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
	cnt_cur = bno_cur = NULL;
	args->len = rlen;
	args->agbno = rbno;
	if (XFS_IS_CORRUPT(args->mp,
			   args->agbno + args->len >
			   be32_to_cpu(agf->agf_length))) {
		xfs_ag_mark_sick(args->pag, XFS_SICK_AG_BNOBT);
		error = -EFSCORRUPTED;
		goto error0;
	}
	trace_xfs_alloc_size_done(args);
	return 0;

error0:
	trace_xfs_alloc_size_error(args);
	if (cnt_cur)
		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
	if (bno_cur)
		xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
	return error;

out_nominleft:
	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
	trace_xfs_alloc_size_nominleft(args);
	args->agbno = NULLAGBLOCK;
	return 0;
}
/*
 * Free the extent starting at agno/bno for length.
 */
int
xfs_free_ag_extent(
	struct xfs_trans		*tp,
	struct xfs_buf			*agbp,
	xfs_agblock_t			bno,
	xfs_extlen_t			len,
	const struct xfs_owner_info	*oinfo,
	enum xfs_ag_resv_type		type)
{
	struct xfs_mount		*mp;
	struct xfs_btree_cur		*bno_cur;
	struct xfs_btree_cur		*cnt_cur;
	xfs_agblock_t			gtbno; /* start of right neighbor */
	xfs_extlen_t			gtlen; /* length of right neighbor */
	xfs_agblock_t			ltbno; /* start of left neighbor */
	xfs_extlen_t			ltlen; /* length of left neighbor */
	xfs_agblock_t			nbno; /* new starting block of freesp */
	xfs_extlen_t			nlen; /* new length of freespace */
	int				haveleft; /* have a left neighbor */
	int				haveright; /* have a right neighbor */
	int				i;
	int				error;
	struct xfs_perag		*pag = agbp->b_pag;
	bool				fixup_longest = false;

	bno_cur = cnt_cur = NULL;
	mp = tp->t_mountp;

	if (!xfs_rmap_should_skip_owner_update(oinfo)) {
		error = xfs_rmap_free(tp, agbp, pag, bno, len, oinfo);
		if (error)
			goto error0;
	}

	/*
	 * Allocate and initialize a cursor for the by-block btree.
	 */
	bno_cur = xfs_bnobt_init_cursor(mp, tp, agbp, pag);
	/*
	 * Look for a neighboring block on the left (lower block numbers)
	 * that is contiguous with this space.
	 */
	if ((error = xfs_alloc_lookup_le(bno_cur, bno, len, &haveleft)))
		goto error0;
	if (haveleft) {
		/*
		 * There is a block to our left.
		 */
		if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i)))
			goto error0;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(bno_cur);
			error = -EFSCORRUPTED;
			goto error0;
		}
		/*
		 * It's not contiguous, though.
		 */
		if (ltbno + ltlen < bno)
			haveleft = 0;
		else {
			/*
			 * If this failure happens the request to free this
			 * space was invalid, it's (partly) already free.
			 * Very bad.
			 */
			if (XFS_IS_CORRUPT(mp, ltbno + ltlen > bno)) {
				xfs_btree_mark_sick(bno_cur);
				error = -EFSCORRUPTED;
				goto error0;
			}
		}
	}
	/*
	 * Look for a neighboring block on the right (higher block numbers)
	 * that is contiguous with this space.
	 */
	if ((error = xfs_btree_increment(bno_cur, 0, &haveright)))
		goto error0;
	if (haveright) {
		/*
		 * There is a block to our right.
		 */
		if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i)))
			goto error0;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(bno_cur);
			error = -EFSCORRUPTED;
			goto error0;
		}
		/*
		 * It's not contiguous, though.
		 */
		if (bno + len < gtbno)
			haveright = 0;
		else {
			/*
			 * If this failure happens the request to free this
			 * space was invalid, it's (partly) already free.
			 * Very bad.
			 */
			if (XFS_IS_CORRUPT(mp, bno + len > gtbno)) {
				xfs_btree_mark_sick(bno_cur);
				error = -EFSCORRUPTED;
				goto error0;
			}
		}
	}
	/*
	 * Now allocate and initialize a cursor for the by-size tree.
	 */
	cnt_cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag);
	/*
	 * Have both left and right contiguous neighbors.
	 * Merge all three into a single free block.
	 */
	if (haveleft && haveright) {
		/*
		 * Delete the old by-size entry on the left.
		 */
		if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
			goto error0;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(cnt_cur);
			error = -EFSCORRUPTED;
			goto error0;
		}
		if ((error = xfs_btree_delete(cnt_cur, &i)))
			goto error0;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(cnt_cur);
			error = -EFSCORRUPTED;
			goto error0;
		}
		/*
		 * Delete the old by-size entry on the right.
		 */
		if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
			goto error0;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(cnt_cur);
			error = -EFSCORRUPTED;
			goto error0;
		}
		if ((error = xfs_btree_delete(cnt_cur, &i)))
			goto error0;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(cnt_cur);
			error = -EFSCORRUPTED;
			goto error0;
		}
		/*
		 * Delete the old by-block entry for the right block.
		 */
		if ((error = xfs_btree_delete(bno_cur, &i)))
			goto error0;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(bno_cur);
			error = -EFSCORRUPTED;
			goto error0;
		}
		/*
		 * Move the by-block cursor back to the left neighbor.
		 */
		if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
			goto error0;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(bno_cur);
			error = -EFSCORRUPTED;
			goto error0;
		}
#ifdef DEBUG
		/*
		 * Check that this is the right record: delete didn't
		 * mangle the cursor.
		 */
		{
			xfs_agblock_t	xxbno;
			xfs_extlen_t	xxlen;

			if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen,
					&i)))
				goto error0;
			if (XFS_IS_CORRUPT(mp,
					   i != 1 ||
					   xxbno != ltbno ||
					   xxlen != ltlen)) {
				xfs_btree_mark_sick(bno_cur);
				error = -EFSCORRUPTED;
				goto error0;
			}
		}
#endif
		/*
		 * Update remaining by-block entry to the new, joined block.
		 */
		nbno = ltbno;
		nlen = len + ltlen + gtlen;
		if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
			goto error0;
	}
	/*
	 * Have only a left contiguous neighbor.
	 * Merge it together with the new freespace.
	 */
	else if (haveleft) {
		/*
		 * Delete the old by-size entry on the left.
		 */
		if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
			goto error0;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(cnt_cur);
			error = -EFSCORRUPTED;
			goto error0;
		}
		if ((error = xfs_btree_delete(cnt_cur, &i)))
			goto error0;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(cnt_cur);
			error = -EFSCORRUPTED;
			goto error0;
		}
		/*
		 * Back up the by-block cursor to the left neighbor, and
		 * update its length.
		 */
		if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
			goto error0;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(bno_cur);
			error = -EFSCORRUPTED;
			goto error0;
		}
		nbno = ltbno;
		nlen = len + ltlen;
		if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
			goto error0;
	}
	/*
	 * Have only a right contiguous neighbor.
	 * Merge it together with the new freespace.
	 */
	else if (haveright) {
		/*
		 * Delete the old by-size entry on the right.
		 */
		if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
			goto error0;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(cnt_cur);
			error = -EFSCORRUPTED;
			goto error0;
		}
		if ((error = xfs_btree_delete(cnt_cur, &i)))
			goto error0;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(cnt_cur);
			error = -EFSCORRUPTED;
			goto error0;
		}
		/*
		 * Update the starting block and length of the right
		 * neighbor in the by-block tree.
		 */
		nbno = bno;
		nlen = len + gtlen;
		if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
			goto error0;
	}
	/*
	 * No contiguous neighbors.
	 * Insert the new freespace into the by-block tree.
	 */
	else {
		nbno = bno;
		nlen = len;
		if ((error = xfs_btree_insert(bno_cur, &i)))
			goto error0;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(bno_cur);
			error = -EFSCORRUPTED;
			goto error0;
		}
	}
	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
	bno_cur = NULL;

	/*
	 * In all cases we need to insert the new freespace in the by-size tree.
	 *
	 * If this new freespace is being inserted in the block that contains
	 * the largest free space in the btree, make sure we also fix up the
	 * agf->agf_longest tracker field.
	 */
	if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
		goto error0;
	if (XFS_IS_CORRUPT(mp, i != 0)) {
		xfs_btree_mark_sick(cnt_cur);
		error = -EFSCORRUPTED;
		goto error0;
	}
	if (xfs_alloc_cursor_at_lastrec(cnt_cur))
		fixup_longest = true;
	if ((error = xfs_btree_insert(cnt_cur, &i)))
		goto error0;
	if (XFS_IS_CORRUPT(mp, i != 1)) {
		xfs_btree_mark_sick(cnt_cur);
		error = -EFSCORRUPTED;
		goto error0;
	}
	if (fixup_longest) {
		error = xfs_alloc_fixup_longest(cnt_cur);
		if (error)
			goto error0;
	}
	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
	cnt_cur = NULL;

	/*
	 * Update the freespace totals in the ag and superblock.
	 */
	error = xfs_alloc_update_counters(tp, agbp, len);
	xfs_ag_resv_free_extent(pag, type, tp, len);
	if (error)
		goto error0;

	XFS_STATS_INC(mp, xs_freex);
	XFS_STATS_ADD(mp, xs_freeb, len);

	trace_xfs_free_extent(pag, bno, len, type, haveleft, haveright);

	return 0;

error0:
	trace_xfs_free_extent(pag, bno, len, type, -1, -1);
	if (bno_cur)
		xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
	if (cnt_cur)
		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
	return error;
}
/*
 * Visible (exported) allocation/free functions.
 * Some of these are used just by xfs_alloc_btree.c and this file.
 */

/*
 * Compute and fill in value of m_alloc_maxlevels.
 */
void
xfs_alloc_compute_maxlevels(
	xfs_mount_t	*mp)	/* file system mount structure */
{
	mp->m_alloc_maxlevels = xfs_btree_compute_maxlevels(mp->m_alloc_mnr,
			(mp->m_sb.sb_agblocks + 1) / 2);
	ASSERT(mp->m_alloc_maxlevels <= xfs_allocbt_maxlevels_ondisk());
}
/*
 * Find the length of the longest extent in an AG.  The 'need' parameter
 * specifies how much space we're going to need for the AGFL and the
 * 'reserved' parameter tells us how many blocks in this AG are reserved for
 * other callers.
 */
xfs_extlen_t
xfs_alloc_longest_free_extent(
	struct xfs_perag	*pag,
	xfs_extlen_t		need,
	xfs_extlen_t		reserved)
{
	xfs_extlen_t		delta = 0;

	/*
	 * If the AGFL needs a recharge, we'll have to subtract that from the
	 * longest extent.
	 */
	if (need > pag->pagf_flcount)
		delta = need - pag->pagf_flcount;

	/*
	 * If we cannot maintain others' reservations with space from the
	 * not-longest freesp extents, we'll have to subtract /that/ from
	 * the longest extent too.
	 */
	if (pag->pagf_freeblks - pag->pagf_longest < reserved)
		delta += reserved - (pag->pagf_freeblks - pag->pagf_longest);

	/*
	 * If the longest extent is long enough to satisfy all the
	 * reservations and AGFL rules in place, we can return this extent.
	 */
	if (pag->pagf_longest > delta)
		return min_t(xfs_extlen_t, pag_mount(pag)->m_ag_max_usable,
				pag->pagf_longest - delta);

	/* Otherwise, let the caller try for 1 block if there's space. */
	return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
}
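
/*
 * Worked example for the function above (illustrative numbers only, not taken
 * from any real filesystem): with pagf_flcount = 4, need = 6,
 * pagf_freeblks = 100, pagf_longest = 80 and reserved = 30, the AGFL
 * shortfall contributes delta = 6 - 4 = 2 and the reservation shortfall adds
 * 30 - (100 - 80) = 10, so delta = 12 and the usable longest extent is
 * 80 - 12 = 68 blocks (further capped by m_ag_max_usable).
 */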
/*
 * Compute the minimum length of the AGFL in the given AG.  If @pag is NULL,
 * return the largest possible minimum length.
 */
unsigned int
xfs_alloc_min_freelist(
	struct xfs_mount	*mp,
	struct xfs_perag	*pag)
{
	/* AG btrees have at least 1 level. */
	const unsigned int	bno_level = pag ? pag->pagf_bno_level : 1;
	const unsigned int	cnt_level = pag ? pag->pagf_cnt_level : 1;
	const unsigned int	rmap_level = pag ? pag->pagf_rmap_level : 1;
	unsigned int		min_free;

	ASSERT(mp->m_alloc_maxlevels > 0);

	/*
	 * For a btree shorter than the maximum height, the worst case is that
	 * every level gets split and a new level is added, then while
	 * inserting another entry to refill the AGFL, every level under the
	 * old root gets split again. This is:
	 *
	 *   (full height split reservation) + (AGFL refill split height)
	 * = (current height + 1) + (current height - 1)
	 * = (new height) + (new height - 2)
	 * = 2 * new height - 2
	 *
	 * For a btree of maximum height, the worst case is that every level
	 * under the root gets split, then while inserting another entry to
	 * refill the AGFL, every level under the root gets split again. This
	 * is also:
	 *
	 *   2 * (current height - 1)
	 * = 2 * (new height - 1)
	 * = 2 * new height - 2
	 */

	/* space needed by-bno freespace btree */
	min_free = min(bno_level + 1, mp->m_alloc_maxlevels) * 2 - 2;
	/* space needed by-size freespace btree */
	min_free += min(cnt_level + 1, mp->m_alloc_maxlevels) * 2 - 2;
	/* space needed reverse mapping used space btree */
	if (xfs_has_rmapbt(mp))
		min_free += min(rmap_level + 1, mp->m_rmap_maxlevels) * 2 - 2;

	return min_free;
}
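
/*
 * Illustrative example of the 2 * height - 2 reservation above: with
 * pagf_bno_level = 2, pagf_cnt_level = 2, m_alloc_maxlevels = 5 and no
 * rmapbt, min_free = (min(3, 5) * 2 - 2) + (min(3, 5) * 2 - 2) = 4 + 4 = 8
 * AGFL blocks must be available before this AG can be used.
 */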
/*
 * Check if the operation we are fixing up the freelist for should go ahead or
 * not. If we are freeing blocks, we always allow it, otherwise the allocation
 * is dependent on whether the size and shape of free space available will
 * permit the requested allocation to take place.
 */
static bool
xfs_alloc_space_available(
	struct xfs_alloc_arg	*args,
	xfs_extlen_t		min_free,
	int			flags)
{
	struct xfs_perag	*pag = args->pag;
	xfs_extlen_t		alloc_len, longest;
	xfs_extlen_t		reservation; /* blocks that are still reserved */
	int			available;
	xfs_extlen_t		agflcount;

	if (flags & XFS_ALLOC_FLAG_FREEING)
		return true;

	reservation = xfs_ag_resv_needed(pag, args->resv);

	/* do we have enough contiguous free space for the allocation? */
	alloc_len = args->minlen + (args->alignment - 1) + args->minalignslop;
	longest = xfs_alloc_longest_free_extent(pag, min_free, reservation);
	if (longest < alloc_len)
		return false;

	/*
	 * Do we have enough free space remaining for the allocation? Don't
	 * account extra agfl blocks because we are about to defer free them,
	 * making them unavailable until the current transaction commits.
	 */
	agflcount = min_t(xfs_extlen_t, pag->pagf_flcount, min_free);
	available = (int)(pag->pagf_freeblks + agflcount -
			  reservation - min_free - args->minleft);
	if (available < (int)max(args->total, alloc_len))
		return false;

	/*
	 * Clamp maxlen to the amount of free space available for the actual
	 * extent allocation.
	 */
	if (available < (int)args->maxlen && !(flags & XFS_ALLOC_FLAG_CHECK)) {
		args->maxlen = available;
		ASSERT(args->maxlen > 0);
		ASSERT(args->maxlen >= args->minlen);
	}

	return true;
}
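
/*
 * Sketch of the accounting above with made-up numbers: minlen = 8,
 * alignment = 4, minalignslop = 0 gives alloc_len = 8 + 3 = 11, so the
 * longest free extent must be at least 11 blocks.  With pagf_freeblks = 50,
 * pagf_flcount = 6, min_free = 8, reservation = 10 and minleft = 4, the
 * usable pool is 50 + min(6, 8) - 10 - 8 - 4 = 34 blocks, which must cover
 * max(args->total, alloc_len) for the allocation to proceed.
 */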
/*
 * Check the agfl fields of the agf for inconsistency or corruption.
 *
 * The original purpose was to detect an agfl header padding mismatch between
 * current and early v5 kernels. This problem manifests as a 1-slot size
 * difference between the on-disk flcount and the active [first, last] range of
 * a wrapped agfl.
 *
 * However, we need to use these same checks to catch agfl count corruptions
 * unrelated to padding. This could occur on any v4 or v5 filesystem, so either
 * way, we need to reset the agfl and warn the user.
 *
 * Return true if a reset is required before the agfl can be used, false
 * otherwise.
 */
static bool
xfs_agfl_needs_reset(
	struct xfs_mount	*mp,
	struct xfs_agf		*agf)
{
	uint32_t		f = be32_to_cpu(agf->agf_flfirst);
	uint32_t		l = be32_to_cpu(agf->agf_fllast);
	uint32_t		c = be32_to_cpu(agf->agf_flcount);
	int			agfl_size = xfs_agfl_size(mp);
	int			active;

	/*
	 * The agf read verifier catches severe corruption of these fields.
	 * Repeat some sanity checks to cover a packed -> unpacked mismatch if
	 * the verifier allows it.
	 */
	if (f >= agfl_size || l >= agfl_size)
		return true;
	if (c > agfl_size)
		return true;

	/*
	 * Check consistency between the on-disk count and the active range. An
	 * agfl padding mismatch manifests as an inconsistent flcount.
	 */
	if (c && l >= f)
		active = l - f + 1;
	else if (c)
		active = agfl_size - f + l + 1;
	else
		active = 0;

	return active != c;
}
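
/*
 * Example of the active-range check above: with xfs_agfl_size() = 118,
 * agf_flfirst = 110 and agf_fllast = 3, the active slots wrap around the end
 * of the array, so active = 118 - 110 + 3 + 1 = 12.  If agf_flcount records
 * 11 or 13 instead, the counters disagree and the AGFL must be reset.
 */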
/*
 * Reset the agfl to an empty state. Ignore/drop any existing blocks since the
 * agfl content cannot be trusted. Warn the user that a repair is required to
 * recover leaked blocks.
 *
 * The purpose of this mechanism is to handle filesystems affected by the agfl
 * header padding mismatch problem. A reset keeps the filesystem online with a
 * relatively minor free space accounting inconsistency rather than suffer the
 * inevitable crash from use of an invalid agfl block.
 */
static void
xfs_agfl_reset(
	struct xfs_trans	*tp,
	struct xfs_buf		*agbp,
	struct xfs_perag	*pag)
{
	struct xfs_mount	*mp = tp->t_mountp;
	struct xfs_agf		*agf = agbp->b_addr;

	ASSERT(xfs_perag_agfl_needs_reset(pag));
	trace_xfs_agfl_reset(mp, agf, 0, _RET_IP_);
	xfs_warn(mp,
	       "WARNING: Reset corrupted AGFL on AG %u. %d blocks leaked. "
	       "Please unmount and run xfs_repair.",
	         pag_agno(pag), pag->pagf_flcount);

	agf->agf_flfirst = 0;
	agf->agf_fllast = cpu_to_be32(xfs_agfl_size(mp) - 1);
	agf->agf_flcount = 0;
	xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLLAST |
				    XFS_AGF_FLCOUNT);

	pag->pagf_flcount = 0;
	clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate);
}
/*
 * Add the extent to the list of extents to be free at transaction end.
 * The list is maintained sorted (by block number).
 */
static int
xfs_defer_extent_free(
	struct xfs_trans		*tp,
	xfs_fsblock_t			bno,
	xfs_filblks_t			len,
	const struct xfs_owner_info	*oinfo,
	enum xfs_ag_resv_type		type,
	unsigned int			free_flags,
	struct xfs_defer_pending	**dfpp)
{
	struct xfs_extent_free_item	*xefi;
	struct xfs_mount		*mp = tp->t_mountp;

	ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
	ASSERT(!isnullstartblock(bno));
	ASSERT(!(free_flags & ~XFS_FREE_EXTENT_ALL_FLAGS));

	if (free_flags & XFS_FREE_EXTENT_REALTIME) {
		if (type != XFS_AG_RESV_NONE) {
			ASSERT(type == XFS_AG_RESV_NONE);
			return -EFSCORRUPTED;
		}
		if (XFS_IS_CORRUPT(mp, !xfs_verify_rtbext(mp, bno, len)))
			return -EFSCORRUPTED;
	} else {
		if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbext(mp, bno, len)))
			return -EFSCORRUPTED;
	}

	xefi = kmem_cache_zalloc(xfs_extfree_item_cache,
			       GFP_KERNEL | __GFP_NOFAIL);
	xefi->xefi_startblock = bno;
	xefi->xefi_blockcount = (xfs_extlen_t)len;
	xefi->xefi_agresv = type;
	if (free_flags & XFS_FREE_EXTENT_SKIP_DISCARD)
		xefi->xefi_flags |= XFS_EFI_SKIP_DISCARD;
	if (free_flags & XFS_FREE_EXTENT_REALTIME)
		xefi->xefi_flags |= XFS_EFI_REALTIME;
	if (oinfo) {
		ASSERT(oinfo->oi_offset == 0);

		if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK)
			xefi->xefi_flags |= XFS_EFI_ATTR_FORK;
		if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK)
			xefi->xefi_flags |= XFS_EFI_BMBT_BLOCK;
		xefi->xefi_owner = oinfo->oi_owner;
	} else {
		xefi->xefi_owner = XFS_RMAP_OWN_NULL;
	}

	xfs_extent_free_defer_add(tp, xefi, dfpp);
	return 0;
}
int
xfs_free_extent_later(
	struct xfs_trans		*tp,
	xfs_fsblock_t			bno,
	xfs_filblks_t			len,
	const struct xfs_owner_info	*oinfo,
	enum xfs_ag_resv_type		type,
	unsigned int			free_flags)
{
	struct xfs_defer_pending	*dontcare = NULL;

	return xfs_defer_extent_free(tp, bno, len, oinfo, type, free_flags,
			&dontcare);
}
/*
 * Set up automatic freeing of unwritten space in the filesystem.
 *
 * This function attaches a paused deferred extent free item to the
 * transaction.  Pausing means that the EFI will be logged in the next
 * transaction commit, but the pending EFI will not be finished until the
 * pending item is unpaused.
 *
 * If the system goes down after the EFI has been persisted to the log but
 * before the pending item is unpaused, log recovery will find the EFI, fail to
 * find the EFD, and free the space.
 *
 * If the pending item is unpaused, the next transaction commit will log an EFD
 * without freeing the space.
 *
 * Caller must ensure that the tp, fsbno, len, oinfo, and resv flags of the
 * @args structure are set to the relevant values.
 */
int
xfs_alloc_schedule_autoreap(
	const struct xfs_alloc_arg	*args,
	unsigned int			free_flags,
	struct xfs_alloc_autoreap	*aarp)
{
	int				error;

	error = xfs_defer_extent_free(args->tp, args->fsbno, args->len,
			&args->oinfo, args->resv, free_flags, &aarp->dfp);
	if (error)
		return error;

	xfs_defer_item_pause(args->tp, aarp->dfp);
	return 0;
}
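
/*
 * Typical autoreap lifecycle, sketched from the three helpers in this file
 * (individual callers may differ): schedule the paused EFI alongside the new
 * allocation, map the space into the ondisk filesystem, then either cancel
 * the autoreap (the space was used, so keep it) or commit it (none of the
 * reserved space was used, so let the EFI free it again):
 *
 *	struct xfs_alloc_autoreap	aarp = { };
 *
 *	error = xfs_alloc_schedule_autoreap(&args, free_flags, &aarp);
 *	...map args.fsbno/args.len into the filesystem...
 *	xfs_alloc_cancel_autoreap(tp, &aarp);	(space was used)
 *	xfs_alloc_commit_autoreap(tp, &aarp);	(space was not used)
 */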
/*
 * Cancel automatic freeing of unwritten space in the filesystem.
 *
 * Earlier, we created a paused deferred extent free item and attached it to
 * this transaction so that we could automatically roll back a new space
 * allocation if the system went down.  Now we want to cancel the paused work
 * item by marking the EFI stale so we don't actually free the space, unpausing
 * the pending item and logging an EFD.
 *
 * The caller generally should have already mapped the space into the ondisk
 * filesystem.  If the reserved space was partially used, the caller must call
 * xfs_free_extent_later to create a new EFI to free the unused space.
 */
void
xfs_alloc_cancel_autoreap(
	struct xfs_trans		*tp,
	struct xfs_alloc_autoreap	*aarp)
{
	struct xfs_defer_pending	*dfp = aarp->dfp;
	struct xfs_extent_free_item	*xefi;

	if (!dfp)
		return;

	list_for_each_entry(xefi, &dfp->dfp_work, xefi_list)
		xefi->xefi_flags |= XFS_EFI_CANCELLED;

	xfs_defer_item_unpause(tp, dfp);
}
/*
 * Commit automatic freeing of unwritten space in the filesystem.
 *
 * This unpauses an earlier _schedule_autoreap and commits to freeing the
 * allocated space.  Call this if none of the reserved space was used.
 */
void
xfs_alloc_commit_autoreap(
	struct xfs_trans		*tp,
	struct xfs_alloc_autoreap	*aarp)
{
	if (aarp->dfp)
		xfs_defer_item_unpause(tp, aarp->dfp);
}
/*
 * Check if an AGF has a free extent record whose length is equal to
 * args->minlen.
 */
STATIC int
xfs_exact_minlen_extent_available(
	struct xfs_alloc_arg	*args,
	struct xfs_buf		*agbp,
	int			*stat)
{
	struct xfs_btree_cur	*cnt_cur;
	xfs_agblock_t		fbno;
	xfs_extlen_t		flen;
	int			error = 0;

	cnt_cur = xfs_cntbt_init_cursor(args->mp, args->tp, agbp,
					args->pag);
	error = xfs_alloc_lookup_ge(cnt_cur, 0, args->minlen, stat);
	if (error)
		goto out;

	if (*stat == 0) {
		xfs_btree_mark_sick(cnt_cur);
		error = -EFSCORRUPTED;
		goto out;
	}

	error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, stat);
	if (error)
		goto out;

	if (*stat == 1 && flen != args->minlen)
		*stat = 0;

out:
	xfs_btree_del_cursor(cnt_cur, error);

	return error;
}
/*
 * Decide whether to use this allocation group for this allocation.
 * If so, fix up the btree freelist's size.
 */
int			/* error */
xfs_alloc_fix_freelist(
	struct xfs_alloc_arg	*args,	/* allocation argument structure */
	uint32_t		alloc_flags)
{
	struct xfs_mount	*mp = args->mp;
	struct xfs_perag	*pag = args->pag;
	struct xfs_trans	*tp = args->tp;
	struct xfs_buf		*agbp = NULL;
	struct xfs_buf		*agflbp = NULL;
	struct xfs_alloc_arg	targs;	/* local allocation arguments */
	xfs_agblock_t		bno;	/* freelist block */
	xfs_extlen_t		need;	/* total blocks needed in freelist */
	int			error = 0;

	/* deferred ops (AGFL block frees) require permanent transactions */
	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);

	if (!xfs_perag_initialised_agf(pag)) {
		error = xfs_alloc_read_agf(pag, tp, alloc_flags, &agbp);
		if (error) {
			/* Couldn't lock the AGF so skip this AG. */
			if (error == -EAGAIN)
				error = 0;
			goto out_no_agbp;
		}
	}

	/*
	 * If this is a metadata preferred pag and we are user data then try
	 * somewhere else if we are not being asked to try harder at this
	 * point
	 */
	if (xfs_perag_prefers_metadata(pag) &&
	    (args->datatype & XFS_ALLOC_USERDATA) &&
	    (alloc_flags & XFS_ALLOC_FLAG_TRYLOCK)) {
		ASSERT(!(alloc_flags & XFS_ALLOC_FLAG_FREEING));
		goto out_agbp_relse;
	}

	need = xfs_alloc_min_freelist(mp, pag);
	if (!xfs_alloc_space_available(args, need, alloc_flags |
			XFS_ALLOC_FLAG_CHECK))
		goto out_agbp_relse;

	/*
	 * Get the a.g. freespace buffer.
	 * Can fail if we're not blocking on locks, and it's held.
	 */
	if (!agbp) {
		error = xfs_alloc_read_agf(pag, tp, alloc_flags, &agbp);
		if (error) {
			/* Couldn't lock the AGF so skip this AG. */
			if (error == -EAGAIN)
				error = 0;
			goto out_no_agbp;
		}
	}

	/* reset a padding mismatched agfl before final free space check */
	if (xfs_perag_agfl_needs_reset(pag))
		xfs_agfl_reset(tp, agbp, pag);

	/* If there isn't enough total space or single-extent, reject it. */
	need = xfs_alloc_min_freelist(mp, pag);
	if (!xfs_alloc_space_available(args, need, alloc_flags))
		goto out_agbp_relse;

	if (IS_ENABLED(CONFIG_XFS_DEBUG) && args->alloc_minlen_only) {
		int stat;

		error = xfs_exact_minlen_extent_available(args, agbp, &stat);
		if (error || !stat)
			goto out_agbp_relse;
	}

	/*
	 * Make the freelist shorter if it's too long.
	 *
	 * Note that from this point onwards, we will always release the agf and
	 * agfl buffers on error. This handles the case where we error out and
	 * the buffers are clean or may not have been joined to the transaction
	 * and hence need to be released manually. If they have been joined to
	 * the transaction, then xfs_trans_brelse() will handle them
	 * appropriately based on the recursion count and dirty state of the
	 * buffer.
	 *
	 * XXX (dgc): When we have lots of free space, does this buy us
	 * anything other than extra overhead when we need to put more blocks
	 * back on the free list? Maybe we should only do this when space is
	 * getting low or the AGFL is more than half full?
	 *
	 * The NOSHRINK flag prevents the AGFL from being shrunk if it's too
	 * big; the NORMAP flag prevents AGFL expand/shrink operations from
	 * updating the rmapbt.  Both flags are used in xfs_repair while we're
	 * rebuilding the rmapbt, and neither are used by the kernel.  They're
	 * both required to ensure that rmaps are correctly recorded for the
	 * regenerated AGFL, bnobt, and cntbt.  See repair/phase5.c and
	 * repair/rmap.c in xfsprogs for details.
	 */
	memset(&targs, 0, sizeof(targs));
	/* struct copy below */
	if (alloc_flags & XFS_ALLOC_FLAG_NORMAP)
		targs.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE;
	else
		targs.oinfo = XFS_RMAP_OINFO_AG;
	while (!(alloc_flags & XFS_ALLOC_FLAG_NOSHRINK) &&
			pag->pagf_flcount > need) {
		error = xfs_alloc_get_freelist(pag, tp, agbp, &bno, 0);
		if (error)
			goto out_agbp_relse;

		/*
		 * Defer the AGFL block free.
		 *
		 * This helps to prevent log reservation overruns due to too
		 * many allocation operations in a transaction. AGFL frees are
		 * prone to this problem because for one they are always freed
		 * one at a time.  Further, an immediate AGFL block free can
		 * cause a btree join and require another block free before the
		 * real allocation can proceed.
		 * Deferring the free disconnects freeing up the AGFL slot from
		 * freeing the block.
		 */
		error = xfs_free_extent_later(tp, xfs_agbno_to_fsb(pag, bno),
				1, &targs.oinfo, XFS_AG_RESV_AGFL, 0);
		if (error)
			goto out_agbp_relse;
	}

	targs.tp = tp;
	targs.mp = mp;
	targs.agbp = agbp;
	targs.agno = args->agno;
	targs.alignment = targs.minlen = targs.prod = 1;
	targs.pag = pag;
	error = xfs_alloc_read_agfl(pag, tp, &agflbp);
	if (error)
		goto out_agbp_relse;

	/* Make the freelist longer if it's too short. */
	while (pag->pagf_flcount < need) {
		targs.agbno = 0;
		targs.maxlen = need - pag->pagf_flcount;
		targs.resv = XFS_AG_RESV_AGFL;

		/* Allocate as many blocks as possible at once. */
		error = xfs_alloc_ag_vextent_size(&targs, alloc_flags);
		if (error)
			goto out_agflbp_relse;

		/*
		 * Stop if we run out.  Won't happen if callers are obeying
		 * the restrictions correctly.  Can happen for free calls
		 * on a completely full ag.
		 */
		if (targs.agbno == NULLAGBLOCK) {
			if (alloc_flags & XFS_ALLOC_FLAG_FREEING)
				break;
			goto out_agflbp_relse;
		}

		if (!xfs_rmap_should_skip_owner_update(&targs.oinfo)) {
			error = xfs_rmap_alloc(tp, agbp, pag,
				       targs.agbno, targs.len, &targs.oinfo);
			if (error)
				goto out_agflbp_relse;
		}
		error = xfs_alloc_update_counters(tp, agbp,
						  -((long)(targs.len)));
		if (error)
			goto out_agflbp_relse;

		/*
		 * Put each allocated block on the list.
		 */
		for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) {
			error = xfs_alloc_put_freelist(pag, tp, agbp,
							agflbp, bno, 0);
			if (error)
				goto out_agflbp_relse;
		}
	}
	xfs_trans_brelse(tp, agflbp);
	args->agbp = agbp;
	return 0;

out_agflbp_relse:
	xfs_trans_brelse(tp, agflbp);
out_agbp_relse:
	if (agbp)
		xfs_trans_brelse(tp, agbp);
out_no_agbp:
	args->agbp = NULL;
	return error;
}
/*
 * Get a block from the freelist.
 * Returns with the buffer for the block gotten.
 */
int
xfs_alloc_get_freelist(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	struct xfs_buf		*agbp,
	xfs_agblock_t		*bnop,
	int			btreeblk)
{
	struct xfs_agf		*agf = agbp->b_addr;
	struct xfs_buf		*agflbp;
	xfs_agblock_t		bno;
	__be32			*agfl_bno;
	int			error;
	uint32_t		logflags;
	struct xfs_mount	*mp = tp->t_mountp;

	/*
	 * Freelist is empty, give up.
	 */
	if (!agf->agf_flcount) {
		*bnop = NULLAGBLOCK;
		return 0;
	}
	/*
	 * Read the array of free blocks.
	 */
	error = xfs_alloc_read_agfl(pag, tp, &agflbp);
	if (error)
		return error;

	/*
	 * Get the block number and update the data structures.
	 */
	agfl_bno = xfs_buf_to_agfl_bno(agflbp);
	bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
	if (XFS_IS_CORRUPT(tp->t_mountp, !xfs_verify_agbno(pag, bno)))
		return -EFSCORRUPTED;

	be32_add_cpu(&agf->agf_flfirst, 1);
	xfs_trans_brelse(tp, agflbp);
	if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp))
		agf->agf_flfirst = 0;

	ASSERT(!xfs_perag_agfl_needs_reset(pag));
	be32_add_cpu(&agf->agf_flcount, -1);
	pag->pagf_flcount--;

	logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT;
	if (btreeblk) {
		be32_add_cpu(&agf->agf_btreeblks, 1);
		pag->pagf_btreeblks++;
		logflags |= XFS_AGF_BTREEBLKS;
	}

	xfs_alloc_log_agf(tp, agbp, logflags);
	*bnop = bno;

	return 0;
}
/*
 * Log the given fields from the agf structure.
 */
void
xfs_alloc_log_agf(
	struct xfs_trans	*tp,
	struct xfs_buf		*bp,
	uint32_t		fields)
{
	int	first;		/* first byte offset */
	int	last;		/* last byte offset */
	static const short	offsets[] = {
		offsetof(xfs_agf_t, agf_magicnum),
		offsetof(xfs_agf_t, agf_versionnum),
		offsetof(xfs_agf_t, agf_seqno),
		offsetof(xfs_agf_t, agf_length),
		offsetof(xfs_agf_t, agf_bno_root),   /* also cnt/rmap root */
		offsetof(xfs_agf_t, agf_bno_level),  /* also cnt/rmap levels */
		offsetof(xfs_agf_t, agf_flfirst),
		offsetof(xfs_agf_t, agf_fllast),
		offsetof(xfs_agf_t, agf_flcount),
		offsetof(xfs_agf_t, agf_freeblks),
		offsetof(xfs_agf_t, agf_longest),
		offsetof(xfs_agf_t, agf_btreeblks),
		offsetof(xfs_agf_t, agf_uuid),
		offsetof(xfs_agf_t, agf_rmap_blocks),
		offsetof(xfs_agf_t, agf_refcount_blocks),
		offsetof(xfs_agf_t, agf_refcount_root),
		offsetof(xfs_agf_t, agf_refcount_level),
		/* needed so that we don't log the whole rest of the structure: */
		offsetof(xfs_agf_t, agf_spare64),
		sizeof(xfs_agf_t)
	};

	trace_xfs_agf(tp->t_mountp, bp->b_addr, fields, _RET_IP_);

	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF);

	xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last);
	xfs_trans_log_buf(tp, bp, (uint)first, (uint)last);
}
/*
 * Put the block on the freelist for the allocation group.
 */
int
xfs_alloc_put_freelist(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	struct xfs_buf		*agbp,
	struct xfs_buf		*agflbp,
	xfs_agblock_t		bno,
	int			btreeblk)
{
	struct xfs_mount	*mp = tp->t_mountp;
	struct xfs_agf		*agf = agbp->b_addr;
	__be32			*blockp;
	int			error;
	uint32_t		logflags;
	__be32			*agfl_bno;
	int			startoff;

	if (!agflbp) {
		error = xfs_alloc_read_agfl(pag, tp, &agflbp);
		if (error)
			return error;
	}

	be32_add_cpu(&agf->agf_fllast, 1);
	if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp))
		agf->agf_fllast = 0;

	ASSERT(!xfs_perag_agfl_needs_reset(pag));
	be32_add_cpu(&agf->agf_flcount, 1);
	pag->pagf_flcount++;

	logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT;
	if (btreeblk) {
		be32_add_cpu(&agf->agf_btreeblks, -1);
		pag->pagf_btreeblks--;
		logflags |= XFS_AGF_BTREEBLKS;
	}

	ASSERT(be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp));

	agfl_bno = xfs_buf_to_agfl_bno(agflbp);
	blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)];
	*blockp = cpu_to_be32(bno);
	startoff = (char *)blockp - (char *)agflbp->b_addr;

	xfs_alloc_log_agf(tp, agbp, logflags);

	xfs_trans_buf_set_type(tp, agflbp, XFS_BLFT_AGFL_BUF);
	xfs_trans_log_buf(tp, agflbp, startoff,
			  startoff + sizeof(xfs_agblock_t) - 1);
	return 0;
}
/*
 * Check that this AGF/AGI header's sequence number and length matches the AG
 * number and size in fsblocks.
 */
xfs_failaddr_t
xfs_validate_ag_length(
	struct xfs_buf		*bp,
	uint32_t		seqno,
	uint32_t		length)
{
	struct xfs_mount	*mp = bp->b_mount;

	/*
	 * During growfs operations, the perag is not fully initialised,
	 * so we can't use it for any useful checking. growfs ensures we can't
	 * use it by using uncached buffers that don't have the perag attached
	 * so we can detect and avoid this problem.
	 */
	if (bp->b_pag && seqno != pag_agno(bp->b_pag))
		return __this_address;

	/*
	 * Only the last AG in the filesystem is allowed to be shorter
	 * than the AG size recorded in the superblock.
	 */
	if (length != mp->m_sb.sb_agblocks) {
		/*
		 * During growfs, the new last AG can get here before we
		 * have updated the superblock. Give it a pass on the seqno
		 * check.
		 */
		if (bp->b_pag && seqno != mp->m_sb.sb_agcount - 1)
			return __this_address;
		if (length < XFS_MIN_AG_BLOCKS)
			return __this_address;
		if (length > mp->m_sb.sb_agblocks)
			return __this_address;
	}

	return NULL;
}
/*
 * Verify the AGF is consistent.
 *
 * We do not verify the AGFL indexes in the AGF are fully consistent here
 * because of issues with variable on-disk structure sizes. Instead, we check
 * the agfl indexes for consistency when we initialise the perag from the AGF
 * information after a read completes.
 *
 * If the index is inconsistent, then we mark the perag as needing an AGFL
 * reset. The first AGFL update performed then resets the AGFL indexes and
 * refills the AGFL with known good free blocks, allowing the filesystem to
 * continue operating normally at the cost of a few leaked free space blocks.
 */
static xfs_failaddr_t
xfs_agf_verify(
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = bp->b_mount;
	struct xfs_agf		*agf = bp->b_addr;
	xfs_failaddr_t		fa;
	uint32_t		agf_seqno = be32_to_cpu(agf->agf_seqno);
	uint32_t		agf_length = be32_to_cpu(agf->agf_length);

	if (xfs_has_crc(mp)) {
		if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid))
			return __this_address;
		if (!xfs_log_check_lsn(mp, be64_to_cpu(agf->agf_lsn)))
			return __this_address;
	}

	if (!xfs_verify_magic(bp, agf->agf_magicnum))
		return __this_address;

	if (!XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)))
		return __this_address;

	/*
	 * Both agf_seqno and agf_length need to be validated before anything
	 * else block number related in the AGF or AGFL can be checked.
	 */
	fa = xfs_validate_ag_length(bp, agf_seqno, agf_length);
	if (fa)
		return fa;

	if (be32_to_cpu(agf->agf_flfirst) >= xfs_agfl_size(mp))
		return __this_address;
	if (be32_to_cpu(agf->agf_fllast) >= xfs_agfl_size(mp))
		return __this_address;
	if (be32_to_cpu(agf->agf_flcount) > xfs_agfl_size(mp))
		return __this_address;

	if (be32_to_cpu(agf->agf_freeblks) < be32_to_cpu(agf->agf_longest) ||
	    be32_to_cpu(agf->agf_freeblks) > agf_length)
		return __this_address;

	if (be32_to_cpu(agf->agf_bno_level) < 1 ||
	    be32_to_cpu(agf->agf_cnt_level) < 1 ||
	    be32_to_cpu(agf->agf_bno_level) > mp->m_alloc_maxlevels ||
	    be32_to_cpu(agf->agf_cnt_level) > mp->m_alloc_maxlevels)
		return __this_address;

	if (xfs_has_lazysbcount(mp) &&
	    be32_to_cpu(agf->agf_btreeblks) > agf_length)
		return __this_address;

	if (xfs_has_rmapbt(mp)) {
		if (be32_to_cpu(agf->agf_rmap_blocks) > agf_length)
			return __this_address;

		if (be32_to_cpu(agf->agf_rmap_level) < 1 ||
		    be32_to_cpu(agf->agf_rmap_level) > mp->m_rmap_maxlevels)
			return __this_address;
	}

	if (xfs_has_reflink(mp)) {
		if (be32_to_cpu(agf->agf_refcount_blocks) > agf_length)
			return __this_address;

		if (be32_to_cpu(agf->agf_refcount_level) < 1 ||
		    be32_to_cpu(agf->agf_refcount_level) > mp->m_refc_maxlevels)
			return __this_address;
	}

	return NULL;
}
static void
xfs_agf_read_verify(
	struct xfs_buf	*bp)
{
	struct xfs_mount *mp = bp->b_mount;
	xfs_failaddr_t	fa;

	if (xfs_has_crc(mp) &&
	    !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF))
		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
	else {
		fa = xfs_agf_verify(bp);
		if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_ALLOC_READ_AGF))
			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
	}
}
static void
xfs_agf_write_verify(
	struct xfs_buf	*bp)
{
	struct xfs_mount	*mp = bp->b_mount;
	struct xfs_buf_log_item	*bip = bp->b_log_item;
	struct xfs_agf		*agf = bp->b_addr;
	xfs_failaddr_t		fa;

	fa = xfs_agf_verify(bp);
	if (fa) {
		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
		return;
	}

	if (!xfs_has_crc(mp))
		return;

	if (bip)
		agf->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn);

	xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF);
}

const struct xfs_buf_ops xfs_agf_buf_ops = {
	.name = "xfs_agf",
	.magic = { cpu_to_be32(XFS_AGF_MAGIC), cpu_to_be32(XFS_AGF_MAGIC) },
	.verify_read = xfs_agf_read_verify,
	.verify_write = xfs_agf_write_verify,
	.verify_struct = xfs_agf_verify,
};
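
/*
 * Note on the verifier triad above: ->verify_read runs when the AGF buffer is
 * read in from disk, ->verify_write runs just before the buffer is written
 * back, and ->verify_struct allows the same structural checks to be run on an
 * in-memory buffer (for example by online scrub) without setting buffer error
 * state.
 */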
/*
 * Read in the allocation group header (free/alloc section).
 */
int
xfs_read_agf(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	int			flags,
	struct xfs_buf		**agfbpp)
{
	struct xfs_mount	*mp = pag_mount(pag);
	int			error;

	trace_xfs_read_agf(pag);

	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
			XFS_AG_DADDR(mp, pag_agno(pag), XFS_AGF_DADDR(mp)),
			XFS_FSS_TO_BB(mp, 1), flags, agfbpp, &xfs_agf_buf_ops);
	if (xfs_metadata_is_sick(error))
		xfs_ag_mark_sick(pag, XFS_SICK_AG_AGF);
	if (error)
		return error;

	xfs_buf_set_ref(*agfbpp, XFS_AGF_REF);
	return 0;
}
/*
 * Read in the allocation group header (free/alloc section) and initialise the
 * perag structure if necessary. If the caller provides @agfbpp, then return the
 * locked buffer to the caller, otherwise free it.
 */
int
xfs_alloc_read_agf(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	int			flags,
	struct xfs_buf		**agfbpp)
{
	struct xfs_mount	*mp = pag_mount(pag);
	struct xfs_buf		*agfbp;
	struct xfs_agf		*agf;
	int			error;
	int			allocbt_blks;

	trace_xfs_alloc_read_agf(pag);

	/* We don't support trylock when freeing. */
	ASSERT((flags & (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)) !=
			(XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK));
	error = xfs_read_agf(pag, tp,
			(flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
			&agfbp);
	if (error)
		return error;

	agf = agfbp->b_addr;
	if (!xfs_perag_initialised_agf(pag)) {
		pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
		pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
		pag->pagf_flcount = be32_to_cpu(agf->agf_flcount);
		pag->pagf_longest = be32_to_cpu(agf->agf_longest);
		pag->pagf_bno_level = be32_to_cpu(agf->agf_bno_level);
		pag->pagf_cnt_level = be32_to_cpu(agf->agf_cnt_level);
		pag->pagf_rmap_level = be32_to_cpu(agf->agf_rmap_level);
		pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
		if (xfs_agfl_needs_reset(mp, agf))
			set_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate);
		else
			clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate);

		/*
		 * Update the in-core allocbt counter. Filter out the rmapbt
		 * subset of the btreeblks counter because the rmapbt is managed
		 * by perag reservation. Subtract one for the rmapbt root block
		 * because the rmap counter includes it while the btreeblks
		 * counter only tracks non-root blocks.
		 */
		allocbt_blks = pag->pagf_btreeblks;
		if (xfs_has_rmapbt(mp))
			allocbt_blks -= be32_to_cpu(agf->agf_rmap_blocks) - 1;
		if (allocbt_blks > 0)
			atomic64_add(allocbt_blks, &mp->m_allocbt_blks);

		set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
	}
#ifdef DEBUG
	else if (!xfs_is_shutdown(mp)) {
		ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks));
		ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
		ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
		ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest));
		ASSERT(pag->pagf_bno_level == be32_to_cpu(agf->agf_bno_level));
		ASSERT(pag->pagf_cnt_level == be32_to_cpu(agf->agf_cnt_level));
	}
#endif
	if (agfbpp)
		*agfbpp = agfbp;
	else
		xfs_trans_brelse(tp, agfbp);
	return 0;
}
/*
 * Pre-process allocation arguments to set initial state that we don't require
 * callers to set up correctly, as well as bounds check the allocation args
 * that are set up.
 */
static int
xfs_alloc_vextent_check_args(
	struct xfs_alloc_arg	*args,
	xfs_fsblock_t		target,
	xfs_agnumber_t		*minimum_agno)
{
	struct xfs_mount	*mp = args->mp;
	xfs_agblock_t		agsize;

	args->fsbno = NULLFSBLOCK;

	*minimum_agno = 0;
	if (args->tp->t_highest_agno != NULLAGNUMBER)
		*minimum_agno = args->tp->t_highest_agno;

	/*
	 * Just fix this up, for the case where the last a.g. is shorter
	 * (or there's only one a.g.) and the caller couldn't easily figure
	 * that out (xfs_bmap_alloc).
	 */
	agsize = mp->m_sb.sb_agblocks;
	if (args->maxlen > agsize)
		args->maxlen = agsize;
	if (args->alignment == 0)
		args->alignment = 1;

	ASSERT(args->minlen > 0);
	ASSERT(args->maxlen > 0);
	ASSERT(args->alignment > 0);
	ASSERT(args->resv != XFS_AG_RESV_AGFL);

	ASSERT(XFS_FSB_TO_AGNO(mp, target) < mp->m_sb.sb_agcount);
	ASSERT(XFS_FSB_TO_AGBNO(mp, target) < agsize);
	ASSERT(args->minlen <= args->maxlen);
	ASSERT(args->minlen <= agsize);
	ASSERT(args->mod < args->prod);

	if (XFS_FSB_TO_AGNO(mp, target) >= mp->m_sb.sb_agcount ||
	    XFS_FSB_TO_AGBNO(mp, target) >= agsize ||
	    args->minlen > args->maxlen || args->minlen > agsize ||
	    args->mod >= args->prod) {
		trace_xfs_alloc_vextent_badargs(args);
		return -ENOSPC;
	}

	if (args->agno != NULLAGNUMBER && *minimum_agno > args->agno) {
		trace_xfs_alloc_vextent_skip_deadlock(args);
		return -ENOSPC;
	}

	return 0;
}
/*
 * Prepare an AG for allocation. If the AG is not prepared to accept the
 * allocation, return failure.
 *
 * XXX(dgc): The complexity of "need_pag" will go away as all caller paths are
 * modified to hold their own perag references.
 */
static int
xfs_alloc_vextent_prepare_ag(
	struct xfs_alloc_arg	*args,
	uint32_t		alloc_flags)
{
	bool			need_pag = !args->pag;
	int			error;

	if (need_pag)
		args->pag = xfs_perag_get(args->mp, args->agno);

	args->agbp = NULL;
	error = xfs_alloc_fix_freelist(args, alloc_flags);
	if (error) {
		trace_xfs_alloc_vextent_nofix(args);
		if (need_pag)
			xfs_perag_put(args->pag);
		args->agbno = NULLAGBLOCK;
		return error;
	}
	if (!args->agbp) {
		/* cannot allocate in this AG at all */
		trace_xfs_alloc_vextent_noagbp(args);
		args->agbno = NULLAGBLOCK;
		return 0;
	}
	args->wasfromfl = 0;
	return 0;
}
/*
 * Post-process allocation results to account for the allocation if it
 * succeeded and set the allocated block number correctly for the caller.
 *
 * XXX: we should really be returning ENOSPC for ENOSPC, not
 * hiding it behind a "successful" NULLFSBLOCK allocation.
 */
static int
xfs_alloc_vextent_finish(
	struct xfs_alloc_arg	*args,
	xfs_agnumber_t		minimum_agno,
	int			alloc_error,
	bool			drop_perag)
{
	struct xfs_mount	*mp = args->mp;
	int			error = 0;

	/*
	 * We can end up here with a locked AGF. If we failed, the caller is
	 * likely going to try to allocate again with different parameters, and
	 * that can widen the AGs that are searched for free space. If we have
	 * to do BMBT block allocation, we have to do a new allocation.
	 *
	 * Hence leaving this function with the AGF locked opens up potential
	 * ABBA AGF deadlocks because a future allocation attempt in this
	 * transaction may attempt to lock a lower number AGF.
	 *
	 * We can't release the AGF until the transaction is committed, so at
	 * this point we must update the "first allocation" tracker to point at
	 * this AG if the tracker is empty or points to a lower AG. This allows
	 * the next allocation attempt to be modified appropriately to avoid
	 * deadlocks.
	 */
	if (args->agbp &&
	    (args->tp->t_highest_agno == NULLAGNUMBER ||
	     args->agno > minimum_agno))
		args->tp->t_highest_agno = args->agno;

	/*
	 * If the allocation failed with an error or we had an ENOSPC result,
	 * preserve the returned error whilst also marking the allocation result
	 * as "no extent allocated". This ensures that callers that fail to
	 * capture the error will still treat it as a failed allocation.
	 */
	if (alloc_error || args->agbno == NULLAGBLOCK) {
		args->fsbno = NULLFSBLOCK;
		error = alloc_error;
		goto out_drop_perag;
	}

	args->fsbno = xfs_agbno_to_fsb(args->pag, args->agbno);

	ASSERT(args->len >= args->minlen);
	ASSERT(args->len <= args->maxlen);
	ASSERT(args->agbno % args->alignment == 0);
	XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno), args->len);

	/* if not file data, insert new block into the reverse map btree */
	if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) {
		error = xfs_rmap_alloc(args->tp, args->agbp, args->pag,
				       args->agbno, args->len, &args->oinfo);
		if (error)
			goto out_drop_perag;
	}

	if (!args->wasfromfl) {
		error = xfs_alloc_update_counters(args->tp, args->agbp,
						  -((long)(args->len)));
		if (error)
			goto out_drop_perag;

		ASSERT(!xfs_extent_busy_search(pag_group(args->pag),
				args->agbno, args->len));
	}

	xfs_ag_resv_alloc_extent(args->pag, args->resv, args);

	XFS_STATS_INC(mp, xs_allocx);
	XFS_STATS_ADD(mp, xs_allocb, args->len);

	trace_xfs_alloc_vextent_finish(args);

out_drop_perag:
	if (drop_perag && args->pag) {
		xfs_perag_rele(args->pag);
		args->pag = NULL;
	}
	return error;
}
/*
 * Allocate within a single AG only. This uses a best-fit length algorithm so if
 * you need an exact sized allocation without locality constraints, this is the
 * fastest way to do it.
 *
 * Caller is expected to hold a perag reference in args->pag.
 */
int
xfs_alloc_vextent_this_ag(
	struct xfs_alloc_arg	*args,
	xfs_agnumber_t		agno)
{
	xfs_agnumber_t		minimum_agno;
	uint32_t		alloc_flags = 0;
	int			error;

	ASSERT(args->pag != NULL);
	ASSERT(pag_agno(args->pag) == agno);

	args->agno = agno;
	args->agbno = 0;

	trace_xfs_alloc_vextent_this_ag(args);

	error = xfs_alloc_vextent_check_args(args,
			xfs_agbno_to_fsb(args->pag, 0), &minimum_agno);
	if (error) {
		if (error == -ENOSPC)
			return 0;
		return error;
	}

	error = xfs_alloc_vextent_prepare_ag(args, alloc_flags);
	if (!error && args->agbp)
		error = xfs_alloc_ag_vextent_size(args, alloc_flags);

	return xfs_alloc_vextent_finish(args, minimum_agno, error, false);
}
/*
 * Iterate all AGs trying to allocate an extent starting from @start_ag.
 *
 * If the incoming allocation type is XFS_ALLOCTYPE_NEAR_BNO, it means the
 * allocation attempts in @start_agno have locality information. If we fail to
 * allocate in that AG, then we revert to anywhere-in-AG for all the other AGs
 * we attempt to allocate in as there is no locality optimisation possible for
 * those allocations.
 *
 * On return, args->pag may be left referenced if we finish before the "all
 * failed" return point. The allocation finish still needs the perag, and
 * so the caller will release it once they've finished the allocation.
 *
 * When we wrap the AG iteration at the end of the filesystem, we have to be
 * careful not to wrap into AGs below ones we already have locked in the
 * transaction if we are doing a blocking iteration. This will result in an
 * out-of-order locking of AGFs and hence can cause deadlocks.
 */
static int
xfs_alloc_vextent_iterate_ags(
	struct xfs_alloc_arg	*args,
	xfs_agnumber_t		minimum_agno,
	xfs_agnumber_t		start_agno,
	xfs_agblock_t		target_agbno,
	uint32_t		alloc_flags)
{
	struct xfs_mount	*mp = args->mp;
	xfs_agnumber_t		restart_agno = minimum_agno;
	xfs_agnumber_t		agno;
	int			error = 0;

	if (alloc_flags & XFS_ALLOC_FLAG_TRYLOCK)
		restart_agno = 0;
restart:
	for_each_perag_wrap_range(mp, start_agno, restart_agno,
			mp->m_sb.sb_agcount, agno, args->pag) {
		args->agno = agno;
		error = xfs_alloc_vextent_prepare_ag(args, alloc_flags);
		if (error)
			break;
		if (!args->agbp) {
			trace_xfs_alloc_vextent_loopfailed(args);
			continue;
		}

		/*
		 * Allocation is supposed to succeed now, so break out of the
		 * loop regardless of whether we succeed or not.
		 */
		if (args->agno == start_agno && target_agbno) {
			args->agbno = target_agbno;
			error = xfs_alloc_ag_vextent_near(args, alloc_flags);
		} else {
			args->agbno = 0;
			error = xfs_alloc_ag_vextent_size(args, alloc_flags);
		}
		break;
	}
	if (error) {
		xfs_perag_rele(args->pag);
		args->pag = NULL;
		return error;
	}
	if (args->agbp)
		return 0;

	/*
	 * We didn't find an AG we can allocate from. If we were given
	 * constraining flags by the caller, drop them and retry the allocation
	 * without any constraints being set.
	 */
	if (alloc_flags & XFS_ALLOC_FLAG_TRYLOCK) {
		alloc_flags &= ~XFS_ALLOC_FLAG_TRYLOCK;
		restart_agno = minimum_agno;
		goto restart;
	}

	ASSERT(args->pag == NULL);
	trace_xfs_alloc_vextent_allfailed(args);
	return 0;
}
/*
 * Iterate the AGs from the start AG to the end of the filesystem, trying to
 * allocate blocks. It starts with a near allocation attempt in the initial
 * AG, then falls back to anywhere-in-ag after the first AG fails. It will wrap
 * back to zero if allowed by previous allocations in this transaction,
 * otherwise will wrap back to the start AG and run a second blocking pass to
 * the end of the filesystem.
 */
int
xfs_alloc_vextent_start_ag(
	struct xfs_alloc_arg	*args,
	xfs_fsblock_t		target)
{
	struct xfs_mount	*mp = args->mp;
	xfs_agnumber_t		minimum_agno;
	xfs_agnumber_t		start_agno;
	xfs_agnumber_t		rotorstep = xfs_rotorstep;
	bool			bump_rotor = false;
	uint32_t		alloc_flags = XFS_ALLOC_FLAG_TRYLOCK;
	int			error;

	ASSERT(args->pag == NULL);

	args->agno = NULLAGNUMBER;
	args->agbno = NULLAGBLOCK;

	trace_xfs_alloc_vextent_start_ag(args);

	error = xfs_alloc_vextent_check_args(args, target, &minimum_agno);
	if (error) {
		if (error == -ENOSPC)
			return 0;
		return error;
	}

	if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) &&
	    xfs_is_inode32(mp)) {
		target = XFS_AGB_TO_FSB(mp,
				((mp->m_agfrotor / rotorstep) %
				mp->m_sb.sb_agcount), 0);
		bump_rotor = 1;
	}

	start_agno = max(minimum_agno, XFS_FSB_TO_AGNO(mp, target));
	error = xfs_alloc_vextent_iterate_ags(args, minimum_agno, start_agno,
			XFS_FSB_TO_AGBNO(mp, target), alloc_flags);

	if (bump_rotor) {
		if (args->agno == start_agno)
			mp->m_agfrotor = (mp->m_agfrotor + 1) %
				(mp->m_sb.sb_agcount * rotorstep);
		else
			mp->m_agfrotor = (args->agno * rotorstep + 1) %
				(mp->m_sb.sb_agcount * rotorstep);
	}

	return xfs_alloc_vextent_finish(args, minimum_agno, error, true);
}
/*
 * Iterate from the agno indicated via @target through to the end of the
 * filesystem attempting blocking allocation. This does not wrap or try a second
 * pass, so will not recurse into AGs lower than indicated by the target.
 */
int
xfs_alloc_vextent_first_ag(
	struct xfs_alloc_arg	*args,
	xfs_fsblock_t		target)
{
	struct xfs_mount	*mp = args->mp;
	xfs_agnumber_t		minimum_agno;
	xfs_agnumber_t		start_agno;
	uint32_t		alloc_flags = XFS_ALLOC_FLAG_TRYLOCK;
	int			error;

	ASSERT(args->pag == NULL);

	args->agno = NULLAGNUMBER;
	args->agbno = NULLAGBLOCK;

	trace_xfs_alloc_vextent_first_ag(args);

	error = xfs_alloc_vextent_check_args(args, target, &minimum_agno);
	if (error) {
		if (error == -ENOSPC)
			return 0;
		return error;
	}

	start_agno = max(minimum_agno, XFS_FSB_TO_AGNO(mp, target));
	error = xfs_alloc_vextent_iterate_ags(args, minimum_agno, start_agno,
			XFS_FSB_TO_AGBNO(mp, target), alloc_flags);
	return xfs_alloc_vextent_finish(args, minimum_agno, error, true);
}
/*
 * Allocate at the exact block target or fail. Caller is expected to hold a
 * perag reference in args->pag.
 */
int
xfs_alloc_vextent_exact_bno(
	struct xfs_alloc_arg	*args,
	xfs_fsblock_t		target)
{
	struct xfs_mount	*mp = args->mp;
	xfs_agnumber_t		minimum_agno;
	int			error;

	ASSERT(args->pag != NULL);
	ASSERT(pag_agno(args->pag) == XFS_FSB_TO_AGNO(mp, target));

	args->agno = XFS_FSB_TO_AGNO(mp, target);
	args->agbno = XFS_FSB_TO_AGBNO(mp, target);

	trace_xfs_alloc_vextent_exact_bno(args);

	error = xfs_alloc_vextent_check_args(args, target, &minimum_agno);
	if (error) {
		if (error == -ENOSPC)
			return 0;
		return error;
	}

	error = xfs_alloc_vextent_prepare_ag(args, 0);
	if (!error && args->agbp)
		error = xfs_alloc_ag_vextent_exact(args);

	return xfs_alloc_vextent_finish(args, minimum_agno, error, false);
}
/*
 * Allocate an extent as close to the target as possible. If there are not
 * viable candidates in the AG, then fail the allocation.
 *
 * Caller may or may not have a per-ag reference in args->pag.
 */
int
xfs_alloc_vextent_near_bno(
	struct xfs_alloc_arg	*args,
	xfs_fsblock_t		target)
{
	struct xfs_mount	*mp = args->mp;
	xfs_agnumber_t		minimum_agno;
	bool			needs_perag = args->pag == NULL;
	uint32_t		alloc_flags = 0;
	int			error;

	if (!needs_perag)
		ASSERT(pag_agno(args->pag) == XFS_FSB_TO_AGNO(mp, target));

	args->agno = XFS_FSB_TO_AGNO(mp, target);
	args->agbno = XFS_FSB_TO_AGBNO(mp, target);

	trace_xfs_alloc_vextent_near_bno(args);

	error = xfs_alloc_vextent_check_args(args, target, &minimum_agno);
	if (error) {
		if (error == -ENOSPC)
			return 0;
		return error;
	}

	if (needs_perag)
		args->pag = xfs_perag_grab(mp, args->agno);

	error = xfs_alloc_vextent_prepare_ag(args, alloc_flags);
	if (!error && args->agbp)
		error = xfs_alloc_ag_vextent_near(args, alloc_flags);

	return xfs_alloc_vextent_finish(args, minimum_agno, error, needs_perag);
}
/* Ensure that the freelist is at full capacity. */
int
xfs_free_extent_fix_freelist(
	struct xfs_trans	*tp,
	struct xfs_perag	*pag,
	struct xfs_buf		**agbp)
{
	struct xfs_alloc_arg	args;
	int			error;

	memset(&args, 0, sizeof(struct xfs_alloc_arg));
	args.tp = tp;
	args.mp = tp->t_mountp;
	args.agno = pag_agno(pag);
	args.pag = pag;

	/*
	 * validate that the block number is legal - this enables us to detect
	 * and handle a silent filesystem corruption rather than crashing.
	 */
	if (args.agno >= args.mp->m_sb.sb_agcount)
		return -EFSCORRUPTED;

	error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
	if (error)
		return error;

	*agbp = args.agbp;
	return 0;
}
/*
 * Just break up the extent address and hand off to xfs_free_ag_extent
 * after fixing up the freelist.
 */
int
__xfs_free_extent(
	struct xfs_trans		*tp,
	struct xfs_perag		*pag,
	xfs_agblock_t			agbno,
	xfs_extlen_t			len,
	const struct xfs_owner_info	*oinfo,
	enum xfs_ag_resv_type		type,
	bool				skip_discard)
{
	struct xfs_mount		*mp = tp->t_mountp;
	struct xfs_buf			*agbp;
	struct xfs_agf			*agf;
	int				error;
	unsigned int			busy_flags = 0;

	ASSERT(len != 0);
	ASSERT(type != XFS_AG_RESV_AGFL);

	if (XFS_TEST_ERROR(false, mp,
			XFS_ERRTAG_FREE_EXTENT))
		return -EIO;

	error = xfs_free_extent_fix_freelist(tp, pag, &agbp);
	if (error) {
		if (xfs_metadata_is_sick(error))
			xfs_ag_mark_sick(pag, XFS_SICK_AG_BNOBT);
		return error;
	}

	agf = agbp->b_addr;

	if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) {
		xfs_ag_mark_sick(pag, XFS_SICK_AG_BNOBT);
		error = -EFSCORRUPTED;
		goto err_release;
	}

	/* validate the extent size is legal now we have the agf locked */
	if (XFS_IS_CORRUPT(mp, agbno + len > be32_to_cpu(agf->agf_length))) {
		xfs_ag_mark_sick(pag, XFS_SICK_AG_BNOBT);
		error = -EFSCORRUPTED;
		goto err_release;
	}

	error = xfs_free_ag_extent(tp, agbp, agbno, len, oinfo, type);
	if (error)
		goto err_release;

	if (skip_discard)
		busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD;
	xfs_extent_busy_insert(tp, pag_group(pag), agbno, len, busy_flags);
	return 0;

err_release:
	xfs_trans_brelse(tp, agbp);
	return error;
}
struct xfs_alloc_query_range_info {
	xfs_alloc_query_range_fn	fn;
	void				*priv;
};

/* Format btree record and pass to our callback. */
STATIC int
xfs_alloc_query_range_helper(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_rec	*rec,
	void				*priv)
{
	struct xfs_alloc_query_range_info	*query = priv;
	struct xfs_alloc_rec_incore		irec;
	xfs_failaddr_t				fa;

	xfs_alloc_btrec_to_irec(rec, &irec);
	fa = xfs_alloc_check_irec(to_perag(cur->bc_group), &irec);
	if (fa)
		return xfs_alloc_complain_bad_rec(cur, fa, &irec);

	return query->fn(cur, &irec, query->priv);
}
/* Find all free space within a given range of blocks. */
int
xfs_alloc_query_range(
	struct xfs_btree_cur			*cur,
	const struct xfs_alloc_rec_incore	*low_rec,
	const struct xfs_alloc_rec_incore	*high_rec,
	xfs_alloc_query_range_fn		fn,
	void					*priv)
{
	union xfs_btree_irec			low_brec = { .a = *low_rec };
	union xfs_btree_irec			high_brec = { .a = *high_rec };
	struct xfs_alloc_query_range_info	query = { .priv = priv, .fn = fn };

	ASSERT(xfs_btree_is_bno(cur->bc_ops));
	return xfs_btree_query_range(cur, &low_brec, &high_brec,
			xfs_alloc_query_range_helper, &query);
}

/* Find all free space records. */
int
xfs_alloc_query_all(
	struct xfs_btree_cur		*cur,
	xfs_alloc_query_range_fn	fn,
	void				*priv)
{
	struct xfs_alloc_query_range_info	query;

	ASSERT(xfs_btree_is_bno(cur->bc_ops));
	query.priv = priv;
	query.fn = fn;
	return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query);
}
/*
 * Scan part of the keyspace of the free space and tell us if the area has no
 * records, is fully mapped by records, or is partially filled.
 */
int
xfs_alloc_has_records(
	struct xfs_btree_cur	*cur,
	xfs_agblock_t		bno,
	xfs_extlen_t		len,
	enum xbtree_recpacking	*outcome)
{
	union xfs_btree_irec	low;
	union xfs_btree_irec	high;

	memset(&low, 0, sizeof(low));
	low.a.ar_startblock = bno;
	memset(&high, 0xFF, sizeof(high));
	high.a.ar_startblock = bno + len - 1;

	return xfs_btree_has_records(cur, &low, &high, NULL, outcome);
}
/*
 * Walk all the blocks in the AGFL.  The @walk_fn can return any negative
 * error code or XFS_ITER_*.
 */
int
xfs_agfl_walk(
	struct xfs_mount	*mp,
	struct xfs_agf		*agf,
	struct xfs_buf		*agflbp,
	xfs_agfl_walk_fn	walk_fn,
	void			*priv)
{
	__be32			*agfl_bno;
	unsigned int		i;
	int			error;

	agfl_bno = xfs_buf_to_agfl_bno(agflbp);
	i = be32_to_cpu(agf->agf_flfirst);

	/* Nothing to walk in an empty AGFL. */
	if (agf->agf_flcount == cpu_to_be32(0))
		return 0;

	/* Otherwise, walk from first to last, wrapping as needed. */
	for (;;) {
		error = walk_fn(mp, be32_to_cpu(agfl_bno[i]), priv);
		if (error)
			return error;
		if (i == be32_to_cpu(agf->agf_fllast))
			break;
		if (++i == xfs_agfl_size(mp))
			i = 0;
	}

	return 0;
}
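
/*
 * Illustrative wrap behaviour of the walk above: with xfs_agfl_size() = 8,
 * agf_flfirst = 5 and agf_fllast = 2, the callback sees the AGFL slots in the
 * order 5, 6, 7, 0, 1, 2 before the loop terminates at agf_fllast.
 */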
int __init
xfs_extfree_intent_init_cache(void)
{
	xfs_extfree_item_cache = kmem_cache_create("xfs_extfree_intent",
			sizeof(struct xfs_extent_free_item),
			0, 0, NULL);

	return xfs_extfree_item_cache != NULL ? 0 : -ENOMEM;
}

void
xfs_extfree_intent_destroy_cache(void)
{
	kmem_cache_destroy(xfs_extfree_item_cache);
	xfs_extfree_item_cache = NULL;
}