1 // SPDX-License-Identifier: GPL-2.0
3 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_log_format.h"
11 #include "xfs_trans_resv.h"
13 #include "xfs_mount.h"
14 #include "xfs_inode.h"
15 #include "xfs_btree.h"
16 #include "xfs_ialloc.h"
17 #include "xfs_ialloc_btree.h"
18 #include "xfs_alloc.h"
19 #include "xfs_errortag.h"
20 #include "xfs_error.h"
22 #include "xfs_trans.h"
23 #include "xfs_buf_item.h"
24 #include "xfs_icreate_item.h"
25 #include "xfs_icache.h"
26 #include "xfs_trace.h"
30 #include "xfs_health.h"
33 * Lookup a record by ino in the btree given by cur.
37 struct xfs_btree_cur
*cur
, /* btree cursor */
38 xfs_agino_t ino
, /* starting inode of chunk */
39 xfs_lookup_t dir
, /* <=, >=, == */
40 int *stat
) /* success/failure */
42 cur
->bc_rec
.i
.ir_startino
= ino
;
43 cur
->bc_rec
.i
.ir_holemask
= 0;
44 cur
->bc_rec
.i
.ir_count
= 0;
45 cur
->bc_rec
.i
.ir_freecount
= 0;
46 cur
->bc_rec
.i
.ir_free
= 0;
47 return xfs_btree_lookup(cur
, dir
, stat
);
51 * Update the record referred to by cur to the value given.
52 * This either works (return 0) or gets an EFSCORRUPTED error.
54 STATIC
int /* error */
56 struct xfs_btree_cur
*cur
, /* btree cursor */
57 xfs_inobt_rec_incore_t
*irec
) /* btree record */
59 union xfs_btree_rec rec
;
61 rec
.inobt
.ir_startino
= cpu_to_be32(irec
->ir_startino
);
62 if (xfs_has_sparseinodes(cur
->bc_mp
)) {
63 rec
.inobt
.ir_u
.sp
.ir_holemask
= cpu_to_be16(irec
->ir_holemask
);
64 rec
.inobt
.ir_u
.sp
.ir_count
= irec
->ir_count
;
65 rec
.inobt
.ir_u
.sp
.ir_freecount
= irec
->ir_freecount
;
67 /* ir_holemask/ir_count not supported on-disk */
68 rec
.inobt
.ir_u
.f
.ir_freecount
= cpu_to_be32(irec
->ir_freecount
);
70 rec
.inobt
.ir_free
= cpu_to_be64(irec
->ir_free
);
71 return xfs_btree_update(cur
, &rec
);
74 /* Convert on-disk btree record to incore inobt record. */
76 xfs_inobt_btrec_to_irec(
78 const union xfs_btree_rec
*rec
,
79 struct xfs_inobt_rec_incore
*irec
)
81 irec
->ir_startino
= be32_to_cpu(rec
->inobt
.ir_startino
);
82 if (xfs_has_sparseinodes(mp
)) {
83 irec
->ir_holemask
= be16_to_cpu(rec
->inobt
.ir_u
.sp
.ir_holemask
);
84 irec
->ir_count
= rec
->inobt
.ir_u
.sp
.ir_count
;
85 irec
->ir_freecount
= rec
->inobt
.ir_u
.sp
.ir_freecount
;
88 * ir_holemask/ir_count not supported on-disk. Fill in hardcoded
89 * values for full inode chunks.
91 irec
->ir_holemask
= XFS_INOBT_HOLEMASK_FULL
;
92 irec
->ir_count
= XFS_INODES_PER_CHUNK
;
94 be32_to_cpu(rec
->inobt
.ir_u
.f
.ir_freecount
);
96 irec
->ir_free
= be64_to_cpu(rec
->inobt
.ir_free
);
99 /* Compute the freecount of an incore inode record. */
101 xfs_inobt_rec_freecount(
102 const struct xfs_inobt_rec_incore
*irec
)
104 uint64_t realfree
= irec
->ir_free
;
106 if (xfs_inobt_issparse(irec
->ir_holemask
))
107 realfree
&= xfs_inobt_irec_to_allocmask(irec
);
108 return hweight64(realfree
);
111 /* Simple checks for inode records. */
113 xfs_inobt_check_irec(
114 struct xfs_perag
*pag
,
115 const struct xfs_inobt_rec_incore
*irec
)
117 /* Record has to be properly aligned within the AG. */
118 if (!xfs_verify_agino(pag
, irec
->ir_startino
))
119 return __this_address
;
120 if (!xfs_verify_agino(pag
,
121 irec
->ir_startino
+ XFS_INODES_PER_CHUNK
- 1))
122 return __this_address
;
123 if (irec
->ir_count
< XFS_INODES_PER_HOLEMASK_BIT
||
124 irec
->ir_count
> XFS_INODES_PER_CHUNK
)
125 return __this_address
;
126 if (irec
->ir_freecount
> XFS_INODES_PER_CHUNK
)
127 return __this_address
;
129 if (xfs_inobt_rec_freecount(irec
) != irec
->ir_freecount
)
130 return __this_address
;
136 xfs_inobt_complain_bad_rec(
137 struct xfs_btree_cur
*cur
,
139 const struct xfs_inobt_rec_incore
*irec
)
141 struct xfs_mount
*mp
= cur
->bc_mp
;
144 "%sbt record corruption in AG %d detected at %pS!",
145 cur
->bc_ops
->name
, cur
->bc_group
->xg_gno
, fa
);
147 "start inode 0x%x, count 0x%x, free 0x%x freemask 0x%llx, holemask 0x%x",
148 irec
->ir_startino
, irec
->ir_count
, irec
->ir_freecount
,
149 irec
->ir_free
, irec
->ir_holemask
);
150 xfs_btree_mark_sick(cur
);
151 return -EFSCORRUPTED
;
155 * Get the data from the pointed-to record.
159 struct xfs_btree_cur
*cur
,
160 struct xfs_inobt_rec_incore
*irec
,
163 struct xfs_mount
*mp
= cur
->bc_mp
;
164 union xfs_btree_rec
*rec
;
168 error
= xfs_btree_get_rec(cur
, &rec
, stat
);
169 if (error
|| *stat
== 0)
172 xfs_inobt_btrec_to_irec(mp
, rec
, irec
);
173 fa
= xfs_inobt_check_irec(to_perag(cur
->bc_group
), irec
);
175 return xfs_inobt_complain_bad_rec(cur
, fa
, irec
);
181 * Insert a single inobt record. Cursor must already point to desired location.
184 xfs_inobt_insert_rec(
185 struct xfs_btree_cur
*cur
,
192 cur
->bc_rec
.i
.ir_holemask
= holemask
;
193 cur
->bc_rec
.i
.ir_count
= count
;
194 cur
->bc_rec
.i
.ir_freecount
= freecount
;
195 cur
->bc_rec
.i
.ir_free
= free
;
196 return xfs_btree_insert(cur
, stat
);
200 * Insert records describing a newly allocated inode chunk into the inobt.
204 struct xfs_perag
*pag
,
205 struct xfs_trans
*tp
,
206 struct xfs_buf
*agbp
,
211 struct xfs_btree_cur
*cur
;
217 cur
= xfs_finobt_init_cursor(pag
, tp
, agbp
);
219 cur
= xfs_inobt_init_cursor(pag
, tp
, agbp
);
221 for (thisino
= newino
;
222 thisino
< newino
+ newlen
;
223 thisino
+= XFS_INODES_PER_CHUNK
) {
224 error
= xfs_inobt_lookup(cur
, thisino
, XFS_LOOKUP_EQ
, &i
);
226 xfs_btree_del_cursor(cur
, XFS_BTREE_ERROR
);
231 error
= xfs_inobt_insert_rec(cur
, XFS_INOBT_HOLEMASK_FULL
,
232 XFS_INODES_PER_CHUNK
,
233 XFS_INODES_PER_CHUNK
,
234 XFS_INOBT_ALL_FREE
, &i
);
236 xfs_btree_del_cursor(cur
, XFS_BTREE_ERROR
);
242 xfs_btree_del_cursor(cur
, XFS_BTREE_NOERROR
);
248 * Verify that the number of free inodes in the AGI is correct.
252 xfs_check_agi_freecount(
253 struct xfs_btree_cur
*cur
)
255 if (cur
->bc_nlevels
== 1) {
256 xfs_inobt_rec_incore_t rec
;
261 error
= xfs_inobt_lookup(cur
, 0, XFS_LOOKUP_GE
, &i
);
266 error
= xfs_inobt_get_rec(cur
, &rec
, &i
);
271 freecount
+= rec
.ir_freecount
;
272 error
= xfs_btree_increment(cur
, 0, &i
);
278 if (!xfs_is_shutdown(cur
->bc_mp
)) {
280 to_perag(cur
->bc_group
)->pagi_freecount
);
286 #define xfs_check_agi_freecount(cur) 0
290 * Initialise a new set of inodes. When called without a transaction context
291 * (e.g. from recovery) we initiate a delayed write of the inode buffers rather
292 * than logging them (which in a transaction context puts them into the AIL
293 * for writeback rather than the xfsbufd queue).
296 xfs_ialloc_inode_init(
297 struct xfs_mount
*mp
,
298 struct xfs_trans
*tp
,
299 struct list_head
*buffer_list
,
303 xfs_agblock_t length
,
306 struct xfs_buf
*fbuf
;
307 struct xfs_dinode
*free
;
316 * Loop over the new block(s), filling in the inodes. For small block
317 * sizes, manipulate the inodes in buffers which are multiples of the
320 nbufs
= length
/ M_IGEO(mp
)->blocks_per_cluster
;
323 * Figure out what version number to use in the inodes we create. If
324 * the superblock version has caught up to the one that supports the new
325 * inode format, then use the new inode version. Otherwise use the old
326 * version so that old kernels will continue to be able to use the file
329 * For v3 inodes, we also need to write the inode number into the inode,
330 * so calculate the first inode number of the chunk here as
331 * XFS_AGB_TO_AGINO() only works within a filesystem block, not
332 * across multiple filesystem blocks (such as a cluster) and so cannot
333 * be used in the cluster buffer loop below.
335 * Further, because we are writing the inode directly into the buffer
336 * and calculating a CRC on the entire inode, we have ot log the entire
337 * inode so that the entire range the CRC covers is present in the log.
338 * That means for v3 inode we log the entire buffer rather than just the
341 if (xfs_has_v3inodes(mp
)) {
343 ino
= XFS_AGINO_TO_INO(mp
, agno
, XFS_AGB_TO_AGINO(mp
, agbno
));
346 * log the initialisation that is about to take place as an
347 * logical operation. This means the transaction does not
348 * need to log the physical changes to the inode buffers as log
349 * recovery will know what initialisation is actually needed.
350 * Hence we only need to log the buffers as "ordered" buffers so
351 * they track in the AIL as if they were physically logged.
354 xfs_icreate_log(tp
, agno
, agbno
, icount
,
355 mp
->m_sb
.sb_inodesize
, length
, gen
);
359 for (j
= 0; j
< nbufs
; j
++) {
363 d
= XFS_AGB_TO_DADDR(mp
, agno
, agbno
+
364 (j
* M_IGEO(mp
)->blocks_per_cluster
));
365 error
= xfs_trans_get_buf(tp
, mp
->m_ddev_targp
, d
,
366 mp
->m_bsize
* M_IGEO(mp
)->blocks_per_cluster
,
367 XBF_UNMAPPED
, &fbuf
);
371 /* Initialize the inode buffers and log them appropriately. */
372 fbuf
->b_ops
= &xfs_inode_buf_ops
;
373 xfs_buf_zero(fbuf
, 0, BBTOB(fbuf
->b_length
));
374 for (i
= 0; i
< M_IGEO(mp
)->inodes_per_cluster
; i
++) {
375 int ioffset
= i
<< mp
->m_sb
.sb_inodelog
;
377 free
= xfs_make_iptr(mp
, fbuf
, i
);
378 free
->di_magic
= cpu_to_be16(XFS_DINODE_MAGIC
);
379 free
->di_version
= version
;
380 free
->di_gen
= cpu_to_be32(gen
);
381 free
->di_next_unlinked
= cpu_to_be32(NULLAGINO
);
384 free
->di_ino
= cpu_to_be64(ino
);
386 uuid_copy(&free
->di_uuid
,
387 &mp
->m_sb
.sb_meta_uuid
);
388 xfs_dinode_calc_crc(mp
, free
);
390 /* just log the inode core */
391 xfs_trans_log_buf(tp
, fbuf
, ioffset
,
392 ioffset
+ XFS_DINODE_SIZE(mp
) - 1);
398 * Mark the buffer as an inode allocation buffer so it
399 * sticks in AIL at the point of this allocation
400 * transaction. This ensures the they are on disk before
401 * the tail of the log can be moved past this
402 * transaction (i.e. by preventing relogging from moving
403 * it forward in the log).
405 xfs_trans_inode_alloc_buf(tp
, fbuf
);
408 * Mark the buffer as ordered so that they are
409 * not physically logged in the transaction but
410 * still tracked in the AIL as part of the
411 * transaction and pin the log appropriately.
413 xfs_trans_ordered_buf(tp
, fbuf
);
416 fbuf
->b_flags
|= XBF_DONE
;
417 xfs_buf_delwri_queue(fbuf
, buffer_list
);
425 * Align startino and allocmask for a recently allocated sparse chunk such that
426 * they are fit for insertion (or merge) into the on-disk inode btrees.
430 * When enabled, sparse inode support increases the inode alignment from cluster
431 * size to inode chunk size. This means that the minimum range between two
432 * non-adjacent inode records in the inobt is large enough for a full inode
433 * record. This allows for cluster sized, cluster aligned block allocation
434 * without need to worry about whether the resulting inode record overlaps with
435 * another record in the tree. Without this basic rule, we would have to deal
436 * with the consequences of overlap by potentially undoing recent allocations in
437 * the inode allocation codepath.
439 * Because of this alignment rule (which is enforced on mount), there are two
440 * inobt possibilities for newly allocated sparse chunks. One is that the
441 * aligned inode record for the chunk covers a range of inodes not already
442 * covered in the inobt (i.e., it is safe to insert a new sparse record). The
443 * other is that a record already exists at the aligned startino that considers
444 * the newly allocated range as sparse. In the latter case, record content is
445 * merged in hope that sparse inode chunks fill to full chunks over time.
448 xfs_align_sparse_ino(
449 struct xfs_mount
*mp
,
450 xfs_agino_t
*startino
,
457 agbno
= XFS_AGINO_TO_AGBNO(mp
, *startino
);
458 mod
= agbno
% mp
->m_sb
.sb_inoalignmt
;
462 /* calculate the inode offset and align startino */
463 offset
= XFS_AGB_TO_AGINO(mp
, mod
);
467 * Since startino has been aligned down, left shift allocmask such that
468 * it continues to represent the same physical inodes relative to the
471 *allocmask
<<= offset
/ XFS_INODES_PER_HOLEMASK_BIT
;
475 * Determine whether the source inode record can merge into the target. Both
476 * records must be sparse, the inode ranges must match and there must be no
477 * allocation overlap between the records.
480 __xfs_inobt_can_merge(
481 struct xfs_inobt_rec_incore
*trec
, /* tgt record */
482 struct xfs_inobt_rec_incore
*srec
) /* src record */
487 /* records must cover the same inode range */
488 if (trec
->ir_startino
!= srec
->ir_startino
)
491 /* both records must be sparse */
492 if (!xfs_inobt_issparse(trec
->ir_holemask
) ||
493 !xfs_inobt_issparse(srec
->ir_holemask
))
496 /* both records must track some inodes */
497 if (!trec
->ir_count
|| !srec
->ir_count
)
500 /* can't exceed capacity of a full record */
501 if (trec
->ir_count
+ srec
->ir_count
> XFS_INODES_PER_CHUNK
)
504 /* verify there is no allocation overlap */
505 talloc
= xfs_inobt_irec_to_allocmask(trec
);
506 salloc
= xfs_inobt_irec_to_allocmask(srec
);
514 * Merge the source inode record into the target. The caller must call
515 * __xfs_inobt_can_merge() to ensure the merge is valid.
518 __xfs_inobt_rec_merge(
519 struct xfs_inobt_rec_incore
*trec
, /* target */
520 struct xfs_inobt_rec_incore
*srec
) /* src */
522 ASSERT(trec
->ir_startino
== srec
->ir_startino
);
524 /* combine the counts */
525 trec
->ir_count
+= srec
->ir_count
;
526 trec
->ir_freecount
+= srec
->ir_freecount
;
529 * Merge the holemask and free mask. For both fields, 0 bits refer to
530 * allocated inodes. We combine the allocated ranges with bitwise AND.
532 trec
->ir_holemask
&= srec
->ir_holemask
;
533 trec
->ir_free
&= srec
->ir_free
;
537 * Insert a new sparse inode chunk into the associated inode allocation btree.
538 * The inode record for the sparse chunk is pre-aligned to a startino that
539 * should match any pre-existing sparse inode record in the tree. This allows
540 * sparse chunks to fill over time.
542 * If no preexisting record exists, the provided record is inserted.
543 * If there is a preexisting record, the provided record is merged with the
544 * existing record and updated in place. The merged record is returned in nrec.
546 * It is considered corruption if a merge is requested and not possible. Given
547 * the sparse inode alignment constraints, this should never happen.
550 xfs_inobt_insert_sprec(
551 struct xfs_perag
*pag
,
552 struct xfs_trans
*tp
,
553 struct xfs_buf
*agbp
,
554 struct xfs_inobt_rec_incore
*nrec
) /* in/out: new/merged rec. */
556 struct xfs_mount
*mp
= pag_mount(pag
);
557 struct xfs_btree_cur
*cur
;
560 struct xfs_inobt_rec_incore rec
;
562 cur
= xfs_inobt_init_cursor(pag
, tp
, agbp
);
564 /* the new record is pre-aligned so we know where to look */
565 error
= xfs_inobt_lookup(cur
, nrec
->ir_startino
, XFS_LOOKUP_EQ
, &i
);
568 /* if nothing there, insert a new record and return */
570 error
= xfs_inobt_insert_rec(cur
, nrec
->ir_holemask
,
571 nrec
->ir_count
, nrec
->ir_freecount
,
575 if (XFS_IS_CORRUPT(mp
, i
!= 1)) {
576 xfs_btree_mark_sick(cur
);
577 error
= -EFSCORRUPTED
;
585 * A record exists at this startino. Merge the records.
587 error
= xfs_inobt_get_rec(cur
, &rec
, &i
);
590 if (XFS_IS_CORRUPT(mp
, i
!= 1)) {
591 xfs_btree_mark_sick(cur
);
592 error
= -EFSCORRUPTED
;
595 if (XFS_IS_CORRUPT(mp
, rec
.ir_startino
!= nrec
->ir_startino
)) {
596 xfs_btree_mark_sick(cur
);
597 error
= -EFSCORRUPTED
;
602 * This should never fail. If we have coexisting records that
603 * cannot merge, something is seriously wrong.
605 if (XFS_IS_CORRUPT(mp
, !__xfs_inobt_can_merge(nrec
, &rec
))) {
606 xfs_btree_mark_sick(cur
);
607 error
= -EFSCORRUPTED
;
611 trace_xfs_irec_merge_pre(pag
, &rec
, nrec
);
613 /* merge to nrec to output the updated record */
614 __xfs_inobt_rec_merge(nrec
, &rec
);
616 trace_xfs_irec_merge_post(pag
, nrec
);
618 error
= xfs_inobt_rec_check_count(mp
, nrec
);
622 error
= xfs_inobt_update(cur
, nrec
);
627 xfs_btree_del_cursor(cur
, XFS_BTREE_NOERROR
);
630 xfs_btree_del_cursor(cur
, XFS_BTREE_ERROR
);
635 * Insert a new sparse inode chunk into the free inode btree. The inode
636 * record for the sparse chunk is pre-aligned to a startino that should match
637 * any pre-existing sparse inode record in the tree. This allows sparse chunks
640 * The new record is always inserted, overwriting a pre-existing record if
644 xfs_finobt_insert_sprec(
645 struct xfs_perag
*pag
,
646 struct xfs_trans
*tp
,
647 struct xfs_buf
*agbp
,
648 struct xfs_inobt_rec_incore
*nrec
) /* in/out: new rec. */
650 struct xfs_mount
*mp
= pag_mount(pag
);
651 struct xfs_btree_cur
*cur
;
655 cur
= xfs_finobt_init_cursor(pag
, tp
, agbp
);
657 /* the new record is pre-aligned so we know where to look */
658 error
= xfs_inobt_lookup(cur
, nrec
->ir_startino
, XFS_LOOKUP_EQ
, &i
);
661 /* if nothing there, insert a new record and return */
663 error
= xfs_inobt_insert_rec(cur
, nrec
->ir_holemask
,
664 nrec
->ir_count
, nrec
->ir_freecount
,
668 if (XFS_IS_CORRUPT(mp
, i
!= 1)) {
669 xfs_btree_mark_sick(cur
);
670 error
= -EFSCORRUPTED
;
674 error
= xfs_inobt_update(cur
, nrec
);
679 xfs_btree_del_cursor(cur
, XFS_BTREE_NOERROR
);
682 xfs_btree_del_cursor(cur
, XFS_BTREE_ERROR
);
688 * Allocate new inodes in the allocation group specified by agbp. Returns 0 if
689 * inodes were allocated in this AG; -EAGAIN if there was no space in this AG so
690 * the caller knows it can try another AG, a hard -ENOSPC when over the maximum
691 * inode count threshold, or the usual negative error code for other errors.
695 struct xfs_perag
*pag
,
696 struct xfs_trans
*tp
,
697 struct xfs_buf
*agbp
)
700 struct xfs_alloc_arg args
;
702 xfs_agino_t newino
; /* new first inode's number */
703 xfs_agino_t newlen
; /* new number of inodes */
704 int isaligned
= 0; /* inode allocation at stripe */
706 /* init. to full chunk */
707 struct xfs_inobt_rec_incore rec
;
708 struct xfs_ino_geometry
*igeo
= M_IGEO(tp
->t_mountp
);
709 uint16_t allocmask
= (uint16_t) -1;
712 memset(&args
, 0, sizeof(args
));
714 args
.mp
= tp
->t_mountp
;
715 args
.fsbno
= NULLFSBLOCK
;
716 args
.oinfo
= XFS_RMAP_OINFO_INODES
;
720 /* randomly do sparse inode allocations */
721 if (xfs_has_sparseinodes(tp
->t_mountp
) &&
722 igeo
->ialloc_min_blks
< igeo
->ialloc_blks
)
723 do_sparse
= get_random_u32_below(2);
727 * Locking will ensure that we don't have two callers in here
730 newlen
= igeo
->ialloc_inos
;
731 if (igeo
->maxicount
&&
732 percpu_counter_read_positive(&args
.mp
->m_icount
) + newlen
>
735 args
.minlen
= args
.maxlen
= igeo
->ialloc_blks
;
737 * First try to allocate inodes contiguous with the last-allocated
738 * chunk of inodes. If the filesystem is striped, this will fill
739 * an entire stripe unit with inodes.
742 newino
= be32_to_cpu(agi
->agi_newino
);
743 args
.agbno
= XFS_AGINO_TO_AGBNO(args
.mp
, newino
) +
747 if (likely(newino
!= NULLAGINO
&&
748 (args
.agbno
< be32_to_cpu(agi
->agi_length
)))) {
752 * We need to take into account alignment here to ensure that
753 * we don't modify the free list if we fail to have an exact
754 * block. If we don't have an exact match, and every oher
755 * attempt allocation attempt fails, we'll end up cancelling
756 * a dirty transaction and shutting down.
758 * For an exact allocation, alignment must be 1,
759 * however we need to take cluster alignment into account when
760 * fixing up the freelist. Use the minalignslop field to
761 * indicate that extra blocks might be required for alignment,
762 * but not to use them in the actual exact allocation.
765 args
.minalignslop
= igeo
->cluster_align
- 1;
767 /* Allow space for the inode btree to split. */
768 args
.minleft
= igeo
->inobt_maxlevels
;
769 error
= xfs_alloc_vextent_exact_bno(&args
,
770 xfs_agbno_to_fsb(pag
, args
.agbno
));
775 * This request might have dirtied the transaction if the AG can
776 * satisfy the request, but the exact block was not available.
777 * If the allocation did fail, subsequent requests will relax
778 * the exact agbno requirement and increase the alignment
779 * instead. It is critical that the total size of the request
780 * (len + alignment + slop) does not increase from this point
781 * on, so reset minalignslop to ensure it is not included in
782 * subsequent requests.
784 args
.minalignslop
= 0;
787 if (unlikely(args
.fsbno
== NULLFSBLOCK
)) {
789 * Set the alignment for the allocation.
790 * If stripe alignment is turned on then align at stripe unit
792 * If the cluster size is smaller than a filesystem block
793 * then we're doing I/O for inodes in filesystem block size
794 * pieces, so don't need alignment anyway.
797 if (igeo
->ialloc_align
) {
798 ASSERT(!xfs_has_noalign(args
.mp
));
799 args
.alignment
= args
.mp
->m_dalign
;
802 args
.alignment
= igeo
->cluster_align
;
804 * Allocate a fixed-size extent of inodes.
808 * Allow space for the inode btree to split.
810 args
.minleft
= igeo
->inobt_maxlevels
;
811 error
= xfs_alloc_vextent_near_bno(&args
,
812 xfs_agbno_to_fsb(pag
,
813 be32_to_cpu(agi
->agi_root
)));
819 * If stripe alignment is turned on, then try again with cluster
822 if (isaligned
&& args
.fsbno
== NULLFSBLOCK
) {
823 args
.alignment
= igeo
->cluster_align
;
824 error
= xfs_alloc_vextent_near_bno(&args
,
825 xfs_agbno_to_fsb(pag
,
826 be32_to_cpu(agi
->agi_root
)));
832 * Finally, try a sparse allocation if the filesystem supports it and
833 * the sparse allocation length is smaller than a full chunk.
835 if (xfs_has_sparseinodes(args
.mp
) &&
836 igeo
->ialloc_min_blks
< igeo
->ialloc_blks
&&
837 args
.fsbno
== NULLFSBLOCK
) {
839 args
.alignment
= args
.mp
->m_sb
.sb_spino_align
;
842 args
.minlen
= igeo
->ialloc_min_blks
;
843 args
.maxlen
= args
.minlen
;
846 * The inode record will be aligned to full chunk size. We must
847 * prevent sparse allocation from AG boundaries that result in
848 * invalid inode records, such as records that start at agbno 0
849 * or extend beyond the AG.
851 * Set min agbno to the first aligned, non-zero agbno and max to
852 * the last aligned agbno that is at least one full chunk from
855 args
.min_agbno
= args
.mp
->m_sb
.sb_inoalignmt
;
856 args
.max_agbno
= round_down(xfs_ag_block_count(args
.mp
,
858 args
.mp
->m_sb
.sb_inoalignmt
) -
861 error
= xfs_alloc_vextent_near_bno(&args
,
862 xfs_agbno_to_fsb(pag
,
863 be32_to_cpu(agi
->agi_root
)));
867 newlen
= XFS_AGB_TO_AGINO(args
.mp
, args
.len
);
868 ASSERT(newlen
<= XFS_INODES_PER_CHUNK
);
869 allocmask
= (1 << (newlen
/ XFS_INODES_PER_HOLEMASK_BIT
)) - 1;
872 if (args
.fsbno
== NULLFSBLOCK
)
875 ASSERT(args
.len
== args
.minlen
);
878 * Stamp and write the inode buffers.
880 * Seed the new inode cluster with a random generation number. This
881 * prevents short-term reuse of generation numbers if a chunk is
882 * freed and then immediately reallocated. We use random numbers
883 * rather than a linear progression to prevent the next generation
884 * number from being easily guessable.
886 error
= xfs_ialloc_inode_init(args
.mp
, tp
, NULL
, newlen
, pag_agno(pag
),
887 args
.agbno
, args
.len
, get_random_u32());
892 * Convert the results.
894 newino
= XFS_AGB_TO_AGINO(args
.mp
, args
.agbno
);
896 if (xfs_inobt_issparse(~allocmask
)) {
898 * We've allocated a sparse chunk. Align the startino and mask.
900 xfs_align_sparse_ino(args
.mp
, &newino
, &allocmask
);
902 rec
.ir_startino
= newino
;
903 rec
.ir_holemask
= ~allocmask
;
904 rec
.ir_count
= newlen
;
905 rec
.ir_freecount
= newlen
;
906 rec
.ir_free
= XFS_INOBT_ALL_FREE
;
909 * Insert the sparse record into the inobt and allow for a merge
910 * if necessary. If a merge does occur, rec is updated to the
913 error
= xfs_inobt_insert_sprec(pag
, tp
, agbp
, &rec
);
914 if (error
== -EFSCORRUPTED
) {
916 "invalid sparse inode record: ino 0x%llx holemask 0x%x count %u",
917 xfs_agino_to_ino(pag
, rec
.ir_startino
),
918 rec
.ir_holemask
, rec
.ir_count
);
919 xfs_force_shutdown(args
.mp
, SHUTDOWN_CORRUPT_INCORE
);
925 * We can't merge the part we've just allocated as for the inobt
926 * due to finobt semantics. The original record may or may not
927 * exist independent of whether physical inodes exist in this
930 * We must update the finobt record based on the inobt record.
931 * rec contains the fully merged and up to date inobt record
932 * from the previous call. Set merge false to replace any
933 * existing record with this one.
935 if (xfs_has_finobt(args
.mp
)) {
936 error
= xfs_finobt_insert_sprec(pag
, tp
, agbp
, &rec
);
941 /* full chunk - insert new records to both btrees */
942 error
= xfs_inobt_insert(pag
, tp
, agbp
, newino
, newlen
, false);
946 if (xfs_has_finobt(args
.mp
)) {
947 error
= xfs_inobt_insert(pag
, tp
, agbp
, newino
,
955 * Update AGI counts and newino.
957 be32_add_cpu(&agi
->agi_count
, newlen
);
958 be32_add_cpu(&agi
->agi_freecount
, newlen
);
959 pag
->pagi_freecount
+= newlen
;
960 pag
->pagi_count
+= newlen
;
961 agi
->agi_newino
= cpu_to_be32(newino
);
964 * Log allocation group header fields
966 xfs_ialloc_log_agi(tp
, agbp
,
967 XFS_AGI_COUNT
| XFS_AGI_FREECOUNT
| XFS_AGI_NEWINO
);
969 * Modify/log superblock values for inode count and inode free count.
971 xfs_trans_mod_sb(tp
, XFS_TRANS_SB_ICOUNT
, (long)newlen
);
972 xfs_trans_mod_sb(tp
, XFS_TRANS_SB_IFREE
, (long)newlen
);
977 * Try to retrieve the next record to the left/right from the current one.
981 struct xfs_btree_cur
*cur
,
982 xfs_inobt_rec_incore_t
*rec
,
990 error
= xfs_btree_decrement(cur
, 0, &i
);
992 error
= xfs_btree_increment(cur
, 0, &i
);
998 error
= xfs_inobt_get_rec(cur
, rec
, &i
);
1001 if (XFS_IS_CORRUPT(cur
->bc_mp
, i
!= 1)) {
1002 xfs_btree_mark_sick(cur
);
1003 return -EFSCORRUPTED
;
1012 struct xfs_btree_cur
*cur
,
1014 xfs_inobt_rec_incore_t
*rec
,
1020 error
= xfs_inobt_lookup(cur
, agino
, XFS_LOOKUP_EQ
, &i
);
1025 error
= xfs_inobt_get_rec(cur
, rec
, &i
);
1028 if (XFS_IS_CORRUPT(cur
->bc_mp
, i
!= 1)) {
1029 xfs_btree_mark_sick(cur
);
1030 return -EFSCORRUPTED
;
1038 * Return the offset of the first free inode in the record. If the inode chunk
1039 * is sparsely allocated, we convert the record holemask to inode granularity
1040 * and mask off the unallocated regions from the inode free mask.
1043 xfs_inobt_first_free_inode(
1044 struct xfs_inobt_rec_incore
*rec
)
1046 xfs_inofree_t realfree
;
1048 /* if there are no holes, return the first available offset */
1049 if (!xfs_inobt_issparse(rec
->ir_holemask
))
1050 return xfs_lowbit64(rec
->ir_free
);
1052 realfree
= xfs_inobt_irec_to_allocmask(rec
);
1053 realfree
&= rec
->ir_free
;
1055 return xfs_lowbit64(realfree
);
1059 * If this AG has corrupt inodes, check if allocating this inode would fail
1060 * with corruption errors. Returns 0 if we're clear, or EAGAIN to try again
1064 xfs_dialloc_check_ino(
1065 struct xfs_perag
*pag
,
1066 struct xfs_trans
*tp
,
1069 struct xfs_imap imap
;
1073 error
= xfs_imap(pag
, tp
, ino
, &imap
, 0);
1077 error
= xfs_imap_to_bp(pag_mount(pag
), tp
, &imap
, &bp
);
1081 xfs_trans_brelse(tp
, bp
);
1086 * Allocate an inode using the inobt-only algorithm.
1089 xfs_dialloc_ag_inobt(
1090 struct xfs_perag
*pag
,
1091 struct xfs_trans
*tp
,
1092 struct xfs_buf
*agbp
,
1096 struct xfs_mount
*mp
= tp
->t_mountp
;
1097 struct xfs_agi
*agi
= agbp
->b_addr
;
1098 xfs_agnumber_t pagno
= XFS_INO_TO_AGNO(mp
, parent
);
1099 xfs_agino_t pagino
= XFS_INO_TO_AGINO(mp
, parent
);
1100 struct xfs_btree_cur
*cur
, *tcur
;
1101 struct xfs_inobt_rec_incore rec
, trec
;
1106 int searchdistance
= 10;
1108 ASSERT(xfs_perag_initialised_agi(pag
));
1109 ASSERT(xfs_perag_allows_inodes(pag
));
1110 ASSERT(pag
->pagi_freecount
> 0);
1113 cur
= xfs_inobt_init_cursor(pag
, tp
, agbp
);
1115 * If pagino is 0 (this is the root inode allocation) use newino.
1116 * This must work because we've just allocated some.
1119 pagino
= be32_to_cpu(agi
->agi_newino
);
1121 error
= xfs_check_agi_freecount(cur
);
1126 * If in the same AG as the parent, try to get near the parent.
1128 if (pagno
== pag_agno(pag
)) {
1129 int doneleft
; /* done, to the left */
1130 int doneright
; /* done, to the right */
1132 error
= xfs_inobt_lookup(cur
, pagino
, XFS_LOOKUP_LE
, &i
);
1135 if (XFS_IS_CORRUPT(mp
, i
!= 1)) {
1136 xfs_btree_mark_sick(cur
);
1137 error
= -EFSCORRUPTED
;
1141 error
= xfs_inobt_get_rec(cur
, &rec
, &j
);
1144 if (XFS_IS_CORRUPT(mp
, j
!= 1)) {
1145 xfs_btree_mark_sick(cur
);
1146 error
= -EFSCORRUPTED
;
1150 if (rec
.ir_freecount
> 0) {
1152 * Found a free inode in the same chunk
1153 * as the parent, done.
1160 * In the same AG as parent, but parent's chunk is full.
1163 /* duplicate the cursor, search left & right simultaneously */
1164 error
= xfs_btree_dup_cursor(cur
, &tcur
);
1169 * Skip to last blocks looked up if same parent inode.
1171 if (pagino
!= NULLAGINO
&&
1172 pag
->pagl_pagino
== pagino
&&
1173 pag
->pagl_leftrec
!= NULLAGINO
&&
1174 pag
->pagl_rightrec
!= NULLAGINO
) {
1175 error
= xfs_ialloc_get_rec(tcur
, pag
->pagl_leftrec
,
1180 error
= xfs_ialloc_get_rec(cur
, pag
->pagl_rightrec
,
1185 /* search left with tcur, back up 1 record */
1186 error
= xfs_ialloc_next_rec(tcur
, &trec
, &doneleft
, 1);
1190 /* search right with cur, go forward 1 record. */
1191 error
= xfs_ialloc_next_rec(cur
, &rec
, &doneright
, 0);
1197 * Loop until we find an inode chunk with a free inode.
1199 while (--searchdistance
> 0 && (!doneleft
|| !doneright
)) {
1200 int useleft
; /* using left inode chunk this time */
1202 /* figure out the closer block if both are valid. */
1203 if (!doneleft
&& !doneright
) {
1205 (trec
.ir_startino
+ XFS_INODES_PER_CHUNK
- 1) <
1206 rec
.ir_startino
- pagino
;
1208 useleft
= !doneleft
;
1211 /* free inodes to the left? */
1212 if (useleft
&& trec
.ir_freecount
) {
1213 xfs_btree_del_cursor(cur
, XFS_BTREE_NOERROR
);
1216 pag
->pagl_leftrec
= trec
.ir_startino
;
1217 pag
->pagl_rightrec
= rec
.ir_startino
;
1218 pag
->pagl_pagino
= pagino
;
1223 /* free inodes to the right? */
1224 if (!useleft
&& rec
.ir_freecount
) {
1225 xfs_btree_del_cursor(tcur
, XFS_BTREE_NOERROR
);
1227 pag
->pagl_leftrec
= trec
.ir_startino
;
1228 pag
->pagl_rightrec
= rec
.ir_startino
;
1229 pag
->pagl_pagino
= pagino
;
1233 /* get next record to check */
1235 error
= xfs_ialloc_next_rec(tcur
, &trec
,
1238 error
= xfs_ialloc_next_rec(cur
, &rec
,
1245 if (searchdistance
<= 0) {
1247 * Not in range - save last search
1248 * location and allocate a new inode
1250 xfs_btree_del_cursor(tcur
, XFS_BTREE_NOERROR
);
1251 pag
->pagl_leftrec
= trec
.ir_startino
;
1252 pag
->pagl_rightrec
= rec
.ir_startino
;
1253 pag
->pagl_pagino
= pagino
;
1257 * We've reached the end of the btree. because
1258 * we are only searching a small chunk of the
1259 * btree each search, there is obviously free
1260 * inodes closer to the parent inode than we
1261 * are now. restart the search again.
1263 pag
->pagl_pagino
= NULLAGINO
;
1264 pag
->pagl_leftrec
= NULLAGINO
;
1265 pag
->pagl_rightrec
= NULLAGINO
;
1266 xfs_btree_del_cursor(tcur
, XFS_BTREE_NOERROR
);
1267 xfs_btree_del_cursor(cur
, XFS_BTREE_NOERROR
);
1273 * In a different AG from the parent.
1274 * See if the most recently allocated block has any free.
1276 if (agi
->agi_newino
!= cpu_to_be32(NULLAGINO
)) {
1277 error
= xfs_inobt_lookup(cur
, be32_to_cpu(agi
->agi_newino
),
1283 error
= xfs_inobt_get_rec(cur
, &rec
, &j
);
1287 if (j
== 1 && rec
.ir_freecount
> 0) {
1289 * The last chunk allocated in the group
1290 * still has a free inode.
1298 * None left in the last group, search the whole AG
1300 error
= xfs_inobt_lookup(cur
, 0, XFS_LOOKUP_GE
, &i
);
1303 if (XFS_IS_CORRUPT(mp
, i
!= 1)) {
1304 xfs_btree_mark_sick(cur
);
1305 error
= -EFSCORRUPTED
;
1310 error
= xfs_inobt_get_rec(cur
, &rec
, &i
);
1313 if (XFS_IS_CORRUPT(mp
, i
!= 1)) {
1314 xfs_btree_mark_sick(cur
);
1315 error
= -EFSCORRUPTED
;
1318 if (rec
.ir_freecount
> 0)
1320 error
= xfs_btree_increment(cur
, 0, &i
);
1323 if (XFS_IS_CORRUPT(mp
, i
!= 1)) {
1324 xfs_btree_mark_sick(cur
);
1325 error
= -EFSCORRUPTED
;
1331 offset
= xfs_inobt_first_free_inode(&rec
);
1332 ASSERT(offset
>= 0);
1333 ASSERT(offset
< XFS_INODES_PER_CHUNK
);
1334 ASSERT((XFS_AGINO_TO_OFFSET(mp
, rec
.ir_startino
) %
1335 XFS_INODES_PER_CHUNK
) == 0);
1336 ino
= xfs_agino_to_ino(pag
, rec
.ir_startino
+ offset
);
1338 if (xfs_ag_has_sickness(pag
, XFS_SICK_AG_INODES
)) {
1339 error
= xfs_dialloc_check_ino(pag
, tp
, ino
);
1344 rec
.ir_free
&= ~XFS_INOBT_MASK(offset
);
1346 error
= xfs_inobt_update(cur
, &rec
);
1349 be32_add_cpu(&agi
->agi_freecount
, -1);
1350 xfs_ialloc_log_agi(tp
, agbp
, XFS_AGI_FREECOUNT
);
1351 pag
->pagi_freecount
--;
1353 error
= xfs_check_agi_freecount(cur
);
1357 xfs_btree_del_cursor(cur
, XFS_BTREE_NOERROR
);
1358 xfs_trans_mod_sb(tp
, XFS_TRANS_SB_IFREE
, -1);
1362 xfs_btree_del_cursor(tcur
, XFS_BTREE_ERROR
);
1364 xfs_btree_del_cursor(cur
, XFS_BTREE_ERROR
);
1369 * Use the free inode btree to allocate an inode based on distance from the
1370 * parent. Note that the provided cursor may be deleted and replaced.
1373 xfs_dialloc_ag_finobt_near(
1375 struct xfs_btree_cur
**ocur
,
1376 struct xfs_inobt_rec_incore
*rec
)
1378 struct xfs_btree_cur
*lcur
= *ocur
; /* left search cursor */
1379 struct xfs_btree_cur
*rcur
; /* right search cursor */
1380 struct xfs_inobt_rec_incore rrec
;
1384 error
= xfs_inobt_lookup(lcur
, pagino
, XFS_LOOKUP_LE
, &i
);
1389 error
= xfs_inobt_get_rec(lcur
, rec
, &i
);
1392 if (XFS_IS_CORRUPT(lcur
->bc_mp
, i
!= 1)) {
1393 xfs_btree_mark_sick(lcur
);
1394 return -EFSCORRUPTED
;
1398 * See if we've landed in the parent inode record. The finobt
1399 * only tracks chunks with at least one free inode, so record
1400 * existence is enough.
1402 if (pagino
>= rec
->ir_startino
&&
1403 pagino
< (rec
->ir_startino
+ XFS_INODES_PER_CHUNK
))
1407 error
= xfs_btree_dup_cursor(lcur
, &rcur
);
1411 error
= xfs_inobt_lookup(rcur
, pagino
, XFS_LOOKUP_GE
, &j
);
1415 error
= xfs_inobt_get_rec(rcur
, &rrec
, &j
);
1418 if (XFS_IS_CORRUPT(lcur
->bc_mp
, j
!= 1)) {
1419 xfs_btree_mark_sick(lcur
);
1420 error
= -EFSCORRUPTED
;
1425 if (XFS_IS_CORRUPT(lcur
->bc_mp
, i
!= 1 && j
!= 1)) {
1426 xfs_btree_mark_sick(lcur
);
1427 error
= -EFSCORRUPTED
;
1430 if (i
== 1 && j
== 1) {
1432 * Both the left and right records are valid. Choose the closer
1433 * inode chunk to the target.
1435 if ((pagino
- rec
->ir_startino
+ XFS_INODES_PER_CHUNK
- 1) >
1436 (rrec
.ir_startino
- pagino
)) {
1438 xfs_btree_del_cursor(lcur
, XFS_BTREE_NOERROR
);
1441 xfs_btree_del_cursor(rcur
, XFS_BTREE_NOERROR
);
1443 } else if (j
== 1) {
1444 /* only the right record is valid */
1446 xfs_btree_del_cursor(lcur
, XFS_BTREE_NOERROR
);
1448 } else if (i
== 1) {
1449 /* only the left record is valid */
1450 xfs_btree_del_cursor(rcur
, XFS_BTREE_NOERROR
);
1456 xfs_btree_del_cursor(rcur
, XFS_BTREE_ERROR
);
1461 * Use the free inode btree to find a free inode based on a newino hint. If
1462 * the hint is NULL, find the first free inode in the AG.
1465 xfs_dialloc_ag_finobt_newino(
1466 struct xfs_agi
*agi
,
1467 struct xfs_btree_cur
*cur
,
1468 struct xfs_inobt_rec_incore
*rec
)
1473 if (agi
->agi_newino
!= cpu_to_be32(NULLAGINO
)) {
1474 error
= xfs_inobt_lookup(cur
, be32_to_cpu(agi
->agi_newino
),
1479 error
= xfs_inobt_get_rec(cur
, rec
, &i
);
1482 if (XFS_IS_CORRUPT(cur
->bc_mp
, i
!= 1)) {
1483 xfs_btree_mark_sick(cur
);
1484 return -EFSCORRUPTED
;
1491 * Find the first inode available in the AG.
1493 error
= xfs_inobt_lookup(cur
, 0, XFS_LOOKUP_GE
, &i
);
1496 if (XFS_IS_CORRUPT(cur
->bc_mp
, i
!= 1)) {
1497 xfs_btree_mark_sick(cur
);
1498 return -EFSCORRUPTED
;
1501 error
= xfs_inobt_get_rec(cur
, rec
, &i
);
1504 if (XFS_IS_CORRUPT(cur
->bc_mp
, i
!= 1)) {
1505 xfs_btree_mark_sick(cur
);
1506 return -EFSCORRUPTED
;
1513 * Update the inobt based on a modification made to the finobt. Also ensure that
1514 * the records from both trees are equivalent post-modification.
1517 xfs_dialloc_ag_update_inobt(
1518 struct xfs_btree_cur
*cur
, /* inobt cursor */
1519 struct xfs_inobt_rec_incore
*frec
, /* finobt record */
1520 int offset
) /* inode offset */
1522 struct xfs_inobt_rec_incore rec
;
1526 error
= xfs_inobt_lookup(cur
, frec
->ir_startino
, XFS_LOOKUP_EQ
, &i
);
1529 if (XFS_IS_CORRUPT(cur
->bc_mp
, i
!= 1)) {
1530 xfs_btree_mark_sick(cur
);
1531 return -EFSCORRUPTED
;
1534 error
= xfs_inobt_get_rec(cur
, &rec
, &i
);
1537 if (XFS_IS_CORRUPT(cur
->bc_mp
, i
!= 1)) {
1538 xfs_btree_mark_sick(cur
);
1539 return -EFSCORRUPTED
;
1541 ASSERT((XFS_AGINO_TO_OFFSET(cur
->bc_mp
, rec
.ir_startino
) %
1542 XFS_INODES_PER_CHUNK
) == 0);
1544 rec
.ir_free
&= ~XFS_INOBT_MASK(offset
);
1547 if (XFS_IS_CORRUPT(cur
->bc_mp
,
1548 rec
.ir_free
!= frec
->ir_free
||
1549 rec
.ir_freecount
!= frec
->ir_freecount
)) {
1550 xfs_btree_mark_sick(cur
);
1551 return -EFSCORRUPTED
;
1554 return xfs_inobt_update(cur
, &rec
);
1558 * Allocate an inode using the free inode btree, if available. Otherwise, fall
1559 * back to the inobt search algorithm.
1561 * The caller selected an AG for us, and made sure that free inodes are
1566 struct xfs_perag
*pag
,
1567 struct xfs_trans
*tp
,
1568 struct xfs_buf
*agbp
,
1572 struct xfs_mount
*mp
= tp
->t_mountp
;
1573 struct xfs_agi
*agi
= agbp
->b_addr
;
1574 xfs_agnumber_t pagno
= XFS_INO_TO_AGNO(mp
, parent
);
1575 xfs_agino_t pagino
= XFS_INO_TO_AGINO(mp
, parent
);
1576 struct xfs_btree_cur
*cur
; /* finobt cursor */
1577 struct xfs_btree_cur
*icur
; /* inobt cursor */
1578 struct xfs_inobt_rec_incore rec
;
1584 if (!xfs_has_finobt(mp
))
1585 return xfs_dialloc_ag_inobt(pag
, tp
, agbp
, parent
, inop
);
1588 * If pagino is 0 (this is the root inode allocation) use newino.
1589 * This must work because we've just allocated some.
1592 pagino
= be32_to_cpu(agi
->agi_newino
);
1594 cur
= xfs_finobt_init_cursor(pag
, tp
, agbp
);
1596 error
= xfs_check_agi_freecount(cur
);
1601 * The search algorithm depends on whether we're in the same AG as the
1602 * parent. If so, find the closest available inode to the parent. If
1603 * not, consider the agi hint or find the first free inode in the AG.
1605 if (pag_agno(pag
) == pagno
)
1606 error
= xfs_dialloc_ag_finobt_near(pagino
, &cur
, &rec
);
1608 error
= xfs_dialloc_ag_finobt_newino(agi
, cur
, &rec
);
1612 offset
= xfs_inobt_first_free_inode(&rec
);
1613 ASSERT(offset
>= 0);
1614 ASSERT(offset
< XFS_INODES_PER_CHUNK
);
1615 ASSERT((XFS_AGINO_TO_OFFSET(mp
, rec
.ir_startino
) %
1616 XFS_INODES_PER_CHUNK
) == 0);
1617 ino
= xfs_agino_to_ino(pag
, rec
.ir_startino
+ offset
);
1619 if (xfs_ag_has_sickness(pag
, XFS_SICK_AG_INODES
)) {
1620 error
= xfs_dialloc_check_ino(pag
, tp
, ino
);
1626 * Modify or remove the finobt record.
1628 rec
.ir_free
&= ~XFS_INOBT_MASK(offset
);
1630 if (rec
.ir_freecount
)
1631 error
= xfs_inobt_update(cur
, &rec
);
1633 error
= xfs_btree_delete(cur
, &i
);
1638 * The finobt has now been updated appropriately. We haven't updated the
1639 * agi and superblock yet, so we can create an inobt cursor and validate
1640 * the original freecount. If all is well, make the equivalent update to
1641 * the inobt using the finobt record and offset information.
1643 icur
= xfs_inobt_init_cursor(pag
, tp
, agbp
);
1645 error
= xfs_check_agi_freecount(icur
);
1649 error
= xfs_dialloc_ag_update_inobt(icur
, &rec
, offset
);
1654 * Both trees have now been updated. We must update the perag and
1655 * superblock before we can check the freecount for each btree.
1657 be32_add_cpu(&agi
->agi_freecount
, -1);
1658 xfs_ialloc_log_agi(tp
, agbp
, XFS_AGI_FREECOUNT
);
1659 pag
->pagi_freecount
--;
1661 xfs_trans_mod_sb(tp
, XFS_TRANS_SB_IFREE
, -1);
1663 error
= xfs_check_agi_freecount(icur
);
1666 error
= xfs_check_agi_freecount(cur
);
1670 xfs_btree_del_cursor(icur
, XFS_BTREE_NOERROR
);
1671 xfs_btree_del_cursor(cur
, XFS_BTREE_NOERROR
);
1676 xfs_btree_del_cursor(icur
, XFS_BTREE_ERROR
);
1678 xfs_btree_del_cursor(cur
, XFS_BTREE_ERROR
);
1684 struct xfs_trans
**tpp
,
1685 struct xfs_buf
*agibp
)
1687 struct xfs_trans
*tp
= *tpp
;
1688 struct xfs_dquot_acct
*dqinfo
;
1692 * Hold to on to the agibp across the commit so no other allocation can
1693 * come in and take the free inodes we just allocated for our caller.
1695 xfs_trans_bhold(tp
, agibp
);
1698 * We want the quota changes to be associated with the next transaction,
1699 * NOT this one. So, detach the dqinfo from this and attach it to the
1702 dqinfo
= tp
->t_dqinfo
;
1703 tp
->t_dqinfo
= NULL
;
1705 error
= xfs_trans_roll(&tp
);
1707 /* Re-attach the quota info that we detached from prev trx. */
1708 tp
->t_dqinfo
= dqinfo
;
1711 * Join the buffer even on commit error so that the buffer is released
1712 * when the caller cancels the transaction and doesn't have to handle
1713 * this error case specially.
1715 xfs_trans_bjoin(tp
, agibp
);
1721 xfs_dialloc_good_ag(
1722 struct xfs_perag
*pag
,
1723 struct xfs_trans
*tp
,
1728 struct xfs_mount
*mp
= tp
->t_mountp
;
1730 xfs_extlen_t longest
= 0;
1736 if (!xfs_perag_allows_inodes(pag
))
1739 if (!xfs_perag_initialised_agi(pag
)) {
1740 error
= xfs_ialloc_read_agi(pag
, tp
, 0, NULL
);
1745 if (pag
->pagi_freecount
)
1750 if (!xfs_perag_initialised_agf(pag
)) {
1751 error
= xfs_alloc_read_agf(pag
, tp
, flags
, NULL
);
1757 * Check that there is enough free space for the file plus a chunk of
1758 * inodes if we need to allocate some. If this is the first pass across
1759 * the AGs, take into account the potential space needed for alignment
1760 * of inode chunks when checking the longest contiguous free space in
1761 * the AG - this prevents us from getting ENOSPC because we have free
1762 * space larger than ialloc_blks but alignment constraints prevent us
1765 * If we can't find an AG with space for full alignment slack to be
1766 * taken into account, we must be near ENOSPC in all AGs. Hence we
1767 * don't include alignment for the second pass and so if we fail
1768 * allocation due to alignment issues then it is most likely a real
1771 * XXX(dgc): this calculation is now bogus thanks to the per-ag
1772 * reservations that xfs_alloc_fix_freelist() now does via
1773 * xfs_alloc_space_available(). When the AG fills up, pagf_freeblks will
1774 * be more than large enough for the check below to succeed, but
1775 * xfs_alloc_space_available() will fail because of the non-zero
1776 * metadata reservation and hence we won't actually be able to allocate
1777 * more inodes in this AG. We do soooo much unnecessary work near ENOSPC
1780 ineed
= M_IGEO(mp
)->ialloc_min_blks
;
1781 if (flags
&& ineed
> 1)
1782 ineed
+= M_IGEO(mp
)->cluster_align
;
1783 longest
= pag
->pagf_longest
;
1785 longest
= pag
->pagf_flcount
> 0;
1786 needspace
= S_ISDIR(mode
) || S_ISREG(mode
) || S_ISLNK(mode
);
1788 if (pag
->pagf_freeblks
< needspace
+ ineed
|| longest
< ineed
)
1795 struct xfs_perag
*pag
,
1796 struct xfs_trans
**tpp
,
1801 struct xfs_buf
*agbp
;
1806 * Then read in the AGI buffer and recheck with the AGI buffer
1809 error
= xfs_ialloc_read_agi(pag
, *tpp
, 0, &agbp
);
1813 if (!pag
->pagi_freecount
) {
1819 error
= xfs_ialloc_ag_alloc(pag
, *tpp
, agbp
);
1824 * We successfully allocated space for an inode cluster in this
1825 * AG. Roll the transaction so that we can allocate one of the
1828 ASSERT(pag
->pagi_freecount
> 0);
1829 error
= xfs_dialloc_roll(tpp
, agbp
);
1834 /* Allocate an inode in the found AG */
1835 error
= xfs_dialloc_ag(pag
, *tpp
, agbp
, parent
, &ino
);
1841 xfs_trans_brelse(*tpp
, agbp
);
1846 * Pick an AG for the new inode.
1848 * Directories, symlinks, and regular files frequently allocate at least one
1849 * block, so factor that potential expansion when we examine whether an AG has
1850 * enough space for file creation. Try to keep metadata files all in the same
1853 static inline xfs_agnumber_t
1854 xfs_dialloc_pick_ag(
1855 struct xfs_mount
*mp
,
1856 struct xfs_inode
*dp
,
1859 xfs_agnumber_t start_agno
;
1863 if (xfs_is_metadir_inode(dp
)) {
1864 if (mp
->m_sb
.sb_logstart
)
1865 return XFS_FSB_TO_AGNO(mp
, mp
->m_sb
.sb_logstart
);
1870 return (atomic_inc_return(&mp
->m_agirotor
) - 1) % mp
->m_maxagi
;
1872 start_agno
= XFS_INO_TO_AGNO(mp
, dp
->i_ino
);
1873 if (start_agno
>= mp
->m_maxagi
)
1880 * Allocate an on-disk inode.
1882 * Mode is used to tell whether the new inode is a directory and hence where to
1883 * locate it. The on-disk inode that is allocated will be returned in @new_ino
1884 * on success, otherwise an error will be set to indicate the failure (e.g.
1889 struct xfs_trans
**tpp
,
1890 const struct xfs_icreate_args
*args
,
1893 struct xfs_mount
*mp
= (*tpp
)->t_mountp
;
1894 struct xfs_perag
*pag
;
1895 struct xfs_ino_geometry
*igeo
= M_IGEO(mp
);
1896 xfs_ino_t ino
= NULLFSINO
;
1897 xfs_ino_t parent
= args
->pip
? args
->pip
->i_ino
: 0;
1898 xfs_agnumber_t agno
;
1899 xfs_agnumber_t start_agno
;
1900 umode_t mode
= args
->mode
& S_IFMT
;
1901 bool ok_alloc
= true;
1902 bool low_space
= false;
1906 start_agno
= xfs_dialloc_pick_ag(mp
, args
->pip
, mode
);
1909 * If we have already hit the ceiling of inode blocks then clear
1910 * ok_alloc so we scan all available agi structures for a free
1913 * Read rough value of mp->m_icount by percpu_counter_read_positive,
1914 * which will sacrifice the preciseness but improve the performance.
1916 if (igeo
->maxicount
&&
1917 percpu_counter_read_positive(&mp
->m_icount
) + igeo
->ialloc_inos
1918 > igeo
->maxicount
) {
1923 * If we are near to ENOSPC, we want to prefer allocation from AGs that
1924 * have free inodes in them rather than use up free space allocating new
1925 * inode chunks. Hence we turn off allocation for the first non-blocking
1926 * pass through the AGs if we are near ENOSPC to consume free inodes
1927 * that we can immediately allocate, but then we allow allocation on the
1928 * second pass if we fail to find an AG with free inodes in it.
1930 if (percpu_counter_read_positive(&mp
->m_fdblocks
) <
1931 mp
->m_low_space
[XFS_LOWSP_1_PCNT
]) {
1937 * Loop until we find an allocation group that either has free inodes
1938 * or in which we can allocate some inodes. Iterate through the
1939 * allocation groups upward, wrapping at the end.
1941 flags
= XFS_ALLOC_FLAG_TRYLOCK
;
1943 for_each_perag_wrap_at(mp
, start_agno
, mp
->m_maxagi
, agno
, pag
) {
1944 if (xfs_dialloc_good_ag(pag
, *tpp
, mode
, flags
, ok_alloc
)) {
1945 error
= xfs_dialloc_try_ag(pag
, tpp
, parent
,
1947 if (error
!= -EAGAIN
)
1952 if (xfs_is_shutdown(mp
)) {
1953 error
= -EFSCORRUPTED
;
1958 xfs_perag_rele(pag
);
1961 if (ino
== NULLFSINO
) {
1972 * Protect against obviously corrupt allocation btree records. Later
1973 * xfs_iget checks will catch re-allocation of other active in-memory
1974 * and on-disk inodes. If we don't catch reallocating the parent inode
1975 * here we will deadlock in xfs_iget() so we have to do these checks
1978 if (ino
== parent
|| !xfs_verify_dir_ino(mp
, ino
)) {
1979 xfs_alert(mp
, "Allocated a known in-use inode 0x%llx!", ino
);
1980 xfs_agno_mark_sick(mp
, XFS_INO_TO_AGNO(mp
, ino
),
1982 return -EFSCORRUPTED
;
1990 * Free the blocks of an inode chunk. We must consider that the inode chunk
1991 * might be sparse and only free the regions that are allocated as part of the
1995 xfs_difree_inode_chunk(
1996 struct xfs_trans
*tp
,
1997 struct xfs_perag
*pag
,
1998 struct xfs_inobt_rec_incore
*rec
)
2000 struct xfs_mount
*mp
= tp
->t_mountp
;
2001 xfs_agblock_t sagbno
= XFS_AGINO_TO_AGBNO(mp
,
2003 int startidx
, endidx
;
2005 xfs_agblock_t agbno
;
2007 DECLARE_BITMAP(holemask
, XFS_INOBT_HOLEMASK_BITS
);
2009 if (!xfs_inobt_issparse(rec
->ir_holemask
)) {
2010 /* not sparse, calculate extent info directly */
2011 return xfs_free_extent_later(tp
, xfs_agbno_to_fsb(pag
, sagbno
),
2012 M_IGEO(mp
)->ialloc_blks
, &XFS_RMAP_OINFO_INODES
,
2013 XFS_AG_RESV_NONE
, 0);
2016 /* holemask is only 16-bits (fits in an unsigned long) */
2017 ASSERT(sizeof(rec
->ir_holemask
) <= sizeof(holemask
[0]));
2018 holemask
[0] = rec
->ir_holemask
;
2021 * Find contiguous ranges of zeroes (i.e., allocated regions) in the
2022 * holemask and convert the start/end index of each range to an extent.
2023 * We start with the start and end index both pointing at the first 0 in
2026 startidx
= endidx
= find_first_zero_bit(holemask
,
2027 XFS_INOBT_HOLEMASK_BITS
);
2028 nextbit
= startidx
+ 1;
2029 while (startidx
< XFS_INOBT_HOLEMASK_BITS
) {
2032 nextbit
= find_next_zero_bit(holemask
, XFS_INOBT_HOLEMASK_BITS
,
2035 * If the next zero bit is contiguous, update the end index of
2036 * the current range and continue.
2038 if (nextbit
!= XFS_INOBT_HOLEMASK_BITS
&&
2039 nextbit
== endidx
+ 1) {
2045 * nextbit is not contiguous with the current end index. Convert
2046 * the current start/end to an extent and add it to the free
2049 agbno
= sagbno
+ (startidx
* XFS_INODES_PER_HOLEMASK_BIT
) /
2050 mp
->m_sb
.sb_inopblock
;
2051 contigblk
= ((endidx
- startidx
+ 1) *
2052 XFS_INODES_PER_HOLEMASK_BIT
) /
2053 mp
->m_sb
.sb_inopblock
;
2055 ASSERT(agbno
% mp
->m_sb
.sb_spino_align
== 0);
2056 ASSERT(contigblk
% mp
->m_sb
.sb_spino_align
== 0);
2057 error
= xfs_free_extent_later(tp
, xfs_agbno_to_fsb(pag
, agbno
),
2058 contigblk
, &XFS_RMAP_OINFO_INODES
,
2059 XFS_AG_RESV_NONE
, 0);
2063 /* reset range to current bit and carry on... */
2064 startidx
= endidx
= nextbit
;
2074 struct xfs_perag
*pag
,
2075 struct xfs_trans
*tp
,
2076 struct xfs_buf
*agbp
,
2078 struct xfs_icluster
*xic
,
2079 struct xfs_inobt_rec_incore
*orec
)
2081 struct xfs_mount
*mp
= pag_mount(pag
);
2082 struct xfs_agi
*agi
= agbp
->b_addr
;
2083 struct xfs_btree_cur
*cur
;
2084 struct xfs_inobt_rec_incore rec
;
2090 ASSERT(agi
->agi_magicnum
== cpu_to_be32(XFS_AGI_MAGIC
));
2091 ASSERT(XFS_AGINO_TO_AGBNO(mp
, agino
) < be32_to_cpu(agi
->agi_length
));
2094 * Initialize the cursor.
2096 cur
= xfs_inobt_init_cursor(pag
, tp
, agbp
);
2098 error
= xfs_check_agi_freecount(cur
);
2103 * Look for the entry describing this inode.
2105 if ((error
= xfs_inobt_lookup(cur
, agino
, XFS_LOOKUP_LE
, &i
))) {
2106 xfs_warn(mp
, "%s: xfs_inobt_lookup() returned error %d.",
2110 if (XFS_IS_CORRUPT(mp
, i
!= 1)) {
2111 xfs_btree_mark_sick(cur
);
2112 error
= -EFSCORRUPTED
;
2115 error
= xfs_inobt_get_rec(cur
, &rec
, &i
);
2117 xfs_warn(mp
, "%s: xfs_inobt_get_rec() returned error %d.",
2121 if (XFS_IS_CORRUPT(mp
, i
!= 1)) {
2122 xfs_btree_mark_sick(cur
);
2123 error
= -EFSCORRUPTED
;
2127 * Get the offset in the inode chunk.
2129 off
= agino
- rec
.ir_startino
;
2130 ASSERT(off
>= 0 && off
< XFS_INODES_PER_CHUNK
);
2131 ASSERT(!(rec
.ir_free
& XFS_INOBT_MASK(off
)));
2133 * Mark the inode free & increment the count.
2135 rec
.ir_free
|= XFS_INOBT_MASK(off
);
2139 * When an inode chunk is free, it becomes eligible for removal. Don't
2140 * remove the chunk if the block size is large enough for multiple inode
2141 * chunks (that might not be free).
2143 if (!xfs_has_ikeep(mp
) && rec
.ir_free
== XFS_INOBT_ALL_FREE
&&
2144 mp
->m_sb
.sb_inopblock
<= XFS_INODES_PER_CHUNK
) {
2145 xic
->deleted
= true;
2146 xic
->first_ino
= xfs_agino_to_ino(pag
, rec
.ir_startino
);
2147 xic
->alloc
= xfs_inobt_irec_to_allocmask(&rec
);
2150 * Remove the inode cluster from the AGI B+Tree, adjust the
2151 * AGI and Superblock inode counts, and mark the disk space
2152 * to be freed when the transaction is committed.
2154 ilen
= rec
.ir_freecount
;
2155 be32_add_cpu(&agi
->agi_count
, -ilen
);
2156 be32_add_cpu(&agi
->agi_freecount
, -(ilen
- 1));
2157 xfs_ialloc_log_agi(tp
, agbp
, XFS_AGI_COUNT
| XFS_AGI_FREECOUNT
);
2158 pag
->pagi_freecount
-= ilen
- 1;
2159 pag
->pagi_count
-= ilen
;
2160 xfs_trans_mod_sb(tp
, XFS_TRANS_SB_ICOUNT
, -ilen
);
2161 xfs_trans_mod_sb(tp
, XFS_TRANS_SB_IFREE
, -(ilen
- 1));
2163 if ((error
= xfs_btree_delete(cur
, &i
))) {
2164 xfs_warn(mp
, "%s: xfs_btree_delete returned error %d.",
2169 error
= xfs_difree_inode_chunk(tp
, pag
, &rec
);
2173 xic
->deleted
= false;
2175 error
= xfs_inobt_update(cur
, &rec
);
2177 xfs_warn(mp
, "%s: xfs_inobt_update returned error %d.",
2183 * Change the inode free counts and log the ag/sb changes.
2185 be32_add_cpu(&agi
->agi_freecount
, 1);
2186 xfs_ialloc_log_agi(tp
, agbp
, XFS_AGI_FREECOUNT
);
2187 pag
->pagi_freecount
++;
2188 xfs_trans_mod_sb(tp
, XFS_TRANS_SB_IFREE
, 1);
2191 error
= xfs_check_agi_freecount(cur
);
2196 xfs_btree_del_cursor(cur
, XFS_BTREE_NOERROR
);
2200 xfs_btree_del_cursor(cur
, XFS_BTREE_ERROR
);
2205 * Free an inode in the free inode btree.
2209 struct xfs_perag
*pag
,
2210 struct xfs_trans
*tp
,
2211 struct xfs_buf
*agbp
,
2213 struct xfs_inobt_rec_incore
*ibtrec
) /* inobt record */
2215 struct xfs_mount
*mp
= pag_mount(pag
);
2216 struct xfs_btree_cur
*cur
;
2217 struct xfs_inobt_rec_incore rec
;
2218 int offset
= agino
- ibtrec
->ir_startino
;
2222 cur
= xfs_finobt_init_cursor(pag
, tp
, agbp
);
2224 error
= xfs_inobt_lookup(cur
, ibtrec
->ir_startino
, XFS_LOOKUP_EQ
, &i
);
2229 * If the record does not exist in the finobt, we must have just
2230 * freed an inode in a previously fully allocated chunk. If not,
2231 * something is out of sync.
2233 if (XFS_IS_CORRUPT(mp
, ibtrec
->ir_freecount
!= 1)) {
2234 xfs_btree_mark_sick(cur
);
2235 error
= -EFSCORRUPTED
;
2239 error
= xfs_inobt_insert_rec(cur
, ibtrec
->ir_holemask
,
2241 ibtrec
->ir_freecount
,
2242 ibtrec
->ir_free
, &i
);
2251 * Read and update the existing record. We could just copy the ibtrec
2252 * across here, but that would defeat the purpose of having redundant
2253 * metadata. By making the modifications independently, we can catch
2254 * corruptions that we wouldn't see if we just copied from one record
2257 error
= xfs_inobt_get_rec(cur
, &rec
, &i
);
2260 if (XFS_IS_CORRUPT(mp
, i
!= 1)) {
2261 xfs_btree_mark_sick(cur
);
2262 error
= -EFSCORRUPTED
;
2266 rec
.ir_free
|= XFS_INOBT_MASK(offset
);
2269 if (XFS_IS_CORRUPT(mp
,
2270 rec
.ir_free
!= ibtrec
->ir_free
||
2271 rec
.ir_freecount
!= ibtrec
->ir_freecount
)) {
2272 xfs_btree_mark_sick(cur
);
2273 error
= -EFSCORRUPTED
;
2278 * The content of inobt records should always match between the inobt
2279 * and finobt. The lifecycle of records in the finobt is different from
2280 * the inobt in that the finobt only tracks records with at least one
2281 * free inode. Hence, if all of the inodes are free and we aren't
2282 * keeping inode chunks permanently on disk, remove the record.
2283 * Otherwise, update the record with the new information.
2285 * Note that we currently can't free chunks when the block size is large
2286 * enough for multiple chunks. Leave the finobt record to remain in sync
2289 if (!xfs_has_ikeep(mp
) && rec
.ir_free
== XFS_INOBT_ALL_FREE
&&
2290 mp
->m_sb
.sb_inopblock
<= XFS_INODES_PER_CHUNK
) {
2291 error
= xfs_btree_delete(cur
, &i
);
2296 error
= xfs_inobt_update(cur
, &rec
);
2302 error
= xfs_check_agi_freecount(cur
);
2306 xfs_btree_del_cursor(cur
, XFS_BTREE_NOERROR
);
2310 xfs_btree_del_cursor(cur
, XFS_BTREE_ERROR
);
2315 * Free disk inode. Carefully avoids touching the incore inode, all
2316 * manipulations incore are the caller's responsibility.
2317 * The on-disk inode is not changed by this operation, only the
2318 * btree (free inode mask) is changed.
2322 struct xfs_trans
*tp
,
2323 struct xfs_perag
*pag
,
2325 struct xfs_icluster
*xic
)
2328 xfs_agblock_t agbno
; /* block number containing inode */
2329 struct xfs_buf
*agbp
; /* buffer for allocation group header */
2330 xfs_agino_t agino
; /* allocation group inode number */
2331 int error
; /* error return value */
2332 struct xfs_mount
*mp
= tp
->t_mountp
;
2333 struct xfs_inobt_rec_incore rec
;/* btree record */
2336 * Break up inode number into its components.
2338 if (pag_agno(pag
) != XFS_INO_TO_AGNO(mp
, inode
)) {
2339 xfs_warn(mp
, "%s: agno != pag_agno(pag) (%d != %d).",
2340 __func__
, XFS_INO_TO_AGNO(mp
, inode
), pag_agno(pag
));
2344 agino
= XFS_INO_TO_AGINO(mp
, inode
);
2345 if (inode
!= xfs_agino_to_ino(pag
, agino
)) {
2346 xfs_warn(mp
, "%s: inode != xfs_agino_to_ino() (%llu != %llu).",
2347 __func__
, (unsigned long long)inode
,
2348 (unsigned long long)xfs_agino_to_ino(pag
, agino
));
2352 agbno
= XFS_AGINO_TO_AGBNO(mp
, agino
);
2353 if (agbno
>= xfs_ag_block_count(mp
, pag_agno(pag
))) {
2354 xfs_warn(mp
, "%s: agbno >= xfs_ag_block_count (%d >= %d).",
2355 __func__
, agbno
, xfs_ag_block_count(mp
, pag_agno(pag
)));
2360 * Get the allocation group header.
2362 error
= xfs_ialloc_read_agi(pag
, tp
, 0, &agbp
);
2364 xfs_warn(mp
, "%s: xfs_ialloc_read_agi() returned error %d.",
2370 * Fix up the inode allocation btree.
2372 error
= xfs_difree_inobt(pag
, tp
, agbp
, agino
, xic
, &rec
);
2377 * Fix up the free inode btree.
2379 if (xfs_has_finobt(mp
)) {
2380 error
= xfs_difree_finobt(pag
, tp
, agbp
, agino
, &rec
);

STATIC int
xfs_imap_lookup(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	xfs_agino_t		agino,
	xfs_agblock_t		agbno,
	xfs_agblock_t		*chunk_agbno,
	xfs_agblock_t		*offset_agbno,
	int			flags)
{
	struct xfs_mount	*mp = pag_mount(pag);
	struct xfs_inobt_rec_incore rec;
	struct xfs_btree_cur	*cur;
	struct xfs_buf		*agbp;
	int			error;
	int			i;

	error = xfs_ialloc_read_agi(pag, tp, 0, &agbp);
	if (error) {
		xfs_alert(mp,
			"%s: xfs_ialloc_read_agi() returned error %d, agno %d",
			__func__, error, pag_agno(pag));
		return error;
	}

	/*
	 * Lookup the inode record for the given agino. If the record cannot be
	 * found, then it's an invalid inode number and we should abort. Once
	 * we have a record, we need to ensure it contains the inode number
	 * we are looking up.
	 */
	cur = xfs_inobt_init_cursor(pag, tp, agbp);
	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
	if (!error) {
		if (i)
			error = xfs_inobt_get_rec(cur, &rec, &i);
		if (!error && i == 0)
			error = -EINVAL;
	}

	xfs_trans_brelse(tp, agbp);
	xfs_btree_del_cursor(cur, error);
	if (error)
		return error;

	/* check that the returned record contains the required inode */
	if (rec.ir_startino > agino ||
	    rec.ir_startino + M_IGEO(mp)->ialloc_inos <= agino)
		return -EINVAL;

	/* for untrusted inodes check it is allocated first */
	if ((flags & XFS_IGET_UNTRUSTED) &&
	    (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino)))
		return -EINVAL;

	*chunk_agbno = XFS_AGINO_TO_AGBNO(mp, rec.ir_startino);
	*offset_agbno = agbno - *chunk_agbno;
	return 0;
}
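
/*
 * Illustrative sketch (not part of the original file): the untrusted-inode
 * check above tests the free bit for "agino - ir_startino" within the record
 * that the lookup returned.  A standalone rendering of that test with plain C
 * types; the example_* name is hypothetical.
 */
static inline int example_inode_is_allocated(unsigned long long ir_free,
					     unsigned int ir_startino,
					     unsigned int agino)
{
	unsigned int offset = agino - ir_startino;	/* index within the chunk */

	/* a set bit in ir_free means the inode is free, i.e. not allocated */
	return !(ir_free & (1ULL << offset));
}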

/*
 * Return the location of the inode in imap, for mapping it into a buffer.
 */
int
xfs_imap(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	xfs_ino_t		ino,	/* inode to locate */
	struct xfs_imap		*imap,	/* location map structure */
	uint			flags)	/* flags for inode btree lookup */
{
	struct xfs_mount	*mp = pag_mount(pag);
	xfs_agblock_t		agbno;	/* block number of inode in the alloc group */
	xfs_agino_t		agino;	/* inode number within alloc group */
	xfs_agblock_t		chunk_agbno;	/* first block in inode chunk */
	xfs_agblock_t		cluster_agbno;	/* first block in inode cluster */
	int			error;	/* error code */
	int			offset;	/* index of inode in its buffer */
	xfs_agblock_t		offset_agbno;	/* blks from chunk start to inode */

	ASSERT(ino != NULLFSINO);

	/*
	 * Split up the inode number into its parts.
	 */
	agino = XFS_INO_TO_AGINO(mp, ino);
	agbno = XFS_AGINO_TO_AGBNO(mp, agino);
	if (agbno >= xfs_ag_block_count(mp, pag_agno(pag)) ||
	    ino != xfs_agino_to_ino(pag, agino)) {
		error = -EINVAL;
#ifdef DEBUG
		/*
		 * Don't output diagnostic information for untrusted inodes
		 * as they can be invalid without implying corruption.
		 */
		if (flags & XFS_IGET_UNTRUSTED)
			return error;
		if (agbno >= xfs_ag_block_count(mp, pag_agno(pag))) {
			xfs_alert(mp,
		"%s: agbno (0x%llx) >= mp->m_sb.sb_agblocks (0x%lx)",
				__func__, (unsigned long long)agbno,
				(unsigned long)xfs_ag_block_count(mp,
							pag_agno(pag)));
		}
		if (ino != xfs_agino_to_ino(pag, agino)) {
			xfs_alert(mp,
		"%s: ino (0x%llx) != xfs_agino_to_ino() (0x%llx)",
				__func__, ino,
				xfs_agino_to_ino(pag, agino));
		}
		xfs_stack_trace();
#endif /* DEBUG */
		return error;
	}

	/*
	 * For bulkstat and handle lookups, we have an untrusted inode number
	 * that we have to verify is valid. We cannot do this just by reading
	 * the inode buffer as it may have been unlinked and removed leaving
	 * inodes in stale state on disk. Hence we have to do a btree lookup
	 * in all cases where an untrusted inode number is passed.
	 */
	if (flags & XFS_IGET_UNTRUSTED) {
		error = xfs_imap_lookup(pag, tp, agino, agbno,
					&chunk_agbno, &offset_agbno, flags);
		if (error)
			return error;
		goto out_map;
	}

	/*
	 * If the inode cluster size is the same as the blocksize or
	 * smaller we get to the buffer by simple arithmetic.
	 */
	if (M_IGEO(mp)->blocks_per_cluster == 1) {
		offset = XFS_INO_TO_OFFSET(mp, ino);
		ASSERT(offset < mp->m_sb.sb_inopblock);

		imap->im_blkno = xfs_agbno_to_daddr(pag, agbno);
		imap->im_len = XFS_FSB_TO_BB(mp, 1);
		imap->im_boffset = (unsigned short)(offset <<
							mp->m_sb.sb_inodelog);
		return 0;
	}

	/*
	 * If the inode chunks are aligned then use simple maths to
	 * find the location. Otherwise we have to do a btree
	 * lookup to find the location.
	 */
	if (M_IGEO(mp)->inoalign_mask) {
		offset_agbno = agbno & M_IGEO(mp)->inoalign_mask;
		chunk_agbno = agbno - offset_agbno;
	} else {
		error = xfs_imap_lookup(pag, tp, agino, agbno,
					&chunk_agbno, &offset_agbno, flags);
		if (error)
			return error;
	}

out_map:
	ASSERT(agbno >= chunk_agbno);
	cluster_agbno = chunk_agbno +
		((offset_agbno / M_IGEO(mp)->blocks_per_cluster) *
		 M_IGEO(mp)->blocks_per_cluster);
	offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) +
		XFS_INO_TO_OFFSET(mp, ino);

	imap->im_blkno = xfs_agbno_to_daddr(pag, cluster_agbno);
	imap->im_len = XFS_FSB_TO_BB(mp, M_IGEO(mp)->blocks_per_cluster);
	imap->im_boffset = (unsigned short)(offset << mp->m_sb.sb_inodelog);

	/*
	 * If the inode number maps to a block outside the bounds
	 * of the file system then return NULL rather than calling
	 * read_buf and panicking when we get an error from the
	 * driver.
	 */
	if ((imap->im_blkno + imap->im_len) >
	    XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
		xfs_alert(mp,
	"%s: (im_blkno (0x%llx) + im_len (0x%llx)) > sb_dblocks (0x%llx)",
			__func__, (unsigned long long) imap->im_blkno,
			(unsigned long long) imap->im_len,
			XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
		return -EINVAL;
	}
	return 0;
}
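
/*
 * Illustrative sketch (not part of the original file): the mapping above
 * rounds the inode's block down to the start of its cluster and then derives
 * the byte offset of the inode within that cluster buffer.  The standalone
 * helper below redoes the same arithmetic with plain integers; names are
 * hypothetical, and blkno/len are kept in filesystem blocks here rather than
 * being converted to disk addresses and basic blocks as the real code does.
 */
struct example_imap {
	unsigned long long	blkno;		/* cluster start (fs blocks) */
	unsigned int		len;		/* cluster length (fs blocks) */
	unsigned int		boffset;	/* byte offset of inode in cluster */
};

static inline void example_map_inode(unsigned int agbno,
				     unsigned int chunk_agbno,
				     unsigned int blocks_per_cluster,
				     unsigned int inopblock,
				     unsigned int inodelog,
				     unsigned int ino_offset_in_block,
				     struct example_imap *map)
{
	unsigned int offset_agbno = agbno - chunk_agbno;
	unsigned int cluster_agbno = chunk_agbno +
		(offset_agbno / blocks_per_cluster) * blocks_per_cluster;
	unsigned int offset = (agbno - cluster_agbno) * inopblock +
		ino_offset_in_block;

	map->blkno = cluster_agbno;
	map->len = blocks_per_cluster;
	map->boffset = offset << inodelog;	/* ~ offset << sb_inodelog */
}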

/*
 * Log specified fields for the ag hdr (inode section). The growth of the agi
 * structure over time requires that we interpret the buffer as two logical
 * regions delineated by the end of the unlinked list. This is due to the size
 * of the hash table and its location in the middle of the agi.
 *
 * For example, a request to log a field before agi_unlinked and a field after
 * agi_unlinked could cause us to log the entire hash table and use an
 * excessive amount of log space. To avoid this behavior, log the region up
 * through agi_unlinked in one call and the region after agi_unlinked through
 * the end of the structure in another.
 */
void
xfs_ialloc_log_agi(
	struct xfs_trans	*tp,
	struct xfs_buf		*bp,
	uint32_t		fields)
{
	int			first;		/* first byte number */
	int			last;		/* last byte number */
	static const short	offsets[] = {	/* field starting offsets */
					/* keep in sync with bit definitions */
		offsetof(xfs_agi_t, agi_magicnum),
		offsetof(xfs_agi_t, agi_versionnum),
		offsetof(xfs_agi_t, agi_seqno),
		offsetof(xfs_agi_t, agi_length),
		offsetof(xfs_agi_t, agi_count),
		offsetof(xfs_agi_t, agi_root),
		offsetof(xfs_agi_t, agi_level),
		offsetof(xfs_agi_t, agi_freecount),
		offsetof(xfs_agi_t, agi_newino),
		offsetof(xfs_agi_t, agi_dirino),
		offsetof(xfs_agi_t, agi_unlinked),
		offsetof(xfs_agi_t, agi_free_root),
		offsetof(xfs_agi_t, agi_free_level),
		offsetof(xfs_agi_t, agi_iblocks),
		sizeof(xfs_agi_t)
	};
#ifdef DEBUG
	struct xfs_agi		*agi = bp->b_addr;

	ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
#endif

	/*
	 * Compute byte offsets for the first and last fields in the first
	 * region and log the agi buffer. This only logs up through
	 * agi_unlinked.
	 */
	if (fields & XFS_AGI_ALL_BITS_R1) {
		xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R1,
				  &first, &last);
		xfs_trans_log_buf(tp, bp, first, last);
	}

	/*
	 * Mask off the bits in the first region and calculate the first and
	 * last field offsets for any bits in the second region.
	 */
	fields &= ~XFS_AGI_ALL_BITS_R1;
	if (fields) {
		xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R2,
				  &first, &last);
		xfs_trans_log_buf(tp, bp, first, last);
	}
}
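
/*
 * Illustrative sketch (not part of the original file): logging a set of AGI
 * fields boils down to turning a field bitmask plus the offsets[] table above
 * into a first/last byte range.  The standalone helper below approximates
 * that lookup; the example_* name is hypothetical, the caller supplies an
 * offsets table whose final entry is the structure size, and the mask is
 * expected to be non-empty, as it is in xfs_ialloc_log_agi().
 */
static inline void example_fields_to_range(unsigned int fields,
					   const short *offsets,
					   int nbits,
					   int *first, int *last)
{
	int i, lo = -1, hi = -1;

	for (i = 0; i < nbits; i++) {
		if (!(fields & (1U << i)))
			continue;
		if (lo < 0)
			lo = i;
		hi = i;
	}
	if (lo < 0) {
		/* empty mask: nothing to log */
		*first = *last = 0;
		return;
	}
	/* from the start of the first set field to the last byte before the
	 * field that follows the last set one (or the struct end) */
	*first = offsets[lo];
	*last = offsets[hi + 1] - 1;
}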

static xfs_failaddr_t
xfs_agi_verify(
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = bp->b_mount;
	struct xfs_agi		*agi = bp->b_addr;
	xfs_failaddr_t		fa;
	uint32_t		agi_seqno = be32_to_cpu(agi->agi_seqno);
	uint32_t		agi_length = be32_to_cpu(agi->agi_length);
	int			i;

	if (xfs_has_crc(mp)) {
		if (!uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid))
			return __this_address;
		if (!xfs_log_check_lsn(mp, be64_to_cpu(agi->agi_lsn)))
			return __this_address;
	}

	/*
	 * Validate the magic number of the agi block.
	 */
	if (!xfs_verify_magic(bp, agi->agi_magicnum))
		return __this_address;
	if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)))
		return __this_address;

	fa = xfs_validate_ag_length(bp, agi_seqno, agi_length);
	if (fa)
		return fa;

	if (be32_to_cpu(agi->agi_level) < 1 ||
	    be32_to_cpu(agi->agi_level) > M_IGEO(mp)->inobt_maxlevels)
		return __this_address;

	if (xfs_has_finobt(mp) &&
	    (be32_to_cpu(agi->agi_free_level) < 1 ||
	     be32_to_cpu(agi->agi_free_level) > M_IGEO(mp)->inobt_maxlevels))
		return __this_address;

	for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
		if (agi->agi_unlinked[i] == cpu_to_be32(NULLAGINO))
			continue;
		if (!xfs_verify_ino(mp, be32_to_cpu(agi->agi_unlinked[i])))
			return __this_address;
	}

	return NULL;
}

static void
xfs_agi_read_verify(
	struct xfs_buf	*bp)
{
	struct xfs_mount *mp = bp->b_mount;
	xfs_failaddr_t	fa;

	if (xfs_has_crc(mp) &&
	    !xfs_buf_verify_cksum(bp, XFS_AGI_CRC_OFF))
		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
	else {
		fa = xfs_agi_verify(bp);
		if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_IALLOC_READ_AGI))
			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
	}
}

static void
xfs_agi_write_verify(
	struct xfs_buf	*bp)
{
	struct xfs_mount	*mp = bp->b_mount;
	struct xfs_buf_log_item	*bip = bp->b_log_item;
	struct xfs_agi		*agi = bp->b_addr;
	xfs_failaddr_t		fa;

	fa = xfs_agi_verify(bp);
	if (fa) {
		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
		return;
	}

	if (!xfs_has_crc(mp))
		return;

	if (bip)
		agi->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn);
	xfs_buf_update_cksum(bp, XFS_AGI_CRC_OFF);
}

const struct xfs_buf_ops xfs_agi_buf_ops = {
	.name = "xfs_agi",
	.magic = { cpu_to_be32(XFS_AGI_MAGIC), cpu_to_be32(XFS_AGI_MAGIC) },
	.verify_read = xfs_agi_read_verify,
	.verify_write = xfs_agi_write_verify,
	.verify_struct = xfs_agi_verify,
};

/*
 * Read in the allocation group header (inode allocation section).
 */
int
xfs_read_agi(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	xfs_buf_flags_t		flags,
	struct xfs_buf		**agibpp)
{
	struct xfs_mount	*mp = pag_mount(pag);
	int			error;

	trace_xfs_read_agi(pag);

	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
			XFS_AG_DADDR(mp, pag_agno(pag), XFS_AGI_DADDR(mp)),
			XFS_FSS_TO_BB(mp, 1), flags, agibpp, &xfs_agi_buf_ops);
	if (xfs_metadata_is_sick(error))
		xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
	if (error)
		return error;
	if (tp)
		xfs_trans_buf_set_type(tp, *agibpp, XFS_BLFT_AGI_BUF);

	xfs_buf_set_ref(*agibpp, XFS_AGI_REF);
	return 0;
}

/*
 * Read in the agi and initialise the per-ag data. If the caller supplies a
 * @agibpp, return the locked AGI buffer to them, otherwise release it.
 */
int
xfs_ialloc_read_agi(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	int			flags,
	struct xfs_buf		**agibpp)
{
	struct xfs_buf		*agibp;
	struct xfs_agi		*agi;
	int			error;

	trace_xfs_ialloc_read_agi(pag);

	error = xfs_read_agi(pag, tp,
			(flags & XFS_IALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
			&agibp);
	if (error)
		return error;

	agi = agibp->b_addr;
	if (!xfs_perag_initialised_agi(pag)) {
		pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
		pag->pagi_count = be32_to_cpu(agi->agi_count);
		set_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate);
	}

	/*
	 * It's possible for these to be out of sync if
	 * we are in the middle of a forced shutdown.
	 */
	ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
		xfs_is_shutdown(pag_mount(pag)));
	if (agibpp)
		*agibpp = agibp;
	else
		xfs_trans_brelse(tp, agibp);
	return 0;
}

/* How many inodes are backed by inode clusters ondisk? */
STATIC int
xfs_ialloc_count_ondisk(
	struct xfs_btree_cur		*cur,
	xfs_agino_t			low,
	xfs_agino_t			high,
	unsigned int			*allocated)
{
	struct xfs_inobt_rec_incore	irec;
	unsigned int			ret = 0;
	int				has_record;
	int				error;

	error = xfs_inobt_lookup(cur, low, XFS_LOOKUP_LE, &has_record);
	if (error)
		return error;

	while (has_record) {
		unsigned int		i, hole_idx;

		error = xfs_inobt_get_rec(cur, &irec, &has_record);
		if (error)
			return error;
		if (irec.ir_startino > high)
			break;

		for (i = 0; i < XFS_INODES_PER_CHUNK; i++) {
			if (irec.ir_startino + i < low)
				continue;
			if (irec.ir_startino + i > high)
				break;

			hole_idx = i / XFS_INODES_PER_HOLEMASK_BIT;
			if (!(irec.ir_holemask & (1U << hole_idx)))
				ret++;
		}

		error = xfs_btree_increment(cur, 0, &has_record);
		if (error)
			return error;
	}

	*allocated = ret;
	return 0;
}
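
/*
 * Illustrative sketch (not part of the original file): the loop above walks
 * all inode slots of a chunk and treats a slot as backed by disk space only
 * when its holemask bit is clear; each holemask bit covers
 * XFS_INODES_PER_HOLEMASK_BIT inodes.  A standalone version of that counting
 * with the constants written out (64 inodes per chunk, 4 inodes per holemask
 * bit); the example_* name is hypothetical.
 */
static inline unsigned int example_count_ondisk(unsigned short holemask)
{
	unsigned int i, n = 0;

	for (i = 0; i < 64; i++) {			/* XFS_INODES_PER_CHUNK */
		unsigned int hole_idx = i / 4;		/* XFS_INODES_PER_HOLEMASK_BIT */

		/* a set holemask bit marks a hole, i.e. no disk space */
		if (!(holemask & (1U << hole_idx)))
			n++;
	}
	return n;
}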

/* Is there an inode record covering a given extent? */
int
xfs_ialloc_has_inodes_at_extent(
	struct xfs_btree_cur	*cur,
	xfs_agblock_t		bno,
	xfs_extlen_t		len,
	enum xbtree_recpacking	*outcome)
{
	xfs_agino_t		agino;
	xfs_agino_t		last_agino;
	unsigned int		allocated;
	int			error;

	agino = XFS_AGB_TO_AGINO(cur->bc_mp, bno);
	last_agino = XFS_AGB_TO_AGINO(cur->bc_mp, bno + len) - 1;

	error = xfs_ialloc_count_ondisk(cur, agino, last_agino, &allocated);
	if (error)
		return error;

	if (allocated == 0)
		*outcome = XBTREE_RECPACKING_EMPTY;
	else if (allocated == last_agino - agino + 1)
		*outcome = XBTREE_RECPACKING_FULL;
	else
		*outcome = XBTREE_RECPACKING_SPARSE;
	return 0;
}

struct xfs_ialloc_count_inodes {
	xfs_agino_t			count;
	xfs_agino_t			freecount;
};

/* Record inode counts across all inobt records. */
STATIC int
xfs_ialloc_count_inodes_rec(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_rec	*rec,
	void				*priv)
{
	struct xfs_inobt_rec_incore	irec;
	struct xfs_ialloc_count_inodes	*ci = priv;
	xfs_failaddr_t			fa;

	xfs_inobt_btrec_to_irec(cur->bc_mp, rec, &irec);
	fa = xfs_inobt_check_irec(to_perag(cur->bc_group), &irec);
	if (fa)
		return xfs_inobt_complain_bad_rec(cur, fa, &irec);

	ci->count += irec.ir_count;
	ci->freecount += irec.ir_freecount;

	return 0;
}

/* Count allocated and free inodes under an inobt. */
int
xfs_ialloc_count_inodes(
	struct xfs_btree_cur		*cur,
	xfs_agino_t			*count,
	xfs_agino_t			*freecount)
{
	struct xfs_ialloc_count_inodes	ci = {0};
	int				error;

	ASSERT(xfs_btree_is_ino(cur->bc_ops));
	error = xfs_btree_query_all(cur, xfs_ialloc_count_inodes_rec, &ci);
	if (error)
		return error;

	*count = ci.count;
	*freecount = ci.freecount;
	return 0;
}

/*
 * Initialize inode-related geometry information.
 *
 * Compute the inode btree min and max levels and set maxicount.
 *
 * Set the inode cluster size. This may still be overridden by the file
 * system block size if it is larger than the chosen cluster size.
 *
 * For v5 filesystems, scale the cluster size with the inode size to keep a
 * constant ratio of inode per cluster buffer, but only if mkfs has set the
 * inode alignment value appropriately for larger cluster sizes.
 *
 * Then compute the inode cluster alignment information.
 */
void
xfs_ialloc_setup_geometry(
	struct xfs_mount	*mp)
{
	struct xfs_sb		*sbp = &mp->m_sb;
	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
	uint64_t		icount;
	uint			inodes;

	igeo->new_diflags2 = 0;
	if (xfs_has_bigtime(mp))
		igeo->new_diflags2 |= XFS_DIFLAG2_BIGTIME;
	if (xfs_has_large_extent_counts(mp))
		igeo->new_diflags2 |= XFS_DIFLAG2_NREXT64;

	/* Compute inode btree geometry. */
	igeo->agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
	igeo->inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, true);
	igeo->inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, false);
	igeo->inobt_mnr[0] = igeo->inobt_mxr[0] / 2;
	igeo->inobt_mnr[1] = igeo->inobt_mxr[1] / 2;

	igeo->ialloc_inos = max_t(uint16_t, XFS_INODES_PER_CHUNK,
			sbp->sb_inopblock);
	igeo->ialloc_blks = igeo->ialloc_inos >> sbp->sb_inopblog;

	if (sbp->sb_spino_align)
		igeo->ialloc_min_blks = sbp->sb_spino_align;
	else
		igeo->ialloc_min_blks = igeo->ialloc_blks;

	/* Compute and fill in value of m_ino_geo.inobt_maxlevels. */
	inodes = (1LL << XFS_INO_AGINO_BITS(mp)) >> XFS_INODES_PER_CHUNK_LOG;
	igeo->inobt_maxlevels = xfs_btree_compute_maxlevels(igeo->inobt_mnr,
			inodes);
	ASSERT(igeo->inobt_maxlevels <= xfs_iallocbt_maxlevels_ondisk());

	/*
	 * Set the maximum inode count for this filesystem, being careful not
	 * to use obviously garbage sb_inopblog/sb_inopblock values. Regular
	 * users should never get here due to failing sb verification, but
	 * certain users (xfs_db) need to be usable even with corrupt metadata.
	 */
	if (sbp->sb_imax_pct && igeo->ialloc_blks) {
		/*
		 * Make sure the maximum inode count is a multiple
		 * of the units we allocate inodes in.
		 */
		icount = sbp->sb_dblocks * sbp->sb_imax_pct;
		do_div(icount, 100);
		do_div(icount, igeo->ialloc_blks);
		igeo->maxicount = XFS_FSB_TO_INO(mp,
				icount * igeo->ialloc_blks);
	} else {
		igeo->maxicount = 0;
	}

	/*
	 * Compute the desired size of an inode cluster buffer, which starts
	 * at 8K and (on v5 filesystems) scales up with larger inode sizes.
	 *
	 * Preserve the desired inode cluster size because the sparse inodes
	 * feature uses that desired size (not the actual size) to compute the
	 * sparse inode alignment. The mount code validates this value, so we
	 * cannot change the behavior.
	 */
	igeo->inode_cluster_size_raw = XFS_INODE_BIG_CLUSTER_SIZE;
	if (xfs_has_v3inodes(mp)) {
		int	new_size = igeo->inode_cluster_size_raw;

		new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE;
		if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size))
			igeo->inode_cluster_size_raw = new_size;
	}

	/* Calculate inode cluster ratios. */
	if (igeo->inode_cluster_size_raw > mp->m_sb.sb_blocksize)
		igeo->blocks_per_cluster = XFS_B_TO_FSBT(mp,
				igeo->inode_cluster_size_raw);
	else
		igeo->blocks_per_cluster = 1;
	igeo->inode_cluster_size = XFS_FSB_TO_B(mp, igeo->blocks_per_cluster);
	igeo->inodes_per_cluster = XFS_FSB_TO_INO(mp, igeo->blocks_per_cluster);

	/* Calculate inode cluster alignment. */
	if (xfs_has_align(mp) &&
	    mp->m_sb.sb_inoalignmt >= igeo->blocks_per_cluster)
		igeo->cluster_align = mp->m_sb.sb_inoalignmt;
	else
		igeo->cluster_align = 1;
	igeo->inoalign_mask = igeo->cluster_align - 1;
	igeo->cluster_align_inodes = XFS_FSB_TO_INO(mp, igeo->cluster_align);

	/*
	 * If we are using stripe alignment, check whether
	 * the stripe unit is a multiple of the inode alignment.
	 */
	if (mp->m_dalign && igeo->inoalign_mask &&
	    !(mp->m_dalign & igeo->inoalign_mask))
		igeo->ialloc_align = mp->m_dalign;
	else
		igeo->ialloc_align = 0;

	if (mp->m_sb.sb_blocksize > PAGE_SIZE)
		igeo->min_folio_order = mp->m_sb.sb_blocklog - PAGE_SHIFT;
	else
		igeo->min_folio_order = 0;
}
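
/*
 * Illustrative sketch (not part of the original file): a standalone, worked
 * version of the cluster sizing above.  With 512-byte inodes on a v5
 * filesystem the 8K base cluster (XFS_INODE_BIG_CLUSTER_SIZE) scales by
 * 512 / 256 (XFS_DINODE_MIN_SIZE) to 16K, and a 4K block size then gives four
 * blocks per cluster.  The example_* name is hypothetical, and the real code
 * additionally only accepts the scaled size when mkfs set sb_inoalignmt large
 * enough; that gate is omitted here for brevity.
 */
static inline unsigned int example_blocks_per_cluster(unsigned int inodesize,
						      unsigned int blocksize,
						      int v3inodes)
{
	unsigned int cluster = 8192;		/* XFS_INODE_BIG_CLUSTER_SIZE */

	if (v3inodes)
		cluster *= inodesize / 256;	/* XFS_DINODE_MIN_SIZE */
	if (cluster > blocksize)
		return cluster / blocksize;	/* ~ XFS_B_TO_FSBT() */
	return 1;
}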

/* Compute the location of the root directory inode that is laid out by mkfs. */
xfs_ino_t
xfs_ialloc_calc_rootino(
	struct xfs_mount	*mp,
	int			sunit)
{
	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
	xfs_agblock_t		first_bno;

	/*
	 * Pre-calculate the geometry of AG 0. We know what it looks like
	 * because libxfs knows how to create allocation groups now.
	 *
	 * first_bno is the first block in which mkfs could possibly have
	 * allocated the root directory inode, once we factor in the metadata
	 * that mkfs formats before it. Namely, the four AG headers...
	 */
	first_bno = howmany(4 * mp->m_sb.sb_sectsize, mp->m_sb.sb_blocksize);

	/* ...the two free space btree roots... */
	first_bno += 2;

	/* ...the inode btree root... */
	first_bno++;

	/* ...the initial AGFL... */
	first_bno += xfs_alloc_min_freelist(mp, NULL);

	/* ...the free inode btree root... */
	if (xfs_has_finobt(mp))
		first_bno++;

	/* ...the reverse mapping btree root... */
	if (xfs_has_rmapbt(mp))
		first_bno++;

	/* ...the reference count btree... */
	if (xfs_has_reflink(mp))
		first_bno++;

	/*
	 * ...and the log, if it is allocated in the first allocation group.
	 *
	 * This can happen with filesystems that only have a single
	 * allocation group, or very odd geometries created by old mkfs
	 * versions on very small filesystems.
	 */
	if (xfs_ag_contains_log(mp, 0))
		first_bno += mp->m_sb.sb_logblocks;

	/*
	 * Now round first_bno up to whatever allocation alignment is given
	 * by the filesystem or was passed in.
	 */
	if (xfs_has_dalign(mp) && igeo->ialloc_align > 0)
		first_bno = roundup(first_bno, sunit);
	else if (xfs_has_align(mp) &&
			mp->m_sb.sb_inoalignmt > 1)
		first_bno = roundup(first_bno, mp->m_sb.sb_inoalignmt);

	return XFS_AGINO_TO_INO(mp, 0, XFS_AGB_TO_AGINO(mp, first_bno));
}
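
/*
 * Illustrative sketch (not part of the original file): a standalone rendering
 * of the first_bno accounting above.  The parameters describe a hypothetical
 * filesystem (sector/block sizes, an AGFL block estimate, which feature
 * btrees exist, whether the internal log lives in AG 0); none of these values
 * come from a real superblock, and the example_* name is an assumption.
 */
static inline unsigned long long example_first_bno(unsigned int sectsize,
						   unsigned int blocksize,
						   unsigned int agfl_blocks,
						   int finobt, int rmapbt,
						   int reflink,
						   unsigned int log_blocks_in_ag0,
						   unsigned int align)
{
	/* four AG headers, rounded up to whole blocks (howmany()) */
	unsigned long long bno = (4ULL * sectsize + blocksize - 1) / blocksize;

	bno += 2;			/* bnobt + cntbt roots */
	bno += 1;			/* inobt root */
	bno += agfl_blocks;		/* initial AGFL */
	bno += finobt ? 1 : 0;		/* finobt root */
	bno += rmapbt ? 1 : 0;		/* rmapbt root */
	bno += reflink ? 1 : 0;		/* refcountbt root */
	bno += log_blocks_in_ag0;	/* internal log, if it lives in AG 0 */

	/* round up to the inode/stripe alignment (roundup()) */
	if (align > 1)
		bno = ((bno + align - 1) / align) * align;
	return bno;
}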

/*
 * Ensure there are not sparse inode clusters that cross the new EOAG.
 *
 * This is a no-op for non-spinode filesystems since clusters are always fully
 * allocated and checking the bnobt suffices. However, a spinode filesystem
 * could have a record where the upper inodes are free blocks. If those blocks
 * were removed from the filesystem, the inode record would extend beyond EOAG,
 * which will be flagged as corruption.
 */
int
xfs_ialloc_check_shrink(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	struct xfs_buf		*agibp,
	xfs_agblock_t		new_length)
{
	struct xfs_inobt_rec_incore rec;
	struct xfs_btree_cur	*cur;
	xfs_agino_t		agino;
	int			has;
	int			error;

	if (!xfs_has_sparseinodes(pag_mount(pag)))
		return 0;

	cur = xfs_inobt_init_cursor(pag, tp, agibp);

	/* Look up the inobt record that would correspond to the new EOFS. */
	agino = XFS_AGB_TO_AGINO(pag_mount(pag), new_length);
	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has);
	if (error || !has)
		goto out;

	error = xfs_inobt_get_rec(cur, &rec, &has);
	if (error)
		goto out;

	if (!has) {
		xfs_ag_mark_sick(pag, XFS_SICK_AG_INOBT);
		error = -EFSCORRUPTED;
		goto out;
	}

	/* If the record covers inodes that would be beyond EOFS, bail out. */
	if (rec.ir_startino + XFS_INODES_PER_CHUNK > agino) {
		error = -ENOSPC;
		goto out;
	}
out:
	xfs_btree_del_cursor(cur, error);
	return error;
}