2 * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
5 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
6 /* All Rights Reserved */
9 * Copyright (c) 1980, 1986, 1990 The Regents of the University of California.
10 * All rights reserved.
12 * Redistribution and use in source and binary forms are permitted
13 * provided that: (1) source distributions retain this entire copyright
14 * notice and comment, and (2) distributions including binaries display
15 * the following acknowledgement: ``This product includes software
16 * developed by the University of California, Berkeley and its contributors''
17 * in the documentation or other materials provided with the distribution
18 * and in all advertising materials mentioning features or use of this
19 * software. Neither the name of the University nor the names of its
20 * contributors may be used to endorse or promote products derived
21 * from this software without specific prior written permission.
22 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
23 * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
24 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
34 #include <sys/param.h>
35 #include <sys/types.h>
36 #include <sys/sysmacros.h>
37 #include <sys/mntent.h>
38 #include <sys/vnode.h>
39 #include <sys/fs/ufs_inode.h>
40 #include <sys/fs/ufs_fs.h>
42 #include <sys/fs/ufs_fsdir.h>
47 static int get_indir_offsets(int, daddr_t
, int *, int *);
48 static int clearanentry(struct inodesc
*);
49 static void pdinode(struct dinode
*);
50 static void inoflush(void);
51 static void mark_delayed_inodes(fsck_ino_t
, daddr32_t
);
52 static int iblock(struct inodesc
*, int, u_offset_t
, enum cki_action
);
53 static struct inoinfo
*search_cache(struct inoinfo
*, fsck_ino_t
);
54 static int ckinode_common(struct dinode
*, struct inodesc
*, enum cki_action
);
55 static int lookup_dotdot_ino(fsck_ino_t
);
58 * ckinode() essentially traverses the blocklist of the provided
59 * inode. For each block either the caller-supplied callback (id_func
60 * in the provided struct inodesc) or dirscan() is invoked. Which is
61 * chosen is controlled by what type of traversal was requested
62 * (id_type) - if it was for an ADDR or ACL, use the callback,
63 * otherwise it is assumed to be DATA (i.e., a directory) whose
64 * contents need to be scanned.
66 * Note that a directory inode can get passed in with a type of ADDR;
67 * the type field is orthogonal to the IFMT value. This is so that
68 * the file aspects (no duplicate blocks, etc) of a directory can be
69 * verified just like is done for any other file, or the actual
70 * contents can be scanned so that connectivity and such can be checked.
73 * The traversal is controlled by flags in the return value of
74 * dirscan() or the callback. Five flags are defined, STOP, SKIP,
75 * KEEPON, ALTERED, and FOUND. Their semantics are:
77 * STOP - no further processing of this inode is desired/possible/
78 * feasible/etc. This can mean that whatever the scan
79 * was searching for was found, or a serious
80 * inconsistency was encountered, or anything else
83 * SKIP - something that made it impossible to continue was
84 * encountered, and the caller should go on to the next
85 * inode. This is more for i/o failures than for
86 * logical inconsistencies. Nothing actually looks for this flag.
89 * KEEPON - no more blocks of this inode need to be scanned, but
90 * nothing's wrong, so keep on going with the next
91 * inode. It is similar to STOP, except that
92 * ckinode()'s caller will typically advance to the next
93 * inode for KEEPON, whereas it ceases scanning through
94 * the inodes completely for STOP.
96 * ALTERED - a change was made to the inode. If the caller sees
97 * this set, it should make sure to flush out the
98 * changes. Note that any data blocks read in by the
99 * function need to be marked dirty by it directly;
100 * flushing of those will happen automatically later.
102 * FOUND - whatever was being searched for was located.
103 * Typically combined with STOP to avoid wasting time
104 * doing additional looking.
106 * During a traversal, some state needs to be carried around. At the
107 * least, the callback functions need to know what inode they're
108 * working on, which logical block, and whether or not fixing problems
109 * when they're encountered is desired. Rather than try to guess what
110 * else might be needed (and thus end up passing way more arguments
111 * than is reasonable), all the possibilities have been bundled in
112 * struct inodesc. About half of the fields are specific to directory
113 * traversals, and the rest are pretty much generic to any traversal.
115 * The general fields are:
117 * id_fix What to do when an error is found. Generally, this
118 * is set to DONTKNOW before a traversal. If a
119 * problem is encountered, it is changed to either FIX
120 * or NOFIX by the dofix() query function. If id_fix
121 * has already been set to FIX when dofix() is called, then
122 * it includes the ALTERED flag (see above) in its return
123 * value; the net effect is that the inode's buffer
124 * will get marked dirty and written to disk at some
125 * point. If id_fix is DONTKNOW, then dofix() will
126 * query the user. If it is NOFIX, then dofix()
127 * essentially does nothing. A few routines set NOFIX
128 * as the initial value, as they are performing a best-
129 * effort informational task, rather than an actual
132 * id_func This is the function that will be called for every
133 * logical block in the file (assuming id_type is not
134 * DATA). The logical block may represent a hole, so
135 * the callback needs to be prepared to handle that
136 * case. Its return value is a combination of the flags
137 * described above (SKIP, ALTERED, etc).
139 * id_number The inode number whose block list or data is being traversed.
142 * id_parent When id_type is DATA, this is the inode number for
143 * the parent of id_number. Otherwise, it is
144 * available for use as an extra parameter or return
145 * value between the callback and ckinode()'s caller.
146 * Which, if either, of those is left completely up to
147 * the two routines involved, so nothing can generally
148 * be assumed about the id_parent value for non-DATA
151 * id_lbn This is the current logical block (not fragment)
152 * number being visited by the traversal.
154 * id_blkno This is the physical block corresponding to id_lbn.
156 * id_numfrags This defines how large a block is being processed in
157 * this particular invocation of the callback.
158 * Usually, it will be the same as sblock.fs_frag.
159 * However, if a direct block is being processed and
160 * it is less than a full filesystem block,
161 * id_numfrags will indicate just how many fragments
162 * (starting from id_lbn) are actually part of the
165 * id_truncto The pass 4 callback is used in several places to
166 * free the blocks of a file (the `FILE HAS PROBLEM
167 * FOO; CLEAR?' scenario). This has been generalized
168 * to allow truncating a file to a particular length
169 * rather than always completely discarding it. If
170 * id_truncto is -1, then the entire file is released,
171 * otherwise it is the logical block number to truncate
172 * to. This generalized interface was motivated by a
173 * desire to be able to discard everything after a
174 * hole in a directory, rather than the entire
177 * id_type Selects the type of traversal. DATA for dirscan(),
178 * ADDR or ACL for using the provided callback.
180 * There are several more fields used just for dirscan() traversals:
182 * id_filesize The number of bytes in the overall directory left to
185 * id_loc Byte position within the directory block. Should always
186 * point to the start of a directory entry.
188 * id_entryno Which logical directory entry is being processed (0
189 * is `.', 1 is `..', 2 and on are normal entries).
190 * This field is primarily used to enable special
191 * checks when looking at the first two entries.
193 * The exception (there's always an exception in fsck)
194 * is that in pass 1, it tracks how many fragments are
195 * being used by a particular inode.
197 * id_firsthole The first logical block number that was found to
198 * be zero. As directories are not supposed to have
199 * holes, this marks where a directory should be
200 * truncated down to. A value of -1 indicates that
201 * no holes were found.
203 * id_dirp A pointer to the in-memory copy of the current
204 * directory entry (as identified by id_loc).
206 * id_name This is a directory entry name to either create
207 * (callback is mkentry) or locate (callback is
208 * chgino, findino, or findname).
211 ckinode(struct dinode
*dp
, struct inodesc
*idesc
, enum cki_action action
)
213 struct inodesc cleardesc
;
216 if (idesc
->id_filesize
== 0)
217 idesc
->id_filesize
= (offset_t
)dp
->di_size
;
220 * Our caller should be filtering out completely-free inodes
221 * (mode == zero), so we'll work on the assumption that what
222 * we're given has some basic validity.
224 * The kernel is inconsistent about MAXPATHLEN including the
225 * trailing \0, so allow the more-generous length for symlinks.
227 mode
= dp
->di_mode
& IFMT
;
228 if (mode
== IFBLK
|| mode
== IFCHR
)
230 if (mode
== IFLNK
&& dp
->di_size
> MAXPATHLEN
) {
231 pwarn("I=%d Symlink longer than supported maximum\n",
233 init_inodesc(&cleardesc
);
234 cleardesc
.id_type
= ADDR
;
235 cleardesc
.id_number
= idesc
->id_number
;
236 cleardesc
.id_fix
= DONTKNOW
;
237 clri(&cleardesc
, "BAD", CLRI_VERBOSE
, CLRI_NOP_CORRUPT
);
240 return (ckinode_common(dp
, idesc
, action
));
244 * This was split out from ckinode() to allow it to be used
245 * without having to pass in kludge flags to suppress the
246 * wrong-for-deletion initialization and irrelevant checks.
247 * This feature is no longer needed, but is being kept in case
248 * the need comes back.
251 ckinode_common(struct dinode
*dp
, struct inodesc
*idesc
,
252 enum cki_action action
)
257 int indir_data_blks
, last_indir_blk
;
260 (void) memmove(&dino
, dp
, sizeof (struct dinode
));
261 ndb
= howmany(dino
.di_size
, (u_offset_t
)sblock
.fs_bsize
);
263 for (i
= 0; i
< NDADDR
; i
++) {
265 offset
= blkoff(&sblock
, dino
.di_size
);
266 if ((--ndb
== 0) && (offset
!= 0)) {
268 numfrags(&sblock
, fragroundup(&sblock
, offset
));
270 idesc
->id_numfrags
= sblock
.fs_frag
;
272 if (dino
.di_db
[i
] == 0) {
273 if ((ndb
> 0) && (idesc
->id_firsthole
< 0)) {
274 idesc
->id_firsthole
= i
;
278 idesc
->id_blkno
= dino
.di_db
[i
];
279 if (idesc
->id_type
== ADDR
|| idesc
->id_type
== ACL
)
280 ret
= (*idesc
->id_func
)(idesc
);
282 ret
= dirscan(idesc
);
285 * Need to clear the entry, now that we're done with
286 * it. We depend on freeblk() ignoring a request to
287 * free already-free fragments to handle the problem of
290 if ((action
== CKI_TRUNCATE
) &&
291 (idesc
->id_truncto
>= 0) &&
292 (idesc
->id_lbn
>= idesc
->id_truncto
)) {
293 dp
= ginode(idesc
->id_number
);
295 * The (int) cast is safe, in that if di_size won't
296 * fit, it'll be a multiple of any legal fs_frag,
297 * thus giving a zero result. That value, in turn
298 * means we're doing an entire block.
300 frags
= howmany((int)dp
->di_size
, sblock
.fs_fsize
) %
303 frags
= sblock
.fs_frag
;
304 freeblk(idesc
->id_number
, dp
->di_db
[i
],
306 dp
= ginode(idesc
->id_number
);
318 * Cure a lint complaint of ``possible use before set''.
319 * Apparently it can't quite figure out the switch statement.
324 * indir_data_blks contains the number of data blocks in all
325 * the previous levels for this iteration. E.g., for the
326 * single indirect case (i = 0, di_ib[i] != 0), NDADDR's worth
327 * of blocks have already been covered by the direct blocks
328 * (di_db[]). At the triple indirect level (i = NIADDR - 1),
329 * it is all of the number of data blocks that were covered
330 * by the second indirect, single indirect, and direct block
333 idesc
->id_numfrags
= sblock
.fs_frag
;
334 ndb
= howmany(dino
.di_size
, (u_offset_t
)sblock
.fs_bsize
);
335 for (i
= 0; i
< NIADDR
; i
++) {
336 (void) get_indir_offsets(i
, ndb
, &indir_data_blks
,
338 if (dino
.di_ib
[i
] != 0) {
340 * We'll only clear di_ib[i] if the first entry (and
341 * therefore all of them) is to be cleared, since we
342 * only go through this code on the first entry of
343 * each level of indirection. The +1 is to account
344 * for the fact that we don't modify id_lbn until
345 * we actually start processing on a data block.
347 idesc
->id_blkno
= dino
.di_ib
[i
];
348 ret
= iblock(idesc
, i
+ 1,
349 (u_offset_t
)howmany(dino
.di_size
,
350 (u_offset_t
)sblock
.fs_bsize
) - indir_data_blks
,
352 if ((action
== CKI_TRUNCATE
) &&
353 (idesc
->id_truncto
<= indir_data_blks
) &&
354 ((idesc
->id_lbn
+ 1) >= indir_data_blks
) &&
355 ((idesc
->id_lbn
+ 1) <= last_indir_blk
)) {
356 dp
= ginode(idesc
->id_number
);
357 if (dp
->di_ib
[i
] != 0) {
358 freeblk(idesc
->id_number
, dp
->di_ib
[i
],
366 * Need to know which of the file's logical blocks
367 * reside in the missing indirect block. However, the
368 * precise location is only needed for truncating
369 * directories, and level-of-indirection precision is
370 * sufficient for that.
372 if ((indir_data_blks
< ndb
) &&
373 (idesc
->id_firsthole
< 0)) {
374 idesc
->id_firsthole
= indir_data_blks
;
382 get_indir_offsets(int ilevel_wanted
, daddr_t ndb
, int *data_blks
,
389 for (ilevel
= 0; ilevel
< NIADDR
; ilevel
++) {
393 lblk
= dblks
+ NINDIR(&sblock
) - 1;
396 dblks
= NDADDR
+ NINDIR(&sblock
);
397 lblk
= dblks
+ (NINDIR(&sblock
) * NINDIR(&sblock
)) - 1;
400 dblks
= NDADDR
+ NINDIR(&sblock
) +
401 (NINDIR(&sblock
) * NINDIR(&sblock
));
402 lblk
= dblks
+ (NINDIR(&sblock
) * NINDIR(&sblock
) *
403 NINDIR(&sblock
)) - 1;
406 exitstat
= EXERRFATAL
;
408 * Translate from zero-based array to
409 * one-based human-style counting.
411 errexit("panic: indirection level %d not 1, 2, or 3",
416 if (dblks
< ndb
&& ndb
<= lblk
)
419 if (ilevel
== ilevel_wanted
) {
420 if (data_blks
!= NULL
)
422 if (last_blk
!= NULL
)
431 iblock(struct inodesc
*idesc
, int ilevel
, u_offset_t iblks
,
432 enum cki_action action
)
436 int (*func
)(struct inodesc
*) = NULL
;
437 u_offset_t fsbperindirb
;
444 switch (idesc
->id_type
) {
446 func
= idesc
->id_func
;
447 if (((n
= (*func
)(idesc
)) & KEEPON
) == 0)
451 func
= idesc
->id_func
;
457 errexit("unknown inodesc type %d in iblock()", idesc
->id_type
);
460 if (chkrange(idesc
->id_blkno
, idesc
->id_numfrags
)) {
461 return ((idesc
->id_type
== ACL
) ? STOP
: SKIP
);
464 bp
= getdatablk(idesc
->id_blkno
, (size_t)sblock
.fs_bsize
);
465 if (bp
->b_errs
!= 0) {
472 * Trivia note: the BSD fsck has the number of bytes remaining
473 * as the third argument to iblock(), so the equivalent of
474 * fsbperindirb starts at fs_bsize instead of one. We're
475 * working in units of filesystem blocks here, not bytes or
478 for (fsbperindirb
= 1, i
= 0; i
< ilevel
; i
++) {
479 fsbperindirb
*= (u_offset_t
)NINDIR(&sblock
);
482 * nif indicates the next "free" pointer (as an array index) in this
483 * indirect block, based on counting the blocks remaining in the
484 * file after subtracting all previously processed blocks.
485 * This figure is based on the size field of the inode.
487 * Note that in normal operation, nif may initially be calculated
488 * as larger than the number of pointers in this block (as when
489 * there are more indirect blocks following); if that is
490 * the case, nif is limited to the max number of pointers per
493 * Also note that if an inode is inconsistent (has more blocks
494 * allocated to it than the size field would indicate), the sweep
495 * through any indirect blocks directly pointed at by the inode
496 * continues. Since the block offset of any data blocks referenced
497 * by these indirect blocks is greater than the size of the file,
498 * the index nif may be computed as a negative value.
499 * In this case, we reset nif to indicate that all pointers in
500 * this retrieval block should be zeroed and the resulting
501 * unreferenced data and/or retrieval blocks will be recovered
502 * through garbage collection later.
504 nif
= (offset_t
)howmany(iblks
, fsbperindirb
);
505 if (nif
> NINDIR(&sblock
))
506 nif
= NINDIR(&sblock
);
510 * first pass: all "free" retrieval pointers (from [nif] thru
511 * the end of the indirect block) should be zero. (This
512 * assertion does not hold for directories, which may be
513 * truncated without releasing their allocated space)
515 if (nif
< NINDIR(&sblock
) && (idesc
->id_func
== pass1check
||
516 idesc
->id_func
== pass3bcheck
)) {
517 for (i
= nif
; i
< NINDIR(&sblock
); i
++) {
518 if (bp
->b_un
.b_indir
[i
] == 0)
520 (void) sprintf(buf
, "PARTIALLY TRUNCATED INODE I=%lu",
521 (ulong_t
)idesc
->id_number
);
524 } else if (dofix(idesc
, buf
)) {
525 freeblk(idesc
->id_number
,
528 bp
->b_un
.b_indir
[i
] = 0;
532 flush(fswritefd
, bp
);
535 * second pass: all retrieval pointers referring to blocks within
536 * a valid range [0..filesize] (both indirect and data blocks)
537 * are examined in the same manner as ckinode() checks the
538 * direct blocks in the inode. Sweep through from
539 * the first pointer in this retrieval block to [nif-1].
541 last_lbn
= howmany(idesc
->id_filesize
, sblock
.fs_bsize
);
542 for (i
= 0; i
< nif
; i
++) {
545 if (bp
->b_un
.b_indir
[i
] != 0) {
546 idesc
->id_blkno
= bp
->b_un
.b_indir
[i
];
548 n
= iblock(idesc
, ilevel
, iblks
, action
);
550 * Each iteration decreases "remaining block
551 * count" by the number of blocks accessible
552 * by a pointer at this indirect block level.
554 iblks
-= fsbperindirb
;
557 * If we're truncating, func will discard
558 * the data block for us.
563 if ((action
== CKI_TRUNCATE
) &&
564 (idesc
->id_truncto
>= 0) &&
565 (idesc
->id_lbn
>= idesc
->id_truncto
)) {
566 freeblk(idesc
->id_number
, bp
->b_un
.b_indir
[i
],
571 * Note that truncation never gets STOP back
572 * under normal circumstances. Abnormal would
573 * be a bad acl short-circuit in iblock() or
574 * an out-of-range failure in pass4check().
575 * We still want to keep going when truncating
576 * under those circumstances, since the whole
577 * point of truncating is to get rid of all
580 if ((n
& STOP
) && (action
!= CKI_TRUNCATE
)) {
585 if ((idesc
->id_lbn
< last_lbn
) &&
586 (idesc
->id_firsthole
< 0)) {
587 idesc
->id_firsthole
= idesc
->id_lbn
;
589 if (idesc
->id_type
== DATA
) {
591 * No point in continuing in the indirect
592 * blocks of a directory, since they'll just
596 return ((n
& ~KEEPON
) | STOP
);
606 * Check that a block is a legal block number.
607 * Return 0 if in range, 1 if out of range.
610 chkrange(daddr32_t blk
, int cnt
)
614 if (cnt
<= 0 || blk
<= 0 || ((unsigned)blk
>= (unsigned)maxfsblock
) ||
615 ((cnt
- 1) > (maxfsblock
- blk
))) {
618 "Bad fragment range: should be 1 <= %d..%d < %d\n",
619 blk
, blk
+ cnt
, maxfsblock
);
622 if ((cnt
> sblock
.fs_frag
) ||
623 ((fragnum(&sblock
, blk
) + cnt
) > sblock
.fs_frag
)) {
625 (void) printf("Bad fragment size: size %d\n", cnt
);
628 c
= dtog(&sblock
, blk
);
629 if (blk
< cgdmin(&sblock
, c
)) {
630 if ((unsigned)(blk
+ cnt
) > (unsigned)cgsblock(&sblock
, c
)) {
633 "Bad fragment position: %d..%d spans start of cg metadata\n",
638 if ((unsigned)(blk
+ cnt
) > (unsigned)cgbase(&sblock
, c
+1)) {
641 "Bad frag pos: %d..%d crosses end of cg\n",
650 * General purpose interface for reading inodes.
654 * Note that any call to ginode() can potentially invalidate any
655 * dinode pointers previously acquired from it. To avoid pain,
656 * make sure to always call inodirty() immediately after modifying
657 * an inode, if there's any chance of ginode() being called after
658 * that. Also, always call ginode() right before you need to access
659 * an inode, so that there won't be any surprises from functions
660 * called between the previous ginode() invocation and the dinode
663 * Despite all that, we aren't doing the amount of i/o that's implied,
664 * as we use the buffer cache that getdatablk() and friends maintain.
666 static fsck_ino_t startinum
= -1;
669 ginode(fsck_ino_t inum
)
674 if (inum
< UFSROOTINO
|| inum
> maxino
) {
675 errexit("bad inode number %d to ginode\n", inum
);
677 if (startinum
== -1 ||
680 inum
>= (fsck_ino_t
)(startinum
+ (fsck_ino_t
)INOPB(&sblock
))) {
681 iblk
= itod(&sblock
, inum
);
686 * We don't check for errors here, because we can't
687 * tell our caller about it, and the zeros that will
688 * be in the buffer are just as good as anything we
691 pbp
= getdatablk(iblk
, (size_t)sblock
.fs_bsize
);
693 (fsck_ino_t
)((inum
/ INOPB(&sblock
)) * INOPB(&sblock
));
695 dp
= &pbp
->b_un
.b_dinode
[inum
% INOPB(&sblock
)];
696 if (dp
->di_suid
!= UID_LONG
)
697 dp
->di_uid
= dp
->di_suid
;
698 if (dp
->di_sgid
!= GID_LONG
)
699 dp
->di_gid
= dp
->di_sgid
;
704 * Special purpose version of ginode used to optimize first pass
705 * over all the inodes in numerical order. It bypasses the buffer
706 * system used by ginode(), etc in favour of reading the bulk of a
707 * cg's inodes at one time.
709 static fsck_ino_t nextino
, lastinum
;
710 static int64_t readcnt
, readpercg
, fullcnt
, inobufsize
;
711 static int64_t partialcnt
, partialsize
;
712 static size_t lastsize
;
713 static struct dinode
*inodebuf
;
714 static diskaddr_t currentdblk
;
715 static struct dinode
*currentinode
;
718 getnextinode(fsck_ino_t inum
)
722 static struct dinode
*dp
;
724 if (inum
!= nextino
++ || inum
> maxino
)
725 errexit("bad inode number %d to nextinode\n", inum
);
728 * Will always go into the if() the first time we're called,
729 * so dp will always be valid.
731 if (inum
>= lastinum
) {
733 dblk
= fsbtodb(&sblock
, itod(&sblock
, lastinum
));
735 if (readcnt
% readpercg
== 0) {
736 if (partialsize
> SIZE_MAX
)
738 "Internal error: partialsize overflow");
739 size
= (size_t)partialsize
;
740 lastinum
+= partialcnt
;
742 if (inobufsize
> SIZE_MAX
)
743 errexit("Internal error: inobufsize overflow");
744 size
= (size_t)inobufsize
;
748 * If fsck_bread() returns an error, it will already have
749 * zeroed out the buffer, so we do not need to do so here.
751 (void) fsck_bread(fsreadfd
, (caddr_t
)inodebuf
, dblk
, size
);
760 * Reread the current getnext() buffer. This allows for changing inodes
761 * other than the current one via ginode()/inodirty()/inoflush().
763 * Just reuses all the interesting variables that getnextinode() set up
764 * last time it was called. This shouldn't get called often, so we don't
765 * try to figure out if the caller's actually touched an inode in the
766 * range we have cached. There could have been an arbitrary number of
772 if (inodebuf
== NULL
) {
777 (void) fsck_bread(fsreadfd
, (caddr_t
)inodebuf
, currentdblk
, lastsize
);
778 return (currentinode
);
788 inobufsize
= blkroundup(&sblock
, INOBUFSIZE
);
789 fullcnt
= inobufsize
/ sizeof (struct dinode
);
790 readpercg
= sblock
.fs_ipg
/ fullcnt
;
791 partialcnt
= sblock
.fs_ipg
% fullcnt
;
792 partialsize
= partialcnt
* sizeof (struct dinode
);
793 if (partialcnt
!= 0) {
796 partialcnt
= fullcnt
;
797 partialsize
= inobufsize
;
799 if (inodebuf
== NULL
&&
800 (inodebuf
= (struct dinode
*)malloc((unsigned)inobufsize
)) == NULL
)
801 errexit("Cannot allocate space for inode buffer\n");
802 while (nextino
< UFSROOTINO
)
803 (void) getnextinode(nextino
);
809 if (inodebuf
!= NULL
) {
810 free((void *)inodebuf
);
816 * Routines to maintain information about directory inodes.
817 * This is built during the first pass and used during the
818 * second and third passes.
820 * Enter inodes into the cache.
823 cacheino(struct dinode
*dp
, fsck_ino_t inum
)
826 struct inoinfo
**inpp
;
829 blks
= NDADDR
+ NIADDR
;
830 inp
= (struct inoinfo
*)
831 malloc(sizeof (*inp
) + (blks
- 1) * sizeof (daddr32_t
));
833 errexit("Cannot increase directory list\n");
834 init_inoinfo(inp
, dp
, inum
); /* doesn't touch i_nextlist or i_number */
835 inpp
= &inphead
[inum
% numdirs
];
836 inp
->i_nextlist
= *inpp
;
838 inp
->i_number
= inum
;
839 if (inplast
== listmax
) {
841 inpsort
= (struct inoinfo
**)realloc((void *)inpsort
,
842 (unsigned)listmax
* sizeof (struct inoinfo
*));
844 errexit("cannot increase directory list");
846 inpsort
[inplast
++] = inp
;
850 * Look up an inode cache structure.
853 getinoinfo(fsck_ino_t inum
)
857 inp
= search_cache(inphead
[inum
% numdirs
], inum
);
862 * Determine whether inode is in cache.
865 inocached(fsck_ino_t inum
)
867 return (search_cache(inphead
[inum
% numdirs
], inum
) != NULL
);
871 * Clean up all of the inode cache structures.
876 struct inoinfo
**inpp
;
880 for (inpp
= &inpsort
[inplast
- 1]; inpp
>= inpsort
; inpp
--) {
881 free((void *)(*inpp
));
883 free((void *)inphead
);
884 free((void *)inpsort
);
885 inphead
= inpsort
= NULL
;
889 * Routines to maintain information about acl inodes.
890 * This is built during the first pass and used during the
891 * second and third passes.
893 * Enter acl inodes into the cache.
896 cacheacl(struct dinode
*dp
, fsck_ino_t inum
)
898 struct inoinfo
*aclp
;
899 struct inoinfo
**aclpp
;
902 blks
= NDADDR
+ NIADDR
;
903 aclp
= (struct inoinfo
*)
904 malloc(sizeof (*aclp
) + (blks
- 1) * sizeof (daddr32_t
));
907 aclpp
= &aclphead
[inum
% numacls
];
908 aclp
->i_nextlist
= *aclpp
;
910 aclp
->i_number
= inum
;
911 aclp
->i_isize
= (offset_t
)dp
->di_size
;
912 aclp
->i_blkssize
= (size_t)(blks
* sizeof (daddr32_t
));
913 (void) memmove(&aclp
->i_blks
[0], &dp
->di_db
[0], aclp
->i_blkssize
);
914 if (aclplast
== aclmax
) {
916 aclpsort
= (struct inoinfo
**)realloc((char *)aclpsort
,
917 (unsigned)aclmax
* sizeof (struct inoinfo
*));
918 if (aclpsort
== NULL
)
919 errexit("cannot increase acl list");
921 aclpsort
[aclplast
++] = aclp
;
926 * Generic cache search function.
927 * ROOT is the first entry in a hash chain (the caller is expected
928 * to have done the initial bucket lookup). KEY is what's being
931 * Returns a pointer to the entry if it is found, NULL otherwise.
933 static struct inoinfo
*
934 search_cache(struct inoinfo
*element
, fsck_ino_t key
)
936 while (element
!= NULL
) {
937 if (element
->i_number
== key
)
939 element
= element
->i_nextlist
;
955 flush(fswritefd
, pbp
);
959 * Interactive wrapper for freeino(), for those times when we're
960 * not sure if we should throw something away.
963 clri(struct inodesc
*idesc
, char *type
, int verbose
, int corrupting
)
968 if (statemap
[idesc
->id_number
] == USTATE
)
971 dp
= ginode(idesc
->id_number
);
972 if (verbose
== CLRI_VERBOSE
) {
973 pwarn("%s %s", type
, file_id(idesc
->id_number
, dp
->di_mode
));
974 pinode(idesc
->id_number
);
976 if (preen
|| (reply("CLEAR") == 1)) {
977 need_parent
= (corrupting
== CLRI_NOP_OK
) ?
978 TI_NOPARENT
: TI_PARENT
;
979 freeino(idesc
->id_number
, need_parent
);
981 (void) printf(" (CLEARED)\n");
982 remove_orphan_dir(idesc
->id_number
);
983 } else if (corrupting
== CLRI_NOP_CORRUPT
) {
990 * Find the directory entry for the inode noted in id_parent (which is
991 * not necessarily the parent of anything, we're just using a convenient
995 findname(struct inodesc
*idesc
)
997 struct direct
*dirp
= idesc
->id_dirp
;
999 if (dirp
->d_ino
!= idesc
->id_parent
)
1001 (void) memmove(idesc
->id_name
, dirp
->d_name
,
1002 MIN(dirp
->d_namlen
, MAXNAMLEN
) + 1);
1003 return (STOP
|FOUND
);
1007 * Find the inode number associated with the given name.
1010 findino(struct inodesc
*idesc
)
1012 struct direct
*dirp
= idesc
->id_dirp
;
1014 if (dirp
->d_ino
== 0)
1016 if (strcmp(dirp
->d_name
, idesc
->id_name
) == 0 &&
1017 dirp
->d_ino
>= UFSROOTINO
&& dirp
->d_ino
<= maxino
) {
1018 idesc
->id_parent
= dirp
->d_ino
;
1019 return (STOP
|FOUND
);
1025 cleardirentry(fsck_ino_t parentdir
, fsck_ino_t target
)
1027 struct inodesc idesc
;
1030 dp
= ginode(parentdir
);
1031 init_inodesc(&idesc
);
1032 idesc
.id_func
= clearanentry
;
1033 idesc
.id_parent
= target
;
1034 idesc
.id_type
= DATA
;
1035 idesc
.id_fix
= NOFIX
;
1036 return (ckinode(dp
, &idesc
, CKI_TRAVERSE
));
1040 clearanentry(struct inodesc
*idesc
)
1042 struct direct
*dirp
= idesc
->id_dirp
;
1044 if (dirp
->d_ino
!= idesc
->id_parent
|| idesc
->id_entryno
< 2) {
1045 idesc
->id_entryno
++;
1049 return (STOP
|FOUND
|ALTERED
);
1053 pinode(fsck_ino_t ino
)
1057 (void) printf(" I=%lu ", (ulong_t
)ino
);
1058 if (ino
< UFSROOTINO
|| ino
> maxino
)
1065 pdinode(struct dinode
*dp
)
1071 (void) printf(" OWNER=");
1072 if ((pw
= getpwuid((int)dp
->di_uid
)) != 0)
1073 (void) printf("%s ", pw
->pw_name
);
1075 (void) printf("%lu ", (ulong_t
)dp
->di_uid
);
1076 (void) printf("MODE=%o\n", dp
->di_mode
);
1078 (void) printf("%s: ", devname
);
1079 (void) printf("SIZE=%lld ", (longlong_t
)dp
->di_size
);
1081 /* ctime() ignores LOCALE, so this is safe */
1082 t
= (time_t)dp
->di_mtime
;
1084 (void) printf("MTIME=%12.12s %4.4s ", p
+ 4, p
+ 20);
1088 blkerror(fsck_ino_t ino
, char *type
, daddr32_t blk
, daddr32_t lbn
)
1090 pfatal("FRAGMENT %d %s I=%u LFN %d", blk
, type
, ino
, lbn
);
1091 (void) printf("\n");
1093 switch (statemap
[ino
] & ~INDELAYD
) {
1097 statemap
[ino
] = FCLEAR
;
1103 statemap
[ino
] = DCLEAR
;
1104 add_orphan_dir(ino
);
1108 statemap
[ino
] = SCLEAR
;
1117 errexit("BAD STATE 0x%x TO BLKERR\n", statemap
[ino
]);
1123 * allocate an unused inode
1126 allocino(fsck_ino_t request
, int type
)
1130 struct cg
*cgp
= &cgrp
;
1135 if (debug
&& (request
!= 0) && (request
!= UFSROOTINO
))
1136 errexit("assertion failed: allocino() asked for "
1137 "inode %d instead of 0 or %d",
1138 (int)request
, (int)UFSROOTINO
);
1141 * We know that we're only going to get requests for UFSROOTINO
1142 * or 0. If UFSROOTINO is wanted, then it better be available
1143 * because our caller is trying to recreate the root directory.
1144 * If we're asked for 0, then which one we return doesn't matter.
1145 * We know that inodes 0 and 1 are never valid to return, so we
1146 * start at the lowest-legal inode number.
1148 * If we got a request for UFSROOTINO, then request != 0, and
1149 * this pair of conditionals is the only place that treats
1150 * UFSROOTINO specially.
1153 request
= UFSROOTINO
;
1154 else if (statemap
[request
] != USTATE
)
1158 * Doesn't do wrapping, since we know we started at
1159 * the smallest inode.
1161 for (ino
= request
; ino
< maxino
; ino
++)
1162 if (statemap
[ino
] == USTATE
)
1168 * In pass5, we'll calculate the bitmaps and counts all again from
1169 * scratch and do a comparison, but for that to work the cg has
1170 * to know what in-memory changes we've made to it. If we have
1171 * trouble reading the cg, cg_sanity() should kick it out so
1172 * we can skip explicit i/o error checking here.
1174 cg
= itog(&sblock
, ino
);
1175 (void) getblk(&cgblk
, cgtod(&sblock
, cg
), (size_t)sblock
.fs_cgsize
);
1176 err
= cg_sanity(cgp
, cg
);
1178 pfatal("CG %d: %s\n", cg
, err
);
1180 if (reply("REPAIR") == 0)
1181 errexit("Program terminated.");
1184 setbit(cg_inosused(cgp
), ino
% sblock
.fs_ipg
);
1185 cgp
->cg_cs
.cs_nifree
--;
1192 * Don't currently support IFATTRDIR or any of the other
1193 * types, as they aren't needed.
1195 switch (type
& IFMT
) {
1197 statemap
[ino
] = DSTATE
;
1198 cgp
->cg_cs
.cs_ndir
++;
1202 statemap
[ino
] = FSTATE
;
1206 * Pretend nothing ever happened. This clears the
1207 * dirty flag, among other things.
1211 (void) printf("allocino: unknown type 0%o\n",
1217 * We're allocating what should be a completely-unused inode,
1218 * so make sure we don't inherit anything from any previous
1222 (void) memset((void *)dp
, 0, sizeof (struct dinode
));
1223 dp
->di_db
[0] = allocblk(1);
1224 if (dp
->di_db
[0] == 0) {
1225 statemap
[ino
] = USTATE
;
1228 dp
->di_mode
= (mode_t
)type
;
1230 dp
->di_atime
= (time32_t
)t
;
1231 dp
->di_ctime
= dp
->di_atime
;
1232 dp
->di_mtime
= dp
->di_ctime
;
1233 dp
->di_size
= (u_offset_t
)sblock
.fs_fsize
;
1234 dp
->di_blocks
= btodb(sblock
.fs_fsize
);
1241 * Release some or all of the blocks of an inode.
1242 * Only truncates down. Assumes new_length is appropriately aligned
1243 * to a block boundary (or a directory block boundary, if it's a
1246 * If this is a directory, discard all of its contents first, so
1247 * we don't create a bunch of orphans that would need another fsck
1250 * Even if truncating to zero length, the inode remains allocated.
1253 truncino(fsck_ino_t ino
, offset_t new_length
, int update
)
1255 struct inodesc idesc
;
1256 struct inoinfo
*iip
;
1265 mode
= (dp
->di_mode
& IFMT
);
1266 isdir
= (mode
== IFDIR
) || (mode
== IFATTRDIR
);
1267 islink
= (mode
== IFLNK
);
1271 * Go with the parent we found by chasing references,
1272 * if we've gotten that far. Otherwise, use what the
1273 * directory itself claims. If there's no ``..'' entry
1274 * in it, give up trying to get the link counts right.
1276 if (update
== TI_NOPARENT
) {
1279 iip
= getinoinfo(ino
);
1281 parent
= iip
->i_parent
;
1283 parent
= lookup_dotdot_ino(ino
);
1286 * Make sure that the claimed
1287 * parent actually has a
1290 dp
= ginode(parent
);
1291 idesc
.id_name
= lfname
;
1292 idesc
.id_type
= DATA
;
1293 idesc
.id_func
= findino
;
1294 idesc
.id_number
= ino
;
1295 idesc
.id_fix
= DONTKNOW
;
1296 if ((ckinode(dp
, &idesc
,
1297 CKI_TRAVERSE
) & FOUND
) == 0)
1303 mark_delayed_inodes(ino
, numfrags(&sblock
, new_length
));
1305 dp
= ginode(parent
);
1306 LINK_RANGE(message
, dp
->di_nlink
, -1);
1307 if (message
!= NULL
) {
1308 LINK_CLEAR(message
, parent
, dp
->di_mode
,
1310 if (statemap
[parent
] == USTATE
)
1311 goto no_parent_update
;
1313 TRACK_LNCNTP(parent
, lncntp
[parent
]--);
1314 } else if ((mode
== IFDIR
) && (parent
== 0)) {
1316 * Currently don't have a good way to
1317 * handle this, so throw up our hands.
1318 * However, we know that we can still
1319 * do some good if we continue, so
1320 * don't actually exit yet.
1322 * We don't do it for attrdirs,
1323 * because there aren't link counts
1324 * between them and their parents.
1326 pwarn("Could not determine former parent of "
1327 "inode %d, link counts are possibly\n"
1328 "incorrect. Please rerun fsck(1M) to "
1334 * ...else if it's a directory with parent == -1, then
1335 * we've not gotten far enough to know connectivity,
1336 * and it'll get handled automatically later.
1341 init_inodesc(&idesc
);
1342 idesc
.id_type
= ADDR
;
1343 idesc
.id_func
= pass4check
;
1344 idesc
.id_number
= ino
;
1345 idesc
.id_fix
= DONTKNOW
;
1346 idesc
.id_truncto
= howmany(new_length
, sblock
.fs_bsize
);
1348 if (!islink
&& ckinode(dp
, &idesc
, CKI_TRUNCATE
) & ALTERED
)
1352 * This has to be done after ckinode(), so that all of
1353 * the fragments get visited. Note that we assume we're
1354 * always truncating to a block boundary, rather than a
1355 * fragment boundary.
1358 dp
->di_size
= new_length
;
1361 * Clear now-obsolete pointers.
1363 for (dblk
= idesc
.id_truncto
+ 1; dblk
< NDADDR
; dblk
++) {
1364 dp
->di_db
[dblk
] = 0;
1367 ilevel
= get_indir_offsets(-1, idesc
.id_truncto
, NULL
, NULL
);
1368 for (ilevel
++; ilevel
< NIADDR
; ilevel
++) {
1369 dp
->di_ib
[ilevel
] = 0;
1376 * Release an inode's resources, then release the inode itself.
1379 freeino(fsck_ino_t ino
, int update_parent
)
1388 * We need to make sure that the file is really a large file.
1389 * Everything bigger than UFS_MAXOFFSET_T is treated as a file with
1390 * negative size, which shall be cleared. (see verify_inode() in
1393 if (dp
->di_size
> (u_offset_t
)MAXOFF_T
&&
1394 dp
->di_size
<= (u_offset_t
)UFS_MAXOFFSET_T
&&
1396 (dp
->di_mode
& IFMT
) != IFBLK
&&
1397 (dp
->di_mode
& IFMT
) != IFCHR
) {
1400 truncino(ino
, 0, update_parent
);
1403 if ((dp
->di_mode
& IFMT
) == IFATTRDIR
) {
1404 clearshadow(ino
, &attrclientinfo
);
1410 statemap
[ino
] = USTATE
;
1413 * Keep the disk in sync with us so that pass5 doesn't get
1414 * upset about spurious inconsistencies.
1416 cg
= itog(&sblock
, ino
);
1417 (void) getblk(&cgblk
, (diskaddr_t
)cgtod(&sblock
, cg
),
1418 (size_t)sblock
.fs_cgsize
);
1419 cgp
= cgblk
.b_un
.b_cg
;
1420 clrbit(cg_inosused(cgp
), ino
% sblock
.fs_ipg
);
1421 cgp
->cg_cs
.cs_nifree
+= 1;
1423 sblock
.fs_cstotal
.cs_nifree
+= 1;
1428 init_inoinfo(struct inoinfo
*inp
, struct dinode
*dp
, fsck_ino_t inum
)
1430 inp
->i_parent
= ((inum
== UFSROOTINO
) ? UFSROOTINO
: (fsck_ino_t
)0);
1431 inp
->i_dotdot
= (fsck_ino_t
)0;
1432 inp
->i_isize
= (offset_t
)dp
->di_size
;
1433 inp
->i_blkssize
= (NDADDR
+ NIADDR
) * sizeof (daddr32_t
);
1434 inp
->i_extattr
= dp
->di_oeftflag
;
1435 (void) memmove((void *)&inp
->i_blks
[0], (void *)&dp
->di_db
[0],
1440 * Return the inode number in the ".." entry of the provided
1444 lookup_dotdot_ino(fsck_ino_t ino
)
1446 struct inodesc idesc
;
1448 init_inodesc(&idesc
);
1449 idesc
.id_type
= DATA
;
1450 idesc
.id_func
= findino
;
1451 idesc
.id_name
= "..";
1452 idesc
.id_number
= ino
;
1453 idesc
.id_fix
= NOFIX
;
1455 if ((ckinode(ginode(ino
), &idesc
, CKI_TRAVERSE
) & FOUND
) != 0) {
1456 return (idesc
.id_parent
);
1463 * Convenience wrapper around ckinode(findino()).
1466 lookup_named_ino(fsck_ino_t dir
, caddr_t name
)
1468 struct inodesc idesc
;
1470 init_inodesc(&idesc
);
1471 idesc
.id_type
= DATA
;
1472 idesc
.id_func
= findino
;
1473 idesc
.id_name
= name
;
1474 idesc
.id_number
= dir
;
1475 idesc
.id_fix
= NOFIX
;
1477 if ((ckinode(ginode(dir
), &idesc
, CKI_TRAVERSE
) & FOUND
) != 0) {
1478 return (idesc
.id_parent
);
1485 * Marks inodes that are being orphaned and might need to be reconnected
1486 * by pass4(). The inode we're traversing is the directory whose
1487 * contents will be reconnected later. id_parent is the lfn at which
1488 * to start looking at said contents.
1491 mark_a_delayed_inode(struct inodesc
*idesc
)
1493 struct direct
*dirp
= idesc
->id_dirp
;
1495 if (idesc
->id_lbn
< idesc
->id_parent
) {
1499 if (dirp
->d_ino
!= 0 &&
1500 strcmp(dirp
->d_name
, ".") != 0 &&
1501 strcmp(dirp
->d_name
, "..") != 0) {
1502 statemap
[dirp
->d_ino
] &= ~INFOUND
;
1503 statemap
[dirp
->d_ino
] |= INDELAYD
;
1510 mark_delayed_inodes(fsck_ino_t ino
, daddr32_t first_lfn
)
1513 struct inodesc idelayed
;
1515 init_inodesc(&idelayed
);
1516 idelayed
.id_number
= ino
;
1517 idelayed
.id_type
= DATA
;
1518 idelayed
.id_fix
= NOFIX
;
1519 idelayed
.id_func
= mark_a_delayed_inode
;
1520 idelayed
.id_parent
= first_lfn
;
1521 idelayed
.id_entryno
= 2;
1524 (void) ckinode(dp
, &idelayed
, CKI_TRAVERSE
);
1528 * Clear the i_oeftflag/extended attribute pointer from INO.
1531 clearattrref(fsck_ino_t ino
)
1537 if (dp
->di_oeftflag
== 0)
1538 (void) printf("clearattref: no attr to clear on %d\n",
1542 dp
->di_oeftflag
= 0;