2 * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
5 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
6 /* All Rights Reserved */
9 * Copyright (c) 1980, 1986, 1990 The Regents of the University of California.
10 * All rights reserved.
12 * Redistribution and use in source and binary forms are permitted
13 * provided that: (1) source distributions retain this entire copyright
14 * notice and comment, and (2) distributions including binaries display
15 * the following acknowledgement: ``This product includes software
16 * developed by the University of California, Berkeley and its contributors''
17 * in the documentation or other materials provided with the distribution
18 * and in all advertising materials mentioning features or use of this
19 * software. Neither the name of the University nor the names of its
20 * contributors may be used to endorse or promote products derived
21 * from this software without specific prior written permission.
22 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
23 * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
24 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
34 #include <sys/param.h>
35 #include <sys/types.h>
36 #include <sys/sysmacros.h>
37 #include <sys/mntent.h>
38 #include <sys/vnode.h>
39 #include <sys/fs/ufs_inode.h>
40 #include <sys/fs/ufs_fs.h>
42 #include <sys/fs/ufs_fsdir.h>
/*
 * Prototypes for the file-local (static) helper functions used in this
 * file.  All of them have internal linkage.
 */
47 static int get_indir_offsets(int, daddr_t
, int *, int *);
48 static int clearanentry(struct inodesc
*);
49 static void pdinode(struct dinode
*);
50 static void inoflush(void);
51 static void mark_delayed_inodes(fsck_ino_t
, daddr32_t
);
52 static int iblock(struct inodesc
*, int, uoff_t
, enum cki_action
);
53 static struct inoinfo
*search_cache(struct inoinfo
*, fsck_ino_t
);
54 static int ckinode_common(struct dinode
*, struct inodesc
*, enum cki_action
);
55 static int lookup_dotdot_ino(fsck_ino_t
);
58 * ckinode() essentially traverses the blocklist of the provided
59 * inode. For each block either the caller-supplied callback (id_func
60 * in the provided struct inodesc) or dirscan() is invoked. Which is
61 * chosen is controlled by what type of traversal was requested
62 * (id_type) - if it was for an ADDR or ACL, use the callback,
63 * otherwise it is assumed to be DATA (i.e., a directory) whose
64 * contents need to be scanned.
66 * Note that a directory inode can get passed in with a type of ADDR;
67 * the type field is orthogonal to the IFMT value. This is so that
68 * the file aspects (no duplicate blocks, etc) of a directory can be
69 * verified just like is done for any other file, or the actual
70 * contents can be scanned so that connectivity and such can be
73 * The traversal is controlled by flags in the return value of
74 * dirscan() or the callback. Five flags are defined, STOP, SKIP,
75 * KEEPON, ALTERED, and FOUND. Their semantics are:
77 * STOP - no further processing of this inode is desired/possible/
78 * feasible/etc. This can mean that whatever the scan
79 * was searching for was found, or a serious
80 * inconsistency was encountered, or anything else
83 * SKIP - something that made it impossible to continue was
84 * encountered, and the caller should go on to the next
85 * inode. This is more for i/o failures than for
86 * logical inconsistencies. Nothing actually looks for
89 * KEEPON - no more blocks of this inode need to be scanned, but
90 * nothing's wrong, so keep on going with the next
91 * inode. It is similar to STOP, except that
92 * ckinode()'s caller will typically advance to the next
93 * inode for KEEPON, whereas it ceases scanning through
94 * the inodes completely for STOP.
96 * ALTERED - a change was made to the inode. If the caller sees
97 * this set, it should make sure to flush out the
98 * changes. Note that any data blocks read in by the
99 * function need to be marked dirty by it directly;
100 * flushing of those will happen automatically later.
102 * FOUND - whatever was being searched for was located.
103 * Typically combined with STOP to avoid wasting time
104 * doing additional looking.
106 * During a traversal, some state needs to be carried around. At the
107 * least, the callback functions need to know what inode they're
108 * working on, which logical block, and whether or not fixing problems
109 * when they're encountered is desired. Rather than try to guess what
110 * else might be needed (and thus end up passing way more arguments
111 * than is reasonable), all the possibilities have been bundled in
112 * struct inodesc. About half of the fields are specific to directory
113 * traversals, and the rest are pretty much generic to any traversal.
115 * The general fields are:
117 * id_fix What to do when an error is found. Generally, this
118 * is set to DONTKNOW before a traversal. If a
119 * problem is encountered, it is changed to either FIX
120 * or NOFIX by the dofix() query function. If id_fix
121 * has already been set to FIX when dofix() is called, then
122 * it includes the ALTERED flag (see above) in its return
123 * value; the net effect is that the inode's buffer
124 * will get marked dirty and written to disk at some
125 * point. If id_fix is DONTKNOW, then dofix() will
126 * query the user. If it is NOFIX, then dofix()
127 * essentially does nothing. A few routines set NOFIX
128 * as the initial value, as they are performing a best-
129 * effort informational task, rather than an actual
132 * id_func This is the function that will be called for every
133 * logical block in the file (assuming id_type is not
134 * DATA). The logical block may represent a hole, so
135 * the callback needs to be prepared to handle that
136 * case. Its return value is a combination of the flags
137 * described above (SKIP, ALTERED, etc).
139 * id_number The inode number whose block list or data is being
142 * id_parent When id_type is DATA, this is the inode number for
143 * the parent of id_number. Otherwise, it is
144 * available for use as an extra parameter or return
145 * value between the callback and ckinode()'s caller.
146 * Which of those uses, if either, applies is left completely up to
147 * the two routines involved, so nothing can generally
148 * be assumed about the id_parent value for non-DATA
151 * id_lbn This is the current logical block (not fragment)
152 * number being visited by the traversal.
154 * id_blkno This is the physical block corresponding to id_lbn.
156 * id_numfrags This defines how large a block is being processed in
157 * this particular invocation of the callback.
158 * Usually, it will be the same as sblock.fs_frag.
159 * However, if a direct block is being processed and
160 * it is less than a full filesystem block,
161 * id_numfrags will indicate just how many fragments
162 * (starting from id_lbn) are actually part of the
165 * id_truncto The pass 4 callback is used in several places to
166 * free the blocks of a file (the `FILE HAS PROBLEM
167 * FOO; CLEAR?' scenario). This has been generalized
168 * to allow truncating a file to a particular length
169 * rather than always completely discarding it. If
170 * id_truncto is -1, then the entire file is released,
171 * otherwise it is the logical block number to truncate
172 * to. This generalized interface was motivated by a
173 * desire to be able to discard everything after a
174 * hole in a directory, rather than the entire
177 * id_type Selects the type of traversal. DATA for dirscan(),
178 * ADDR or ACL for using the provided callback.
180 * There are several more fields used just for dirscan() traversals:
182 * id_filesize The number of bytes in the overall directory left to
185 * id_loc Byte position within the directory block. Should always
186 * point to the start of a directory entry.
188 * id_entryno Which logical directory entry is being processed (0
189 * is `.', 1 is `..', 2 and on are normal entries).
190 * This field is primarily used to enable special
191 * checks when looking at the first two entries.
193 * The exception (there's always an exception in fsck)
194 * is that in pass 1, it tracks how many fragments are
195 * being used by a particular inode.
197 * id_firsthole The first logical block number that was found to
198 * be zero. As directories are not supposed to have
199 * holes, this marks where a directory should be
200 * truncated down to. A value of -1 indicates that
201 * no holes were found.
203 * id_dirp A pointer to the in-memory copy of the current
204 * directory entry (as identified by id_loc).
206 * id_name This is a directory entry name to either create
207 * (callback is mkentry) or locate (callback is
208 * chgino, findino, or findname).
/*
 * ckinode: entry point for traversing the block list of an inode.
 *
 *	dp	on-disk inode to traverse.
 *	idesc	traversal state.  If id_filesize is zero on entry, it is
 *		initialized from dp->di_size.
 *	action	passed through unchanged to ckinode_common().
 *
 * The file type (di_mode & IFMT) is examined first.  Block and character
 * devices are tested for explicitly; the statement executed for them is
 * not visible in this excerpt.  A symlink whose di_size exceeds
 * MAXPATHLEN is reported with pwarn() and handed to clri() through a
 * private ADDR-type descriptor (id_fix = DONTKNOW).  The visible return
 * path hands back the result of ckinode_common().
 */
211 ckinode(struct dinode
*dp
, struct inodesc
*idesc
, enum cki_action action
)
213 struct inodesc cleardesc
;
216 if (idesc
->id_filesize
== 0)
217 idesc
->id_filesize
= (offset_t
)dp
->di_size
;
220 * Our caller should be filtering out completely-free inodes
221 * (mode == zero), so we'll work on the assumption that what
222 * we're given has some basic validity.
224 * The kernel is inconsistent about MAXPATHLEN including the
225 * trailing \0, so allow the more-generous length for symlinks.
227 mode
= dp
->di_mode
& IFMT
;
228 if (mode
== IFBLK
|| mode
== IFCHR
)
230 if (mode
== IFLNK
&& dp
->di_size
> MAXPATHLEN
) {
231 pwarn("I=%d Symlink longer than supported maximum\n",
233 init_inodesc(&cleardesc
);
234 cleardesc
.id_type
= ADDR
;
235 cleardesc
.id_number
= idesc
->id_number
;
236 cleardesc
.id_fix
= DONTKNOW
;
237 clri(&cleardesc
, "BAD", CLRI_VERBOSE
, CLRI_NOP_CORRUPT
);
240 return (ckinode_common(dp
, idesc
, action
));
244 * This was split out from ckinode() to allow it to be used
245 * without having to pass in kludge flags to suppress the
246 * wrong-for-deletion initialization and irrelevant checks.
247 * This feature is no longer needed, but is being kept in case
248 * the need comes back.
/*
 * ckinode_common: walk the direct and indirect block pointers of an inode.
 *
 * The inode is first copied into a local struct dinode (dino).  The loops
 * read block addresses from that copy and re-fetch the live inode with
 * ginode() when a block is being released.
 *
 * Direct blocks (di_db[0 .. NDADDR-1]):
 *   - id_numfrags is set either from the size of a partial final block
 *     (blkoff() of di_size rounded up to fragments) or to fs_frag.
 *   - A zero entry that still lies inside the file (ndb > 0) is recorded
 *     in id_firsthole if no hole has been recorded yet.
 *   - For entries that are processed, id_blkno is set and either the
 *     id_func callback (ADDR and ACL traversals) or dirscan() is called.
 *   - With CKI_TRUNCATE, a non-negative id_truncto and id_lbn at or past
 *     it, the block is handed to freeblk().
 *
 * Indirect blocks (di_ib[0 .. NIADDR-1]):
 *   - get_indir_offsets() is called for each level; indir_data_blks is
 *     one of its outputs.
 *   - Non-zero entries are passed to iblock() with level i + 1, and may
 *     be freed with freeblk() when truncating.
 *   - A missing indirect block that should hold file data
 *     (indir_data_blks < ndb) sets id_firsthole to indir_data_blks.
 *
 * NOTE(review): several statements of this function (declarations, loop
 * exits, returns) are not visible in this excerpt; confirm the exact
 * control flow against the complete file.
 */
251 ckinode_common(struct dinode
*dp
, struct inodesc
*idesc
,
252 enum cki_action action
)
257 int indir_data_blks
, last_indir_blk
;
260 (void) memmove(&dino
, dp
, sizeof (struct dinode
));
261 ndb
= howmany(dino
.di_size
, (uoff_t
)sblock
.fs_bsize
);
263 for (i
= 0; i
< NDADDR
; i
++) {
265 offset
= blkoff(&sblock
, dino
.di_size
);
266 if ((--ndb
== 0) && (offset
!= 0)) {
268 numfrags(&sblock
, fragroundup(&sblock
, offset
));
270 idesc
->id_numfrags
= sblock
.fs_frag
;
272 if (dino
.di_db
[i
] == 0) {
273 if ((ndb
> 0) && (idesc
->id_firsthole
< 0)) {
274 idesc
->id_firsthole
= i
;
278 idesc
->id_blkno
= dino
.di_db
[i
];
279 if (idesc
->id_type
== ADDR
|| idesc
->id_type
== ACL
)
280 ret
= (*idesc
->id_func
)(idesc
);
282 ret
= dirscan(idesc
);
285 * Need to clear the entry, now that we're done with
286 * it. We depend on freeblk() ignoring a request to
287 * free already-free fragments to handle the problem of
290 if ((action
== CKI_TRUNCATE
) &&
291 (idesc
->id_truncto
>= 0) &&
292 (idesc
->id_lbn
>= idesc
->id_truncto
)) {
293 dp
= ginode(idesc
->id_number
);
295 * The (int) cast is safe, in that if di_size won't
296 * fit, it'll be a multiple of any legal fs_frag,
297 * thus giving a zero result. That value, in turn
298 * means we're doing an entire block.
300 frags
= howmany((int)dp
->di_size
, sblock
.fs_fsize
) %
303 frags
= sblock
.fs_frag
;
304 freeblk(idesc
->id_number
, dp
->di_db
[i
],
306 dp
= ginode(idesc
->id_number
);
317 * indir_data_blks contains the number of data blocks in all
318 * the previous levels for this iteration. E.g., for the
319 * single indirect case (i = 0, di_ib[i] != 0), NDADDR's worth
320 * of blocks have already been covered by the direct blocks
321 * (di_db[]). At the triple indirect level (i = NIADDR - 1),
322 * it is all of the number of data blocks that were covered
323 * by the second indirect, single indirect, and direct block
326 idesc
->id_numfrags
= sblock
.fs_frag
;
327 ndb
= howmany(dino
.di_size
, (uoff_t
)sblock
.fs_bsize
);
328 for (i
= 0; i
< NIADDR
; i
++) {
329 (void) get_indir_offsets(i
, ndb
, &indir_data_blks
,
331 if (dino
.di_ib
[i
] != 0) {
333 * We'll only clear di_ib[i] if the first entry (and
334 * therefore all of them) is to be cleared, since we
335 * only go through this code on the first entry of
336 * each level of indirection. The +1 is to account
337 * for the fact that we don't modify id_lbn until
338 * we actually start processing on a data block.
340 idesc
->id_blkno
= dino
.di_ib
[i
];
341 ret
= iblock(idesc
, i
+ 1,
342 (uoff_t
)howmany(dino
.di_size
,
343 (uoff_t
)sblock
.fs_bsize
) - indir_data_blks
,
345 if ((action
== CKI_TRUNCATE
) &&
346 (idesc
->id_truncto
<= indir_data_blks
) &&
347 ((idesc
->id_lbn
+ 1) >= indir_data_blks
) &&
348 ((idesc
->id_lbn
+ 1) <= last_indir_blk
)) {
349 dp
= ginode(idesc
->id_number
);
350 if (dp
->di_ib
[i
] != 0) {
351 freeblk(idesc
->id_number
, dp
->di_ib
[i
],
359 * Need to know which of the file's logical blocks
360 * reside in the missing indirect block. However, the
361 * precise location is only needed for truncating
362 * directories, and level-of-indirection precision is
363 * sufficient for that.
365 if ((indir_data_blks
< ndb
) &&
366 (idesc
->id_firsthole
< 0)) {
367 idesc
->id_firsthole
= indir_data_blks
;
/*
 * get_indir_offsets: compute logical-block bounds for one level of
 * indirection.
 *
 *	ilevel_wanted	zero-based indirection level of interest.
 *	ndb		number of data blocks in the file.
 *	data_blks	optional output (may be NULL).
 *	last_blk	optional output (may be NULL), referenced below;
 *			its declaration line is not visible here.
 *
 * For each level the loop derives dblks, the number of data blocks
 * covered by the direct blocks and all lower indirection levels, and
 * lblk, dblks plus the span of the current level minus one.  The span is
 * NINDIR(&sblock) raised to the level's power.  Any level outside the
 * three handled cases sets exitstat to EXERRFATAL and calls errexit().
 * When the loop reaches ilevel_wanted, the non-NULL outputs are filled
 * in.  NOTE(review): the assignments themselves and the return values
 * are not visible in this excerpt; presumably dblks and lblk are stored.
 */
375 get_indir_offsets(int ilevel_wanted
, daddr_t ndb
, int *data_blks
,
382 for (ilevel
= 0; ilevel
< NIADDR
; ilevel
++) {
386 lblk
= dblks
+ NINDIR(&sblock
) - 1;
389 dblks
= NDADDR
+ NINDIR(&sblock
);
390 lblk
= dblks
+ (NINDIR(&sblock
) * NINDIR(&sblock
)) - 1;
393 dblks
= NDADDR
+ NINDIR(&sblock
) +
394 (NINDIR(&sblock
) * NINDIR(&sblock
));
395 lblk
= dblks
+ (NINDIR(&sblock
) * NINDIR(&sblock
) *
396 NINDIR(&sblock
)) - 1;
399 exitstat
= EXERRFATAL
;
401 * Translate from zero-based array to
402 * one-based human-style counting.
404 errexit("panic: indirection level %d not 1, 2, or 3",
409 if (dblks
< ndb
&& ndb
<= lblk
)
412 if (ilevel
== ilevel_wanted
) {
413 if (data_blks
!= NULL
)
415 if (last_blk
!= NULL
)
/*
 * iblock: process one indirect block of the inode being traversed.
 *
 *	idesc	traversal state; id_blkno names the indirect block on entry.
 *	ilevel	level of indirection of that block.
 *	iblks	count of file blocks not yet accounted for.
 *	action	traversal mode; CKI_TRUNCATE additionally frees blocks.
 *
 * Visible behaviour:
 *   - The callback is selected from id_type; an unknown type is fatal
 *     via errexit().
 *   - The block address is validated with chkrange(); a failure returns
 *     STOP for ACL traversals and SKIP otherwise.
 *   - The block is read with getdatablk() and b_errs is checked.
 *   - fsbperindirb is built up as a power of NINDIR(&sblock); the loop
 *     below subtracts it from iblks once per pointer visited.
 *   - nif = howmany(iblks, fsbperindirb), clamped to NINDIR(&sblock).
 *   - First pass (only for the pass1check and pass3bcheck callbacks):
 *     non-zero pointers at index nif and beyond are reported as
 *     "PARTIALLY TRUNCATED INODE"; when dofix() agrees they are freed
 *     and zeroed, after which the buffer is flushed.
 *   - Second pass: for each non-zero pointer below nif, id_blkno is set
 *     and iblock() recurses; when truncating past id_truncto the pointed-to
 *     block is freed.  STOP from a callee ends the scan unless truncating.
 *     For a zero pointer, id_firsthole is recorded if id_lbn < last_lbn
 *     and no hole is known yet; for DATA traversals the function then
 *     returns (n & ~KEEPON) | STOP.
 *
 * NOTE(review): the non-recursive callback invocation, several
 * declarations and the final return are not visible in this excerpt.
 */
424 iblock(struct inodesc
*idesc
, int ilevel
, uoff_t iblks
,
425 enum cki_action action
)
429 int (*func
)(struct inodesc
*) = NULL
;
437 switch (idesc
->id_type
) {
439 func
= idesc
->id_func
;
440 if (((n
= (*func
)(idesc
)) & KEEPON
) == 0)
444 func
= idesc
->id_func
;
450 errexit("unknown inodesc type %d in iblock()", idesc
->id_type
);
453 if (chkrange(idesc
->id_blkno
, idesc
->id_numfrags
)) {
454 return ((idesc
->id_type
== ACL
) ? STOP
: SKIP
);
457 bp
= getdatablk(idesc
->id_blkno
, (size_t)sblock
.fs_bsize
);
458 if (bp
->b_errs
!= 0) {
465 * Trivia note: the BSD fsck has the number of bytes remaining
466 * as the third argument to iblock(), so the equivalent of
467 * fsbperindirb starts at fs_bsize instead of one. We're
468 * working in units of filesystem blocks here, not bytes or
471 for (fsbperindirb
= 1, i
= 0; i
< ilevel
; i
++) {
472 fsbperindirb
*= (uoff_t
)NINDIR(&sblock
);
475 * nif indicates the next "free" pointer (as an array index) in this
476 * indirect block, based on counting the blocks remaining in the
477 * file after subtracting all previously processed blocks.
478 * This figure is based on the size field of the inode.
480 * Note that in normal operation, nif may initially be calculated
481 * as larger than the number of pointers in this block (as when
482 * there are more indirect blocks following); if that is
483 * the case, nif is limited to the max number of pointers per
486 * Also note that if an inode is inconsistent (has more blocks
487 * allocated to it than the size field would indicate), the sweep
488 * through any indirect blocks directly pointed at by the inode
489 * continues. Since the block offset of any data blocks referenced
490 * by these indirect blocks is greater than the size of the file,
491 * the index nif may be computed as a negative value.
492 * In this case, we reset nif to indicate that all pointers in
493 * this retrieval block should be zeroed and the resulting
494 * unreferenced data and/or retrieval blocks will be recovered
495 * through garbage collection later.
497 nif
= (offset_t
)howmany(iblks
, fsbperindirb
);
498 if (nif
> NINDIR(&sblock
))
499 nif
= NINDIR(&sblock
);
503 * first pass: all "free" retrieval pointers (from [nif] thru
504 * the end of the indirect block) should be zero. (This
505 * assertion does not hold for directories, which may be
506 * truncated without releasing their allocated space)
508 if (nif
< NINDIR(&sblock
) && (idesc
->id_func
== pass1check
||
509 idesc
->id_func
== pass3bcheck
)) {
510 for (i
= nif
; i
< NINDIR(&sblock
); i
++) {
511 if (bp
->b_un
.b_indir
[i
] == 0)
513 (void) sprintf(buf
, "PARTIALLY TRUNCATED INODE I=%lu",
514 (ulong_t
)idesc
->id_number
);
517 } else if (dofix(idesc
, buf
)) {
518 freeblk(idesc
->id_number
,
521 bp
->b_un
.b_indir
[i
] = 0;
525 flush(fswritefd
, bp
);
528 * second pass: all retrieval pointers referring to blocks within
529 * a valid range [0..filesize] (both indirect and data blocks)
530 * are examined in the same manner as ckinode() checks the
531 * direct blocks in the inode. Sweep through from
532 * the first pointer in this retrieval block to [nif-1].
534 last_lbn
= howmany(idesc
->id_filesize
, sblock
.fs_bsize
);
535 for (i
= 0; i
< nif
; i
++) {
538 if (bp
->b_un
.b_indir
[i
] != 0) {
539 idesc
->id_blkno
= bp
->b_un
.b_indir
[i
];
541 n
= iblock(idesc
, ilevel
, iblks
, action
);
543 * Each iteration decreases "remaining block
544 * count" by the number of blocks accessible
545 * by a pointer at this indirect block level.
547 iblks
-= fsbperindirb
;
550 * If we're truncating, func will discard
551 * the data block for us.
556 if ((action
== CKI_TRUNCATE
) &&
557 (idesc
->id_truncto
>= 0) &&
558 (idesc
->id_lbn
>= idesc
->id_truncto
)) {
559 freeblk(idesc
->id_number
, bp
->b_un
.b_indir
[i
],
564 * Note that truncation never gets STOP back
565 * under normal circumstances. Abnormal would
566 * be a bad acl short-circuit in iblock() or
567 * an out-of-range failure in pass4check().
568 * We still want to keep going when truncating
569 * under those circumstances, since the whole
570 * point of truncating is to get rid of all
573 if ((n
& STOP
) && (action
!= CKI_TRUNCATE
)) {
578 if ((idesc
->id_lbn
< last_lbn
) &&
579 (idesc
->id_firsthole
< 0)) {
580 idesc
->id_firsthole
= idesc
->id_lbn
;
582 if (idesc
->id_type
== DATA
) {
584 * No point in continuing in the indirect
585 * blocks of a directory, since they'll just
589 return ((n
& ~KEEPON
) | STOP
);
599 * Check that a block is a legal block number.
600 * Return 0 if in range, 1 if out of range.
/*
 * chkrange: validate a run of cnt fragments starting at fragment blk.
 *
 *	blk	first fragment number of the run.
 *	cnt	number of fragments in the run.
 *
 * Three groups of checks are visible:
 *   1. cnt and blk must be positive, blk must be below maxfsblock, and
 *      the run must not extend past maxfsblock ("Bad fragment range").
 *   2. cnt must not exceed fs_frag and the run must not cross a block
 *      boundary, i.e. fragnum(blk) + cnt <= fs_frag ("Bad fragment size").
 *   3. Within cylinder group c = dtog(blk): a run starting below
 *      cgdmin() must not extend past cgsblock(), and otherwise it must
 *      not extend past cgbase() of the next group.
 *
 * NOTE(review): the return statements and the conditions guarding the
 * diagnostic output are not visible in this excerpt.
 */
603 chkrange(daddr32_t blk
, int cnt
)
607 if (cnt
<= 0 || blk
<= 0 || ((unsigned)blk
>= (unsigned)maxfsblock
) ||
608 ((cnt
- 1) > (maxfsblock
- blk
))) {
611 "Bad fragment range: should be 1 <= %d..%d < %d\n",
612 blk
, blk
+ cnt
, maxfsblock
);
615 if ((cnt
> sblock
.fs_frag
) ||
616 ((fragnum(&sblock
, blk
) + cnt
) > sblock
.fs_frag
)) {
618 (void) printf("Bad fragment size: size %d\n", cnt
);
621 c
= dtog(&sblock
, blk
);
622 if (blk
< cgdmin(&sblock
, c
)) {
623 if ((unsigned)(blk
+ cnt
) > (unsigned)cgsblock(&sblock
, c
)) {
626 "Bad fragment position: %d..%d spans start of cg metadata\n",
631 if ((unsigned)(blk
+ cnt
) > (unsigned)cgbase(&sblock
, c
+1)) {
634 "Bad frag pos: %d..%d crosses end of cg\n",
643 * General purpose interface for reading inodes.
647 * Note that any call to ginode() can potentially invalidate any
648 * dinode pointers previously acquired from it. To avoid pain,
649 * make sure to always call inodirty() immediately after modifying
650 * an inode, if there's any chance of ginode() being called after
651 * that. Also, always call ginode() right before you need to access
652 * an inode, so that there won't be any surprises from functions
653 * called between the previous ginode() invocation and the dinode
656 * Despite all that, we aren't doing the amount of i/o that's implied,
657 * as we use the buffer cache that getdatablk() and friends maintain.
/*
 * ginode: fetch the in-memory copy of inode inum.
 *
 * startinum is the number of the first inode held in the currently
 * loaded inode block; -1 means that nothing has been loaded yet.
 *
 * An inum below UFSROOTINO or above maxino is fatal (errexit()).  If no
 * block is loaded, or inum lies at or beyond startinum + INOPB(&sblock),
 * the block containing the inode (itod()) is read with getdatablk() and
 * the inode number is rounded down to a multiple of INOPB(&sblock).
 * dp is then pointed at entry inum % INOPB(&sblock) of that block, and
 * the short uid/gid fields are copied over di_uid/di_gid whenever they
 * differ from UID_LONG/GID_LONG.
 *
 * NOTE(review): the remaining range test, the assignment target of the
 * rounded inode number and the return statement are not visible in this
 * excerpt.
 */
659 static fsck_ino_t startinum
= -1;
662 ginode(fsck_ino_t inum
)
667 if (inum
< UFSROOTINO
|| inum
> maxino
) {
668 errexit("bad inode number %d to ginode\n", inum
);
670 if (startinum
== -1 ||
673 inum
>= (fsck_ino_t
)(startinum
+ (fsck_ino_t
)INOPB(&sblock
))) {
674 iblk
= itod(&sblock
, inum
);
679 * We don't check for errors here, because we can't
680 * tell our caller about it, and the zeros that will
681 * be in the buffer are just as good as anything we
684 pbp
= getdatablk(iblk
, (size_t)sblock
.fs_bsize
);
686 (fsck_ino_t
)((inum
/ INOPB(&sblock
)) * INOPB(&sblock
));
688 dp
= &pbp
->b_un
.b_dinode
[inum
% INOPB(&sblock
)];
689 if (dp
->di_suid
!= UID_LONG
)
690 dp
->di_uid
= dp
->di_suid
;
691 if (dp
->di_sgid
!= GID_LONG
)
692 dp
->di_gid
= dp
->di_sgid
;
697 * Special purpose version of ginode used to optimize first pass
698 * over all the inodes in numerical order. It bypasses the buffer
699 * system used by ginode(), etc in favour of reading the bulk of a
700 * cg's inodes at one time.
/*
 * File-scope state for the sequential bulk inode reader, followed by
 * getnextinode() itself.
 *
 * getnextinode(inum) must be called with consecutive inode numbers:
 * inum has to equal nextino (which is then incremented) and must not
 * exceed maxino, otherwise errexit() is called.  When inum reaches
 * lastinum the next chunk of inodes is read: the disk address is derived
 * from lastinum via itod()/fsbtodb(), and the read size is partialsize
 * when readcnt is a multiple of readpercg, or inobufsize otherwise.
 * Both sizes are checked against SIZE_MAX before being narrowed to
 * size_t.  The chunk is read into inodebuf with fsck_bread(), whose
 * result is deliberately ignored (see the comment above that call).
 *
 * NOTE(review): the updates of readcnt, currentdblk and lastsize, and
 * the return statement, are not visible in this excerpt.
 */
702 static fsck_ino_t nextino
, lastinum
;
703 static int64_t readcnt
, readpercg
, fullcnt
, inobufsize
;
704 static int64_t partialcnt
, partialsize
;
705 static size_t lastsize
;
706 static struct dinode
*inodebuf
;
707 static diskaddr_t currentdblk
;
708 static struct dinode
*currentinode
;
711 getnextinode(fsck_ino_t inum
)
715 static struct dinode
*dp
;
717 if (inum
!= nextino
++ || inum
> maxino
)
718 errexit("bad inode number %d to nextinode\n", inum
);
721 * Will always go into the if() the first time we're called,
722 * so dp will always be valid.
724 if (inum
>= lastinum
) {
726 dblk
= fsbtodb(&sblock
, itod(&sblock
, lastinum
));
728 if (readcnt
% readpercg
== 0) {
729 if (partialsize
> SIZE_MAX
)
731 "Internal error: partialsize overflow");
732 size
= (size_t)partialsize
;
733 lastinum
+= partialcnt
;
735 if (inobufsize
> SIZE_MAX
)
736 errexit("Internal error: inobufsize overflow");
737 size
= (size_t)inobufsize
;
741 * If fsck_bread() returns an error, it will already have
742 * zeroed out the buffer, so we do not need to do so here.
744 (void) fsck_bread(fsreadfd
, (caddr_t
)inodebuf
, dblk
, size
);
753 * Reread the current getnext() buffer. This allows for changing inodes
754 * other than the current one via ginode()/inodirty()/inoflush().
756 * Just reuses all the interesting variables that getnextinode() set up
757 * last time it was called. This shouldn't get called often, so we don't
758 * try to figure out if the caller's actually touched an inode in the
759 * range we have cached. There could have been an arbitrary number of
765 if (inodebuf
== NULL
) {
770 (void) fsck_bread(fsreadfd
, (caddr_t
)inodebuf
, currentdblk
, lastsize
);
771 return (currentinode
);
781 inobufsize
= blkroundup(&sblock
, INOBUFSIZE
);
782 fullcnt
= inobufsize
/ sizeof (struct dinode
);
783 readpercg
= sblock
.fs_ipg
/ fullcnt
;
784 partialcnt
= sblock
.fs_ipg
% fullcnt
;
785 partialsize
= partialcnt
* sizeof (struct dinode
);
786 if (partialcnt
!= 0) {
789 partialcnt
= fullcnt
;
790 partialsize
= inobufsize
;
792 if (inodebuf
== NULL
&&
793 (inodebuf
= (struct dinode
*)malloc((unsigned)inobufsize
)) == NULL
)
794 errexit("Cannot allocate space for inode buffer\n");
795 while (nextino
< UFSROOTINO
)
796 (void) getnextinode(nextino
);
802 if (inodebuf
!= NULL
) {
803 free((void *)inodebuf
);
809 * Routines to maintain information about directory inodes.
810 * This is built during the first pass and used during the
811 * second and third passes.
813 * Enter inodes into the cache.
/*
 * cacheino: add inode inum (described by dp) to the directory inode cache.
 *
 * A struct inoinfo is allocated with room for NDADDR + NIADDR block
 * addresses (the structure itself accounts for one of them, hence the
 * "blks - 1").  After init_inoinfo() fills it in, the entry is pushed
 * onto the head of hash chain inphead[inum % numdirs] and its i_number
 * is set.  The entry is also appended to the inpsort array, which is
 * grown with reallocarray() when inplast has reached listmax.
 * Allocation failures are fatal via errexit().
 *
 * NOTE(review): the NULL test after malloc(), the chain-head update and
 * the adjustment of listmax are not visible in this excerpt.
 */
816 cacheino(struct dinode
*dp
, fsck_ino_t inum
)
819 struct inoinfo
**inpp
;
822 blks
= NDADDR
+ NIADDR
;
823 inp
= (struct inoinfo
*)
824 malloc(sizeof (*inp
) + (blks
- 1) * sizeof (daddr32_t
));
826 errexit("Cannot increase directory list\n");
827 init_inoinfo(inp
, dp
, inum
); /* doesn't touch i_nextlist or i_number */
828 inpp
= &inphead
[inum
% numdirs
];
829 inp
->i_nextlist
= *inpp
;
831 inp
->i_number
= inum
;
832 if (inplast
== listmax
) {
834 inpsort
= reallocarray(inpsort
, listmax
,
835 sizeof (struct inoinfo
*));
837 errexit("cannot increase directory list");
839 inpsort
[inplast
++] = inp
;
843 * Look up an inode cache structure.
/*
 * getinoinfo: look up inum in the directory inode cache by searching
 * hash chain inphead[inum % numdirs] with search_cache().
 * NOTE(review): the return statement is not visible in this excerpt.
 */
846 getinoinfo(fsck_ino_t inum
)
850 inp
= search_cache(inphead
[inum
% numdirs
], inum
);
855 * Determine whether inode is in cache.
/*
 * inocached: return non-zero if inum has an entry on hash chain
 * inphead[inum % numdirs] of the directory inode cache, zero otherwise.
 */
858 inocached(fsck_ino_t inum
)
860 return (search_cache(inphead
[inum
% numdirs
], inum
) != NULL
);
864 * Clean up all of the inode cache structures.
869 struct inoinfo
**inpp
;
873 for (inpp
= &inpsort
[inplast
- 1]; inpp
>= inpsort
; inpp
--) {
874 free((void *)(*inpp
));
876 free((void *)inphead
);
877 free((void *)inpsort
);
878 inphead
= inpsort
= NULL
;
882 * Routines to maintain information about acl inodes.
883 * This is built during the first pass and used during the
884 * second and third passes.
886 * Enter acl inodes into the cache.
/*
 * cacheacl: add ACL inode inum (described by dp) to the ACL inode cache.
 *
 * A struct inoinfo with room for NDADDR + NIADDR block addresses is
 * allocated and pushed onto the head of hash chain
 * aclphead[inum % numacls].  Its i_number, i_isize (from di_size) and
 * i_blkssize are set, and i_blkssize bytes starting at dp->di_db[0] are
 * copied into i_blks.  The entry is also appended to the aclpsort array,
 * which is grown with reallocarray() when aclplast has reached aclmax;
 * failure to grow it is fatal via errexit().
 *
 * NOTE(review): the check of the malloc() result, the chain-head update
 * and the adjustment of aclmax are not visible in this excerpt.
 */
889 cacheacl(struct dinode
*dp
, fsck_ino_t inum
)
891 struct inoinfo
*aclp
;
892 struct inoinfo
**aclpp
;
895 blks
= NDADDR
+ NIADDR
;
896 aclp
= (struct inoinfo
*)
897 malloc(sizeof (*aclp
) + (blks
- 1) * sizeof (daddr32_t
));
900 aclpp
= &aclphead
[inum
% numacls
];
901 aclp
->i_nextlist
= *aclpp
;
903 aclp
->i_number
= inum
;
904 aclp
->i_isize
= (offset_t
)dp
->di_size
;
905 aclp
->i_blkssize
= (size_t)(blks
* sizeof (daddr32_t
));
906 (void) memmove(&aclp
->i_blks
[0], &dp
->di_db
[0], aclp
->i_blkssize
);
907 if (aclplast
== aclmax
) {
909 aclpsort
= reallocarray(aclpsort
, aclmax
,
910 sizeof (struct inoinfo
*));
911 if (aclpsort
== NULL
)
912 errexit("cannot increase acl list");
914 aclpsort
[aclplast
++] = aclp
;
919 * Generic cache search function.
920 * ELEMENT is the first entry in a hash chain (the caller is expected
921 * to have done the initial bucket lookup). KEY is what's being
924 * Returns a pointer to the entry if it is found, NULL otherwise.
/*
 * search_cache: walk a hash chain looking for inode number key.
 *
 *	element	first entry of the chain to search (may be NULL).
 *	key	inode number to look for.
 *
 * The chain is followed through i_nextlist and each entry's i_number is
 * compared with key.  NOTE(review): the statement taken on a match and
 * the final return are not visible in this excerpt.
 */
926 static struct inoinfo
*
927 search_cache(struct inoinfo
*element
, fsck_ino_t key
)
929 while (element
!= NULL
) {
930 if (element
->i_number
== key
)
932 element
= element
->i_nextlist
;
948 flush(fswritefd
, pbp
);
952 * Interactive wrapper for freeino(), for those times when we're
953 * not sure if we should throw something away.
/*
 * clri: offer to clear the inode named by idesc->id_number.
 *
 *	idesc		identifies the inode (id_number).
 *	type		text prefixed to the warning message.
 *	verbose		CLRI_VERBOSE prints the warning and pinode() output.
 *	corrupting	CLRI_NOP_OK selects TI_NOPARENT for freeino(),
 *			anything else selects TI_PARENT; CLRI_NOP_CORRUPT
 *			is also tested when the inode is not cleared.
 *
 * An inode whose statemap entry is USTATE is tested for first (the
 * statement executed for it is not visible here).  When preening, or when
 * the user answers the "CLEAR" prompt affirmatively, the inode is
 * released with freeino(), " (CLEARED)" is printed and the inode is
 * passed to remove_orphan_dir().
 *
 * NOTE(review): the condition guarding the " (CLEARED)" message and the
 * body of the CLRI_NOP_CORRUPT branch are not visible in this excerpt.
 */
956 clri(struct inodesc
*idesc
, char *type
, int verbose
, int corrupting
)
961 if (statemap
[idesc
->id_number
] == USTATE
)
964 dp
= ginode(idesc
->id_number
);
965 if (verbose
== CLRI_VERBOSE
) {
966 pwarn("%s %s", type
, file_id(idesc
->id_number
, dp
->di_mode
));
967 pinode(idesc
->id_number
);
969 if (preen
|| (reply("CLEAR") == 1)) {
970 need_parent
= (corrupting
== CLRI_NOP_OK
) ?
971 TI_NOPARENT
: TI_PARENT
;
972 freeino(idesc
->id_number
, need_parent
);
974 (void) printf(" (CLEARED)\n");
975 remove_orphan_dir(idesc
->id_number
);
976 } else if (corrupting
== CLRI_NOP_CORRUPT
) {
983 * Find the directory entry for the inode noted in id_parent (which is
984 * not necessarily the parent of anything, we're just using a convenient
/*
 * findname: directory-scan callback that matches on inode number.
 *
 * The current entry (idesc->id_dirp) is compared against
 * idesc->id_parent.  For a matching entry, the name is copied into
 * idesc->id_name: MIN(d_namlen, MAXNAMLEN) + 1 bytes, the extra byte
 * presumably being the terminating NUL.  The caller must supply an
 * id_name buffer large enough for that copy.
 *
 * NOTE(review): the return statements are not visible in this excerpt.
 */
988 findname(struct inodesc
*idesc
)
990 struct direct
*dirp
= idesc
->id_dirp
;
992 if (dirp
->d_ino
!= idesc
->id_parent
)
994 (void) memmove(idesc
->id_name
, dirp
->d_name
,
995 MIN(dirp
->d_namlen
, MAXNAMLEN
) + 1);
1000 * Find the inode number associated with the given name.
/*
 * findino: directory-scan callback that matches on entry name.
 *
 * Entries with d_ino == 0 are tested for first (the statement executed
 * for them is not visible here).  If the entry's name equals
 * idesc->id_name and its inode number lies within
 * [UFSROOTINO, maxino], the inode number is stored in idesc->id_parent
 * and STOP|FOUND is returned.
 *
 * NOTE(review): the return value for non-matching entries is not visible
 * in this excerpt.
 */
1003 findino(struct inodesc
*idesc
)
1005 struct direct
*dirp
= idesc
->id_dirp
;
1007 if (dirp
->d_ino
== 0)
1009 if (strcmp(dirp
->d_name
, idesc
->id_name
) == 0 &&
1010 dirp
->d_ino
>= UFSROOTINO
&& dirp
->d_ino
<= maxino
) {
1011 idesc
->id_parent
= dirp
->d_ino
;
1012 return (STOP
|FOUND
);
/*
 * cleardirentry: scan directory parentdir for an entry referring to
 * inode target, using clearanentry() as the per-entry callback.
 *
 * A DATA-type descriptor is set up with id_parent = target and
 * id_fix = NOFIX, and the directory is traversed with
 * ckinode(..., CKI_TRAVERSE).  The flags returned by ckinode() are
 * returned to the caller.
 */
1018 cleardirentry(fsck_ino_t parentdir
, fsck_ino_t target
)
1020 struct inodesc idesc
;
1023 dp
= ginode(parentdir
);
1024 init_inodesc(&idesc
);
1025 idesc
.id_func
= clearanentry
;
1026 idesc
.id_parent
= target
;
1027 idesc
.id_type
= DATA
;
1028 idesc
.id_fix
= NOFIX
;
1029 return (ckinode(dp
, &idesc
, CKI_TRAVERSE
));
/*
 * clearanentry: directory-scan callback used by cleardirentry().
 *
 * Entries that do not refer to idesc->id_parent, and the first two
 * entries of the directory (id_entryno < 2), are passed over with
 * id_entryno being incremented.  For a matching entry the callback
 * returns STOP|FOUND|ALTERED.
 *
 * NOTE(review): the return value for skipped entries and the statement
 * that modifies the matching entry are not visible in this excerpt.
 */
1033 clearanentry(struct inodesc
*idesc
)
1035 struct direct
*dirp
= idesc
->id_dirp
;
1037 if (dirp
->d_ino
!= idesc
->id_parent
|| idesc
->id_entryno
< 2) {
1038 idesc
->id_entryno
++;
1042 return (STOP
|FOUND
|ALTERED
);
/*
 * pinode: print " I=<ino> " for the given inode number, then test
 * whether ino lies outside [UFSROOTINO, maxino].
 * NOTE(review): the statements following the range test are not visible
 * in this excerpt.
 */
1046 pinode(fsck_ino_t ino
)
1050 (void) printf(" I=%lu ", (ulong_t
)ino
);
1051 if (ino
< UFSROOTINO
|| ino
> maxino
)
/*
 * pdinode: print a one-line summary of the on-disk inode dp.
 *
 * Output, in order: " OWNER=" followed by the login name from getpwuid()
 * when the uid can be resolved (the numeric uid is printed by the other
 * visible printf), "MODE=" in octal, the device name, "SIZE=", and
 * "MTIME=" built from two slices (offsets 4 and 20) of a time string p
 * derived from di_mtime.
 *
 * NOTE(review): the call that produces p and the conditions around the
 * device-name output are not visible in this excerpt.
 */
1058 pdinode(struct dinode
*dp
)
1064 (void) printf(" OWNER=");
1065 if ((pw
= getpwuid((int)dp
->di_uid
)) != 0)
1066 (void) printf("%s ", pw
->pw_name
);
1068 (void) printf("%lu ", (ulong_t
)dp
->di_uid
);
1069 (void) printf("MODE=%o\n", dp
->di_mode
);
1071 (void) printf("%s: ", devname
);
1072 (void) printf("SIZE=%lld ", (longlong_t
)dp
->di_size
);
1074 /* ctime() ignores LOCALE, so this is safe */
1075 t
= (time_t)dp
->di_mtime
;
1077 (void) printf("MTIME=%12.12s %4.4s ", p
+ 4, p
+ 20);
/*
 * blkerror: report a bad fragment and mark the owning inode for clearing.
 *
 *	ino	inode that references the fragment.
 *	type	text describing the problem, inserted into the message.
 *	blk	fragment number being complained about.
 *	lbn	logical position of the fragment within the file.
 *
 * After the pfatal() report, the inode's state (with INDELAYD masked
 * off) selects the new statemap value: FCLEAR, DCLEAR (the inode is also
 * passed to add_orphan_dir()) or SCLEAR.  Any other state is fatal via
 * errexit().
 *
 * NOTE(review): the case labels that select each branch are not visible
 * in this excerpt.
 */
1081 blkerror(fsck_ino_t ino
, char *type
, daddr32_t blk
, daddr32_t lbn
)
1083 pfatal("FRAGMENT %d %s I=%u LFN %d", blk
, type
, ino
, lbn
);
1084 (void) printf("\n");
1086 switch (statemap
[ino
] & ~INDELAYD
) {
1090 statemap
[ino
] = FCLEAR
;
1096 statemap
[ino
] = DCLEAR
;
1097 add_orphan_dir(ino
);
1101 statemap
[ino
] = SCLEAR
;
1110 errexit("BAD STATE 0x%x TO BLKERR\n", statemap
[ino
]);
1116 * allocate an unused inode
/*
 * allocino: allocate a currently unused inode and give it one fragment.
 *
 *	request	0 for "any inode", or UFSROOTINO; when debug is set any
 *		other value trips an assertion via errexit().
 *	type	file mode for the new inode; type & IFMT selects the
 *		statemap state.
 *
 * Visible steps:
 *   - Starting at request, statemap[] is scanned linearly (no wrapping)
 *     up to maxino for an entry in USTATE.
 *   - The inode's cylinder group is read with getblk() and vetted with
 *     cg_sanity(); a problem is reported with pfatal() and the user may
 *     decline "REPAIR", which terminates the program.
 *   - The inode is marked in use in the group's bitmap and cs_nifree is
 *     decremented.
 *   - statemap[ino] becomes DSTATE (cs_ndir is also incremented) or
 *     FSTATE depending on the type; unknown types are reported.
 *   - The dinode is zeroed and di_db[0] is set from allocblk(1); if that
 *     yields 0 the statemap entry is reset to USTATE.
 *   - di_mode, the three timestamps, di_size (fs_fsize) and di_blocks
 *     (btodb(fs_fsize)) are filled in.
 *
 * NOTE(review): the return statements, the case labels and the code that
 * undoes the cylinder-group changes on failure are not visible in this
 * excerpt.
 */
1119 allocino(fsck_ino_t request
, int type
)
1123 struct cg
*cgp
= &cgrp
;
1128 if (debug
&& (request
!= 0) && (request
!= UFSROOTINO
))
1129 errexit("assertion failed: allocino() asked for "
1130 "inode %d instead of 0 or %d",
1131 (int)request
, (int)UFSROOTINO
);
1134 * We know that we're only going to get requests for UFSROOTINO
1135 * or 0. If UFSROOTINO is wanted, then it better be available
1136 * because our caller is trying to recreate the root directory.
1137 * If we're asked for 0, then which one we return doesn't matter.
1138 * We know that inodes 0 and 1 are never valid to return, so we
1139 * the start at the lowest-legal inode number.
1141 * If we got a request for UFSROOTINO, then request != 0, and
1142 * this pair of conditionals is the only place that treats
1143 * UFSROOTINO specially.
1146 request
= UFSROOTINO
;
1147 else if (statemap
[request
] != USTATE
)
1151 * Doesn't do wrapping, since we know we started at
1152 * the smallest inode.
1154 for (ino
= request
; ino
< maxino
; ino
++)
1155 if (statemap
[ino
] == USTATE
)
1161 * In pass5, we'll calculate the bitmaps and counts all again from
1162 * scratch and do a comparison, but for that to work the cg has
1163 * to know what in-memory changes we've made to it. If we have
1164 * trouble reading the cg, cg_sanity() should kick it out so
1165 * we can skip explicit i/o error checking here.
1167 cg
= itog(&sblock
, ino
);
1168 (void) getblk(&cgblk
, cgtod(&sblock
, cg
), (size_t)sblock
.fs_cgsize
);
1169 err
= cg_sanity(cgp
, cg
);
1171 pfatal("CG %d: %s\n", cg
, err
);
1173 if (reply("REPAIR") == 0)
1174 errexit("Program terminated.");
1177 setbit(cg_inosused(cgp
), ino
% sblock
.fs_ipg
);
1178 cgp
->cg_cs
.cs_nifree
--;
1185 * Don't currently support IFATTRDIR or any of the other
1186 * types, as they aren't needed.
1188 switch (type
& IFMT
) {
1190 statemap
[ino
] = DSTATE
;
1191 cgp
->cg_cs
.cs_ndir
++;
1195 statemap
[ino
] = FSTATE
;
1199 * Pretend nothing ever happened. This clears the
1200 * dirty flag, among other things.
1204 (void) printf("allocino: unknown type 0%o\n",
1210 * We're allocating what should be a completely-unused inode,
1211 * so make sure we don't inherit anything from any previous
1215 (void) memset(dp
, 0, sizeof (struct dinode
));
1216 dp
->di_db
[0] = allocblk(1);
1217 if (dp
->di_db
[0] == 0) {
1218 statemap
[ino
] = USTATE
;
1221 dp
->di_mode
= (mode_t
)type
;
1223 dp
->di_atime
= (time32_t
)t
;
1224 dp
->di_ctime
= dp
->di_atime
;
1225 dp
->di_mtime
= dp
->di_ctime
;
1226 dp
->di_size
= (uoff_t
)sblock
.fs_fsize
;
1227 dp
->di_blocks
= btodb(sblock
.fs_fsize
);
1234 * Release some or all of the blocks of an inode.
1235 * Only truncates down. Assumes new_length is appropriately aligned
1236 * to a block boundary (or a directory block boundary, if it's a
1239 * If this is a directory, discard all of its contents first, so
1240 * we don't create a bunch of orphans that would need another fsck
1243 * Even if truncating to zero length, the inode remains allocated.
/*
 * truncino(ino, new_length, update)
 *
 * Shrinks inode ino to new_length bytes.  This extract of the function
 * omits a number of original lines (local declarations, braces, some
 * conditions), so the exact nesting of the steps is not shown here.
 * The visible steps, in order, are:
 *
 *  - classify the inode as directory (IFDIR or IFATTRDIR) or symlink;
 *  - unless update is TI_NOPARENT, find the parent from the inoinfo
 *    cache (i_parent) or from lookup_dotdot_ino(), and check the
 *    claimed parent with a findino traversal;
 *  - call mark_delayed_inodes() on the directory entries;
 *  - adjust the link accounting of the parent (LINK_RANGE, LINK_CLEAR,
 *    TRACK_LNCNTP), or warn when an IFDIR has parent 0;
 *  - run ckinode() with CKI_TRUNCATE and pass4check, with id_truncto
 *    set to howmany(new_length, fs_bsize);
 *  - store new_length in di_size and zero the direct and indirect
 *    block pointers past the truncation point.
 *
 * NOTE(review): the direct-pointer clearing loop starts at
 * id_truncto + 1, so di_db[id_truncto] is not written by that loop.
 * Confirm this is intended (it may already be handled during the
 * CKI_TRUNCATE traversal).
 */
1246 truncino(fsck_ino_t ino
, offset_t new_length
, int update
)
1248 struct inodesc idesc
;
1249 struct inoinfo
*iip
;
/* Classify the inode by its file-type bits. */
1258 mode
= (dp
->di_mode
& IFMT
);
1259 isdir
= (mode
== IFDIR
) || (mode
== IFATTRDIR
);
1260 islink
= (mode
== IFLNK
);
1264 * Go with the parent we found by chasing references,
1265 * if we've gotten that far. Otherwise, use what the
1266 * directory itself claims. If there's no ``..'' entry
1267 * in it, give up trying to get the link counts right.
1269 if (update
== TI_NOPARENT
) {
1272 iip
= getinoinfo(ino
);
1274 parent
= iip
->i_parent
;
1276 parent
= lookup_dotdot_ino(ino
);
1279 * Make sure that the claimed
1280 * parent actually has a
1283 dp
= ginode(parent
);
/*
 * NOTE(review): no init_inodesc(&idesc) call is visible before these
 * assignments in this extract (the only visible call comes later).
 * Confirm that the remaining idesc fields are initialized on this path.
 */
1284 idesc
.id_name
= lfname
;
1285 idesc
.id_type
= DATA
;
1286 idesc
.id_func
= findino
;
1287 idesc
.id_number
= ino
;
1288 idesc
.id_fix
= DONTKNOW
;
1289 if ((ckinode(dp
, &idesc
,
1290 CKI_TRAVERSE
) & FOUND
) == 0)
/*
 * The fragment count for new_length is handed to mark_delayed_inodes()
 * as its first_lfn argument.
 */
1296 mark_delayed_inodes(ino
, numfrags(&sblock
, new_length
));
1298 dp
= ginode(parent
);
1299 LINK_RANGE(message
, dp
->di_nlink
, -1);
1300 if (message
!= NULL
) {
1301 LINK_CLEAR(message
, parent
, dp
->di_mode
,
1303 if (statemap
[parent
] == USTATE
)
1304 goto no_parent_update
;
1306 TRACK_LNCNTP(parent
, lncntp
[parent
]--);
1307 } else if ((mode
== IFDIR
) && (parent
== 0)) {
1309 * Currently don't have a good way to
1310 * handle this, so throw up our hands.
1311 * However, we know that we can still
1312 * do some good if we continue, so
1313 * don't actually exit yet.
1315 * We don't do it for attrdirs,
1316 * because there aren't link counts
1317 * between them and their parents.
1319 pwarn("Could not determine former parent of "
1320 "inode %d, link counts are possibly\n"
1321 "incorrect. Please rerun fsck(1M) to "
1327 * ...else if it's a directory with parent == -1, then
1328 * we've not gotten far enough to know connectivity,
1329 * and it'll get handled automatically later.
/*
 * Set up an ADDR traversal of this inode with pass4check as the
 * per-block callback; id_truncto is the block count for new_length.
 */
1334 init_inodesc(&idesc
);
1335 idesc
.id_type
= ADDR
;
1336 idesc
.id_func
= pass4check
;
1337 idesc
.id_number
= ino
;
1338 idesc
.id_fix
= DONTKNOW
;
1339 idesc
.id_truncto
= howmany(new_length
, sblock
.fs_bsize
);
/*
 * Binary & binds tighter than &&, so this reads as
 * !islink && (ckinode(...) & ALTERED); ckinode() is therefore not
 * called at all for symlinks.
 */
1341 if (!islink
&& ckinode(dp
, &idesc
, CKI_TRUNCATE
) & ALTERED
)
1345 * This has to be done after ckinode(), so that all of
1346 * the fragments get visited. Note that we assume we're
1347 * always truncating to a block boundary, rather than a
1348 * fragment boundary.
1351 dp
->di_size
= new_length
;
1354 * Clear now-obsolete pointers.
1356 for (dblk
= idesc
.id_truncto
+ 1; dblk
< NDADDR
; dblk
++) {
1357 dp
->di_db
[dblk
] = 0;
1360 ilevel
= get_indir_offsets(-1, idesc
.id_truncto
, NULL
, NULL
);
1361 for (ilevel
++; ilevel
< NIADDR
; ilevel
++) {
1362 dp
->di_ib
[ilevel
] = 0;
1369 * Release an inode's resources, then release the inode itself.
/*
 * freeino(ino, update_parent)
 *
 * Visible steps, in order (some original lines, including one clause
 * of the size test and the statement it guards, are missing from this
 * extract):
 *
 *  - test whether di_size lies above MAXOFF_T but not above
 *    UFS_MAXOFFSET_T on an inode that is neither IFBLK nor IFCHR;
 *  - truncino(ino, 0, update_parent) to release the blocks;
 *  - for IFATTRDIR inodes, clearshadow(ino, &attrclientinfo);
 *  - mark the inode USTATE in statemap;
 *  - load the cylinder group that owns ino, clear its bit in the
 *    inode-used map, and add one to cs_nifree both in the cylinder
 *    group and in the superblock totals.
 *
 * update_parent is forwarded unchanged as the update argument of
 * truncino().
 */
1372 freeino(fsck_ino_t ino
, int update_parent
)
1381 * We need to make sure that the file is really a large file.
1382 * Everything bigger than UFS_MAXOFFSET_T is treated as a file with
1383 * negative size, which shall be cleared. (see verify_inode() in
/*
 * Lower bound is exclusive, upper bound inclusive; block and character
 * devices are excluded from the test.
 */
1386 if (dp
->di_size
> (uoff_t
)MAXOFF_T
&&
1387 dp
->di_size
<= (uoff_t
)UFS_MAXOFFSET_T
&&
1389 (dp
->di_mode
& IFMT
) != IFBLK
&&
1390 (dp
->di_mode
& IFMT
) != IFCHR
) {
/* Truncate to length zero. */
1393 truncino(ino
, 0, update_parent
);
1396 if ((dp
->di_mode
& IFMT
) == IFATTRDIR
) {
1397 clearshadow(ino
, &attrclientinfo
);
1403 statemap
[ino
] = USTATE
;
1406 * Keep the disk in sync with us so that pass5 doesn't get
1407 * upset about spurious inconsistencies.
1409 cg
= itog(&sblock
, ino
);
1410 (void) getblk(&cgblk
, (diskaddr_t
)cgtod(&sblock
, cg
),
1411 (size_t)sblock
.fs_cgsize
);
1412 cgp
= cgblk
.b_un
.b_cg
;
1413 clrbit(cg_inosused(cgp
), ino
% sblock
.fs_ipg
);
1414 cgp
->cg_cs
.cs_nifree
+= 1;
1416 sblock
.fs_cstotal
.cs_nifree
+= 1;
/*
 * init_inoinfo(inp, dp, inum)
 *
 * Fills the inoinfo record inp from the on-disk inode dp:
 *
 *  - i_parent is UFSROOTINO when inum is UFSROOTINO, otherwise 0;
 *  - i_dotdot starts at 0;
 *  - i_isize is di_size converted to offset_t;
 *  - i_blkssize is the byte size of NDADDR + NIADDR daddr32_t entries;
 *  - i_extattr is copied from di_oeftflag;
 *  - block pointers are copied with memmove() from di_db[0] onward
 *    into i_blks[0] onward.
 */
1421 init_inoinfo(struct inoinfo
*inp
, struct dinode
*dp
, fsck_ino_t inum
)
1423 inp
->i_parent
= ((inum
== UFSROOTINO
) ? UFSROOTINO
: (fsck_ino_t
)0);
1424 inp
->i_dotdot
= (fsck_ino_t
)0;
1425 inp
->i_isize
= (offset_t
)dp
->di_size
;
1426 inp
->i_blkssize
= (NDADDR
+ NIADDR
) * sizeof (daddr32_t
);
1427 inp
->i_extattr
= dp
->di_oeftflag
;
/*
 * The length argument of this memmove() is cut off in this extract
 * (presumably i_blkssize -- TODO confirm against the full file).
 */
1428 (void) memmove((void *)&inp
->i_blks
[0], (void *)&dp
->di_db
[0],
1433 * Return the inode number in the ".." entry of the provided
/*
 * lookup_dotdot_ino(ino)
 *
 * Runs a DATA traversal over inode ino with findino as the callback
 * and the two-dot name as the search key.  id_fix is NOFIX.  When the
 * ckinode() result has the FOUND bit set, the value left in
 * idesc.id_parent is returned.  The not-found return path is not
 * visible in this extract.
 */
1437 lookup_dotdot_ino(fsck_ino_t ino
)
1439 struct inodesc idesc
;
/* Start from a freshly initialized descriptor, then fill in the search. */
1441 init_inodesc(&idesc
);
1442 idesc
.id_type
= DATA
;
1443 idesc
.id_func
= findino
;
1444 idesc
.id_name
= "..";
1445 idesc
.id_number
= ino
;
1446 idesc
.id_fix
= NOFIX
;
/* Only trust id_parent when the traversal reports FOUND. */
1448 if ((ckinode(ginode(ino
), &idesc
, CKI_TRAVERSE
) & FOUND
) != 0) {
1449 return (idesc
.id_parent
);
1456 * Convenience wrapper around ckinode(findino()).
/*
 * lookup_named_ino(dir, name)
 *
 * Same lookup as lookup_dotdot_ino(), but with a search name supplied
 * by the caller: a DATA traversal over inode dir with findino as the
 * callback, name as id_name, and id_fix set to NOFIX.  When the
 * ckinode() result has the FOUND bit set, idesc.id_parent is returned.
 * The return type and the not-found return path are not visible in
 * this extract.
 */
1459 lookup_named_ino(fsck_ino_t dir
, caddr_t name
)
1461 struct inodesc idesc
;
/* Start from a freshly initialized descriptor, then fill in the search. */
1463 init_inodesc(&idesc
);
1464 idesc
.id_type
= DATA
;
1465 idesc
.id_func
= findino
;
1466 idesc
.id_name
= name
;
1467 idesc
.id_number
= dir
;
1468 idesc
.id_fix
= NOFIX
;
/* Only trust id_parent when the traversal reports FOUND. */
1470 if ((ckinode(ginode(dir
), &idesc
, CKI_TRAVERSE
) & FOUND
) != 0) {
1471 return (idesc
.id_parent
);
1478 * Marks inodes that are being orphaned and might need to be reconnected
1479 * by pass4(). The inode we're traversing is the directory whose
1480 * contents will be reconnected later. id_parent is the lfn at which
1481 * to start looking at said contents.
/*
 * mark_a_delayed_inode(idesc)
 *
 * Per-entry callback installed as id_func by mark_delayed_inodes().
 * It looks at the directory entry in idesc->id_dirp.  For an entry
 * with a nonzero d_ino whose name is neither the one-dot nor the
 * two-dot name, it clears INFOUND and sets INDELAYD in the statemap
 * slot of that inode.  The return statements are not visible in this
 * extract.
 */
1484 mark_a_delayed_inode(struct inodesc
*idesc
)
1486 struct direct
*dirp
= idesc
->id_dirp
;
/*
 * id_parent carries the starting lfn here, so this branch is taken for
 * entries that lie before the starting point.  The body of the branch
 * is not visible in this extract.
 */
1488 if (idesc
->id_lbn
< idesc
->id_parent
) {
/* Skip empty slots and the self and parent entries. */
1492 if (dirp
->d_ino
!= 0 &&
1493 strcmp(dirp
->d_name
, ".") != 0 &&
1494 strcmp(dirp
->d_name
, "..") != 0) {
/* Two separate updates: drop INFOUND first, then add INDELAYD. */
1495 statemap
[dirp
->d_ino
] &= ~INFOUND
;
1496 statemap
[dirp
->d_ino
] |= INDELAYD
;
/*
 * mark_delayed_inodes(ino, first_lfn)
 *
 * Runs a DATA traversal with mark_a_delayed_inode() as the per-entry
 * callback.  first_lfn is stored in id_parent, which the callback
 * compares against id_lbn.  id_fix is NOFIX, and the ckinode() result
 * is deliberately discarded.
 *
 * NOTE(review): the assignment of dp is not visible in this extract
 * (presumably dp = ginode(ino) -- TODO confirm against the full file).
 */
1503 mark_delayed_inodes(fsck_ino_t ino
, daddr32_t first_lfn
)
1506 struct inodesc idelayed
;
1508 init_inodesc(&idelayed
);
1509 idelayed
.id_number
= ino
;
1510 idelayed
.id_type
= DATA
;
1511 idelayed
.id_fix
= NOFIX
;
1512 idelayed
.id_func
= mark_a_delayed_inode
;
/* The starting lfn travels in id_parent. */
1513 idelayed
.id_parent
= first_lfn
;
/*
 * NOTE(review): the entry counter is preset to 2, presumably to account
 * for the self and parent entries -- confirm against the directory
 * scanning code.
 */
1514 idelayed
.id_entryno
= 2;
1517 (void) ckinode(dp
, &idelayed
, CKI_TRAVERSE
);
1521 * Clear the i_oeftflag/extended attribute pointer from INO.
1524 clearattrref(fsck_ino_t ino
)
1530 if (dp
->di_oeftflag
== 0)
1531 (void) printf("clearattref: no attr to clear on %d\n",
1535 dp
->di_oeftflag
= 0;