2 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
3 * Use is subject to license terms.
6 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
7 /* All Rights Reserved */
10 * Copyright (c) 1980, 1986, 1990 The Regents of the University of California.
11 * All rights reserved.
13 * Redistribution and use in source and binary forms are permitted
14 * provided that: (1) source distributions retain this entire copyright
15 * notice and comment, and (2) distributions including binaries display
16 * the following acknowledgement: ``This product includes software
17 * developed by the University of California, Berkeley and its contributors''
18 * in the documentation or other materials provided with the distribution
19 * and in all advertising materials mentioning features or use of this
20 * software. Neither the name of the University nor the names of its
21 * contributors may be used to endorse or promote products derived
22 * from this software without specific prior written permission.
23 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
24 * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
25 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
28 #pragma ident "%Z%%M% %I% %E% SMI"
34 #include <sys/param.h>
35 #include <sys/types.h>
36 #include <sys/mntent.h>
37 #include <sys/fs/ufs_fs.h>
38 #include <sys/vnode.h>
40 #include <sys/fs/ufs_fsdir.h>
42 #include <sys/fs/ufs_inode.h>
46 * for each large file (size > MAXOFF_T), the global largefile_count
47 * gets incremented during this pass.
50 static uint32_t badblk
; /* number seen for the current inode */
51 static uint32_t dupblk
; /* number seen for the current inode */
53 static void clear_attr_acl(fsck_ino_t
, fsck_ino_t
, char *);
54 static void verify_inode(fsck_ino_t
, struct inodesc
*, fsck_ino_t
);
55 static void check_dirholes(fsck_ino_t
, struct inodesc
*);
56 static void collapse_dirhole(fsck_ino_t
, struct inodesc
*);
57 static void note_used(daddr32_t
);
66 fsck_ino_t maxinumber
;
69 * Set file system reserved blocks in used block map.
71 for (c
= 0; c
< sblock
.fs_ncg
; c
++) {
72 cgd
= cgdmin(&sblock
, c
);
75 * Doing the first cylinder group, account for
76 * the cg summaries as well.
78 i
= cgbase(&sblock
, c
);
79 cgd
+= howmany(sblock
.fs_cssize
, sblock
.fs_fsize
);
81 i
= cgsblock(&sblock
, c
);
83 for (; i
< cgd
; i
++) {
88 * Note blocks being used by the log, so we don't declare
89 * them as available and some time in the future we get a
90 * freeing free block panic.
92 if (islog
&& islogok
&& sblock
.fs_logbno
)
93 examinelog(&note_used
);
96 * Find all allocated blocks. This must be completed before
97 * we read the contents of any directories, as dirscan() et al
98 * don't want to know about block allocation holes. So, part
99 * of this pass is to truncate any directories with holes to
100 * just before those holes, so dirscan() can remain blissfully
104 n_files
= n_blks
= 0;
106 maxinumber
= sblock
.fs_ncg
* sblock
.fs_ipg
;
107 for (c
= 0; c
< sblock
.fs_ncg
; c
++) {
108 for (i
= 0; i
< sblock
.fs_ipg
; i
++, inumber
++) {
109 if (inumber
< UFSROOTINO
)
111 init_inodesc(&idesc
);
112 idesc
.id_type
= ADDR
;
113 idesc
.id_func
= pass1check
;
114 verify_inode(inumber
, &idesc
, maxinumber
);
121 * Perform checks on an inode and setup/track the state of the inode
122 * in maps (statemap[], lncntp[]) for future reference and validation.
123 * Initiate the calls to ckinode and in turn pass1check() to handle
124 * further validation.
127 verify_inode(fsck_ino_t inumber
, struct inodesc
*idesc
, fsck_ino_t maxinumber
)
132 fsck_ino_t shadow
, attrinode
;
137 dp
= getnextinode(inumber
);
138 if ((dp
->di_mode
& IFMT
) == 0) {
139 /* mode and type of file is not set */
140 if ((memcmp((void *)dp
->di_db
, (void *)zino
.di_db
,
141 NDADDR
* sizeof (daddr32_t
)) != 0) ||
142 (memcmp((void *)dp
->di_ib
, (void *)zino
.di_ib
,
143 NIADDR
* sizeof (daddr32_t
)) != 0) ||
144 (dp
->di_mode
!= 0) || (dp
->di_size
!= 0)) {
145 pfatal("PARTIALLY ALLOCATED INODE I=%u", inumber
);
146 if (reply("CLEAR") == 1) {
147 dp
= ginode(inumber
);
154 statemap
[inumber
] = USTATE
;
158 isdir
= ((dp
->di_mode
& IFMT
) == IFDIR
) ||
159 ((dp
->di_mode
& IFMT
) == IFATTRDIR
);
162 if (dp
->di_size
> (u_offset_t
)UFS_MAXOFFSET_T
) {
163 pfatal("NEGATIVE SIZE %lld I=%d",
164 (longlong_t
)dp
->di_size
, inumber
);
169 * A more precise test of the type is done later on. Just get
170 * rid of the blatantly-wrong ones before we do any
173 if ((dp
->di_mode
& IFMT
) == IFMT
) {
174 pfatal("BAD MODE 0%o I=%d",
175 dp
->di_mode
& IFMT
, inumber
);
176 if (reply("BAD MODE: MAKE IT A FILE") == 1) {
177 statemap
[inumber
] = FSTATE
;
178 dp
= ginode(inumber
);
179 dp
->di_mode
= IFREG
| 0600;
181 truncino(inumber
, sblock
.fs_fsize
, TI_NOPARENT
);
182 dp
= getnextrefresh();
188 ndb
= howmany(dp
->di_size
, (u_offset_t
)sblock
.fs_bsize
);
190 /* extra space to distinguish from previous pfatal() */
191 pfatal("NEGATIVE SIZE %lld I=%d",
192 (longlong_t
)dp
->di_size
, inumber
);
196 if ((dp
->di_mode
& IFMT
) == IFBLK
||
197 (dp
->di_mode
& IFMT
) == IFCHR
) {
198 if (dp
->di_size
!= 0) {
199 pfatal("SPECIAL FILE WITH NON-ZERO LENGTH %lld I=%d",
200 (longlong_t
)dp
->di_size
, inumber
);
204 for (j
= 0; j
< NDADDR
; j
++) {
206 * It's a device, so all the block pointers
207 * should be zero except for di_ordev.
208 * di_ordev is overlayed on the block array,
209 * but where varies between big and little
210 * endian, so make sure that the only non-zero
211 * element is the correct one. There can be
212 * a device whose ordev is zero, so we can't
213 * check for the reverse.
215 if (dp
->di_db
[j
] != 0 &&
216 &dp
->di_db
[j
] != &dp
->di_ordev
) {
219 "spec file di_db[%d] has %d\n",
223 "SPECIAL FILE WITH NON-ZERO FRAGMENT LIST I=%d",
229 for (j
= 0; j
< NIADDR
; j
++) {
230 if (dp
->di_ib
[j
] != 0) {
233 "special has %d at ib[%d]\n",
236 "SPECIAL FILE WITH NON-ZERO FRAGMENT LIST I=%d",
243 * This assignment is mostly here to appease lint, but
246 err
= "Internal error: unexpected variant of having "
247 "blocks past end of file I=%d";
252 * If it's not a device, it has to follow the
253 * rules for files. In particular, no blocks after
254 * the last one that di_size says is in use.
256 for (j
= ndb
; j
< NDADDR
; j
++) {
257 if (dp
->di_db
[j
] != 0) {
259 (void) printf("bad file direct "
260 "addr[%d]: block 0x%x "
265 err
= "FILE WITH FRAGMENTS PAST END I=%d";
272 * Find last indirect pointer that should be in use,
273 * and make sure any after it are clear.
276 for (j
= 0, ndb
-= NDADDR
; ndb
> 0; j
++) {
277 ndb
/= NINDIR(&sblock
);
279 for (; j
< NIADDR
; j
++) {
280 if (dp
->di_ib
[j
] != 0) {
282 (void) printf("bad file "
283 "indirect addr: block %d\n",
287 "FILE WITH FRAGMENTS PAST END I=%d";
296 * The discarded blocks will be garbage-
297 * collected in pass5. If we're told not to
298 * discard them, it's just lost blocks, which
299 * isn't worth setting iscorrupt for.
302 if (preen
|| reply("DISCARD EXCESS FRAGMENTS") == 1) {
303 dp
= ginode(inumber
);
305 for (; j
< NDADDR
; j
++)
309 for (; j
< NIADDR
; j
++)
312 dp
= getnextrefresh();
314 (void) printf(" (TRUNCATED)");
319 if (ftypeok(dp
) == 0) {
320 pfatal("UNKNOWN FILE TYPE 0%o I=%d", dp
->di_mode
, inumber
);
324 TRACK_LNCNTP(inumber
, lncntp
[inumber
] = dp
->di_nlink
);
327 * We can't do anything about it right now, so note that its
328 * processing is being delayed. Otherwise, we'd be changing
329 * the block allocations out from under ourselves, which causes
330 * no end of confusion.
332 flags
= statemap
[inumber
] & INDELAYD
;
335 * if errorlocked or logging, then open deleted files will
336 * manifest as di_nlink <= 0 and di_mode != 0
337 * so skip them; they're ok.
338 * Also skip anything already marked to be cleared.
340 if (dp
->di_nlink
<= 0 &&
341 !((errorlocked
|| islog
) && dp
->di_mode
== 0) &&
342 !(flags
& INCLEAR
)) {
346 "marking i=%d INZLINK; nlink %d, mode 0%o, islog %d\n",
347 inumber
, dp
->di_nlink
, dp
->di_mode
, islog
);
350 switch (dp
->di_mode
& IFMT
) {
353 if (dp
->di_size
== 0) {
355 * INCLEAR means it will be ignored by passes 2 & 3.
357 if ((dp
->di_mode
& IFMT
) == IFDIR
)
358 (void) printf("ZERO-LENGTH DIR I=%d\n",
361 (void) printf("ZERO-LENGTH ATTRDIR I=%d\n",
363 add_orphan_dir(inumber
);
365 flags
&= ~INZLINK
; /* It will be cleared anyway */
367 statemap
[inumber
] = DSTATE
| flags
;
368 cacheino(dp
, inumber
);
373 if (dp
->di_size
== 0) {
374 (void) printf("ZERO-LENGTH SHADOW I=%d\n", inumber
);
376 flags
&= ~INZLINK
; /* It will be cleared anyway */
378 statemap
[inumber
] = SSTATE
| flags
;
379 cacheacl(dp
, inumber
);
383 statemap
[inumber
] = FSTATE
| flags
;
388 idesc
->id_number
= inumber
;
389 idesc
->id_fix
= DONTKNOW
;
390 if (dp
->di_size
> (u_offset_t
)MAXOFF_T
) {
394 (void) ckinode(dp
, idesc
, CKI_TRAVERSE
);
395 if (isdir
&& (idesc
->id_firsthole
>= 0))
396 check_dirholes(inumber
, idesc
);
398 if (dp
->di_blocks
!= idesc
->id_entryno
) {
400 * The kernel releases any blocks it finds in the lists,
401 * ignoring the block count itself. So, a bad count is
402 * not grounds for setting iscorrupt.
404 pwarn("INCORRECT DISK BLOCK COUNT I=%u (%d should be %d)",
405 inumber
, (uint32_t)dp
->di_blocks
, idesc
->id_entryno
);
406 if (!preen
&& (reply("CORRECT") == 0))
408 dp
= ginode(inumber
);
409 dp
->di_blocks
= idesc
->id_entryno
;
410 iip
= getinoinfo(inumber
);
412 iip
->i_isize
= dp
->di_size
;
415 (void) printf(" (CORRECTED)\n");
417 if (isdir
&& (dp
->di_blocks
== 0)) {
419 * INCLEAR will cause passes 2 and 3 to skip it.
421 (void) printf("DIR WITH ZERO BLOCKS I=%d\n", inumber
);
422 statemap
[inumber
] = DCLEAR
;
423 add_orphan_dir(inumber
);
427 * Check that the ACL is on a valid file type
429 shadow
= dp
->di_shadow
;
431 if (acltypeok(dp
) == 0) {
432 clear_attr_acl(inumber
, -1,
433 "NON-ZERO ACL REFERENCE, I=%d\n");
434 } else if ((shadow
<= UFSROOTINO
) ||
435 (shadow
> maxinumber
)) {
436 clear_attr_acl(inumber
, -1,
437 "BAD ACL REFERENCE I=%d\n");
439 registershadowclient(shadow
,
440 inumber
, &shadowclientinfo
);
444 attrinode
= dp
->di_oeftflag
;
445 if (attrinode
!= 0) {
446 if ((attrinode
<= UFSROOTINO
) ||
447 (attrinode
> maxinumber
)) {
448 clear_attr_acl(attrinode
, inumber
,
449 "BAD ATTRIBUTE REFERENCE TO I=%d FROM I=%d\n");
451 dp
= ginode(attrinode
);
452 if ((dp
->di_mode
& IFMT
) != IFATTRDIR
) {
453 clear_attr_acl(attrinode
, inumber
,
454 "BAD ATTRIBUTE DIR REF TO I=%d FROM I=%d\n");
455 } else if (dp
->di_size
== 0) {
456 clear_attr_acl(attrinode
, inumber
,
457 "REFERENCE TO ZERO-LENGTH ATTRIBUTE DIR I=%d from I=%d\n");
459 registershadowclient(attrinode
, inumber
,
467 * If we got here, we've not had the chance to see if a
468 * directory has holes, but we know the directory's bad,
469 * so it's safe to always return false (no holes found).
471 * Also, a pfatal() is always done before jumping here, so
472 * we know we're not in preen mode.
477 * INCLEAR makes passes 2 & 3 skip it.
479 statemap
[inumber
] = DCLEAR
;
480 add_orphan_dir(inumber
);
481 cacheino(dp
, inumber
);
483 statemap
[inumber
] = FCLEAR
;
485 if (reply("CLEAR") == 1) {
486 (void) tdelete((void *)inumber
, &limbo_dirs
, ino_t_cmp
);
487 freeino(inumber
, TI_PARENT
);
495 * Do fixup for bad acl/attr references. If PARENT is -1, then
496 * we assume we're working on a shadow, otherwise an extended attribute.
497 * FMT must be a printf format string, with one %d directive for
501 clear_attr_acl(fsck_ino_t inumber
, fsck_ino_t parent
, char *fmt
)
503 fsck_ino_t victim
= inumber
;
511 pwarn(fmt
, (int)inumber
);
513 pwarn(fmt
, (int)inumber
, (int)parent
);
517 (void) printf("parent file/dir I=%d\nvictim I=%d",
518 (int)parent
, (int)victim
);
520 if (!preen
&& (reply("REMOVE REFERENCE") == 0)) {
528 * The file had a bad shadow/acl, so lock it down
529 * until someone can protect it the way they need it
530 * to be (i.e., be conservatively paranoid).
540 (void) printf(" (CORRECTED)\n");
544 * Check if we have holes in the directory's indirect
545 * blocks. If there are, get rid of everything after
549 check_dirholes(fsck_ino_t inumber
, struct inodesc
*idesc
)
551 char pathbuf
[MAXPATHLEN
+ 1];
553 getpathname(pathbuf
, idesc
->id_number
, idesc
->id_number
);
554 pfatal("I=%d DIRECTORY %s: CONTAINS EMPTY BLOCKS",
555 idesc
->id_number
, pathbuf
);
556 if (reply("TRUNCATE AT FIRST EMPTY BLOCK") == 1) {
558 * We found a hole, so get rid of it.
560 collapse_dirhole(inumber
, idesc
);
563 (void) printf(" (TRUNCATED)\n");
570 * Truncate a directory to its first hole. If there are non-holes
571 * in the direct blocks after the problem block, move them down so
572 * that there's somewhat less lossage. Doing this for indirect blocks
573 * is left as an exercise for the reader.
576 collapse_dirhole(fsck_ino_t inumber
, struct inodesc
*idesc
)
581 if (idesc
->id_firsthole
< 0) {
586 * Since truncino() adjusts the size, we don't need to do that here,
587 * but we have to tell it what final size we want.
589 * We need to count from block zero up through the last block
590 * before the hole. If the hole is in the indirect blocks, chop at
591 * the start of the nearest level of indirection. Orphans will
592 * get reconnected, so we're not actually losing anything by doing
593 * it this way, and we're simplifying truncation significantly.
595 new_size
= idesc
->id_firsthole
* (offset_t
)sblock
.fs_bsize
;
596 blocks
= howmany(new_size
, sblock
.fs_bsize
);
597 if (blocks
> NDADDR
) {
598 if (blocks
< (NDADDR
+ NINDIR(&sblock
)))
600 else if (blocks
< (NDADDR
+ NINDIR(&sblock
) +
601 (NINDIR(&sblock
) * NINDIR(&sblock
))))
602 blocks
= NDADDR
+ NINDIR(&sblock
);
604 blocks
= NDADDR
+ NINDIR(&sblock
) +
605 (NINDIR(&sblock
) * NINDIR(&sblock
));
606 new_size
= blocks
* sblock
.fs_bsize
;
608 (void) printf("to %lld (blocks %d)\n",
609 (longlong_t
)new_size
, blocks
);
611 truncino(inumber
, new_size
, TI_NOPARENT
);
614 * Technically, there are still the original number of fragments
615 * associated with the object. However, that number is not used
616 * to control anything, so we can do the in-memory truncation of
617 * it without bad things happening.
619 idesc
->id_entryno
= btodb(new_size
);
623 pass1check(struct inodesc
*idesc
)
629 daddr32_t fragno
= idesc
->id_blkno
;
633 * If this is a fallocate'd file, block numbers may be stored
634 * as negative. In that case negate the negative numbers.
636 dp
= ginode(idesc
->id_number
);
637 if (dp
->di_cflags
& IFALLOCATE
&& fragno
< 0)
640 if ((anyout
= chkrange(fragno
, idesc
->id_numfrags
)) != 0) {
642 * Note that blkerror() exits when preening.
644 blkerror(idesc
->id_number
, "OUT OF RANGE",
645 fragno
, idesc
->id_lbn
* sblock
.fs_frag
);
647 dp
= ginode(idesc
->id_number
);
648 if ((((dp
->di_mode
& IFMT
) == IFDIR
) ||
649 ((dp
->di_mode
& IFMT
) == IFATTRDIR
)) &&
650 (idesc
->id_firsthole
< 0)) {
651 idesc
->id_firsthole
= idesc
->id_lbn
;
654 if (++badblk
>= MAXBAD
) {
655 pwarn("EXCESSIVE BAD FRAGMENTS I=%u",
657 if (reply("CONTINUE") == 0)
658 errexit("Program terminated.");
660 * See discussion below as to why we don't
661 * want to short-circuit the processing of
662 * this inode. However, we know that this
663 * particular block is bad, so we don't need
664 * to go through the dup check loop.
666 return (SKIP
| STOP
);
671 * For each fragment, verify that it is a legal one (either
672 * by having already found the entire run to be legal, or by
673 * individual inspection), and if it is legal, see if we've
674 * seen it before or not. If we haven't, note that we've seen
675 * it and continue on. If we have (our in-core bitmap shows
676 * it as already being busy), then this must be a duplicate
677 * allocation. Whine and moan accordingly.
679 * Note that for full-block allocations, this will produce
680 * a complaint for each fragment making up the block (i.e.,
681 * fs_frags' worth). Among other things, this could be
682 * considered artificially inflating the dup-block count.
683 * However, since it is possible that one file has a full
684 * fs block allocated, but another is only claiming a frag
685 * or two out of the middle, we'll just live with it.
687 for (nfrags
= 0; nfrags
< idesc
->id_numfrags
; fragno
++, nfrags
++) {
688 if (anyout
&& chkrange(fragno
, 1)) {
689 /* bad fragment number */
691 } else if (!testbmap(fragno
)) {
692 /* no other claims seen as yet */
696 * We have a duplicate claim for the same fragment.
698 * blkerror() exits when preening.
700 * We want to report all the dups up until
701 * hitting MAXDUP. Fortunately, blkerror()'s
702 * side-effects on statemap[] are idempotent,
703 * so the ``extra'' calls are harmless.
705 lbn
= idesc
->id_lbn
* sblock
.fs_frag
+ nfrags
;
707 blkerror(idesc
->id_number
, "DUP", fragno
, lbn
);
710 * Use ==, so we only complain once, no matter
711 * how far over the limit we end up going.
713 if (++dupblk
== MAXDUP
) {
714 pwarn("EXCESSIVE DUPLICATE FRAGMENTS I=%u",
716 if (reply("CONTINUE") == 0)
717 errexit("Program terminated.");
720 * If we stop the traversal here, then
721 * there may be more dups in the
722 * inode's block list that don't get
723 * flagged. Later, if we're told to
724 * clear one of the files claiming
725 * these blocks, but not the other, we
726 * will release blocks that are
727 * actually still in use. An additional
728 * fsck run would be necessary to undo
729 * the damage. So, instead of the
730 * traditional return (STOP) when told
731 * to continue, we really do just continue.
734 (void) find_dup_ref(fragno
, idesc
->id_number
, lbn
,
735 DB_CREATE
| DB_INCR
);
738 * id_entryno counts the number of disk blocks found.
740 idesc
->id_entryno
+= btodb(sblock
.fs_fsize
);
746 note_used(daddr32_t frag
)