2 * Copyright 2009, 2010 Jeffrey W. Roberson <jeff@FreeBSD.org>
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
30 #include <sys/param.h>
32 #include <sys/disklabel.h>
33 #include <sys/mount.h>
36 #include <ufs/ufs/ufsmount.h>
37 #include <ufs/ufs/dinode.h>
38 #include <ufs/ufs/dir.h>
39 #include <ufs/ffs/fs.h>
/*
 * Directory offset at which the ".." entry is expected; journal ref records
 * whose jr_diroff equals this value are treated as ".." links.
 * NOTE(review): DIRECTSIZ(1) is presumably the record size of the preceding
 * one-character "." entry -- confirm against <ufs/ufs/dir.h>.
 */
#define DOTDOT_OFFSET	DIRECTSIZ(1)

/*
 * Number of buckets in each lookup hash table (cgs, blocks, inodes, inode
 * blocks, data blocks).  Must remain a power of two because SUJ_HASHMASK
 * is derived as (size - 1) and applied with a bitwise AND.
 */
#define SUJ_HASHSIZE	2048
#define SUJ_HASHMASK	(SUJ_HASHSIZE - 1)

/*
 * Multiplicative hash of an integer key into [0, SUJ_HASHSIZE).
 * The argument is parenthesised so that expression arguments such as
 * SUJ_HASH(a + b) hash the whole expression rather than expanding to
 * a + (b * K).  The multiplier is presumably Knuth's 32-bit golden-ratio
 * constant.
 */
#define SUJ_HASH(x)	(((x) * 2654435761) & SUJ_HASHMASK)
62 TAILQ_ENTRY(suj_seg
) ss_next
;
63 struct jsegrec ss_rec
;
68 TAILQ_ENTRY(suj_rec
) sr_next
;
71 TAILQ_HEAD(srechd
, suj_rec
);
74 LIST_ENTRY(suj_ino
) si_next
;
75 struct srechd si_recs
;
76 struct srechd si_newrecs
;
77 struct srechd si_movs
;
78 struct jtrncrec
*si_trunc
;
89 LIST_HEAD(inohd
, suj_ino
);
92 LIST_ENTRY(suj_blk
) sb_next
;
93 struct srechd sb_recs
;
96 LIST_HEAD(blkhd
, suj_blk
);
99 LIST_ENTRY(data_blk
) db_next
;
107 LIST_ENTRY(ino_blk
) ib_next
;
112 LIST_HEAD(iblkhd
, ino_blk
);
115 LIST_ENTRY(suj_cg
) sc_next
;
116 struct blkhd sc_blkhash
[SUJ_HASHSIZE
];
117 struct inohd sc_inohash
[SUJ_HASHSIZE
];
118 struct iblkhd sc_iblkhash
[SUJ_HASHSIZE
];
119 struct ino_blk
*sc_lastiblk
;
120 struct suj_ino
*sc_lastino
;
121 struct suj_blk
*sc_lastblk
;
128 static LIST_HEAD(cghd
, suj_cg
) cghash
[SUJ_HASHSIZE
];
129 static LIST_HEAD(dblkhd
, data_blk
) dbhash
[SUJ_HASHSIZE
];
130 static struct suj_cg
*lastcg
;
131 static struct data_blk
*lastblk
;
133 static TAILQ_HEAD(seghd
, suj_seg
) allsegs
;
134 static uint64_t oldseq
;
135 static struct uufsd
*disk
= NULL
;
136 static struct fs
*fs
= NULL
;
140 * Summary statistics.
142 static uint64_t freefrags
;
143 static uint64_t freeblocks
;
144 static uint64_t freeinos
;
145 static uint64_t freedir
;
146 static uint64_t jbytes
;
147 static uint64_t jrecs
;
149 static jmp_buf jmpbuf
;
151 typedef void (*ino_visitor
)(ino_t
, ufs_lbn_t
, ufs2_daddr_t
, int);
152 static void err_suj(const char *, ...) __dead2
;
153 static void ino_trunc(ino_t
, off_t
);
154 static void ino_decr(ino_t
);
155 static void ino_adjust(struct suj_ino
*);
156 static void ino_build(struct suj_ino
*);
157 static int blk_isfree(ufs2_daddr_t
);
158 static void initsuj(void);
167 err(EX_OSERR
, "malloc(%zu)", n
);
172 * When we hit a fatal error in the journalling check, print out
173 * the error and then offer to fall back to normal fsck.
176 err_suj(const char * restrict fmt
, ...)
181 (void)fprintf(stdout
, "%s: ", cdevname
);
184 (void)vfprintf(stdout
, fmt
, ap
);
191 * Open the given provider, load superblock.
194 opendisk(const char *devnam
)
198 disk
= Malloc(sizeof(*disk
));
200 err(EX_OSERR
, "malloc(%zu)", sizeof(*disk
));
201 if (ufs_disk_fillout(disk
, devnam
) == -1) {
202 err(EX_OSERR
, "ufs_disk_fillout(%s) failed: %s", devnam
,
206 if (real_dev_bsize
== 0 && ioctl(disk
->d_fd
, DIOCGSECTORSIZE
,
207 &real_dev_bsize
) == -1)
208 real_dev_bsize
= secsize
;
210 printf("dev_bsize %u\n", real_dev_bsize
);
214 * Mark file system as clean, write the super-block back, close the disk.
217 closedisk(const char *devnam
)
223 * Recompute the fs summary info from correct cs summaries.
225 bzero(&fs
->fs_cstotal
, sizeof(struct csum_total
));
226 for (i
= 0; i
< fs
->fs_ncg
; i
++) {
227 cgsum
= &fs
->fs_cs(fs
, i
);
228 fs
->fs_cstotal
.cs_nffree
+= cgsum
->cs_nffree
;
229 fs
->fs_cstotal
.cs_nbfree
+= cgsum
->cs_nbfree
;
230 fs
->fs_cstotal
.cs_nifree
+= cgsum
->cs_nifree
;
231 fs
->fs_cstotal
.cs_ndir
+= cgsum
->cs_ndir
;
233 fs
->fs_pendinginodes
= 0;
234 fs
->fs_pendingblocks
= 0;
236 fs
->fs_time
= time(NULL
);
237 fs
->fs_mtime
= time(NULL
);
238 if (sbwrite(disk
, 0) == -1)
239 err(EX_OSERR
, "sbwrite(%s)", devnam
);
240 if (ufs_disk_close(disk
) == -1)
241 err(EX_OSERR
, "ufs_disk_close(%s)", devnam
);
248 * Lookup a cg by number in the hash so we can keep track of which cgs
249 * need stats rebuilt.
251 static struct suj_cg
*
257 if (cgx
< 0 || cgx
>= fs
->fs_ncg
)
258 err_suj("Bad cg number %d\n", cgx
);
259 if (lastcg
&& lastcg
->sc_cgx
== cgx
)
261 hd
= &cghash
[SUJ_HASH(cgx
)];
262 LIST_FOREACH(sc
, hd
, sc_next
)
263 if (sc
->sc_cgx
== cgx
) {
267 sc
= errmalloc(sizeof(*sc
));
268 bzero(sc
, sizeof(*sc
));
269 sc
->sc_cgbuf
= errmalloc(fs
->fs_bsize
);
270 sc
->sc_cgp
= (struct cg
*)sc
->sc_cgbuf
;
272 LIST_INSERT_HEAD(hd
, sc
, sc_next
);
273 if (bread(disk
, fsbtodb(fs
, cgtod(fs
, sc
->sc_cgx
)), sc
->sc_cgbuf
,
275 err_suj("Unable to read cylinder group %d\n", sc
->sc_cgx
);
281 * Lookup an inode number in the hash and allocate a suj_ino if it does
284 static struct suj_ino
*
285 ino_lookup(ino_t ino
, int creat
)
287 struct suj_ino
*sino
;
291 sc
= cg_lookup(ino_to_cg(fs
, ino
));
292 if (sc
->sc_lastino
&& sc
->sc_lastino
->si_ino
== ino
)
293 return (sc
->sc_lastino
);
294 hd
= &sc
->sc_inohash
[SUJ_HASH(ino
)];
295 LIST_FOREACH(sino
, hd
, si_next
)
296 if (sino
->si_ino
== ino
)
300 sino
= errmalloc(sizeof(*sino
));
301 bzero(sino
, sizeof(*sino
));
303 TAILQ_INIT(&sino
->si_recs
);
304 TAILQ_INIT(&sino
->si_newrecs
);
305 TAILQ_INIT(&sino
->si_movs
);
306 LIST_INSERT_HEAD(hd
, sino
, si_next
);
312 * Lookup a block number in the hash and allocate a suj_blk if it does
315 static struct suj_blk
*
316 blk_lookup(ufs2_daddr_t blk
, int creat
)
318 struct suj_blk
*sblk
;
322 sc
= cg_lookup(dtog(fs
, blk
));
323 if (sc
->sc_lastblk
&& sc
->sc_lastblk
->sb_blk
== blk
)
324 return (sc
->sc_lastblk
);
325 hd
= &sc
->sc_blkhash
[SUJ_HASH(fragstoblks(fs
, blk
))];
326 LIST_FOREACH(sblk
, hd
, sb_next
)
327 if (sblk
->sb_blk
== blk
)
331 sblk
= errmalloc(sizeof(*sblk
));
332 bzero(sblk
, sizeof(*sblk
));
334 TAILQ_INIT(&sblk
->sb_recs
);
335 LIST_INSERT_HEAD(hd
, sblk
, sb_next
);
340 static struct data_blk
*
341 dblk_lookup(ufs2_daddr_t blk
)
343 struct data_blk
*dblk
;
346 hd
= &dbhash
[SUJ_HASH(fragstoblks(fs
, blk
))];
347 if (lastblk
&& lastblk
->db_blk
== blk
)
349 LIST_FOREACH(dblk
, hd
, db_next
)
350 if (dblk
->db_blk
== blk
)
353 * The data block wasn't located, allocate a new one.
355 dblk
= errmalloc(sizeof(*dblk
));
356 bzero(dblk
, sizeof(*dblk
));
357 LIST_INSERT_HEAD(hd
, dblk
, db_next
);
363 dblk_read(ufs2_daddr_t blk
, int size
)
365 struct data_blk
*dblk
;
367 dblk
= dblk_lookup(blk
);
369 * I doubt size mismatches can happen in practice but it is trivial
372 if (size
!= dblk
->db_size
) {
375 dblk
->db_buf
= errmalloc(size
);
376 dblk
->db_size
= size
;
377 if (bread(disk
, fsbtodb(fs
, blk
), dblk
->db_buf
, size
) == -1)
378 err_suj("Failed to read data block %jd\n", blk
);
380 return (dblk
->db_buf
);
384 dblk_dirty(ufs2_daddr_t blk
)
386 struct data_blk
*dblk
;
388 dblk
= dblk_lookup(blk
);
395 struct data_blk
*dblk
;
398 for (i
= 0; i
< SUJ_HASHSIZE
; i
++) {
399 LIST_FOREACH(dblk
, &dbhash
[i
], db_next
) {
400 if (dblk
->db_dirty
== 0 || dblk
->db_size
== 0)
402 if (bwrite(disk
, fsbtodb(fs
, dblk
->db_blk
),
403 dblk
->db_buf
, dblk
->db_size
) == -1)
404 err_suj("Unable to write block %jd\n",
410 static union dinode
*
413 struct ino_blk
*iblk
;
419 blk
= ino_to_fsba(fs
, ino
);
420 sc
= cg_lookup(ino_to_cg(fs
, ino
));
421 iblk
= sc
->sc_lastiblk
;
422 if (iblk
&& iblk
->ib_blk
== blk
)
424 hd
= &sc
->sc_iblkhash
[SUJ_HASH(fragstoblks(fs
, blk
))];
425 LIST_FOREACH(iblk
, hd
, ib_next
)
426 if (iblk
->ib_blk
== blk
)
429 * The inode block wasn't located, allocate a new one.
431 iblk
= errmalloc(sizeof(*iblk
));
432 bzero(iblk
, sizeof(*iblk
));
433 iblk
->ib_buf
= errmalloc(fs
->fs_bsize
);
435 LIST_INSERT_HEAD(hd
, iblk
, ib_next
);
436 if (bread(disk
, fsbtodb(fs
, blk
), iblk
->ib_buf
, fs
->fs_bsize
) == -1)
437 err_suj("Failed to read inode block %jd\n", blk
);
439 sc
->sc_lastiblk
= iblk
;
440 off
= ino_to_fsbo(fs
, ino
);
441 if (fs
->fs_magic
== FS_UFS1_MAGIC
)
442 return (union dinode
*)&((struct ufs1_dinode
*)iblk
->ib_buf
)[off
];
444 return (union dinode
*)&((struct ufs2_dinode
*)iblk
->ib_buf
)[off
];
450 struct ino_blk
*iblk
;
455 blk
= ino_to_fsba(fs
, ino
);
456 sc
= cg_lookup(ino_to_cg(fs
, ino
));
457 iblk
= sc
->sc_lastiblk
;
458 if (iblk
&& iblk
->ib_blk
== blk
) {
462 hd
= &sc
->sc_iblkhash
[SUJ_HASH(fragstoblks(fs
, blk
))];
463 LIST_FOREACH(iblk
, hd
, ib_next
) {
464 if (iblk
->ib_blk
== blk
) {
474 iblk_write(struct ino_blk
*iblk
)
477 if (iblk
->ib_dirty
== 0)
479 if (bwrite(disk
, fsbtodb(fs
, iblk
->ib_blk
), iblk
->ib_buf
,
481 err_suj("Failed to write inode block %jd\n", iblk
->ib_blk
);
485 blk_overlaps(struct jblkrec
*brec
, ufs2_daddr_t start
, int frags
)
492 bstart
= brec
->jb_blkno
+ brec
->jb_oldfrags
;
493 bend
= bstart
+ brec
->jb_frags
;
494 if (start
< bend
&& end
> bstart
)
500 blk_equals(struct jblkrec
*brec
, ino_t ino
, ufs_lbn_t lbn
, ufs2_daddr_t start
,
504 if (brec
->jb_ino
!= ino
|| brec
->jb_lbn
!= lbn
)
506 if (brec
->jb_blkno
+ brec
->jb_oldfrags
!= start
)
508 if (brec
->jb_frags
< frags
)
514 blk_setmask(struct jblkrec
*brec
, int *mask
)
518 for (i
= brec
->jb_oldfrags
; i
< brec
->jb_oldfrags
+ brec
->jb_frags
; i
++)
523 * Determine whether a given block has been reallocated to a new location.
524 * Returns a mask of overlapping bits if any frags have been reused or
525 * zero if the block has not been re-used and the contents can be trusted.
527 * This is used to ensure that an orphaned pointer due to truncate is safe
528 * to be freed. The mask value can be used to free partial blocks.
531 blk_freemask(ufs2_daddr_t blk
, ino_t ino
, ufs_lbn_t lbn
, int frags
)
533 struct suj_blk
*sblk
;
534 struct suj_rec
*srec
;
535 struct jblkrec
*brec
;
540 * To be certain we're not freeing a reallocated block we lookup
541 * this block in the blk hash and see if there is an allocation
542 * journal record that overlaps with any fragments in the block
543 * we're concerned with. If any fragments have been reallocated
544 * the block has already been freed and re-used for another purpose.
547 sblk
= blk_lookup(blknum(fs
, blk
), 0);
550 off
= blk
- sblk
->sb_blk
;
551 TAILQ_FOREACH(srec
, &sblk
->sb_recs
, sr_next
) {
552 brec
= (struct jblkrec
*)srec
->sr_rec
;
554 * If the block overlaps but does not match
555 * exactly this record refers to the current
558 if (blk_overlaps(brec
, blk
, frags
) == 0)
560 if (blk_equals(brec
, ino
, lbn
, blk
, frags
) == 1)
563 blk_setmask(brec
, &mask
);
566 printf("blk_freemask: blk %jd sblk %jd off %d mask 0x%X\n",
567 blk
, sblk
->sb_blk
, off
, mask
);
568 return (mask
>> off
);
572 * Determine whether it is safe to follow an indirect. It is not safe
573 * if any part of the indirect has been reallocated or the last journal
574 * entry was an allocation. Just allocated indirects may not have valid
575 * pointers yet and all of their children will have their own records.
576 * It is also not safe to follow an indirect if the cg bitmap has been
577 * cleared as a new allocation may write to the block prior to the journal
580 * Returns 1 if it's safe to follow the indirect and 0 otherwise.
583 blk_isindir(ufs2_daddr_t blk
, ino_t ino
, ufs_lbn_t lbn
)
585 struct suj_blk
*sblk
;
586 struct jblkrec
*brec
;
588 sblk
= blk_lookup(blk
, 0);
591 if (TAILQ_EMPTY(&sblk
->sb_recs
))
593 brec
= (struct jblkrec
*)TAILQ_LAST(&sblk
->sb_recs
, srechd
)->sr_rec
;
594 if (blk_equals(brec
, ino
, lbn
, blk
, fs
->fs_frag
))
595 if (brec
->jb_op
== JOP_FREEBLK
)
596 return (!blk_isfree(blk
));
601 * Clear an inode from the cg bitmap. If the inode was already clear return
602 * 0 so the caller knows it does not have to check the inode contents.
605 ino_free(ino_t ino
, int mode
)
612 cg
= ino_to_cg(fs
, ino
);
613 ino
= ino
% fs
->fs_ipg
;
616 inosused
= cg_inosused(cgp
);
618 * The bitmap may never have made it to the disk so we have to
619 * conditionally clear. We can avoid writing the cg in this case.
621 if (isclr(inosused
, ino
))
624 clrbit(inosused
, ino
);
625 if (ino
< cgp
->cg_irotor
)
626 cgp
->cg_irotor
= ino
;
627 cgp
->cg_cs
.cs_nifree
++;
628 if ((mode
& IFMT
) == IFDIR
) {
630 cgp
->cg_cs
.cs_ndir
--;
638 * Free 'frags' frags starting at filesystem block 'bno' skipping any frags
642 blk_free(ufs2_daddr_t bno
, int mask
, int frags
)
644 ufs1_daddr_t fragno
, cgbno
;
651 printf("Freeing %d frags at blk %jd mask 0x%x\n",
656 cgbno
= dtogd(fs
, bno
);
657 blksfree
= cg_blksfree(cgp
);
660 * If it's not allocated we only wrote the journal entry
661 * and never the bitmaps. Here we unconditionally clear and
662 * resolve the cg summary later.
664 if (frags
== fs
->fs_frag
&& mask
== 0) {
665 fragno
= fragstoblks(fs
, cgbno
);
666 ffs_setblock(fs
, blksfree
, fragno
);
670 * deallocate the fragment
672 for (i
= 0; i
< frags
; i
++)
673 if ((mask
& (1 << i
)) == 0 && isclr(blksfree
, cgbno
+i
)) {
675 setbit(blksfree
, cgbno
+ i
);
682 * Returns 1 if the whole block starting at 'bno' is marked free and 0
686 blk_isfree(ufs2_daddr_t bno
)
690 sc
= cg_lookup(dtog(fs
, bno
));
691 return ffs_isblock(fs
, cg_blksfree(sc
->sc_cgp
), dtogd(fs
, bno
));
695 * Fetch an indirect block to find the block at a given lbn. The lbn
696 * may be negative to fetch a specific indirect block pointer or positive
697 * to fetch a specific block.
700 indir_blkatoff(ufs2_daddr_t blk
, ino_t ino
, ufs_lbn_t cur
, ufs_lbn_t lbn
)
711 level
= lbn_level(cur
);
713 err_suj("Invalid indir lbn %jd\n", lbn
);
714 if (level
== 0 && lbn
< 0)
715 err_suj("Invalid lbn %jd\n", lbn
);
716 bap2
= (void *)dblk_read(blk
, fs
->fs_bsize
);
719 base
= -(cur
+ level
);
720 for (i
= level
; i
> 0; i
--)
721 lbnadd
*= NINDIR(fs
);
723 i
= (lbn
- base
) / lbnadd
;
725 i
= (-lbn
- base
) / lbnadd
;
726 if (i
< 0 || i
>= NINDIR(fs
))
727 err_suj("Invalid indirect index %d produced by lbn %jd\n",
730 cur
= base
+ (i
* lbnadd
);
732 cur
= -(base
+ (i
* lbnadd
)) - (level
- 1);
733 if (fs
->fs_magic
== FS_UFS1_MAGIC
)
740 err_suj("Invalid lbn %jd at level 0\n", lbn
);
741 return indir_blkatoff(blk
, ino
, cur
, lbn
);
745 * Finds the disk block address at the specified lbn within the inode
746 * specified by ip. This follows the whole tree and honors di_size and
747 * di_extsize so it is a true test of reachability. The lbn may be
748 * negative if an extattr or indirect block is requested.
751 ino_blkatoff(union dinode
*ip
, ino_t ino
, ufs_lbn_t lbn
, int *frags
)
759 * Handle extattr blocks first.
761 if (lbn
< 0 && lbn
>= -NXADDR
) {
763 if (lbn
> lblkno(fs
, ip
->dp2
.di_extsize
- 1))
765 *frags
= numfrags(fs
, sblksize(fs
, ip
->dp2
.di_extsize
, lbn
));
766 return (ip
->dp2
.di_extb
[lbn
]);
769 * Now direct and indirect.
771 if (DIP(ip
, di_mode
) == IFLNK
&&
772 DIP(ip
, di_size
) < fs
->fs_maxsymlinklen
)
774 if (lbn
>= 0 && lbn
< NDADDR
) {
775 *frags
= numfrags(fs
, sblksize(fs
, DIP(ip
, di_size
), lbn
));
776 return (DIP(ip
, di_db
[lbn
]));
778 *frags
= fs
->fs_frag
;
780 for (i
= 0, tmpval
= NINDIR(fs
), cur
= NDADDR
; i
< NIADDR
; i
++,
781 tmpval
*= NINDIR(fs
), cur
= next
) {
784 return (DIP(ip
, di_ib
[i
]));
786 * Determine whether the lbn in question is within this tree.
788 if (lbn
< 0 && -lbn
>= next
)
790 if (lbn
> 0 && lbn
>= next
)
792 return indir_blkatoff(DIP(ip
, di_ib
[i
]), ino
, -cur
- i
, lbn
);
794 err_suj("lbn %jd not in ino\n", lbn
);
799 * Determine whether a block exists at a particular lbn in an inode.
800 * Returns 1 if found, 0 if not. lbn may be negative for indirects
804 blk_isat(ino_t ino
, ufs_lbn_t lbn
, ufs2_daddr_t blk
, int *frags
)
811 if (DIP(ip
, di_nlink
) == 0 || DIP(ip
, di_mode
) == 0)
813 nblk
= ino_blkatoff(ip
, ino
, lbn
, frags
);
815 return (nblk
== blk
);
819 * Clear the directory entry at diroff that should point to child. Minimal
820 * checking is done and it is assumed that this path was verified with isat.
823 ino_clrat(ino_t parent
, off_t diroff
, ino_t child
)
835 printf("Clearing inode %ju from parent %ju at offset %jd\n",
836 (uintmax_t)child
, (uintmax_t)parent
, diroff
);
838 lbn
= lblkno(fs
, diroff
);
839 doff
= blkoff(fs
, diroff
);
840 dip
= ino_read(parent
);
841 blk
= ino_blkatoff(dip
, parent
, lbn
, &frags
);
842 blksize
= sblksize(fs
, DIP(dip
, di_size
), lbn
);
843 block
= dblk_read(blk
, blksize
);
844 dp
= (struct direct
*)&block
[doff
];
845 if (dp
->d_ino
!= child
)
846 errx(1, "Inode %ju does not exist in %ju at %jd",
847 (uintmax_t)child
, (uintmax_t)parent
, diroff
);
851 * The actual .. reference count will already have been removed
852 * from the parent by the .. remref record.
857 * Determines whether a pointer to an inode exists within a directory
858 * at a specified offset. Returns the mode of the found entry.
861 ino_isat(ino_t parent
, off_t diroff
, ino_t child
, int *mode
, int *isdot
)
874 dip
= ino_read(parent
);
875 *mode
= DIP(dip
, di_mode
);
876 if ((*mode
& IFMT
) != IFDIR
) {
879 * This can happen if the parent inode
883 printf("Directory %ju has bad mode %o\n",
884 (uintmax_t)parent
, *mode
);
886 printf("Directory %ju has zero mode\n",
891 lbn
= lblkno(fs
, diroff
);
892 doff
= blkoff(fs
, diroff
);
893 blksize
= sblksize(fs
, DIP(dip
, di_size
), lbn
);
894 if (diroff
+ DIRECTSIZ(1) > DIP(dip
, di_size
) || doff
>= blksize
) {
896 printf("ino %ju absent from %ju due to offset %jd"
897 " exceeding size %jd\n",
898 (uintmax_t)child
, (uintmax_t)parent
, diroff
,
902 blk
= ino_blkatoff(dip
, parent
, lbn
, &frags
);
905 printf("Sparse directory %ju", (uintmax_t)parent
);
908 block
= dblk_read(blk
, blksize
);
910 * Walk through the records from the start of the block to be
911 * certain we hit a valid record and not some junk in the middle
912 * of a file name. Stop when we reach or pass the expected offset.
914 dpoff
= rounddown(doff
, DIRBLKSIZ
);
916 dp
= (struct direct
*)&block
[dpoff
];
919 if (dp
->d_reclen
== 0)
921 dpoff
+= dp
->d_reclen
;
922 } while (dpoff
<= doff
);
923 if (dpoff
> fs
->fs_bsize
)
924 err_suj("Corrupt directory block in dir ino %ju\n",
929 printf("ino %ju not found in %ju, lbn %jd, dpoff %d\n",
930 (uintmax_t)child
, (uintmax_t)parent
, lbn
, dpoff
);
934 * We found the item in question. Record the mode and whether it's
935 * a . or .. link for the caller.
937 if (dp
->d_ino
== child
) {
940 else if (dp
->d_namlen
== 2 &&
941 dp
->d_name
[0] == '.' && dp
->d_name
[1] == '.')
943 *mode
= DTTOIF(dp
->d_type
);
947 printf("ino %ju doesn't match dirent ino %ju in parent %ju\n",
948 (uintmax_t)child
, (uintmax_t)dp
->d_ino
, (uintmax_t)parent
);
952 #define VISIT_INDIR 0x0001
953 #define VISIT_EXT 0x0002
954 #define VISIT_ROOT 0x0004 /* Operation came via root & valid pointers. */
957 * Read an indirect level which may or may not be linked into an inode.
960 indir_visit(ino_t ino
, ufs_lbn_t lbn
, ufs2_daddr_t blk
, uint64_t *frags
,
961 ino_visitor visitor
, int flags
)
972 * Don't visit indirect blocks with contents we can't trust. This
973 * should only happen when indir_visit() is called to complete a
974 * truncate that never finished and not when a pointer is found via
979 level
= lbn_level(lbn
);
981 err_suj("Invalid level for lbn %jd\n", lbn
);
982 if ((flags
& VISIT_ROOT
) == 0 && blk_isindir(blk
, ino
, lbn
) == 0) {
984 printf("blk %jd ino %ju lbn %jd(%d) is not indir.\n",
985 blk
, (uintmax_t)ino
, lbn
, level
);
989 for (i
= level
; i
> 0; i
--)
990 lbnadd
*= NINDIR(fs
);
991 bap1
= (void *)dblk_read(blk
, fs
->fs_bsize
);
993 for (i
= 0; i
< NINDIR(fs
); i
++) {
994 if (fs
->fs_magic
== FS_UFS1_MAGIC
)
1001 nlbn
= -lbn
+ i
* lbnadd
;
1002 (*frags
) += fs
->fs_frag
;
1003 visitor(ino
, nlbn
, nblk
, fs
->fs_frag
);
1005 nlbn
= (lbn
+ 1) - (i
* lbnadd
);
1006 indir_visit(ino
, nlbn
, nblk
, frags
, visitor
, flags
);
1010 if (flags
& VISIT_INDIR
) {
1011 (*frags
) += fs
->fs_frag
;
1012 visitor(ino
, lbn
, blk
, fs
->fs_frag
);
1017 * Visit each block in an inode as specified by 'flags' and call a
1018 * callback function. The callback may inspect or free blocks. The
1019 * count of frags found according to the size in the file is returned.
1020 * This is not valid for sparse files but may be used to determine
1021 * the correct di_blocks for a file.
1024 ino_visit(union dinode
*ip
, ino_t ino
, ino_visitor visitor
, int flags
)
1035 size
= DIP(ip
, di_size
);
1036 mode
= DIP(ip
, di_mode
) & IFMT
;
1038 if ((flags
& VISIT_EXT
) &&
1039 fs
->fs_magic
== FS_UFS2_MAGIC
&& ip
->dp2
.di_extsize
) {
1040 for (i
= 0; i
< NXADDR
; i
++) {
1041 if (ip
->dp2
.di_extb
[i
] == 0)
1043 frags
= sblksize(fs
, ip
->dp2
.di_extsize
, i
);
1044 frags
= numfrags(fs
, frags
);
1046 visitor(ino
, -1 - i
, ip
->dp2
.di_extb
[i
], frags
);
1049 /* Skip datablocks for short links and devices. */
1050 if (mode
== IFBLK
|| mode
== IFCHR
||
1051 (mode
== IFLNK
&& size
< fs
->fs_maxsymlinklen
))
1053 for (i
= 0; i
< NDADDR
; i
++) {
1054 if (DIP(ip
, di_db
[i
]) == 0)
1056 frags
= sblksize(fs
, size
, i
);
1057 frags
= numfrags(fs
, frags
);
1059 visitor(ino
, i
, DIP(ip
, di_db
[i
]), frags
);
1062 * We know the following indirects are real as we're following
1063 * real pointers to them.
1065 flags
|= VISIT_ROOT
;
1066 for (i
= 0, tmpval
= NINDIR(fs
), lbn
= NDADDR
; i
< NIADDR
; i
++,
1068 nextlbn
= lbn
+ tmpval
;
1069 tmpval
*= NINDIR(fs
);
1070 if (DIP(ip
, di_ib
[i
]) == 0)
1072 indir_visit(ino
, -lbn
- i
, DIP(ip
, di_ib
[i
]), &fragcnt
, visitor
,
1079 * Null visitor function used when we just want to count blocks and
1084 null_visit(ino_t ino
, ufs_lbn_t lbn
, ufs2_daddr_t blk
, int frags
)
1091 * Recalculate di_blocks when we discover that a block allocation or
1092 * free was not successfully completed. The kernel does not roll this back
1093 * because it would be too expensive to compute which indirects were
1094 * reachable at the time the inode was written.
1097 ino_adjblks(struct suj_ino
*sino
)
1108 /* No need to adjust zero'd inodes. */
1109 if (DIP(ip
, di_mode
) == 0)
1112 * Visit all blocks and count them as well as recording the last
1113 * valid lbn in the file. If the file size doesn't agree with the
1114 * last lbn we need to truncate to fix it. Otherwise just adjust
1118 frags
= ino_visit(ip
, ino
, null_visit
, VISIT_INDIR
| VISIT_EXT
);
1119 blocks
= fsbtodb(fs
, frags
);
1121 * We assume the size and direct block list is kept coherent by
1122 * softdep. For files that have extended into indirects we truncate
1123 * to the size in the inode or the maximum size permitted by
1124 * populated indirects.
1126 if (visitlbn
>= NDADDR
) {
1127 isize
= DIP(ip
, di_size
);
1128 size
= lblktosize(fs
, visitlbn
+ 1);
1131 /* Always truncate to free any unpopulated indirects. */
1132 ino_trunc(sino
->si_ino
, isize
);
1135 if (blocks
== DIP(ip
, di_blocks
))
1138 printf("ino %ju adjusting block count from %jd to %jd\n",
1139 (uintmax_t)ino
, DIP(ip
, di_blocks
), blocks
);
1140 DIP_SET(ip
, di_blocks
, blocks
);
1145 blk_free_visit(ino_t ino
, ufs_lbn_t lbn
, ufs2_daddr_t blk
, int frags
)
1148 blk_free(blk
, blk_freemask(blk
, ino
, lbn
, frags
), frags
);
1152 * Free a block or tree of blocks that was previously rooted in ino at
1153 * the given lbn. If the lbn is an indirect all children are freed
1157 blk_free_lbn(ufs2_daddr_t blk
, ino_t ino
, ufs_lbn_t lbn
, int frags
, int follow
)
1162 mask
= blk_freemask(blk
, ino
, lbn
, frags
);
1164 if (lbn
<= -NDADDR
&& follow
&& mask
== 0)
1165 indir_visit(ino
, lbn
, blk
, &resid
, blk_free_visit
, VISIT_INDIR
);
1167 blk_free(blk
, mask
, frags
);
1171 ino_setskip(struct suj_ino
*sino
, ino_t parent
)
1176 if (ino_isat(sino
->si_ino
, DOTDOT_OFFSET
, parent
, &mode
, &isdot
))
1177 sino
->si_skipparent
= 1;
1181 ino_remref(ino_t parent
, ino_t child
, uint64_t diroff
, int isdotdot
)
1183 struct suj_ino
*sino
;
1184 struct suj_rec
*srec
;
1185 struct jrefrec
*rrec
;
1188 * Lookup this inode to see if we have a record for it.
1190 sino
= ino_lookup(child
, 0);
1192 * Tell any child directories we've already removed their
1193 * parent link cnt. Don't try to adjust our link down again.
1195 if (sino
!= NULL
&& isdotdot
== 0)
1196 ino_setskip(sino
, parent
);
1198 * No valid record for this inode. Just drop the on-disk
1201 if (sino
== NULL
|| sino
->si_hasrecs
== 0) {
1206 * Use ino_adjust() if ino_check() has already processed this
1207 * child. If we lose the last non-dot reference to a
1208 * directory it will be discarded.
1210 if (sino
->si_linkadj
) {
1213 sino
->si_dotlinks
--;
1218 * If we haven't yet processed this inode we need to make
1219 * sure we will successfully discover the lost path. If not
1220 * use nlinkadj to remember.
1222 TAILQ_FOREACH(srec
, &sino
->si_recs
, sr_next
) {
1223 rrec
= (struct jrefrec
*)srec
->sr_rec
;
1224 if (rrec
->jr_parent
== parent
&&
1225 rrec
->jr_diroff
== diroff
)
1228 sino
->si_nlinkadj
++;
1232 * Free the children of a directory when the directory is discarded.
1235 ino_free_children(ino_t ino
, ufs_lbn_t lbn
, ufs2_daddr_t blk
, int frags
)
1237 struct suj_ino
*sino
;
1246 sino
= ino_lookup(ino
, 0);
1248 skipparent
= sino
->si_skipparent
;
1251 size
= lfragtosize(fs
, frags
);
1252 block
= dblk_read(blk
, size
);
1253 dp
= (struct direct
*)&block
[0];
1254 for (dpoff
= 0; dpoff
< size
&& dp
->d_reclen
; dpoff
+= dp
->d_reclen
) {
1255 dp
= (struct direct
*)&block
[dpoff
];
1256 if (dp
->d_ino
== 0 || dp
->d_ino
== WINO
)
1258 if (dp
->d_namlen
== 1 && dp
->d_name
[0] == '.')
1260 isdotdot
= dp
->d_namlen
== 2 && dp
->d_name
[0] == '.' &&
1261 dp
->d_name
[1] == '.';
1262 if (isdotdot
&& skipparent
== 1)
1265 printf("Directory %ju removing ino %ju name %s\n",
1266 (uintmax_t)ino
, (uintmax_t)dp
->d_ino
, dp
->d_name
);
1267 diroff
= lblktosize(fs
, lbn
) + dpoff
;
1268 ino_remref(ino
, dp
->d_ino
, diroff
, isdotdot
);
1273 * Reclaim an inode, freeing all blocks and decrementing all children's
1274 * link counts. Free the inode back to the cg.
1277 ino_reclaim(union dinode
*ip
, ino_t ino
, int mode
)
1282 err_suj("Attempting to free ROOTINO\n");
1284 printf("Truncating and freeing ino %ju, nlink %d, mode %o\n",
1285 (uintmax_t)ino
, DIP(ip
, di_nlink
), DIP(ip
, di_mode
));
1287 /* We are freeing an inode or directory. */
1288 if ((DIP(ip
, di_mode
) & IFMT
) == IFDIR
)
1289 ino_visit(ip
, ino
, ino_free_children
, 0);
1290 DIP_SET(ip
, di_nlink
, 0);
1291 ino_visit(ip
, ino
, blk_free_visit
, VISIT_EXT
| VISIT_INDIR
);
1292 /* Here we have to clear the inode and release any blocks it holds. */
1293 gen
= DIP(ip
, di_gen
);
1294 if (fs
->fs_magic
== FS_UFS1_MAGIC
)
1295 bzero(ip
, sizeof(struct ufs1_dinode
));
1297 bzero(ip
, sizeof(struct ufs2_dinode
));
1298 DIP_SET(ip
, di_gen
, gen
);
1300 ino_free(ino
, mode
);
1305 * Adjust an inode's link count down by one when a directory goes away.
1316 nlink
= DIP(ip
, di_nlink
);
1317 mode
= DIP(ip
, di_mode
);
1319 err_suj("Inode %d link count %d invalid\n", ino
, nlink
);
1321 err_suj("Inode %d has a link of %d with 0 mode\n", ino
, nlink
);
1323 if ((mode
& IFMT
) == IFDIR
)
1327 if (nlink
< reqlink
) {
1329 printf("ino %ju not enough links to live %d < %d\n",
1330 (uintmax_t)ino
, nlink
, reqlink
);
1331 ino_reclaim(ip
, ino
, mode
);
1334 DIP_SET(ip
, di_nlink
, nlink
);
1339 * Adjust the inode link count to 'nlink'. If the count reaches zero
1343 ino_adjust(struct suj_ino
*sino
)
1345 struct jrefrec
*rrec
;
1346 struct suj_rec
*srec
;
1347 struct suj_ino
*stmp
;
1356 nlink
= sino
->si_nlink
;
1358 mode
= sino
->si_mode
& IFMT
;
1360 * If it's a directory with no dot links, it was truncated before
1361 * the name was cleared. We need to clear the dirent that
1364 if (mode
== IFDIR
&& nlink
== 1 && sino
->si_dotlinks
== 0) {
1365 sino
->si_nlink
= nlink
= 0;
1366 TAILQ_FOREACH(srec
, &sino
->si_recs
, sr_next
) {
1367 rrec
= (struct jrefrec
*)srec
->sr_rec
;
1368 if (ino_isat(rrec
->jr_parent
, rrec
->jr_diroff
, ino
,
1369 &recmode
, &isdot
) == 0)
1371 ino_clrat(rrec
->jr_parent
, rrec
->jr_diroff
, ino
);
1375 errx(1, "Directory %ju name not found", (uintmax_t)ino
);
1378 * If it's a directory with no real names pointing to it go ahead
1379 * and truncate it. This will free any children.
1381 if (mode
== IFDIR
&& nlink
- sino
->si_dotlinks
== 0) {
1382 sino
->si_nlink
= nlink
= 0;
1384 * Mark any .. links so they know not to free this inode
1385 * when they are removed.
1387 TAILQ_FOREACH(srec
, &sino
->si_recs
, sr_next
) {
1388 rrec
= (struct jrefrec
*)srec
->sr_rec
;
1389 if (rrec
->jr_diroff
== DOTDOT_OFFSET
) {
1390 stmp
= ino_lookup(rrec
->jr_parent
, 0);
1392 ino_setskip(stmp
, ino
);
1397 mode
= DIP(ip
, di_mode
) & IFMT
;
1398 if (nlink
> LINK_MAX
)
1399 err_suj("ino %ju nlink manipulation error, new %d, old %d\n",
1400 (uintmax_t)ino
, nlink
, DIP(ip
, di_nlink
));
1402 printf("Adjusting ino %ju, nlink %d, old link %d lastmode %o\n",
1403 (uintmax_t)ino
, nlink
, DIP(ip
, di_nlink
), sino
->si_mode
);
1406 printf("ino %ju, zero inode freeing bitmap\n",
1408 ino_free(ino
, sino
->si_mode
);
1411 /* XXX Should be an assert? */
1412 if (mode
!= sino
->si_mode
&& debug
)
1413 printf("ino %ju, mode %o != %o\n",
1414 (uintmax_t)ino
, mode
, sino
->si_mode
);
1415 if ((mode
& IFMT
) == IFDIR
)
1419 /* If the inode doesn't have enough links to live, free it. */
1420 if (nlink
< reqlink
) {
1422 printf("ino %ju not enough links to live %d < %d\n",
1423 (uintmax_t)ino
, nlink
, reqlink
);
1424 ino_reclaim(ip
, ino
, mode
);
1427 /* If required write the updated link count. */
1428 if (DIP(ip
, di_nlink
) == nlink
) {
1430 printf("ino %ju, link matches, skipping.\n",
1434 DIP_SET(ip
, di_nlink
, nlink
);
1439 * Truncate some or all blocks in an indirect, freeing any that are required
1440 * and zeroing the indirect.
1443 indir_trunc(ino_t ino
, ufs_lbn_t lbn
, ufs2_daddr_t blk
, ufs_lbn_t lastlbn
)
1458 level
= lbn_level(lbn
);
1460 err_suj("Invalid level for lbn %jd\n", lbn
);
1462 for (i
= level
; i
> 0; i
--)
1463 lbnadd
*= NINDIR(fs
);
1464 bap1
= (void *)dblk_read(blk
, fs
->fs_bsize
);
1465 bap2
= (void *)bap1
;
1466 for (i
= 0; i
< NINDIR(fs
); i
++) {
1467 if (fs
->fs_magic
== FS_UFS1_MAGIC
)
1474 nlbn
= (lbn
+ 1) - (i
* lbnadd
);
1476 * Calculate the lbn of the next indirect to
1477 * determine if any of this indirect must be
1480 next
= -(lbn
+ level
) + ((i
+1) * lbnadd
);
1481 if (next
<= lastlbn
)
1483 indir_trunc(ino
, nlbn
, nblk
, lastlbn
);
1484 /* If all of this indirect was reclaimed, free it. */
1485 nlbn
= next
- lbnadd
;
1489 nlbn
= -lbn
+ i
* lbnadd
;
1494 blk_free(nblk
, 0, fs
->fs_frag
);
1495 if (fs
->fs_magic
== FS_UFS1_MAGIC
)
1505 * Truncate an inode to the minimum of the given size or the last populated
1506 * block after any blocks beyond that size have been discarded. The kernel would allocate
1507 * the last block in the file but fsck does not and neither do we. This
1508 * code never extends files, only shrinks them.
1511 ino_trunc(ino_t ino
, off_t size
)
1515 uint64_t totalfrags
;
1527 mode
= DIP(ip
, di_mode
) & IFMT
;
1528 cursize
= DIP(ip
, di_size
);
1530 printf("Truncating ino %ju, mode %o to size %jd from size %jd\n",
1531 (uintmax_t)ino
, mode
, size
, cursize
);
1533 /* Skip datablocks for short links and devices. */
1534 if (mode
== 0 || mode
== IFBLK
|| mode
== IFCHR
||
1535 (mode
== IFLNK
&& cursize
< fs
->fs_maxsymlinklen
))
1540 lastlbn
= lblkno(fs
, blkroundup(fs
, size
));
1541 for (i
= lastlbn
; i
< NDADDR
; i
++) {
1542 if (DIP(ip
, di_db
[i
]) == 0)
1544 frags
= sblksize(fs
, cursize
, i
);
1545 frags
= numfrags(fs
, frags
);
1546 blk_free(DIP(ip
, di_db
[i
]), 0, frags
);
1547 DIP_SET(ip
, di_db
[i
], 0);
1550 * Follow indirect blocks, freeing anything required.
1552 for (i
= 0, tmpval
= NINDIR(fs
), lbn
= NDADDR
; i
< NIADDR
; i
++,
1554 nextlbn
= lbn
+ tmpval
;
1555 tmpval
*= NINDIR(fs
);
1556 /* If we're not freeing any in this indirect range skip it. */
1557 if (lastlbn
>= nextlbn
)
1559 if (DIP(ip
, di_ib
[i
]) == 0)
1561 indir_trunc(ino
, -lbn
- i
, DIP(ip
, di_ib
[i
]), lastlbn
);
1562 /* If we freed everything in this indirect free the indir. */
1565 blk_free(DIP(ip
, di_ib
[i
]), 0, frags
);
1566 DIP_SET(ip
, di_ib
[i
], 0);
1570 * Now that we've freed any whole blocks that exceed the desired
1571 * truncation size, figure out how many blocks remain and what the
1572 * last populated lbn is. We will set the size to this last lbn
1573 * rather than worrying about allocating the final lbn as the kernel
1574 * would've done. This is consistent with normal fsck behavior.
1577 totalfrags
= ino_visit(ip
, ino
, null_visit
, VISIT_INDIR
| VISIT_EXT
);
1578 if (size
> lblktosize(fs
, visitlbn
+ 1))
1579 size
= lblktosize(fs
, visitlbn
+ 1);
1581 * If we're truncating direct blocks we have to adjust frags
1584 if (visitlbn
< NDADDR
&& totalfrags
) {
1585 long oldspace
, newspace
;
1587 bn
= DIP(ip
, di_db
[visitlbn
]);
1589 err_suj("Bad blk at ino %ju lbn %jd\n",
1590 (uintmax_t)ino
, visitlbn
);
1591 oldspace
= sblksize(fs
, cursize
, visitlbn
);
1592 newspace
= sblksize(fs
, size
, visitlbn
);
1593 if (oldspace
!= newspace
) {
1594 bn
+= numfrags(fs
, newspace
);
1595 frags
= numfrags(fs
, oldspace
- newspace
);
1596 blk_free(bn
, 0, frags
);
1597 totalfrags
-= frags
;
1600 DIP_SET(ip
, di_blocks
, fsbtodb(fs
, totalfrags
));
1601 DIP_SET(ip
, di_size
, size
);
1603 * If we've truncated into the middle of a block or frag we have
1604 * to zero it here. Otherwise the file could extend into
1605 * uninitialized space later.
1607 off
= blkoff(fs
, size
);
1608 if (off
&& DIP(ip
, di_mode
) != IFDIR
) {
1612 bn
= ino_blkatoff(ip
, ino
, visitlbn
, &frags
);
1614 err_suj("Block missing from ino %ju at lbn %jd\n",
1615 (uintmax_t)ino
, visitlbn
);
1616 clrsize
= frags
* fs
->fs_fsize
;
1617 buf
= dblk_read(bn
, clrsize
);
1620 bzero(buf
, clrsize
);
1627 * Process records available for one inode and determine whether the
1628 * link count is correct or needs adjusting.
1631 ino_check(struct suj_ino
*sino
)
1633 struct suj_rec
*srec
;
1634 struct jrefrec
*rrec
;
1644 if (sino
->si_hasrecs
== 0)
1647 rrec
= (struct jrefrec
*)TAILQ_FIRST(&sino
->si_recs
)->sr_rec
;
1648 nlink
= rrec
->jr_nlink
;
1651 removes
= sino
->si_nlinkadj
;
1652 TAILQ_FOREACH(srec
, &sino
->si_recs
, sr_next
) {
1653 rrec
= (struct jrefrec
*)srec
->sr_rec
;
1654 isat
= ino_isat(rrec
->jr_parent
, rrec
->jr_diroff
,
1655 rrec
->jr_ino
, &mode
, &isdot
);
1656 if (isat
&& (mode
& IFMT
) != (rrec
->jr_mode
& IFMT
))
1657 err_suj("Inode mode/directory type mismatch %o != %o\n",
1658 mode
, rrec
->jr_mode
);
1660 printf("jrefrec: op %d ino %ju, nlink %d, parent %d, "
1661 "diroff %jd, mode %o, isat %d, isdot %d\n",
1662 rrec
->jr_op
, (uintmax_t)rrec
->jr_ino
,
1663 rrec
->jr_nlink
, rrec
->jr_parent
, rrec
->jr_diroff
,
1664 rrec
->jr_mode
, isat
, isdot
);
1665 mode
= rrec
->jr_mode
& IFMT
;
1666 if (rrec
->jr_op
== JOP_REMREF
)
1673 * The number of links that remain are the starting link count
1674 * subtracted by the total number of removes with the total
1675 * links discovered back in. An incomplete remove thus
1676 * makes no change to the link count but an add increases
1680 printf("ino %ju nlink %d newlinks %d removes %d dotlinks %d\n",
1681 (uintmax_t)ino
, nlink
, newlinks
, removes
, dotlinks
);
1684 sino
->si_linkadj
= 1;
1685 sino
->si_nlink
= nlink
;
1686 sino
->si_dotlinks
= dotlinks
;
1687 sino
->si_mode
= mode
;
1692 * Process records available for one block and determine whether it is
1693 * still allocated and whether the owning inode needs to be updated or
1697 blk_check(struct suj_blk
*sblk
)
1699 struct suj_rec
*srec
;
1700 struct jblkrec
*brec
;
1701 struct suj_ino
*sino
;
1708 * Each suj_blk actually contains records for any fragments in that
1709 * block. As a result we must evaluate each record individually.
1712 TAILQ_FOREACH(srec
, &sblk
->sb_recs
, sr_next
) {
1713 brec
= (struct jblkrec
*)srec
->sr_rec
;
1714 frags
= brec
->jb_frags
;
1715 blk
= brec
->jb_blkno
+ brec
->jb_oldfrags
;
1716 isat
= blk_isat(brec
->jb_ino
, brec
->jb_lbn
, blk
, &frags
);
1717 if (sino
== NULL
|| sino
->si_ino
!= brec
->jb_ino
) {
1718 sino
= ino_lookup(brec
->jb_ino
, 1);
1719 sino
->si_blkadj
= 1;
1722 printf("op %d blk %jd ino %ju lbn %jd frags %d isat %d (%d)\n",
1723 brec
->jb_op
, blk
, (uintmax_t)brec
->jb_ino
,
1724 brec
->jb_lbn
, brec
->jb_frags
, isat
, frags
);
1726 * If we found the block at this address we still have to
1727 * determine if we need to free the tail end that was
1728 * added by adding contiguous fragments from the same block.
1731 if (frags
== brec
->jb_frags
)
1733 mask
= blk_freemask(blk
, brec
->jb_ino
, brec
->jb_lbn
,
1737 frags
= brec
->jb_frags
- frags
;
1738 blk_free(blk
, mask
, frags
);
1742 * The block wasn't found, attempt to free it. It won't be
1743 * freed if it was actually reallocated. If this was an
1744 * allocation we don't want to follow indirects as they
1745 * may not be written yet. Any children of the indirect will
1746 * have their own records. If it's a free we need to
1747 * recursively free children.
1749 blk_free_lbn(blk
, brec
->jb_ino
, brec
->jb_lbn
, brec
->jb_frags
,
1750 brec
->jb_op
== JOP_FREEBLK
);
1755 * Walk the list of inode records for this cg and resolve moved and duplicate
1756 * inode references now that we have a complete picture.
1759 cg_build(struct suj_cg
*sc
)
1761 struct suj_ino
*sino
;
1764 for (i
= 0; i
< SUJ_HASHSIZE
; i
++)
1765 LIST_FOREACH(sino
, &sc
->sc_inohash
[i
], si_next
)
1770 * Handle inodes requiring truncation. This must be done prior to
1771 * looking up any inodes in directories.
1774 cg_trunc(struct suj_cg
*sc
)
1776 struct suj_ino
*sino
;
1779 for (i
= 0; i
< SUJ_HASHSIZE
; i
++) {
1780 LIST_FOREACH(sino
, &sc
->sc_inohash
[i
], si_next
) {
1781 if (sino
->si_trunc
) {
1782 ino_trunc(sino
->si_ino
,
1783 sino
->si_trunc
->jt_size
);
1784 sino
->si_blkadj
= 0;
1785 sino
->si_trunc
= NULL
;
1787 if (sino
->si_blkadj
)
1794 cg_adj_blk(struct suj_cg
*sc
)
1796 struct suj_ino
*sino
;
1799 for (i
= 0; i
< SUJ_HASHSIZE
; i
++) {
1800 LIST_FOREACH(sino
, &sc
->sc_inohash
[i
], si_next
) {
1801 if (sino
->si_blkadj
)
1808 * Free any partially allocated blocks and then resolve inode block
1812 cg_check_blk(struct suj_cg
*sc
)
1814 struct suj_blk
*sblk
;
1818 for (i
= 0; i
< SUJ_HASHSIZE
; i
++)
1819 LIST_FOREACH(sblk
, &sc
->sc_blkhash
[i
], sb_next
)
1824 * Walk the list of inode records for this cg, recovering any
1825 * changes which were not complete at the time of crash.
1828 cg_check_ino(struct suj_cg
*sc
)
1830 struct suj_ino
*sino
;
1833 for (i
= 0; i
< SUJ_HASHSIZE
; i
++)
1834 LIST_FOREACH(sino
, &sc
->sc_inohash
[i
], si_next
)
1839 * Write a potentially dirty cg. Recalculate the summary information and
1840 * update the superblock summary.
1843 cg_write(struct suj_cg
*sc
)
1845 ufs1_daddr_t fragno
, cgbno
, maxbno
;
1851 if (sc
->sc_dirty
== 0)
1854 * Fix the frag and cluster summary.
1857 cgp
->cg_cs
.cs_nbfree
= 0;
1858 cgp
->cg_cs
.cs_nffree
= 0;
1859 bzero(&cgp
->cg_frsum
, sizeof(cgp
->cg_frsum
));
1860 maxbno
= fragstoblks(fs
, fs
->fs_fpg
);
1861 if (fs
->fs_contigsumsize
> 0) {
1862 for (i
= 1; i
<= fs
->fs_contigsumsize
; i
++)
1863 cg_clustersum(cgp
)[i
] = 0;
1864 bzero(cg_clustersfree(cgp
), howmany(maxbno
, CHAR_BIT
));
1866 blksfree
= cg_blksfree(cgp
);
1867 for (cgbno
= 0; cgbno
< maxbno
; cgbno
++) {
1868 if (ffs_isfreeblock(fs
, blksfree
, cgbno
))
1870 if (ffs_isblock(fs
, blksfree
, cgbno
)) {
1871 ffs_clusteracct(fs
, cgp
, cgbno
, 1);
1872 cgp
->cg_cs
.cs_nbfree
++;
1875 fragno
= blkstofrags(fs
, cgbno
);
1876 blk
= blkmap(fs
, blksfree
, fragno
);
1877 ffs_fragacct(fs
, blk
, cgp
->cg_frsum
, 1);
1878 for (i
= 0; i
< fs
->fs_frag
; i
++)
1879 if (isset(blksfree
, fragno
+ i
))
1880 cgp
->cg_cs
.cs_nffree
++;
1883 * Update the superblock cg summary from our now correct values
1884 * before writing the block.
1886 fs
->fs_cs(fs
, sc
->sc_cgx
) = cgp
->cg_cs
;
1887 if (bwrite(disk
, fsbtodb(fs
, cgtod(fs
, sc
->sc_cgx
)), sc
->sc_cgbuf
,
1888 fs
->fs_bsize
) == -1)
1889 err_suj("Unable to write cylinder group %d\n", sc
->sc_cgx
);
1893 * Write out any modified inodes.
1896 cg_write_inos(struct suj_cg
*sc
)
1898 struct ino_blk
*iblk
;
1901 for (i
= 0; i
< SUJ_HASHSIZE
; i
++)
1902 LIST_FOREACH(iblk
, &sc
->sc_iblkhash
[i
], ib_next
)
1908 cg_apply(void (*apply
)(struct suj_cg
*))
1913 for (i
= 0; i
< SUJ_HASHSIZE
; i
++)
1914 LIST_FOREACH(scg
, &cghash
[i
], sc_next
)
1919 * Process the unlinked but referenced file list. Freeing all inodes.
1929 ino
= fs
->fs_sujfree
;
1933 mode
= DIP(ip
, di_mode
) & IFMT
;
1934 inon
= DIP(ip
, di_freelink
);
1935 DIP_SET(ip
, di_freelink
, 0);
1937 * XXX Should this be an errx?
1939 if (DIP(ip
, di_nlink
) == 0) {
1941 printf("Freeing unlinked ino %ju mode %o\n",
1942 (uintmax_t)ino
, mode
);
1943 ino_reclaim(ip
, ino
, mode
);
1945 printf("Skipping ino %ju mode %o with link %d\n",
1946 (uintmax_t)ino
, mode
, DIP(ip
, di_nlink
));
1952 * Append a new record to the list of records requiring processing.
1955 ino_append(union jrec
*rec
)
1957 struct jrefrec
*refrec
;
1958 struct jmvrec
*mvrec
;
1959 struct suj_ino
*sino
;
1960 struct suj_rec
*srec
;
1962 mvrec
= &rec
->rec_jmvrec
;
1963 refrec
= &rec
->rec_jrefrec
;
1964 if (debug
&& mvrec
->jm_op
== JOP_MVREF
)
1965 printf("ino move: ino %d, parent %d, diroff %jd, oldoff %jd\n",
1966 mvrec
->jm_ino
, mvrec
->jm_parent
, mvrec
->jm_newoff
,
1969 (refrec
->jr_op
== JOP_ADDREF
|| refrec
->jr_op
== JOP_REMREF
))
1970 printf("ino ref: op %d, ino %d, nlink %d, "
1971 "parent %d, diroff %jd\n",
1972 refrec
->jr_op
, refrec
->jr_ino
, refrec
->jr_nlink
,
1973 refrec
->jr_parent
, refrec
->jr_diroff
);
1974 sino
= ino_lookup(((struct jrefrec
*)rec
)->jr_ino
, 1);
1975 sino
->si_hasrecs
= 1;
1976 srec
= errmalloc(sizeof(*srec
));
1978 TAILQ_INSERT_TAIL(&sino
->si_newrecs
, srec
, sr_next
);
1982 * Add a reference adjustment to the sino list and eliminate dups. The
1983 * primary loop in ino_build_ref() checks for dups but new ones may be
1984 * created as a result of offset adjustments.
1987 ino_add_ref(struct suj_ino
*sino
, struct suj_rec
*srec
)
1989 struct jrefrec
*refrec
;
1990 struct suj_rec
*srn
;
1991 struct jrefrec
*rrn
;
1993 refrec
= (struct jrefrec
*)srec
->sr_rec
;
1995 * We walk backwards so that the oldest link count is preserved. If
1996 * an add record conflicts with a remove keep the remove. Redundant
1997 * removes are eliminated in ino_build_ref. Otherwise we keep the
1998 * oldest record at a given location.
2000 for (srn
= TAILQ_LAST(&sino
->si_recs
, srechd
); srn
;
2001 srn
= TAILQ_PREV(srn
, srechd
, sr_next
)) {
2002 rrn
= (struct jrefrec
*)srn
->sr_rec
;
2003 if (rrn
->jr_parent
!= refrec
->jr_parent
||
2004 rrn
->jr_diroff
!= refrec
->jr_diroff
)
2006 if (rrn
->jr_op
== JOP_REMREF
|| refrec
->jr_op
== JOP_ADDREF
) {
2007 rrn
->jr_mode
= refrec
->jr_mode
;
2013 * Replace the record in place with the old nlink in case
2014 * we replace the head of the list. Abandon srec as a dup.
2016 refrec
->jr_nlink
= rrn
->jr_nlink
;
2017 srn
->sr_rec
= srec
->sr_rec
;
2020 TAILQ_INSERT_TAIL(&sino
->si_recs
, srec
, sr_next
);
2024 * Create a duplicate of a reference at a previous location.
2027 ino_dup_ref(struct suj_ino
*sino
, struct jrefrec
*refrec
, off_t diroff
)
2029 struct jrefrec
*rrn
;
2030 struct suj_rec
*srn
;
2032 rrn
= errmalloc(sizeof(*refrec
));
2034 rrn
->jr_op
= JOP_ADDREF
;
2035 rrn
->jr_diroff
= diroff
;
2036 srn
= errmalloc(sizeof(*srn
));
2037 srn
->sr_rec
= (union jrec
*)rrn
;
2038 ino_add_ref(sino
, srn
);
2042 * Add a reference to the list at all known locations. We follow the offset
2043 * changes for a single instance and create duplicate add refs at each so
2044 * that we can tolerate any version of the directory block. Eliminate
2045 * removes which collide with adds that are seen in the journal. They should
2046 * not adjust the link count down.
2049 ino_build_ref(struct suj_ino
*sino
, struct suj_rec
*srec
)
2051 struct jrefrec
*refrec
;
2052 struct jmvrec
*mvrec
;
2053 struct suj_rec
*srp
;
2054 struct suj_rec
*srn
;
2055 struct jrefrec
*rrn
;
2058 refrec
= (struct jrefrec
*)srec
->sr_rec
;
2060 * Search for a mvrec that matches this offset. Whether it's an add
2061 * or a remove we can delete the mvref after creating a dup record in
2064 if (!TAILQ_EMPTY(&sino
->si_movs
)) {
2065 diroff
= refrec
->jr_diroff
;
2066 for (srn
= TAILQ_LAST(&sino
->si_movs
, srechd
); srn
; srn
= srp
) {
2067 srp
= TAILQ_PREV(srn
, srechd
, sr_next
);
2068 mvrec
= (struct jmvrec
*)srn
->sr_rec
;
2069 if (mvrec
->jm_parent
!= refrec
->jr_parent
||
2070 mvrec
->jm_newoff
!= diroff
)
2072 diroff
= mvrec
->jm_oldoff
;
2073 TAILQ_REMOVE(&sino
->si_movs
, srn
, sr_next
);
2075 ino_dup_ref(sino
, refrec
, diroff
);
2079 * If a remove wasn't eliminated by an earlier add just append it to
2082 if (refrec
->jr_op
== JOP_REMREF
) {
2083 ino_add_ref(sino
, srec
);
2087 * Walk the list of records waiting to be added to the list. We
2088 * must check for moves that apply to our current offset and remove
2089 * them from the list. Remove any duplicates to eliminate removes
2090 * with corresponding adds.
2092 TAILQ_FOREACH_SAFE(srn
, &sino
->si_newrecs
, sr_next
, srp
) {
2093 switch (srn
->sr_rec
->rec_jrefrec
.jr_op
) {
2096 * This should actually be an error we should
2097 * have a remove for every add journaled.
2099 rrn
= (struct jrefrec
*)srn
->sr_rec
;
2100 if (rrn
->jr_parent
!= refrec
->jr_parent
||
2101 rrn
->jr_diroff
!= refrec
->jr_diroff
)
2103 TAILQ_REMOVE(&sino
->si_newrecs
, srn
, sr_next
);
2107 * Once we remove the current iteration of the
2108 * record at this address we're done.
2110 rrn
= (struct jrefrec
*)srn
->sr_rec
;
2111 if (rrn
->jr_parent
!= refrec
->jr_parent
||
2112 rrn
->jr_diroff
!= refrec
->jr_diroff
)
2114 TAILQ_REMOVE(&sino
->si_newrecs
, srn
, sr_next
);
2115 ino_add_ref(sino
, srec
);
2119 * Update our diroff based on any moves that match
2120 * and remove the move.
2122 mvrec
= (struct jmvrec
*)srn
->sr_rec
;
2123 if (mvrec
->jm_parent
!= refrec
->jr_parent
||
2124 mvrec
->jm_oldoff
!= refrec
->jr_diroff
)
2126 ino_dup_ref(sino
, refrec
, mvrec
->jm_oldoff
);
2127 refrec
->jr_diroff
= mvrec
->jm_newoff
;
2128 TAILQ_REMOVE(&sino
->si_newrecs
, srn
, sr_next
);
2131 err_suj("ino_build_ref: Unknown op %d\n",
2132 srn
->sr_rec
->rec_jrefrec
.jr_op
);
2135 ino_add_ref(sino
, srec
);
2139 * Walk the list of new records and add them in-order resolving any
2140 * dups and adjusted offsets.
2143 ino_build(struct suj_ino
*sino
)
2145 struct suj_rec
*srec
;
2147 while ((srec
= TAILQ_FIRST(&sino
->si_newrecs
)) != NULL
) {
2148 TAILQ_REMOVE(&sino
->si_newrecs
, srec
, sr_next
);
2149 switch (srec
->sr_rec
->rec_jrefrec
.jr_op
) {
2152 ino_build_ref(sino
, srec
);
2156 * Add this mvrec to the queue of pending mvs.
2158 TAILQ_INSERT_TAIL(&sino
->si_movs
, srec
, sr_next
);
2161 err_suj("ino_build: Unknown op %d\n",
2162 srec
->sr_rec
->rec_jrefrec
.jr_op
);
2165 if (TAILQ_EMPTY(&sino
->si_recs
))
2166 sino
->si_hasrecs
= 0;
2170 * Modify journal records so they refer to the base block number
2171 * and a start and end frag range. This is to facilitate the discovery
2172 * of overlapping fragment allocations.
2175 blk_build(struct jblkrec
*blkrec
)
2177 struct suj_rec
*srec
;
2178 struct suj_blk
*sblk
;
2179 struct jblkrec
*blkrn
;
2184 printf("blk_build: op %d blkno %jd frags %d oldfrags %d "
2186 blkrec
->jb_op
, blkrec
->jb_blkno
, blkrec
->jb_frags
,
2187 blkrec
->jb_oldfrags
, blkrec
->jb_ino
, blkrec
->jb_lbn
);
2189 blk
= blknum(fs
, blkrec
->jb_blkno
);
2190 frag
= fragnum(fs
, blkrec
->jb_blkno
);
2191 sblk
= blk_lookup(blk
, 1);
2193 * Rewrite the record using oldfrags to indicate the offset into
2194 * the block. Leave jb_frags as the actual allocated count.
2196 blkrec
->jb_blkno
-= frag
;
2197 blkrec
->jb_oldfrags
= frag
;
2198 if (blkrec
->jb_oldfrags
+ blkrec
->jb_frags
> fs
->fs_frag
)
2199 err_suj("Invalid fragment count %d oldfrags %d\n",
2200 blkrec
->jb_frags
, frag
);
2202 * Detect dups. If we detect a dup we always discard the oldest
2203 * record as it is superseded by the new record. This speeds up
2204 * later stages but also eliminates free records which are used
2205 * to indicate that the contents of indirects can be trusted.
2207 TAILQ_FOREACH(srec
, &sblk
->sb_recs
, sr_next
) {
2208 blkrn
= (struct jblkrec
*)srec
->sr_rec
;
2209 if (blkrn
->jb_ino
!= blkrec
->jb_ino
||
2210 blkrn
->jb_lbn
!= blkrec
->jb_lbn
||
2211 blkrn
->jb_blkno
!= blkrec
->jb_blkno
||
2212 blkrn
->jb_frags
!= blkrec
->jb_frags
||
2213 blkrn
->jb_oldfrags
!= blkrec
->jb_oldfrags
)
2216 printf("Removed dup.\n");
2217 /* Discard the free which is a dup with an alloc. */
2218 if (blkrec
->jb_op
== JOP_FREEBLK
)
2220 TAILQ_REMOVE(&sblk
->sb_recs
, srec
, sr_next
);
2224 srec
= errmalloc(sizeof(*srec
));
2225 srec
->sr_rec
= (union jrec
*)blkrec
;
2226 TAILQ_INSERT_TAIL(&sblk
->sb_recs
, srec
, sr_next
);
2230 ino_build_trunc(struct jtrncrec
*rec
)
2232 struct suj_ino
*sino
;
2235 printf("ino_build_trunc: op %d ino %d, size %jd\n",
2236 rec
->jt_op
, rec
->jt_ino
, rec
->jt_size
);
2237 sino
= ino_lookup(rec
->jt_ino
, 1);
2238 if (rec
->jt_op
== JOP_SYNC
) {
2239 sino
->si_trunc
= NULL
;
2242 if (sino
->si_trunc
== NULL
|| sino
->si_trunc
->jt_size
> rec
->jt_size
)
2243 sino
->si_trunc
= rec
;
2247 * Build up tables of the operations we need to recover.
2252 struct suj_seg
*seg
;
2257 TAILQ_FOREACH(seg
, &allsegs
, ss_next
) {
2259 printf("seg %jd has %d records, oldseq %jd.\n",
2260 seg
->ss_rec
.jsr_seq
, seg
->ss_rec
.jsr_cnt
,
2261 seg
->ss_rec
.jsr_oldest
);
2263 rec
= (union jrec
*)seg
->ss_blk
;
2264 for (i
= 0; i
< seg
->ss_rec
.jsr_cnt
; off
+= JREC_SIZE
, rec
++) {
2265 /* skip the segrec. */
2266 if ((off
% real_dev_bsize
) == 0)
2268 switch (rec
->rec_jrefrec
.jr_op
) {
2276 blk_build((struct jblkrec
*)rec
);
2280 ino_build_trunc((struct jtrncrec
*)rec
);
2283 err_suj("Unknown journal operation %d (%d)\n",
2284 rec
->rec_jrefrec
.jr_op
, off
);
2292 * Prune the journal segments to those we care about based on the
2293 * oldest sequence in the newest segment. Order the segment list
2294 * based on sequence number.
2299 struct suj_seg
*seg
;
2300 struct suj_seg
*segn
;
2305 printf("Pruning up to %jd\n", oldseq
);
2306 /* First free the expired segments. */
2307 TAILQ_FOREACH_SAFE(seg
, &allsegs
, ss_next
, segn
) {
2308 if (seg
->ss_rec
.jsr_seq
>= oldseq
)
2310 TAILQ_REMOVE(&allsegs
, seg
, ss_next
);
2314 /* Next ensure that segments are ordered properly. */
2315 seg
= TAILQ_FIRST(&allsegs
);
2318 printf("Empty journal\n");
2321 newseq
= seg
->ss_rec
.jsr_seq
;
2323 seg
= TAILQ_LAST(&allsegs
, seghd
);
2324 if (seg
->ss_rec
.jsr_seq
>= newseq
)
2326 TAILQ_REMOVE(&allsegs
, seg
, ss_next
);
2327 TAILQ_INSERT_HEAD(&allsegs
, seg
, ss_next
);
2328 newseq
= seg
->ss_rec
.jsr_seq
;
2331 if (newseq
!= oldseq
) {
2332 TAILQ_FOREACH(seg
, &allsegs
, ss_next
) {
2333 printf("%jd, ", seg
->ss_rec
.jsr_seq
);
2336 err_suj("Journal file sequence mismatch %jd != %jd\n",
2340 * The kernel may asynchronously write segments which can create
2341 * gaps in the sequence space. Throw away any segments after the
2342 * gap as the kernel guarantees only those that are contiguously
2343 * reachable are marked as completed.
2346 TAILQ_FOREACH_SAFE(seg
, &allsegs
, ss_next
, segn
) {
2347 if (!discard
&& newseq
++ == seg
->ss_rec
.jsr_seq
) {
2348 jrecs
+= seg
->ss_rec
.jsr_cnt
;
2349 jbytes
+= seg
->ss_rec
.jsr_blocks
* real_dev_bsize
;
2354 printf("Journal order mismatch %jd != %jd pruning\n",
2355 newseq
-1, seg
->ss_rec
.jsr_seq
);
2356 TAILQ_REMOVE(&allsegs
, seg
, ss_next
);
2361 printf("Processing journal segments from %jd to %jd\n",
2366 * Verify the journal inode before attempting to read records.
2369 suj_verifyino(union dinode
*ip
)
2372 if (DIP(ip
, di_nlink
) != 1) {
2373 printf("Invalid link count %d for journal inode %ju\n",
2374 DIP(ip
, di_nlink
), (uintmax_t)sujino
);
2378 if ((DIP(ip
, di_flags
) & (SF_IMMUTABLE
| SF_NOUNLINK
)) !=
2379 (SF_IMMUTABLE
| SF_NOUNLINK
)) {
2380 printf("Invalid flags 0x%X for journal inode %ju\n",
2381 DIP(ip
, di_flags
), (uintmax_t)sujino
);
2385 if (DIP(ip
, di_mode
) != (IFREG
| IREAD
)) {
2386 printf("Invalid mode %o for journal inode %ju\n",
2387 DIP(ip
, di_mode
), (uintmax_t)sujino
);
2391 if (DIP(ip
, di_size
) < SUJ_MIN
) {
2392 printf("Invalid size %jd for journal inode %ju\n",
2393 DIP(ip
, di_size
), (uintmax_t)sujino
);
2397 if (DIP(ip
, di_modrev
) != fs
->fs_mtime
) {
2398 printf("Journal timestamp does not match fs mount time\n");
2406 struct jextent
*jb_extent
; /* Extent array. */
2407 int jb_avail
; /* Available extents. */
2408 int jb_used
; /* Last used extent. */
2409 int jb_head
; /* Allocator head. */
2410 int jb_off
; /* Allocator extent offset. */
2413 ufs2_daddr_t je_daddr
; /* Disk block address. */
2414 int je_blocks
; /* Disk block count. */
2417 static struct jblocks
*suj_jblocks
;
2419 static struct jblocks
*
2420 jblocks_create(void)
2422 struct jblocks
*jblocks
;
2425 jblocks
= errmalloc(sizeof(*jblocks
));
2426 jblocks
->jb_avail
= 10;
2427 jblocks
->jb_used
= 0;
2428 jblocks
->jb_head
= 0;
2429 jblocks
->jb_off
= 0;
2430 size
= sizeof(struct jextent
) * jblocks
->jb_avail
;
2431 jblocks
->jb_extent
= errmalloc(size
);
2432 bzero(jblocks
->jb_extent
, size
);
2438 * Return the next available disk block and the amount of contiguous
2439 * free space it contains.
2442 jblocks_next(struct jblocks
*jblocks
, int bytes
, int *actual
)
2444 struct jextent
*jext
;
2449 blocks
= bytes
/ disk
->d_bsize
;
2450 jext
= &jblocks
->jb_extent
[jblocks
->jb_head
];
2451 freecnt
= jext
->je_blocks
- jblocks
->jb_off
;
2453 jblocks
->jb_off
= 0;
2454 if (++jblocks
->jb_head
> jblocks
->jb_used
)
2456 jext
= &jblocks
->jb_extent
[jblocks
->jb_head
];
2457 freecnt
= jext
->je_blocks
;
2459 if (freecnt
> blocks
)
2461 *actual
= freecnt
* disk
->d_bsize
;
2462 daddr
= jext
->je_daddr
+ jblocks
->jb_off
;
2468 * Advance the allocation head by a specified number of bytes, consuming
2469 * one journal segment.
2472 jblocks_advance(struct jblocks
*jblocks
, int bytes
)
2475 jblocks
->jb_off
+= bytes
/ disk
->d_bsize
;
2479 jblocks_destroy(struct jblocks
*jblocks
)
2482 free(jblocks
->jb_extent
);
2487 jblocks_add(struct jblocks
*jblocks
, ufs2_daddr_t daddr
, int blocks
)
2489 struct jextent
*jext
;
2492 jext
= &jblocks
->jb_extent
[jblocks
->jb_used
];
2493 /* Adding the first block. */
2494 if (jext
->je_daddr
== 0) {
2495 jext
->je_daddr
= daddr
;
2496 jext
->je_blocks
= blocks
;
2499 /* Extending the last extent. */
2500 if (jext
->je_daddr
+ jext
->je_blocks
== daddr
) {
2501 jext
->je_blocks
+= blocks
;
2504 /* Adding a new extent. */
2505 if (++jblocks
->jb_used
== jblocks
->jb_avail
) {
2506 jblocks
->jb_avail
*= 2;
2507 size
= sizeof(struct jextent
) * jblocks
->jb_avail
;
2508 jext
= errmalloc(size
);
2510 bcopy(jblocks
->jb_extent
, jext
,
2511 sizeof(struct jextent
) * jblocks
->jb_used
);
2512 free(jblocks
->jb_extent
);
2513 jblocks
->jb_extent
= jext
;
2515 jext
= &jblocks
->jb_extent
[jblocks
->jb_used
];
2516 jext
->je_daddr
= daddr
;
2517 jext
->je_blocks
= blocks
;
2523 * Add a file block from the journal to the extent map. We can't read
2524 * each file block individually because the kernel treats it as a circular
2525 * buffer and segments may span mutliple contiguous blocks.
2528 suj_add_block(ino_t ino
, ufs_lbn_t lbn
, ufs2_daddr_t blk
, int frags
)
2531 jblocks_add(suj_jblocks
, fsbtodb(fs
, blk
), fsbtodb(fs
, frags
));
2537 uint8_t block
[1 * 1024 * 1024];
2538 struct suj_seg
*seg
;
2539 struct jsegrec
*recn
;
2540 struct jsegrec
*rec
;
2549 * Read records until we exhaust the journal space. If we find
2550 * an invalid record we start searching for a valid segment header
2551 * at the next block. This is because we don't have a head/tail
2552 * pointer and must recover the information indirectly. At the gap
2553 * between the head and tail we won't necessarily have a valid
2558 size
= sizeof(block
);
2559 blk
= jblocks_next(suj_jblocks
, size
, &readsize
);
2564 * Read 1MB at a time and scan for records within this block.
2566 if (bread(disk
, blk
, &block
, size
) == -1) {
2567 err_suj("Error reading journal block %jd\n",
2570 for (rec
= (void *)block
; size
; size
-= recsize
,
2571 rec
= (struct jsegrec
*)((uintptr_t)rec
+ recsize
)) {
2572 recsize
= real_dev_bsize
;
2573 if (rec
->jsr_time
!= fs
->fs_mtime
) {
2575 printf("Rec time %jd != fs mtime %jd\n",
2576 rec
->jsr_time
, fs
->fs_mtime
);
2577 jblocks_advance(suj_jblocks
, recsize
);
2580 if (rec
->jsr_cnt
== 0) {
2582 printf("Found illegal count %d\n",
2584 jblocks_advance(suj_jblocks
, recsize
);
2587 blocks
= rec
->jsr_blocks
;
2588 recsize
= blocks
* real_dev_bsize
;
2589 if (recsize
> size
) {
2591 * We may just have run out of buffer, restart
2592 * the loop to re-read from this spot.
2594 if (size
< fs
->fs_bsize
&&
2596 recsize
<= fs
->fs_bsize
)
2599 printf("Found invalid segsize %d > %d\n",
2601 recsize
= real_dev_bsize
;
2602 jblocks_advance(suj_jblocks
, recsize
);
2606 * Verify that all blocks in the segment are present.
2608 for (i
= 1; i
< blocks
; i
++) {
2609 recn
= (void *)((uintptr_t)rec
) + i
*
2611 if (recn
->jsr_seq
== rec
->jsr_seq
&&
2612 recn
->jsr_time
== rec
->jsr_time
)
2615 printf("Incomplete record %jd (%d)\n",
2617 recsize
= i
* real_dev_bsize
;
2618 jblocks_advance(suj_jblocks
, recsize
);
2621 seg
= errmalloc(sizeof(*seg
));
2622 seg
->ss_blk
= errmalloc(recsize
);
2624 bcopy((void *)rec
, seg
->ss_blk
, recsize
);
2625 if (rec
->jsr_oldest
> oldseq
)
2626 oldseq
= rec
->jsr_oldest
;
2627 TAILQ_INSERT_TAIL(&allsegs
, seg
, ss_next
);
2628 jblocks_advance(suj_jblocks
, recsize
);
2634 * Search a directory block for the SUJ_FILE.
2637 suj_find(ino_t ino
, ufs_lbn_t lbn
, ufs2_daddr_t blk
, int frags
)
2639 char block
[MAXBSIZE
];
2646 bytes
= lfragtosize(fs
, frags
);
2647 if (bread(disk
, fsbtodb(fs
, blk
), block
, bytes
) <= 0)
2648 err_suj("Failed to read ROOTINO directory block %jd\n", blk
);
2649 for (off
= 0; off
< bytes
; off
+= dp
->d_reclen
) {
2650 dp
= (struct direct
*)&block
[off
];
2651 if (dp
->d_reclen
== 0)
2655 if (dp
->d_namlen
!= strlen(SUJ_FILE
))
2657 if (bcmp(dp
->d_name
, SUJ_FILE
, dp
->d_namlen
) != 0)
2665 * Orchestrate the verification of a filesystem via the softupdates journal.
2668 suj_check(const char *filesys
)
2674 struct suj_seg
*seg
;
2675 struct suj_seg
*segn
;
2681 * Set an exit point when SUJ check failed
2683 retval
= setjmp(jmpbuf
);
2685 pwarn("UNEXPECTED SU+J INCONSISTENCY\n");
2686 TAILQ_FOREACH_SAFE(seg
, &allsegs
, ss_next
, segn
) {
2687 TAILQ_REMOVE(&allsegs
, seg
, ss_next
);
2691 if (reply("FALLBACK TO FULL FSCK") == 0) {
2699 * Find the journal inode.
2701 ip
= ino_read(ROOTINO
);
2703 ino_visit(ip
, ROOTINO
, suj_find
, 0);
2705 printf("Journal inode removed. Use tunefs to re-create.\n");
2706 sblock
.fs_flags
&= ~FS_SUJ
;
2707 sblock
.fs_sujfree
= 0;
2711 * Fetch the journal inode and verify it.
2713 jip
= ino_read(sujino
);
2714 printf("** SU+J Recovering %s\n", filesys
);
2715 if (suj_verifyino(jip
) != 0)
2718 * Build a list of journal blocks in jblocks before parsing the
2719 * available journal blocks in with suj_read().
2721 printf("** Reading %jd byte journal from inode %ju.\n",
2722 DIP(jip
, di_size
), (uintmax_t)sujino
);
2723 suj_jblocks
= jblocks_create();
2724 blocks
= ino_visit(jip
, sujino
, suj_add_block
, 0);
2725 if (blocks
!= numfrags(fs
, DIP(jip
, di_size
))) {
2726 printf("Sparse journal inode %ju.\n", (uintmax_t)sujino
);
2730 jblocks_destroy(suj_jblocks
);
2732 if (preen
|| reply("RECOVER")) {
2733 printf("** Building recovery table.\n");
2737 printf("** Resolving unreferenced inode list.\n");
2739 printf("** Processing journal entries.\n");
2741 cg_apply(cg_check_blk
);
2742 cg_apply(cg_adj_blk
);
2743 cg_apply(cg_check_ino
);
2745 if (preen
== 0 && (jrecs
> 0 || jbytes
> 0) && reply("WRITE CHANGES") == 0)
2748 * To remain idempotent with partial truncations the free bitmaps
2749 * must be written followed by indirect blocks and lastly inode
2750 * blocks. This preserves access to the modified pointers until
2755 cg_apply(cg_write_inos
);
2756 /* Write back superblock. */
2758 if (jrecs
> 0 || jbytes
> 0) {
2759 printf("** %jd journal records in %jd bytes for %.2f%% utilization\n",
2760 jrecs
, jbytes
, ((float)jrecs
/ (float)(jbytes
/ JREC_SIZE
)) * 100);
2761 printf("** Freed %jd inodes (%jd dirs) %jd blocks, and %jd frags.\n",
2762 freeinos
, freedir
, freeblocks
, freefrags
);
2773 for (i
= 0; i
< SUJ_HASHSIZE
; i
++) {
2774 LIST_INIT(&cghash
[i
]);
2775 LIST_INIT(&dbhash
[i
]);
2779 TAILQ_INIT(&allsegs
);