1 /* $NetBSD: lfs_subr.c,v 1.72 2008/01/02 11:49:12 ad Exp $ */
4 * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Konrad E. Schroder <perseant@hhhh.org>.
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
32 * Copyright (c) 1991, 1993
33 * The Regents of the University of California. All rights reserved.
35 * Redistribution and use in source and binary forms, with or without
36 * modification, are permitted provided that the following conditions
38 * 1. Redistributions of source code must retain the above copyright
39 * notice, this list of conditions and the following disclaimer.
40 * 2. Redistributions in binary form must reproduce the above copyright
41 * notice, this list of conditions and the following disclaimer in the
42 * documentation and/or other materials provided with the distribution.
43 * 3. Neither the name of the University nor the names of its contributors
44 * may be used to endorse or promote products derived from this software
45 * without specific prior written permission.
47 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
48 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
49 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
50 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
51 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
52 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
53 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
54 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
56 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * @(#)lfs_subr.c 8.4 (Berkeley) 5/8/95
62 #include <sys/cdefs.h>
63 __KERNEL_RCSID(0, "$NetBSD: lfs_subr.c,v 1.72 2008/01/02 11:49:12 ad Exp $");
65 #include <sys/param.h>
66 #include <sys/systm.h>
67 #include <sys/namei.h>
68 #include <sys/vnode.h>
70 #include <sys/mount.h>
71 #include <sys/malloc.h>
73 #include <sys/kauth.h>
75 #include <ufs/ufs/inode.h>
76 #include <ufs/lfs/lfs.h>
77 #include <ufs/lfs/lfs_extern.h>
/*
 * lfs_res_names: table of LFS_NB_COUNT strings indexed by reserve-block
 * type.  lfs_malloc prints lfs_res_names[type] in its debug messages.
 * NOTE(review): the initializer contents are not visible in this listing.
 */
82 const char *lfs_res_names
[LFS_NB_COUNT
] = {
/*
 * lfs_res_qty: table of LFS_NB_COUNT integers indexed by reserve-block
 * type.  lfs_malloc treats lfs_res_qty[type] as the number of reserve
 * blocks of that type and sums the entries of lower types to locate the
 * first slot of a type.
 * NOTE(review): the initializer contents are not visible in this listing.
 */
92 int lfs_res_qty
[LFS_NB_COUNT
] = {
/*
 * lfs_setup_resblks: build the reserve-block table and the small-object
 * pools of file system fs.
 *
 * Visible steps, in order:
 *  - run ASSERT_NO_SEGLOCK(fs);
 *  - allocate fs->lfs_resblk with room for LFS_N_TOTAL res_t entries;
 *  - mark every entry unused (inuse = 0) with no buffer (p = NULL);
 *  - initialize the LFS_RESHASH_WIDTH list heads in fs->lfs_reshash;
 *  - assign entry sizes group by group, advancing i across the table:
 *    LFS_N_SUMMARIES entries of fs->lfs_sumsize, LFS_N_SBLOCKS of
 *    LFS_SBPAD, LFS_N_IBLOCKS of fs->lfs_bsize, LFS_N_CLUSTERS and
 *    LFS_N_CLEAN of MAXPHYS, and LFS_N_BLKIOV of
 *    LFS_MARKV_MAXBLKCNT * sizeof(BLOCK_INFO);
 *  - allocate the buffer of every entry with malloc(M_SEGMENT, M_WAITOK);
 *  - initialize fs->lfs_clpool (struct lfs_cluster items),
 *    fs->lfs_segpool (struct segment items) and fs->lfs_bpppool, whose
 *    items hold maxbpp pointers to struct buf.  maxbpp is
 *    (fs->lfs_sumsize - SEGSUM_SIZE(fs)) / sizeof(int32_t) + 2, capped at
 *    segsize(fs) / fs->lfs_fsize + 2.
 *
 * NOTE(review): the return type, braces, local declarations and the last
 * argument of the first malloc call are absent from this listing.
 */
102 lfs_setup_resblks(struct lfs
*fs
)
107 ASSERT_NO_SEGLOCK(fs
);
108 fs
->lfs_resblk
= (res_t
*)malloc(LFS_N_TOTAL
* sizeof(res_t
), M_SEGMENT
,
/* Start with every reserve slot free and without a backing buffer. */
110 for (i
= 0; i
< LFS_N_TOTAL
; i
++) {
111 fs
->lfs_resblk
[i
].inuse
= 0;
112 fs
->lfs_resblk
[i
].p
= NULL
;
/* Empty hash chains; lfs_malloc links in-use reserve blocks onto them. */
114 for (i
= 0; i
< LFS_RESHASH_WIDTH
; i
++)
115 LIST_INIT(fs
->lfs_reshash
+ i
);
118 * These types of allocations can be larger than a page,
119 * so we can't use the pool subsystem for them.
121 for (i
= 0, j
= 0; j
< LFS_N_SUMMARIES
; j
++, i
++)
122 fs
->lfs_resblk
[i
].size
= fs
->lfs_sumsize
;
123 for (j
= 0; j
< LFS_N_SBLOCKS
; j
++, i
++)
124 fs
->lfs_resblk
[i
].size
= LFS_SBPAD
;
125 for (j
= 0; j
< LFS_N_IBLOCKS
; j
++, i
++)
126 fs
->lfs_resblk
[i
].size
= fs
->lfs_bsize
;
127 for (j
= 0; j
< LFS_N_CLUSTERS
; j
++, i
++)
128 fs
->lfs_resblk
[i
].size
= MAXPHYS
;
129 for (j
= 0; j
< LFS_N_CLEAN
; j
++, i
++)
130 fs
->lfs_resblk
[i
].size
= MAXPHYS
;
131 for (j
= 0; j
< LFS_N_BLKIOV
; j
++, i
++)
132 fs
->lfs_resblk
[i
].size
= LFS_MARKV_MAXBLKCNT
* sizeof(BLOCK_INFO
);
134 for (i
= 0; i
< LFS_N_TOTAL
; i
++) {
135 fs
->lfs_resblk
[i
].p
= malloc(fs
->lfs_resblk
[i
].size
,
136 M_SEGMENT
, M_WAITOK
);
140 * Initialize pools for small types (XXX is BPP small?)
142 pool_init(&fs
->lfs_clpool
, sizeof(struct lfs_cluster
), 0, 0, 0,
143 "lfsclpl", &pool_allocator_nointr
, IPL_NONE
);
144 pool_init(&fs
->lfs_segpool
, sizeof(struct segment
), 0, 0, 0,
145 "lfssegpool", &pool_allocator_nointr
, IPL_NONE
);
146 maxbpp
= ((fs
->lfs_sumsize
- SEGSUM_SIZE(fs
)) / sizeof(int32_t) + 2);
147 maxbpp
= MIN(maxbpp
, segsize(fs
) / fs
->lfs_fsize
+ 2);
148 pool_init(&fs
->lfs_bpppool
, maxbpp
* sizeof(struct buf
*), 0, 0, 0,
149 "lfsbpppl", &pool_allocator_nointr
, IPL_NONE
);
/*
 * lfs_free_resblks: release what lfs_setup_resblks created for fs.
 *
 * The three pools (lfs_bpppool, lfs_segpool, lfs_clpool) are destroyed
 * first.  Then, with lfs_lock held, each of the LFS_N_TOTAL reserve
 * entries is waited on until it is no longer in use, its buffer is freed
 * when non-NULL, and finally the fs->lfs_resblk table itself is freed.
 *
 * NOTE(review): the return type, braces, the declaration of i and the
 * last argument of mtsleep are absent from this listing.
 */
153 lfs_free_resblks(struct lfs
*fs
)
157 pool_destroy(&fs
->lfs_bpppool
);
158 pool_destroy(&fs
->lfs_segpool
);
159 pool_destroy(&fs
->lfs_clpool
);
161 mutex_enter(&lfs_lock
);
162 for (i
= 0; i
< LFS_N_TOTAL
; i
++) {
/*
 * Sleep on &fs->lfs_resblk, the channel that lfs_free wakes, until the
 * current holder has given this reserve block back.
 */
163 while (fs
->lfs_resblk
[i
].inuse
)
164 mtsleep(&fs
->lfs_resblk
, PRIBIO
+ 1, "lfs_free", 0,
166 if (fs
->lfs_resblk
[i
].p
!= NULL
)
167 free(fs
->lfs_resblk
[i
].p
, M_SEGMENT
);
/* All buffers are gone; release the table that described them. */
169 free(fs
->lfs_resblk
, M_SEGMENT
);
170 mutex_exit(&lfs_lock
);
/*
 * Map a pointer to a hash-chain index: the address is converted to
 * unsigned long, shifted right by two bits, truncated to unsigned int and
 * reduced modulo LFS_RESHASH_WIDTH.
 * NOTE(review): the header of the enclosing function is not visible in
 * this listing; vp is presumably its pointer argument - TODO confirm.
 */
176 return (unsigned int)(((unsigned long)vp
) >> 2) % LFS_RESHASH_WIDTH
;
180 * Return memory of the given size for the given purpose, or use one of a
181 * number of spare last-resort buffers, if malloc returns NULL.
/*
 * lfs_malloc: obtain size bytes for a buffer of reserve type `type`,
 * falling back to a preallocated reserve block when memory is short.
 *
 *   fs   - file system that owns the reserve blocks
 *   size - number of bytes requested
 *   type - index into lfs_res_qty and lfs_res_names
 *
 * Order of attempts visible here:
 *  1. a type with no reserve blocks (lfs_res_qty[type] == 0) uses a
 *     sleeping malloc (M_WAITOK);
 *  2. otherwise a non-sleeping malloc (M_NOWAIT) is tried;
 *  3. if that yields NULL, the slots of this type in fs->lfs_resblk are
 *     scanned under lfs_lock for one that is not in use; when none is
 *     free the caller sleeps on &fs->lfs_resblk.
 *
 * NOTE(review): the return type, return statements, most local
 * declarations, the statements that mark the chosen block in use and the
 * computation of h are absent from this listing.
 */
184 lfs_malloc(struct lfs
*fs
, size_t size
, int type
)
186 struct lfs_res_blk
*re
;
191 ASSERT_MAYBE_SEGLOCK(fs
);
194 /* If no mem allocated for this type, it just waits */
195 if (lfs_res_qty
[type
] == 0) {
196 r
= malloc(size
, M_SEGMENT
, M_WAITOK
);
200 /* Otherwise try a quick malloc, and if it works, great */
201 if ((r
= malloc(size
, M_SEGMENT
, M_NOWAIT
)) != NULL
) {
206 * If malloc returned NULL, we are forced to use one of our
207 * reserve blocks. We have on hand at least one summary block,
208 * at least one cluster block, at least one superblock,
209 * and several indirect blocks.
212 mutex_enter(&lfs_lock
);
/*
 * The scan assumes the table is grouped by type in lfs_res_qty order, so
 * the first slot of this type is the sum of the quantities of all lower
 * types.
 */
213 /* skip over blocks of other types */
214 for (i
= 0, start
= 0; i
< type
; i
++)
215 start
+= lfs_res_qty
[i
];
217 for (i
= 0; i
< lfs_res_qty
[type
]; i
++) {
218 if (fs
->lfs_resblk
[start
+ i
].inuse
== 0) {
219 re
= fs
->lfs_resblk
+ start
+ i
;
/* A reserve block must be at least as large as the request. */
222 KASSERT(re
->size
>= size
);
/*
 * Link the block onto hash chain h; lfs_free walks fs->lfs_reshash[h] to
 * recognize reserve-block pointers.
 */
225 LIST_INSERT_HEAD(&fs
->lfs_reshash
[h
], re
, res
);
227 mutex_exit(&lfs_lock
);
231 DLOG((DLOG_MALLOC
, "sleeping on %s (%d)\n",
232 lfs_res_names
[type
], lfs_res_qty
[type
]));
/* No free block of this type: wait on the channel that lfs_free wakes. */
233 mtsleep(&fs
->lfs_resblk
, PVM
, "lfs_malloc", 0,
235 DLOG((DLOG_MALLOC
, "done sleeping on %s\n",
236 lfs_res_names
[type
]));
239 mutex_exit(&lfs_lock
);
/*
 * lfs_free: give back memory p of reserve type `type`, presumably obtained
 * from lfs_malloc.
 *
 *   fs   - file system that owns the reserve blocks
 *   p    - pointer being released
 *   type - reserve type (its use is not visible in this listing)
 *
 * With lfs_lock held, hash chain fs->lfs_reshash[h] is walked.  For the
 * entry handled inside the loop the code asserts inuse == 1, unlinks it
 * from the chain and wakes sleepers on &fs->lfs_resblk.  A later loop over
 * all LFS_N_TOTAL slots panics when p equals the buffer of any reserve
 * block.  The closing comment fragment says that a pointer which was not
 * found is freed; the free call itself is not visible.
 *
 * NOTE(review): the return type, local declarations, the computation of
 * h, the test that selects the matching chain entry and the statement
 * that clears inuse are absent from this listing.
 */
244 lfs_free(struct lfs
*fs
, void *p
, int type
)
253 ASSERT_MAYBE_SEGLOCK(fs
);
255 mutex_enter(&lfs_lock
);
257 LIST_FOREACH(re
, &fs
->lfs_reshash
[h
], res
) {
/* A block found on a hash chain must currently be handed out. */
259 KASSERT(re
->inuse
== 1);
260 LIST_REMOVE(re
, res
);
/* Wake lfs_malloc and lfs_free_resblks sleepers waiting for a block. */
262 wakeup(&fs
->lfs_resblk
);
264 mutex_exit(&lfs_lock
);
/*
 * Consistency check, presumably reached only when p was not found on the
 * hash chain: p must then not be the buffer of any reserve block.
 */
269 for (i
= 0; i
< LFS_N_TOTAL
; i
++) {
270 if (fs
->lfs_resblk
[i
].p
== p
)
271 panic("lfs_free: inconsistent reserved block");
275 mutex_exit(&lfs_lock
);
278 * If we didn't find it, free it.
285 * Single thread the segment writer.
/*
 * lfs_seglock: take the segment lock of fs with the given segment flags.
 *
 *   fs    - file system whose segment writer is being locked
 *   flags - SEGM_* flags for the segment being set up
 *
 * When fs->lfs_seglock is already non-zero:
 *  - if the recorded owner (lfs_lockpid, lfs_locklwp) is the calling
 *    process and lwp, lfs_lock is dropped and flags are OR-ed into the
 *    active segment (fs->lfs_sp->seg_flags);
 *  - else if flags contains SEGM_PAGEDAEMON, lfs_lock is dropped without
 *    sleeping (the statement that follows is not visible);
 *  - otherwise the caller sleeps on &fs->lfs_seglock until it is zero.
 * A new holder records its pid and lwp id, clears fs->lfs_cleanind, logs
 * the event, takes fs->lfs_fraglock as writer, gets a segment structure
 * and its buffer-pointer array from the pools, stores flags in it and
 * calls lfs_initseg.  The final lfs_lock section belongs to the I/O-count
 * comment fragment; the statement inside it is not visible.
 *
 * NOTE(review): the return type, return statements, the declaration of sp
 * and the updates of fs->lfs_seglock are absent from this listing.
 */
288 lfs_seglock(struct lfs
*fs
, unsigned long flags
)
292 mutex_enter(&lfs_lock
);
293 if (fs
->lfs_seglock
) {
/* Re-entry by the current owner: same pid and same lwp id. */
294 if (fs
->lfs_lockpid
== curproc
->p_pid
&&
295 fs
->lfs_locklwp
== curlwp
->l_lid
) {
296 mutex_exit(&lfs_lock
);
298 fs
->lfs_sp
->seg_flags
|= flags
;
/*
 * Page daemon callers do not sleep here; presumably an error is returned
 * instead - TODO confirm against the full source.
 */
300 } else if (flags
& SEGM_PAGEDAEMON
) {
301 mutex_exit(&lfs_lock
);
304 while (fs
->lfs_seglock
) {
305 (void)mtsleep(&fs
->lfs_seglock
, PRIBIO
+ 1,
306 "lfs_seglock", 0, &lfs_lock
);
/* Record the new owner so that re-entry can be recognized above. */
312 fs
->lfs_lockpid
= curproc
->p_pid
;
313 fs
->lfs_locklwp
= curlwp
->l_lid
;
314 mutex_exit(&lfs_lock
);
315 fs
->lfs_cleanind
= 0;
318 LFS_ENTER_LOG("seglock", __FILE__
, __LINE__
, 0, flags
, curproc
->p_pid
);
320 /* Drain fragment size changes out */
321 rw_enter(&fs
->lfs_fraglock
, RW_WRITER
);
/* Both pool_get calls may sleep (PR_WAITOK); lfs_lock is not held here. */
323 sp
= fs
->lfs_sp
= pool_get(&fs
->lfs_segpool
, PR_WAITOK
);
324 sp
->bpp
= pool_get(&fs
->lfs_bpppool
, PR_WAITOK
);
325 sp
->seg_flags
= flags
;
328 (void) lfs_initseg(fs
);
331 * Keep a cumulative count of the outstanding I/O operations. If the
332 * disk drive catches up with us it could go to zero before we finish,
333 * so we artificially increment it by one until we've scheduled all of
334 * the writes we intend to do.
336 mutex_enter(&lfs_lock
);
338 mutex_exit(&lfs_lock
);
/*
 * lfs_unmark_dirop: walk the chain fs->lfs_dchainhd and clear VU_DIROP on
 * vnodes whose inode has neither IN_ADIROP nor any IN_ALLMOD flag set.
 *
 * ASSERT_NO_SEGLOCK(fs) runs first.  Under lfs_lock, doit is computed as
 * true only when LFS_UNDIROP was clear in fs->lfs_flags on entry, and the
 * flag is then set; at the end the flag is cleared and sleepers on
 * &fs->lfs_flags are woken.  For each vnode that qualifies, VU_DIROP is
 * cleared, the inode is removed from the chain, sleepers on
 * &lfs_dirvcount are woken, and lfs_lock is dropped and retaken around a
 * step that is not visible here while fs->lfs_unlockvp points at the
 * vnode.
 *
 * NOTE(review): the return type, the declarations of vp and doit, the
 * assignment of vp, the use of doit and the statement executed while
 * lfs_lock is dropped are absent from this listing.
 */
342 static void lfs_unmark_dirop(struct lfs
*);
345 lfs_unmark_dirop(struct lfs
*fs
)
347 struct inode
*ip
, *nip
;
351 ASSERT_NO_SEGLOCK(fs
);
352 mutex_enter(&lfs_lock
);
/*
 * LFS_UNDIROP is used as a busy marker: remember whether it was already
 * set before setting it.  Presumably the walk is skipped when doit is
 * false - TODO confirm.
 */
353 doit
= !(fs
->lfs_flags
& LFS_UNDIROP
);
355 fs
->lfs_flags
|= LFS_UNDIROP
;
357 mutex_exit(&lfs_lock
);
361 for (ip
= TAILQ_FIRST(&fs
->lfs_dchainhd
); ip
!= NULL
; ip
= nip
) {
/* Fetch the successor first: ip may be removed from the chain below. */
362 nip
= TAILQ_NEXT(ip
, i_lfs_dchain
);
/*
 * vp is presumably the vnode of ip.  The statement guarded by this test
 * is not visible; presumably vnodes locked exclusively by another thread
 * are skipped.
 */
364 if (VOP_ISLOCKED(vp
) == LK_EXCLOTHER
)
366 if ((VTOI(vp
)->i_flag
& (IN_ADIROP
| IN_ALLMOD
)) == 0) {
369 vp
->v_uflag
&= ~VU_DIROP
;
370 TAILQ_REMOVE(&fs
->lfs_dchainhd
, ip
, i_lfs_dchain
);
371 wakeup(&lfs_dirvcount
);
/* Publish the vnode in lfs_unlockvp for the duration of the unlocked step. */
372 fs
->lfs_unlockvp
= vp
;
373 mutex_exit(&lfs_lock
);
375 mutex_enter(&lfs_lock
);
376 fs
->lfs_unlockvp
= NULL
;
/* Walk finished: drop the busy marker and wake anyone waiting on it. */
380 fs
->lfs_flags
&= ~LFS_UNDIROP
;
381 wakeup(&fs
->lfs_flags
);
382 mutex_exit(&lfs_lock
);
/*
 * lfs_auto_segclean: scan all fs->lfs_nseg segments and clean the ones
 * that both copies of the segment flags (fs->lfs_suflags[0] and
 * fs->lfs_suflags[1]) show as SEGUSE_DIRTY and SEGUSE_EMPTY with
 * SEGUSE_ACTIVE clear.
 *
 * For each such segment the code takes lfs_lock, sleeps on
 * &fs->lfs_sbactive while waited is 0 and fs->lfs_sbactive is non-zero,
 * releases lfs_lock and calls lfs_do_segclean(fs, i).  A non-zero result
 * is reported through DLOG.  The flags of segment i in the inactive copy
 * (index 1 - fs->lfs_activesb) are then set from the active copy; whether
 * that assignment depends on the result of lfs_do_segclean is not visible
 * in this listing.
 *
 * NOTE(review): the return type, braces, the initialization and update of
 * waited, the remaining mtsleep arguments and any use of s are absent
 * from this listing.
 */
386 lfs_auto_segclean(struct lfs
*fs
)
388 int i
, error
, s
, waited
;
392 * Now that we've swapped lfs_activesb, but while we still
393 * hold the segment lock, run through the segment list marking
394 * the empty ones clean.
395 * XXX - do we really need to do them all at once?
398 for (i
= 0; i
< fs
->lfs_nseg
; i
++) {
399 if ((fs
->lfs_suflags
[0][i
] &
400 (SEGUSE_ACTIVE
| SEGUSE_DIRTY
| SEGUSE_EMPTY
)) ==
401 (SEGUSE_DIRTY
| SEGUSE_EMPTY
) &&
402 (fs
->lfs_suflags
[1][i
] &
403 (SEGUSE_ACTIVE
| SEGUSE_DIRTY
| SEGUSE_EMPTY
)) ==
404 (SEGUSE_DIRTY
| SEGUSE_EMPTY
)) {
406 /* Make sure the sb is written before we clean */
407 mutex_enter(&lfs_lock
);
409 while (waited
== 0 && fs
->lfs_sbactive
)
410 mtsleep(&fs
->lfs_sbactive
, PRIBIO
+1, "lfs asb",
413 mutex_exit(&lfs_lock
);
416 if ((error
= lfs_do_segclean(fs
, i
)) != 0) {
417 DLOG((DLOG_CLEAN
, "lfs_auto_segclean: lfs_do_segclean returned %d for seg %d\n", error
, i
));
420 fs
->lfs_suflags
[1 - fs
->lfs_activesb
][i
] =
421 fs
->lfs_suflags
[fs
->lfs_activesb
][i
];
427 * Single thread the segment writer.
/*
 * lfs_segunlock: release one hold on the segment lock of fs.
 *
 * The lock must be held (KASSERT(LFS_SEGLOCK_HELD(fs))).  Three cases are
 * visible:
 *  - fs->lfs_seglock == 1 (last hold): SEGM_SYNC and SEGM_CKP are copied
 *    out of the segment flags, fs->lfs_offset is reduced by the summary
 *    size converted with btofsb, the buffer-pointer array is returned to
 *    lfs_bpppool, fs->lfs_iocount is decremented and sleepers on it are
 *    woken when it is <= 1.  Afterwards either the standard path or the
 *    checkpoint path runs (the selecting condition is not visible).  The
 *    checkpoint path waits for outstanding I/O, calls lfs_writesuper for
 *    the active and the inactive superblock offsets, runs
 *    lfs_auto_segclean unless the mount has IMNT_UNMOUNT set, and flips
 *    fs->lfs_activesb.  Both paths wake sleepers on &fs->lfs_seglock.
 *    Finally fs->lfs_fraglock is released and lfs_unmark_dirop is called,
 *    presumably only when do_unmark_dirop was set - TODO confirm.
 *  - fs->lfs_seglock == 0: lfs_lock is dropped and the kernel panics.
 *  - otherwise: only lfs_lock is dropped here; the decrement of the hold
 *    count is not visible.
 *
 * NOTE(review): the return type, the declaration and assignment of sp,
 * every update of fs->lfs_seglock, the assignment to do_unmark_dirop and
 * several conditions are absent from this listing.
 */
430 lfs_segunlock(struct lfs
*fs
)
433 unsigned long sync
, ckp
;
435 int do_unmark_dirop
= 0;
439 mutex_enter(&lfs_lock
);
440 KASSERT(LFS_SEGLOCK_HELD(fs
));
/* Last hold on the lock: do the real unlock work. */
441 if (fs
->lfs_seglock
== 1) {
/*
 * The statement guarded by this test is not visible; presumably it sets
 * do_unmark_dirop when the segment is neither SEGM_PROT nor SEGM_CLEAN
 * and the file system is not starved for segments.
 */
442 if ((sp
->seg_flags
& (SEGM_PROT
| SEGM_CLEAN
)) == 0 &&
443 LFS_STARVED_FOR_SEGS(fs
) == 0)
445 mutex_exit(&lfs_lock
);
/* Keep local copies: sp may be returned to its pool before they are used. */
446 sync
= sp
->seg_flags
& SEGM_SYNC
;
447 ckp
= sp
->seg_flags
& SEGM_CKP
;
449 /* We should have a segment summary, and nothing else */
450 KASSERT(sp
->cbpp
== sp
->bpp
+ 1);
452 /* Free allocated segment summary */
453 fs
->lfs_offset
-= btofsb(fs
, fs
->lfs_sumsize
);
457 pool_put(&fs
->lfs_bpppool
, sp
->bpp
);
461 * If we're not sync, we're done with sp, get rid of it.
462 * Otherwise, we keep a local copy around but free
463 * fs->lfs_sp so another process can use it (we have to
464 * wait but they don't have to wait for us).
467 pool_put(&fs
->lfs_segpool
, sp
);
471 * If the I/O count is non-zero, sleep until it reaches zero.
472 * At the moment, the user's process hangs around so we can
475 mutex_enter(&lfs_lock
);
476 if (--fs
->lfs_iocount
== 0) {
477 LFS_DEBUG_COUNTLOCKED("lfs_segunlock");
479 if (fs
->lfs_iocount
<= 1)
480 wakeup(&fs
->lfs_iocount
);
481 mutex_exit(&lfs_lock
);
483 * If we're not checkpointing, we don't have to block
484 * other processes to wait for a synchronous write
489 LFS_ENTER_LOG("segunlock_std", __FILE__
, __LINE__
, 0, 0, curproc
->p_pid
);
491 mutex_enter(&lfs_lock
);
495 mutex_exit(&lfs_lock
);
496 wakeup(&fs
->lfs_seglock
);
499 * We let checkpoints happen asynchronously. That means
500 * that during recovery, we have to roll forward between
501 * the two segments described by the first and second
502 * superblocks to make sure that the checkpoint described
503 * by a superblock completed.
505 mutex_enter(&lfs_lock
);
/* A synchronous checkpoint waits for all outstanding file system I/O. */
506 while (ckp
&& sync
&& fs
->lfs_iocount
)
507 (void)mtsleep(&fs
->lfs_iocount
, PRIBIO
+ 1,
508 "lfs_iocount", 0, &lfs_lock
);
/* Any synchronous unlock waits for the I/O of this particular segment. */
509 while (sync
&& sp
->seg_iocount
) {
510 (void)mtsleep(&sp
->seg_iocount
, PRIBIO
+ 1,
511 "seg_iocount", 0, &lfs_lock
);
/*
 * NOTE(review): sp is a pointer but is printed with %x; %p would match
 * the argument type.  Left unchanged here because this is a
 * documentation-only edit.
 */
512 DLOG((DLOG_SEG
, "sleeping on iocount %x == %d\n", sp
, sp
->seg_iocount
));
514 mutex_exit(&lfs_lock
);
516 pool_put(&fs
->lfs_segpool
, sp
);
520 /* If we *know* everything's on disk, write both sbs */
521 /* XXX should wait for this one */
523 lfs_writesuper(fs
, fs
->lfs_sboffs
[fs
->lfs_activesb
]);
524 lfs_writesuper(fs
, fs
->lfs_sboffs
[1 - fs
->lfs_activesb
]);
525 if (!(fs
->lfs_ivnode
->v_mount
->mnt_iflag
& IMNT_UNMOUNT
)) {
526 lfs_auto_segclean(fs
);
527 /* If sync, we can clean the remainder too */
529 lfs_auto_segclean(fs
);
531 fs
->lfs_activesb
= 1 - fs
->lfs_activesb
;
533 LFS_ENTER_LOG("segunlock_ckp", __FILE__
, __LINE__
, 0, 0, curproc
->p_pid
);
535 mutex_enter(&lfs_lock
);
539 mutex_exit(&lfs_lock
);
540 wakeup(&fs
->lfs_seglock
);
542 /* Reenable fragment size changes */
543 rw_exit(&fs
->lfs_fraglock
);
545 lfs_unmark_dirop(fs
);
546 } else if (fs
->lfs_seglock
== 0) {
547 mutex_exit(&lfs_lock
);
548 panic ("Seglock not held");
551 mutex_exit(&lfs_lock
);
556 * Drain dirops and start writer.
558 * No simple_locks are held when we enter and none are held when we return.
/*
 * lfs_writer_enter: wait until no directory operations are in progress on
 * fs before the caller starts writing.
 *
 *   fs    - file system being flushed
 *   wmesg - wait message passed to mtsleep
 *
 * With lfs_lock held, the caller sleeps on &fs->lfs_writer at priority
 * PRIBIO+1 for as long as fs->lfs_dirops is greater than zero; the result
 * of mtsleep is stored in error.  lfs_lock is released before returning.
 *
 * NOTE(review): the return type, the declaration and later use of error,
 * the last mtsleep argument and any update of fs->lfs_writer are absent
 * from this listing.  lfs_writer_leave decrements fs->lfs_writer, so an
 * increment presumably happens here - TODO confirm.
 */
561 lfs_writer_enter(struct lfs
*fs
, const char *wmesg
)
565 ASSERT_MAYBE_SEGLOCK(fs
);
566 mutex_enter(&lfs_lock
);
568 /* disallow dirops during flush */
571 while (fs
->lfs_dirops
> 0) {
573 error
= mtsleep(&fs
->lfs_writer
, PRIBIO
+1, wmesg
, 0,
581 mutex_exit(&lfs_lock
);
/*
 * lfs_writer_leave: drop one writer reference on fs.
 *
 * fs->lfs_writer is decremented under lfs_lock and dowakeup records
 * whether the count reached zero.  After the lock is released, sleepers
 * on &fs->lfs_dirops are woken; presumably only when dowakeup is set
 * (the guarding line is not visible in this listing).
 *
 * NOTE(review): the return type and the declaration of dowakeup are
 * absent from this listing.
 */
587 lfs_writer_leave(struct lfs
*fs
)
591 ASSERT_MAYBE_SEGLOCK(fs
);
592 mutex_enter(&lfs_lock
);
/* True exactly when this was the last outstanding writer reference. */
593 dowakeup
= !(--fs
->lfs_writer
);
594 mutex_exit(&lfs_lock
);
596 wakeup(&fs
->lfs_dirops
);
600 * Unlock, wait for the cleaner, then relock to where we were before.
601 * To be used only at a fairly high level, to address a paucity of free
602 * segments propagated back from lfs_gop_write().
/*
 * lfs_segunlock_relock: give up the segment lock of fs completely, wait
 * until the file system is no longer starved for segments, and take the
 * lock again.
 *
 * Visible steps, in order:
 *  - remember the current hold count in n;
 *  - write out what has been gathered so far with lfs_writeseg;
 *  - set LFS_CLEANER_MUST_CLEAN in the cleaner info block and sync it;
 *  - save the segment flags, then add SEGM_PROT to the live flags;
 *  - loop while fs->lfs_seglock is non-zero (the loop body is not visible;
 *    presumably it calls lfs_segunlock);
 *  - wake the cleaner and sleep on &fs->lfs_avail under lfs_lock while
 *    LFS_STARVED_FOR_SEGS(fs) holds;
 *  - call lfs_seglock with the saved flags (presumably n times; the loop
 *    header is not visible);
 *  - clear LFS_CLEANER_MUST_CLEAN and sync the cleaner info block again.
 *
 * NOTE(review): the return type, the declarations of cip, bp and
 * seg_flags, and the last mtsleep argument are absent from this listing.
 */
605 lfs_segunlock_relock(struct lfs
*fs
)
607 int n
= fs
->lfs_seglock
;
615 /* Write anything we've already gathered to disk */
616 lfs_writeseg(fs
, fs
->lfs_sp
);
619 LFS_CLEANERINFO(cip
, fs
, bp
);
620 cip
->flags
|= LFS_CLEANER_MUST_CLEAN
;
621 LFS_SYNC_CLEANERINFO(cip
, fs
, bp
, 1);
623 /* Save segment flags for later */
624 seg_flags
= fs
->lfs_sp
->seg_flags
;
626 fs
->lfs_sp
->seg_flags
|= SEGM_PROT
; /* Don't unmark dirop nodes */
/* Release every hold; the statement executed by this loop is not visible. */
627 while(fs
->lfs_seglock
)
630 /* Wait for the cleaner */
631 lfs_wakeup_cleaner(fs
);
632 mutex_enter(&lfs_lock
);
633 while (LFS_STARVED_FOR_SEGS(fs
))
634 mtsleep(&fs
->lfs_avail
, PRIBIO
, "relock", 0,
636 mutex_exit(&lfs_lock
);
638 /* Put the segment lock back the way it was. */
640 lfs_seglock(fs
, seg_flags
);
642 /* Cleaner can relax now */
643 LFS_CLEANERINFO(cip
, fs
, bp
);
644 cip
->flags
&= ~LFS_CLEANER_MUST_CLEAN
;
645 LFS_SYNC_CLEANERINFO(cip
, fs
, bp
, 1);
651 * Wake up the cleaner, provided that nowrap is not set.
/*
 * lfs_wakeup_cleaner: wake threads sleeping on &fs->lfs_nextseg and on
 * &lfs_allclean_wakeup.
 *
 * The test fs->lfs_nowrap > 0 comes first; the statement it guards is not
 * visible in this listing.  Presumably it is an early return, so that no
 * wakeup is issued while nowrap is set - TODO confirm.
 *
 * NOTE(review): the return type and braces are absent from this listing.
 */
654 lfs_wakeup_cleaner(struct lfs
*fs
)
656 if (fs
->lfs_nowrap
> 0)
659 wakeup(&fs
->lfs_nextseg
);
660 wakeup(&lfs_allclean_wakeup
);