/*	$NetBSD: lfs_subr.c,v 1.86 2015/10/03 08:28:16 dholland Exp $	*/

/*-
 * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Konrad E. Schroder <perseant@hhhh.org>.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
/*
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)lfs_subr.c	8.4 (Berkeley) 5/8/95
 */
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: lfs_subr.c,v 1.86 2015/10/03 08:28:16 dholland Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/mount.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/kauth.h>

#include <ufs/lfs/ulfs_inode.h>
#include <ufs/lfs/lfs.h>
#include <ufs/lfs/lfs_accessors.h>
#include <ufs/lfs/lfs_kernel.h>
#include <ufs/lfs/lfs_extern.h>
const char *lfs_res_names[LFS_NB_COUNT] = {
	"summary",
	"superblock",
	"file block",
	"cluster",
	"clean",
	"blkiov",
};

int lfs_res_qty[LFS_NB_COUNT] = {
	LFS_N_SUMMARIES,
	LFS_N_SBLOCKS,
	LFS_N_IBLOCKS,
	LFS_N_CLUSTERS,
	LFS_N_CLEAN,
	LFS_N_BLKIOV,
};
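
/*
 * Both tables above are indexed by the LFS_NB_* reserved-buffer type.
 * Their order must match the size-assignment loops in
 * lfs_setup_resblks() below: lfs_malloc() locates the reserve blocks
 * for a type by summing lfs_res_qty[] over all preceding types.
 */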
void
lfs_setup_resblks(struct lfs *fs)
{
	int i, j;
	int maxbpp;

	ASSERT_NO_SEGLOCK(fs);
	fs->lfs_resblk = malloc(LFS_N_TOTAL * sizeof(res_t), M_SEGMENT,
				M_WAITOK);
	for (i = 0; i < LFS_N_TOTAL; i++) {
		fs->lfs_resblk[i].inuse = 0;
		fs->lfs_resblk[i].p = NULL;
	}
	for (i = 0; i < LFS_RESHASH_WIDTH; i++)
		LIST_INIT(fs->lfs_reshash + i);

	/*
	 * These types of allocations can be larger than a page,
	 * so we can't use the pool subsystem for them.
	 */
	for (i = 0, j = 0; j < LFS_N_SUMMARIES; j++, i++)
		fs->lfs_resblk[i].size = lfs_sb_getsumsize(fs);
	for (j = 0; j < LFS_N_SBLOCKS; j++, i++)
		fs->lfs_resblk[i].size = LFS_SBPAD;
	for (j = 0; j < LFS_N_IBLOCKS; j++, i++)
		fs->lfs_resblk[i].size = lfs_sb_getbsize(fs);
	for (j = 0; j < LFS_N_CLUSTERS; j++, i++)
		fs->lfs_resblk[i].size = MAXPHYS;
	for (j = 0; j < LFS_N_CLEAN; j++, i++)
		fs->lfs_resblk[i].size = MAXPHYS;
	for (j = 0; j < LFS_N_BLKIOV; j++, i++)
		fs->lfs_resblk[i].size = LFS_MARKV_MAXBLKCNT * sizeof(BLOCK_INFO);

	for (i = 0; i < LFS_N_TOTAL; i++) {
		fs->lfs_resblk[i].p = malloc(fs->lfs_resblk[i].size,
					     M_SEGMENT, M_WAITOK);
	}

	/*
	 * Initialize pools for small types (XXX is BPP small?)
	 */
	pool_init(&fs->lfs_clpool, sizeof(struct lfs_cluster), 0, 0, 0,
		"lfsclpl", &pool_allocator_nointr, IPL_NONE);
	pool_init(&fs->lfs_segpool, sizeof(struct segment), 0, 0, 0,
		"lfssegpool", &pool_allocator_nointr, IPL_NONE);
	/* XXX: should this int32 be 32/64? */
	maxbpp = ((lfs_sb_getsumsize(fs) - SEGSUM_SIZE(fs)) / sizeof(int32_t) + 2);
	maxbpp = MIN(maxbpp, lfs_segsize(fs) / lfs_sb_getfsize(fs) + 2);
	pool_init(&fs->lfs_bpppool, maxbpp * sizeof(struct buf *), 0, 0, 0,
		"lfsbpppl", &pool_allocator_nointr, IPL_NONE);
}
void
lfs_free_resblks(struct lfs *fs)
{
	int i;

	pool_destroy(&fs->lfs_bpppool);
	pool_destroy(&fs->lfs_segpool);
	pool_destroy(&fs->lfs_clpool);

	mutex_enter(&lfs_lock);
	for (i = 0; i < LFS_N_TOTAL; i++) {
		while (fs->lfs_resblk[i].inuse)
			mtsleep(&fs->lfs_resblk, PRIBIO + 1, "lfs_free", 0,
				&lfs_lock);
		if (fs->lfs_resblk[i].p != NULL)
			free(fs->lfs_resblk[i].p, M_SEGMENT);
	}
	free(fs->lfs_resblk, M_SEGMENT);
	mutex_exit(&lfs_lock);
}
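
/*
 * Hash a pointer into the reserve-block hash table.  The low bits of
 * any pointer we hand out are zero (all reserve blocks are at least
 * word-aligned), so shift two bits off before taking the modulus to
 * spread entries across the LFS_RESHASH_WIDTH chains.
 */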
static unsigned int
lfs_mhash(void *vp)
{
	return (unsigned int)(((unsigned long)vp) >> 2) % LFS_RESHASH_WIDTH;
}
/*
 * Return memory of the given size for the given purpose, or use one of a
 * number of spare last-resort buffers, if malloc returns NULL.
 */
void *
lfs_malloc(struct lfs *fs, size_t size, int type)
{
	struct lfs_res_blk *re;
	int i;
	int start;
	unsigned int h;
	void *r;

	ASSERT_MAYBE_SEGLOCK(fs);
	r = NULL;

	/* If no mem allocated for this type, it just waits */
	if (lfs_res_qty[type] == 0) {
		r = malloc(size, M_SEGMENT, M_WAITOK);
		return r;
	}

	/* Otherwise try a quick malloc, and if it works, great */
	if ((r = malloc(size, M_SEGMENT, M_NOWAIT)) != NULL) {
		return r;
	}

	/*
	 * If malloc returned NULL, we are forced to use one of our
	 * reserve blocks.  We have on hand at least one summary block,
	 * at least one cluster block, at least one superblock,
	 * and several indirect blocks.
	 */

	mutex_enter(&lfs_lock);
	/* skip over blocks of other types */
	for (i = 0, start = 0; i < type; i++)
		start += lfs_res_qty[i];
	while (r == NULL) {
		for (i = 0; i < lfs_res_qty[type]; i++) {
			if (fs->lfs_resblk[start + i].inuse == 0) {
				re = fs->lfs_resblk + start + i;
				re->inuse = 1;
				r = re->p;
				KASSERT(re->size >= size);
				h = lfs_mhash(r);
				LIST_INSERT_HEAD(&fs->lfs_reshash[h], re, res);
				mutex_exit(&lfs_lock);
				return r;
			}
		}
		DLOG((DLOG_MALLOC, "sleeping on %s (%d)\n",
		      lfs_res_names[type], lfs_res_qty[type]));
		mtsleep(&fs->lfs_resblk, PVM, "lfs_malloc", 0,
			&lfs_lock);
		DLOG((DLOG_MALLOC, "done sleeping on %s\n",
		      lfs_res_names[type]));
	}
	/* NOTREACHED */
	mutex_exit(&lfs_lock);
	return r;
}
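
/*
 * Typical usage pairs lfs_malloc() with lfs_free() of the same type,
 * e.g. (illustrative only):
 *
 *	void *sum = lfs_malloc(fs, lfs_sb_getsumsize(fs), LFS_NB_SUMMARY);
 *	...
 *	lfs_free(fs, sum, LFS_NB_SUMMARY);
 *
 * lfs_free() below consults the reserve hash to tell whether a pointer
 * is a reserve block (marked free and returned to the pool) or plain
 * malloc'ed memory (released to M_SEGMENT).
 */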
void
lfs_free(struct lfs *fs, void *p, int type)
{
	unsigned int h;
	res_t *re;
#ifdef DEBUG
	int i;
#endif

	ASSERT_MAYBE_SEGLOCK(fs);
	h = lfs_mhash(p);
	mutex_enter(&lfs_lock);

	LIST_FOREACH(re, &fs->lfs_reshash[h], res) {
		if (re->p == p) {
			KASSERT(re->inuse == 1);
			LIST_REMOVE(re, res);
			re->inuse = 0;
			wakeup(&fs->lfs_resblk);
			mutex_exit(&lfs_lock);
			return;
		}
	}

#ifdef DEBUG
	for (i = 0; i < LFS_N_TOTAL; i++) {
		if (fs->lfs_resblk[i].p == p)
			panic("lfs_free: inconsistent reserved block");
	}
#endif

	mutex_exit(&lfs_lock);

	/*
	 * If we didn't find it, free it.
	 */
	free(p, M_SEGMENT);
}
/*
 * Single thread the segment writer.
 */
int
lfs_seglock(struct lfs *fs, unsigned long flags)
{
	struct segment *sp;

	mutex_enter(&lfs_lock);
	if (fs->lfs_seglock) {
		if (fs->lfs_lockpid == curproc->p_pid &&
		    fs->lfs_locklwp == curlwp->l_lid) {
			++fs->lfs_seglock;
			fs->lfs_sp->seg_flags |= flags;
			mutex_exit(&lfs_lock);
			return 0;
		} else if (flags & SEGM_PAGEDAEMON) {
			mutex_exit(&lfs_lock);
			return EWOULDBLOCK;
		} else {
			while (fs->lfs_seglock) {
				(void)mtsleep(&fs->lfs_seglock, PRIBIO + 1,
					"lfs_seglock", 0, &lfs_lock);
			}
		}
	}

	fs->lfs_seglock = 1;
	fs->lfs_lockpid = curproc->p_pid;
	fs->lfs_locklwp = curlwp->l_lid;
	mutex_exit(&lfs_lock);
	fs->lfs_cleanind = 0;

#ifdef DEBUG
	LFS_ENTER_LOG("seglock", __FILE__, __LINE__, 0, flags, curproc->p_pid);
#endif
	/* Drain fragment size changes out */
	rw_enter(&fs->lfs_fraglock, RW_WRITER);

	sp = fs->lfs_sp = pool_get(&fs->lfs_segpool, PR_WAITOK);
	sp->bpp = pool_get(&fs->lfs_bpppool, PR_WAITOK);
	sp->seg_flags = flags;
	sp->vp = NULL;
	sp->seg_iocount = 0;
	(void) lfs_initseg(fs);

	/*
	 * Keep a cumulative count of the outstanding I/O operations.  If the
	 * disk drive catches up with us it could go to zero before we finish,
	 * so we artificially increment it by one until we've scheduled all of
	 * the writes we intend to do.
	 */
	mutex_enter(&lfs_lock);
	++fs->lfs_iocount;
	fs->lfs_startseg = lfs_sb_getcurseg(fs);
	mutex_exit(&lfs_lock);
	return 0;
}
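
/*
 * Callers bracket segment writing with the segment lock, e.g.
 * (illustrative only):
 *
 *	lfs_seglock(fs, SEGM_PROT);
 *	... gather and write blocks through fs->lfs_sp ...
 *	lfs_segunlock(fs);
 *
 * The lock counts recursively for the owning LWP, so nested
 * seglock/segunlock pairs are safe; only the outermost unlock tears
 * down fs->lfs_sp and wakes waiters.
 */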
static void lfs_unmark_dirop(struct lfs *);

static void
lfs_unmark_dirop(struct lfs *fs)
{
	struct inode *ip, *nip;
	struct vnode *vp;
	int doit;

	ASSERT_NO_SEGLOCK(fs);
	mutex_enter(&lfs_lock);
	doit = !(fs->lfs_flags & LFS_UNDIROP);
	if (doit)
		fs->lfs_flags |= LFS_UNDIROP;
	if (!doit) {
		mutex_exit(&lfs_lock);
		return;
	}

	for (ip = TAILQ_FIRST(&fs->lfs_dchainhd); ip != NULL; ip = nip) {
		nip = TAILQ_NEXT(ip, i_lfs_dchain);
		vp = ITOV(ip);
		if ((ip->i_flag & (IN_ADIROP | IN_CDIROP)) == IN_CDIROP) {
			--lfs_dirvcount;
			--fs->lfs_dirvcount;
			vp->v_uflag &= ~VU_DIROP;
			TAILQ_REMOVE(&fs->lfs_dchainhd, ip, i_lfs_dchain);
			wakeup(&lfs_dirvcount);
			fs->lfs_unlockvp = vp;
			mutex_exit(&lfs_lock);
			vrele(vp);
			mutex_enter(&lfs_lock);
			fs->lfs_unlockvp = NULL;
			ip->i_flag &= ~IN_CDIROP;
		}
	}

	fs->lfs_flags &= ~LFS_UNDIROP;
	wakeup(&fs->lfs_flags);
	mutex_exit(&lfs_lock);
}
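
/*
 * Note that vrele() may take locks or reclaim the vnode, so it cannot
 * be called with lfs_lock held; fs->lfs_unlockvp marks the vnode that
 * is in transit while the lock is dropped.
 */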
static void
lfs_auto_segclean(struct lfs *fs)
{
	int i, error, s, waited;

	ASSERT_SEGLOCK(fs);
	/*
	 * Now that we've swapped lfs_activesb, but while we still
	 * hold the segment lock, run through the segment list marking
	 * the empty ones clean.
	 * XXX - do we really need to do them all at once?
	 */
	waited = 0;
	for (i = 0; i < lfs_sb_getnseg(fs); i++) {
		if ((fs->lfs_suflags[0][i] &
		     (SEGUSE_ACTIVE | SEGUSE_DIRTY | SEGUSE_EMPTY)) ==
		    (SEGUSE_DIRTY | SEGUSE_EMPTY) &&
		    (fs->lfs_suflags[1][i] &
		     (SEGUSE_ACTIVE | SEGUSE_DIRTY | SEGUSE_EMPTY)) ==
		    (SEGUSE_DIRTY | SEGUSE_EMPTY)) {

			/* Make sure the sb is written before we clean */
			mutex_enter(&lfs_lock);
			s = splbio();
			while (waited == 0 && fs->lfs_sbactive)
				mtsleep(&fs->lfs_sbactive, PRIBIO+1, "lfs asb",
					0, &lfs_lock);
			splx(s);
			mutex_exit(&lfs_lock);
			waited = 1;

			if ((error = lfs_do_segclean(fs, i)) != 0) {
				DLOG((DLOG_CLEAN, "lfs_auto_segclean: lfs_do_segclean returned %d for seg %d\n", error, i));
			}
		}
		fs->lfs_suflags[1 - fs->lfs_activesb][i] =
			fs->lfs_suflags[fs->lfs_activesb][i];
	}
}
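
/*
 * A segment is reclaimed above only when both superblocks' flag tables
 * agree that it is dirty and empty, i.e. its emptiness has survived two
 * checkpoints; cleaning it any earlier could discard blocks still
 * needed to roll forward from the older checkpoint.
 */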
/*
 * Single thread the segment writer.
 */
void
lfs_segunlock(struct lfs *fs)
{
	struct segment *sp;
	unsigned long sync, ckp;
	struct buf *bp;
	int do_unmark_dirop = 0;

	sp = fs->lfs_sp;

	mutex_enter(&lfs_lock);
	KASSERT(LFS_SEGLOCK_HELD(fs));
	if (fs->lfs_seglock == 1) {
		if ((sp->seg_flags & (SEGM_PROT | SEGM_CLEAN)) == 0)
			do_unmark_dirop = 1;
		mutex_exit(&lfs_lock);
		sync = sp->seg_flags & SEGM_SYNC;
		ckp = sp->seg_flags & SEGM_CKP;

		/* We should have a segment summary, and nothing else */
		KASSERT(sp->cbpp == sp->bpp + 1);

		/* Free allocated segment summary */
		lfs_sb_suboffset(fs, lfs_btofsb(fs, lfs_sb_getsumsize(fs)));
		bp = *sp->bpp;
		lfs_freebuf(fs, bp);

		pool_put(&fs->lfs_bpppool, sp->bpp);
		sp->bpp = NULL;

		/*
		 * If we're not sync, we're done with sp, get rid of it.
		 * Otherwise, we keep a local copy around but free
		 * fs->lfs_sp so another process can use it (we have to
		 * wait but they don't have to wait for us).
		 */
		if (!sync)
			pool_put(&fs->lfs_segpool, sp);
		fs->lfs_sp = NULL;

		/*
		 * If the I/O count is non-zero, sleep until it reaches zero.
		 * At the moment, the user's process hangs around so we can
		 * sleep.
		 */
		mutex_enter(&lfs_lock);
		if (--fs->lfs_iocount == 0) {
			LFS_DEBUG_COUNTLOCKED("lfs_segunlock");
		}
		if (fs->lfs_iocount <= 1)
			wakeup(&fs->lfs_iocount);
		mutex_exit(&lfs_lock);
		/*
		 * If we're not checkpointing, we don't have to block
		 * other processes to wait for a synchronous write
		 * to complete.
		 */
		if (!ckp) {
#ifdef DEBUG
			LFS_ENTER_LOG("segunlock_std", __FILE__, __LINE__, 0, 0, curproc->p_pid);
#endif
			mutex_enter(&lfs_lock);
			--fs->lfs_seglock;
			fs->lfs_lockpid = 0;
			fs->lfs_locklwp = 0;
			mutex_exit(&lfs_lock);
			wakeup(&fs->lfs_seglock);
		}
		/*
		 * We let checkpoints happen asynchronously.  That means
		 * that during recovery, we have to roll forward between
		 * the two segments described by the first and second
		 * superblocks to make sure that the checkpoint described
		 * by a superblock completed.
		 */
		mutex_enter(&lfs_lock);
		while (ckp && sync && fs->lfs_iocount) {
			(void)mtsleep(&fs->lfs_iocount, PRIBIO + 1,
				      "lfs_iocount", 0, &lfs_lock);
			DLOG((DLOG_SEG, "sleeping on iocount %x == %d\n", fs, fs->lfs_iocount));
		}
		while (sync && sp->seg_iocount) {
			(void)mtsleep(&sp->seg_iocount, PRIBIO + 1,
				     "seg_iocount", 0, &lfs_lock);
			DLOG((DLOG_SEG, "sleeping on iocount %x == %d\n", sp, sp->seg_iocount));
		}
		mutex_exit(&lfs_lock);
		if (sync)
			pool_put(&fs->lfs_segpool, sp);

		if (ckp) {
			fs->lfs_nactive = 0;
			/* If we *know* everything's on disk, write both sbs */
			/* XXX should wait for this one */
			if (sync)
				lfs_writesuper(fs, lfs_sb_getsboff(fs, fs->lfs_activesb));
			lfs_writesuper(fs, lfs_sb_getsboff(fs, 1 - fs->lfs_activesb));
			if (!(fs->lfs_ivnode->v_mount->mnt_iflag & IMNT_UNMOUNT)) {
				lfs_auto_segclean(fs);
				/* If sync, we can clean the remainder too */
				if (sync)
					lfs_auto_segclean(fs);
			}
			fs->lfs_activesb = 1 - fs->lfs_activesb;
#ifdef DEBUG
			LFS_ENTER_LOG("segunlock_ckp", __FILE__, __LINE__, 0, 0, curproc->p_pid);
#endif
			mutex_enter(&lfs_lock);
			--fs->lfs_seglock;
			fs->lfs_lockpid = 0;
			fs->lfs_locklwp = 0;
			mutex_exit(&lfs_lock);
			wakeup(&fs->lfs_seglock);
		}
		/* Reenable fragment size changes */
		rw_exit(&fs->lfs_fraglock);
		if (do_unmark_dirop)
			lfs_unmark_dirop(fs);
	} else if (fs->lfs_seglock == 0) {
		mutex_exit(&lfs_lock);
		panic ("Seglock not held");
	} else {
		--fs->lfs_seglock;
		mutex_exit(&lfs_lock);
	}
}
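
/*
 * Note the checkpoint ordering above: both superblocks are rewritten
 * before lfs_activesb is flipped and the newly empty segments are
 * reused, so recovery can always find at least one superblock that
 * describes a completed checkpoint.
 */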
/*
 * Drain dirops and start writer.
 *
 * No simple_locks are held when we enter and none are held when we return.
 */
int
lfs_writer_enter(struct lfs *fs, const char *wmesg)
{
	int error = 0;

	ASSERT_MAYBE_SEGLOCK(fs);
	mutex_enter(&lfs_lock);

	/* disallow dirops during flush */
	fs->lfs_writer++;

	while (fs->lfs_dirops > 0) {
		++fs->lfs_diropwait;
		error = mtsleep(&fs->lfs_writer, PRIBIO+1, wmesg, 0,
				&lfs_lock);
		--fs->lfs_diropwait;
	}

	if (error)
		fs->lfs_writer--;

	mutex_exit(&lfs_lock);

	return error;
}

void
lfs_writer_leave(struct lfs *fs)
{
	bool dowakeup;

	ASSERT_MAYBE_SEGLOCK(fs);
	mutex_enter(&lfs_lock);
	dowakeup = !(--fs->lfs_writer);
	mutex_exit(&lfs_lock);
	if (dowakeup)
		wakeup(&fs->lfs_dirops);
}
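
/*
 * Callers bracket a flush with this pair, e.g. (illustrative only):
 *
 *	lfs_writer_enter(fs, "flushdirop");
 *	... flush, with new dirops held off ...
 *	lfs_writer_leave(fs);
 *
 * While fs->lfs_writer is nonzero, new directory operations wait; the
 * last leaver wakes anything sleeping on fs->lfs_dirops.
 */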
/*
 * Unlock, wait for the cleaner, then relock to where we were before.
 * To be used only at a fairly high level, to address a paucity of free
 * segments propagated back from lfs_gop_write().
 */
void
lfs_segunlock_relock(struct lfs *fs)
{
	int n = fs->lfs_seglock;
	u_int16_t seg_flags;
	CLEANERINFO *cip;
	struct buf *bp;

	if (n == 0)
		return;

	/* Write anything we've already gathered to disk */
	lfs_writeseg(fs, fs->lfs_sp);

	/* Tell cleaner */
	LFS_CLEANERINFO(cip, fs, bp);
	lfs_ci_setflags(fs, cip,
			lfs_ci_getflags(fs, cip) | LFS_CLEANER_MUST_CLEAN);
	LFS_SYNC_CLEANERINFO(cip, fs, bp, 1);

	/* Save segment flags for later */
	seg_flags = fs->lfs_sp->seg_flags;

	fs->lfs_sp->seg_flags |= SEGM_PROT; /* Don't unmark dirop nodes */
	while (fs->lfs_seglock)
		lfs_segunlock(fs);

	/* Wait for the cleaner */
	lfs_wakeup_cleaner(fs);
	mutex_enter(&lfs_lock);
	while (LFS_STARVED_FOR_SEGS(fs))
		mtsleep(&fs->lfs_availsleep, PRIBIO, "relock", 0,
			&lfs_lock);
	mutex_exit(&lfs_lock);

	/* Put the segment lock back the way it was. */
	while (n--)
		lfs_seglock(fs, seg_flags);

	/* Cleaner can relax now */
	LFS_CLEANERINFO(cip, fs, bp);
	lfs_ci_setflags(fs, cip,
			lfs_ci_getflags(fs, cip) & ~LFS_CLEANER_MUST_CLEAN);
	LFS_SYNC_CLEANERINFO(cip, fs, bp, 1);

	return;
}
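
/*
 * Because the segment lock counts recursively, the saved depth n is
 * restored above by simply taking the lock n times with the original
 * seg_flags.
 */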
/*
 * Wake up the cleaner, provided that nowrap is not set.
 */
void
lfs_wakeup_cleaner(struct lfs *fs)
{
	if (fs->lfs_nowrap > 0)
		return;

	wakeup(&fs->lfs_nextsegsleep);
	wakeup(&lfs_allclean_wakeup);
}