1 /* $NetBSD: lfs_syscalls.c,v 1.134 2009/01/11 02:45:56 christos Exp $ */
4 * Copyright (c) 1999, 2000, 2001, 2002, 2003, 2007, 2007, 2008
5 * The NetBSD Foundation, Inc.
8 * This code is derived from software contributed to The NetBSD Foundation
9 * by Konrad E. Schroder <perseant@hhhh.org>.
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 * POSSIBILITY OF SUCH DAMAGE.
33 * Copyright (c) 1991, 1993, 1994
34 * The Regents of the University of California. All rights reserved.
36 * Redistribution and use in source and binary forms, with or without
37 * modification, are permitted provided that the following conditions
39 * 1. Redistributions of source code must retain the above copyright
40 * notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright
42 * notice, this list of conditions and the following disclaimer in the
43 * documentation and/or other materials provided with the distribution.
44 * 3. Neither the name of the University nor the names of its contributors
45 * may be used to endorse or promote products derived from this software
46 * without specific prior written permission.
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
60 * @(#)lfs_syscalls.c 8.10 (Berkeley) 5/14/95
63 #include <sys/cdefs.h>
64 __KERNEL_RCSID(0, "$NetBSD: lfs_syscalls.c,v 1.134 2009/01/11 02:45:56 christos Exp $");
67 # define LFS /* for prototypes in syscallargs.h */
70 #include <sys/param.h>
71 #include <sys/systm.h>
74 #include <sys/mount.h>
75 #include <sys/vnode.h>
76 #include <sys/kernel.h>
77 #include <sys/kauth.h>
78 #include <sys/syscallargs.h>
80 #include <ufs/ufs/inode.h>
81 #include <ufs/ufs/ufsmount.h>
82 #include <ufs/ufs/ufs_extern.h>
84 #include <ufs/lfs/lfs.h>
85 #include <ufs/lfs/lfs_extern.h>
/*
 * Forward declarations for two helpers that are defined further down in this
 * file, followed by the pid recorded for the cleaner process (lfs_bmapv()
 * stores p->p_pid into lfs_cleaner_pid).
 */
87 struct buf
*lfs_fakebuf(struct lfs
*, struct vnode
*, int, size_t, void *);
88 int lfs_fasthashget(dev_t
, ino_t
, struct vnode
**);
90 pid_t lfs_cleaner_pid
= 0;
95 * This will mark inodes and blocks dirty, so they are written into the log.
96 * It will block until all the blocks have been written. The segment create
97 * time passed in the block_info and inode_info structures is used to decide
98 * if the data is valid for each block (in case some process dirtied a block
99 * or inode that is being cleaned between the determination that a block is
100 * live and the lfs_markv call).
103 * -1/errno is returned on error.
105 #ifdef USE_64BIT_SYSCALLS
/*
 * sys_lfs_markv, USE_64BIT_SYSCALLS variant: system call entry point that
 * passes a user-supplied BLOCK_INFO array to lfs_markv().
 *
 * Visible steps: kauth_authorize_generic() check, copyin of the fsid, mount
 * lookup with vfs_getvfs(), a bound check of blkcnt (cast to u_int) against
 * LFS_MARKV_MAXBLKCNT, then with the kernel lock held: allocate the array
 * with lfs_malloc(), copy it in, call lfs_markv(), copy the array back out
 * when lfs_markv() returned 0, and release the array with lfs_free().
 *
 * NOTE(review): this listing is missing lines (return type, braces, local
 * declarations, the statements taken on each error branch), so only the
 * visible statements are described.
 */
107 sys_lfs_markv(struct lwp
*l
, const struct sys_lfs_markv_args
*uap
, register_t
*retval
)
110 syscallarg(fsid_t *) fsidp;
111 syscallarg(struct block_info *) blkiov;
112 syscallarg(int) blkcnt;
120 if ((error
= kauth_authorize_generic(l
->l_cred
, KAUTH_GENERIC_ISSUSER
,
124 if ((error
= copyin(SCARG(uap
, fsidp
), &fsid
, sizeof(fsid_t
))) != 0)
/*
 * NOTE(review): "fsidp" is not a local that is visible in this listing; the
 * BLOCK_INFO_15 variant of this function passes &fsid (the kernel copy made
 * by the copyin above) to vfs_getvfs() -- confirm which is intended.
 */
127 if ((mntp
= vfs_getvfs(fsidp
)) == NULL
)
129 fs
= VFSTOUFS(mntp
)->um_lfs
;
131 blkcnt
= SCARG(uap
, blkcnt
);
132 if ((u_int
) blkcnt
> LFS_MARKV_MAXBLKCNT
)
135 KERNEL_LOCK(1, NULL
);
136 blkiov
= lfs_malloc(fs
, blkcnt
* sizeof(BLOCK_INFO
), LFS_NB_BLKIOV
);
137 if ((error
= copyin(SCARG(uap
, blkiov
), blkiov
,
138 blkcnt
* sizeof(BLOCK_INFO
))) != 0)
/*
 * NOTE(review): "p" is not declared in the visible lines; the BLOCK_INFO_15
 * variant passes l->l_proc as the first argument -- confirm.
 */
141 if ((error
= lfs_markv(p
, &fsid
, blkiov
, blkcnt
)) == 0)
/* NOTE(review): the result of this copyout() is not checked. */
142 copyout(blkiov
, SCARG(uap
, blkiov
),
143 blkcnt
* sizeof(BLOCK_INFO
));
145 lfs_free(fs
, blkiov
, LFS_NB_BLKIOV
);
146 KERNEL_UNLOCK_ONE(NULL
);
/*
 * sys_lfs_markv, BLOCK_INFO_15 variant (presumably the #else branch of the
 * USE_64BIT_SYSCALLS conditional; the #else line itself is not visible).
 *
 * Visible steps: kauth_authorize_generic() check, copyin of the fsid, mount
 * lookup with vfs_getvfs(&fsid), a bound check of blkcnt (cast to u_int)
 * against LFS_MARKV_MAXBLKCNT.  With the kernel lock held, a BLOCK_INFO
 * array and a BLOCK_INFO_15 array of blkcnt entries each are allocated with
 * lfs_malloc(); the user array is copied into the BLOCK_INFO_15 array and
 * converted to BLOCK_INFO one field at a time.  lfs_markv() is then called,
 * and when it returns 0 the entries are converted back and copied out.
 * Both arrays are released with lfs_free() before the kernel lock is
 * dropped.
 *
 * NOTE(review): lines are missing from this listing (return type, braces,
 * some declarations, the statements taken on each error branch).
 */
151 sys_lfs_markv(struct lwp
*l
, const struct sys_lfs_markv_args
*uap
, register_t
*retval
)
154 syscallarg(fsid_t *) fsidp;
155 syscallarg(struct block_info *) blkiov;
156 syscallarg(int) blkcnt;
159 BLOCK_INFO_15
*blkiov15
;
160 int i
, blkcnt
, error
;
165 if ((error
= kauth_authorize_generic(l
->l_cred
, KAUTH_GENERIC_ISSUSER
,
169 if ((error
= copyin(SCARG(uap
, fsidp
), &fsid
, sizeof(fsid_t
))) != 0)
172 if ((mntp
= vfs_getvfs(&fsid
)) == NULL
)
174 fs
= VFSTOUFS(mntp
)->um_lfs
;
176 blkcnt
= SCARG(uap
, blkcnt
);
177 if ((u_int
) blkcnt
> LFS_MARKV_MAXBLKCNT
)
180 KERNEL_LOCK(1, NULL
);
181 blkiov
= lfs_malloc(fs
, blkcnt
* sizeof(BLOCK_INFO
), LFS_NB_BLKIOV
);
182 blkiov15
= lfs_malloc(fs
, blkcnt
* sizeof(BLOCK_INFO_15
), LFS_NB_BLKIOV
);
183 if ((error
= copyin(SCARG(uap
, blkiov
), blkiov15
,
184 blkcnt
* sizeof(BLOCK_INFO_15
))) != 0)
/* Convert each user-format entry to the BLOCK_INFO layout lfs_markv() takes. */
187 for (i
= 0; i
< blkcnt
; i
++) {
188 blkiov
[i
].bi_inode
= blkiov15
[i
].bi_inode
;
189 blkiov
[i
].bi_lbn
= blkiov15
[i
].bi_lbn
;
190 blkiov
[i
].bi_daddr
= blkiov15
[i
].bi_daddr
;
191 blkiov
[i
].bi_segcreate
= blkiov15
[i
].bi_segcreate
;
192 blkiov
[i
].bi_version
= blkiov15
[i
].bi_version
;
193 blkiov
[i
].bi_bp
= blkiov15
[i
].bi_bp
;
194 blkiov
[i
].bi_size
= blkiov15
[i
].bi_size
;
197 if ((error
= lfs_markv(l
->l_proc
, &fsid
, blkiov
, blkcnt
)) == 0) {
/* Success: convert the entries back to BLOCK_INFO_15 for the caller. */
198 for (i
= 0; i
< blkcnt
; i
++) {
199 blkiov15
[i
].bi_inode
= blkiov
[i
].bi_inode
;
200 blkiov15
[i
].bi_lbn
= blkiov
[i
].bi_lbn
;
201 blkiov15
[i
].bi_daddr
= blkiov
[i
].bi_daddr
;
202 blkiov15
[i
].bi_segcreate
= blkiov
[i
].bi_segcreate
;
203 blkiov15
[i
].bi_version
= blkiov
[i
].bi_version
;
204 blkiov15
[i
].bi_bp
= blkiov
[i
].bi_bp
;
205 blkiov15
[i
].bi_size
= blkiov
[i
].bi_size
;
/* NOTE(review): the result of this copyout() is not checked. */
207 copyout(blkiov15
, SCARG(uap
, blkiov
),
208 blkcnt
* sizeof(BLOCK_INFO_15
));
211 lfs_free(fs
, blkiov
, LFS_NB_BLKIOV
);
212 lfs_free(fs
, blkiov15
, LFS_NB_BLKIOV
);
213 KERNEL_UNLOCK_ONE(NULL
);
/*
 * Threshold used by lfs_markv(): once its running count of written blocks
 * (plus the blocks needed for the written inodes) exceeds this value, it
 * calls lfs_segwrite() and resets the counters.
 */
218 #define LFS_MARKV_MAX_BLOCKS (LFS_MAX_BUFS)
/*
 * lfs_markv: worker called by the sys_lfs_markv entry points with a kernel
 * copy of the caller's BLOCK_INFO array.
 *
 * Visible flow:
 *  - Look up the mount from fsidp and compute maxino from the block count
 *    of fs->lfs_ivnode, fs->lfs_cleansz, fs->lfs_segtabsz and fs->lfs_ifpb.
 *  - Mark the mount busy with vfs_busy() and take the segment lock with
 *    SEGM_CLEAN | SEGM_CKP | SEGM_SYNC.
 *  - Walk the array.  Entries whose bi_inode is <= 0 or >= maxino are caught
 *    by the bounds check.  When bi_inode differs from the previous entry,
 *    the inode address is taken from fs->lfs_idaddr (for LFS_IFILE_INUM) or
 *    from the Ifile entry's if_daddr, and the vnode is obtained through
 *    lfs_fastvget(); an error other than EAGAIN or ENOENT panics.
 *  - A block is kept only if VOP_BMAP() succeeds and reports the same disk
 *    address as bi_daddr, and if the expected block size (obsize) equals
 *    bi_size.
 *  - For a non-Ifile inode with bi_lbn >= 0 the data is wrapped by
 *    lfs_fakebuf(); otherwise the buffer comes from getblk() and the user
 *    data is copied in when the buffer has neither BO_DONE nor BO_DELWRI
 *    set.  The buffer is then written with lfs_bwrite_ext(bp, BW_CLEAN).
 *  - When nblkwritten plus the block count for ninowritten inodes exceeds
 *    LFS_MARKV_MAX_BLOCKS, lfs_segwrite(mntp, SEGM_CLEAN) is called and both
 *    counters are reset.
 *  - The final write is lfs_segwrite() with SEGM_CLEAN | SEGM_CKP |
 *    SEGM_SYNC, followed by vfs_unbusy().
 *
 * NOTE(review): many lines are missing from this listing (return type,
 * braces, several declarations, labels, return/continue statements), so the
 * exact control flow between the visible statements cannot be confirmed.
 *
 * NOTE(review): KERNEL_UNLOCK_ONE(NULL) is called on a late path in this
 * function (after the "lfs_markv err2" log), but no KERNEL_LOCK call is
 * visible here, and the syscall wrappers call KERNEL_UNLOCK_ONE themselves
 * after lfs_markv() returns -- confirm the lock is not released twice.
 */
221 lfs_markv(struct proc
*p
, fsid_t
*fsidp
, BLOCK_INFO
*blkiov
,
227 struct inode
*ip
= NULL
;
230 struct vnode
*vp
= NULL
;
232 daddr_t b_daddr
, v_daddr
;
239 /* number of blocks/inodes that we have already bwrite'ed */
240 int nblkwritten
, ninowritten
;
242 if ((mntp
= vfs_getvfs(fsidp
)) == NULL
)
245 fs
= VFSTOUFS(mntp
)->um_lfs
;
250 maxino
= (fragstoblks(fs
, fsbtofrags(fs
, VTOI(fs
->lfs_ivnode
)->i_ffs1_blocks
)) -
251 fs
->lfs_cleansz
- fs
->lfs_segtabsz
) * fs
->lfs_ifpb
;
255 if ((error
= vfs_busy(mntp
, NULL
)) != 0)
259 * This seglock is just to prevent the fact that we might have to sleep
260 * from allowing the possibility that our blocks might become
263 * It is also important to note here that unless we specify SEGM_CKP,
264 * any Ifile blocks that we might be asked to clean will never get
267 lfs_seglock(fs
, SEGM_CLEAN
| SEGM_CKP
| SEGM_SYNC
);
269 /* Mark blocks/inodes dirty. */
272 /* these were inside the initialization for the for loop */
273 v_daddr
= LFS_UNUSED_DADDR
;
274 lastino
= LFS_UNUSED_INUM
;
275 nblkwritten
= ninowritten
= 0;
276 for (blkp
= blkiov
; cnt
--; ++blkp
)
278 /* Bounds-check incoming data, avoid panic for failed VGET */
279 if (blkp
->bi_inode
<= 0 || blkp
->bi_inode
>= maxino
) {
284 * Get the IFILE entry (only once) and see if the file still
287 if (lastino
!= blkp
->bi_inode
) {
289 * Finish the old file, if there was one. The presence
290 * of a usable vnode in vp is signaled by a valid v_daddr.
292 if (v_daddr
!= LFS_UNUSED_DADDR
) {
300 lastino
= blkp
->bi_inode
;
301 if (blkp
->bi_inode
== LFS_IFILE_INUM
)
302 v_daddr
= fs
->lfs_idaddr
;
304 LFS_IENTRY(ifp
, fs
, blkp
->bi_inode
, bp
);
305 /* XXX fix for force write */
306 v_daddr
= ifp
->if_daddr
;
309 if (v_daddr
== LFS_UNUSED_DADDR
)
312 /* Get the vnode/inode. */
313 error
= lfs_fastvget(mntp
, blkp
->bi_inode
, v_daddr
,
315 (blkp
->bi_lbn
== LFS_UNUSED_LBN
323 DLOG((DLOG_CLEAN
, "lfs_markv: lfs_fastvget"
324 " failed with %d (ino %d, segment %d)\n",
325 error
, blkp
->bi_inode
,
326 dtosn(fs
, blkp
->bi_daddr
)));
328 * If we got EAGAIN, that means that the
329 * Inode was locked. This is
330 * recoverable: just clean the rest of
331 * this segment, and let the cleaner try
332 * again with another. (When the
333 * cleaner runs again, this segment will
334 * sort high on the list, since it is
335 * now almost entirely empty.) But, we
336 * still set v_daddr = LFS_UNUSED_ADDR
337 * so as not to test this over and over
340 if (error
== EAGAIN
) {
345 else if (error
!= ENOENT
)
346 panic("lfs_markv VFS_VGET FAILED");
348 /* lastino = LFS_UNUSED_INUM; */
349 v_daddr
= LFS_UNUSED_DADDR
;
356 } else if (v_daddr
== LFS_UNUSED_DADDR
) {
358 * This can only happen if the vnode is dead (or
359 * in any case we can't get it...e.g., it is
360 * inlocked). Keep going.
365 /* Past this point we are guaranteed that vp, ip are valid. */
367 /* Can't clean VU_DIROP directories in case of truncation */
368 /* XXX - maybe we should mark removed dirs specially? */
369 if (vp
->v_type
== VDIR
&& (vp
->v_uflag
& VU_DIROP
)) {
374 /* If this BLOCK_INFO didn't contain a block, keep going. */
375 if (blkp
->bi_lbn
== LFS_UNUSED_LBN
) {
376 /* XXX need to make sure that the inode gets written in this case */
377 /* XXX but only write the inode if it's the right one */
378 if (blkp
->bi_inode
!= LFS_IFILE_INUM
) {
379 LFS_IENTRY(ifp
, fs
, blkp
->bi_inode
, bp
);
380 if (ifp
->if_daddr
== blkp
->bi_daddr
) {
381 mutex_enter(&lfs_lock
);
382 LFS_SET_UINO(ip
, IN_CLEANING
);
383 mutex_exit(&lfs_lock
);
391 if (VOP_BMAP(vp
, blkp
->bi_lbn
, NULL
, &b_daddr
, NULL
) ||
392 dbtofsb(fs
, b_daddr
) != blkp
->bi_daddr
)
394 if (dtosn(fs
, dbtofsb(fs
, b_daddr
)) ==
395 dtosn(fs
, blkp
->bi_daddr
))
397 DLOG((DLOG_CLEAN
, "lfs_markv: wrong da same seg: %llx vs %llx\n",
398 (long long)blkp
->bi_daddr
, (long long)dbtofsb(fs
, b_daddr
)));
405 * Check block sizes. The blocks being cleaned come from
406 * disk, so they should have the same size as their on-disk
409 if (blkp
->bi_lbn
>= 0)
410 obsize
= blksize(fs
, ip
, blkp
->bi_lbn
);
412 obsize
= fs
->lfs_bsize
;
413 /* Check for fragment size change */
414 if (blkp
->bi_lbn
>= 0 && blkp
->bi_lbn
< NDADDR
) {
415 obsize
= ip
->i_lfs_fragsize
[blkp
->bi_lbn
];
417 if (obsize
!= blkp
->bi_size
) {
418 DLOG((DLOG_CLEAN
, "lfs_markv: ino %d lbn %lld wrong"
419 " size (%ld != %d), try again\n",
420 blkp
->bi_inode
, (long long)blkp
->bi_lbn
,
421 (long) obsize
, blkp
->bi_size
));
427 * If we get to here, then we are keeping the block. If
428 * it is an indirect block, we want to actually put it
429 * in the buffer cache so that it can be updated in the
430 * finish_meta section. If it's not, we need to
431 * allocate a fake buffer so that writeseg can perform
432 * the copyin and write the buffer.
434 if (ip
->i_number
!= LFS_IFILE_INUM
&& blkp
->bi_lbn
>= 0) {
436 bp
= lfs_fakebuf(fs
, vp
, blkp
->bi_lbn
,
437 blkp
->bi_size
, blkp
->bi_bp
);
438 /* Pretend we used bread() to get it */
439 bp
->b_blkno
= fsbtodb(fs
, blkp
->bi_daddr
);
441 /* Indirect block or ifile */
442 if (blkp
->bi_size
!= fs
->lfs_bsize
&&
443 ip
->i_number
!= LFS_IFILE_INUM
)
444 panic("lfs_markv: partial indirect block?"
445 " size=%d\n", blkp
->bi_size
);
446 bp
= getblk(vp
, blkp
->bi_lbn
, blkp
->bi_size
, 0, 0);
447 if (!(bp
->b_oflags
& (BO_DONE
|BO_DELWRI
))) {
449 * The block in question was not found
450 * in the cache; i.e., the block that
451 * getblk() returned is empty. So, we
452 * can (and should) copy in the
453 * contents, because we've already
454 * determined that this was the right
455 * version of this block on disk.
457 * And, it can't have changed underneath
458 * us, because we have the segment lock.
460 error
= copyin(blkp
->bi_bp
, bp
->b_data
, blkp
->bi_size
);
465 if ((error
= lfs_bwrite_ext(bp
, BW_CLEAN
)) != 0)
470 * XXX should account indirect blocks and ifile pages as well
472 if (nblkwritten
+ lblkno(fs
, ninowritten
* sizeof (struct ufs1_dinode
))
473 > LFS_MARKV_MAX_BLOCKS
) {
474 DLOG((DLOG_CLEAN
, "lfs_markv: writing %d blks %d inos\n",
475 nblkwritten
, ninowritten
));
476 lfs_segwrite(mntp
, SEGM_CLEAN
);
477 nblkwritten
= ninowritten
= 0;
482 * Finish the old file, if there was one
484 if (v_daddr
!= LFS_UNUSED_DADDR
) {
491 panic("lfs_markv: numrefed=%d", numrefed
);
493 DLOG((DLOG_CLEAN
, "lfs_markv: writing %d blks %d inos (check point)\n",
494 nblkwritten
, ninowritten
));
497 * The last write has to be SEGM_SYNC, because of calling semantics.
498 * It also has to be SEGM_CKP, because otherwise we could write
499 * over the newly cleaned data contained in a checkpoint, and then
500 * we'd be unhappy at recovery time.
502 lfs_segwrite(mntp
, SEGM_CLEAN
| SEGM_CKP
| SEGM_SYNC
);
506 vfs_unbusy(mntp
, false, NULL
);
515 DLOG((DLOG_CLEAN
, "lfs_markv err2\n"));
518 * XXX we're here because copyin() failed.
519 * XXX it means that we can't trust the cleanerd. too bad.
520 * XXX how can we recover from this?
524 KERNEL_UNLOCK_ONE(NULL
);
526 * XXX should do segwrite here anyway?
529 if (v_daddr
!= LFS_UNUSED_DADDR
) {
535 vfs_unbusy(mntp
, false, NULL
);
538 panic("lfs_markv: numrefed=%d", numrefed
);
547 * This will fill in the current disk address for arrays of blocks.
550 * -1/errno is returned on error.
552 #ifdef USE_64BIT_SYSCALLS
/*
 * sys_lfs_bmapv, USE_64BIT_SYSCALLS variant: system call entry point that
 * passes a user-supplied BLOCK_INFO array to lfs_bmapv().
 *
 * Visible steps: kauth_authorize_generic() check, copyin of the fsid, mount
 * lookup with vfs_getvfs(&fsid), a bound check of blkcnt against
 * SIZE_T_MAX / sizeof(BLOCK_INFO), then with the kernel lock held: allocate
 * the array with lfs_malloc(), copy it in, call lfs_bmapv(), copy the array
 * back out when lfs_bmapv() returned 0, and release it with lfs_free().
 *
 * NOTE(review): this listing is missing lines (return type, braces, local
 * declarations, the statements taken on each error branch).
 */
554 sys_lfs_bmapv(struct lwp
*l
, const struct sys_lfs_bmapv_args
*uap
, register_t
*retval
)
557 syscallarg(fsid_t *) fsidp;
558 syscallarg(struct block_info *) blkiov;
559 syscallarg(int) blkcnt;
567 if ((error
= kauth_authorize_generic(l
->l_cred
, KAUTH_GENERIC_ISSUSER
,
571 if ((error
= copyin(SCARG(uap
, fsidp
), &fsid
, sizeof(fsid_t
))) != 0)
574 if ((mntp
= vfs_getvfs(&fsid
)) == NULL
)
576 fs
= VFSTOUFS(mntp
)->um_lfs
;
578 blkcnt
= SCARG(uap
, blkcnt
);
/*
 * NOTE(review): blkcnt is cast to u_int here, while the BLOCK_INFO_15
 * variant of this function casts it to size_t for the same comparison --
 * confirm which is intended.
 */
579 if ((u_int
) blkcnt
> SIZE_T_MAX
/ sizeof(BLOCK_INFO
))
581 KERNEL_LOCK(1, NULL
);
582 blkiov
= lfs_malloc(fs
, blkcnt
* sizeof(BLOCK_INFO
), LFS_NB_BLKIOV
);
583 if ((error
= copyin(SCARG(uap
, blkiov
), blkiov
,
584 blkcnt
* sizeof(BLOCK_INFO
))) != 0)
/*
 * NOTE(review): "p" is not declared in the visible lines; the BLOCK_INFO_15
 * variant passes l->l_proc as the first argument -- confirm.
 */
587 if ((error
= lfs_bmapv(p
, &fsid
, blkiov
, blkcnt
)) == 0)
/* NOTE(review): the result of this copyout() is not checked. */
588 copyout(blkiov
, SCARG(uap
, blkiov
),
589 blkcnt
* sizeof(BLOCK_INFO
));
591 lfs_free(fs
, blkiov
, LFS_NB_BLKIOV
);
592 KERNEL_UNLOCK_ONE(NULL
);
/*
 * sys_lfs_bmapv, BLOCK_INFO_15 variant (presumably the #else branch of the
 * USE_64BIT_SYSCALLS conditional; the #else line itself is not visible).
 *
 * Visible steps: kauth_authorize_generic() check, copyin of the fsid, mount
 * lookup with vfs_getvfs(&fsid), a bound check of blkcnt (cast to size_t)
 * against SIZE_T_MAX / sizeof(BLOCK_INFO).  With the kernel lock held, a
 * BLOCK_INFO array and a BLOCK_INFO_15 array of blkcnt entries each are
 * allocated with lfs_malloc(); the user array is copied into the
 * BLOCK_INFO_15 array and converted to BLOCK_INFO one field at a time.
 * lfs_bmapv() is then called, and when it returns 0 the entries are
 * converted back and copied out.  Both arrays are released with lfs_free()
 * before the kernel lock is dropped.
 *
 * NOTE(review): lines are missing from this listing (return type, braces,
 * some declarations, the statements taken on each error branch).
 */
597 sys_lfs_bmapv(struct lwp
*l
, const struct sys_lfs_bmapv_args
*uap
, register_t
*retval
)
600 syscallarg(fsid_t *) fsidp;
601 syscallarg(struct block_info *) blkiov;
602 syscallarg(int) blkcnt;
605 BLOCK_INFO_15
*blkiov15
;
606 int i
, blkcnt
, error
;
611 if ((error
= kauth_authorize_generic(l
->l_cred
, KAUTH_GENERIC_ISSUSER
,
615 if ((error
= copyin(SCARG(uap
, fsidp
), &fsid
, sizeof(fsid_t
))) != 0)
618 if ((mntp
= vfs_getvfs(&fsid
)) == NULL
)
620 fs
= VFSTOUFS(mntp
)->um_lfs
;
622 blkcnt
= SCARG(uap
, blkcnt
);
623 if ((size_t) blkcnt
> SIZE_T_MAX
/ sizeof(BLOCK_INFO
))
625 KERNEL_LOCK(1, NULL
);
626 blkiov
= lfs_malloc(fs
, blkcnt
* sizeof(BLOCK_INFO
), LFS_NB_BLKIOV
);
627 blkiov15
= lfs_malloc(fs
, blkcnt
* sizeof(BLOCK_INFO_15
), LFS_NB_BLKIOV
);
628 if ((error
= copyin(SCARG(uap
, blkiov
), blkiov15
,
629 blkcnt
* sizeof(BLOCK_INFO_15
))) != 0)
/* Convert each user-format entry to the BLOCK_INFO layout lfs_bmapv() takes. */
632 for (i
= 0; i
< blkcnt
; i
++) {
633 blkiov
[i
].bi_inode
= blkiov15
[i
].bi_inode
;
634 blkiov
[i
].bi_lbn
= blkiov15
[i
].bi_lbn
;
635 blkiov
[i
].bi_daddr
= blkiov15
[i
].bi_daddr
;
636 blkiov
[i
].bi_segcreate
= blkiov15
[i
].bi_segcreate
;
637 blkiov
[i
].bi_version
= blkiov15
[i
].bi_version
;
638 blkiov
[i
].bi_bp
= blkiov15
[i
].bi_bp
;
639 blkiov
[i
].bi_size
= blkiov15
[i
].bi_size
;
642 if ((error
= lfs_bmapv(l
->l_proc
, &fsid
, blkiov
, blkcnt
)) == 0) {
/* Success: convert the entries back to BLOCK_INFO_15 for the caller. */
643 for (i
= 0; i
< blkcnt
; i
++) {
644 blkiov15
[i
].bi_inode
= blkiov
[i
].bi_inode
;
645 blkiov15
[i
].bi_lbn
= blkiov
[i
].bi_lbn
;
646 blkiov15
[i
].bi_daddr
= blkiov
[i
].bi_daddr
;
647 blkiov15
[i
].bi_segcreate
= blkiov
[i
].bi_segcreate
;
648 blkiov15
[i
].bi_version
= blkiov
[i
].bi_version
;
649 blkiov15
[i
].bi_bp
= blkiov
[i
].bi_bp
;
650 blkiov15
[i
].bi_size
= blkiov
[i
].bi_size
;
/* NOTE(review): the result of this copyout() is not checked. */
652 copyout(blkiov15
, SCARG(uap
, blkiov
),
653 blkcnt
* sizeof(BLOCK_INFO_15
));
656 lfs_free(fs
, blkiov
, LFS_NB_BLKIOV
);
657 lfs_free(fs
, blkiov15
, LFS_NB_BLKIOV
);
658 KERNEL_UNLOCK_ONE(NULL
);
/*
 * lfs_bmapv: worker called by the sys_lfs_bmapv entry points with a kernel
 * copy of the caller's BLOCK_INFO array; it fills in bi_daddr (and bi_size)
 * for the entries.
 *
 * Visible flow:
 *  - Record the caller's pid in lfs_cleaner_pid, look up the mount from
 *    fsidp and mark it busy with vfs_busy().
 *  - Walk the array.  When bi_inode differs from the previous entry, the
 *    inode address is taken from fs->lfs_idaddr (for LFS_IFILE_INUM) or from
 *    the Ifile entry's if_daddr; if that address is LFS_UNUSED_DADDR the
 *    entry's bi_daddr is set to LFS_UNUSED_DADDR.
 *  - The vnode is first looked up with ufs_ihashlookup() under
 *    ufs_ihash_lock; VFS_VGET() is also called, and there is a check of
 *    IMNT_UNMOUNT in mnt_iflag that sets v_daddr to LFS_UNUSED_DADDR.
 *  - For an entry with bi_lbn == LFS_UNUSED_LBN, bi_daddr is set to the
 *    inode address v_daddr.  Otherwise VOP_BMAP() is called; bi_daddr is
 *    set either to LFS_UNUSED_DADDR or to dbtofsb(fs, bi_daddr), and bi_size
 *    to blksize() for bi_lbn >= 0 or to fs->lfs_bsize.
 *  - vfs_unbusy() is called near the end.
 *
 * NOTE(review): many lines are missing from this listing (return type,
 * braces, several declarations, else/continue/return statements), so the
 * exact control flow between the visible statements cannot be confirmed.
 */
664 lfs_bmapv(struct proc
*p
, fsid_t
*fsidp
, BLOCK_INFO
*blkiov
, int blkcnt
)
669 struct inode
*ip
= NULL
;
672 struct ufsmount
*ump
;
679 lfs_cleaner_pid
= p
->p_pid
;
681 if ((mntp
= vfs_getvfs(fsidp
)) == NULL
)
684 ump
= VFSTOUFS(mntp
);
685 if ((error
= vfs_busy(mntp
, NULL
)) != 0)
690 fs
= VFSTOUFS(mntp
)->um_lfs
;
694 /* these were inside the initialization for the for loop */
695 v_daddr
= LFS_UNUSED_DADDR
;
696 lastino
= LFS_UNUSED_INUM
;
697 for (blkp
= blkiov
; cnt
--; ++blkp
)
700 * Get the IFILE entry (only once) and see if the file still
703 if (lastino
!= blkp
->bi_inode
) {
705 * Finish the old file, if there was one. The presence
706 * of a usable vnode in vp is signaled by a valid
709 if (v_daddr
!= LFS_UNUSED_DADDR
) {
717 lastino
= blkp
->bi_inode
;
718 if (blkp
->bi_inode
== LFS_IFILE_INUM
)
719 v_daddr
= fs
->lfs_idaddr
;
721 LFS_IENTRY(ifp
, fs
, blkp
->bi_inode
, bp
);
722 v_daddr
= ifp
->if_daddr
;
725 if (v_daddr
== LFS_UNUSED_DADDR
) {
726 blkp
->bi_daddr
= LFS_UNUSED_DADDR
;
730 * A regular call to VFS_VGET could deadlock
731 * here. Instead, we try an unlocked access.
733 mutex_enter(&ufs_ihash_lock
);
734 vp
= ufs_ihashlookup(ump
->um_dev
, blkp
->bi_inode
);
735 if (vp
!= NULL
&& !(vp
->v_iflag
& VI_XLOCK
)) {
737 mutex_enter(&vp
->v_interlock
);
738 mutex_exit(&ufs_ihash_lock
);
740 v_daddr
= LFS_UNUSED_DADDR
;
745 mutex_exit(&ufs_ihash_lock
);
747 * Don't VFS_VGET if we're being unmounted,
748 * since we hold vfs_busy().
750 if (mntp
->mnt_iflag
& IMNT_UNMOUNT
) {
751 v_daddr
= LFS_UNUSED_DADDR
;
754 error
= VFS_VGET(mntp
, blkp
->bi_inode
, &vp
);
756 DLOG((DLOG_CLEAN
, "lfs_bmapv: vget ino"
758 blkp
->bi_inode
,error
));
759 v_daddr
= LFS_UNUSED_DADDR
;
762 KASSERT(VOP_ISLOCKED(vp
));
768 } else if (v_daddr
== LFS_UNUSED_DADDR
) {
770 * This can only happen if the vnode is dead.
771 * Keep going. Note that we DO NOT set the
772 * bi_addr to anything -- if we failed to get
773 * the vnode, for example, we want to assume
774 * conservatively that all of its blocks *are*
775 * located in the segment in question.
776 * lfs_markv will throw them out if we are
779 /* blkp->bi_daddr = LFS_UNUSED_DADDR; */
783 /* Past this point we are guaranteed that vp, ip are valid. */
785 if (blkp
->bi_lbn
== LFS_UNUSED_LBN
) {
787 * We just want the inode address, which is
788 * conveniently in v_daddr.
790 blkp
->bi_daddr
= v_daddr
;
795 error
= VOP_BMAP(vp
, blkp
->bi_lbn
, NULL
,
799 blkp
->bi_daddr
= LFS_UNUSED_DADDR
;
802 blkp
->bi_daddr
= dbtofsb(fs
, bi_daddr
);
803 /* Fill in the block size, too */
804 if (blkp
->bi_lbn
>= 0)
805 blkp
->bi_size
= blksize(fs
, ip
, blkp
->bi_lbn
);
807 blkp
->bi_size
= fs
->lfs_bsize
;
812 * Finish the old file, if there was one. The presence
813 * of a usable vnode in vp is signaled by a valid v_daddr.
815 if (v_daddr
!= LFS_UNUSED_DADDR
) {
822 panic("lfs_bmapv: numrefed=%d", numrefed
);
825 vfs_unbusy(mntp
, false, NULL
);
833 * Mark the segment clean.
836 * -1/errno is returned on error.
/*
 * sys_lfs_segclean: system call entry point that marks one segment clean.
 *
 * Visible steps: kauth_authorize_generic() check, copyin of the fsid, mount
 * lookup with vfs_getvfs(&fsid), fetch of the segment number from the
 * "segment" syscall argument, vfs_busy() on the mount, then with the kernel
 * lock held: lfs_seglock(fs, SEGM_PROT) and lfs_do_segclean(fs, segnum).
 * The kernel lock is dropped and vfs_unbusy() is called afterwards.
 *
 * NOTE(review): lines are missing from this listing (return type, braces,
 * some declarations, return statements); in particular the release of the
 * segment lock taken here is not visible -- confirm against the full file.
 */
839 sys_lfs_segclean(struct lwp
*l
, const struct sys_lfs_segclean_args
*uap
, register_t
*retval
)
842 syscallarg(fsid_t *) fsidp;
843 syscallarg(u_long) segment;
849 unsigned long segnum
;
851 if ((error
= kauth_authorize_generic(l
->l_cred
, KAUTH_GENERIC_ISSUSER
,
855 if ((error
= copyin(SCARG(uap
, fsidp
), &fsid
, sizeof(fsid_t
))) != 0)
857 if ((mntp
= vfs_getvfs(&fsid
)) == NULL
)
860 fs
= VFSTOUFS(mntp
)->um_lfs
;
861 segnum
= SCARG(uap
, segment
);
863 if ((error
= vfs_busy(mntp
, NULL
)) != 0)
866 KERNEL_LOCK(1, NULL
);
867 lfs_seglock(fs
, SEGM_PROT
);
868 error
= lfs_do_segclean(fs
, segnum
);
870 KERNEL_UNLOCK_ONE(NULL
);
871 vfs_unbusy(mntp
, false, NULL
);
876 * Actually mark the segment clean.
877 * Must be called with the segment lock held.
/*
 * lfs_do_segclean: do the bookkeeping that marks segment "segnum" clean.
 *
 * Visible flow:
 *  - Special-case the segment that contains fs->lfs_curseg.
 *  - Read the segment usage entry and log a "not cleaning segment" message
 *    when it still has live bytes (su_nbytes != 0), is SEGUSE_ACTIVE, or is
 *    not SEGUSE_DIRTY.
 *  - Otherwise add one segment's worth of blocks to fs->lfs_avail, minus the
 *    superblock padding when SEGUSE_SUPERBLOCK is set and minus the label
 *    padding for segment 0 on lfs_version > 1 when lfs_start lies inside it.
 *  - Under lfs_lock, move the segment's summary and inode blocks from
 *    fs->lfs_dmeta to fs->lfs_bfree.
 *  - Clear SEGUSE_DIRTY and write the segment entry back, refresh the
 *    cleaner info (clean count, bfree, avail), wake sleepers on
 *    &fs->lfs_avail, write the cleaner-info buffer and count the reclaimed
 *    segment in lfs_stats.
 *
 * NOTE(review): lines are missing from this listing (return type, braces,
 * declarations, the return statements of the early-exit branches), so the
 * values returned on each path are not visible.
 */
880 lfs_do_segclean(struct lfs
*fs
, unsigned long segnum
)
882 extern int lfs_dostats
;
887 if (dtosn(fs
, fs
->lfs_curseg
) == segnum
) {
891 LFS_SEGENTRY(sup
, fs
, segnum
, bp
);
892 if (sup
->su_nbytes
) {
893 DLOG((DLOG_CLEAN
, "lfs_segclean: not cleaning segment %lu:"
894 " %d live bytes\n", segnum
, sup
->su_nbytes
));
898 if (sup
->su_flags
& SEGUSE_ACTIVE
) {
899 DLOG((DLOG_CLEAN
, "lfs_segclean: not cleaning segment %lu:"
900 " segment is active\n", segnum
));
904 if (!(sup
->su_flags
& SEGUSE_DIRTY
)) {
905 DLOG((DLOG_CLEAN
, "lfs_segclean: not cleaning segment %lu:"
906 " segment is already clean\n", segnum
));
911 fs
->lfs_avail
+= segtod(fs
, 1);
912 if (sup
->su_flags
& SEGUSE_SUPERBLOCK
)
913 fs
->lfs_avail
-= btofsb(fs
, LFS_SBPAD
);
914 if (fs
->lfs_version
> 1 && segnum
== 0 &&
915 fs
->lfs_start
< btofsb(fs
, LFS_LABELPAD
))
916 fs
->lfs_avail
-= btofsb(fs
, LFS_LABELPAD
) - fs
->lfs_start
;
917 mutex_enter(&lfs_lock
);
918 fs
->lfs_bfree
+= sup
->su_nsums
* btofsb(fs
, fs
->lfs_sumsize
) +
919 btofsb(fs
, sup
->su_ninos
* fs
->lfs_ibsize
);
920 fs
->lfs_dmeta
-= sup
->su_nsums
* btofsb(fs
, fs
->lfs_sumsize
) +
921 btofsb(fs
, sup
->su_ninos
* fs
->lfs_ibsize
);
922 if (fs
->lfs_dmeta
< 0)
924 mutex_exit(&lfs_lock
);
925 sup
->su_flags
&= ~SEGUSE_DIRTY
;
926 LFS_WRITESEGENTRY(sup
, fs
, segnum
, bp
);
928 LFS_CLEANERINFO(cip
, fs
, bp
);
931 fs
->lfs_nclean
= cip
->clean
;
932 cip
->bfree
= fs
->lfs_bfree
;
933 mutex_enter(&lfs_lock
);
934 cip
->avail
= fs
->lfs_avail
- fs
->lfs_ravail
- fs
->lfs_favail
;
935 wakeup(&fs
->lfs_avail
);
936 mutex_exit(&lfs_lock
);
937 (void) LFS_BWRITE_LOG(bp
);
940 ++lfs_stats
.segs_reclaimed
;
946 * This will block until a segment in file system fsid is written. A timeout
947 * in milliseconds may be specified which will awake the cleaner automatically.
948 * An fsid of -1 means any file system, and a timeout of 0 means forever.
/*
 * lfs_segwait: sleep until woken on a segment-related address or until the
 * timeout expires.
 *
 * With the kernel lock held, the sleep address is &lfs_allclean_wakeup when
 * fsidp is NULL or names no mounted file system, and otherwise the
 * lfs_nextseg field of that mount's struct lfs.  The timeout is tv converted
 * with tvtohz(), and the sleep is tsleep() with PCATCH | PVFS.
 *
 * Returns EINTR when tsleep() returned ERESTART and 0 for every other
 * tsleep() result.
 *
 * NOTE(review): lines are missing from this listing (return type, braces,
 * declarations, the "else" between the two addr assignments).
 */
951 lfs_segwait(fsid_t
*fsidp
, struct timeval
*tv
)
958 KERNEL_LOCK(1, NULL
);
959 if (fsidp
== NULL
|| (mntp
= vfs_getvfs(fsidp
)) == NULL
)
960 addr
= &lfs_allclean_wakeup
;
962 addr
= &VFSTOUFS(mntp
)->um_lfs
->lfs_nextseg
;
964 * XXX THIS COULD SLEEP FOREVER IF TIMEOUT IS {0,0}!
965 * XXX IS THAT WHAT IS INTENDED?
967 timeout
= tvtohz(tv
);
968 error
= tsleep(addr
, PCATCH
| PVFS
, "segment", timeout
);
969 KERNEL_UNLOCK_ONE(NULL
);
970 return (error
== ERESTART
? EINTR
: 0);
976 * System call wrapper around lfs_segwait().
980 * -1/errno is returned on error.
/*
 * sys___lfs_segwait50: system call entry point for lfs_segwait().
 *
 * Visible steps: kauth_authorize_generic() check, copyin of the fsid, then
 * either copyin of the user's struct timeval followed by an itimerfix()
 * check (when the tv argument is non-NULL) or a zeroed timeval.  The kernel
 * copies are passed to lfs_segwait() and its result is returned.
 *
 * NOTE(review): lines are missing from this listing (return type, braces,
 * declarations, the statements taken on each error branch).
 */
983 sys___lfs_segwait50(struct lwp
*l
, const struct sys___lfs_segwait50_args
*uap
,
987 syscallarg(fsid_t *) fsidp;
988 syscallarg(struct timeval *) tv;
994 /* XXX need we be su to segwait? */
995 if ((error
= kauth_authorize_generic(l
->l_cred
, KAUTH_GENERIC_ISSUSER
,
998 if ((error
= copyin(SCARG(uap
, fsidp
), &fsid
, sizeof(fsid_t
))) != 0)
1001 if (SCARG(uap
, tv
)) {
1002 error
= copyin(SCARG(uap
, tv
), &atv
, sizeof(struct timeval
));
1005 if (itimerfix(&atv
))
1007 } else /* NULL or invalid */
1008 atv
.tv_sec
= atv
.tv_usec
= 0;
1009 return lfs_segwait(&fsid
, &atv
);
1013 * VFS_VGET call specialized for the cleaner. The cleaner already knows the
1014 * daddr from the ifile, so don't look it up again. If the cleaner is
1015 * processing IINFO structures, it may have the ondisk inode already, so
1016 * don't go retrieving it again.
1018 * we lfs_vref, and it is the caller's responsibility to lfs_vunref
/*
 * lfs_fasthashget: look up the vnode for (dev, ino) in the inode hash.
 *
 * The lookup is done with ufs_ihashlookup() under ufs_ihash_lock.  When a
 * vnode is found, its v_interlock is taken before the hash lock is
 * released.  A vnode with VI_XLOCK set is counted in
 * lfs_stats.clean_vnlocked and its interlock is released; a second failure
 * path, logged as "lfs_vref failed", is counted in
 * lfs_stats.clean_inlocked.  When nothing is found, only the hash lock is
 * released.
 *
 * NOTE(review): lines are missing from this listing (return type, braces,
 * declarations, the lfs_vref call implied by the log message, assignments
 * to *vpp and all return statements), so the values returned and stored on
 * each path are not visible.
 */
1023 lfs_fasthashget(dev_t dev
, ino_t ino
, struct vnode
**vpp
)
1027 mutex_enter(&ufs_ihash_lock
);
1028 if ((vp
= ufs_ihashlookup(dev
, ino
)) != NULL
) {
1029 mutex_enter(&vp
->v_interlock
);
1030 mutex_exit(&ufs_ihash_lock
);
1031 if (vp
->v_iflag
& VI_XLOCK
) {
1032 DLOG((DLOG_CLEAN
, "lfs_fastvget: ino %d VI_XLOCK\n",
1034 lfs_stats
.clean_vnlocked
++;
1035 mutex_exit(&vp
->v_interlock
);
1039 DLOG((DLOG_CLEAN
, "lfs_fastvget: lfs_vref failed"
1040 " for ino %d\n", ino
));
1041 lfs_stats
.clean_inlocked
++;
1045 mutex_exit(&ufs_ihash_lock
);
/*
 * lfs_fastvget: obtain the vnode for inode "ino" on mount "mp", given the
 * inode's disk address "daddr" and a pointer "dinp" to an inode image that
 * is read with copyin().
 *
 * Visible flow:
 *  - Sleep on &fs->lfs_flags under lfs_lock while LFS_NOTYET is set.
 *  - Try lfs_fasthashget(); there is an exit test on "error != 0 ||
 *    *vpp != NULL".
 *  - Check IMNT_UNMOUNT in mp->mnt_iflag, then allocate a vnode with
 *    getnewvnode().
 *  - Take ufs_hashlock, repeat the lfs_fasthashget() lookup, and when that
 *    does not end the call, set up the inode with lfs_vcreate() before
 *    releasing ufs_hashlock.
 *  - Fill in the on-disk inode either by copyin() from dinp or by bread()
 *    of the inode block followed by lfs_ifind(); a failed lfs_ifind() is
 *    retried after brelse(bp, BC_INVAL) and panics once the retry count
 *    exceeds LFS_IFIND_RETRIES.  Failure paths of copyin() and bread()
 *    release the vnode lock with vlockmgr(..., LK_RELEASE).
 *  - Assert at the end that the vnode is locked.
 *
 * NOTE(review): many lines are missing from this listing (return type,
 * braces, declarations, the condition choosing between copyin() and
 * bread(), and all return statements), so the exact control flow and the
 * returned values cannot be confirmed.
 */
1053 lfs_fastvget(struct mount
*mp
, ino_t ino
, daddr_t daddr
, struct vnode
**vpp
,
1054 struct ufs1_dinode
*dinp
)
1057 struct ufs1_dinode
*dip
;
1059 struct ufsmount
*ump
;
1070 * Wait until the filesystem is fully mounted before allowing vget
1071 * to complete. This prevents possible problems with roll-forward.
1073 mutex_enter(&lfs_lock
);
1074 while (fs
->lfs_flags
& LFS_NOTYET
) {
1075 mtsleep(&fs
->lfs_flags
, PRIBIO
+1, "lfs_fnotyet", 0,
1078 mutex_exit(&lfs_lock
);
1081 * This is playing fast and loose. Someone may have the inode
1082 * locked, in which case they are going to be distinctly unhappy
1083 * if we trash something.
1086 error
= lfs_fasthashget(dev
, ino
, vpp
);
1087 if (error
!= 0 || *vpp
!= NULL
)
1091 * getnewvnode(9) will call vfs_busy, which will block if the
1092 * filesystem is being unmounted; but umount(9) is waiting for
1093 * us because we're already holding the fs busy.
1096 if (mp
->mnt_iflag
& IMNT_UNMOUNT
) {
1100 if ((error
= getnewvnode(VT_LFS
, mp
, lfs_vnodeop_p
, &vp
)) != 0) {
1105 mutex_enter(&ufs_hashlock
);
1106 error
= lfs_fasthashget(dev
, ino
, vpp
);
1107 if (error
!= 0 || *vpp
!= NULL
) {
1108 mutex_exit(&ufs_hashlock
);
1113 /* Allocate new vnode/inode. */
1114 lfs_vcreate(mp
, ino
, vp
);
1117 * Put it onto its hash chain and lock it so that other requests for
1118 * this inode will block if they arrive while we are sleeping waiting
1119 * for old data structures to be purged or for the contents of the
1120 * disk portion of this inode to be read.
1124 mutex_exit(&ufs_hashlock
);
1128 * This may not need to be here, logically it should go down with
1129 * the i_devvp initialization.
1134 /* Read in the disk contents for the inode, copy into the inode. */
1136 error
= copyin(dinp
, ip
->i_din
.ffs1_din
, sizeof (struct ufs1_dinode
));
1138 DLOG((DLOG_CLEAN
, "lfs_fastvget: dinode copyin failed"
1139 " for ino %d\n", ino
));
1142 /* Unlock and discard unneeded inode. */
1143 vlockmgr(&vp
->v_lock
, LK_RELEASE
);
1148 if (ip
->i_number
!= ino
)
1149 panic("lfs_fastvget: I was fed the wrong inode!");
1153 error
= bread(ump
->um_devvp
, fsbtodb(fs
, daddr
), fs
->lfs_ibsize
,
1156 DLOG((DLOG_CLEAN
, "lfs_fastvget: bread failed (%d)\n",
1159 * The inode does not contain anything useful, so it
1160 * would be misleading to leave it on its hash chain.
1161 * Iput() will return it to the free list.
1165 /* Unlock and discard unneeded inode. */
1166 vlockmgr(&vp
->v_lock
, LK_RELEASE
);
1172 dip
= lfs_ifind(ump
->um_lfs
, ino
, bp
);
1174 /* Assume write has not completed yet; try again */
1175 brelse(bp
, BC_INVAL
);
1177 if (retries
> LFS_IFIND_RETRIES
)
1178 panic("lfs_fastvget: dinode not found");
1179 DLOG((DLOG_CLEAN
, "lfs_fastvget: dinode not found,"
1183 *ip
->i_din
.ffs1_din
= *dip
;
1190 KASSERT(VOP_ISLOCKED(vp
));
1197 * Make up a "fake" cleaner buffer, copy the data from userland into it.
1200 lfs_fakebuf(struct lfs
*fs
, struct vnode
*vp
, int lbn
, size_t size
, void *uaddr
)
1205 KASSERT(VTOI(vp
)->i_number
!= LFS_IFILE_INUM
);
1207 bp
= lfs_newbuf(VTOI(vp
)->i_lfs
, vp
, lbn
, size
, LFS_NB_CLEAN
);
1208 error
= copyin(uaddr
, bp
->b_data
, size
);
1210 lfs_freebuf(fs
, bp
);
1213 KDASSERT(bp
->b_iodone
== lfs_callback
);
1216 mutex_enter(&lfs_lock
);
1218 mutex_exit(&lfs_lock
);
1220 bp
->b_bufsize
= size
;
1221 bp
->b_bcount
= size
;