/*	$NetBSD: lfs_syscalls.c,v 1.170 2015/09/01 06:08:37 dholland Exp $	*/

/*-
 * Copyright (c) 1999, 2000, 2001, 2002, 2003, 2007, 2008
 *    The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Konrad E. Schroder <perseant@hhhh.org>.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
/*-
 * Copyright (c) 1991, 1993, 1994
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)lfs_syscalls.c	8.10 (Berkeley) 5/14/95
 */
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: lfs_syscalls.c,v 1.170 2015/09/01 06:08:37 dholland Exp $");

#ifndef LFS
# define LFS		/* for prototypes in syscallargs.h */
#endif

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/kernel.h>
#include <sys/kauth.h>
#include <sys/syscallargs.h>

#include <ufs/lfs/ulfs_inode.h>
#include <ufs/lfs/ulfsmount.h>
#include <ufs/lfs/ulfs_extern.h>

#include <ufs/lfs/lfs.h>
#include <ufs/lfs/lfs_accessors.h>
#include <ufs/lfs/lfs_kernel.h>
#include <ufs/lfs/lfs_extern.h>
static int lfs_fastvget(struct mount *, ino_t, BLOCK_INFO *, int,
    struct vnode **);
static struct buf *lfs_fakebuf(struct lfs *, struct vnode *, daddr_t,
    size_t, void *);
/*
 * sys_lfs_markv:
 *
 * This will mark inodes and blocks dirty, so they are written into the log.
 * It will block until all the blocks have been written.  The segment create
 * time passed in the block_info and inode_info structures is used to decide
 * if the data is valid for each block (in case some process dirtied a block
 * or inode that is being cleaned between the determination that a block is
 * live and the lfs_markv call).
 *
 *  0 on success
 * -1/errno is returned on error.
 */
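/*
 * For orientation: the handler below is reached from the cleaner through
 * the lfs_markv(2) stub.  A cleaner-side call might look roughly like the
 * following sketch (illustrative only; it assumes a BLOCK_INFO array
 * already filled in from segment summaries, and omits the retry logic a
 * real cleaner needs):
 *
 *	struct block_info *bip;		// one entry per live block
 *	int nblocks;			// entries in bip
 *
 *	if (lfs_markv(&fsid, bip, nblocks) < 0)
 *		syslog(LOG_ERR, "lfs_markv: %m");
 */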
#ifdef USE_64BIT_SYSCALLS
int
sys_lfs_markv(struct lwp *l, const struct sys_lfs_markv_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(struct block_info *) blkiov;
		syscallarg(int) blkcnt;
	} */
	BLOCK_INFO *blkiov;
	int blkcnt, error;
	fsid_t fsid;
	struct lfs *fs;
	struct mount *mntp;

	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);

	if ((mntp = vfs_getvfs(&fsid)) == NULL)
		return (ENOENT);
	fs = VFSTOULFS(mntp)->um_lfs;

	blkcnt = SCARG(uap, blkcnt);
	if ((u_int) blkcnt > LFS_MARKV_MAXBLKCNT)
		return (EINVAL);

	KERNEL_LOCK(1, NULL);
	blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV);
	if ((error = copyin(SCARG(uap, blkiov), blkiov,
			    blkcnt * sizeof(BLOCK_INFO))) != 0)
		goto out;

	if ((error = lfs_markv(l, &fsid, blkiov, blkcnt)) == 0)
		copyout(blkiov, SCARG(uap, blkiov),
			blkcnt * sizeof(BLOCK_INFO));
    out:
	lfs_free(fs, blkiov, LFS_NB_BLKIOV);
	KERNEL_UNLOCK_ONE(NULL);
	return error;
}
#else
int
sys_lfs_markv(struct lwp *l, const struct sys_lfs_markv_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(struct block_info *) blkiov;
		syscallarg(int) blkcnt;
	} */
	BLOCK_INFO *blkiov;
	BLOCK_INFO_15 *blkiov15;
	int i, blkcnt, error;
	fsid_t fsid;
	struct lfs *fs;
	struct mount *mntp;

	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);

	if ((mntp = vfs_getvfs(&fsid)) == NULL)
		return (ENOENT);
	fs = VFSTOULFS(mntp)->um_lfs;

	blkcnt = SCARG(uap, blkcnt);
	if ((u_int) blkcnt > LFS_MARKV_MAXBLKCNT)
		return (EINVAL);

	KERNEL_LOCK(1, NULL);
	blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV);
	blkiov15 = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO_15), LFS_NB_BLKIOV);
	if ((error = copyin(SCARG(uap, blkiov), blkiov15,
			    blkcnt * sizeof(BLOCK_INFO_15))) != 0)
		goto out;

	for (i = 0; i < blkcnt; i++) {
		blkiov[i].bi_inode = blkiov15[i].bi_inode;
		blkiov[i].bi_lbn = blkiov15[i].bi_lbn;
		blkiov[i].bi_daddr = blkiov15[i].bi_daddr;
		blkiov[i].bi_segcreate = blkiov15[i].bi_segcreate;
		blkiov[i].bi_version = blkiov15[i].bi_version;
		blkiov[i].bi_bp = blkiov15[i].bi_bp;
		blkiov[i].bi_size = blkiov15[i].bi_size;
	}

	if ((error = lfs_markv(l, &fsid, blkiov, blkcnt)) == 0) {
		for (i = 0; i < blkcnt; i++) {
			blkiov15[i].bi_inode = blkiov[i].bi_inode;
			blkiov15[i].bi_lbn = blkiov[i].bi_lbn;
			blkiov15[i].bi_daddr = blkiov[i].bi_daddr;
			blkiov15[i].bi_segcreate = blkiov[i].bi_segcreate;
			blkiov15[i].bi_version = blkiov[i].bi_version;
			blkiov15[i].bi_bp = blkiov[i].bi_bp;
			blkiov15[i].bi_size = blkiov[i].bi_size;
		}
		copyout(blkiov15, SCARG(uap, blkiov),
			blkcnt * sizeof(BLOCK_INFO_15));
	}
    out:
	lfs_free(fs, blkiov, LFS_NB_BLKIOV);
	lfs_free(fs, blkiov15, LFS_NB_BLKIOV);
	KERNEL_UNLOCK_ONE(NULL);
	return error;
}
#endif
#define	LFS_MARKV_MAX_BLOCKS	(LFS_MAX_BUFS)

int
lfs_markv(struct lwp *l, fsid_t *fsidp, BLOCK_INFO *blkiov,
    int blkcnt)
{
	BLOCK_INFO *blkp;
	IFILE *ifp;
	struct buf *bp;
	struct inode *ip = NULL;
	struct lfs *fs;
	struct mount *mntp;
	struct ulfsmount *ump;
	struct vnode *vp;
	ino_t lastino, maxino;
	daddr_t b_daddr;
	int cnt, error;
	int do_again = 0;
	int numrefed = 0;
	int obsize;

	/* number of blocks/inodes that we have already bwrite'ed */
	int nblkwritten, ninowritten;

	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_LFS,
	    KAUTH_REQ_SYSTEM_LFS_MARKV, NULL, NULL, NULL);
	if (error)
		return (error);

	if ((mntp = vfs_getvfs(fsidp)) == NULL)
		return (ENOENT);

	ump = VFSTOULFS(mntp);
	fs = ump->um_lfs;

	maxino = (lfs_fragstoblks(fs, lfs_dino_getblocks(fs, VTOI(fs->lfs_ivnode)->i_din)) -
		      lfs_sb_getcleansz(fs) - lfs_sb_getsegtabsz(fs)) * lfs_sb_getifpb(fs);
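	/*
	 * The expression above follows from the Ifile layout: the first
	 * lfs_sb_getcleansz() blocks hold cleaner info and the next
	 * lfs_sb_getsegtabsz() blocks hold the segment usage table, so
	 * every remaining Ifile block is an inode-table block holding
	 * lfs_sb_getifpb() inode entries.  Any bi_inode at or beyond
	 * that product can have no IFILE entry and is rejected below.
	 */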
	cnt = blkcnt;

	if ((error = vfs_busy(mntp, NULL)) != 0)
		return (error);

	/*
	 * This seglock is just to prevent the fact that we might have to sleep
	 * from allowing the possibility that our blocks might become
	 * invalid.
	 *
	 * It is also important to note here that unless we specify SEGM_CKP,
	 * any Ifile blocks that we might be asked to clean will never get
	 * to the disk.
	 */
	lfs_seglock(fs, SEGM_CLEAN | SEGM_CKP | SEGM_SYNC);
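	/*
	 * Roughly, the flags mean: SEGM_CLEAN tags this as cleaner
	 * writing so the blocks are accounted to cleaning, SEGM_CKP
	 * makes the write a checkpoint (see the note above about Ifile
	 * blocks), and SEGM_SYNC makes the segment write synchronous.
	 * See lfs_segment.c for the authoritative semantics.
	 */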
	/* Mark blocks/inodes dirty.  */
	error = 0;

	/* these were inside the initialization for the for loop */
	vp = NULL;
	lastino = LFS_UNUSED_INUM;
	nblkwritten = ninowritten = 0;
	for (blkp = blkiov; cnt--; ++blkp)
	{
		/* Bounds-check incoming data, avoid panic for failed VGET */
		if (blkp->bi_inode <= 0 || blkp->bi_inode >= maxino) {
			error = EINVAL;
			goto err3;
		}

		/*
		 * Get the IFILE entry (only once) and see if the file still
		 * exists.
		 */
		if (lastino != blkp->bi_inode) {
			/*
			 * Finish the old file, if there was one.
			 */
			if (vp != NULL) {
				vput(vp);
				vp = NULL;
				numrefed--;
			}

			/*
			 * Start a new file
			 */
			lastino = blkp->bi_inode;

			/* Get the vnode/inode. */
			error = lfs_fastvget(mntp, blkp->bi_inode, blkp,
			    LK_EXCLUSIVE | LK_NOWAIT, &vp);
			if (error) {
				DLOG((DLOG_CLEAN, "lfs_markv: lfs_fastvget"
				      " failed with %d (ino %d, segment %d)\n",
				      error, blkp->bi_inode,
				      lfs_dtosn(fs, blkp->bi_daddr)));
				/*
				 * If we got EAGAIN, that means that the
				 * Inode was locked.  This is
				 * recoverable: just clean the rest of
				 * this segment, and let the cleaner try
				 * again with another.  (When the
				 * cleaner runs again, this segment will
				 * sort high on the list, since it is
				 * now almost entirely empty.)
				 */
				if (error == EAGAIN) {
					error = 0;
					do_again++;
				} else
					KASSERT(error == ENOENT);
				KASSERT(vp == NULL);
				ip = NULL;
				continue;
			}

			ip = VTOI(vp);
			numrefed++;
			ninowritten++;
		} else if (vp == NULL) {
			/*
			 * This can only happen if the vnode is dead (or
			 * in any case we can't get it...e.g., it is
			 * inlocked).  Keep going.
			 */
			continue;
		}

		/* Past this point we are guaranteed that vp, ip are valid. */
		/* Can't clean VU_DIROP directories in case of truncation */
		/* XXX - maybe we should mark removed dirs specially? */
		if (vp->v_type == VDIR && (vp->v_uflag & VU_DIROP)) {
			do_again++;
			continue;
		}

		/* If this BLOCK_INFO didn't contain a block, keep going. */
		if (blkp->bi_lbn == LFS_UNUSED_LBN) {
			/* XXX need to make sure that the inode gets written in this case */
			/* XXX but only write the inode if it's the right one */
			if (blkp->bi_inode != LFS_IFILE_INUM) {
				LFS_IENTRY(ifp, fs, blkp->bi_inode, bp);
				if (lfs_if_getdaddr(fs, ifp) == blkp->bi_daddr) {
					mutex_enter(&lfs_lock);
					LFS_SET_UINO(ip, IN_CLEANING);
					mutex_exit(&lfs_lock);
				}
				brelse(bp, 0);
			}
			continue;
		}
		b_daddr = 0;
		if (VOP_BMAP(vp, blkp->bi_lbn, NULL, &b_daddr, NULL) ||
		    LFS_DBTOFSB(fs, b_daddr) != blkp->bi_daddr) {
			if (lfs_dtosn(fs, LFS_DBTOFSB(fs, b_daddr)) ==
			    lfs_dtosn(fs, blkp->bi_daddr)) {
				DLOG((DLOG_CLEAN, "lfs_markv: wrong da same seg: %jx vs %jx\n",
				      (intmax_t)blkp->bi_daddr, (intmax_t)LFS_DBTOFSB(fs, b_daddr)));
			}
			do_again++;
			continue;
		}
		/*
		 * Check block sizes.  The blocks being cleaned come from
		 * disk, so they should have the same size as their on-disk
		 * counterparts.
		 */
		if (blkp->bi_lbn >= 0)
			obsize = lfs_blksize(fs, ip, blkp->bi_lbn);
		else
			obsize = lfs_sb_getbsize(fs);
		/* Check for fragment size change */
		if (blkp->bi_lbn >= 0 && blkp->bi_lbn < ULFS_NDADDR) {
			obsize = ip->i_lfs_fragsize[blkp->bi_lbn];
		}
		if (obsize != blkp->bi_size) {
			DLOG((DLOG_CLEAN, "lfs_markv: ino %d lbn %jd wrong"
			      " size (%ld != %d), try again\n",
			      blkp->bi_inode, (intmax_t)blkp->bi_lbn,
			      (long) obsize, blkp->bi_size));
			do_again++;
			continue;
		}
		/*
		 * If we get to here, then we are keeping the block.  If
		 * it is an indirect block, we want to actually put it
		 * in the buffer cache so that it can be updated in the
		 * finish_meta section.  If it's not, we need to
		 * allocate a fake buffer so that writeseg can perform
		 * the copyin and write the buffer.
		 */
		if (ip->i_number != LFS_IFILE_INUM && blkp->bi_lbn >= 0) {
			/* Data Block */
			bp = lfs_fakebuf(fs, vp, blkp->bi_lbn,
					 blkp->bi_size, blkp->bi_bp);
			/* Pretend we used bread() to get it */
			bp->b_blkno = LFS_FSBTODB(fs, blkp->bi_daddr);
		} else {
			/* Indirect block or ifile */
			if (blkp->bi_size != lfs_sb_getbsize(fs) &&
			    ip->i_number != LFS_IFILE_INUM)
				panic("lfs_markv: partial indirect block?"
				      " size=%d\n", blkp->bi_size);
			bp = getblk(vp, blkp->bi_lbn, blkp->bi_size, 0, 0);
			if (!(bp->b_oflags & (BO_DONE|BO_DELWRI))) {
				/*
				 * The block in question was not found
				 * in the cache; i.e., the block that
				 * getblk() returned is empty.  So, we
				 * can (and should) copy in the
				 * contents, because we've already
				 * determined that this was the right
				 * version of this block on disk.
				 *
				 * And, it can't have changed underneath
				 * us, because we have the segment lock.
				 */
				error = copyin(blkp->bi_bp, bp->b_data, blkp->bi_size);
				if (error)
					goto err2;
			}
		}
		if ((error = lfs_bwrite_ext(bp, BW_CLEAN)) != 0)
			goto err2;

		nblkwritten++;
		/*
		 * XXX should account indirect blocks and ifile pages as well
		 */
		if (nblkwritten + lfs_lblkno(fs, ninowritten * DINOSIZE(fs))
		    > LFS_MARKV_MAX_BLOCKS) {
			DLOG((DLOG_CLEAN, "lfs_markv: writing %d blks %d inos\n",
			      nblkwritten, ninowritten));
			lfs_segwrite(mntp, SEGM_CLEAN);
			nblkwritten = ninowritten = 0;
		}
	}
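	/*
	 * About the flush test above: nblkwritten counts data blocks
	 * already redirtied, and lfs_lblkno(fs, ninowritten * DINOSIZE(fs))
	 * converts the accumulated inodes into their equivalent in
	 * filesystem blocks, so a partial segment is written before the
	 * redirtied data can outgrow LFS_MAX_BUFS buffers.
	 */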
	/*
	 * Finish the old file, if there was one
	 */
	if (vp != NULL) {
		vput(vp);
		vp = NULL;
		numrefed--;
	}

	if (numrefed != 0)
		panic("lfs_markv: numrefed=%d", numrefed);

	DLOG((DLOG_CLEAN, "lfs_markv: writing %d blks %d inos (check point)\n",
	      nblkwritten, ninowritten));

	/*
	 * The last write has to be SEGM_SYNC, because of calling semantics.
	 * It also has to be SEGM_CKP, because otherwise we could write
	 * over the newly cleaned data contained in a checkpoint, and then
	 * we'd be unhappy at recovery time.
	 */
	lfs_segwrite(mntp, SEGM_CLEAN | SEGM_CKP | SEGM_SYNC);

	lfs_segunlock(fs);

	vfs_unbusy(mntp, false, NULL);
	if (error)
		return (error);
	else if (do_again)
		return EAGAIN;

	return 0;

err2:
	DLOG((DLOG_CLEAN, "lfs_markv err2\n"));

	/*
	 * XXX we're here because copyin() failed.
	 * XXX it means that we can't trust the cleanerd.  too bad.
	 * XXX how can we recover from this?
	 */

err3:
	/*
	 * XXX should do segwrite here anyway?
	 */

	if (vp != NULL) {
		vput(vp);
		vp = NULL;
		--numrefed;
	}

	lfs_segunlock(fs);
	vfs_unbusy(mntp, false, NULL);

	if (numrefed != 0)
		panic("lfs_markv: numrefed=%d", numrefed);

	return (error);
}
/*
 * sys_lfs_bmapv:
 *
 * This will fill in the current disk address for arrays of blocks.
 *
 *  0 on success
 * -1/errno is returned on error.
 */
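/*
 * The cleaner uses this as its liveness test: it passes in the disk
 * addresses it found in a victim segment, and any block whose returned
 * bi_daddr still falls inside that segment is live and must go to
 * lfs_markv().  A cleaner-side call might look roughly like this
 * (illustrative only; assumes the lfs_bmapv(2) stub):
 *
 *	if (lfs_bmapv(&fsid, bip, nblocks) < 0)
 *		syslog(LOG_ERR, "lfs_bmapv: %m");
 */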
#ifdef USE_64BIT_SYSCALLS
int
sys_lfs_bmapv(struct lwp *l, const struct sys_lfs_bmapv_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(struct block_info *) blkiov;
		syscallarg(int) blkcnt;
	} */
	BLOCK_INFO *blkiov;
	int blkcnt, error;
	fsid_t fsid;
	struct lfs *fs;
	struct mount *mntp;

	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);

	if ((mntp = vfs_getvfs(&fsid)) == NULL)
		return (ENOENT);
	fs = VFSTOULFS(mntp)->um_lfs;

	blkcnt = SCARG(uap, blkcnt);
	if ((u_int) blkcnt > SIZE_T_MAX / sizeof(BLOCK_INFO))
		return (EINVAL);

	KERNEL_LOCK(1, NULL);
	blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV);
	if ((error = copyin(SCARG(uap, blkiov), blkiov,
			    blkcnt * sizeof(BLOCK_INFO))) != 0)
		goto out;

	if ((error = lfs_bmapv(l, &fsid, blkiov, blkcnt)) == 0)
		copyout(blkiov, SCARG(uap, blkiov),
			blkcnt * sizeof(BLOCK_INFO));
    out:
	lfs_free(fs, blkiov, LFS_NB_BLKIOV);
	KERNEL_UNLOCK_ONE(NULL);
	return error;
}
#else
int
sys_lfs_bmapv(struct lwp *l, const struct sys_lfs_bmapv_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(struct block_info *) blkiov;
		syscallarg(int) blkcnt;
	} */
	BLOCK_INFO *blkiov;
	BLOCK_INFO_15 *blkiov15;
	int i, blkcnt, error;
	fsid_t fsid;
	struct lfs *fs;
	struct mount *mntp;

	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);

	if ((mntp = vfs_getvfs(&fsid)) == NULL)
		return (ENOENT);
	fs = VFSTOULFS(mntp)->um_lfs;

	blkcnt = SCARG(uap, blkcnt);
	if ((size_t) blkcnt > SIZE_T_MAX / sizeof(BLOCK_INFO))
		return (EINVAL);

	KERNEL_LOCK(1, NULL);
	blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV);
	blkiov15 = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO_15), LFS_NB_BLKIOV);
	if ((error = copyin(SCARG(uap, blkiov), blkiov15,
			    blkcnt * sizeof(BLOCK_INFO_15))) != 0)
		goto out;

	for (i = 0; i < blkcnt; i++) {
		blkiov[i].bi_inode = blkiov15[i].bi_inode;
		blkiov[i].bi_lbn = blkiov15[i].bi_lbn;
		blkiov[i].bi_daddr = blkiov15[i].bi_daddr;
		blkiov[i].bi_segcreate = blkiov15[i].bi_segcreate;
		blkiov[i].bi_version = blkiov15[i].bi_version;
		blkiov[i].bi_bp = blkiov15[i].bi_bp;
		blkiov[i].bi_size = blkiov15[i].bi_size;
	}

	if ((error = lfs_bmapv(l, &fsid, blkiov, blkcnt)) == 0) {
		for (i = 0; i < blkcnt; i++) {
			blkiov15[i].bi_inode = blkiov[i].bi_inode;
			blkiov15[i].bi_lbn = blkiov[i].bi_lbn;
			blkiov15[i].bi_daddr = blkiov[i].bi_daddr;
			blkiov15[i].bi_segcreate = blkiov[i].bi_segcreate;
			blkiov15[i].bi_version = blkiov[i].bi_version;
			blkiov15[i].bi_bp = blkiov[i].bi_bp;
			blkiov15[i].bi_size = blkiov[i].bi_size;
		}
		copyout(blkiov15, SCARG(uap, blkiov),
			blkcnt * sizeof(BLOCK_INFO_15));
	}
    out:
	lfs_free(fs, blkiov, LFS_NB_BLKIOV);
	lfs_free(fs, blkiov15, LFS_NB_BLKIOV);
	KERNEL_UNLOCK_ONE(NULL);
	return error;
}
#endif

int
lfs_bmapv(struct lwp *l, fsid_t *fsidp, BLOCK_INFO *blkiov, int blkcnt)
{
	BLOCK_INFO *blkp;
	IFILE *ifp;
	struct buf *bp;
	struct inode *ip = NULL;
	struct lfs *fs;
	struct mount *mntp;
	struct ulfsmount *ump;
	struct vnode *vp;
	ino_t lastino;
	daddr_t v_daddr;
	int cnt, error;
	int numrefed = 0;

	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_LFS,
	    KAUTH_REQ_SYSTEM_LFS_BMAPV, NULL, NULL, NULL);
	if (error)
		return (error);

	if ((mntp = vfs_getvfs(fsidp)) == NULL)
		return (ENOENT);

	ump = VFSTOULFS(mntp);
	if ((error = vfs_busy(mntp, NULL)) != 0)
		return (error);

	if (ump->um_cleaner_thread == NULL)
		ump->um_cleaner_thread = curlwp;
	KASSERT(ump->um_cleaner_thread == curlwp);

	fs = VFSTOULFS(mntp)->um_lfs;
	cnt = blkcnt;
	error = 0;

	/* these were inside the initialization for the for loop */
	vp = NULL;
	v_daddr = LFS_UNUSED_DADDR;
	lastino = LFS_UNUSED_INUM;
	for (blkp = blkiov; cnt--; ++blkp)
	{
		/*
		 * Get the IFILE entry (only once) and see if the file still
		 * exists.
		 */
		if (lastino != blkp->bi_inode) {
			/*
			 * Finish the old file, if there was one.
			 */
			if (vp != NULL) {
				vput(vp);
				vp = NULL;
				numrefed--;
			}

			/*
			 * Start a new file
			 */
			lastino = blkp->bi_inode;
			if (blkp->bi_inode == LFS_IFILE_INUM)
				v_daddr = lfs_sb_getidaddr(fs);
			else {
				LFS_IENTRY(ifp, fs, blkp->bi_inode, bp);
				v_daddr = lfs_if_getdaddr(fs, ifp);
				brelse(bp, 0);
			}
			if (v_daddr == LFS_UNUSED_DADDR) {
				blkp->bi_daddr = LFS_UNUSED_DADDR;
				continue;
			}
			error = lfs_fastvget(mntp, blkp->bi_inode, NULL,
			    LK_SHARED, &vp);
			if (error) {
				DLOG((DLOG_CLEAN, "lfs_bmapv: lfs_fastvget ino"
				      "%d failed with %d",
				      blkp->bi_inode, error));
				KASSERT(vp == NULL);
				continue;
			} else {
				KASSERT(VOP_ISLOCKED(vp));
				numrefed++;
			}
			ip = VTOI(vp);
		} else if (vp == NULL) {
			/*
			 * This can only happen if the vnode is dead.
			 * Keep going.  Note that we DO NOT set the
			 * bi_addr to anything -- if we failed to get
			 * the vnode, for example, we want to assume
			 * conservatively that all of its blocks *are*
			 * located in the segment in question.
			 * lfs_markv will throw them out if we are
			 * wrong.
			 */
			continue;
		}

		/* Past this point we are guaranteed that vp, ip are valid. */
		if (blkp->bi_lbn == LFS_UNUSED_LBN) {
			/*
			 * We just want the inode address, which is
			 * conveniently in v_daddr.
			 */
			blkp->bi_daddr = v_daddr;
		} else {
			daddr_t bi_daddr;

			error = VOP_BMAP(vp, blkp->bi_lbn, NULL,
					 &bi_daddr, NULL);
			if (error) {
				blkp->bi_daddr = LFS_UNUSED_DADDR;
				continue;
			}
			blkp->bi_daddr = LFS_DBTOFSB(fs, bi_daddr);
			/* Fill in the block size, too */
			if (blkp->bi_lbn >= 0)
				blkp->bi_size = lfs_blksize(fs, ip, blkp->bi_lbn);
			else
				blkp->bi_size = lfs_sb_getbsize(fs);
		}
	}

	/*
	 * Finish the old file, if there was one.
	 */
	if (vp != NULL) {
		vput(vp);
		vp = NULL;
		numrefed--;
	}

	if (numrefed != 0)
		panic("lfs_bmapv: numrefed=%d", numrefed);

	vfs_unbusy(mntp, false, NULL);

	return 0;
}
/*
 * sys_lfs_segclean:
 *
 * Mark the segment clean.
 *
 *  0 on success
 * -1/errno is returned on error.
 */
int
sys_lfs_segclean(struct lwp *l, const struct sys_lfs_segclean_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(u_long) segment;
	} */
	struct lfs *fs;
	struct mount *mntp;
	fsid_t fsid;
	int error;
	unsigned long segnum;

	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_LFS,
	    KAUTH_REQ_SYSTEM_LFS_SEGCLEAN, NULL, NULL, NULL);
	if (error)
		return (error);

	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);
	if ((mntp = vfs_getvfs(&fsid)) == NULL)
		return (ENOENT);

	fs = VFSTOULFS(mntp)->um_lfs;
	segnum = SCARG(uap, segment);

	if ((error = vfs_busy(mntp, NULL)) != 0)
		return (error);

	KERNEL_LOCK(1, NULL);
	lfs_seglock(fs, SEGM_PROT);
	error = lfs_do_segclean(fs, segnum);
	lfs_segunlock(fs);
	KERNEL_UNLOCK_ONE(NULL);
	vfs_unbusy(mntp, false, NULL);
	return error;
}
/*
 * Actually mark the segment clean.
 * Must be called with the segment lock held.
 */
int
lfs_do_segclean(struct lfs *fs, unsigned long segnum)
{
	extern int lfs_dostats;
	struct buf *bp;
	CLEANERINFO *cip;
	SEGUSE *sup;

	if (lfs_dtosn(fs, lfs_sb_getcurseg(fs)) == segnum) {
		return (EBUSY);
	}

	LFS_SEGENTRY(sup, fs, segnum, bp);
	if (sup->su_nbytes) {
		DLOG((DLOG_CLEAN, "lfs_segclean: not cleaning segment %lu:"
		      " %d live bytes\n", segnum, sup->su_nbytes));
		brelse(bp, 0);
		return (EBUSY);
	}
	if (sup->su_flags & SEGUSE_ACTIVE) {
		DLOG((DLOG_CLEAN, "lfs_segclean: not cleaning segment %lu:"
		      " segment is active\n", segnum));
		brelse(bp, 0);
		return (EBUSY);
	}
	if (!(sup->su_flags & SEGUSE_DIRTY)) {
		DLOG((DLOG_CLEAN, "lfs_segclean: not cleaning segment %lu:"
		      " segment is already clean\n", segnum));
		brelse(bp, 0);
		return (EALREADY);
	}
	lfs_sb_addavail(fs, lfs_segtod(fs, 1));
	if (sup->su_flags & SEGUSE_SUPERBLOCK)
		lfs_sb_subavail(fs, lfs_btofsb(fs, LFS_SBPAD));
	if (lfs_sb_getversion(fs) > 1 && segnum == 0 &&
	    lfs_sb_gets0addr(fs) < lfs_btofsb(fs, LFS_LABELPAD))
		lfs_sb_subavail(fs, lfs_btofsb(fs, LFS_LABELPAD) - lfs_sb_gets0addr(fs));
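	/*
	 * In other words: reclaiming the segment returns lfs_segtod(fs, 1)
	 * fsb of available space, but space that can never be rewritten
	 * is taken back out -- the LFS_SBPAD bytes under a superblock
	 * copy, and for a version > 1 filesystem whose segment 0 starts
	 * inside the label area, the unusable part of LFS_LABELPAD.
	 */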
	mutex_enter(&lfs_lock);
	lfs_sb_addbfree(fs, sup->su_nsums * lfs_btofsb(fs, lfs_sb_getsumsize(fs)) +
		lfs_btofsb(fs, sup->su_ninos * lfs_sb_getibsize(fs)));
	lfs_sb_subdmeta(fs, sup->su_nsums * lfs_btofsb(fs, lfs_sb_getsumsize(fs)) +
		lfs_btofsb(fs, sup->su_ninos * lfs_sb_getibsize(fs)));
	if (lfs_sb_getdmeta(fs) < 0)
		lfs_sb_setdmeta(fs, 0);
	mutex_exit(&lfs_lock);
	sup->su_flags &= ~SEGUSE_DIRTY;
	LFS_WRITESEGENTRY(sup, fs, segnum, bp);

	LFS_CLEANERINFO(cip, fs, bp);
	lfs_ci_shiftdirtytoclean(fs, cip, 1);
	lfs_sb_setnclean(fs, lfs_ci_getclean(fs, cip));
	mutex_enter(&lfs_lock);
	lfs_ci_setbfree(fs, cip, lfs_sb_getbfree(fs));
	lfs_ci_setavail(fs, cip, lfs_sb_getavail(fs)
			- fs->lfs_ravail - fs->lfs_favail);
	wakeup(&fs->lfs_availsleep);
	mutex_exit(&lfs_lock);
	(void) LFS_BWRITE_LOG(bp);

	if (lfs_dostats)
		++lfs_stats.segs_reclaimed;

	return (0);
}
/*
 * This will block until a segment in file system fsid is written.  A timeout
 * in milliseconds may be specified which will wake the cleaner automatically.
 * An fsid of -1 means any file system, and a timeout of 0 means forever.
 */
int
lfs_segwait(fsid_t *fsidp, struct timeval *tv)
{
	struct mount *mntp;
	void *addr;
	u_long timeout;
	int error;

	KERNEL_LOCK(1, NULL);
	if (fsidp == NULL || (mntp = vfs_getvfs(fsidp)) == NULL)
		addr = &lfs_allclean_wakeup;
	else
		addr = &VFSTOULFS(mntp)->um_lfs->lfs_nextsegsleep;
	/*
	 * XXX THIS COULD SLEEP FOREVER IF TIMEOUT IS {0,0}!
	 * XXX IS THAT WHAT IS INTENDED?
	 */
	timeout = tvtohz(tv);
	error = tsleep(addr, PCATCH | PVFS, "segment", timeout);
	KERNEL_UNLOCK_ONE(NULL);
	return (error == ERESTART ? EINTR : 0);
}
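/*
 * An illustrative caller (assumes the lfs_segwait(2) stub): to wait at
 * most five seconds for the next segment write on one filesystem,
 *
 *	struct timeval tv = { 5, 0 };
 *	(void)lfs_segwait(&fsid, &tv);
 *
 * while a timeout of {0, 0} blocks until a segment is written.
 */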
/*
 * sys_lfs_segwait:
 *
 * System call wrapper around lfs_segwait().
 *
 *  0 on success
 * -1/errno is returned on error.
 */
int
sys___lfs_segwait50(struct lwp *l, const struct sys___lfs_segwait50_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(struct timeval *) tv;
	} */
	struct timeval atv;
	fsid_t fsid;
	int error;

	/* XXX need we be su to segwait? */
	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_LFS,
	    KAUTH_REQ_SYSTEM_LFS_SEGWAIT, NULL, NULL, NULL);
	if (error)
		return (error);

	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);

	if (SCARG(uap, tv)) {
		error = copyin(SCARG(uap, tv), &atv, sizeof(struct timeval));
		if (error)
			return (error);
		if (itimerfix(&atv))
			return (EINVAL);
	} else /* NULL or invalid */
		atv.tv_sec = atv.tv_usec = 0;

	return lfs_segwait(&fsid, &atv);
}
/*
 * VFS_VGET call specialized for the cleaner.  If the cleaner is
 * processing IINFO structures, it may have the ondisk inode already, so
 * don't go retrieving it again.
 *
 * Return the vnode referenced and locked.
 */
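/*
 * The hint handshake below works like this: the BLOCK_INFO the cleaner
 * passed in may already carry the on-disk inode image read from the
 * victim segment, so it is parked in ump->um_cleaner_hint around the
 * vcache_get() call, where the vnode-loading code (presumably
 * lfs_loadvnode()) can pick it up instead of reading the inode from
 * disk again.
 */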
static int
lfs_fastvget(struct mount *mp, ino_t ino, BLOCK_INFO *blkp, int lk_flags,
    struct vnode **vpp)
{
	struct ulfsmount *ump;
	int error;

	ump = VFSTOULFS(mp);
	ump->um_cleaner_hint = blkp;
	error = vcache_get(mp, &ino, sizeof(ino), vpp);
	ump->um_cleaner_hint = NULL;
	if (error)
		return error;
	error = vn_lock(*vpp, lk_flags);
	if (error) {
		if (error == EBUSY)
			error = EAGAIN;
		vrele(*vpp);
		*vpp = NULL;
		return error;
	}

	return 0;
}
/*
 * Make up a "fake" cleaner buffer, copy the data from userland into it.
 */
static struct buf *
lfs_fakebuf(struct lfs *fs, struct vnode *vp, daddr_t lbn, size_t size, void *uaddr)
{
	struct buf *bp;
	int error;

	KASSERT(VTOI(vp)->i_number != LFS_IFILE_INUM);

	bp = lfs_newbuf(VTOI(vp)->i_lfs, vp, lbn, size, LFS_NB_CLEAN);
	error = copyin(uaddr, bp->b_data, size);
	if (error) {
		lfs_freebuf(fs, bp);
		return NULL;
	}
	KDASSERT(bp->b_iodone == lfs_callback);

#if 0
	mutex_enter(&lfs_lock);
	++fs->lfs_iocount;
	mutex_exit(&lfs_lock);
#endif

	bp->b_bufsize = size;
	bp->b_bcount = size;
	return (bp);
}