/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 1983, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2016, 2017 by Delphix. All rights reserved.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/uio.h>
#include <sys/bitmap.h>
#include <sys/signal.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/vfs.h>
#include <sys/stat.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/dnlc.h>
#include <sys/mode.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/acl.h>
#include <sys/var.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_fs.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_acl.h>
#include <sys/fs/ufs_bio.h>
#include <sys/fs/ufs_quota.h>
#include <sys/fs/ufs_log.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/pvn.h>
#include <vm/seg.h>
#include <sys/swap.h>
#include <sys/cpuvar.h>
#include <sys/sysmacros.h>
#include <sys/errno.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/fs_subr.h>
#include <sys/policy.h>

struct kmem_cache *inode_cache;	/* cache of free inodes */

/* UFS Inode Cache Stats -- Not protected */
struct	instats ins = {
	{ "size",		KSTAT_DATA_ULONG },
	{ "maxsize",		KSTAT_DATA_ULONG },
	{ "hits",		KSTAT_DATA_ULONG },
	{ "misses",		KSTAT_DATA_ULONG },
	{ "kmem allocs",	KSTAT_DATA_ULONG },
	{ "kmem frees",		KSTAT_DATA_ULONG },
	{ "maxsize reached",	KSTAT_DATA_ULONG },
	{ "puts at frontlist",	KSTAT_DATA_ULONG },
	{ "puts at backlist",	KSTAT_DATA_ULONG },
	{ "queues to free",	KSTAT_DATA_ULONG },
	{ "scans",		KSTAT_DATA_ULONG },
	{ "thread idles",	KSTAT_DATA_ULONG },
	{ "lookup idles",	KSTAT_DATA_ULONG },
	{ "vget idles",		KSTAT_DATA_ULONG },
	{ "cache allocs",	KSTAT_DATA_ULONG },
	{ "cache frees",	KSTAT_DATA_ULONG },
	{ "pushes at close",	KSTAT_DATA_ULONG }
};

/* kstat data */
static kstat_t		*ufs_inode_kstat = NULL;

union ihead *ihead;	/* inode LRU cache, Chris Maltby */
kmutex_t *ih_lock;	/* protect inode cache hash table */
static int ino_hashlen = 4;	/* desired average hash chain length */
int inohsz;		/* number of buckets in the hash table */

kmutex_t	ufs_scan_lock;	/* stop racing multiple ufs_scan_inodes() */
kmutex_t	ufs_iuniqtime_lock; /* protect iuniqtime */
kmutex_t	ufsvfs_mutex;
struct ufsvfs	*oldufsvfslist, *ufsvfslist;

/*
 * time to wait after ufsvfsp->vfs_iotstamp before declaring that no
 * I/Os are going on.
 */
clock_t	ufs_iowait;

/*
 * the threads that process idle inodes and free (deleted) inodes
 * have high water marks that are set in ufsinit().
 * These values can be no less than the minimum shown below.
 */
int	ufs_idle_max;	/* # of allowable idle inodes */
ulong_t	ufs_inode_max;	/* hard limit of allowable idle inodes */
#define	UFS_IDLE_MAX	(16)	/* min # of allowable idle inodes */

/*
 * Tunables for ufs write throttling.
 * These are validated in ufs_iinit() since improper settings
 * can lead to filesystem hangs.
 */
#define	UFS_HW_DEFAULT	(16 * 1024 * 1024)
#define	UFS_LW_DEFAULT	(8 * 1024 * 1024)
int	ufs_HW = UFS_HW_DEFAULT;
int	ufs_LW = UFS_LW_DEFAULT;
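
/*
 * A sketch of how these watermarks are used elsewhere in UFS (see the
 * write paths for the authoritative logic): a file with more than
 * ufs_HW bytes of outstanding writes (i_writes) throttles new writers
 * until the backlog drains below ufs_LW, at which point waiters on
 * i_wrcv are signalled.  ufs_iinit() below only validates the pair.
 */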

static void ihinit(void);
extern int hash2ints(int, int);

static int ufs_iget_internal(struct vfs *, ino_t, struct inode **,
	struct cred *, int);

/* ARGSUSED */
static int
ufs_inode_kstat_update(kstat_t *ksp, int rw)
{
	if (rw == KSTAT_WRITE)
		return (EACCES);

	ins.in_malloc.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
	    "slab_alloc");
	ins.in_mfree.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
	    "slab_free");
	ins.in_kcalloc.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
	    "alloc");
	ins.in_kcfree.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
	    "free");
	ins.in_size.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
	    "buf_inuse");
	ins.in_maxreached.value.ul = (ulong_t)kmem_cache_stat(inode_cache,
	    "buf_max");
	ins.in_misses.value.ul = ins.in_kcalloc.value.ul;

	return (0);
}

void
ufs_iinit(void)
{
	/*
	 * Validate that ufs_HW > ufs_LW.
	 * The default values for these two tunables have been increased.
	 * There is now a range of values for ufs_HW that used to be
	 * legal on previous Solaris versions but no longer is now.
	 * Upgrading a machine which has an /etc/system setting for ufs_HW
	 * from that range can lead to filesystem hangs unless the values
	 * are checked here.
	 */
	if (ufs_HW <= ufs_LW) {
		cmn_err(CE_WARN,
		    "ufs_HW (%d) <= ufs_LW (%d). Check /etc/system.",
		    ufs_HW, ufs_LW);
		ufs_LW = UFS_LW_DEFAULT;
		ufs_HW = UFS_HW_DEFAULT;
		cmn_err(CE_CONT, "using defaults, ufs_HW = %d, ufs_LW = %d\n",
		    ufs_HW, ufs_LW);
	}

	/*
	 * Adjust the tunable `ufs_ninode' to a reasonable value
	 */
	if (ufs_ninode <= 0)
		ufs_ninode = ncsize;
	if (ufs_inode_max == 0)
		ufs_inode_max =
		    (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct inode));
	if (ufs_ninode > ufs_inode_max || (ufs_ninode == 0 && ncsize == 0)) {
		cmn_err(CE_NOTE, "setting ufs_ninode to max value of %ld",
		    ufs_inode_max);
		ufs_ninode = ufs_inode_max;
	}
	/*
	 * Wait till third call of ufs_update to declare that no I/Os are
	 * going on. This allows deferred access times to be flushed to disk.
	 */
	ufs_iowait = v.v_autoup * hz * 2;
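
	/*
	 * Worked example, assuming the common defaults of v_autoup = 30
	 * seconds and hz = 100: ufs_iowait = 30 * 100 * 2 = 6000 ticks,
	 * i.e. two full autoup intervals, or 60 seconds.
	 */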

	/*
	 * idle thread runs when 25% of ufs_ninode entries are on the queue
	 */
	if (ufs_idle_max == 0)
		ufs_idle_max = ufs_ninode >> 2;
	if (ufs_idle_max < UFS_IDLE_MAX)
		ufs_idle_max = UFS_IDLE_MAX;
	if (ufs_idle_max > ufs_ninode)
		ufs_idle_max = ufs_ninode;
	/*
	 * This is really a misnomer, it is ufs_queue_init
	 */
	ufs_thread_init(&ufs_idle_q, ufs_idle_max);
	ufs_thread_start(&ufs_idle_q, ufs_thread_idle, NULL);

	/*
	 * global hlock thread
	 */
	ufs_thread_init(&ufs_hlock, 1);
	ufs_thread_start(&ufs_hlock, ufs_thread_hlock, NULL);

	ihinit();
	qtinit();
	ins.in_maxsize.value.ul = ufs_ninode;
	if ((ufs_inode_kstat = kstat_create("ufs", 0, "inode_cache", "ufs",
	    KSTAT_TYPE_NAMED, sizeof (ins) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL)) != NULL) {
		ufs_inode_kstat->ks_data = (void *)&ins;
		ufs_inode_kstat->ks_update = ufs_inode_kstat_update;
		kstat_install(ufs_inode_kstat);
	}
	ufsfx_init();		/* fix-on-panic initialization */
	si_cache_init();
	ufs_directio_init();
	lufs_init();
	mutex_init(&ufs_iuniqtime_lock, NULL, MUTEX_DEFAULT, NULL);
}

/* ARGSUSED */
static int
ufs_inode_cache_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct inode *ip = buf;
	struct vnode *vp;

	vp = ip->i_vnode = vn_alloc(kmflags);
	if (vp == NULL) {
		return (-1);
	}
	vn_setops(vp, &ufs_vnodeops);
	vp->v_data = ip;

	rw_init(&ip->i_rwlock, NULL, RW_DEFAULT, NULL);
	rw_init(&ip->i_contents, NULL, RW_DEFAULT, NULL);
	mutex_init(&ip->i_tlock, NULL, MUTEX_DEFAULT, NULL);
	dnlc_dir_init(&ip->i_danchor);

	cv_init(&ip->i_wrcv, NULL, CV_DRIVER, NULL);

	return (0);
}

/* ARGSUSED */
static void
ufs_inode_cache_destructor(void *buf, void *cdrarg)
{
	struct inode *ip = buf;
	struct vnode *vp;

	vp = ITOV(ip);

	rw_destroy(&ip->i_rwlock);
	rw_destroy(&ip->i_contents);
	mutex_destroy(&ip->i_tlock);
	if (vp->v_type == VDIR) {
		dnlc_dir_fini(&ip->i_danchor);
	}

	cv_destroy(&ip->i_wrcv);

	vn_free(vp);
}

/*
 * Initialize hash links for inodes
 * and build inode free list.
 */
void
ihinit(void)
{
	int i;
	union	ihead *ih = ihead;

	mutex_init(&ufs_scan_lock, NULL, MUTEX_DEFAULT, NULL);

	inohsz = 1 << highbit(ufs_ninode / ino_hashlen);
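	/*
	 * inohsz is rounded up to a power of two, presumably so that
	 * INOHASH() can reduce a hash value to a bucket index with a
	 * mask of the form (hash & (inohsz - 1)) instead of a modulo.
	 */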
	ihead = kmem_zalloc(inohsz * sizeof (union ihead), KM_SLEEP);
	ih_lock = kmem_zalloc(inohsz * sizeof (kmutex_t), KM_SLEEP);

	for (i = 0, ih = ihead; i < inohsz; i++, ih++) {
		ih->ih_head[0] = ih;
		ih->ih_head[1] = ih;
		mutex_init(&ih_lock[i], NULL, MUTEX_DEFAULT, NULL);
	}
	inode_cache = kmem_cache_create("ufs_inode_cache",
	    sizeof (struct inode), 0, ufs_inode_cache_constructor,
	    ufs_inode_cache_destructor, ufs_inode_cache_reclaim,
	    NULL, NULL, 0);
}

/*
 * Free an inode structure
 */
void
ufs_free_inode(struct inode *ip)
{
	vn_invalid(ITOV(ip));
	kmem_cache_free(inode_cache, ip);
}

/*
 * Allocate an inode structure
 */
struct inode *
ufs_alloc_inode(ufsvfs_t *ufsvfsp, ino_t ino)
{
	struct inode *ip;
	vnode_t *vp;

	ip = kmem_cache_alloc(inode_cache, KM_SLEEP);
	/*
	 * at this point we have a newly allocated inode
	 */
	ip->i_freef = ip;
	ip->i_freeb = ip;
	ip->i_flag = IREF;
	ip->i_seq = 0xFF;	/* Unique initial value */
	ip->i_dev = ufsvfsp->vfs_dev;
	ip->i_ufsvfs = ufsvfsp;
	ip->i_devvp = ufsvfsp->vfs_devvp;
	ip->i_number = ino;
	ip->i_diroff = 0;
	ip->i_nextr = 0;
	ip->i_map = NULL;
	ip->i_rdev = 0;
	ip->i_writes = 0;
	ip->i_mode = 0;
	ip->i_delaylen = 0;
	ip->i_delayoff = 0;
	ip->i_nextrio = 0;
	ip->i_ufs_acl = NULL;
	ip->i_cflags = 0;
	ip->i_mapcnt = 0;
	ip->i_dquot = NULL;
	ip->i_cachedir = CD_ENABLED;
	ip->i_writer = NULL;

	/*
	 * the vnode for this inode was allocated by the constructor
	 */
	vp = ITOV(ip);
	vn_reinit(vp);
	if (ino == (ino_t)UFSROOTINO)
		vp->v_flag = VROOT;
	vp->v_vfsp = ufsvfsp->vfs_vfs;
	vn_exists(vp);
	return (ip);
}

/*
 * Look up an inode by device, inumber.  If it is in core (in the
 * inode structure), honor the locking protocol.  If it is not in
 * core, read it in from the specified device after freeing any pages.
 * In all cases, a pointer to a VN_HELD inode structure is returned.
 */
int
ufs_iget(struct vfs *vfsp, ino_t ino, struct inode **ipp, struct cred *cr)
{
	return (ufs_iget_internal(vfsp, ino, ipp, cr, 0));
}

/*
 * A version of ufs_iget which returns only allocated, linked inodes.
 * This is appropriate for any callers who do not expect a free inode.
 */
int
ufs_iget_alloced(struct vfs *vfsp, ino_t ino, struct inode **ipp,
    struct cred *cr)
{
	return (ufs_iget_internal(vfsp, ino, ipp, cr, 1));
}

/*
 * Set vnode attributes based on v_type, this should be called whenever
 * an inode's i_mode is changed.
 */
void
ufs_reset_vnode(vnode_t *vp)
{
	/*
	 * an old DBE hack
	 */
	if ((VTOI(vp)->i_mode & (ISVTX | IEXEC | IFDIR)) == ISVTX)
		vp->v_flag |= VSWAPLIKE;
	else
		vp->v_flag &= ~VSWAPLIKE;

	/*
	 * if not swap like and it's just a regular file, we want
	 * to maintain the vnode's pages sorted by clean/modified
	 * for faster sync'ing to disk
	 */
	if (vp->v_type == VREG)
		vp->v_flag |= VMODSORT;
	else
		vp->v_flag &= ~VMODSORT;

	/*
	 * Is this an attribute hidden dir?
	 */
	if ((VTOI(vp)->i_mode & IFMT) == IFATTRDIR)
		vp->v_flag |= V_XATTRDIR;
	else
		vp->v_flag &= ~V_XATTRDIR;
}

/*
 * Shared implementation of ufs_iget and ufs_iget_alloced.  The 'validate'
 * flag is used to distinguish the two; when true, we validate that the inode
 * being retrieved looks like a linked and allocated inode.
 */
/* ARGSUSED */
static int
ufs_iget_internal(struct vfs *vfsp, ino_t ino, struct inode **ipp,
    struct cred *cr, int validate)
{
	struct inode *ip, *sp;
	union ihead *ih;
	kmutex_t *ihm;
	struct buf *bp;
	struct dinode *dp;
	struct vnode *vp;
	extern vfs_t EIO_vfs;
	int error;
	int ftype;	/* XXX - Remove later on */
	dev_t vfs_dev;
	struct ufsvfs *ufsvfsp;
	struct fs *fs;
	int hno;
	daddr_t bno;
	ulong_t ioff;

	CPU_STATS_ADD_K(sys, ufsiget, 1);

	/*
	 * Lookup inode in cache.
	 */
	vfs_dev = vfsp->vfs_dev;
	hno = INOHASH(ino);
	ih = &ihead[hno];
	ihm = &ih_lock[hno];

again:
	mutex_enter(ihm);
	for (ip = ih->ih_chain[0]; ip != (struct inode *)ih; ip = ip->i_forw) {
		if (ino != ip->i_number || vfs_dev != ip->i_dev ||
		    (ip->i_flag & ISTALE))
			continue;

		/*
		 * Found the interesting inode; hold it and drop the cache lock
		 */
		vp = ITOV(ip);	/* for locknest */
		VN_HOLD(vp);
		mutex_exit(ihm);
		rw_enter(&ip->i_contents, RW_READER);

		/*
		 * if necessary, remove from idle list
		 */
		if ((ip->i_flag & IREF) == 0) {
			if (ufs_rmidle(ip))
				VN_RELE(vp);
		}

		/*
		 * Could the inode be read from disk?
		 */
		if (ip->i_flag & ISTALE) {
			rw_exit(&ip->i_contents);
			VN_RELE(vp);
			goto again;
		}

		ins.in_hits.value.ul++;
		*ipp = ip;

		/*
		 * Reset the vnode's attribute flags
		 */
		mutex_enter(&vp->v_lock);
		ufs_reset_vnode(vp);
		mutex_exit(&vp->v_lock);

		rw_exit(&ip->i_contents);

		return (0);
	}
	mutex_exit(ihm);

	/*
	 * Inode was not in cache.
	 *
	 * Allocate a new entry
	 */
	ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
	fs = ufsvfsp->vfs_fs;

	ip = ufs_alloc_inode(ufsvfsp, ino);
	vp = ITOV(ip);

	bno = fsbtodb(fs, itod(fs, ino));
	ioff = (sizeof (struct dinode)) * (itoo(fs, ino));
	ip->i_doff = (offset_t)ioff + ldbtob(bno);

	/*
	 * put a place holder in the cache (if not already there)
	 */
	mutex_enter(ihm);
	for (sp = ih->ih_chain[0]; sp != (struct inode *)ih; sp = sp->i_forw)
		if (ino == sp->i_number && vfs_dev == sp->i_dev &&
		    ((sp->i_flag & ISTALE) == 0)) {
			mutex_exit(ihm);
			ufs_free_inode(ip);
			goto again;
		}

	/*
	 * It would be nice to ASSERT(RW_READ_HELD(&ufsvfsp->vfs_dqrwlock))
	 * here, but if we do, then shadow inode allocations panic the
	 * system.  We don't have to hold vfs_dqrwlock for shadow inodes
	 * and the ufs_iget() parameters don't tell us what we are getting
	 * so we have no way of knowing this is a ufs_iget() call from
	 * a ufs_ialloc() call for a shadow inode.
	 */
	rw_enter(&ip->i_contents, RW_WRITER);
	insque(ip, ih);
	mutex_exit(ihm);

	/*
	 * read the dinode
	 */
	bp = UFS_BREAD(ufsvfsp, ip->i_dev, bno, (int)fs->fs_bsize);

	/*
	 * Check I/O errors
	 */
	error = ((bp->b_flags & B_ERROR) ? geterror(bp) : 0);
	if (error) {
		brelse(bp);
		ip->i_flag |= ISTALE;	/* in case someone is looking it up */
		rw_exit(&ip->i_contents);
		vp->v_vfsp = &EIO_vfs;
		VN_RELE(vp);
		return (error);
	}

	/*
	 * initialize the inode's dinode
	 */
	dp = (struct dinode *)(ioff + bp->b_un.b_addr);
	ip->i_ic = dp->di_ic;	/* structure assignment */
	brelse(bp);

	/*
	 * Maintain compatibility with Solaris 1.x UFS
	 */
	if (ip->i_suid != UID_LONG)
		ip->i_uid = ip->i_suid;
	if (ip->i_sgid != GID_LONG)
		ip->i_gid = ip->i_sgid;

	ftype = ip->i_mode & IFMT;
	if (ftype == IFBLK || ftype == IFCHR) {
		dev_t dv;
		uint_t top16 = ip->i_ordev & 0xffff0000u;

		if (top16 == 0 || top16 == 0xffff0000u)
			dv = expdev(ip->i_ordev);
		else
			dv = expldev(ip->i_ordev);
		vp->v_rdev = ip->i_rdev = dv;
	}
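
	/*
	 * A reading of the two expansion paths above (a sketch, not
	 * authoritative): an on-disk i_ordev whose top 16 bits are all
	 * zeros or all ones is treated as an old-format 16-bit device
	 * number and widened with expdev(); any other value is taken
	 * to be a 32-bit dev32_t and widened with expldev().
	 */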

	/*
	 * if our caller only expects allocated inodes, verify that
	 * this inode looks good; throw it out if it's bad.
	 */
	if (validate) {
		if ((ftype == 0) || (ip->i_nlink <= 0)) {
			ip->i_flag |= ISTALE;
			rw_exit(&ip->i_contents);
			vp->v_vfsp = &EIO_vfs;
			VN_RELE(vp);
			cmn_err(CE_NOTE,
			    "%s: unexpected free inode %d, run fsck(1M)%s",
			    fs->fs_fsmnt, (int)ino,
			    (TRANS_ISTRANS(ufsvfsp) ? " -o f" : ""));
			return (EIO);
		}
	}

	/*
	 * Finish initializing the vnode, special handling for shadow inodes
	 * because IFTOVT() will produce a v_type of VNON which is not what we
	 * want, set v_type to VREG explicitly in that case.
	 */
	if (ftype == IFSHAD) {
		vp->v_type = VREG;
	} else {
		vp->v_type = IFTOVT((mode_t)ip->i_mode);
	}

	ufs_reset_vnode(vp);

	/*
	 * read the shadow
	 */
	if (ftype != 0 && ip->i_shadow != 0) {
		if ((error = ufs_si_load(ip, cr)) != 0) {
			ip->i_flag |= ISTALE;
			ip->i_ufs_acl = NULL;
			rw_exit(&ip->i_contents);
			vp->v_vfsp = &EIO_vfs;
			VN_RELE(vp);
			return (error);
		}
	}

	/*
	 * Only attach quota information if the inode has a type and if
	 * that type is not a shadow inode.
	 */
	if (ip->i_mode && ((ip->i_mode & IFMT) != IFSHAD) &&
	    ((ip->i_mode & IFMT) != IFATTRDIR)) {
		ip->i_dquot = getinoquota(ip);
	}
	TRANS_MATA_IGET(ufsvfsp, ip);
	*ipp = ip;
	rw_exit(&ip->i_contents);

	return (0);
}

/*
 * Vnode is no longer referenced, write the inode out
 * and if necessary, truncate and deallocate the file.
 */
void
ufs_iinactive(struct inode *ip)
{
	int		front;
	struct inode	*iq;
	struct inode	*hip;
	struct ufs_q	*uq;
	struct vnode	*vp = ITOV(ip);
	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;
	struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;

	/*
	 * Because the vnode type might have been changed,
	 * the dnlc_dir_purge must be called unconditionally.
	 */
	dnlc_dir_purge(&ip->i_danchor);

	/*
	 * Get exclusive access to inode data.
	 */
	rw_enter(&ip->i_contents, RW_WRITER);
	ASSERT(ip->i_flag & IREF);

	/*
	 * Make sure no one reclaimed the inode before we put it on
	 * the freelist or destroy it. We keep our 'hold' on the vnode
	 * from vn_rele until we are ready to do something with the inode.
	 *
	 * Pageout may put a VN_HOLD/VN_RELE at anytime during this
	 * operation via an async putpage, so we must make sure
	 * we don't free/destroy the inode more than once. ufs_iget
	 * may also put a VN_HOLD on the inode before it grabs
	 * the i_contents lock. This is done so we don't free
	 * an inode that a thread is waiting on.
	 */
	mutex_enter(&vp->v_lock);

	if (vp->v_count > 1) {
		VN_RELE_LOCKED(vp);
		mutex_exit(&vp->v_lock);
		rw_exit(&ip->i_contents);
		return;
	}
	mutex_exit(&vp->v_lock);

	/*
	 * For umount case: if ufsvfs ptr is NULL, the inode is unhashed
	 * and clean.  It can be safely destroyed (cyf).
	 */
	if (ip->i_ufsvfs == NULL) {
		rw_exit(&ip->i_contents);
		ufs_si_del(ip);
		ASSERT((vp->v_type == VCHR) || !vn_has_cached_data(vp));
		ufs_free_inode(ip);
		return;
	}

	/*
	 * queue idle inode to appropriate thread. Will check v_count == 1
	 * prior to putting this on the appropriate queue.
	 * Stale inodes will be unhashed and freed by the ufs idle thread
	 * in ufs_idle_free()
	 */
	front = 1;
	if ((ip->i_flag & ISTALE) == 0 && ip->i_fs->fs_ronly == 0 &&
	    ip->i_mode && ip->i_nlink <= 0) {
		/*
		 * Mark the i_flag to indicate that inode is being deleted.
		 * This flag will be cleared when the deletion is complete.
		 * This prevents nfs from sneaking in via ufs_vget() while
		 * the delete is in progress (bugid 1242481).
		 */
		ip->i_flag |= IDEL;

		/*
		 * NOIDEL means that deletes are not allowed at this time;
		 * whoever resets NOIDEL will also send this inode back
		 * through ufs_iinactive.  IREF remains set.
		 */
		if (ULOCKFS_IS_NOIDEL(ITOUL(ip))) {
			mutex_enter(&vp->v_lock);
			VN_RELE_LOCKED(vp);
			mutex_exit(&vp->v_lock);
			rw_exit(&ip->i_contents);
			return;
		}
		if (!TRANS_ISTRANS(ip->i_ufsvfs)) {
			rw_exit(&ip->i_contents);
			ufs_delete(ip->i_ufsvfs, ip, 0);
			return;
		}

		/* queue to delete thread; IREF remains set */
		ins.in_qfree.value.ul++;
		uq = &ip->i_ufsvfs->vfs_delete;

		mutex_enter(&uq->uq_mutex);

		/* add to q */
		if ((iq = uq->uq_ihead) != 0) {
			ip->i_freef = iq;
			ip->i_freeb = iq->i_freeb;
			iq->i_freeb->i_freef = ip;
			iq->i_freeb = ip;
			if (front)
				uq->uq_ihead = ip;
		} else {
			uq->uq_ihead = ip;
			ip->i_freef = ip;
			ip->i_freeb = ip;
		}

		delq_info->delq_unreclaimed_files += 1;
		delq_info->delq_unreclaimed_blocks += ip->i_blocks;
	} else {
		/*
		 * queue to idle thread
		 *  Check the v_count == 1 again.
		 */
		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			VN_RELE_LOCKED(vp);
			mutex_exit(&vp->v_lock);
			rw_exit(&ip->i_contents);
			return;
		}
		mutex_exit(&vp->v_lock);
		uq = &ufs_idle_q;

		/*
		 * useful iff it has pages or is a fastsymlink; otherwise junk
		 */
		mutex_enter(&uq->uq_mutex);

		/* clear IREF means `on idle list' */
		ip->i_flag &= ~(IREF | IDIRECTIO);

		if (vn_has_cached_data(vp) || ip->i_flag & IFASTSYMLNK) {
			ins.in_frback.value.ul++;
			hip = (inode_t *)&ufs_useful_iq[IQHASH(ip)];
			ufs_nuseful_iq++;
		} else {
			ins.in_frfront.value.ul++;
			hip = (inode_t *)&ufs_junk_iq[IQHASH(ip)];
			ip->i_flag |= IJUNKIQ;
			ufs_njunk_iq++;
		}
		ip->i_freef = hip;
		ip->i_freeb = hip->i_freeb;
		hip->i_freeb->i_freef = ip;
		hip->i_freeb = ip;
	}

	/* wakeup thread(s) if q is overfull */
	if (++uq->uq_ne == uq->uq_lowat)
		cv_broadcast(&uq->uq_cv);

	/* all done, release the q and inode */
	mutex_exit(&uq->uq_mutex);
	rw_exit(&ip->i_contents);
}

/*
 * Check accessed and update flags on an inode structure.
 * If any are on, update the inode with the (unique) current time.
 * If waitfor is given, ensure I/O ordering by waiting for the write
 * to complete.
 */
void
ufs_iupdat(struct inode *ip, int waitfor)
{
	struct buf	*bp;
	struct fs	*fp;
	struct dinode	*dp;
	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;
	int		i;
	int		do_trans_times;
	ushort_t	flag;
	o_uid_t		suid;
	o_gid_t		sgid;

	/*
	 * This function is now safe to be called with either the reader
	 * or writer i_contents lock.
	 */
	ASSERT(RW_LOCK_HELD(&ip->i_contents));

	/*
	 * Return if file system has been forcibly umounted.
	 */
	if (ufsvfsp == NULL)
		return;

	flag = ip->i_flag;	/* Atomic read */
	/*
	 * We better not update the disk inode from a stale inode.
	 */
	if (flag & ISTALE)
		return;

	fp = ip->i_fs;

	if ((flag & (IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG)) != 0) {
		if (fp->fs_ronly) {
			mutex_enter(&ip->i_tlock);
			ip->i_flag &= ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
			mutex_exit(&ip->i_tlock);
			return;
		}
		/*
		 * fs is active while metadata is being written
		 */
		mutex_enter(&ufsvfsp->vfs_lock);
		ufs_notclean(ufsvfsp);
		/*
		 * get the dinode
		 */
		bp = UFS_BREAD(ufsvfsp, ip->i_dev,
		    (daddr_t)fsbtodb(fp, itod(fp, ip->i_number)),
		    (int)fp->fs_bsize);
		if (bp->b_flags & B_ERROR) {
			mutex_enter(&ip->i_tlock);
			ip->i_flag &=
			    ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
			mutex_exit(&ip->i_tlock);
			brelse(bp);
			return;
		}
		/*
		 * munge inode fields
		 */
		mutex_enter(&ip->i_tlock);
		ITIMES_NOLOCK(ip);
		do_trans_times = ((ip->i_flag & (IMOD|IMODACC)) == IMODACC);
		ip->i_flag &= ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
		mutex_exit(&ip->i_tlock);

		/*
		 * For reads and concurrent re-writes, no deltas were
		 * entered for the access time changes - do it now.
		 */
		if (do_trans_times) {
			TRANS_INODE_TIMES(ufsvfsp, ip);
		}

		/*
		 * For SunOS 5.0->5.4, these lines below read:
		 *
		 * suid = (ip->i_uid > MAXUID) ? UID_LONG : ip->i_uid;
		 * sgid = (ip->i_gid > MAXUID) ? GID_LONG : ip->i_gid;
		 *
		 * where MAXUID was set to 60002.  This was incorrect -
		 * the uids should have been constrained to what fitted into
		 * a 16-bit word.
		 *
		 * This means that files from 4.x filesystems that have an
		 * i_suid field larger than 60002 will have that field
		 * changed to 65535.
		 *
		 * Security note: 4.x UFS could never create a i_suid of
		 * UID_LONG since that would've corresponded to -1.
		 */
		suid = (ulong_t)ip->i_uid > (ulong_t)USHRT_MAX ?
		    UID_LONG : ip->i_uid;
		sgid = (ulong_t)ip->i_gid > (ulong_t)USHRT_MAX ?
		    GID_LONG : ip->i_gid;

		if ((ip->i_suid != suid) || (ip->i_sgid != sgid)) {
			ip->i_suid = suid;
			ip->i_sgid = sgid;
			TRANS_INODE(ufsvfsp, ip);
		}

		if ((ip->i_mode & IFMT) == IFBLK ||
		    (ip->i_mode & IFMT) == IFCHR) {
			dev_t d = ip->i_rdev;
			dev32_t dev32;

			/*
			 * load first direct block only if special device
			 */
			if (!cmpldev(&dev32, d)) {
				/*
				 * We panic here because there's "no way"
				 * we should have been able to create a large
				 * inode with a large dev_t.  Earlier layers
				 * should've caught this.
				 */
				panic("ip %p: i_rdev too big", (void *)ip);
			}

			if (dev32 & ~((O_MAXMAJ << L_BITSMINOR32) | O_MAXMIN)) {
				ip->i_ordev = dev32;	/* can't use old fmt. */
			} else {
				ip->i_ordev = cmpdev(d);
			}
		}

		/*
		 * copy inode to dinode (zero fastsymlnk in dinode)
		 */
		dp = (struct dinode *)bp->b_un.b_addr + itoo(fp, ip->i_number);
		dp->di_ic = ip->i_ic;	/* structure assignment */
		if (flag & IFASTSYMLNK) {
			for (i = 1; i < NDADDR; i++)
				dp->di_db[i] = 0;
			for (i = 0; i < NIADDR; i++)
				dp->di_ib[i] = 0;
		}
		if (TRANS_ISTRANS(ufsvfsp)) {
			/*
			 * Pass only a sector size buffer containing
			 * the inode, otherwise when the buffer is copied
			 * into a cached roll buffer then too much memory
			 * gets consumed if 8KB inode buffers are passed.
			 */
			TRANS_LOG(ufsvfsp, (caddr_t)dp, ip->i_doff,
			    sizeof (struct dinode),
			    (caddr_t)P2ALIGN((uintptr_t)dp, DEV_BSIZE),
			    DEV_BSIZE);

			brelse(bp);
		} else if (waitfor && (ip->i_ufsvfs->vfs_dio == 0)) {
			UFS_BRWRITE(ufsvfsp, bp);

			/*
			 * Synchronous write has guaranteed that inode
			 * has been written on disk so clear the flag
			 */
			mutex_enter(&ip->i_tlock);
			ip->i_flag &= ~IBDWRITE;
			mutex_exit(&ip->i_tlock);
		} else {
			bdrwrite(bp);

			/*
			 * This write hasn't guaranteed that inode has been
			 * written on the disk.
			 * Since, all updat flags on inode are cleared, we must
			 * remember the condition in case inode is to be updated
			 * synchronously later (e.g.- fsync()/fdatasync())
			 * and inode has not been modified yet.
			 */
			mutex_enter(&ip->i_tlock);
			ip->i_flag |= IBDWRITE;
			mutex_exit(&ip->i_tlock);
		}
	} else {
		/*
		 * In case previous inode update was done asynchronously
		 * (IBDWRITE) and this inode update request wants guaranteed
		 * (synchronous) disk update, flush the inode.
		 */
		if (waitfor && (flag & IBDWRITE)) {
			blkflush(ip->i_dev,
			    (daddr_t)fsbtodb(fp, itod(fp, ip->i_number)));
			mutex_enter(&ip->i_tlock);
			ip->i_flag &= ~IBDWRITE;
			mutex_exit(&ip->i_tlock);
		}
	}
}

#define	SINGLE	0	/* index of single indirect block */
#define	DOUBLE	1	/* index of double indirect block */
#define	TRIPLE	2	/* index of triple indirect block */

/*
 * Release blocks associated with the inode ip and
 * stored in the indirect block bn.  Blocks are free'd
 * in LIFO order up to (but not including) lastbn.  If
 * level is greater than SINGLE, the block is an indirect
 * block and recursive calls to indirtrunc must be used to
 * cleanse other indirect blocks.
 *
 * N.B.: triple indirect blocks are untested.
 */
static long
indirtrunc(struct inode *ip, daddr_t bn, daddr_t lastbn, int level, int flags)
{
	int i;
	struct buf *bp, *copy;
	daddr32_t *bap;
	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
	struct fs *fs = ufsvfsp->vfs_fs;
	daddr_t nb, last;
	long factor;
	int blocksreleased = 0, nblocks;

	ASSERT(RW_WRITE_HELD(&ip->i_contents));
	/*
	 * Calculate index in current block of last
	 * block to be kept.  -1 indicates the entire
	 * block so we need not calculate the index.
	 */
	factor = 1;
	for (i = SINGLE; i < level; i++)
		factor *= NINDIR(fs);
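	/*
	 * For example, assuming 8K blocks and 4-byte disk addresses
	 * (NINDIR(fs) == 2048): factor is 1 at SINGLE, 2048 at DOUBLE
	 * and 2048 * 2048 at TRIPLE, i.e. the number of data blocks
	 * ultimately mapped by one entry at this level.
	 */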
	last = lastbn;
	if (lastbn > 0)
		last /= factor;
	nblocks = btodb(fs->fs_bsize);
	/*
	 * Get buffer of block pointers, zero those
	 * entries corresponding to blocks to be free'd,
	 * and update on disk copy first.
	 * *Unless* the root pointer has been synchronously
	 * written to disk.  If nothing points to this
	 * indirect block then don't bother zero'ing and
	 * writing it.
	 */
	bp = UFS_BREAD(ufsvfsp,
	    ip->i_dev, (daddr_t)fsbtodb(fs, bn), (int)fs->fs_bsize);
	if (bp->b_flags & B_ERROR) {
		brelse(bp);
		return (0);
	}
	bap = bp->b_un.b_daddr;
	if ((flags & I_CHEAP) == 0) {
		uint_t	zb;

		zb = (uint_t)((NINDIR(fs) - (last + 1)) * sizeof (daddr32_t));

		if (zb) {
			/*
			 * push any data into the log before we zero it
			 */
			if (bp->b_flags & B_DELWRI)
				TRANS_LOG(ufsvfsp, (caddr_t)bap,
				    ldbtob(bp->b_blkno), bp->b_bcount,
				    bp->b_un.b_addr, bp->b_bcount);
			copy = ngeteblk(fs->fs_bsize);
			bcopy((caddr_t)bap, (caddr_t)copy->b_un.b_daddr,
			    (uint_t)fs->fs_bsize);
			bzero((caddr_t)&bap[last + 1], zb);

			TRANS_BUF(ufsvfsp,
			    (caddr_t)&bap[last + 1] - (caddr_t)bap,
			    zb, bp, DT_ABZERO);

			UFS_BRWRITE(ufsvfsp, bp);
			bp = copy, bap = bp->b_un.b_daddr;
		}
	} else {
		/* make sure write retries are also cleared */
		bp->b_flags &= ~(B_DELWRI | B_RETRYWRI);
		bp->b_flags |= B_STALE | B_AGE;
	}

	/*
	 * Recursively free totally unused blocks.
	 */
	flags |= I_CHEAP;
	for (i = NINDIR(fs) - 1; i > last; i--) {
		nb = bap[i];
		if (nb == 0)
			continue;
		if (level > SINGLE) {
			blocksreleased +=
			    indirtrunc(ip, nb, (daddr_t)-1, level - 1, flags);
			free(ip, nb, (off_t)fs->fs_bsize, flags | I_IBLK);
		} else
			free(ip, nb, (off_t)fs->fs_bsize, flags);
		blocksreleased += nblocks;
	}
	flags &= ~I_CHEAP;

	/*
	 * Recursively free last partial block.
	 */
	if (level > SINGLE && lastbn >= 0) {
		last = lastbn % factor;
		nb = bap[i];
		if (nb != 0)
			blocksreleased +=
			    indirtrunc(ip, nb, last, level - 1, flags);
	}
	brelse(bp);
	return (blocksreleased);
}

/*
 * Truncate the inode ip to at most length size.
 * Free affected disk blocks -- the blocks of the
 * file are removed in reverse order.
 *
 * N.B.: triple indirect blocks are untested.
 */
static int i_genrand = 1234;
int
ufs_itrunc(struct inode *oip, uoff_t length, int flags, cred_t *cr)
{
	struct fs *fs = oip->i_fs;
	struct ufsvfs *ufsvfsp = oip->i_ufsvfs;
	struct inode *ip;
	daddr_t lastblock;
	off_t bsize;
	int boff;
	daddr_t bn, lastiblock[NIADDR];
	int level;
	long nblocks, blocksreleased = 0;
	int i;
	ushort_t mode;
	struct inode tip;
	int err;
	uoff_t maxoffset = (ufsvfsp->vfs_lfflags & UFS_LARGEFILES) ?
	    (UFS_MAXOFFSET_T) : (MAXOFF32_T);

	/*
	 * Shadow inodes do not need to hold the vfs_dqrwlock lock.  Most
	 * other uses need the reader lock.  opendq() holds the writer lock.
	 */
	ASSERT((oip->i_mode & IFMT) == IFSHAD ||
	    RW_LOCK_HELD(&ufsvfsp->vfs_dqrwlock));
	ASSERT(RW_WRITE_HELD(&oip->i_contents));
	/*
	 * We only allow truncation of regular files and directories
	 * to arbitrary lengths here.  In addition, we allow symbolic
	 * links to be truncated only to zero length.  Other inode
	 * types cannot have their length set here.  Disk blocks are
	 * being dealt with - especially device inodes where
	 * ip->i_ordev is actually being stored in ip->i_db[0]!
	 */
	TRANS_INODE(ufsvfsp, oip);
	mode = oip->i_mode & IFMT;
	if (flags & I_FREE) {
		i_genrand *= 16843009;  /* turns into shift and adds */
		i_genrand++;
		oip->i_gen += ((i_genrand + ddi_get_lbolt()) & 0xffff) + 1;
		oip->i_flag |= ICHG |IUPD;
		oip->i_seq++;
		if (length == oip->i_size)
			return (0);
		flags |= I_CHEAP;
	}
	if (mode == IFIFO)
		return (0);
	if (mode != IFREG && mode != IFDIR && mode != IFATTRDIR &&
	    !(mode == IFLNK && length == (offset_t)0) && mode != IFSHAD)
		return (EINVAL);
	if (length > maxoffset)
		return (EFBIG);
	if ((mode == IFDIR) || (mode == IFATTRDIR))
		flags |= I_DIR;
	if (mode == IFSHAD)
		flags |= I_SHAD;
	if (oip == ufsvfsp->vfs_qinod)
		flags |= I_QUOTA;
	if (length == oip->i_size) {
		/* update ctime and mtime to please POSIX tests */
		oip->i_flag |= ICHG |IUPD;
		oip->i_seq++;
		if (length == 0) {
			/* nothing to cache so clear the flag */
			oip->i_flag &= ~IFASTSYMLNK;
		}
		return (0);
	}
	/* wipe out fast symlink till next access */
	if (oip->i_flag & IFASTSYMLNK) {
		int j;

		ASSERT(ITOV(oip)->v_type == VLNK);

		oip->i_flag &= ~IFASTSYMLNK;

		for (j = 1; j < NDADDR; j++)
			oip->i_db[j] = 0;
		for (j = 0; j < NIADDR; j++)
			oip->i_ib[j] = 0;
	}

	boff = (int)blkoff(fs, length);

	if (length > oip->i_size) {
		/*
		 * Trunc up case.  BMAPALLOC will insure that the right blocks
		 * are allocated.  This includes extending the old frag to a
		 * full block (if needed) in addition to doing any work
		 * needed for allocating the last block.
		 */
		if (boff == 0)
			err = BMAPALLOC(oip, length - 1, (int)fs->fs_bsize, cr);
		else
			err = BMAPALLOC(oip, length - 1, boff, cr);

		if (err == 0) {
			/*
			 * Save old size and set inode's size now
			 * so that we don't cause too much of the
			 * file to be zero'd and pushed.
			 */
			uoff_t osize = oip->i_size;
			oip->i_size = length;
			/*
			 * Make sure we zero out the remaining bytes of
			 * the page in case a mmap scribbled on it. We
			 * can't prevent a mmap from writing beyond EOF
			 * on the last page of a file.
			 */
			if ((boff = (int)blkoff(fs, osize)) != 0) {
				bsize = (int)lblkno(fs, osize - 1) >= NDADDR ?
				    fs->fs_bsize : fragroundup(fs, boff);
				pvn_vpzero(ITOV(oip), osize,
				    (size_t)(bsize - boff));
			}
			oip->i_flag |= ICHG|IATTCHG;
			oip->i_seq++;
			ITIMES_NOLOCK(oip);
			/*
			 * MAXOFF32_T is old 2GB size limit. If
			 * this operation caused a large file to be
			 * created, turn on the superblock flag
			 * and update the superblock, if the flag
			 * is not already on.
			 */
			if ((length > (uoff_t)MAXOFF32_T) &&
			    !(fs->fs_flags & FSLARGEFILES)) {
				ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
				mutex_enter(&ufsvfsp->vfs_lock);
				fs->fs_flags |= FSLARGEFILES;
				ufs_sbwrite(ufsvfsp);
				mutex_exit(&ufsvfsp->vfs_lock);
			}
		}

		return (err);
	}

	/*
	 * Update the pages of the file.  If the file is not being
	 * truncated to a block boundary, the contents of the
	 * pages following the end of the file must be zero'ed
	 * in case it ever becomes accessible again because
	 * of subsequent file growth.
	 */
	if (boff == 0) {
		(void) pvn_vplist_dirty(ITOV(oip), length, ufs_putapage,
		    B_INVAL | B_TRUNC, CRED());
	} else {
		/*
		 * Make sure that the last block is properly allocated.
		 * We only really have to do this if the last block is
		 * actually allocated since ufs_bmap will now handle the case
		 * of a fragment which has no block allocated.  Just to
		 * be sure, we do it now independent of current allocation.
		 */
		err = BMAPALLOC(oip, length - 1, boff, cr);
		if (err)
			return (err);

		/*
		 * BMAPALLOC will call bmap_write which defers i_seq
		 * processing.  If the timestamps were changed, update
		 * i_seq before rdip drops i_contents or syncs the inode.
		 */
		if (oip->i_flag & (ICHG|IUPD))
			oip->i_seq++;

		/*
		 * BugId 4069932
		 * Make sure that the relevant partial page appears in
		 * the v_object's list, so that pvn_vpzero() will do its
		 * job.  Since doing this correctly requires everything
		 * in rdip() except for the uiomove(), it's easier and
		 * safer to do the uiomove() rather than duplicate the
		 * rest of rdip() here.
		 *
		 * To get here, we know that length indicates a byte
		 * that is not the first byte of a block.  (length - 1)
		 * is the last actual byte known to exist.  Deduction
		 * shows it is in the same block as byte (length).
		 * Thus, this rdip() invocation should always succeed
		 * except in the face of i/o errors, and give us the
		 * block we care about.
		 *
		 * rdip() makes the same locking assertions and
		 * assumptions as we do.  We do not acquire any locks
		 * before calling it, so we have not changed the locking
		 * situation.  Finally, there do not appear to be any
		 * paths whereby rdip() ends up invoking us again.
		 * Thus, infinite recursion is avoided.
		 */
		{
			uio_t uio;
			iovec_t iov[1];
			char buffer;

			uio.uio_iov = iov;
			uio.uio_iovcnt = 1;
			uio.uio_loffset = length - 1;
			uio.uio_resid = 1;
			uio.uio_segflg = UIO_SYSSPACE;
			uio.uio_extflg = UIO_COPY_CACHED;

			iov[0].iov_base = &buffer;
			iov[0].iov_len = 1;

			err = rdip(oip, &uio, UIO_READ, NULL);
			if (err)
				return (err);
		}

		bsize = (int)lblkno(fs, length - 1) >= NDADDR ?
		    fs->fs_bsize : fragroundup(fs, boff);
		pvn_vpzero(ITOV(oip), length, (size_t)(bsize - boff));
		/*
		 * Ensure full fs block is marked as dirty.
		 */
		(void) pvn_vplist_dirty(ITOV(oip), length + (bsize - boff),
		    ufs_putapage, B_INVAL | B_TRUNC, CRED());
	}

	/*
	 * Calculate index into inode's block list of
	 * last direct and indirect blocks (if any)
	 * which we want to keep.  Lastblock is -1 when
	 * the file is truncated to 0.
	 */
	lastblock = lblkno(fs, length + fs->fs_bsize - 1) - 1;
	lastiblock[SINGLE] = lastblock - NDADDR;
	lastiblock[DOUBLE] = lastiblock[SINGLE] - NINDIR(fs);
	lastiblock[TRIPLE] = lastiblock[DOUBLE] - NINDIR(fs) * NINDIR(fs);
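	/*
	 * For example, truncating to length 0 yields lastblock == -1
	 * and all three lastiblock entries negative, so every direct
	 * and indirect block is released below.  With NDADDR == 12, a
	 * last kept block index of 12 yields lastiblock[SINGLE] == 0,
	 * i.e. only the first entry of the single indirect block is
	 * kept.
	 */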
	nblocks = btodb(fs->fs_bsize);

	/*
	 * Update file and block pointers
	 * on disk before we start freeing blocks.
	 * If we crash before free'ing blocks below,
	 * the blocks will be returned to the free list.
	 * lastiblock values are also normalized to -1
	 * for calls to indirtrunc below.
	 */
	tip = *oip;			/* structure copy */
	ip = &tip;

	for (level = TRIPLE; level >= SINGLE; level--)
		if (lastiblock[level] < 0) {
			oip->i_ib[level] = 0;
			lastiblock[level] = -1;
		}
	for (i = NDADDR - 1; i > lastblock; i--) {
		oip->i_db[i] = 0;
		flags |= I_CHEAP;
	}
	oip->i_size = length;
	oip->i_flag |= ICHG|IUPD|IATTCHG;
	oip->i_seq++;
	if (!TRANS_ISTRANS(ufsvfsp))
		ufs_iupdat(oip, I_SYNC);	/* do sync inode update */

	/*
	 * Indirect blocks first.
	 */
	for (level = TRIPLE; level >= SINGLE; level--) {
		bn = ip->i_ib[level];
		if (bn != 0) {
			blocksreleased +=
			    indirtrunc(ip, bn, lastiblock[level], level, flags);
			if (lastiblock[level] < 0) {
				ip->i_ib[level] = 0;
				free(ip, bn, (off_t)fs->fs_bsize,
				    flags | I_IBLK);
				blocksreleased += nblocks;
			}
		}
		if (lastiblock[level] >= 0)
			goto done;
	}

	/*
	 * All whole direct blocks or frags.
	 */
	for (i = NDADDR - 1; i > lastblock; i--) {
		bn = ip->i_db[i];
		if (bn == 0)
			continue;
		ip->i_db[i] = 0;
		bsize = (off_t)blksize(fs, ip, i);
		free(ip, bn, bsize, flags);
		blocksreleased += btodb(bsize);
	}
	if (lastblock < 0)
		goto done;

	/*
	 * Finally, look for a change in size of the
	 * last direct block; release any frags.
	 */
	bn = ip->i_db[lastblock];
	if (bn != 0) {
		off_t oldspace, newspace;

		/*
		 * Calculate amount of space we're giving
		 * back as old block size minus new block size.
		 */
		oldspace = blksize(fs, ip, lastblock);
		UFS_SET_ISIZE(length, ip);
		newspace = blksize(fs, ip, lastblock);
		if (newspace == 0) {
			err = ufs_fault(ITOV(ip), "ufs_itrunc: newspace == 0");
			return (err);
		}
		if (oldspace - newspace > 0) {
			/*
			 * Block number of space to be free'd is
			 * the old block # plus the number of frags
			 * required for the storage we're keeping.
			 */
			bn += numfrags(fs, newspace);
			free(ip, bn, oldspace - newspace, flags);
			blocksreleased += btodb(oldspace - newspace);
		}
	}
done:
	/* BEGIN PARANOIA */
	for (level = SINGLE; level <= TRIPLE; level++)
		if (ip->i_ib[level] != oip->i_ib[level]) {
			err = ufs_fault(ITOV(ip), "ufs_itrunc: indirect block");
			return (err);
		}

	for (i = 0; i < NDADDR; i++)
		if (ip->i_db[i] != oip->i_db[i]) {
			err = ufs_fault(ITOV(ip), "ufs_itrunc: direct block");
			return (err);
		}
	/* END PARANOIA */
	oip->i_blocks -= blocksreleased;

	if (oip->i_blocks < 0) {		/* sanity */
		cmn_err(CE_NOTE,
		    "ufs_itrunc: %s/%d new size = %lld, blocks = %d\n",
		    fs->fs_fsmnt, (int)oip->i_number, oip->i_size,
		    (int)oip->i_blocks);
		oip->i_blocks = 0;
	}
	oip->i_flag |= ICHG|IATTCHG;
	oip->i_seq++;
	/* blocksreleased is >= zero, so this can not fail */
	(void) chkdq(oip, -blocksreleased, 0, cr, (char **)NULL, NULL);
	return (0);
}

/*
 * Check mode permission on inode.  Mode is READ, WRITE or EXEC.
 * In the case of WRITE, the read-only status of the file system
 * is checked.  Depending on the calling user, the appropriate
 * mode bits are selected; privileges to override missing permission
 * bits are checked through secpolicy_vnode_access().
 * The i_contents lock must be held as reader here to prevent racing with
 * the acl subsystem removing/setting/changing acls on this inode.
 * The caller is responsible for indicating whether or not the i_contents
 * lock needs to be acquired here or if already held.
 */
int
ufs_iaccess(struct inode *ip, int mode, struct cred *cr, int dolock)
{
	int shift = 0;
	int ret = 0;

	if (dolock)
		rw_enter(&ip->i_contents, RW_READER);
	ASSERT(RW_LOCK_HELD(&ip->i_contents));

	if (mode & IWRITE) {
		/*
		 * Disallow write attempts on read-only
		 * file systems, unless the file is a block
		 * or character device or a FIFO.
		 */
		if (ip->i_fs->fs_ronly != 0) {
			if ((ip->i_mode & IFMT) != IFCHR &&
			    (ip->i_mode & IFMT) != IFBLK &&
			    (ip->i_mode & IFMT) != IFIFO) {
				ret = EROFS;
				goto out;
			}
		}
	}

	/*
	 * If there is an acl, check the acl and return.
	 */
	if (ip->i_ufs_acl && ip->i_ufs_acl->aowner) {
		ret = ufs_acl_access(ip, mode, cr);
		goto out;
	}

	/*
	 * Access check is based on only one of owner, group, public.
	 * If not owner, then check group.
	 * If not a member of the group, then check public access.
	 */
	if (crgetuid(cr) != ip->i_uid) {
		shift += 3;
		if (!groupmember((uid_t)ip->i_gid, cr))
			shift += 3;
	}
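
	/*
	 * shift is now 0 for the owner, 3 for another member of the
	 * group and 6 for everyone else, so i_mode << shift moves the
	 * applicable permission triplet into the owner (0700) position
	 * in which the mode bits being tested are expressed.
	 */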
	/* test missing privilege bits */
	ret = secpolicy_vnode_access2(cr, ITOV(ip), ip->i_uid,
	    ip->i_mode << shift, mode);
out:
	if (dolock)
		rw_exit(&ip->i_contents);
	return (ret);
}

/*
 * if necessary, remove an inode from the free list
 *	i_contents is held except at unmount
 *
 * Return 1 if the inode is taken off of the ufs_idle_q,
 * and the caller is expected to call VN_RELE.
 *
 * Return 0 otherwise.
 */
int
ufs_rmidle(struct inode *ip)
{
	int rval = 0;

	mutex_enter(&ip->i_tlock);
	if ((ip->i_flag & IREF) == 0) {
		mutex_enter(&ufs_idle_q.uq_mutex);
		ip->i_freef->i_freeb = ip->i_freeb;
		ip->i_freeb->i_freef = ip->i_freef;
		ip->i_freef = ip;
		ip->i_freeb = ip;
		ip->i_flag |= IREF;
		ufs_idle_q.uq_ne--;
		if (ip->i_flag & IJUNKIQ) {
			ufs_njunk_iq--;
			ip->i_flag &= ~IJUNKIQ;
		} else {
			ufs_nuseful_iq--;
		}
		mutex_exit(&ufs_idle_q.uq_mutex);
		rval = 1;
	}
	mutex_exit(&ip->i_tlock);
	return (rval);
}

/*
 * scan the hash of inodes and call func with the inode locked
 */
int
ufs_scan_inodes(int rwtry, int (*func)(struct inode *, void *), void *arg,
    struct ufsvfs *ufsvfsp)
{
	struct inode		*ip;		/* current inode */
	struct inode		*lip = NULL;	/* last/previous inode */
	union ihead		*ih;		/* current hash chain */
	int			error, i;
	int			saverror = 0;
	int			lip_held;	/* lip needs a VN_RELE() */

	/*
	 * If ufsvfsp is NULL, then our caller should be holding
	 * ufs_scan_lock to avoid conflicts between ufs_unmount() and
	 * ufs_update().  Otherwise, to avoid false-positives in
	 * ufs_unmount()'s v_count-based EBUSY check, we only hold
	 * those inodes that are in the file system our caller cares
	 * about.
	 *
	 * We know that ip is a valid inode in the hash chain (and thus
	 * we can trust i_ufsvfs) because the inode we chained from
	 * (lip) is still in the hash chain.  This is true because either:
	 *
	 * 1. We did not drop the hash chain lock since the last
	 *    iteration (because we were not interested in the last inode),
	 * or
	 * 2. We maintained a hold on the last inode while we
	 *    were processing it, so it could not be removed
	 *    from the hash chain.
	 *
	 * The whole reason we're dropping and re-grabbing the chain
	 * lock on every inode is so that we don't present a major
	 * choke point on throughput, particularly when we've been
	 * called on behalf of fsflush.
	 */
	for (i = 0, ih = ihead; i < inohsz; i++, ih++) {
		mutex_enter(&ih_lock[i]);
		for (ip = ih->ih_chain[0], lip_held = 0;
		    ip != (struct inode *)ih;
		    ip = lip->i_forw) {

			ins.in_scan.value.ul++;

			/*
			 * Undo the previous iteration's VN_HOLD(), but
			 * only if one was done.
			 */
			if (lip_held)
				VN_RELE(ITOV(lip));

			lip = ip;
			if (ufsvfsp != NULL && ip->i_ufsvfs != ufsvfsp) {
				/*
				 * We're not processing all inodes, and
				 * this inode is not in the filesystem of
				 * interest, so skip it.  No need to do a
				 * VN_HOLD() since we're not dropping the
				 * hash chain lock until after we've
				 * done the i_forw traversal above.
				 */
				lip_held = 0;
				continue;
			}
			VN_HOLD(ITOV(ip));
			lip_held = 1;
			mutex_exit(&ih_lock[i]);

			/*
			 * Acquire the contents lock as writer to make
			 * sure that the inode has been initialized in
			 * the cache or removed from the idle list by
			 * ufs_iget().  This works because ufs_iget()
			 * acquires the contents lock before putting
			 * the inode into the cache.  If we can lock
			 * it, then ufs_iget() is done with it.
			 */
			if (rwtry) {
				if (!rw_tryenter(&ip->i_contents, RW_WRITER)) {
					mutex_enter(&ih_lock[i]);
					continue;
				}
			} else {
				rw_enter(&ip->i_contents, RW_WRITER);
			}

			rw_exit(&ip->i_contents);

			/*
			 * ISTALE means the inode couldn't be read
			 *
			 * We don't have to hold the i_contents lock
			 * for this check for a couple of
			 * reasons.  First, if ISTALE is set then the
			 * flag cannot be cleared until the inode is
			 * removed from the cache and that cannot
			 * happen until after we VN_RELE() it.
			 * Second, if ISTALE is not set, then the
			 * inode is in the cache and does not need to
			 * be read from disk so ISTALE cannot be set
			 * while we are not looking.
			 */
			if ((ip->i_flag & ISTALE) == 0) {
				if ((error = (*func)(ip, arg)) != 0)
					saverror = error;
			}

			mutex_enter(&ih_lock[i]);
		}
		if (lip_held)
			VN_RELE(ITOV(lip));
		mutex_exit(&ih_lock[i]);
	}
	return (saverror);
}

/*
 * Mark inode with the current time, plus a unique increment.
 *
 * Since we only keep 32-bit time on disk, if UFS is still alive
 * beyond 2038, filesystem times will simply stick at the last
 * possible second of 32-bit time.  Not ideal, but probably better
 * than going into the remote past, or confusing applications with
 * negative time.
 */
void
ufs_imark(struct inode *ip)
{
	timestruc_t now;
	int32_t usec, nsec;

	/*
	 * The update of i_seq may have been deferred, increase i_seq here
	 * to make sure it is in sync with the timestamps.
	 */
	if (ip->i_flag & ISEQ) {
		ASSERT(ip->i_flag & (IUPD|ICHG));
		ip->i_seq++;
		ip->i_flag &= ~ISEQ;
	}

	gethrestime(&now);

	/*
	 * Fast algorithm to convert nsec to usec -- see hrt2ts()
	 * in kernel/os/timers.c for a full description.
	 */
	nsec = now.tv_nsec;
	usec = nsec + (nsec >> 2);
	usec = nsec + (usec >> 1);
	usec = nsec + (usec >> 2);
	usec = nsec + (usec >> 4);
	usec = nsec - (usec >> 3);
	usec = nsec + (usec >> 2);
	usec = nsec + (usec >> 3);
	usec = nsec + (usec >> 4);
	usec = nsec + (usec >> 1);
	usec = nsec + (usec >> 6);
	usec = usec >> 10;
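
	/*
	 * The cascade above approximates usec = nsec / 1000 without a
	 * division: the shift-and-add steps build roughly nsec * 1.024
	 * (i.e. nsec * 1024 / 1000) and the final >> 10 divides by 1024.
	 */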

	mutex_enter(&ufs_iuniqtime_lock);
	if (now.tv_sec > (time_t)iuniqtime.tv_sec ||
	    usec > iuniqtime.tv_usec) {
		if (now.tv_sec < TIME32_MAX) {
			iuniqtime.tv_sec = (time32_t)now.tv_sec;
			iuniqtime.tv_usec = usec;
		}
	} else {
		if (iuniqtime.tv_sec < TIME32_MAX) {
			iuniqtime.tv_usec++;
			/* Check for usec overflow */
			if (iuniqtime.tv_usec >= MICROSEC) {
				iuniqtime.tv_sec++;
				iuniqtime.tv_usec = 0;
			}
		}
	}

	if ((ip->i_flag & IACC) && !(ip->i_ufsvfs->vfs_noatime)) {
		ip->i_atime = iuniqtime;
	}
	if (ip->i_flag & IUPD) {
		ip->i_mtime = iuniqtime;
		ip->i_flag |= IMODTIME;
	}
	if (ip->i_flag & ICHG) {
		ip->i_diroff = 0;
		ip->i_ctime = iuniqtime;
	}
	mutex_exit(&ufs_iuniqtime_lock);
}

/*
 * Update timestamps in inode.
 */
void
ufs_itimes_nolock(struct inode *ip)
{
	/*
	 * if noatime is set and the inode access time is the only field that
	 * must be changed, exit immediately.
	 */
	if (((ip->i_flag & (IUPD|IACC|ICHG)) == IACC) &&
	    (ip->i_ufsvfs->vfs_noatime)) {
		return;
	}

	if (ip->i_flag & (IUPD|IACC|ICHG)) {
		if (ip->i_flag & ICHG)
			ip->i_flag |= IMOD;
		else
			ip->i_flag |= IMODACC;
		ufs_imark(ip);
		ip->i_flag &= ~(IACC|IUPD|ICHG);
	}
}