/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 1983, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2016, 2017 by Delphix. All rights reserved.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/uio.h>
#include <sys/bitmap.h>
#include <sys/signal.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/vfs.h>
#include <sys/stat.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/dnlc.h>
#include <sys/mode.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/acl.h>
#include <sys/var.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_fs.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_acl.h>
#include <sys/fs/ufs_bio.h>
#include <sys/fs/ufs_quota.h>
#include <sys/fs/ufs_log.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/pvn.h>
#include <vm/seg.h>
#include <sys/swap.h>
#include <sys/cpuvar.h>
#include <sys/sysmacros.h>
#include <sys/errno.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/fs_subr.h>
#include <sys/policy.h>

struct kmem_cache *inode_cache;	/* cache of free inodes */

/* UFS Inode Cache Stats -- Not protected */
struct	instats ins = {
	{ "size",		KSTAT_DATA_ULONG },
	{ "maxsize",		KSTAT_DATA_ULONG },
	{ "hits",		KSTAT_DATA_ULONG },
	{ "misses",		KSTAT_DATA_ULONG },
	{ "kmem allocs",	KSTAT_DATA_ULONG },
	{ "kmem frees",		KSTAT_DATA_ULONG },
	{ "maxsize reached",	KSTAT_DATA_ULONG },
	{ "puts at frontlist",	KSTAT_DATA_ULONG },
	{ "puts at backlist",	KSTAT_DATA_ULONG },
	{ "queues to free",	KSTAT_DATA_ULONG },
	{ "scans",		KSTAT_DATA_ULONG },
	{ "thread idles",	KSTAT_DATA_ULONG },
	{ "lookup idles",	KSTAT_DATA_ULONG },
	{ "vget idles",		KSTAT_DATA_ULONG },
	{ "cache allocs",	KSTAT_DATA_ULONG },
	{ "cache frees",	KSTAT_DATA_ULONG },
	{ "pushes at close",	KSTAT_DATA_ULONG }
};

/* kstat data */
static kstat_t		*ufs_inode_kstat = NULL;

union ihead *ihead;	/* inode LRU cache, Chris Maltby */
kmutex_t *ih_lock;	/* protect inode cache hash table */
static int ino_hashlen = 4;	/* desired average hash chain length */
int inohsz;		/* number of buckets in the hash table */

kmutex_t	ufs_scan_lock;	/* stop racing multiple ufs_scan_inodes() */
kmutex_t	ufs_iuniqtime_lock; /* protect iuniqtime */
kmutex_t	ufsvfs_mutex;
struct ufsvfs	*oldufsvfslist, *ufsvfslist;

/*
 * time to wait after ufsvfsp->vfs_iotstamp before declaring that no
 * I/Os are going on.
 */
clock_t	ufs_iowait;

/*
 * the threads that process idle inodes and free (deleted) inodes
 * have high water marks that are set in ufsinit().
 * These values can be no less than the minimum shown below.
 */
int	ufs_idle_max;	/* # of allowable idle inodes */
ulong_t	ufs_inode_max;	/* hard limit of allowable idle inodes */
#define	UFS_IDLE_MAX	(16)	/* min # of allowable idle inodes */

/*
 * Tunables for ufs write throttling.
 * These are validated in ufs_iinit() since improper settings
 * can lead to filesystem hangs.
 */
#define	UFS_HW_DEFAULT	(16 * 1024 * 1024)
#define	UFS_LW_DEFAULT	(8 * 1024 * 1024)
int	ufs_HW = UFS_HW_DEFAULT;
int	ufs_LW = UFS_LW_DEFAULT;
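
/*
 * A sketch of how these watermarks are used elsewhere in UFS (see the
 * write paths for the authoritative logic): a file with more than
 * ufs_HW bytes of outstanding writes (i_writes) throttles new writers
 * until the backlog drains below ufs_LW, at which point waiters on
 * i_wrcv are signalled.  ufs_iinit() below only validates the pair.
 */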

static void ihinit(void);
extern int hash2ints(int, int);

static int ufs_iget_internal(struct vfs *, ino_t, struct inode **,
	struct cred *, int);

/* ARGSUSED */
static int
ufs_inode_kstat_update(kstat_t *ksp, int rw)
{
	if (rw == KSTAT_WRITE)
		return (EACCES);

	ins.in_malloc.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
	    "slab_alloc");
	ins.in_mfree.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
	    "slab_free");
	ins.in_kcalloc.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
	    "alloc");
	ins.in_kcfree.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
	    "free");
	ins.in_size.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
	    "buf_inuse");
	ins.in_maxreached.value.ul = (ulong_t)kmem_cache_stat(inode_cache,
	    "buf_max");
	ins.in_misses.value.ul = ins.in_kcalloc.value.ul;

	return (0);
}

void
ufs_iinit(void)
{
	/*
	 * Validate that ufs_HW > ufs_LW.
	 * The default values for these two tunables have been increased.
	 * There is now a range of values for ufs_HW that used to be
	 * legal on previous Solaris versions but no longer is now.
	 * Upgrading a machine which has an /etc/system setting for ufs_HW
	 * from that range can lead to filesystem hangs unless the values
	 * are checked here.
	 */
	if (ufs_HW <= ufs_LW) {
		cmn_err(CE_WARN,
		    "ufs_HW (%d) <= ufs_LW (%d). Check /etc/system.",
		    ufs_HW, ufs_LW);
		ufs_LW = UFS_LW_DEFAULT;
		ufs_HW = UFS_HW_DEFAULT;
		cmn_err(CE_CONT, "using defaults, ufs_HW = %d, ufs_LW = %d\n",
		    ufs_HW, ufs_LW);
	}

	/*
	 * Adjust the tunable `ufs_ninode' to a reasonable value
	 */
	if (ufs_ninode <= 0)
		ufs_ninode = ncsize;
	if (ufs_inode_max == 0)
		ufs_inode_max =
		    (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct inode));
	if (ufs_ninode > ufs_inode_max || (ufs_ninode == 0 && ncsize == 0)) {
		cmn_err(CE_NOTE, "setting ufs_ninode to max value of %ld",
		    ufs_inode_max);
		ufs_ninode = ufs_inode_max;
	}
	/*
	 * Wait till third call of ufs_update to declare that no I/Os are
	 * going on. This allows deferred access times to be flushed to disk.
	 */
	ufs_iowait = v.v_autoup * hz * 2;
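
	/*
	 * Worked example, assuming the common defaults of v_autoup = 30
	 * seconds and hz = 100: ufs_iowait = 30 * 100 * 2 = 6000 ticks,
	 * i.e. two full autoup intervals, or 60 seconds.
	 */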

	/*
	 * idle thread runs when 25% of ufs_ninode entries are on the queue
	 */
	if (ufs_idle_max == 0)
		ufs_idle_max = ufs_ninode >> 2;
	if (ufs_idle_max < UFS_IDLE_MAX)
		ufs_idle_max = UFS_IDLE_MAX;
	if (ufs_idle_max > ufs_ninode)
		ufs_idle_max = ufs_ninode;
	/*
	 * This is really a misnomer, it is ufs_queue_init
	 */
	ufs_thread_init(&ufs_idle_q, ufs_idle_max);
	ufs_thread_start(&ufs_idle_q, ufs_thread_idle, NULL);

	/*
	 * global hlock thread
	 */
	ufs_thread_init(&ufs_hlock, 1);
	ufs_thread_start(&ufs_hlock, ufs_thread_hlock, NULL);

	ihinit();
	qtinit();
	ins.in_maxsize.value.ul = ufs_ninode;
	if ((ufs_inode_kstat = kstat_create("ufs", 0, "inode_cache", "ufs",
	    KSTAT_TYPE_NAMED, sizeof (ins) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL)) != NULL) {
		ufs_inode_kstat->ks_data = (void *)&ins;
		ufs_inode_kstat->ks_update = ufs_inode_kstat_update;
		kstat_install(ufs_inode_kstat);
	}
	ufsfx_init();		/* fix-on-panic initialization */
	si_cache_init();
	ufs_directio_init();
	lufs_init();
	mutex_init(&ufs_iuniqtime_lock, NULL, MUTEX_DEFAULT, NULL);
}

/* ARGSUSED */
static int
ufs_inode_cache_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct inode *ip = buf;
	struct vnode *vp;

	vp = ip->i_vnode = vn_alloc(kmflags);
	if (vp == NULL) {
		return (-1);
	}
	vn_setops(vp, &ufs_vnodeops);
	vp->v_data = ip;

	rw_init(&ip->i_rwlock, NULL, RW_DEFAULT, NULL);
	rw_init(&ip->i_contents, NULL, RW_DEFAULT, NULL);
	mutex_init(&ip->i_tlock, NULL, MUTEX_DEFAULT, NULL);
	dnlc_dir_init(&ip->i_danchor);

	cv_init(&ip->i_wrcv, NULL, CV_DRIVER, NULL);

	return (0);
}

/* ARGSUSED */
static void
ufs_inode_cache_destructor(void *buf, void *cdrarg)
{
	struct inode *ip = buf;
	struct vnode *vp;

	vp = ITOV(ip);

	rw_destroy(&ip->i_rwlock);
	rw_destroy(&ip->i_contents);
	mutex_destroy(&ip->i_tlock);
	if (vp->v_type == VDIR) {
		dnlc_dir_fini(&ip->i_danchor);
	}

	cv_destroy(&ip->i_wrcv);

	vn_free(vp);
}

/*
 * Initialize hash links for inodes
 * and build inode free list.
 */
void
ihinit(void)
{
	int i;
	union	ihead *ih = ihead;

	mutex_init(&ufs_scan_lock, NULL, MUTEX_DEFAULT, NULL);

	inohsz = 1 << highbit(ufs_ninode / ino_hashlen);
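	/*
	 * inohsz is rounded up to a power of two, presumably so that
	 * INOHASH() can reduce a hash value to a bucket index with a
	 * mask of the form (hash & (inohsz - 1)) instead of a modulo.
	 */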
	ihead = kmem_zalloc(inohsz * sizeof (union ihead), KM_SLEEP);
	ih_lock = kmem_zalloc(inohsz * sizeof (kmutex_t), KM_SLEEP);

	for (i = 0, ih = ihead; i < inohsz; i++, ih++) {
		ih->ih_head[0] = ih;
		ih->ih_head[1] = ih;
		mutex_init(&ih_lock[i], NULL, MUTEX_DEFAULT, NULL);
	}
	inode_cache = kmem_cache_create("ufs_inode_cache",
	    sizeof (struct inode), 0, ufs_inode_cache_constructor,
	    ufs_inode_cache_destructor, ufs_inode_cache_reclaim,
	    NULL, NULL, 0);
}

/*
 * Free an inode structure
 */
void
ufs_free_inode(struct inode *ip)
{
	vn_invalid(ITOV(ip));
	kmem_cache_free(inode_cache, ip);
}

/*
 * Allocate an inode structure
 */
struct inode *
ufs_alloc_inode(ufsvfs_t *ufsvfsp, ino_t ino)
{
	struct inode *ip;
	vnode_t *vp;

	ip = kmem_cache_alloc(inode_cache, KM_SLEEP);
	/*
	 * at this point we have a newly allocated inode
	 */
	ip->i_freef = ip;
	ip->i_freeb = ip;
	ip->i_flag = IREF;
	ip->i_seq = 0xFF;	/* Unique initial value */
	ip->i_dev = ufsvfsp->vfs_dev;
	ip->i_ufsvfs = ufsvfsp;
	ip->i_devvp = ufsvfsp->vfs_devvp;
	ip->i_number = ino;
	ip->i_diroff = 0;
	ip->i_nextr = 0;
	ip->i_map = NULL;
	ip->i_rdev = 0;
	ip->i_writes = 0;
	ip->i_mode = 0;
	ip->i_delaylen = 0;
	ip->i_delayoff = 0;
	ip->i_nextrio = 0;
	ip->i_ufs_acl = NULL;
	ip->i_cflags = 0;
	ip->i_mapcnt = 0;
	ip->i_dquot = NULL;
	ip->i_cachedir = CD_ENABLED;
	ip->i_writer = NULL;

	/*
	 * the vnode for this inode was allocated by the constructor
	 */
	vp = ITOV(ip);
	vn_reinit(vp);
	if (ino == (ino_t)UFSROOTINO)
		vp->v_flag = VROOT;
	vp->v_vfsp = ufsvfsp->vfs_vfs;
	vn_exists(vp);
	return (ip);
}

/*
 * Look up an inode by device, inumber.  If it is in core (in the
 * inode structure), honor the locking protocol.  If it is not in
 * core, read it in from the specified device after freeing any pages.
 * In all cases, a pointer to a VN_HELD inode structure is returned.
 */
int
ufs_iget(struct vfs *vfsp, ino_t ino, struct inode **ipp, struct cred *cr)
{
	return (ufs_iget_internal(vfsp, ino, ipp, cr, 0));
}

/*
 * A version of ufs_iget which returns only allocated, linked inodes.
 * This is appropriate for any callers who do not expect a free inode.
 */
int
ufs_iget_alloced(struct vfs *vfsp, ino_t ino, struct inode **ipp,
    struct cred *cr)
{
	return (ufs_iget_internal(vfsp, ino, ipp, cr, 1));
}

/*
 * Set vnode attributes based on v_type, this should be called whenever
 * an inode's i_mode is changed.
 */
void
ufs_reset_vnode(vnode_t *vp)
{
	/*
	 * an old DBE hack
	 */
	if ((VTOI(vp)->i_mode & (ISVTX | IEXEC | IFDIR)) == ISVTX)
		vp->v_flag |= VSWAPLIKE;
	else
		vp->v_flag &= ~VSWAPLIKE;

	/*
	 * if not swap like and it's just a regular file, we want
	 * to maintain the vnode's pages sorted by clean/modified
	 * for faster sync'ing to disk
	 */
	if (vp->v_type == VREG)
		vp->v_flag |= VMODSORT;
	else
		vp->v_flag &= ~VMODSORT;

	/*
	 * Is this an attribute hidden dir?
	 */
	if ((VTOI(vp)->i_mode & IFMT) == IFATTRDIR)
		vp->v_flag |= V_XATTRDIR;
	else
		vp->v_flag &= ~V_XATTRDIR;
}

/*
 * Shared implementation of ufs_iget and ufs_iget_alloced.  The 'validate'
 * flag is used to distinguish the two; when true, we validate that the inode
 * being retrieved looks like a linked and allocated inode.
 */
/* ARGSUSED */
static int
ufs_iget_internal(struct vfs *vfsp, ino_t ino, struct inode **ipp,
    struct cred *cr, int validate)
{
	struct inode *ip, *sp;
	union ihead *ih;
	kmutex_t *ihm;
	struct buf *bp;
	struct dinode *dp;
	struct vnode *vp;
	extern vfs_t EIO_vfs;
	int error;
	int ftype;	/* XXX - Remove later on */
	dev_t vfs_dev;
	struct ufsvfs *ufsvfsp;
	struct fs *fs;
	int hno;
	daddr_t bno;
	ulong_t ioff;

	CPU_STATS_ADD_K(sys, ufsiget, 1);

	/*
	 * Lookup inode in cache.
	 */
	vfs_dev = vfsp->vfs_dev;
	hno = INOHASH(ino);
	ih = &ihead[hno];
	ihm = &ih_lock[hno];

again:
	mutex_enter(ihm);
	for (ip = ih->ih_chain[0]; ip != (struct inode *)ih; ip = ip->i_forw) {
		if (ino != ip->i_number || vfs_dev != ip->i_dev ||
		    (ip->i_flag & ISTALE))
			continue;

		/*
		 * Found the interesting inode; hold it and drop the cache lock
		 */
		vp = ITOV(ip);	/* for locknest */
		VN_HOLD(vp);
		mutex_exit(ihm);
		rw_enter(&ip->i_contents, RW_READER);

		/*
		 * if necessary, remove from idle list
		 */
		if ((ip->i_flag & IREF) == 0) {
			if (ufs_rmidle(ip))
				VN_RELE(vp);
		}

		/*
		 * Could the inode be read from disk?
		 */
		if (ip->i_flag & ISTALE) {
			rw_exit(&ip->i_contents);
			VN_RELE(vp);
			goto again;
		}

		ins.in_hits.value.ul++;
		*ipp = ip;

		/*
		 * Reset the vnode's attribute flags
		 */
		mutex_enter(&vp->v_lock);
		ufs_reset_vnode(vp);
		mutex_exit(&vp->v_lock);

		rw_exit(&ip->i_contents);

		return (0);
	}
	mutex_exit(ihm);

	/*
	 * Inode was not in cache.
	 *
	 * Allocate a new entry
	 */
	ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
	fs = ufsvfsp->vfs_fs;

	ip = ufs_alloc_inode(ufsvfsp, ino);
	vp = ITOV(ip);

	bno = fsbtodb(fs, itod(fs, ino));
	ioff = (sizeof (struct dinode)) * (itoo(fs, ino));
	ip->i_doff = (offset_t)ioff + ldbtob(bno);

	/*
	 * put a place holder in the cache (if not already there)
	 */
	mutex_enter(ihm);
	for (sp = ih->ih_chain[0]; sp != (struct inode *)ih; sp = sp->i_forw)
		if (ino == sp->i_number && vfs_dev == sp->i_dev &&
		    ((sp->i_flag & ISTALE) == 0)) {
			mutex_exit(ihm);
			ufs_free_inode(ip);
			goto again;
		}

	/*
	 * It would be nice to ASSERT(RW_READ_HELD(&ufsvfsp->vfs_dqrwlock))
	 * here, but if we do, then shadow inode allocations panic the
	 * system.  We don't have to hold vfs_dqrwlock for shadow inodes
	 * and the ufs_iget() parameters don't tell us what we are getting
	 * so we have no way of knowing this is a ufs_iget() call from
	 * a ufs_ialloc() call for a shadow inode.
	 */
	rw_enter(&ip->i_contents, RW_WRITER);
	insque(ip, ih);
	mutex_exit(ihm);

	/*
	 * read the dinode
	 */
	bp = UFS_BREAD(ufsvfsp, ip->i_dev, bno, (int)fs->fs_bsize);

	/*
	 * Check I/O errors
	 */
	error = ((bp->b_flags & B_ERROR) ? geterror(bp) : 0);
	if (error) {
		brelse(bp);
		ip->i_flag |= ISTALE;	/* in case someone is looking it up */
		rw_exit(&ip->i_contents);
		vp->v_vfsp = &EIO_vfs;
		VN_RELE(vp);
		return (error);
	}

	/*
	 * initialize the inode's dinode
	 */
	dp = (struct dinode *)(ioff + bp->b_un.b_addr);
	ip->i_ic = dp->di_ic;	/* structure assignment */
	brelse(bp);

	/*
	 * Maintain compatibility with Solaris 1.x UFS
	 */
	if (ip->i_suid != UID_LONG)
		ip->i_uid = ip->i_suid;
	if (ip->i_sgid != GID_LONG)
		ip->i_gid = ip->i_sgid;

	ftype = ip->i_mode & IFMT;
	if (ftype == IFBLK || ftype == IFCHR) {
		dev_t dv;
		uint_t top16 = ip->i_ordev & 0xffff0000u;

		if (top16 == 0 || top16 == 0xffff0000u)
			dv = expdev(ip->i_ordev);
		else
			dv = expldev(ip->i_ordev);
		vp->v_rdev = ip->i_rdev = dv;
	}
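
	/*
	 * A reading of the two expansion paths above (a sketch, not
	 * authoritative): an on-disk i_ordev whose top 16 bits are all
	 * zeros or all ones is treated as an old-format 16-bit device
	 * number and widened with expdev(); any other value is taken
	 * to be a 32-bit dev32_t and widened with expldev().
	 */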

	/*
	 * if our caller only expects allocated inodes, verify that
	 * this inode looks good; throw it out if it's bad.
	 */
	if (validate) {
		if ((ftype == 0) || (ip->i_nlink <= 0)) {
			ip->i_flag |= ISTALE;
			rw_exit(&ip->i_contents);
			vp->v_vfsp = &EIO_vfs;
			VN_RELE(vp);
			cmn_err(CE_NOTE,
			    "%s: unexpected free inode %d, run fsck(1M)%s",
			    fs->fs_fsmnt, (int)ino,
			    (TRANS_ISTRANS(ufsvfsp) ? " -o f" : ""));
			return (EIO);
		}
	}

	/*
	 * Finish initializing the vnode, special handling for shadow inodes
	 * because IFTOVT() will produce a v_type of VNON which is not what we
	 * want, set v_type to VREG explicitly in that case.
	 */
	if (ftype == IFSHAD) {
		vp->v_type = VREG;
	} else {
		vp->v_type = IFTOVT((mode_t)ip->i_mode);
	}

	ufs_reset_vnode(vp);

	/*
	 * read the shadow
	 */
	if (ftype != 0 && ip->i_shadow != 0) {
		if ((error = ufs_si_load(ip, cr)) != 0) {
			ip->i_flag |= ISTALE;
			ip->i_ufs_acl = NULL;
			rw_exit(&ip->i_contents);
			vp->v_vfsp = &EIO_vfs;
			VN_RELE(vp);
			return (error);
		}
	}

	/*
	 * Only attach quota information if the inode has a type and if
	 * that type is not a shadow inode.
	 */
	if (ip->i_mode && ((ip->i_mode & IFMT) != IFSHAD) &&
	    ((ip->i_mode & IFMT) != IFATTRDIR)) {
		ip->i_dquot = getinoquota(ip);
	}
	TRANS_MATA_IGET(ufsvfsp, ip);
	*ipp = ip;
	rw_exit(&ip->i_contents);

	return (0);
}

/*
 * Vnode is no longer referenced, write the inode out
 * and if necessary, truncate and deallocate the file.
 */
void
ufs_iinactive(struct inode *ip)
{
	int		front;
	struct inode	*iq;
	struct inode	*hip;
	struct ufs_q	*uq;
	struct vnode	*vp = ITOV(ip);
	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;
	struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;

	/*
	 * Because the vnode type might have been changed,
	 * the dnlc_dir_purge must be called unconditionally.
	 */
	dnlc_dir_purge(&ip->i_danchor);

	/*
	 * Get exclusive access to inode data.
	 */
	rw_enter(&ip->i_contents, RW_WRITER);
	ASSERT(ip->i_flag & IREF);

	/*
	 * Make sure no one reclaimed the inode before we put it on
	 * the freelist or destroy it. We keep our 'hold' on the vnode
	 * from vn_rele until we are ready to do something with the inode.
	 *
	 * Pageout may put a VN_HOLD/VN_RELE at anytime during this
	 * operation via an async putpage, so we must make sure
	 * we don't free/destroy the inode more than once. ufs_iget
	 * may also put a VN_HOLD on the inode before it grabs
	 * the i_contents lock. This is done so we don't free
	 * an inode that a thread is waiting on.
	 */
	mutex_enter(&vp->v_lock);

	if (vp->v_count > 1) {
		VN_RELE_LOCKED(vp);
		mutex_exit(&vp->v_lock);
		rw_exit(&ip->i_contents);
		return;
	}
	mutex_exit(&vp->v_lock);

	/*
	 * For umount case: if ufsvfs ptr is NULL, the inode is unhashed
	 * and clean.  It can be safely destroyed (cyf).
	 */
	if (ip->i_ufsvfs == NULL) {
		rw_exit(&ip->i_contents);
		ufs_si_del(ip);
		ASSERT((vp->v_type == VCHR) || !vn_has_cached_data(vp));
		ufs_free_inode(ip);
		return;
	}

	/*
	 * queue idle inode to appropriate thread. Will check v_count == 1
	 * prior to putting this on the appropriate queue.
	 * Stale inodes will be unhashed and freed by the ufs idle thread
	 * in ufs_idle_free()
	 */
	front = 1;
	if ((ip->i_flag & ISTALE) == 0 && ip->i_fs->fs_ronly == 0 &&
	    ip->i_mode && ip->i_nlink <= 0) {
		/*
		 * Mark the i_flag to indicate that inode is being deleted.
		 * This flag will be cleared when the deletion is complete.
		 * This prevents nfs from sneaking in via ufs_vget() while
		 * the delete is in progress (bugid 1242481).
		 */
		ip->i_flag |= IDEL;

		/*
		 * NOIDEL means that deletes are not allowed at this time;
		 * whoever resets NOIDEL will also send this inode back
		 * through ufs_iinactive.  IREF remains set.
		 */
		if (ULOCKFS_IS_NOIDEL(ITOUL(ip))) {
			mutex_enter(&vp->v_lock);
			VN_RELE_LOCKED(vp);
			mutex_exit(&vp->v_lock);
			rw_exit(&ip->i_contents);
			return;
		}
		if (!TRANS_ISTRANS(ip->i_ufsvfs)) {
			rw_exit(&ip->i_contents);
			ufs_delete(ip->i_ufsvfs, ip, 0);
			return;
		}

		/* queue to delete thread; IREF remains set */
		ins.in_qfree.value.ul++;
		uq = &ip->i_ufsvfs->vfs_delete;

		mutex_enter(&uq->uq_mutex);

		/* add to q */
		if ((iq = uq->uq_ihead) != 0) {
			ip->i_freef = iq;
			ip->i_freeb = iq->i_freeb;
			iq->i_freeb->i_freef = ip;
			iq->i_freeb = ip;
			if (front)
				uq->uq_ihead = ip;
		} else {
			uq->uq_ihead = ip;
			ip->i_freef = ip;
			ip->i_freeb = ip;
		}

		delq_info->delq_unreclaimed_files += 1;
		delq_info->delq_unreclaimed_blocks += ip->i_blocks;
	} else {
		/*
		 * queue to idle thread
		 *  Check the v_count == 1 again.
		 */
		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			VN_RELE_LOCKED(vp);
			mutex_exit(&vp->v_lock);
			rw_exit(&ip->i_contents);
			return;
		}
		mutex_exit(&vp->v_lock);
		uq = &ufs_idle_q;

		/*
		 * useful iff it has pages or is a fastsymlink; otherwise junk
		 */
		mutex_enter(&uq->uq_mutex);

		/* clear IREF means `on idle list' */
		ip->i_flag &= ~(IREF | IDIRECTIO);

		if (vn_has_cached_data(vp) || ip->i_flag & IFASTSYMLNK) {
			ins.in_frback.value.ul++;
			hip = (inode_t *)&ufs_useful_iq[IQHASH(ip)];
			ufs_nuseful_iq++;
		} else {
			ins.in_frfront.value.ul++;
			hip = (inode_t *)&ufs_junk_iq[IQHASH(ip)];
			ip->i_flag |= IJUNKIQ;
			ufs_njunk_iq++;
		}
		ip->i_freef = hip;
		ip->i_freeb = hip->i_freeb;
		hip->i_freeb->i_freef = ip;
		hip->i_freeb = ip;
	}

	/* wakeup thread(s) if q is overfull */
	if (++uq->uq_ne == uq->uq_lowat)
		cv_broadcast(&uq->uq_cv);

	/* all done, release the q and inode */
	mutex_exit(&uq->uq_mutex);
	rw_exit(&ip->i_contents);
}

/*
 * Check accessed and update flags on an inode structure.
 * If any are on, update the inode with the (unique) current time.
 * If waitfor is given, ensure I/O ordering by waiting for the write
 * to complete.
 */
void
ufs_iupdat(struct inode *ip, int waitfor)
{
	struct buf	*bp;
	struct fs	*fp;
	struct dinode	*dp;
	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;
	int		i;
	int		do_trans_times;
	ushort_t	flag;
	o_uid_t		suid;
	o_gid_t		sgid;

	/*
	 * This function is now safe to be called with either the reader
	 * or writer i_contents lock.
	 */
	ASSERT(RW_LOCK_HELD(&ip->i_contents));

	/*
	 * Return if file system has been forcibly umounted.
	 */
	if (ufsvfsp == NULL)
		return;

	flag = ip->i_flag;	/* Atomic read */
	/*
	 * We better not update the disk inode from a stale inode.
	 */
	if (flag & ISTALE)
		return;

	fp = ip->i_fs;

	if ((flag & (IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG)) != 0) {
		if (fp->fs_ronly) {
			mutex_enter(&ip->i_tlock);
			ip->i_flag &= ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
			mutex_exit(&ip->i_tlock);
			return;
		}
		/*
		 * fs is active while metadata is being written
		 */
		mutex_enter(&ufsvfsp->vfs_lock);
		ufs_notclean(ufsvfsp);
		/*
		 * get the dinode
		 */
		bp = UFS_BREAD(ufsvfsp, ip->i_dev,
		    (daddr_t)fsbtodb(fp, itod(fp, ip->i_number)),
		    (int)fp->fs_bsize);
		if (bp->b_flags & B_ERROR) {
			mutex_enter(&ip->i_tlock);
			ip->i_flag &=
			    ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
			mutex_exit(&ip->i_tlock);
			brelse(bp);
			return;
		}
		/*
		 * munge inode fields
		 */
		mutex_enter(&ip->i_tlock);
		ITIMES_NOLOCK(ip);
		do_trans_times = ((ip->i_flag & (IMOD|IMODACC)) == IMODACC);
		ip->i_flag &= ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
		mutex_exit(&ip->i_tlock);

		/*
		 * For reads and concurrent re-writes, no deltas were
		 * entered for the access time changes - do it now.
		 */
		if (do_trans_times) {
			TRANS_INODE_TIMES(ufsvfsp, ip);
		}

		/*
		 * For SunOS 5.0->5.4, these lines below read:
		 *
		 * suid = (ip->i_uid > MAXUID) ? UID_LONG : ip->i_uid;
		 * sgid = (ip->i_gid > MAXUID) ? GID_LONG : ip->i_gid;
		 *
		 * where MAXUID was set to 60002.  This was incorrect -
		 * the uids should have been constrained to what fitted into
		 * a 16-bit word.
		 *
		 * This means that files from 4.x filesystems that have an
		 * i_suid field larger than 60002 will have that field
		 * changed to 65535.
		 *
		 * Security note: 4.x UFS could never create a i_suid of
		 * UID_LONG since that would've corresponded to -1.
		 */
		suid = (ulong_t)ip->i_uid > (ulong_t)USHRT_MAX ?
		    UID_LONG : ip->i_uid;
		sgid = (ulong_t)ip->i_gid > (ulong_t)USHRT_MAX ?
		    GID_LONG : ip->i_gid;

		if ((ip->i_suid != suid) || (ip->i_sgid != sgid)) {
			ip->i_suid = suid;
			ip->i_sgid = sgid;
			TRANS_INODE(ufsvfsp, ip);
		}

		if ((ip->i_mode & IFMT) == IFBLK ||
		    (ip->i_mode & IFMT) == IFCHR) {
			dev_t d = ip->i_rdev;
			dev32_t dev32;

			/*
			 * load first direct block only if special device
			 */
			if (!cmpldev(&dev32, d)) {
				/*
				 * We panic here because there's "no way"
				 * we should have been able to create a large
				 * inode with a large dev_t.  Earlier layers
				 * should've caught this.
				 */
				panic("ip %p: i_rdev too big", (void *)ip);
			}

			if (dev32 & ~((O_MAXMAJ << L_BITSMINOR32) | O_MAXMIN)) {
				ip->i_ordev = dev32;	/* can't use old fmt. */
			} else {
				ip->i_ordev = cmpdev(d);
			}
		}

		/*
		 * copy inode to dinode (zero fastsymlnk in dinode)
		 */
		dp = (struct dinode *)bp->b_un.b_addr + itoo(fp, ip->i_number);
		dp->di_ic = ip->i_ic;	/* structure assignment */
		if (flag & IFASTSYMLNK) {
			for (i = 1; i < NDADDR; i++)
				dp->di_db[i] = 0;
			for (i = 0; i < NIADDR; i++)
				dp->di_ib[i] = 0;
		}
		if (TRANS_ISTRANS(ufsvfsp)) {
			/*
			 * Pass only a sector size buffer containing
			 * the inode, otherwise when the buffer is copied
			 * into a cached roll buffer then too much memory
			 * gets consumed if 8KB inode buffers are passed.
			 */
			TRANS_LOG(ufsvfsp, (caddr_t)dp, ip->i_doff,
			    sizeof (struct dinode),
			    (caddr_t)P2ALIGN((uintptr_t)dp, DEV_BSIZE),
			    DEV_BSIZE);

			brelse(bp);
		} else if (waitfor && (ip->i_ufsvfs->vfs_dio == 0)) {
			UFS_BRWRITE(ufsvfsp, bp);

			/*
			 * Synchronous write has guaranteed that inode
			 * has been written on disk so clear the flag
			 */
			mutex_enter(&ip->i_tlock);
			ip->i_flag &= ~IBDWRITE;
			mutex_exit(&ip->i_tlock);
		} else {
			bdrwrite(bp);

			/*
			 * This write hasn't guaranteed that inode has been
			 * written on the disk.
			 * Since, all updat flags on inode are cleared, we must
			 * remember the condition in case inode is to be updated
			 * synchronously later (e.g.- fsync()/fdatasync())
			 * and inode has not been modified yet.
			 */
			mutex_enter(&ip->i_tlock);
			ip->i_flag |= IBDWRITE;
			mutex_exit(&ip->i_tlock);
		}
	} else {
		/*
		 * In case previous inode update was done asynchronously
		 * (IBDWRITE) and this inode update request wants guaranteed
		 * (synchronous) disk update, flush the inode.
		 */
		if (waitfor && (flag & IBDWRITE)) {
			blkflush(ip->i_dev,
			    (daddr_t)fsbtodb(fp, itod(fp, ip->i_number)));
			mutex_enter(&ip->i_tlock);
			ip->i_flag &= ~IBDWRITE;
			mutex_exit(&ip->i_tlock);
		}
	}
}

#define	SINGLE	0	/* index of single indirect block */
#define	DOUBLE	1	/* index of double indirect block */
#define	TRIPLE	2	/* index of triple indirect block */

/*
 * Release blocks associated with the inode ip and
 * stored in the indirect block bn.  Blocks are free'd
 * in LIFO order up to (but not including) lastbn.  If
 * level is greater than SINGLE, the block is an indirect
 * block and recursive calls to indirtrunc must be used to
 * cleanse other indirect blocks.
 *
 * N.B.: triple indirect blocks are untested.
 */
static long
indirtrunc(struct inode *ip, daddr_t bn, daddr_t lastbn, int level, int flags)
{
	int i;
	struct buf *bp, *copy;
	daddr32_t *bap;
	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
	struct fs *fs = ufsvfsp->vfs_fs;
	daddr_t nb, last;
	long factor;
	int blocksreleased = 0, nblocks;

	ASSERT(RW_WRITE_HELD(&ip->i_contents));
	/*
	 * Calculate index in current block of last
	 * block to be kept.  -1 indicates the entire
	 * block so we need not calculate the index.
	 */
	factor = 1;
	for (i = SINGLE; i < level; i++)
		factor *= NINDIR(fs);
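	/*
	 * For example, assuming 8K blocks and 4-byte disk addresses
	 * (NINDIR(fs) == 2048): factor is 1 at SINGLE, 2048 at DOUBLE
	 * and 2048 * 2048 at TRIPLE, i.e. the number of data blocks
	 * ultimately mapped by one entry at this level.
	 */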
	last = lastbn;
	if (lastbn > 0)
		last /= factor;
	nblocks = btodb(fs->fs_bsize);
	/*
	 * Get buffer of block pointers, zero those
	 * entries corresponding to blocks to be free'd,
	 * and update on disk copy first.
	 * *Unless* the root pointer has been synchronously
	 * written to disk.  If nothing points to this
	 * indirect block then don't bother zero'ing and
	 * writing it.
	 */
	bp = UFS_BREAD(ufsvfsp,
	    ip->i_dev, (daddr_t)fsbtodb(fs, bn), (int)fs->fs_bsize);
	if (bp->b_flags & B_ERROR) {
		brelse(bp);
		return (0);
	}
	bap = bp->b_un.b_daddr;
	if ((flags & I_CHEAP) == 0) {
		uint_t	zb;

		zb = (uint_t)((NINDIR(fs) - (last + 1)) * sizeof (daddr32_t));

		if (zb) {
			/*
			 * push any data into the log before we zero it
			 */
			if (bp->b_flags & B_DELWRI)
				TRANS_LOG(ufsvfsp, (caddr_t)bap,
				    ldbtob(bp->b_blkno), bp->b_bcount,
				    bp->b_un.b_addr, bp->b_bcount);
			copy = ngeteblk(fs->fs_bsize);
			bcopy((caddr_t)bap, (caddr_t)copy->b_un.b_daddr,
			    (uint_t)fs->fs_bsize);
			bzero((caddr_t)&bap[last + 1], zb);

			TRANS_BUF(ufsvfsp,
			    (caddr_t)&bap[last + 1] - (caddr_t)bap,
			    zb, bp, DT_ABZERO);

			UFS_BRWRITE(ufsvfsp, bp);
			bp = copy, bap = bp->b_un.b_daddr;
		}
	} else {
		/* make sure write retries are also cleared */
		bp->b_flags &= ~(B_DELWRI | B_RETRYWRI);
		bp->b_flags |= B_STALE | B_AGE;
	}

	/*
	 * Recursively free totally unused blocks.
	 */
	flags |= I_CHEAP;
	for (i = NINDIR(fs) - 1; i > last; i--) {
		nb = bap[i];
		if (nb == 0)
			continue;
		if (level > SINGLE) {
			blocksreleased +=
			    indirtrunc(ip, nb, (daddr_t)-1, level - 1, flags);
			free(ip, nb, (off_t)fs->fs_bsize, flags | I_IBLK);
		} else
			free(ip, nb, (off_t)fs->fs_bsize, flags);
		blocksreleased += nblocks;
	}
	flags &= ~I_CHEAP;

	/*
	 * Recursively free last partial block.
	 */
	if (level > SINGLE && lastbn >= 0) {
		last = lastbn % factor;
		nb = bap[i];
		if (nb != 0)
			blocksreleased +=
			    indirtrunc(ip, nb, last, level - 1, flags);
	}
	brelse(bp);
	return (blocksreleased);
}

/*
 * Truncate the inode ip to at most length size.
 * Free affected disk blocks -- the blocks of the
 * file are removed in reverse order.
 *
 * N.B.: triple indirect blocks are untested.
 */
static int i_genrand = 1234;
int
ufs_itrunc(struct inode *oip, uoff_t length, int flags, cred_t *cr)
{
	struct fs *fs = oip->i_fs;
	struct ufsvfs *ufsvfsp = oip->i_ufsvfs;
	struct inode *ip;
	daddr_t lastblock;
	off_t bsize;
	int boff;
	daddr_t bn, lastiblock[NIADDR];
	int level;
	long nblocks, blocksreleased = 0;
	int i;
	ushort_t mode;
	struct inode tip;
	int err;
	uoff_t maxoffset = (ufsvfsp->vfs_lfflags & UFS_LARGEFILES) ?
	    (UFS_MAXOFFSET_T) : (MAXOFF32_T);

	/*
	 * Shadow inodes do not need to hold the vfs_dqrwlock lock.  Most
	 * other uses need the reader lock.  opendq() holds the writer lock.
	 */
	ASSERT((oip->i_mode & IFMT) == IFSHAD ||
	    RW_LOCK_HELD(&ufsvfsp->vfs_dqrwlock));
	ASSERT(RW_WRITE_HELD(&oip->i_contents));
	/*
	 * We only allow truncation of regular files and directories
	 * to arbitrary lengths here.  In addition, we allow symbolic
	 * links to be truncated only to zero length.  Other inode
	 * types cannot have their length set here.  Disk blocks are
	 * being dealt with - especially device inodes where
	 * ip->i_ordev is actually being stored in ip->i_db[0]!
	 */
	TRANS_INODE(ufsvfsp, oip);
	mode = oip->i_mode & IFMT;
	if (flags & I_FREE) {
		i_genrand *= 16843009;  /* turns into shift and adds */
		i_genrand++;
		oip->i_gen += ((i_genrand + ddi_get_lbolt()) & 0xffff) + 1;
		oip->i_flag |= ICHG |IUPD;
		oip->i_seq++;
		if (length == oip->i_size)
			return (0);
		flags |= I_CHEAP;
	}
	if (mode == IFIFO)
		return (0);
	if (mode != IFREG && mode != IFDIR && mode != IFATTRDIR &&
	    !(mode == IFLNK && length == (offset_t)0) && mode != IFSHAD)
		return (EINVAL);
	if (length > maxoffset)
		return (EFBIG);
	if ((mode == IFDIR) || (mode == IFATTRDIR))
		flags |= I_DIR;
	if (mode == IFSHAD)
		flags |= I_SHAD;
	if (oip == ufsvfsp->vfs_qinod)
		flags |= I_QUOTA;
	if (length == oip->i_size) {
		/* update ctime and mtime to please POSIX tests */
		oip->i_flag |= ICHG |IUPD;
		oip->i_seq++;
		if (length == 0) {
			/* nothing to cache so clear the flag */
			oip->i_flag &= ~IFASTSYMLNK;
		}
		return (0);
	}
	/* wipe out fast symlink till next access */
	if (oip->i_flag & IFASTSYMLNK) {
		int j;

		ASSERT(ITOV(oip)->v_type == VLNK);

		oip->i_flag &= ~IFASTSYMLNK;

		for (j = 1; j < NDADDR; j++)
			oip->i_db[j] = 0;
		for (j = 0; j < NIADDR; j++)
			oip->i_ib[j] = 0;
	}

	boff = (int)blkoff(fs, length);

	if (length > oip->i_size) {
		/*
		 * Trunc up case.  BMAPALLOC will insure that the right blocks
		 * are allocated.  This includes extending the old frag to a
		 * full block (if needed) in addition to doing any work
		 * needed for allocating the last block.
		 */
		if (boff == 0)
			err = BMAPALLOC(oip, length - 1, (int)fs->fs_bsize, cr);
		else
			err = BMAPALLOC(oip, length - 1, boff, cr);

		if (err == 0) {
			/*
			 * Save old size and set inode's size now
			 * so that we don't cause too much of the
			 * file to be zero'd and pushed.
			 */
			uoff_t osize = oip->i_size;
			oip->i_size = length;
			/*
			 * Make sure we zero out the remaining bytes of
			 * the page in case a mmap scribbled on it. We
			 * can't prevent a mmap from writing beyond EOF
			 * on the last page of a file.
			 */
			if ((boff = (int)blkoff(fs, osize)) != 0) {
				bsize = (int)lblkno(fs, osize - 1) >= NDADDR ?
				    fs->fs_bsize : fragroundup(fs, boff);
				pvn_vpzero(ITOV(oip), osize,
				    (size_t)(bsize - boff));
			}
			oip->i_flag |= ICHG|IATTCHG;
			oip->i_seq++;
			ITIMES_NOLOCK(oip);
			/*
			 * MAXOFF32_T is old 2GB size limit. If
			 * this operation caused a large file to be
			 * created, turn on the superblock flag
			 * and update the superblock, if the flag
			 * is not already on.
			 */
			if ((length > (uoff_t)MAXOFF32_T) &&
			    !(fs->fs_flags & FSLARGEFILES)) {
				ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
				mutex_enter(&ufsvfsp->vfs_lock);
				fs->fs_flags |= FSLARGEFILES;
				ufs_sbwrite(ufsvfsp);
				mutex_exit(&ufsvfsp->vfs_lock);
			}
		}

		return (err);
	}

	/*
	 * Update the pages of the file.  If the file is not being
	 * truncated to a block boundary, the contents of the
	 * pages following the end of the file must be zero'ed
	 * in case it ever becomes accessible again because
	 * of subsequent file growth.
	 */
	if (boff == 0) {
		(void) pvn_vplist_dirty(ITOV(oip), length, ufs_putapage,
		    B_INVAL | B_TRUNC, CRED());
	} else {
		/*
		 * Make sure that the last block is properly allocated.
		 * We only really have to do this if the last block is
		 * actually allocated since ufs_bmap will now handle the case
		 * of a fragment which has no block allocated.  Just to
		 * be sure, we do it now independent of current allocation.
		 */
		err = BMAPALLOC(oip, length - 1, boff, cr);
		if (err)
			return (err);

		/*
		 * BMAPALLOC will call bmap_write which defers i_seq
		 * processing.  If the timestamps were changed, update
		 * i_seq before rdip drops i_contents or syncs the inode.
		 */
		if (oip->i_flag & (ICHG|IUPD))
			oip->i_seq++;

		/*
		 * BugId 4069932
		 * Make sure that the relevant partial page appears in
		 * the v_object's list, so that pvn_vpzero() will do its
		 * job.  Since doing this correctly requires everything
		 * in rdip() except for the uiomove(), it's easier and
		 * safer to do the uiomove() rather than duplicate the
		 * rest of rdip() here.
		 *
		 * To get here, we know that length indicates a byte
		 * that is not the first byte of a block.  (length - 1)
		 * is the last actual byte known to exist.  Deduction
		 * shows it is in the same block as byte (length).
		 * Thus, this rdip() invocation should always succeed
		 * except in the face of i/o errors, and give us the
		 * block we care about.
		 *
		 * rdip() makes the same locking assertions and
		 * assumptions as we do.  We do not acquire any locks
		 * before calling it, so we have not changed the locking
		 * situation.  Finally, there do not appear to be any
		 * paths whereby rdip() ends up invoking us again.
		 * Thus, infinite recursion is avoided.
		 */
		{
			uio_t uio;
			iovec_t iov[1];
			char buffer;

			uio.uio_iov = iov;
			uio.uio_iovcnt = 1;
			uio.uio_loffset = length - 1;
			uio.uio_resid = 1;
			uio.uio_segflg = UIO_SYSSPACE;
			uio.uio_extflg = UIO_COPY_CACHED;

			iov[0].iov_base = &buffer;
			iov[0].iov_len = 1;

			err = rdip(oip, &uio, UIO_READ, NULL);
			if (err)
				return (err);
		}

		bsize = (int)lblkno(fs, length - 1) >= NDADDR ?
		    fs->fs_bsize : fragroundup(fs, boff);
		pvn_vpzero(ITOV(oip), length, (size_t)(bsize - boff));
		/*
		 * Ensure full fs block is marked as dirty.
		 */
		(void) pvn_vplist_dirty(ITOV(oip), length + (bsize - boff),
		    ufs_putapage, B_INVAL | B_TRUNC, CRED());
	}

	/*
	 * Calculate index into inode's block list of
	 * last direct and indirect blocks (if any)
	 * which we want to keep.  Lastblock is -1 when
	 * the file is truncated to 0.
	 */
	lastblock = lblkno(fs, length + fs->fs_bsize - 1) - 1;
	lastiblock[SINGLE] = lastblock - NDADDR;
	lastiblock[DOUBLE] = lastiblock[SINGLE] - NINDIR(fs);
	lastiblock[TRIPLE] = lastiblock[DOUBLE] - NINDIR(fs) * NINDIR(fs);
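	/*
	 * For example, truncating to length 0 yields lastblock == -1
	 * and all three lastiblock entries negative, so every direct
	 * and indirect block is released below.  With NDADDR == 12, a
	 * last kept block index of 12 yields lastiblock[SINGLE] == 0,
	 * i.e. only the first entry of the single indirect block is
	 * kept.
	 */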
	nblocks = btodb(fs->fs_bsize);

	/*
	 * Update file and block pointers
	 * on disk before we start freeing blocks.
	 * If we crash before free'ing blocks below,
	 * the blocks will be returned to the free list.
	 * lastiblock values are also normalized to -1
	 * for calls to indirtrunc below.
	 */
	tip = *oip;			/* structure copy */
	ip = &tip;

	for (level = TRIPLE; level >= SINGLE; level--)
		if (lastiblock[level] < 0) {
			oip->i_ib[level] = 0;
			lastiblock[level] = -1;
		}
	for (i = NDADDR - 1; i > lastblock; i--) {
		oip->i_db[i] = 0;
		flags |= I_CHEAP;
	}
	oip->i_size = length;
	oip->i_flag |= ICHG|IUPD|IATTCHG;
	oip->i_seq++;
	if (!TRANS_ISTRANS(ufsvfsp))
		ufs_iupdat(oip, I_SYNC);	/* do sync inode update */

	/*
	 * Indirect blocks first.
	 */
	for (level = TRIPLE; level >= SINGLE; level--) {
		bn = ip->i_ib[level];
		if (bn != 0) {
			blocksreleased +=
			    indirtrunc(ip, bn, lastiblock[level], level, flags);
			if (lastiblock[level] < 0) {
				ip->i_ib[level] = 0;
				free(ip, bn, (off_t)fs->fs_bsize,
				    flags | I_IBLK);
				blocksreleased += nblocks;
			}
		}
		if (lastiblock[level] >= 0)
			goto done;
	}

	/*
	 * All whole direct blocks or frags.
	 */
	for (i = NDADDR - 1; i > lastblock; i--) {
		bn = ip->i_db[i];
		if (bn == 0)
			continue;
		ip->i_db[i] = 0;
		bsize = (off_t)blksize(fs, ip, i);
		free(ip, bn, bsize, flags);
		blocksreleased += btodb(bsize);
	}
	if (lastblock < 0)
		goto done;

	/*
	 * Finally, look for a change in size of the
	 * last direct block; release any frags.
	 */
	bn = ip->i_db[lastblock];
	if (bn != 0) {
		off_t oldspace, newspace;

		/*
		 * Calculate amount of space we're giving
		 * back as old block size minus new block size.
		 */
		oldspace = blksize(fs, ip, lastblock);
		UFS_SET_ISIZE(length, ip);
		newspace = blksize(fs, ip, lastblock);
		if (newspace == 0) {
			err = ufs_fault(ITOV(ip), "ufs_itrunc: newspace == 0");
			return (err);
		}
		if (oldspace - newspace > 0) {
			/*
			 * Block number of space to be free'd is
			 * the old block # plus the number of frags
			 * required for the storage we're keeping.
			 */
			bn += numfrags(fs, newspace);
			free(ip, bn, oldspace - newspace, flags);
			blocksreleased += btodb(oldspace - newspace);
		}
	}
done:
	/* BEGIN PARANOIA */
	for (level = SINGLE; level <= TRIPLE; level++)
		if (ip->i_ib[level] != oip->i_ib[level]) {
			err = ufs_fault(ITOV(ip), "ufs_itrunc: indirect block");
			return (err);
		}

	for (i = 0; i < NDADDR; i++)
		if (ip->i_db[i] != oip->i_db[i]) {
			err = ufs_fault(ITOV(ip), "ufs_itrunc: direct block");
			return (err);
		}
	/* END PARANOIA */
	oip->i_blocks -= blocksreleased;

	if (oip->i_blocks < 0) {		/* sanity */
		cmn_err(CE_NOTE,
		    "ufs_itrunc: %s/%d new size = %lld, blocks = %d\n",
		    fs->fs_fsmnt, (int)oip->i_number, oip->i_size,
		    (int)oip->i_blocks);
		oip->i_blocks = 0;
	}
	oip->i_flag |= ICHG|IATTCHG;
	oip->i_seq++;
	/* blocksreleased is >= zero, so this can not fail */
	(void) chkdq(oip, -blocksreleased, 0, cr, (char **)NULL, NULL);
	return (0);
}

/*
 * Check mode permission on inode.  Mode is READ, WRITE or EXEC.
 * In the case of WRITE, the read-only status of the file system
 * is checked.  Depending on the calling user, the appropriate
 * mode bits are selected; privileges to override missing permission
 * bits are checked through secpolicy_vnode_access().
 * The i_contents lock must be held as reader here to prevent racing with
 * the acl subsystem removing/setting/changing acls on this inode.
 * The caller is responsible for indicating whether or not the i_contents
 * lock needs to be acquired here or if already held.
 */
int
ufs_iaccess(struct inode *ip, int mode, struct cred *cr, int dolock)
{
	int shift = 0;
	int ret = 0;

	if (dolock)
		rw_enter(&ip->i_contents, RW_READER);
	ASSERT(RW_LOCK_HELD(&ip->i_contents));

	if (mode & IWRITE) {
		/*
		 * Disallow write attempts on read-only
		 * file systems, unless the file is a block
		 * or character device or a FIFO.
		 */
		if (ip->i_fs->fs_ronly != 0) {
			if ((ip->i_mode & IFMT) != IFCHR &&
			    (ip->i_mode & IFMT) != IFBLK &&
			    (ip->i_mode & IFMT) != IFIFO) {
				ret = EROFS;
				goto out;
			}
		}
	}

	/*
	 * If there is an acl, check the acl and return.
	 */
	if (ip->i_ufs_acl && ip->i_ufs_acl->aowner) {
		ret = ufs_acl_access(ip, mode, cr);
		goto out;
	}

	/*
	 * Access check is based on only one of owner, group, public.
	 * If not owner, then check group.
	 * If not a member of the group, then check public access.
	 */
	if (crgetuid(cr) != ip->i_uid) {
		shift += 3;
		if (!groupmember((uid_t)ip->i_gid, cr))
			shift += 3;
	}
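
	/*
	 * shift is now 0 for the owner, 3 for another member of the
	 * group and 6 for everyone else, so i_mode << shift moves the
	 * applicable permission triplet into the owner (0700) position
	 * in which the mode bits being tested are expressed.
	 */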
	/* test missing privilege bits */
	ret = secpolicy_vnode_access2(cr, ITOV(ip), ip->i_uid,
	    ip->i_mode << shift, mode);
out:
	if (dolock)
		rw_exit(&ip->i_contents);
	return (ret);
}

/*
 * if necessary, remove an inode from the free list
 *	i_contents is held except at unmount
 *
 * Return 1 if the inode is taken off of the ufs_idle_q,
 * and the caller is expected to call VN_RELE.
 *
 * Return 0 otherwise.
 */
int
ufs_rmidle(struct inode *ip)
{
	int rval = 0;

	mutex_enter(&ip->i_tlock);
	if ((ip->i_flag & IREF) == 0) {
		mutex_enter(&ufs_idle_q.uq_mutex);
		ip->i_freef->i_freeb = ip->i_freeb;
		ip->i_freeb->i_freef = ip->i_freef;
		ip->i_freef = ip;
		ip->i_freeb = ip;
		ip->i_flag |= IREF;
		ufs_idle_q.uq_ne--;
		if (ip->i_flag & IJUNKIQ) {
			ufs_njunk_iq--;
			ip->i_flag &= ~IJUNKIQ;
		} else {
			ufs_nuseful_iq--;
		}
		mutex_exit(&ufs_idle_q.uq_mutex);
		rval = 1;
	}
	mutex_exit(&ip->i_tlock);
	return (rval);
}

/*
 * scan the hash of inodes and call func with the inode locked
 */
int
ufs_scan_inodes(int rwtry, int (*func)(struct inode *, void *), void *arg,
    struct ufsvfs *ufsvfsp)
{
	struct inode		*ip;		/* current inode */
	struct inode		*lip = NULL;	/* last/previous inode */
	union ihead		*ih;		/* current hash chain */
	int			error, i;
	int			saverror = 0;
	int			lip_held;	/* lip needs a VN_RELE() */

	/*
	 * If ufsvfsp is NULL, then our caller should be holding
	 * ufs_scan_lock to avoid conflicts between ufs_unmount() and
	 * ufs_update().  Otherwise, to avoid false-positives in
	 * ufs_unmount()'s v_count-based EBUSY check, we only hold
	 * those inodes that are in the file system our caller cares
	 * about.
	 *
	 * We know that ip is a valid inode in the hash chain (and thus
	 * we can trust i_ufsvfs) because the inode we chained from
	 * (lip) is still in the hash chain.  This is true because either:
	 *
	 * 1. We did not drop the hash chain lock since the last
	 *    iteration (because we were not interested in the last inode),
	 * or
	 * 2. We maintained a hold on the last inode while we
	 *    were processing it, so it could not be removed
	 *    from the hash chain.
	 *
	 * The whole reason we're dropping and re-grabbing the chain
	 * lock on every inode is so that we don't present a major
	 * choke point on throughput, particularly when we've been
	 * called on behalf of fsflush.
	 */
	for (i = 0, ih = ihead; i < inohsz; i++, ih++) {
		mutex_enter(&ih_lock[i]);
		for (ip = ih->ih_chain[0], lip_held = 0;
		    ip != (struct inode *)ih;
		    ip = lip->i_forw) {

			ins.in_scan.value.ul++;

			/*
			 * Undo the previous iteration's VN_HOLD(), but
			 * only if one was done.
			 */
			if (lip_held)
				VN_RELE(ITOV(lip));

			lip = ip;
			if (ufsvfsp != NULL && ip->i_ufsvfs != ufsvfsp) {
				/*
				 * We're not processing all inodes, and
				 * this inode is not in the filesystem of
				 * interest, so skip it.  No need to do a
				 * VN_HOLD() since we're not dropping the
				 * hash chain lock until after we've
				 * done the i_forw traversal above.
				 */
				lip_held = 0;
				continue;
			}
			VN_HOLD(ITOV(ip));
			lip_held = 1;
			mutex_exit(&ih_lock[i]);

			/*
			 * Acquire the contents lock as writer to make
			 * sure that the inode has been initialized in
			 * the cache or removed from the idle list by
			 * ufs_iget().  This works because ufs_iget()
			 * acquires the contents lock before putting
			 * the inode into the cache.  If we can lock
			 * it, then ufs_iget() is done with it.
			 */
			if (rwtry) {
				if (!rw_tryenter(&ip->i_contents, RW_WRITER)) {
					mutex_enter(&ih_lock[i]);
					continue;
				}
			} else {
				rw_enter(&ip->i_contents, RW_WRITER);
			}

			rw_exit(&ip->i_contents);

			/*
			 * ISTALE means the inode couldn't be read
			 *
			 * We don't have to hold the i_contents lock
			 * for this check for a couple of
			 * reasons.  First, if ISTALE is set then the
			 * flag cannot be cleared until the inode is
			 * removed from the cache and that cannot
			 * happen until after we VN_RELE() it.
			 * Second, if ISTALE is not set, then the
			 * inode is in the cache and does not need to
			 * be read from disk so ISTALE cannot be set
			 * while we are not looking.
			 */
			if ((ip->i_flag & ISTALE) == 0) {
				if ((error = (*func)(ip, arg)) != 0)
					saverror = error;
			}

			mutex_enter(&ih_lock[i]);
		}
		if (lip_held)
			VN_RELE(ITOV(lip));
		mutex_exit(&ih_lock[i]);
	}
	return (saverror);
}

/*
 * Mark inode with the current time, plus a unique increment.
 *
 * Since we only keep 32-bit time on disk, if UFS is still alive
 * beyond 2038, filesystem times will simply stick at the last
 * possible second of 32-bit time.  Not ideal, but probably better
 * than going into the remote past, or confusing applications with
 * negative time.
 */
void
ufs_imark(struct inode *ip)
{
	timestruc_t now;
	int32_t usec, nsec;

	/*
	 * The update of i_seq may have been deferred, increase i_seq here
	 * to make sure it is in sync with the timestamps.
	 */
	if (ip->i_flag & ISEQ) {
		ASSERT(ip->i_flag & (IUPD|ICHG));
		ip->i_seq++;
		ip->i_flag &= ~ISEQ;
	}

	gethrestime(&now);

	/*
	 * Fast algorithm to convert nsec to usec -- see hrt2ts()
	 * in kernel/os/timers.c for a full description.
	 */
	nsec = now.tv_nsec;
	usec = nsec + (nsec >> 2);
	usec = nsec + (usec >> 1);
	usec = nsec + (usec >> 2);
	usec = nsec + (usec >> 4);
	usec = nsec - (usec >> 3);
	usec = nsec + (usec >> 2);
	usec = nsec + (usec >> 3);
	usec = nsec + (usec >> 4);
	usec = nsec + (usec >> 1);
	usec = nsec + (usec >> 6);
	usec = usec >> 10;
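
	/*
	 * The cascade above approximates usec = nsec / 1000 without a
	 * division: the shift-and-add steps build roughly nsec * 1.024
	 * (i.e. nsec * 1024 / 1000) and the final >> 10 divides by 1024.
	 */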

	mutex_enter(&ufs_iuniqtime_lock);
	if (now.tv_sec > (time_t)iuniqtime.tv_sec ||
	    usec > iuniqtime.tv_usec) {
		if (now.tv_sec < TIME32_MAX) {
			iuniqtime.tv_sec = (time32_t)now.tv_sec;
			iuniqtime.tv_usec = usec;
		}
	} else {
		if (iuniqtime.tv_sec < TIME32_MAX) {
			iuniqtime.tv_usec++;
			/* Check for usec overflow */
			if (iuniqtime.tv_usec >= MICROSEC) {
				iuniqtime.tv_sec++;
				iuniqtime.tv_usec = 0;
			}
		}
	}

	if ((ip->i_flag & IACC) && !(ip->i_ufsvfs->vfs_noatime)) {
		ip->i_atime = iuniqtime;
	}
	if (ip->i_flag & IUPD) {
		ip->i_mtime = iuniqtime;
		ip->i_flag |= IMODTIME;
	}
	if (ip->i_flag & ICHG) {
		ip->i_diroff = 0;
		ip->i_ctime = iuniqtime;
	}
	mutex_exit(&ufs_iuniqtime_lock);
}

/*
 * Update timestamps in inode.
 */
void
ufs_itimes_nolock(struct inode *ip)
{
	/*
	 * if noatime is set and the inode access time is the only field that
	 * must be changed, exit immediately.
	 */
	if (((ip->i_flag & (IUPD|IACC|ICHG)) == IACC) &&
	    (ip->i_ufsvfs->vfs_noatime)) {
		return;
	}

	if (ip->i_flag & (IUPD|IACC|ICHG)) {
		if (ip->i_flag & ICHG)
			ip->i_flag |= IMOD;
		else
			ip->i_flag |= IMODACC;
		ufs_imark(ip);
		ip->i_flag &= ~(IACC|IUPD|ICHG);
	}
}