4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 1983, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2016, 2017 by Delphix. All rights reserved.
26 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
27 /* All Rights Reserved */
30 * University Copyright- Copyright (c) 1982, 1986, 1988
31 * The Regents of the University of California
34 * University Acknowledgment- Portions of this document are derived from
35 * software developed by the University of California, Berkeley, and its
39 #include <sys/types.h>
40 #include <sys/t_lock.h>
41 #include <sys/param.h>
42 #include <sys/systm.h>
44 #include <sys/bitmap.h>
45 #include <sys/signal.h>
50 #include <sys/vnode.h>
56 #include <sys/cmn_err.h>
57 #include <sys/kstat.h>
60 #include <sys/fs/ufs_inode.h>
61 #include <sys/fs/ufs_fs.h>
62 #include <sys/fs/ufs_trans.h>
63 #include <sys/fs/ufs_acl.h>
64 #include <sys/fs/ufs_bio.h>
65 #include <sys/fs/ufs_quota.h>
66 #include <sys/fs/ufs_log.h>
72 #include <sys/cpuvar.h>
73 #include <sys/sysmacros.h>
74 #include <sys/errno.h>
76 #include <sys/debug.h>
77 #include <sys/fs_subr.h>
78 #include <sys/policy.h>
80 struct kmem_cache
*inode_cache
; /* cache of free inodes */
82 /* UFS Inode Cache Stats -- Not protected */
83 struct instats ins
= {
84 { "size", KSTAT_DATA_ULONG
},
85 { "maxsize", KSTAT_DATA_ULONG
},
86 { "hits", KSTAT_DATA_ULONG
},
87 { "misses", KSTAT_DATA_ULONG
},
88 { "kmem allocs", KSTAT_DATA_ULONG
},
89 { "kmem frees", KSTAT_DATA_ULONG
},
90 { "maxsize reached", KSTAT_DATA_ULONG
},
91 { "puts at frontlist", KSTAT_DATA_ULONG
},
92 { "puts at backlist", KSTAT_DATA_ULONG
},
93 { "queues to free", KSTAT_DATA_ULONG
},
94 { "scans", KSTAT_DATA_ULONG
},
95 { "thread idles", KSTAT_DATA_ULONG
},
96 { "lookup idles", KSTAT_DATA_ULONG
},
97 { "vget idles", KSTAT_DATA_ULONG
},
98 { "cache allocs", KSTAT_DATA_ULONG
},
99 { "cache frees", KSTAT_DATA_ULONG
},
100 { "pushes at close", KSTAT_DATA_ULONG
}
104 static kstat_t
*ufs_inode_kstat
= NULL
;
106 union ihead
*ihead
; /* inode LRU cache, Chris Maltby */
107 kmutex_t
*ih_lock
; /* protect inode cache hash table */
108 static int ino_hashlen
= 4; /* desired average hash chain length */
109 int inohsz
; /* number of buckets in the hash table */
111 kmutex_t ufs_scan_lock
; /* stop racing multiple ufs_scan_inodes() */
112 kmutex_t ufs_iuniqtime_lock
; /* protect iuniqtime */
113 kmutex_t ufsvfs_mutex
;
114 struct ufsvfs
*oldufsvfslist
, *ufsvfslist
;
117 * time to wait after ufsvfsp->vfs_iotstamp before declaring that no
123 * the threads that process idle inodes and free (deleted) inodes
124 * have high water marks that are set in ufsinit().
125 * These values but can be no less then the minimum shown below
127 int ufs_idle_max
; /* # of allowable idle inodes */
128 ulong_t ufs_inode_max
; /* hard limit of allowable idle inodes */
129 #define UFS_IDLE_MAX (16) /* min # of allowable idle inodes */
132 * Tunables for ufs write throttling.
133 * These are validated in ufs_iinit() since improper settings
134 * can lead to filesystem hangs.
136 #define UFS_HW_DEFAULT (16 * 1024 * 1024)
137 #define UFS_LW_DEFAULT (8 * 1024 * 1024)
138 int ufs_HW
= UFS_HW_DEFAULT
;
139 int ufs_LW
= UFS_LW_DEFAULT
;
141 static void ihinit(void);
142 extern int hash2ints(int, int);
144 static int ufs_iget_internal(struct vfs
*, ino_t
, struct inode
**,
149 ufs_inode_kstat_update(kstat_t
*ksp
, int rw
)
151 if (rw
== KSTAT_WRITE
)
154 ins
.in_malloc
.value
.ul
= (ulong_t
)kmem_cache_stat(inode_cache
,
156 ins
.in_mfree
.value
.ul
= (ulong_t
)kmem_cache_stat(inode_cache
,
158 ins
.in_kcalloc
.value
.ul
= (ulong_t
)kmem_cache_stat(inode_cache
,
160 ins
.in_kcfree
.value
.ul
= (ulong_t
)kmem_cache_stat(inode_cache
,
162 ins
.in_size
.value
.ul
= (ulong_t
)kmem_cache_stat(inode_cache
,
164 ins
.in_maxreached
.value
.ul
= (ulong_t
)kmem_cache_stat(inode_cache
,
166 ins
.in_misses
.value
.ul
= ins
.in_kcalloc
.value
.ul
;
175 * Validate that ufs_HW > ufs_LW.
176 * The default values for these two tunables have been increased.
177 * There is now a range of values for ufs_HW that used to be
178 * legal on previous Solaris versions but no longer is now.
179 * Upgrading a machine which has an /etc/system setting for ufs_HW
180 * from that range can lead to filesystem hangs unless the values
183 if (ufs_HW
<= ufs_LW
) {
185 "ufs_HW (%d) <= ufs_LW (%d). Check /etc/system.",
187 ufs_LW
= UFS_LW_DEFAULT
;
188 ufs_HW
= UFS_HW_DEFAULT
;
189 cmn_err(CE_CONT
, "using defaults, ufs_HW = %d, ufs_LW = %d\n",
194 * Adjust the tunable `ufs_ninode' to a reasonable value
198 if (ufs_inode_max
== 0)
200 (ulong_t
)((kmem_maxavail() >> 2) / sizeof (struct inode
));
201 if (ufs_ninode
> ufs_inode_max
|| (ufs_ninode
== 0 && ncsize
== 0)) {
202 cmn_err(CE_NOTE
, "setting ufs_ninode to max value of %ld",
204 ufs_ninode
= ufs_inode_max
;
207 * Wait till third call of ufs_update to declare that no I/Os are
208 * going on. This allows deferred access times to be flushed to disk.
210 ufs_iowait
= v
.v_autoup
* hz
* 2;
213 * idle thread runs when 25% of ufs_ninode entries are on the queue
215 if (ufs_idle_max
== 0)
216 ufs_idle_max
= ufs_ninode
>> 2;
217 if (ufs_idle_max
< UFS_IDLE_MAX
)
218 ufs_idle_max
= UFS_IDLE_MAX
;
219 if (ufs_idle_max
> ufs_ninode
)
220 ufs_idle_max
= ufs_ninode
;
222 * This is really a misnomer, it is ufs_queue_init
224 ufs_thread_init(&ufs_idle_q
, ufs_idle_max
);
225 ufs_thread_start(&ufs_idle_q
, ufs_thread_idle
, NULL
);
228 * global hlock thread
230 ufs_thread_init(&ufs_hlock
, 1);
231 ufs_thread_start(&ufs_hlock
, ufs_thread_hlock
, NULL
);
235 ins
.in_maxsize
.value
.ul
= ufs_ninode
;
236 if ((ufs_inode_kstat
= kstat_create("ufs", 0, "inode_cache", "ufs",
237 KSTAT_TYPE_NAMED
, sizeof (ins
) / sizeof (kstat_named_t
),
238 KSTAT_FLAG_VIRTUAL
)) != NULL
) {
239 ufs_inode_kstat
->ks_data
= (void *)&ins
;
240 ufs_inode_kstat
->ks_update
= ufs_inode_kstat_update
;
241 kstat_install(ufs_inode_kstat
);
243 ufsfx_init(); /* fix-on-panic initialization */
247 mutex_init(&ufs_iuniqtime_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
252 ufs_inode_cache_constructor(void *buf
, void *cdrarg
, int kmflags
)
254 struct inode
*ip
= buf
;
257 vp
= ip
->i_vnode
= vn_alloc(kmflags
);
261 vn_setops(vp
, &ufs_vnodeops
);
264 rw_init(&ip
->i_rwlock
, NULL
, RW_DEFAULT
, NULL
);
265 rw_init(&ip
->i_contents
, NULL
, RW_DEFAULT
, NULL
);
266 mutex_init(&ip
->i_tlock
, NULL
, MUTEX_DEFAULT
, NULL
);
267 dnlc_dir_init(&ip
->i_danchor
);
269 cv_init(&ip
->i_wrcv
, NULL
, CV_DRIVER
, NULL
);
276 ufs_inode_cache_destructor(void *buf
, void *cdrarg
)
278 struct inode
*ip
= buf
;
283 rw_destroy(&ip
->i_rwlock
);
284 rw_destroy(&ip
->i_contents
);
285 mutex_destroy(&ip
->i_tlock
);
286 if (vp
->v_type
== VDIR
) {
287 dnlc_dir_fini(&ip
->i_danchor
);
290 cv_destroy(&ip
->i_wrcv
);
296 * Initialize hash links for inodes
297 * and build inode free list.
303 union ihead
*ih
= ihead
;
305 mutex_init(&ufs_scan_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
307 inohsz
= 1 << highbit(ufs_ninode
/ ino_hashlen
);
308 ihead
= kmem_zalloc(inohsz
* sizeof (union ihead
), KM_SLEEP
);
309 ih_lock
= kmem_zalloc(inohsz
* sizeof (kmutex_t
), KM_SLEEP
);
311 for (i
= 0, ih
= ihead
; i
< inohsz
; i
++, ih
++) {
314 mutex_init(&ih_lock
[i
], NULL
, MUTEX_DEFAULT
, NULL
);
316 inode_cache
= kmem_cache_create("ufs_inode_cache",
317 sizeof (struct inode
), 0, ufs_inode_cache_constructor
,
318 ufs_inode_cache_destructor
, ufs_inode_cache_reclaim
,
323 * Free an inode structure
326 ufs_free_inode(struct inode
*ip
)
328 vn_invalid(ITOV(ip
));
329 kmem_cache_free(inode_cache
, ip
);
333 * Allocate an inode structure
336 ufs_alloc_inode(ufsvfs_t
*ufsvfsp
, ino_t ino
)
341 ip
= kmem_cache_alloc(inode_cache
, KM_SLEEP
);
343 * at this point we have a newly allocated inode
348 ip
->i_seq
= 0xFF; /* Unique initial value */
349 ip
->i_dev
= ufsvfsp
->vfs_dev
;
350 ip
->i_ufsvfs
= ufsvfsp
;
351 ip
->i_devvp
= ufsvfsp
->vfs_devvp
;
362 ip
->i_ufs_acl
= NULL
;
366 ip
->i_cachedir
= CD_ENABLED
;
370 * the vnode for this inode was allocated by the constructor
374 if (ino
== (ino_t
)UFSROOTINO
)
376 vp
->v_vfsp
= ufsvfsp
->vfs_vfs
;
382 * Look up an inode by device, inumber. If it is in core (in the
383 * inode structure), honor the locking protocol. If it is not in
384 * core, read it in from the specified device after freeing any pages.
385 * In all cases, a pointer to a VN_HELD inode structure is returned.
388 ufs_iget(struct vfs
*vfsp
, ino_t ino
, struct inode
**ipp
, struct cred
*cr
)
390 return (ufs_iget_internal(vfsp
, ino
, ipp
, cr
, 0));
394 * A version of ufs_iget which returns only allocated, linked inodes.
395 * This is appropriate for any callers who do not expect a free inode.
398 ufs_iget_alloced(struct vfs
*vfsp
, ino_t ino
, struct inode
**ipp
,
401 return (ufs_iget_internal(vfsp
, ino
, ipp
, cr
, 1));
405 * Set vnode attributes based on v_type, this should be called whenever
406 * an inode's i_mode is changed.
409 ufs_reset_vnode(vnode_t
*vp
)
414 if ((VTOI(vp
)->i_mode
& (ISVTX
| IEXEC
| IFDIR
)) == ISVTX
)
415 vp
->v_flag
|= VSWAPLIKE
;
417 vp
->v_flag
&= ~VSWAPLIKE
;
420 * if not swap like and it's just a regular file, we want
421 * to maintain the vnode's pages sorted by clean/modified
422 * for faster sync'ing to disk
424 if (vp
->v_type
== VREG
)
425 vp
->v_flag
|= VMODSORT
;
427 vp
->v_flag
&= ~VMODSORT
;
430 * Is this an attribute hidden dir?
432 if ((VTOI(vp
)->i_mode
& IFMT
) == IFATTRDIR
)
433 vp
->v_flag
|= V_XATTRDIR
;
435 vp
->v_flag
&= ~V_XATTRDIR
;
439 * Shared implementation of ufs_iget and ufs_iget_alloced. The 'validate'
440 * flag is used to distinguish the two; when true, we validate that the inode
441 * being retrieved looks like a linked and allocated inode.
445 ufs_iget_internal(struct vfs
*vfsp
, ino_t ino
, struct inode
**ipp
,
446 struct cred
*cr
, int validate
)
448 struct inode
*ip
, *sp
;
454 extern vfs_t EIO_vfs
;
456 int ftype
; /* XXX - Remove later on */
458 struct ufsvfs
*ufsvfsp
;
464 CPU_STATS_ADD_K(sys
, ufsiget
, 1);
467 * Lookup inode in cache.
469 vfs_dev
= vfsp
->vfs_dev
;
476 for (ip
= ih
->ih_chain
[0]; ip
!= (struct inode
*)ih
; ip
= ip
->i_forw
) {
477 if (ino
!= ip
->i_number
|| vfs_dev
!= ip
->i_dev
||
478 (ip
->i_flag
& ISTALE
))
482 * Found the interesting inode; hold it and drop the cache lock
484 vp
= ITOV(ip
); /* for locknest */
487 rw_enter(&ip
->i_contents
, RW_READER
);
490 * if necessary, remove from idle list
492 if ((ip
->i_flag
& IREF
) == 0) {
498 * Could the inode be read from disk?
500 if (ip
->i_flag
& ISTALE
) {
501 rw_exit(&ip
->i_contents
);
506 ins
.in_hits
.value
.ul
++;
510 * Reset the vnode's attribute flags
512 mutex_enter(&vp
->v_lock
);
514 mutex_exit(&vp
->v_lock
);
516 rw_exit(&ip
->i_contents
);
523 * Inode was not in cache.
525 * Allocate a new entry
527 ufsvfsp
= (struct ufsvfs
*)vfsp
->vfs_data
;
528 fs
= ufsvfsp
->vfs_fs
;
530 ip
= ufs_alloc_inode(ufsvfsp
, ino
);
533 bno
= fsbtodb(fs
, itod(fs
, ino
));
534 ioff
= (sizeof (struct dinode
)) * (itoo(fs
, ino
));
535 ip
->i_doff
= (offset_t
)ioff
+ ldbtob(bno
);
538 * put a place holder in the cache (if not already there)
541 for (sp
= ih
->ih_chain
[0]; sp
!= (struct inode
*)ih
; sp
= sp
->i_forw
)
542 if (ino
== sp
->i_number
&& vfs_dev
== sp
->i_dev
&&
543 ((sp
->i_flag
& ISTALE
) == 0)) {
549 * It would be nice to ASSERT(RW_READ_HELD(&ufsvfsp->vfs_dqrwlock))
550 * here, but if we do, then shadow inode allocations panic the
551 * system. We don't have to hold vfs_dqrwlock for shadow inodes
552 * and the ufs_iget() parameters don't tell us what we are getting
553 * so we have no way of knowing this is a ufs_iget() call from
554 * a ufs_ialloc() call for a shadow inode.
556 rw_enter(&ip
->i_contents
, RW_WRITER
);
562 bp
= UFS_BREAD(ufsvfsp
, ip
->i_dev
, bno
, (int)fs
->fs_bsize
);
567 error
= ((bp
->b_flags
& B_ERROR
) ? geterror(bp
) : 0);
570 ip
->i_flag
|= ISTALE
; /* in case someone is looking it up */
571 rw_exit(&ip
->i_contents
);
572 vp
->v_vfsp
= &EIO_vfs
;
577 * initialize the inode's dinode
579 dp
= (struct dinode
*)(ioff
+ bp
->b_un
.b_addr
);
580 ip
->i_ic
= dp
->di_ic
; /* structure assignment */
584 * Maintain compatibility with Solaris 1.x UFS
586 if (ip
->i_suid
!= UID_LONG
)
587 ip
->i_uid
= ip
->i_suid
;
588 if (ip
->i_sgid
!= GID_LONG
)
589 ip
->i_gid
= ip
->i_sgid
;
591 ftype
= ip
->i_mode
& IFMT
;
592 if (ftype
== IFBLK
|| ftype
== IFCHR
) {
594 uint_t top16
= ip
->i_ordev
& 0xffff0000u
;
596 if (top16
== 0 || top16
== 0xffff0000u
)
597 dv
= expdev(ip
->i_ordev
);
599 dv
= expldev(ip
->i_ordev
);
600 vp
->v_rdev
= ip
->i_rdev
= dv
;
604 * if our caller only expects allocated inodes, verify that
605 * this inode looks good; throw it out if it's bad.
608 if ((ftype
== 0) || (ip
->i_nlink
<= 0)) {
609 ip
->i_flag
|= ISTALE
;
610 rw_exit(&ip
->i_contents
);
611 vp
->v_vfsp
= &EIO_vfs
;
614 "%s: unexpected free inode %d, run fsck(1M)%s",
615 fs
->fs_fsmnt
, (int)ino
,
616 (TRANS_ISTRANS(ufsvfsp
) ? " -o f" : ""));
622 * Finish initializing the vnode, special handling for shadow inodes
623 * because IFTOVT() will produce a v_type of VNON which is not what we
624 * want, set v_type to VREG explicitly in that case.
626 if (ftype
== IFSHAD
) {
629 vp
->v_type
= IFTOVT((mode_t
)ip
->i_mode
);
637 if (ftype
!= 0 && ip
->i_shadow
!= 0) {
638 if ((error
= ufs_si_load(ip
, cr
)) != 0) {
639 ip
->i_flag
|= ISTALE
;
640 ip
->i_ufs_acl
= NULL
;
641 rw_exit(&ip
->i_contents
);
642 vp
->v_vfsp
= &EIO_vfs
;
649 * Only attach quota information if the inode has a type and if
650 * that type is not a shadow inode.
652 if (ip
->i_mode
&& ((ip
->i_mode
& IFMT
) != IFSHAD
) &&
653 ((ip
->i_mode
& IFMT
) != IFATTRDIR
)) {
654 ip
->i_dquot
= getinoquota(ip
);
656 TRANS_MATA_IGET(ufsvfsp
, ip
);
658 rw_exit(&ip
->i_contents
);
664 * Vnode is no longer referenced, write the inode out
665 * and if necessary, truncate and deallocate the file.
668 ufs_iinactive(struct inode
*ip
)
674 struct vnode
*vp
= ITOV(ip
);
675 struct ufsvfs
*ufsvfsp
= ip
->i_ufsvfs
;
676 struct ufs_delq_info
*delq_info
= &ufsvfsp
->vfs_delete_info
;
679 * Because the vnode type might have been changed,
680 * the dnlc_dir_purge must be called unconditionally.
682 dnlc_dir_purge(&ip
->i_danchor
);
685 * Get exclusive access to inode data.
687 rw_enter(&ip
->i_contents
, RW_WRITER
);
688 ASSERT(ip
->i_flag
& IREF
);
691 * Make sure no one reclaimed the inode before we put it on
692 * the freelist or destroy it. We keep our 'hold' on the vnode
693 * from vn_rele until we are ready to do something with the inode.
695 * Pageout may put a VN_HOLD/VN_RELE at anytime during this
696 * operation via an async putpage, so we must make sure
697 * we don't free/destroy the inode more than once. ufs_iget
698 * may also put a VN_HOLD on the inode before it grabs
699 * the i_contents lock. This is done so we don't free
700 * an inode that a thread is waiting on.
702 mutex_enter(&vp
->v_lock
);
704 if (vp
->v_count
> 1) {
706 mutex_exit(&vp
->v_lock
);
707 rw_exit(&ip
->i_contents
);
710 mutex_exit(&vp
->v_lock
);
713 * For umount case: if ufsvfs ptr is NULL, the inode is unhashed
714 * and clean. It can be safely destroyed (cyf).
716 if (ip
->i_ufsvfs
== NULL
) {
717 rw_exit(&ip
->i_contents
);
719 ASSERT((vp
->v_type
== VCHR
) || !vn_has_cached_data(vp
));
725 * queue idle inode to appropriate thread. Will check v_count == 1
726 * prior to putting this on the appropriate queue.
727 * Stale inodes will be unhashed and freed by the ufs idle thread
731 if ((ip
->i_flag
& ISTALE
) == 0 && ip
->i_fs
->fs_ronly
== 0 &&
732 ip
->i_mode
&& ip
->i_nlink
<= 0) {
734 * Mark the i_flag to indicate that inode is being deleted.
735 * This flag will be cleared when the deletion is complete.
736 * This prevents nfs from sneaking in via ufs_vget() while
737 * the delete is in progress (bugid 1242481).
742 * NOIDEL means that deletes are not allowed at this time;
743 * whoever resets NOIDEL will also send this inode back
744 * through ufs_iinactive. IREF remains set.
746 if (ULOCKFS_IS_NOIDEL(ITOUL(ip
))) {
747 mutex_enter(&vp
->v_lock
);
749 mutex_exit(&vp
->v_lock
);
750 rw_exit(&ip
->i_contents
);
753 if (!TRANS_ISTRANS(ip
->i_ufsvfs
)) {
754 rw_exit(&ip
->i_contents
);
755 ufs_delete(ip
->i_ufsvfs
, ip
, 0);
759 /* queue to delete thread; IREF remains set */
760 ins
.in_qfree
.value
.ul
++;
761 uq
= &ip
->i_ufsvfs
->vfs_delete
;
763 mutex_enter(&uq
->uq_mutex
);
766 if ((iq
= uq
->uq_ihead
) != 0) {
768 ip
->i_freeb
= iq
->i_freeb
;
769 iq
->i_freeb
->i_freef
= ip
;
779 delq_info
->delq_unreclaimed_files
+= 1;
780 delq_info
->delq_unreclaimed_blocks
+= ip
->i_blocks
;
783 * queue to idle thread
784 * Check the v_count == 1 again.
787 mutex_enter(&vp
->v_lock
);
788 if (vp
->v_count
> 1) {
790 mutex_exit(&vp
->v_lock
);
791 rw_exit(&ip
->i_contents
);
794 mutex_exit(&vp
->v_lock
);
798 * useful iff it has pages or is a fastsymlink; otherwise junk
800 mutex_enter(&uq
->uq_mutex
);
802 /* clear IREF means `on idle list' */
803 ip
->i_flag
&= ~(IREF
| IDIRECTIO
);
805 if (vn_has_cached_data(vp
) || ip
->i_flag
& IFASTSYMLNK
) {
806 ins
.in_frback
.value
.ul
++;
807 hip
= (inode_t
*)&ufs_useful_iq
[IQHASH(ip
)];
810 ins
.in_frfront
.value
.ul
++;
811 hip
= (inode_t
*)&ufs_junk_iq
[IQHASH(ip
)];
812 ip
->i_flag
|= IJUNKIQ
;
816 ip
->i_freeb
= hip
->i_freeb
;
817 hip
->i_freeb
->i_freef
= ip
;
821 /* wakeup thread(s) if q is overfull */
822 if (++uq
->uq_ne
== uq
->uq_lowat
)
823 cv_broadcast(&uq
->uq_cv
);
825 /* all done, release the q and inode */
826 mutex_exit(&uq
->uq_mutex
);
827 rw_exit(&ip
->i_contents
);
831 * Check accessed and update flags on an inode structure.
832 * If any are on, update the inode with the (unique) current time.
833 * If waitfor is given, insure I/O order so wait for write to complete.
836 ufs_iupdat(struct inode
*ip
, int waitfor
)
841 struct ufsvfs
*ufsvfsp
= ip
->i_ufsvfs
;
849 * This function is now safe to be called with either the reader
850 * or writer i_contents lock.
852 ASSERT(RW_LOCK_HELD(&ip
->i_contents
));
855 * Return if file system has been forcibly umounted.
860 flag
= ip
->i_flag
; /* Atomic read */
862 * We better not update the disk inode from a stale inode.
869 if ((flag
& (IUPD
|IACC
|ICHG
|IMOD
|IMODACC
|IATTCHG
)) != 0) {
871 mutex_enter(&ip
->i_tlock
);
872 ip
->i_flag
&= ~(IUPD
|IACC
|ICHG
|IMOD
|IMODACC
|IATTCHG
);
873 mutex_exit(&ip
->i_tlock
);
877 * fs is active while metadata is being written
879 mutex_enter(&ufsvfsp
->vfs_lock
);
880 ufs_notclean(ufsvfsp
);
884 bp
= UFS_BREAD(ufsvfsp
, ip
->i_dev
,
885 (daddr_t
)fsbtodb(fp
, itod(fp
, ip
->i_number
)),
887 if (bp
->b_flags
& B_ERROR
) {
888 mutex_enter(&ip
->i_tlock
);
890 ~(IUPD
|IACC
|ICHG
|IMOD
|IMODACC
|IATTCHG
);
891 mutex_exit(&ip
->i_tlock
);
898 mutex_enter(&ip
->i_tlock
);
900 do_trans_times
= ((ip
->i_flag
& (IMOD
|IMODACC
)) == IMODACC
);
901 ip
->i_flag
&= ~(IUPD
|IACC
|ICHG
|IMOD
|IMODACC
|IATTCHG
);
902 mutex_exit(&ip
->i_tlock
);
905 * For reads and concurrent re-writes, no deltas were
906 * entered for the access time changes - do it now.
908 if (do_trans_times
) {
909 TRANS_INODE_TIMES(ufsvfsp
, ip
);
913 * For SunOS 5.0->5.4, these lines below read:
915 * suid = (ip->i_uid > MAXUID) ? UID_LONG : ip->i_uid;
916 * sgid = (ip->i_gid > MAXUID) ? GID_LONG : ip->i_gid;
918 * where MAXUID was set to 60002. This was incorrect -
919 * the uids should have been constrained to what fitted into
922 * This means that files from 4.x filesystems that have an
923 * i_suid field larger than 60002 will have that field
926 * Security note: 4.x UFS could never create a i_suid of
927 * UID_LONG since that would've corresponded to -1.
929 suid
= (ulong_t
)ip
->i_uid
> (ulong_t
)USHRT_MAX
?
930 UID_LONG
: ip
->i_uid
;
931 sgid
= (ulong_t
)ip
->i_gid
> (ulong_t
)USHRT_MAX
?
932 GID_LONG
: ip
->i_gid
;
934 if ((ip
->i_suid
!= suid
) || (ip
->i_sgid
!= sgid
)) {
937 TRANS_INODE(ufsvfsp
, ip
);
940 if ((ip
->i_mode
& IFMT
) == IFBLK
||
941 (ip
->i_mode
& IFMT
) == IFCHR
) {
942 dev_t d
= ip
->i_rdev
;
946 * load first direct block only if special device
948 if (!cmpldev(&dev32
, d
)) {
950 * We panic here because there's "no way"
951 * we should have been able to create a large
952 * inode with a large dev_t. Earlier layers
953 * should've caught this.
955 panic("ip %p: i_rdev too big", (void *)ip
);
958 if (dev32
& ~((O_MAXMAJ
<< L_BITSMINOR32
) | O_MAXMIN
)) {
959 ip
->i_ordev
= dev32
; /* can't use old fmt. */
961 ip
->i_ordev
= cmpdev(d
);
966 * copy inode to dinode (zero fastsymlnk in dinode)
968 dp
= (struct dinode
*)bp
->b_un
.b_addr
+ itoo(fp
, ip
->i_number
);
969 dp
->di_ic
= ip
->i_ic
; /* structure assignment */
970 if (flag
& IFASTSYMLNK
) {
971 for (i
= 1; i
< NDADDR
; i
++)
973 for (i
= 0; i
< NIADDR
; i
++)
976 if (TRANS_ISTRANS(ufsvfsp
)) {
978 * Pass only a sector size buffer containing
979 * the inode, otherwise when the buffer is copied
980 * into a cached roll buffer then too much memory
981 * gets consumed if 8KB inode buffers are passed.
983 TRANS_LOG(ufsvfsp
, (caddr_t
)dp
, ip
->i_doff
,
984 sizeof (struct dinode
),
985 (caddr_t
)P2ALIGN((uintptr_t)dp
, DEV_BSIZE
),
989 } else if (waitfor
&& (ip
->i_ufsvfs
->vfs_dio
== 0)) {
990 UFS_BRWRITE(ufsvfsp
, bp
);
993 * Synchronous write has guaranteed that inode
994 * has been written on disk so clear the flag
996 mutex_enter(&ip
->i_tlock
);
997 ip
->i_flag
&= ~IBDWRITE
;
998 mutex_exit(&ip
->i_tlock
);
1003 * This write hasn't guaranteed that inode has been
1004 * written on the disk.
1005 * Since, all updat flags on inode are cleared, we must
1006 * remember the condition in case inode is to be updated
1007 * synchronously later (e.g.- fsync()/fdatasync())
1008 * and inode has not been modified yet.
1010 mutex_enter(&ip
->i_tlock
);
1011 ip
->i_flag
|= IBDWRITE
;
1012 mutex_exit(&ip
->i_tlock
);
1016 * In case previous inode update was done asynchronously
1017 * (IBDWRITE) and this inode update request wants guaranteed
1018 * (synchronous) disk update, flush the inode.
1020 if (waitfor
&& (flag
& IBDWRITE
)) {
1022 (daddr_t
)fsbtodb(fp
, itod(fp
, ip
->i_number
)));
1023 mutex_enter(&ip
->i_tlock
);
1024 ip
->i_flag
&= ~IBDWRITE
;
1025 mutex_exit(&ip
->i_tlock
);
1030 #define SINGLE 0 /* index of single indirect block */
1031 #define DOUBLE 1 /* index of double indirect block */
1032 #define TRIPLE 2 /* index of triple indirect block */
1035 * Release blocks associated with the inode ip and
1036 * stored in the indirect block bn. Blocks are free'd
1037 * in LIFO order up to (but not including) lastbn. If
1038 * level is greater than SINGLE, the block is an indirect
1039 * block and recursive calls to indirtrunc must be used to
1040 * cleanse other indirect blocks.
1042 * N.B.: triple indirect blocks are untested.
1045 indirtrunc(struct inode
*ip
, daddr_t bn
, daddr_t lastbn
, int level
, int flags
)
1048 struct buf
*bp
, *copy
;
1050 struct ufsvfs
*ufsvfsp
= ip
->i_ufsvfs
;
1051 struct fs
*fs
= ufsvfsp
->vfs_fs
;
1054 int blocksreleased
= 0, nblocks
;
1056 ASSERT(RW_WRITE_HELD(&ip
->i_contents
));
1058 * Calculate index in current block of last
1059 * block to be kept. -1 indicates the entire
1060 * block so we need not calculate the index.
1063 for (i
= SINGLE
; i
< level
; i
++)
1064 factor
*= NINDIR(fs
);
1068 nblocks
= btodb(fs
->fs_bsize
);
1070 * Get buffer of block pointers, zero those
1071 * entries corresponding to blocks to be free'd,
1072 * and update on disk copy first.
1073 * *Unless* the root pointer has been synchronously
1074 * written to disk. If nothing points to this
1075 * indirect block then don't bother zero'ing and
1078 bp
= UFS_BREAD(ufsvfsp
,
1079 ip
->i_dev
, (daddr_t
)fsbtodb(fs
, bn
), (int)fs
->fs_bsize
);
1080 if (bp
->b_flags
& B_ERROR
) {
1084 bap
= bp
->b_un
.b_daddr
;
1085 if ((flags
& I_CHEAP
) == 0) {
1088 zb
= (uint_t
)((NINDIR(fs
) - (last
+ 1)) * sizeof (daddr32_t
));
1092 * push any data into the log before we zero it
1094 if (bp
->b_flags
& B_DELWRI
)
1095 TRANS_LOG(ufsvfsp
, (caddr_t
)bap
,
1096 ldbtob(bp
->b_blkno
), bp
->b_bcount
,
1097 bp
->b_un
.b_addr
, bp
->b_bcount
);
1098 copy
= ngeteblk(fs
->fs_bsize
);
1099 bcopy((caddr_t
)bap
, (caddr_t
)copy
->b_un
.b_daddr
,
1100 (uint_t
)fs
->fs_bsize
);
1101 bzero((caddr_t
)&bap
[last
+ 1], zb
);
1104 (caddr_t
)&bap
[last
+ 1] - (caddr_t
)bap
,
1107 UFS_BRWRITE(ufsvfsp
, bp
);
1108 bp
= copy
, bap
= bp
->b_un
.b_daddr
;
1111 /* make sure write retries are also cleared */
1112 bp
->b_flags
&= ~(B_DELWRI
| B_RETRYWRI
);
1113 bp
->b_flags
|= B_STALE
| B_AGE
;
1117 * Recursively free totally unused blocks.
1120 for (i
= NINDIR(fs
) - 1; i
> last
; i
--) {
1124 if (level
> SINGLE
) {
1126 indirtrunc(ip
, nb
, (daddr_t
)-1, level
- 1, flags
);
1127 free(ip
, nb
, (off_t
)fs
->fs_bsize
, flags
| I_IBLK
);
1129 free(ip
, nb
, (off_t
)fs
->fs_bsize
, flags
);
1130 blocksreleased
+= nblocks
;
1135 * Recursively free last partial block.
1137 if (level
> SINGLE
&& lastbn
>= 0) {
1138 last
= lastbn
% factor
;
1142 indirtrunc(ip
, nb
, last
, level
- 1, flags
);
1145 return (blocksreleased
);
1149 * Truncate the inode ip to at most length size.
1150 * Free affected disk blocks -- the blocks of the
1151 * file are removed in reverse order.
1153 * N.B.: triple indirect blocks are untested.
1155 static int i_genrand
= 1234;
1157 ufs_itrunc(struct inode
*oip
, uoff_t length
, int flags
, cred_t
*cr
)
1159 struct fs
*fs
= oip
->i_fs
;
1160 struct ufsvfs
*ufsvfsp
= oip
->i_ufsvfs
;
1165 daddr_t bn
, lastiblock
[NIADDR
];
1167 long nblocks
, blocksreleased
= 0;
1172 uoff_t maxoffset
= (ufsvfsp
->vfs_lfflags
& UFS_LARGEFILES
) ?
1173 (UFS_MAXOFFSET_T
) : (MAXOFF32_T
);
1176 * Shadow inodes do not need to hold the vfs_dqrwlock lock. Most
1177 * other uses need the reader lock. opendq() holds the writer lock.
1179 ASSERT((oip
->i_mode
& IFMT
) == IFSHAD
||
1180 RW_LOCK_HELD(&ufsvfsp
->vfs_dqrwlock
));
1181 ASSERT(RW_WRITE_HELD(&oip
->i_contents
));
1183 * We only allow truncation of regular files and directories
1184 * to arbitrary lengths here. In addition, we allow symbolic
1185 * links to be truncated only to zero length. Other inode
1186 * types cannot have their length set here. Disk blocks are
1187 * being dealt with - especially device inodes where
1188 * ip->i_ordev is actually being stored in ip->i_db[0]!
1190 TRANS_INODE(ufsvfsp
, oip
);
1191 mode
= oip
->i_mode
& IFMT
;
1192 if (flags
& I_FREE
) {
1193 i_genrand
*= 16843009; /* turns into shift and adds */
1195 oip
->i_gen
+= ((i_genrand
+ ddi_get_lbolt()) & 0xffff) + 1;
1196 oip
->i_flag
|= ICHG
|IUPD
;
1198 if (length
== oip
->i_size
)
1204 if (mode
!= IFREG
&& mode
!= IFDIR
&& mode
!= IFATTRDIR
&&
1205 !(mode
== IFLNK
&& length
== (offset_t
)0) && mode
!= IFSHAD
)
1207 if (length
> maxoffset
)
1209 if ((mode
== IFDIR
) || (mode
== IFATTRDIR
))
1213 if (oip
== ufsvfsp
->vfs_qinod
)
1215 if (length
== oip
->i_size
) {
1216 /* update ctime and mtime to please POSIX tests */
1217 oip
->i_flag
|= ICHG
|IUPD
;
1220 /* nothing to cache so clear the flag */
1221 oip
->i_flag
&= ~IFASTSYMLNK
;
1225 /* wipe out fast symlink till next access */
1226 if (oip
->i_flag
& IFASTSYMLNK
) {
1229 ASSERT(ITOV(oip
)->v_type
== VLNK
);
1231 oip
->i_flag
&= ~IFASTSYMLNK
;
1233 for (j
= 1; j
< NDADDR
; j
++)
1235 for (j
= 0; j
< NIADDR
; j
++)
1239 boff
= (int)blkoff(fs
, length
);
1241 if (length
> oip
->i_size
) {
1243 * Trunc up case. BMAPALLOC will insure that the right blocks
1244 * are allocated. This includes extending the old frag to a
1245 * full block (if needed) in addition to doing any work
1246 * needed for allocating the last block.
1249 err
= BMAPALLOC(oip
, length
- 1, (int)fs
->fs_bsize
, cr
);
1251 err
= BMAPALLOC(oip
, length
- 1, boff
, cr
);
1255 * Save old size and set inode's size now
1256 * so that we don't cause too much of the
1257 * file to be zero'd and pushed.
1259 uoff_t osize
= oip
->i_size
;
1260 oip
->i_size
= length
;
1262 * Make sure we zero out the remaining bytes of
1263 * the page in case a mmap scribbled on it. We
1264 * can't prevent a mmap from writing beyond EOF
1265 * on the last page of a file.
1268 if ((boff
= (int)blkoff(fs
, osize
)) != 0) {
1269 bsize
= (int)lblkno(fs
, osize
- 1) >= NDADDR
?
1270 fs
->fs_bsize
: fragroundup(fs
, boff
);
1271 pvn_vpzero(ITOV(oip
), osize
,
1272 (size_t)(bsize
- boff
));
1274 oip
->i_flag
|= ICHG
|IATTCHG
;
1278 * MAXOFF32_T is old 2GB size limit. If
1279 * this operation caused a large file to be
1280 * created, turn on the superblock flag
1281 * and update the superblock, if the flag
1282 * is not already on.
1284 if ((length
> (uoff_t
)MAXOFF32_T
) &&
1285 !(fs
->fs_flags
& FSLARGEFILES
)) {
1286 ASSERT(ufsvfsp
->vfs_lfflags
& UFS_LARGEFILES
);
1287 mutex_enter(&ufsvfsp
->vfs_lock
);
1288 fs
->fs_flags
|= FSLARGEFILES
;
1289 ufs_sbwrite(ufsvfsp
);
1290 mutex_exit(&ufsvfsp
->vfs_lock
);
1298 * Update the pages of the file. If the file is not being
1299 * truncated to a block boundary, the contents of the
1300 * pages following the end of the file must be zero'ed
1301 * in case it ever become accessible again because
1302 * of subsequent file growth.
1305 (void) pvn_vplist_dirty(ITOV(oip
), length
, ufs_putapage
,
1306 B_INVAL
| B_TRUNC
, CRED());
1309 * Make sure that the last block is properly allocated.
1310 * We only really have to do this if the last block is
1311 * actually allocated since ufs_bmap will now handle the case
1312 * of an fragment which has no block allocated. Just to
1313 * be sure, we do it now independent of current allocation.
1315 err
= BMAPALLOC(oip
, length
- 1, boff
, cr
);
1320 * BMAPALLOC will call bmap_write which defers i_seq
1321 * processing. If the timestamps were changed, update
1322 * i_seq before rdip drops i_contents or syncs the inode.
1324 if (oip
->i_flag
& (ICHG
|IUPD
))
1329 * Make sure that the relevant partial page appears in
1330 * the v_object's list, so that pvn_vpzero() will do its
1331 * job. Since doing this correctly requires everything
1332 * in rdip() except for the uiomove(), it's easier and
1333 * safer to do the uiomove() rather than duplicate the
1334 * rest of rdip() here.
1336 * To get here, we know that length indicates a byte
1337 * that is not the first byte of a block. (length - 1)
1338 * is the last actual byte known to exist. Deduction
1339 * shows it is in the same block as byte (length).
1340 * Thus, this rdip() invocation should always succeed
1341 * except in the face of i/o errors, and give us the
1342 * block we care about.
1344 * rdip() makes the same locking assertions and
1345 * assumptions as we do. We do not acquire any locks
1346 * before calling it, so we have not changed the locking
1347 * situation. Finally, there do not appear to be any
1348 * paths whereby rdip() ends up invoking us again.
1349 * Thus, infinite recursion is avoided.
1358 uio
.uio_loffset
= length
- 1;
1360 uio
.uio_segflg
= UIO_SYSSPACE
;
1361 uio
.uio_extflg
= UIO_COPY_CACHED
;
1363 iov
[0].iov_base
= &buffer
;
1366 err
= rdip(oip
, &uio
, UIO_READ
, NULL
);
1371 bsize
= (int)lblkno(fs
, length
- 1) >= NDADDR
?
1372 fs
->fs_bsize
: fragroundup(fs
, boff
);
1373 pvn_vpzero(ITOV(oip
), length
, (size_t)(bsize
- boff
));
1375 * Ensure full fs block is marked as dirty.
1377 (void) pvn_vplist_dirty(ITOV(oip
), length
+ (bsize
- boff
),
1378 ufs_putapage
, B_INVAL
| B_TRUNC
, CRED());
1382 * Calculate index into inode's block list of
1383 * last direct and indirect blocks (if any)
1384 * which we want to keep. Lastblock is -1 when
1385 * the file is truncated to 0.
1387 lastblock
= lblkno(fs
, length
+ fs
->fs_bsize
- 1) - 1;
1388 lastiblock
[SINGLE
] = lastblock
- NDADDR
;
1389 lastiblock
[DOUBLE
] = lastiblock
[SINGLE
] - NINDIR(fs
);
1390 lastiblock
[TRIPLE
] = lastiblock
[DOUBLE
] - NINDIR(fs
) * NINDIR(fs
);
1391 nblocks
= btodb(fs
->fs_bsize
);
1394 * Update file and block pointers
1395 * on disk before we start freeing blocks.
1396 * If we crash before free'ing blocks below,
1397 * the blocks will be returned to the free list.
1398 * lastiblock values are also normalized to -1
1399 * for calls to indirtrunc below.
1401 tip
= *oip
; /* structure copy */
1404 for (level
= TRIPLE
; level
>= SINGLE
; level
--)
1405 if (lastiblock
[level
] < 0) {
1406 oip
->i_ib
[level
] = 0;
1407 lastiblock
[level
] = -1;
1409 for (i
= NDADDR
- 1; i
> lastblock
; i
--) {
1413 oip
->i_size
= length
;
1414 oip
->i_flag
|= ICHG
|IUPD
|IATTCHG
;
1416 if (!TRANS_ISTRANS(ufsvfsp
))
1417 ufs_iupdat(oip
, I_SYNC
); /* do sync inode update */
1420 * Indirect blocks first.
1422 for (level
= TRIPLE
; level
>= SINGLE
; level
--) {
1423 bn
= ip
->i_ib
[level
];
1426 indirtrunc(ip
, bn
, lastiblock
[level
], level
, flags
);
1427 if (lastiblock
[level
] < 0) {
1428 ip
->i_ib
[level
] = 0;
1429 free(ip
, bn
, (off_t
)fs
->fs_bsize
,
1431 blocksreleased
+= nblocks
;
1434 if (lastiblock
[level
] >= 0)
1439 * All whole direct blocks or frags.
1441 for (i
= NDADDR
- 1; i
> lastblock
; i
--) {
1446 bsize
= (off_t
)blksize(fs
, ip
, i
);
1447 free(ip
, bn
, bsize
, flags
);
1448 blocksreleased
+= btodb(bsize
);
1454 * Finally, look for a change in size of the
1455 * last direct block; release any frags.
1457 bn
= ip
->i_db
[lastblock
];
1459 off_t oldspace
, newspace
;
1462 * Calculate amount of space we're giving
1463 * back as old block size minus new block size.
1465 oldspace
= blksize(fs
, ip
, lastblock
);
1466 UFS_SET_ISIZE(length
, ip
);
1467 newspace
= blksize(fs
, ip
, lastblock
);
1468 if (newspace
== 0) {
1469 err
= ufs_fault(ITOV(ip
), "ufs_itrunc: newspace == 0");
1472 if (oldspace
- newspace
> 0) {
1474 * Block number of space to be free'd is
1475 * the old block # plus the number of frags
1476 * required for the storage we're keeping.
1478 bn
+= numfrags(fs
, newspace
);
1479 free(ip
, bn
, oldspace
- newspace
, flags
);
1480 blocksreleased
+= btodb(oldspace
- newspace
);
1484 /* BEGIN PARANOIA */
1485 for (level
= SINGLE
; level
<= TRIPLE
; level
++)
1486 if (ip
->i_ib
[level
] != oip
->i_ib
[level
]) {
1487 err
= ufs_fault(ITOV(ip
), "ufs_itrunc: indirect block");
1491 for (i
= 0; i
< NDADDR
; i
++)
1492 if (ip
->i_db
[i
] != oip
->i_db
[i
]) {
1493 err
= ufs_fault(ITOV(ip
), "ufs_itrunc: direct block");
1497 oip
->i_blocks
-= blocksreleased
;
1499 if (oip
->i_blocks
< 0) { /* sanity */
1501 "ufs_itrunc: %s/%d new size = %lld, blocks = %d\n",
1502 fs
->fs_fsmnt
, (int)oip
->i_number
, oip
->i_size
,
1503 (int)oip
->i_blocks
);
1506 oip
->i_flag
|= ICHG
|IATTCHG
;
1508 /* blocksreleased is >= zero, so this can not fail */
1509 (void) chkdq(oip
, -blocksreleased
, 0, cr
, (char **)NULL
, NULL
);
1514 * Check mode permission on inode. Mode is READ, WRITE or EXEC.
1515 * In the case of WRITE, the read-only status of the file system
1516 * is checked. Depending on the calling user, the appropriate
1517 * mode bits are selected; privileges to override missing permission
1518 * bits are checked through secpolicy_vnode_access().
1519 * The i_contents lock must be held as reader here to prevent racing with
1520 * the acl subsystem removing/setting/changing acls on this inode.
1521 * The caller is responsible for indicating whether or not the i_contents
1522 * lock needs to be acquired here or if already held.
1525 ufs_iaccess(struct inode
*ip
, int mode
, struct cred
*cr
, int dolock
)
1531 rw_enter(&ip
->i_contents
, RW_READER
);
1532 ASSERT(RW_LOCK_HELD(&ip
->i_contents
));
1534 if (mode
& IWRITE
) {
1536 * Disallow write attempts on read-only
1537 * file systems, unless the file is a block
1538 * or character device or a FIFO.
1540 if (ip
->i_fs
->fs_ronly
!= 0) {
1541 if ((ip
->i_mode
& IFMT
) != IFCHR
&&
1542 (ip
->i_mode
& IFMT
) != IFBLK
&&
1543 (ip
->i_mode
& IFMT
) != IFIFO
) {
1550 * If there is an acl, check the acl and return.
1552 if (ip
->i_ufs_acl
&& ip
->i_ufs_acl
->aowner
) {
1553 ret
= ufs_acl_access(ip
, mode
, cr
);
1558 * Access check is based on only one of owner, group, public.
1559 * If not owner, then check group.
1560 * If not a member of the group, then check public access.
1562 if (crgetuid(cr
) != ip
->i_uid
) {
1564 if (!groupmember((uid_t
)ip
->i_gid
, cr
))
1568 /* test missing privilege bits */
1569 ret
= secpolicy_vnode_access2(cr
, ITOV(ip
), ip
->i_uid
,
1570 ip
->i_mode
<< shift
, mode
);
1573 rw_exit(&ip
->i_contents
);
1578 * if necessary, remove an inode from the free list
1579 * i_contents is held except at unmount
1581 * Return 1 if the inode is taken off of the ufs_idle_q,
1582 * and the caller is expected to call VN_RELE.
1584 * Return 0 otherwise.
1587 ufs_rmidle(struct inode
*ip
)
1591 mutex_enter(&ip
->i_tlock
);
1592 if ((ip
->i_flag
& IREF
) == 0) {
1593 mutex_enter(&ufs_idle_q
.uq_mutex
);
1594 ip
->i_freef
->i_freeb
= ip
->i_freeb
;
1595 ip
->i_freeb
->i_freef
= ip
->i_freef
;
1600 if (ip
->i_flag
& IJUNKIQ
) {
1602 ip
->i_flag
&= ~IJUNKIQ
;
1606 mutex_exit(&ufs_idle_q
.uq_mutex
);
1609 mutex_exit(&ip
->i_tlock
);
1614 * scan the hash of inodes and call func with the inode locked
1617 ufs_scan_inodes(int rwtry
, int (*func
)(struct inode
*, void *), void *arg
,
1618 struct ufsvfs
*ufsvfsp
)
1620 struct inode
*ip
; /* current inode */
1621 struct inode
*lip
= NULL
; /* last/previous inode */
1622 union ihead
*ih
; /* current hash chain */
1625 int lip_held
; /* lip needs a VN_RELE() */
1628 * If ufsvfsp is NULL, then our caller should be holding
1629 * ufs_scan_lock to avoid conflicts between ufs_unmount() and
1630 * ufs_update(). Otherwise, to avoid false-positives in
1631 * ufs_unmount()'s v_count-based EBUSY check, we only hold
1632 * those inodes that are in the file system our caller cares
1635 * We know that ip is a valid inode in the hash chain (and thus
1636 * we can trust i_ufsvfs) because the inode we chained from
1637 * (lip) is still in the hash chain. This is true because either:
1639 * 1. We did not drop the hash chain lock since the last
1640 * iteration (because we were not interested in the last inode),
1642 * 2. We maintained a hold on the last inode while we
1643 * we were processing it, so it could not be removed
1644 * from the hash chain.
1646 * The whole reason we're dropping and re-grabbing the chain
1647 * lock on every inode is so that we don't present a major
1648 * choke point on throughput, particularly when we've been
1649 * called on behalf of fsflush.
1652 for (i
= 0, ih
= ihead
; i
< inohsz
; i
++, ih
++) {
1653 mutex_enter(&ih_lock
[i
]);
1654 for (ip
= ih
->ih_chain
[0], lip_held
= 0;
1655 ip
!= (struct inode
*)ih
;
1658 ins
.in_scan
.value
.ul
++;
1661 * Undo the previous iteration's VN_HOLD(), but
1662 * only if one was done.
1668 if (ufsvfsp
!= NULL
&& ip
->i_ufsvfs
!= ufsvfsp
) {
1670 * We're not processing all inodes, and
1671 * this inode is not in the filesystem of
1672 * interest, so skip it. No need to do a
1673 * VN_HOLD() since we're not dropping the
1674 * hash chain lock until after we've
1675 * done the i_forw traversal above.
1682 mutex_exit(&ih_lock
[i
]);
1685 * Acquire the contents lock as writer to make
1686 * sure that the inode has been initialized in
1687 * the cache or removed from the idle list by
1688 * ufs_iget(). This works because ufs_iget()
1689 * acquires the contents lock before putting
1690 * the inode into the cache. If we can lock
1691 * it, then ufs_iget() is done with it.
1695 if (!rw_tryenter(&ip
->i_contents
, RW_WRITER
)) {
1696 mutex_enter(&ih_lock
[i
]);
1700 rw_enter(&ip
->i_contents
, RW_WRITER
);
1703 rw_exit(&ip
->i_contents
);
1706 * ISTALE means the inode couldn't be read
1708 * We don't have to hold the i_contents lock
1709 * for this check for a couple of
1710 * reasons. First, if ISTALE is set then the
1711 * flag cannot be cleared until the inode is
1712 * removed from the cache and that cannot
1713 * happen until after we VN_RELE() it.
1714 * Second, if ISTALE is not set, then the
1715 * inode is in the cache and does not need to
1716 * be read from disk so ISTALE cannot be set
1717 * while we are not looking.
1719 if ((ip
->i_flag
& ISTALE
) == 0) {
1720 if ((error
= (*func
)(ip
, arg
)) != 0)
1724 mutex_enter(&ih_lock
[i
]);
1728 mutex_exit(&ih_lock
[i
]);
1734 * Mark inode with the current time, plus a unique increment.
1736 * Since we only keep 32-bit time on disk, if UFS is still alive
1737 * beyond 2038, filesystem times will simply stick at the last
1738 * possible second of 32-bit time. Not ideal, but probably better
1739 * than going into the remote past, or confusing applications with
1743 ufs_imark(struct inode
*ip
)
1749 * The update of i_seq may have been deferred, increase i_seq here
1750 * to make sure it is in sync with the timestamps.
1752 if (ip
->i_flag
& ISEQ
) {
1753 ASSERT(ip
->i_flag
& (IUPD
|ICHG
));
1755 ip
->i_flag
&= ~ISEQ
;
1761 * Fast algorithm to convert nsec to usec -- see hrt2ts()
1762 * in kernel/os/timers.c for a full description.
1765 usec
= nsec
+ (nsec
>> 2);
1766 usec
= nsec
+ (usec
>> 1);
1767 usec
= nsec
+ (usec
>> 2);
1768 usec
= nsec
+ (usec
>> 4);
1769 usec
= nsec
- (usec
>> 3);
1770 usec
= nsec
+ (usec
>> 2);
1771 usec
= nsec
+ (usec
>> 3);
1772 usec
= nsec
+ (usec
>> 4);
1773 usec
= nsec
+ (usec
>> 1);
1774 usec
= nsec
+ (usec
>> 6);
1777 mutex_enter(&ufs_iuniqtime_lock
);
1778 if (now
.tv_sec
> (time_t)iuniqtime
.tv_sec
||
1779 usec
> iuniqtime
.tv_usec
) {
1780 if (now
.tv_sec
< TIME32_MAX
) {
1781 iuniqtime
.tv_sec
= (time32_t
)now
.tv_sec
;
1782 iuniqtime
.tv_usec
= usec
;
1785 if (iuniqtime
.tv_sec
< TIME32_MAX
) {
1786 iuniqtime
.tv_usec
++;
1787 /* Check for usec overflow */
1788 if (iuniqtime
.tv_usec
>= MICROSEC
) {
1790 iuniqtime
.tv_usec
= 0;
1795 if ((ip
->i_flag
& IACC
) && !(ip
->i_ufsvfs
->vfs_noatime
)) {
1796 ip
->i_atime
= iuniqtime
;
1798 if (ip
->i_flag
& IUPD
) {
1799 ip
->i_mtime
= iuniqtime
;
1800 ip
->i_flag
|= IMODTIME
;
1802 if (ip
->i_flag
& ICHG
) {
1804 ip
->i_ctime
= iuniqtime
;
1806 mutex_exit(&ufs_iuniqtime_lock
);
1810 * Update timestamps in inode.
1813 ufs_itimes_nolock(struct inode
*ip
)
1817 * if noatime is set and the inode access time is the only field that
1818 * must be changed, exit immediately.
1820 if (((ip
->i_flag
& (IUPD
|IACC
|ICHG
)) == IACC
) &&
1821 (ip
->i_ufsvfs
->vfs_noatime
)) {
1825 if (ip
->i_flag
& (IUPD
|IACC
|ICHG
)) {
1826 if (ip
->i_flag
& ICHG
)
1829 ip
->i_flag
|= IMODACC
;
1831 ip
->i_flag
&= ~(IACC
|IUPD
|ICHG
);