4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 /* copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
27 /* All Rights Reserved */
30 * Copyright (c) 2017 by Delphix. All rights reserved.
34 * Portions of this source code were derived from Berkeley 4.3 BSD
35 * under license from the Regents of the University of California.
38 #include <sys/types.h>
39 #include <sys/systm.h>
40 #include <sys/errno.h>
43 #include <sys/vnode.h>
46 #include <sys/callb.h>
47 #include <sys/cpuvar.h>
48 #include <sys/fs/ufs_inode.h>
49 #include <sys/fs/ufs_log.h>
50 #include <sys/fs/ufs_trans.h>
51 #include <sys/fs/ufs_acl.h>
52 #include <sys/fs/ufs_bio.h>
53 #include <sys/fs/ufs_fsdir.h>
54 #include <sys/debug.h>
55 #include <sys/cmn_err.h>
56 #include <sys/sysmacros.h>
59 extern pri_t minclsyspri
;
60 extern int hash2ints();
61 extern struct kmem_cache
*inode_cache
; /* cache of free inodes */
62 extern int ufs_idle_waiters
;
63 extern struct instats ins
;
65 static void ufs_attr_purge(struct inode
*);
68 * initialize a thread's queue struct
71 ufs_thread_init(struct ufs_q
*uq
, int lowat
)
73 bzero((caddr_t
)uq
, sizeof (*uq
));
74 cv_init(&uq
->uq_cv
, NULL
, CV_DEFAULT
, NULL
);
75 mutex_init(&uq
->uq_mutex
, NULL
, MUTEX_DEFAULT
, NULL
);
77 uq
->uq_hiwat
= 2 * lowat
;
78 uq
->uq_threadp
= NULL
;
82 * start a thread for a queue (assumes success)
85 ufs_thread_start(struct ufs_q
*uq
, void (*func
)(), struct vfs
*vfsp
)
87 mutex_enter(&uq
->uq_mutex
);
88 if (uq
->uq_threadp
== NULL
) {
89 uq
->uq_threadp
= thread_create(NULL
, 0, func
, vfsp
, 0, &p0
,
93 mutex_exit(&uq
->uq_mutex
);
97 * wait for the thread to exit
100 ufs_thread_exit(struct ufs_q
*uq
)
102 kt_did_t ufs_thread_did
= 0;
104 mutex_enter(&uq
->uq_mutex
);
105 uq
->uq_flags
&= ~(UQ_SUSPEND
| UQ_SUSPENDED
);
106 if (uq
->uq_threadp
!= NULL
) {
107 ufs_thread_did
= uq
->uq_threadp
->t_did
;
108 uq
->uq_flags
|= (UQ_EXIT
|UQ_WAIT
);
109 cv_broadcast(&uq
->uq_cv
);
111 mutex_exit(&uq
->uq_mutex
);
114 * It's safe to call thread_join() with an already-gone
115 * t_did, but we have to obtain it before the kernel
116 * thread structure is freed. We do so above under the
117 * protection of the uq_mutex when we're sure the thread
118 * still exists and it's safe to de-reference it.
119 * We also have to check if ufs_thread_did is != 0
120 * before calling thread_join() since thread 0 in the system
124 thread_join(ufs_thread_did
);
128 * wait for a thread to suspend itself on the caller's behalf
129 * the caller is responsible for continuing the thread
132 ufs_thread_suspend(struct ufs_q
*uq
)
134 mutex_enter(&uq
->uq_mutex
);
135 if (uq
->uq_threadp
!= NULL
) {
137 * wait while another thread is suspending this thread.
138 * no need to do a cv_broadcast(), as whoever suspended
139 * the thread must continue it at some point.
141 while ((uq
->uq_flags
& UQ_SUSPEND
) &&
142 (uq
->uq_threadp
!= NULL
)) {
144 * We can't use cv_signal() because if our
145 * signal doesn't happen to hit the desired
146 * thread but instead some other waiter like
147 * ourselves, we'll wait forever for a
148 * response. Well, at least an indeterminate
149 * amount of time until we just happen to get
150 * lucky from whoever did get signalled doing
151 * a cv_signal() of their own. This is an
152 * unfortunate performance lossage.
154 uq
->uq_flags
|= UQ_WAIT
;
155 cv_wait(&uq
->uq_cv
, &uq
->uq_mutex
);
158 uq
->uq_flags
|= (UQ_SUSPEND
| UQ_WAIT
);
161 * wait for the thread to suspend itself
163 if ((uq
->uq_flags
& UQ_SUSPENDED
) == 0 &&
164 (uq
->uq_threadp
!= NULL
)) {
165 cv_broadcast(&uq
->uq_cv
);
168 while (((uq
->uq_flags
& UQ_SUSPENDED
) == 0) &&
169 (uq
->uq_threadp
!= NULL
)) {
170 cv_wait(&uq
->uq_cv
, &uq
->uq_mutex
);
173 mutex_exit(&uq
->uq_mutex
);
177 * allow a thread to continue from a ufs_thread_suspend()
178 * This thread must be the same as the thread that called
179 * ufs_thread_suspend.
182 ufs_thread_continue(struct ufs_q
*uq
)
184 mutex_enter(&uq
->uq_mutex
);
185 uq
->uq_flags
&= ~(UQ_SUSPEND
| UQ_SUSPENDED
);
186 cv_broadcast(&uq
->uq_cv
);
187 mutex_exit(&uq
->uq_mutex
);
191 * some common code for managing a thread's execution
192 * uq is locked at entry and return
197 * Kind of a hack passing in the callb_cpr_t * here.
198 * It should really be part of the ufs_q structure.
199 * I did not put it in there because we are already in beta
200 * and I was concerned that changing ufs_inode.h to include
201 * callb.h might break something.
204 ufs_thread_run(struct ufs_q
*uq
, callb_cpr_t
*cprinfop
)
207 ASSERT(uq
->uq_ne
>= 0);
209 if (uq
->uq_flags
& UQ_SUSPEND
) {
210 uq
->uq_flags
|= UQ_SUSPENDED
;
211 } else if (uq
->uq_flags
& UQ_EXIT
) {
213 * exiting; empty the queue (may infinite loop)
217 uq
->uq_threadp
= NULL
;
218 if (uq
->uq_flags
& UQ_WAIT
) {
219 cv_broadcast(&uq
->uq_cv
);
221 uq
->uq_flags
&= ~(UQ_EXIT
| UQ_WAIT
);
222 CALLB_CPR_EXIT(cprinfop
);
224 } else if (uq
->uq_ne
>= uq
->uq_lowat
) {
226 * process a block of entries until below high water mark
228 return (uq
->uq_ne
- (uq
->uq_lowat
>> 1));
230 if (uq
->uq_flags
& UQ_WAIT
) {
231 uq
->uq_flags
&= ~UQ_WAIT
;
232 cv_broadcast(&uq
->uq_cv
);
234 CALLB_CPR_SAFE_BEGIN(cprinfop
);
235 cv_wait(&uq
->uq_cv
, &uq
->uq_mutex
);
236 CALLB_CPR_SAFE_END(cprinfop
, &uq
->uq_mutex
);
242 * The following routines implement the protocol for freeing the resources
243 * held by an idle and deleted inode.
246 ufs_delete(struct ufsvfs
*ufsvfsp
, struct inode
*ip
, int dolockfs
)
249 struct vnode
*vp
= ITOV(ip
);
252 int dorwlock
= ((ip
->i_mode
& IFMT
) == IFREG
);
256 struct ufs_q
*delq
= &ufsvfsp
->vfs_delete
;
257 struct ufs_delq_info
*delq_info
= &ufsvfsp
->vfs_delete_info
;
260 * Ignore if deletes are not allowed (wlock/hlock)
262 if (ULOCKFS_IS_NOIDEL(ITOUL(ip
))) {
263 mutex_enter(&delq
->uq_mutex
);
264 delq_info
->delq_unreclaimed_blocks
-= ip
->i_blocks
;
265 delq_info
->delq_unreclaimed_files
--;
266 mutex_exit(&delq
->uq_mutex
);
271 if ((vp
->v_count
> 1) || (ip
->i_mode
== 0)) {
272 mutex_enter(&delq
->uq_mutex
);
273 delq_info
->delq_unreclaimed_blocks
-= ip
->i_blocks
;
274 delq_info
->delq_unreclaimed_files
--;
275 mutex_exit(&delq
->uq_mutex
);
280 * If we are called as part of setting a fs lock, then only
281 * do part of the lockfs protocol. In other words, don't hang.
284 if (ufs_lockfs_begin(ufsvfsp
, &ulp
, ULOCKFS_DELETE_MASK
))
288 * check for recursive VOP call
290 if (curthread
->t_flag
& T_DONTBLOCK
) {
293 ulp
= &ufsvfsp
->vfs_ulockfs
;
294 curthread
->t_flag
|= T_DONTBLOCK
;
299 * Hold rwlock to synchronize with (nfs) writes
302 rw_enter(&ip
->i_rwlock
, RW_WRITER
);
305 * Delete the attribute directory.
307 if (ip
->i_oeftflag
!= 0) {
308 TRANS_BEGIN_CSYNC(ufsvfsp
, issync
, TOP_REMOVE
,
309 trans_size
= (int)TOP_REMOVE_SIZE(ip
));
310 rw_enter(&ip
->i_contents
, RW_WRITER
);
311 err
= ufs_iget(ip
->i_vfs
, ip
->i_oeftflag
,
314 rw_enter(&dp
->i_rwlock
, RW_WRITER
);
315 rw_enter(&dp
->i_contents
, RW_WRITER
);
316 dp
->i_flag
|= IUPD
|ICHG
;
318 TRANS_INODE(dp
->i_ufsvfs
, dp
);
322 * Should get rid of any negative cache entries that
323 * might be lingering, as well as ``.'' and
324 * ``..''. If we don't, the VN_RELE() below
325 * won't actually put dp on the delete queue
326 * and it'll hang out until someone forces it
327 * (lockfs -f, umount, ...). The only reliable
328 * way of doing this at the moment is to call
329 * dnlc_purge_vp(ITOV(dp)), which is unacceptably
330 * slow, so we'll just note the problem in this
333 dnlc_remove(ITOV(dp
), ".");
334 dnlc_remove(ITOV(dp
), "..");
336 if (!TRANS_ISTRANS(ufsvfsp
)) {
337 ufs_iupdat(dp
, I_SYNC
);
339 rw_exit(&dp
->i_contents
);
340 rw_exit(&dp
->i_rwlock
);
344 * Clear out attribute pointer
347 rw_exit(&ip
->i_contents
);
348 TRANS_END_CSYNC(ufsvfsp
, err
, issync
,
349 TOP_REMOVE
, trans_size
);
350 dnlc_remove(ITOV(ip
), XATTR_DIR_NAME
);
353 if ((ip
->i_mode
& IFMT
) == IFATTRDIR
) {
357 (void) TRANS_ITRUNC(ip
, (u_offset_t
)0, I_FREE
| I_ACCT
, CRED());
360 * the inode's space has been freed; now free the inode
363 trans_size
= TOP_IFREE_SIZE(ip
);
364 TRANS_BEGIN_ASYNC(ufsvfsp
, TOP_IFREE
, trans_size
);
366 rw_enter(&ufsvfsp
->vfs_dqrwlock
, RW_READER
);
367 rw_enter(&ip
->i_contents
, RW_WRITER
);
368 TRANS_INODE(ufsvfsp
, ip
);
375 (void) ufs_si_free(ip
->i_ufs_acl
, vp
->v_vfsp
, CRED());
376 ip
->i_ufs_acl
= NULL
;
381 * This inode is torn down but still retains its identity
382 * (inode number). It could get recycled soon so it's best
383 * to clean up the vnode just in case.
385 mutex_enter(&vp
->v_lock
);
387 mutex_exit(&vp
->v_lock
);
392 ufs_ifree(ip
, ip
->i_number
, mode
);
394 * release quota resources; can't fail
396 (void) chkiq((struct ufsvfs
*)vp
->v_vfsp
->vfs_data
,
397 /* change */ -1, ip
, (uid_t
)ip
->i_uid
, 0, CRED(),
398 (char **)NULL
, (size_t *)NULL
);
401 ip
->i_flag
&= ~(IDEL
| IDIRECTIO
);
403 if (!TRANS_ISTRANS(ufsvfsp
)) {
404 ufs_iupdat(ip
, I_SYNC
);
406 mutex_enter(&delq
->uq_mutex
);
407 delq_info
->delq_unreclaimed_files
--;
408 mutex_exit(&delq
->uq_mutex
);
410 rw_exit(&ip
->i_contents
);
411 rw_exit(&ufsvfsp
->vfs_dqrwlock
);
413 rw_exit(&ip
->i_rwlock
);
420 TRANS_END_ASYNC(ufsvfsp
, TOP_IFREE
, trans_size
);
424 curthread
->t_flag
&= ~T_DONTBLOCK
;
429 * Create the delete thread and init the delq_info for this fs
432 ufs_delete_init(struct ufsvfs
*ufsvfsp
, int lowat
)
434 struct ufs_delq_info
*delq_info
= &ufsvfsp
->vfs_delete_info
;
436 ufs_thread_init(&ufsvfsp
->vfs_delete
, lowat
);
437 (void) memset((void *)delq_info
, 0, sizeof (*delq_info
));
441 * thread that frees up deleted inodes
444 ufs_thread_delete(struct vfs
*vfsp
)
446 struct ufsvfs
*ufsvfsp
= (struct ufsvfs
*)vfsp
->vfs_data
;
447 struct ufs_q
*uq
= &ufsvfsp
->vfs_delete
;
452 CALLB_CPR_INIT(&cprinfo
, &uq
->uq_mutex
, callb_generic_cpr
,
455 mutex_enter(&uq
->uq_mutex
);
458 * Sleep until there is work to do. Only do one entry at
459 * a time, to reduce the wait time for checking for a suspend
460 * request. The ?: is for pedantic portability.
462 ne
= ufs_thread_run(uq
, &cprinfo
) ? 1 : 0;
465 * process an entry, if there are any
467 if (ne
&& (ip
= uq
->uq_ihead
)) {
469 * process first entry on queue. Assumed conditions are:
470 * ip is held (v_count >= 1)
471 * ip is referenced (i_flag & IREF)
472 * ip is free (i_nlink <= 0)
474 if ((uq
->uq_ihead
= ip
->i_freef
) == ip
)
476 ip
->i_freef
->i_freeb
= ip
->i_freeb
;
477 ip
->i_freeb
->i_freef
= ip
->i_freef
;
481 mutex_exit(&uq
->uq_mutex
);
482 ufs_delete(ufsvfsp
, ip
, 1);
483 mutex_enter(&uq
->uq_mutex
);
489 * drain ne entries off the delete queue. As new queue entries may
490 * be added while we're working, ne is interpreted as follows:
492 * ne > 0 => remove up to ne entries
493 * ne == 0 => remove all entries currently on the queue
494 * ne == -1 => remove entries until the queue is empty
497 ufs_delete_drain(struct vfs
*vfsp
, int ne
, int dolockfs
)
499 struct ufsvfs
*ufsvfsp
= (struct ufsvfs
*)vfsp
->vfs_data
;
506 * if forcibly unmounted; ignore
511 uq
= &ufsvfsp
->vfs_delete
;
512 mutex_enter(&uq
->uq_mutex
);
514 drain_cnt
= uq
->uq_ne
;
519 * process up to ne entries
523 while (!done
&& (ip
= uq
->uq_ihead
)) {
526 if (ne
!= -1 && drain_cnt
== 0)
528 if ((uq
->uq_ihead
= ip
->i_freef
) == ip
)
530 ip
->i_freef
->i_freeb
= ip
->i_freeb
;
531 ip
->i_freeb
->i_freef
= ip
->i_freef
;
535 mutex_exit(&uq
->uq_mutex
);
536 ufs_delete(ufsvfsp
, ip
, dolockfs
);
537 mutex_enter(&uq
->uq_mutex
);
539 mutex_exit(&uq
->uq_mutex
);
543 ufs_sync_with_thread(struct ufs_q
*uq
)
545 mutex_enter(&uq
->uq_mutex
);
548 * Wake up delete thread to free up space.
550 if ((uq
->uq_flags
& UQ_WAIT
) == 0) {
551 uq
->uq_flags
|= UQ_WAIT
;
552 cv_broadcast(&uq
->uq_cv
);
555 while ((uq
->uq_threadp
!= NULL
) && (uq
->uq_flags
& UQ_WAIT
)) {
556 cv_wait(&uq
->uq_cv
, &uq
->uq_mutex
);
559 mutex_exit(&uq
->uq_mutex
);
563 * Get rid of everything that's currently in the delete queue,
564 * plus whatever the delete thread is working on at the moment.
566 * This ability is required for providing true POSIX semantics
567 * regarding close(2), unlink(2), etc, even when logging is enabled.
568 * The standard requires that the released space be immediately
569 * observable (statvfs(2)) and allocatable (e.g., write(2)).
572 ufs_delete_drain_wait(struct ufsvfs
*ufsvfsp
, int dolockfs
)
574 struct ufs_q
*uq
= &ufsvfsp
->vfs_delete
;
576 struct ufs_q
*delq
= &ufsvfsp
->vfs_delete
;
577 struct ufs_delq_info
*delq_info
= &ufsvfsp
->vfs_delete_info
;
580 * If there is something on delq or delete thread
583 mutex_enter(&delq
->uq_mutex
);
584 if (delq_info
->delq_unreclaimed_files
> 0) {
585 mutex_exit(&delq
->uq_mutex
);
586 (void) ufs_delete_drain(ufsvfsp
->vfs_vfs
, 0, dolockfs
);
587 ufs_sync_with_thread(uq
);
589 ASSERT(delq_info
->delq_unreclaimed_files
== 0);
590 mutex_exit(&delq
->uq_mutex
);
595 * Commit any outstanding transactions to make sure
596 * any canceled freed blocks are available for allocation.
598 curthread
->t_flag
|= T_DONTBLOCK
;
599 TRANS_BEGIN_SYNC(ufsvfsp
, TOP_COMMIT_UPDATE
, TOP_COMMIT_SIZE
, error
);
601 TRANS_END_SYNC(ufsvfsp
, error
, TOP_COMMIT_UPDATE
,
604 curthread
->t_flag
&= ~T_DONTBLOCK
;
608 * Adjust the resource usage in a struct statvfs based on
609 * what's in the delete queue.
611 * We do not consider the impact of ACLs or extended attributes
612 * that may be deleted as a side-effect of deleting a file.
613 * Those are metadata, and their sizes aren't reflected in the
614 * sizes returned by stat(), so this is not a problem.
617 ufs_delete_adjust_stats(struct ufsvfs
*ufsvfsp
, struct statvfs64
*sp
)
619 struct ufs_q
*uq
= &ufsvfsp
->vfs_delete
;
620 struct ufs_delq_info
*delq_info
= &ufsvfsp
->vfs_delete_info
;
622 mutex_enter(&uq
->uq_mutex
);
624 * The blocks accounted for in the delete queue info are
625 * counted in DEV_BSIZE chunks, but ufs_statvfs counts in
626 * filesystem fragments, so a conversion is required here.
628 sp
->f_bfree
+= dbtofsb(ufsvfsp
->vfs_fs
,
629 delq_info
->delq_unreclaimed_blocks
);
630 sp
->f_ffree
+= delq_info
->delq_unreclaimed_files
;
631 mutex_exit(&uq
->uq_mutex
);
636 * The following routines implement the protocol for maintaining an
637 * LRU list of idle inodes and for moving the idle inodes to the
638 * reuse list when the number of allocated inodes exceeds the user
639 * tunable high-water mark (ufs_ninode).
643 * clean an idle inode and move it to the reuse list
646 ufs_idle_free(struct inode
*ip
)
651 struct ufsvfs
*ufsvfsp
= ip
->i_ufsvfs
;
652 struct vnode
*vp
= ITOV(ip
);
653 int vn_has_data
, vn_modified
;
660 * remember `pages' for stats below
662 pages
= (ip
->i_mode
&& vn_has_cached_data(vp
) && vp
->v_type
!= VCHR
);
665 * start the dirty pages to disk and then invalidate them
666 * unless the inode is invalid (ISTALE)
668 if ((ip
->i_flag
& ISTALE
) == 0) {
669 (void) TRANS_SYNCIP(ip
, B_ASYNC
, I_ASYNC
, TOP_SYNCIP_FREE
);
670 (void) TRANS_SYNCIP(ip
,
671 (TRANS_ISERROR(ufsvfsp
)) ? B_INVAL
| B_FORCE
: B_INVAL
,
672 I_ASYNC
, TOP_SYNCIP_FREE
);
676 * wait for any current ufs_iget to finish and block future ufs_igets
678 ASSERT(ip
->i_number
!= 0);
679 hno
= INOHASH(ip
->i_number
);
684 * It must be guaranteed that v_count >= 2, otherwise
685 * something must be wrong with this vnode already.
686 * That is why we use VN_RELE_LOCKED() instead of VN_RELE().
687 * Acquire the vnode lock in case another thread is in
690 mutex_enter(&vp
->v_lock
);
692 VERIFY3U(vp
->v_count
, >=, 2);
696 vn_has_data
= (vp
->v_type
!= VCHR
&& vn_has_cached_data(vp
));
697 vn_modified
= (ip
->i_flag
& (IMOD
|IMODACC
|IACC
|ICHG
|IUPD
|IATTCHG
));
699 if (vp
->v_count
!= 1 ||
700 ((vn_has_data
|| vn_modified
) &&
701 ((ip
->i_flag
& ISTALE
) == 0))) {
703 * Another thread has referenced this inode while
704 * we are trying to free it. Call VN_RELE() to
705 * release our reference, if v_count > 1 data is
706 * present or one of the modified etc. flags was
707 * set, whereby ISTALE wasn't set.
708 * If we'd proceed with ISTALE set here, we might
709 * get ourselves into a deadlock situation.
711 mutex_exit(&vp
->v_lock
);
716 * The inode is currently unreferenced and can not
717 * acquire further references because it has no pages
718 * and the hash is locked. Inodes acquire references
719 * via the hash list or via their pages.
722 mutex_exit(&vp
->v_lock
);
725 * remove it from the cache
730 * Stale inodes have no valid ufsvfs
732 if ((ip
->i_flag
& ISTALE
) == 0 && ip
->i_dquot
) {
733 TRANS_DQRELE(ufsvfsp
, ip
->i_dquot
);
736 if ((ip
->i_flag
& ISTALE
) &&
739 * ISTALE inodes may have data
740 * and this data needs to be
743 (void) pvn_vplist_dirty(vp
, (u_offset_t
)0,
744 ufs_putapage
, B_INVAL
| B_TRUNC
,
745 (struct cred
*)NULL
);
749 CPU_STATS_ADDQ(CPU
, sys
, ufsipage
, 1);
751 CPU_STATS_ADDQ(CPU
, sys
, ufsinopage
, 1);
753 ASSERT((vp
->v_type
== VCHR
) || !vn_has_cached_data(vp
));
756 * We had better not have a vnode reference count > 1
757 * at this point, if we do then something is broken as
758 * this inode/vnode acquired a reference underneath us.
760 ASSERT(vp
->v_count
== 1);
767 * this thread processes the global idle queue
769 iqhead_t
*ufs_junk_iq
;
770 iqhead_t
*ufs_useful_iq
;
771 int ufs_njunk_iq
= 0;
772 int ufs_nuseful_iq
= 0;
775 struct ufs_q ufs_idle_q
;
778 ufs_thread_idle(void)
784 ufs_niqhash
= (ufs_idle_q
.uq_lowat
>> 1) / IQHASHQLEN
;
785 ufs_niqhash
= 1 << highbit(ufs_niqhash
); /* round up to power of 2 */
786 ufs_iqhashmask
= ufs_niqhash
- 1;
787 ufs_junk_iq
= kmem_alloc(ufs_niqhash
* sizeof (*ufs_junk_iq
),
789 ufs_useful_iq
= kmem_alloc(ufs_niqhash
* sizeof (*ufs_useful_iq
),
792 /* Initialize hash queue headers */
793 for (i
= 0; i
< ufs_niqhash
; i
++) {
794 ufs_junk_iq
[i
].i_freef
= (inode_t
*)&ufs_junk_iq
[i
];
795 ufs_junk_iq
[i
].i_freeb
= (inode_t
*)&ufs_junk_iq
[i
];
796 ufs_useful_iq
[i
].i_freef
= (inode_t
*)&ufs_useful_iq
[i
];
797 ufs_useful_iq
[i
].i_freeb
= (inode_t
*)&ufs_useful_iq
[i
];
800 CALLB_CPR_INIT(&cprinfo
, &ufs_idle_q
.uq_mutex
, callb_generic_cpr
,
804 * Whenever the idle thread is awakened, it repeatedly gives
805 * back half of the idle queue until the idle queue falls
808 mutex_enter(&ufs_idle_q
.uq_mutex
);
809 if (ufs_idle_q
.uq_ne
< ufs_idle_q
.uq_lowat
) {
810 CALLB_CPR_SAFE_BEGIN(&cprinfo
);
811 cv_wait(&ufs_idle_q
.uq_cv
, &ufs_idle_q
.uq_mutex
);
812 CALLB_CPR_SAFE_END(&cprinfo
, &ufs_idle_q
.uq_mutex
);
814 mutex_exit(&ufs_idle_q
.uq_mutex
);
817 * Give back 1/2 of the idle queue
819 ne
= ufs_idle_q
.uq_ne
>> 1;
820 ins
.in_tidles
.value
.ul
+= ne
;
826 * Reclaim callback for ufs inode cache.
827 * Invoked by the kernel memory allocator when memory gets tight.
831 ufs_inode_cache_reclaim(void *cdrarg
)
834 * If we are low on memory and the idle queue is over its
835 * halfway mark, then free 50% of the idle q
837 * We don't free all of the idle inodes because the inodes
838 * for popular NFS files may have been kicked from the dnlc.
839 * The inodes for these files will end up on the idle queue
840 * after every NFS access.
842 * If we repeatedly push them from the idle queue then
843 * NFS users may be unhappy as an extra buf cache operation
844 * is incurred for every NFS operation to these files.
846 * It's not common, but I have seen it happen.
849 if (ufs_idle_q
.uq_ne
< (ufs_idle_q
.uq_lowat
>> 1))
851 mutex_enter(&ufs_idle_q
.uq_mutex
);
852 cv_broadcast(&ufs_idle_q
.uq_cv
);
853 mutex_exit(&ufs_idle_q
.uq_mutex
);
857 * Free up some idle inodes
860 ufs_idle_some(int ne
)
865 static int junk_rotor
= 0;
866 static int useful_rotor
= 0;
868 for (i
= 0; i
< ne
; ++i
) {
869 mutex_enter(&ufs_idle_q
.uq_mutex
);
872 while (ufs_junk_iq
[junk_rotor
].i_freef
==
873 (inode_t
*)&ufs_junk_iq
[junk_rotor
]) {
874 junk_rotor
= IQNEXT(junk_rotor
);
876 ip
= ufs_junk_iq
[junk_rotor
].i_freef
;
877 ASSERT(ip
->i_flag
& IJUNKIQ
);
878 } else if (ufs_nuseful_iq
) {
879 while (ufs_useful_iq
[useful_rotor
].i_freef
==
880 (inode_t
*)&ufs_useful_iq
[useful_rotor
]) {
881 useful_rotor
= IQNEXT(useful_rotor
);
883 ip
= ufs_useful_iq
[useful_rotor
].i_freef
;
884 ASSERT(!(ip
->i_flag
& IJUNKIQ
));
886 mutex_exit(&ufs_idle_q
.uq_mutex
);
895 mutex_exit(&ufs_idle_q
.uq_mutex
);
896 rw_enter(&ip
->i_contents
, RW_WRITER
);
898 * VN_RELE should not be called if
899 * ufs_rmidle returns true, as it will
900 * effectively be done in ufs_idle_free.
902 if (ufs_rmidle(ip
)) {
903 rw_exit(&ip
->i_contents
);
906 rw_exit(&ip
->i_contents
);
913 * drain entries for vfsp from the idle queue
914 * vfsp == NULL means drain the entire thing
917 ufs_idle_drain(struct vfs
*vfsp
)
919 struct inode
*ip
, *nip
;
920 struct inode
*ianchor
= NULL
;
923 mutex_enter(&ufs_idle_q
.uq_mutex
);
925 /* for each hash q */
926 for (i
= 0; i
< ufs_niqhash
; i
++) {
927 /* search down the hash q */
928 for (ip
= ufs_junk_iq
[i
].i_freef
;
929 ip
!= (inode_t
*)&ufs_junk_iq
[i
];
931 if (ip
->i_vfs
== vfsp
|| vfsp
== NULL
) {
932 /* found a matching entry */
934 mutex_exit(&ufs_idle_q
.uq_mutex
);
935 rw_enter(&ip
->i_contents
, RW_WRITER
);
937 * See comments in ufs_idle_some()
938 * as we will call ufs_idle_free()
939 * after scanning both queues.
941 if (ufs_rmidle(ip
)) {
942 rw_exit(&ip
->i_contents
);
943 ip
->i_freef
= ianchor
;
946 rw_exit(&ip
->i_contents
);
949 /* restart this hash q */
950 ip
= (inode_t
*)&ufs_junk_iq
[i
];
951 mutex_enter(&ufs_idle_q
.uq_mutex
);
956 if (ufs_nuseful_iq
) {
957 /* for each hash q */
958 for (i
= 0; i
< ufs_niqhash
; i
++) {
959 /* search down the hash q */
960 for (ip
= ufs_useful_iq
[i
].i_freef
;
961 ip
!= (inode_t
*)&ufs_useful_iq
[i
];
963 if (ip
->i_vfs
== vfsp
|| vfsp
== NULL
) {
964 /* found a matching entry */
966 mutex_exit(&ufs_idle_q
.uq_mutex
);
967 rw_enter(&ip
->i_contents
, RW_WRITER
);
969 * See comments in ufs_idle_some()
970 * as we will call ufs_idle_free()
971 * after scanning both queues.
973 if (ufs_rmidle(ip
)) {
974 rw_exit(&ip
->i_contents
);
975 ip
->i_freef
= ianchor
;
978 rw_exit(&ip
->i_contents
);
981 /* restart this hash q */
982 ip
= (inode_t
*)&ufs_useful_iq
[i
];
983 mutex_enter(&ufs_idle_q
.uq_mutex
);
989 mutex_exit(&ufs_idle_q
.uq_mutex
);
990 /* no more matching entries, release those we have found (if any) */
991 for (ip
= ianchor
; ip
; ip
= nip
) {
999 * RECLAIM DELETED INODES
1000 * The following thread scans the file system once looking for deleted files
1003 ufs_thread_reclaim(struct vfs
*vfsp
)
1005 struct ufsvfs
*ufsvfsp
= (struct ufsvfs
*)vfsp
->vfs_data
;
1006 struct ufs_q
*uq
= &ufsvfsp
->vfs_reclaim
;
1007 struct fs
*fs
= ufsvfsp
->vfs_fs
;
1014 callb_cpr_t cprinfo
;
1016 CALLB_CPR_INIT(&cprinfo
, &uq
->uq_mutex
, callb_generic_cpr
,
1020 * mount decided that we don't need a reclaim thread
1022 if ((fs
->fs_reclaim
& FS_RECLAIMING
) == 0)
1026 * don't reclaim if readonly
1031 for (ino
= 0; ino
< (fs
->fs_ncg
* fs
->fs_ipg
) && !err
; ++ino
) {
1034 * Check whether we are the target of another
1035 * thread having called ufs_thread_exit() or
1036 * ufs_thread_suspend().
1038 mutex_enter(&uq
->uq_mutex
);
1040 if (uq
->uq_flags
& UQ_EXIT
) {
1042 mutex_exit(&uq
->uq_mutex
);
1044 } else if (uq
->uq_flags
& UQ_SUSPEND
) {
1045 uq
->uq_flags
|= UQ_SUSPENDED
;
1047 * Release the buf before we cv_wait()
1048 * otherwise we may deadlock with the
1049 * thread that called ufs_thread_suspend().
1055 if (uq
->uq_flags
& UQ_WAIT
) {
1056 uq
->uq_flags
&= ~UQ_WAIT
;
1057 cv_broadcast(&uq
->uq_cv
);
1059 CALLB_CPR_SAFE_BEGIN(&cprinfo
);
1060 cv_wait(&uq
->uq_cv
, &uq
->uq_mutex
);
1061 CALLB_CPR_SAFE_END(&cprinfo
, &uq
->uq_mutex
);
1064 mutex_exit(&uq
->uq_mutex
);
1067 * if we don't already have the buf; get it
1069 bno
= fsbtodb(fs
, itod(fs
, ino
));
1070 if ((bp
== 0) || (bp
->b_blkno
!= bno
)) {
1073 bp
= UFS_BREAD(ufsvfsp
,
1074 ufsvfsp
->vfs_dev
, bno
, fs
->fs_bsize
);
1075 bp
->b_flags
|= B_AGE
;
1077 if (bp
->b_flags
& B_ERROR
) {
1082 * nlink <= 0 and mode != 0 means deleted
1084 dp
= (struct dinode
*)bp
->b_un
.b_addr
+ itoo(fs
, ino
);
1085 if ((dp
->di_nlink
<= 0) && (dp
->di_mode
!= 0)) {
1087 * can't hold the buf (deadlock)
1091 rw_enter(&ufsvfsp
->vfs_dqrwlock
, RW_READER
);
1093 * iget/iput sequence will put inode on ifree
1094 * thread queue if it is idle. This is a nop
1095 * for busy (open, deleted) inodes
1097 if (ufs_iget(vfsp
, ino
, &ip
, CRED()))
1101 rw_exit(&ufsvfsp
->vfs_dqrwlock
);
1109 * reset the reclaiming-bit
1111 mutex_enter(&ufsvfsp
->vfs_lock
);
1112 fs
->fs_reclaim
&= ~FS_RECLAIMING
;
1113 mutex_exit(&ufsvfsp
->vfs_lock
);
1114 TRANS_SBWRITE(ufsvfsp
, TOP_SBWRITE_RECLAIM
);
1118 * exit the reclaim thread
1120 mutex_enter(&uq
->uq_mutex
);
1121 uq
->uq_threadp
= NULL
;
1122 uq
->uq_flags
&= ~UQ_WAIT
;
1123 cv_broadcast(&uq
->uq_cv
);
1124 CALLB_CPR_EXIT(&cprinfo
);
1129 * hlock the file systems whose logs have device errors
1131 struct ufs_q ufs_hlock
;
1134 ufs_thread_hlock(void *ignore
)
1137 callb_cpr_t cprinfo
;
1139 CALLB_CPR_INIT(&cprinfo
, &ufs_hlock
.uq_mutex
, callb_generic_cpr
,
1144 * sleep until there is work to do
1146 mutex_enter(&ufs_hlock
.uq_mutex
);
1147 (void) ufs_thread_run(&ufs_hlock
, &cprinfo
);
1148 ufs_hlock
.uq_ne
= 0;
1149 mutex_exit(&ufs_hlock
.uq_mutex
);
1151 * hlock the error'ed fs's
1152 * retry after a bit if another app is doing lockfs stuff
1155 retry
= ufs_trans_hlock();
1157 mutex_enter(&ufs_hlock
.uq_mutex
);
1158 CALLB_CPR_SAFE_BEGIN(&cprinfo
);
1159 (void) cv_reltimedwait(&ufs_hlock
.uq_cv
,
1160 &ufs_hlock
.uq_mutex
, hz
, TR_CLOCK_TICK
);
1161 CALLB_CPR_SAFE_END(&cprinfo
,
1162 &ufs_hlock
.uq_mutex
);
1163 mutex_exit(&ufs_hlock
.uq_mutex
);
1170 ufs_attr_purge(struct inode
*dp
)
1174 off_t dirsize
; /* size of the directory */
1175 off_t offset
; /* offset in the directory */
1176 int entryoffsetinblk
; /* offset of ep in fbp's buffer */
1178 struct fbuf
*fbp
; /* pointer to directory block */
1179 struct direct
*ep
; /* directory entry */
1182 struct ufsvfs
*ufsvfsp
= dp
->i_ufsvfs
;
1184 rw_enter(&ufsvfsp
->vfs_dqrwlock
, RW_READER
);
1187 dirsize
= roundup(dp
->i_size
, DIRBLKSIZ
);
1189 entryoffsetinblk
= 0;
1192 * Purge directory cache
1195 dnlc_dir_purge(&dp
->i_danchor
);
1197 while (offset
< dirsize
) {
1199 * If offset is on a block boundary,
1200 * read the next directory block.
1201 * Release previous if it exists.
1203 if (blkoff(dp
->i_fs
, offset
) == 0) {
1205 fbrelse(fbp
, S_OTHER
);
1208 err
= blkatoff(dp
, offset
, (char **)0, &fbp
);
1212 entryoffsetinblk
= 0;
1214 ep
= (struct direct
*)(fbp
->fb_addr
+ entryoffsetinblk
);
1215 if (ep
->d_ino
== 0 || (ep
->d_name
[0] == '.' &&
1216 ep
->d_name
[1] == '\0') ||
1217 (ep
->d_name
[0] == '.' && ep
->d_name
[1] == '.' &&
1218 ep
->d_name
[2] == '\0')) {
1220 entryoffsetinblk
+= ep
->d_reclen
;
1224 if ((err
= ufs_iget(dp
->i_vfs
, ep
->d_ino
,
1225 &tp
, CRED())) != 0) {
1229 TRANS_BEGIN_CSYNC(ufsvfsp
, issync
, TOP_REMOVE
,
1230 trans_size
= (int)TOP_REMOVE_SIZE(tp
));
1236 dnlc_remove(ITOV(dp
), ep
->d_name
);
1238 rw_enter(&tp
->i_contents
, RW_WRITER
);
1241 TRANS_INODE(tp
->i_ufsvfs
, tp
);
1245 rw_exit(&tp
->i_contents
);
1248 entryoffsetinblk
+= ep
->d_reclen
;
1249 TRANS_END_CSYNC(ufsvfsp
, error
,
1250 issync
, TOP_REMOVE
, trans_size
);
1253 offset
+= ep
->d_reclen
;
1257 fbrelse(fbp
, S_OTHER
);
1261 rw_exit(&ufsvfsp
->vfs_dqrwlock
);