4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2017, Joyent, Inc.
25 * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
26 * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
29 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
30 /* All Rights Reserved */
33 * University Copyright- Copyright (c) 1982, 1986, 1988
34 * The Regents of the University of California
37 * University Acknowledgment- Portions of this document are derived from
38 * software developed by the University of California, Berkeley, and its
42 #include <sys/types.h>
43 #include <sys/param.h>
44 #include <sys/t_lock.h>
45 #include <sys/errno.h>
50 #include <sys/pathname.h>
51 #include <sys/atomic.h>
53 #include <sys/vnode.h>
54 #include <sys/vnode_dispatch.h>
55 #include <sys/rwstlock.h>
60 #include <sys/sysmacros.h>
61 #include <sys/cmn_err.h>
62 #include <sys/systm.h>
64 #include <sys/debug.h>
66 #include <sys/nbmlock.h>
67 #include <sys/fcntl.h>
68 #include <sys/fs_subr.h>
69 #include <sys/taskq.h>
70 #include <sys/fs_reparse.h>
74 /* Determine if this vnode is a file that is read-only */
75 #define ISROFILE(vp) \
76 ((vp)->v_type != VCHR && (vp)->v_type != VBLK && \
77 (vp)->v_type != VFIFO && vn_is_readonly(vp))
79 /* Tunable via /etc/system; used only by admin/install */
80 int nfs_global_client_only
;
83 * Array of vopstats_t for per-FS-type vopstats. This array has the same
84 * number of entries as and parallel to the vfssw table. (Arguably, it could
85 * be part of the vfssw table.) Once it's initialized, it's accessed using
86 * the same fstype index that is used to index into the vfssw table.
88 vopstats_t
**vopstats_fstype
;
90 /* vopstats initialization template used for fast initialization via bcopy() */
91 static vopstats_t
*vs_templatep
;
93 /* Kmem cache handle for vsk_anchor_t allocations */
94 kmem_cache_t
*vsk_anchor_cache
;
96 /* file events cleanup routine */
97 extern void free_fopdata(vnode_t
*);
100 * Root of AVL tree for the kstats associated with vopstats. Lock protects
101 * updates to vsktat_tree.
103 avl_tree_t vskstat_tree
;
104 kmutex_t vskstat_tree_lock
;
106 /* Global variable which enables/disables the vopstats collection */
107 int vopstats_enabled
= 1;
109 /* Global used for empty/invalid v_path */
110 char *vn_vpath_empty
= "";
113 * forward declarations for internal vnode specific data (vsd)
115 static void *vsd_realloc(void *, size_t, size_t);
118 * forward declarations for reparse point functions
120 static int fs_reparse_mark(char *target
, vattr_t
*vap
, xvattr_t
*xvattr
);
123 * VSD -- VNODE SPECIFIC DATA
124 * The v_data pointer is typically used by a file system to store a
125 * pointer to the file system's private node (e.g. ufs inode, nfs rnode).
126 * However, there are times when additional project private data needs
127 * to be stored separately from the data (node) pointed to by v_data.
128 * This additional data could be stored by the file system itself or
129 * by a completely different kernel entity. VSD provides a way for
130 * callers to obtain a key and store a pointer to private data associated
133 * Callers are responsible for protecting the vsd by holding v_vsd_lock
134 * for calls to vsd_set() and vsd_get().
139 * vsd_nkeys - creation and deletion of vsd keys
140 * vsd_list - insertion and deletion of vsd_node in the vsd_list
141 * vsd_destructor - adding and removing destructors to the list
143 static kmutex_t vsd_lock
;
144 static uint_t vsd_nkeys
; /* size of destructor array */
145 /* list of vsd_node's */
146 static list_t
*vsd_list
= NULL
;
147 /* per-key destructor funcs */
148 static void (**vsd_destructor
)(void *);
151 * The following is the common set of actions needed to update the
152 * vopstats structure from a vnode op. Both VOPSTATS_UPDATE() and
153 * VOPSTATS_UPDATE_IO() do almost the same thing, except for the
154 * recording of the bytes transferred. Since the code is similar
155 * but small, it is nearly a duplicate. Consequently any changes
156 * to one may need to be reflected in the other.
157 * Rundown of the variables:
158 * vp - Pointer to the vnode
159 * counter - Partial name structure member to update in vopstats for counts
160 * bytecounter - Partial name structure member to update in vopstats for bytes
161 * bytesval - Value to update in vopstats for bytes
162 * fstype - Index into vsanchor_fstype[], same as index into vfssw[]
163 * vsp - Pointer to vopstats structure (either in vfs or vsanchor_fstype[i])
166 #define VOPSTATS_UPDATE(vp, counter) { \
167 vfs_t *vfsp = (vp)->v_vfsp; \
168 if (vfsp && vfsp->vfs_implp && \
169 (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) { \
170 vopstats_t *vsp = &vfsp->vfs_vopstats; \
171 uint64_t *stataddr = &(vsp->n##counter.value.ui64); \
172 extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
173 size_t, uint64_t *); \
174 __dtrace_probe___fsinfo_##counter(vp, 0, stataddr); \
176 if ((vsp = vfsp->vfs_fstypevsp) != NULL) { \
177 vsp->n##counter.value.ui64++; \
182 #define VOPSTATS_UPDATE_IO(vp, counter, bytecounter, bytesval) { \
183 vfs_t *vfsp = (vp)->v_vfsp; \
184 if (vfsp && vfsp->vfs_implp && \
185 (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) { \
186 vopstats_t *vsp = &vfsp->vfs_vopstats; \
187 uint64_t *stataddr = &(vsp->n##counter.value.ui64); \
188 extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
189 size_t, uint64_t *); \
190 __dtrace_probe___fsinfo_##counter(vp, bytesval, stataddr); \
192 vsp->bytecounter.value.ui64 += bytesval; \
193 if ((vsp = vfsp->vfs_fstypevsp) != NULL) { \
194 vsp->n##counter.value.ui64++; \
195 vsp->bytecounter.value.ui64 += bytesval; \
201 * If the filesystem does not support XIDs map credential
202 * If the vfsp is NULL, perhaps we should also map?
204 #define VOPXID_MAP_CR(vp, cr) { \
205 vfs_t *vfsp = (vp)->v_vfsp; \
206 if (vfsp != NULL && (vfsp->vfs_flag & VFS_XID) == 0) \
207 cr = crgetmapped(cr); \
211 * Convert stat(2) formats to vnode types and vice versa. (Knows about
212 * numerical order of S_IFMT and vnode types.)
214 enum vtype iftovt_tab
[] = {
215 VNON
, VFIFO
, VCHR
, VNON
, VDIR
, VNON
, VBLK
, VNON
,
216 VREG
, VNON
, VLNK
, VNON
, VSOCK
, VNON
, VNON
, VNON
219 ushort_t vttoif_tab
[] = {
220 0, S_IFREG
, S_IFDIR
, S_IFBLK
, S_IFCHR
, S_IFLNK
, S_IFIFO
,
221 S_IFDOOR
, 0, S_IFSOCK
, S_IFPORT
, 0
225 * The system vnode cache.
228 kmem_cache_t
*vn_cache
;
231 /* Extensible attribute (xva) routines. */
234 * Zero out the structure, set the size of the requested/returned bitmaps,
235 * set VATTR_XVATTR in the embedded vattr_t's va_mask, and set up the pointer
236 * to the returned attributes array.
239 xva_init(xvattr_t
*xvap
)
241 bzero(xvap
, sizeof (xvattr_t
));
242 xvap
->xva_mapsize
= XVA_MAPSIZE
;
243 xvap
->xva_magic
= XVA_MAGIC
;
244 xvap
->xva_vattr
.va_mask
= VATTR_XVATTR
;
245 xvap
->xva_rtnattrmapp
= &(xvap
->xva_rtnattrmap
)[0];
249 * If VATTR_XVATTR is set, returns a pointer to the embedded xoptattr_t
250 * structure. Otherwise, returns NULL.
253 xva_getxoptattr(xvattr_t
*xvap
)
255 xoptattr_t
*xoap
= NULL
;
256 if (xvap
->xva_vattr
.va_mask
& VATTR_XVATTR
)
257 xoap
= &xvap
->xva_xoptattrs
;
262 * Used by the AVL routines to compare two vsk_anchor_t structures in the tree.
263 * We use the f_fsid reported by VFS_STATVFS() since we use that for the
267 vska_compar(const void *n1
, const void *n2
)
270 ulong_t p1
= ((vsk_anchor_t
*)n1
)->vsk_fsid
;
271 ulong_t p2
= ((vsk_anchor_t
*)n2
)->vsk_fsid
;
275 } else if (p1
> p2
) {
285 * Used to create a single template which will be bcopy()ed to a newly
286 * allocated vsanchor_combo_t structure in new_vsanchor(), below.
289 create_vopstats_template()
293 vsp
= kmem_alloc(sizeof (vopstats_t
), KM_SLEEP
);
294 bzero(vsp
, sizeof (*vsp
)); /* Start fresh */
297 kstat_named_init(&vsp
->nopen
, "nopen", KSTAT_DATA_UINT64
);
299 kstat_named_init(&vsp
->nclose
, "nclose", KSTAT_DATA_UINT64
);
301 kstat_named_init(&vsp
->nread
, "nread", KSTAT_DATA_UINT64
);
302 kstat_named_init(&vsp
->read_bytes
, "read_bytes", KSTAT_DATA_UINT64
);
304 kstat_named_init(&vsp
->nwrite
, "nwrite", KSTAT_DATA_UINT64
);
305 kstat_named_init(&vsp
->write_bytes
, "write_bytes", KSTAT_DATA_UINT64
);
307 kstat_named_init(&vsp
->nioctl
, "nioctl", KSTAT_DATA_UINT64
);
309 kstat_named_init(&vsp
->nsetfl
, "nsetfl", KSTAT_DATA_UINT64
);
311 kstat_named_init(&vsp
->ngetattr
, "ngetattr", KSTAT_DATA_UINT64
);
313 kstat_named_init(&vsp
->nsetattr
, "nsetattr", KSTAT_DATA_UINT64
);
315 kstat_named_init(&vsp
->naccess
, "naccess", KSTAT_DATA_UINT64
);
317 kstat_named_init(&vsp
->nlookup
, "nlookup", KSTAT_DATA_UINT64
);
319 kstat_named_init(&vsp
->ncreate
, "ncreate", KSTAT_DATA_UINT64
);
321 kstat_named_init(&vsp
->nremove
, "nremove", KSTAT_DATA_UINT64
);
323 kstat_named_init(&vsp
->nlink
, "nlink", KSTAT_DATA_UINT64
);
325 kstat_named_init(&vsp
->nrename
, "nrename", KSTAT_DATA_UINT64
);
327 kstat_named_init(&vsp
->nmkdir
, "nmkdir", KSTAT_DATA_UINT64
);
329 kstat_named_init(&vsp
->nrmdir
, "nrmdir", KSTAT_DATA_UINT64
);
330 /* fop_readdir I/O */
331 kstat_named_init(&vsp
->nreaddir
, "nreaddir", KSTAT_DATA_UINT64
);
332 kstat_named_init(&vsp
->readdir_bytes
, "readdir_bytes",
335 kstat_named_init(&vsp
->nsymlink
, "nsymlink", KSTAT_DATA_UINT64
);
337 kstat_named_init(&vsp
->nreadlink
, "nreadlink", KSTAT_DATA_UINT64
);
339 kstat_named_init(&vsp
->nfsync
, "nfsync", KSTAT_DATA_UINT64
);
341 kstat_named_init(&vsp
->ninactive
, "ninactive", KSTAT_DATA_UINT64
);
343 kstat_named_init(&vsp
->nfid
, "nfid", KSTAT_DATA_UINT64
);
345 kstat_named_init(&vsp
->nrwlock
, "nrwlock", KSTAT_DATA_UINT64
);
347 kstat_named_init(&vsp
->nrwunlock
, "nrwunlock", KSTAT_DATA_UINT64
);
349 kstat_named_init(&vsp
->nseek
, "nseek", KSTAT_DATA_UINT64
);
351 kstat_named_init(&vsp
->ncmp
, "ncmp", KSTAT_DATA_UINT64
);
353 kstat_named_init(&vsp
->nfrlock
, "nfrlock", KSTAT_DATA_UINT64
);
355 kstat_named_init(&vsp
->nspace
, "nspace", KSTAT_DATA_UINT64
);
357 kstat_named_init(&vsp
->nrealvp
, "nrealvp", KSTAT_DATA_UINT64
);
359 kstat_named_init(&vsp
->ngetpage
, "ngetpage", KSTAT_DATA_UINT64
);
361 kstat_named_init(&vsp
->nputpage
, "nputpage", KSTAT_DATA_UINT64
);
363 kstat_named_init(&vsp
->nmap
, "nmap", KSTAT_DATA_UINT64
);
365 kstat_named_init(&vsp
->naddmap
, "naddmap", KSTAT_DATA_UINT64
);
367 kstat_named_init(&vsp
->ndelmap
, "ndelmap", KSTAT_DATA_UINT64
);
369 kstat_named_init(&vsp
->npoll
, "npoll", KSTAT_DATA_UINT64
);
371 kstat_named_init(&vsp
->ndump
, "ndump", KSTAT_DATA_UINT64
);
373 kstat_named_init(&vsp
->npathconf
, "npathconf", KSTAT_DATA_UINT64
);
375 kstat_named_init(&vsp
->npageio
, "npageio", KSTAT_DATA_UINT64
);
377 kstat_named_init(&vsp
->ndumpctl
, "ndumpctl", KSTAT_DATA_UINT64
);
379 kstat_named_init(&vsp
->ndispose
, "ndispose", KSTAT_DATA_UINT64
);
381 kstat_named_init(&vsp
->nsetsecattr
, "nsetsecattr", KSTAT_DATA_UINT64
);
383 kstat_named_init(&vsp
->ngetsecattr
, "ngetsecattr", KSTAT_DATA_UINT64
);
385 kstat_named_init(&vsp
->nshrlock
, "nshrlock", KSTAT_DATA_UINT64
);
387 kstat_named_init(&vsp
->nvnevent
, "nvnevent", KSTAT_DATA_UINT64
);
389 kstat_named_init(&vsp
->nreqzcbuf
, "nreqzcbuf", KSTAT_DATA_UINT64
);
391 kstat_named_init(&vsp
->nretzcbuf
, "nretzcbuf", KSTAT_DATA_UINT64
);
397 * Creates a kstat structure associated with a vopstats structure.
400 new_vskstat(char *ksname
, vopstats_t
*vsp
)
404 if (!vopstats_enabled
) {
408 ksp
= kstat_create("unix", 0, ksname
, "misc", KSTAT_TYPE_NAMED
,
409 sizeof (vopstats_t
)/sizeof (kstat_named_t
),
410 KSTAT_FLAG_VIRTUAL
|KSTAT_FLAG_WRITABLE
);
420 * Called from vfsinit() to initialize the support mechanisms for vopstats
425 if (!vopstats_enabled
)
429 * Creates the AVL tree which holds per-vfs vopstat anchors. This
430 * is necessary since we need to check if a kstat exists before we
431 * attempt to create it. Also, initialize its lock.
433 avl_create(&vskstat_tree
, vska_compar
, sizeof (vsk_anchor_t
),
434 offsetof(vsk_anchor_t
, vsk_node
));
435 mutex_init(&vskstat_tree_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
437 vsk_anchor_cache
= kmem_cache_create("vsk_anchor_cache",
438 sizeof (vsk_anchor_t
), sizeof (uintptr_t), NULL
, NULL
, NULL
,
442 * Set up the array of pointers for the vopstats-by-FS-type.
443 * The entries will be allocated/initialized as each file system
444 * goes through modload/mod_installfs.
446 vopstats_fstype
= (vopstats_t
**)kmem_zalloc(
447 (sizeof (vopstats_t
*) * nfstype
), KM_SLEEP
);
449 /* Set up the global vopstats initialization template */
450 vs_templatep
= create_vopstats_template();
454 * We need to have the all of the counters zeroed.
455 * The initialization of the vopstats_t includes on the order of
456 * 50 calls to kstat_named_init(). Rather that do that on every call,
457 * we do it once in a template (vs_templatep) then bcopy it over.
460 initialize_vopstats(vopstats_t
*vsp
)
465 bcopy(vs_templatep
, vsp
, sizeof (vopstats_t
));
469 * If possible, determine which vopstats by fstype to use and
470 * return a pointer to the caller.
473 get_fstype_vopstats(vfs_t
*vfsp
, struct vfssw
*vswp
)
475 int fstype
= 0; /* Index into vfssw[] */
476 vopstats_t
*vsp
= NULL
;
478 if (vfsp
== NULL
|| (vfsp
->vfs_flag
& VFS_STATS
) == 0 ||
482 * Set up the fstype. We go to so much trouble because all versions
483 * of NFS use the same fstype in their vfs even though they have
484 * distinct entries in the vfssw[] table.
485 * NOTE: A special vfs (e.g., EIO_vfs) may not have an entry.
488 fstype
= vswp
- vfssw
; /* Gets us the index */
490 fstype
= vfsp
->vfs_fstype
;
494 * Point to the per-fstype vopstats. The only valid values are
495 * non-zero positive values less than the number of vfssw[] table
498 if (fstype
> 0 && fstype
< nfstype
) {
499 vsp
= vopstats_fstype
[fstype
];
506 * Generate a kstat name, create the kstat structure, and allocate a
507 * vsk_anchor_t to hold it together. Return the pointer to the vsk_anchor_t
508 * to the caller. This must only be called from a mount.
511 get_vskstat_anchor(vfs_t
*vfsp
)
513 char kstatstr
[KSTAT_STRLEN
]; /* kstat name for vopstats */
514 statvfs64_t statvfsbuf
; /* Needed to find f_fsid */
515 vsk_anchor_t
*vskp
= NULL
; /* vfs <--> kstat anchor */
516 kstat_t
*ksp
; /* Ptr to new kstat */
517 avl_index_t where
; /* Location in the AVL tree */
519 if (vfsp
== NULL
|| vfsp
->vfs_implp
== NULL
||
520 (vfsp
->vfs_flag
& VFS_STATS
) == 0 || !vopstats_enabled
)
523 /* Need to get the fsid to build a kstat name */
524 if (VFS_STATVFS(vfsp
, &statvfsbuf
) == 0) {
525 /* Create a name for our kstats based on fsid */
526 (void) snprintf(kstatstr
, KSTAT_STRLEN
, "%s%lx",
527 VOPSTATS_STR
, statvfsbuf
.f_fsid
);
529 /* Allocate and initialize the vsk_anchor_t */
530 vskp
= kmem_cache_alloc(vsk_anchor_cache
, KM_SLEEP
);
531 bzero(vskp
, sizeof (*vskp
));
532 vskp
->vsk_fsid
= statvfsbuf
.f_fsid
;
534 mutex_enter(&vskstat_tree_lock
);
535 if (avl_find(&vskstat_tree
, vskp
, &where
) == NULL
) {
536 avl_insert(&vskstat_tree
, vskp
, where
);
537 mutex_exit(&vskstat_tree_lock
);
540 * Now that we've got the anchor in the AVL
541 * tree, we can create the kstat.
543 ksp
= new_vskstat(kstatstr
, &vfsp
->vfs_vopstats
);
548 /* Oops, found one! Release memory and lock. */
549 mutex_exit(&vskstat_tree_lock
);
550 kmem_cache_free(vsk_anchor_cache
, vskp
);
558 * We're in the process of tearing down the vfs and need to cleanup
559 * the data structures associated with the vopstats. Must only be called
563 teardown_vopstats(vfs_t
*vfsp
)
568 if (vfsp
== NULL
|| vfsp
->vfs_implp
== NULL
||
569 (vfsp
->vfs_flag
& VFS_STATS
) == 0 || !vopstats_enabled
)
572 /* This is a safe check since VFS_STATS must be set (see above) */
573 if ((vskap
= vfsp
->vfs_vskap
) == NULL
)
576 /* Whack the pointer right away */
577 vfsp
->vfs_vskap
= NULL
;
579 /* Lock the tree, remove the node, and delete the kstat */
580 mutex_enter(&vskstat_tree_lock
);
581 if (avl_find(&vskstat_tree
, vskap
, &where
)) {
582 avl_remove(&vskstat_tree
, vskap
);
585 if (vskap
->vsk_ksp
) {
586 kstat_delete(vskap
->vsk_ksp
);
588 mutex_exit(&vskstat_tree_lock
);
590 kmem_cache_free(vsk_anchor_cache
, vskap
);
594 * Read or write a vnode. Called from kernel code.
605 rlim_t ulimit
, /* meaningful only if rw is UIO_WRITE */
614 if (rw
== UIO_WRITE
&& ISROFILE(vp
))
620 VOPXID_MAP_CR(vp
, cr
);
626 uio
.uio_loffset
= offset
;
627 uio
.uio_segflg
= (short)seg
;
629 uio
.uio_llimit
= ulimit
;
632 * We have to enter the critical region before calling fop_rwlock
633 * to avoid a deadlock with ufs.
635 if (nbl_need_check(vp
)) {
638 nbl_start_crit(vp
, RW_READER
);
640 error
= nbl_svmand(vp
, cr
, &svmand
);
643 if (nbl_conflict(vp
, rw
== UIO_WRITE
? NBL_WRITE
: NBL_READ
,
644 uio
.uio_offset
, uio
.uio_resid
, svmand
, NULL
)) {
650 (void) fop_rwlock(vp
,
651 rw
== UIO_WRITE
? V_WRITELOCK_TRUE
: V_WRITELOCK_FALSE
, NULL
);
652 if (rw
== UIO_WRITE
) {
653 uio
.uio_fmode
= FWRITE
;
654 uio
.uio_extflg
= UIO_COPY_DEFAULT
;
655 error
= fop_write(vp
, &uio
, ioflag
, cr
, NULL
);
657 uio
.uio_fmode
= FREAD
;
658 uio
.uio_extflg
= UIO_COPY_CACHED
;
659 error
= fop_read(vp
, &uio
, ioflag
, cr
, NULL
);
662 rw
== UIO_WRITE
? V_WRITELOCK_TRUE
: V_WRITELOCK_FALSE
, NULL
);
664 *residp
= uio
.uio_resid
;
665 else if (uio
.uio_resid
)
675 * Release a vnode. Call fop_inactive on last reference or
676 * decrement reference count.
678 * To avoid race conditions, the v_count is left at 1 for
679 * the call to fop_inactive. This prevents another thread
680 * from reclaiming and releasing the vnode *before* the
681 * fop_inactive routine has a chance to destroy the vnode.
682 * We can't have more than 1 thread calling fop_inactive
688 VERIFY(vp
->v_count
> 0);
689 mutex_enter(&vp
->v_lock
);
690 if (vp
->v_count
== 1) {
691 mutex_exit(&vp
->v_lock
);
692 fop_inactive(vp
, CRED(), NULL
);
696 mutex_exit(&vp
->v_lock
);
700 * Release a vnode referenced by the DNLC. Multiple DNLC references are treated
701 * as a single reference, so v_count is not decremented until the last DNLC hold
702 * is released. This makes it possible to distinguish vnodes that are referenced
706 vn_rele_dnlc(vnode_t
*vp
)
708 VERIFY((vp
->v_count
> 0) && (vp
->v_count_dnlc
> 0));
709 mutex_enter(&vp
->v_lock
);
710 if (--vp
->v_count_dnlc
== 0) {
711 if (vp
->v_count
== 1) {
712 mutex_exit(&vp
->v_lock
);
713 fop_inactive(vp
, CRED(), NULL
);
718 mutex_exit(&vp
->v_lock
);
722 * Like vn_rele() except that it clears v_stream under v_lock.
723 * This is used by sockfs when it dismantles the association between
724 * the sockfs node and the vnode in the underlying file system.
725 * v_lock has to be held to prevent a thread coming through the lookupname
726 * path from accessing a stream head that is going away.
729 vn_rele_stream(vnode_t
*vp
)
731 VERIFY(vp
->v_count
> 0);
732 mutex_enter(&vp
->v_lock
);
734 if (vp
->v_count
== 1) {
735 mutex_exit(&vp
->v_lock
);
736 fop_inactive(vp
, CRED(), NULL
);
740 mutex_exit(&vp
->v_lock
);
744 vn_rele_inactive(vnode_t
*vp
)
746 fop_inactive(vp
, CRED(), NULL
);
750 * Like vn_rele() except if we are going to call fop_inactive() then do it
751 * asynchronously using a taskq. This can avoid deadlocks caused by re-entering
752 * the file system as a result of releasing the vnode. Note, file systems
753 * already have to handle the race where the vnode is incremented before the
754 * inactive routine is called and does its locking.
756 * Warning: Excessive use of this routine can lead to performance problems.
757 * This is because taskqs throttle back allocation if too many are created.
760 vn_rele_async(vnode_t
*vp
, taskq_t
*taskq
)
762 VERIFY(vp
->v_count
> 0);
763 mutex_enter(&vp
->v_lock
);
764 if (vp
->v_count
== 1) {
765 mutex_exit(&vp
->v_lock
);
766 VERIFY(taskq_dispatch(taskq
, (task_func_t
*)vn_rele_inactive
,
767 vp
, TQ_SLEEP
) != (uintptr_t)NULL
);
771 mutex_exit(&vp
->v_lock
);
784 return (vn_openat(pnamep
, seg
, filemode
, createmode
, vpp
, crwhy
,
790 * Open/create a vnode.
791 * This may be callable by the kernel, the only known use
792 * of user context being that the current user credentials
793 * are used for permissions. crwhy is defined iff filemode & FCREAT.
804 struct vnode
*startvp
,
813 int shrlock_done
= 0;
815 enum symfollow follow
;
816 int estale_retry
= 0;
818 struct shr_locowner shr_own
;
820 if (filemode
& FSEARCH
)
821 filemode
|= FDIRECTORY
;
825 if (filemode
& FREAD
)
827 if (filemode
& (FWRITE
|FTRUNC
))
829 if (filemode
& (FSEARCH
|FEXEC
|FXATTRDIROPEN
))
832 /* symlink interpretation */
833 if (filemode
& FNOFOLLOW
)
838 if (filemode
& FAPPEND
)
839 accessflags
|= V_APPEND
;
842 if (filemode
& FCREAT
&& !(filemode
& FDIRECTORY
)) {
845 /* Wish to create a file. */
846 vattr
.va_type
= VREG
;
847 vattr
.va_mode
= createmode
;
848 vattr
.va_mask
= VATTR_TYPE
|VATTR_MODE
;
849 if (filemode
& FTRUNC
) {
851 vattr
.va_mask
|= VATTR_SIZE
;
853 if (filemode
& FEXCL
)
859 vn_createat(pnamep
, seg
, &vattr
, excl
, mode
, &vp
, crwhy
,
860 (filemode
& ~(FTRUNC
|FEXCL
)), umask
, startvp
))
863 /* Wish to open a file. Just look it up. */
864 if (error
= lookupnameat(pnamep
, seg
, follow
,
865 NULLVPP
, &vp
, startvp
)) {
866 if ((error
== ESTALE
) &&
867 fs_need_estale_retry(estale_retry
++))
873 * Can't write directories, active texts, or
874 * read-only filesystems. Can't truncate files
875 * on which mandatory locking is in effect.
877 if (filemode
& (FWRITE
|FTRUNC
)) {
879 * Allow writable directory if VDIROPEN flag is set.
881 if (vp
->v_type
== VDIR
&& !(vp
->v_flag
& VDIROPEN
)) {
890 * Can't truncate files on which
891 * sysv mandatory locking is in effect.
893 if (filemode
& FTRUNC
) {
896 if (fop_realvp(vp
, &rvp
, NULL
) != 0)
898 if (rvp
->v_filocks
!= NULL
) {
899 vattr
.va_mask
= VATTR_MODE
;
900 if ((error
= fop_getattr(vp
,
901 &vattr
, 0, CRED(), NULL
)) == 0 &&
902 MANDLOCK(vp
, vattr
.va_mode
))
912 if (error
= fop_access(vp
, mode
, accessflags
, CRED(), NULL
))
915 * Require FDIRECTORY to return a directory.
916 * Require FEXEC to return a regular file.
918 if ((filemode
& FDIRECTORY
) && vp
->v_type
!= VDIR
) {
922 if ((filemode
& FEXEC
) && vp
->v_type
!= VREG
) {
923 error
= ENOEXEC
; /* XXX: error code? */
929 * Do remaining checks for FNOFOLLOW and FNOLINKS.
931 if ((filemode
& FNOFOLLOW
) && vp
->v_type
== VLNK
) {
935 if (filemode
& FNOLINKS
) {
936 vattr
.va_mask
= VATTR_NLINK
;
937 if ((error
= fop_getattr(vp
, &vattr
, 0, CRED(), NULL
))) {
940 if (vattr
.va_nlink
!= 1) {
947 * Opening a socket corresponding to the AF_UNIX pathname
948 * in the filesystem name space is not supported.
949 * However, VSOCK nodes in namefs are supported in order
950 * to make fattach work for sockets.
952 * XXX This uses fop_realvp to distinguish between
953 * an unopened namefs node (where fop_realvp returns a
954 * different VSOCK vnode) and a VSOCK created by vn_create
955 * in some file system (where fop_realvp would never return
956 * a different vnode).
958 if (vp
->v_type
== VSOCK
) {
961 error
= fop_realvp(vp
, &nvp
, NULL
);
962 if (error
!= 0 || nvp
== NULL
|| nvp
== vp
||
963 nvp
->v_type
!= VSOCK
) {
969 if ((vp
->v_type
== VREG
) && nbl_need_check(vp
)) {
970 /* get share reservation */
972 if (filemode
& FWRITE
)
973 shr
.s_access
|= F_WRACC
;
974 if (filemode
& FREAD
)
975 shr
.s_access
|= F_RDACC
;
978 shr
.s_pid
= ttoproc(curthread
)->p_pid
;
979 shr_own
.sl_pid
= shr
.s_pid
;
981 shr
.s_own_len
= sizeof (shr_own
);
982 shr
.s_owner
= (caddr_t
)&shr_own
;
983 error
= fop_shrlock(vp
, F_SHARE_NBMAND
, &shr
, filemode
, CRED(),
989 /* nbmand conflict check if truncating file */
990 if ((filemode
& FTRUNC
) && !(filemode
& FCREAT
)) {
991 nbl_start_crit(vp
, RW_READER
);
994 vattr
.va_mask
= VATTR_SIZE
;
995 if (error
= fop_getattr(vp
, &vattr
, 0, CRED(), NULL
))
997 if (nbl_conflict(vp
, NBL_WRITE
, 0, vattr
.va_size
, 0,
1006 * Do opening protocol.
1008 error
= fop_open(&vp
, filemode
, CRED(), NULL
);
1014 * Truncate if required.
1016 if ((filemode
& FTRUNC
) && !(filemode
& FCREAT
)) {
1018 vattr
.va_mask
= VATTR_SIZE
;
1019 if ((error
= fop_setattr(vp
, &vattr
, 0, CRED(), NULL
)) != 0)
1023 ASSERT(vp
->v_count
> 0);
1031 (void) fop_close(vp
, filemode
, 1, 0, CRED(),
1037 (void) fop_shrlock(vp
, F_UNSHARE
, &shr
, 0, CRED(),
1043 * The following clause was added to handle a problem
1044 * with NFS consistency. It is possible that a lookup
1045 * of the file to be opened succeeded, but the file
1046 * itself doesn't actually exist on the server. This
1047 * is chiefly due to the DNLC containing an entry for
1048 * the file which has been removed on the server. In
1049 * this case, we just start over. If there was some
1050 * other cause for the ESTALE error, then the lookup
1051 * of the file will fail and the error will be returned
1052 * above instead of looping around from here.
1055 if ((error
== ESTALE
) && fs_need_estale_retry(estale_retry
++))
1063 * The following two accessor functions are for the NFSv4 server. Since there
1064 * is no fop_open_UP/DOWNGRADE we need a way for the NFS server to keep the
1065 * vnode open counts correct when a client "upgrades" an open or does an
1066 * open_downgrade. In NFS, an upgrade or downgrade can not only change the
1067 * open mode (add or subtract read or write), but also change the share/deny
1068 * modes. However, share reservations are not integrated with OPEN, yet, so
1069 * we need to handle each separately. These functions are cleaner than having
1070 * the NFS server manipulate the counts directly, however, nobody else should
1071 * use these functions.
1078 ASSERT(vp
->v_type
== VREG
);
1080 if (filemode
& FREAD
)
1081 atomic_inc_32(&vp
->v_rdcnt
);
1082 if (filemode
& FWRITE
)
1083 atomic_inc_32(&vp
->v_wrcnt
);
1092 ASSERT(vp
->v_type
== VREG
);
1094 if (filemode
& FREAD
) {
1095 ASSERT(vp
->v_rdcnt
> 0);
1096 atomic_dec_32(&vp
->v_rdcnt
);
1098 if (filemode
& FWRITE
) {
1099 ASSERT(vp
->v_wrcnt
> 0);
1100 atomic_dec_32(&vp
->v_wrcnt
);
1117 return (vn_createat(pnamep
, seg
, vap
, excl
, mode
, vpp
, why
, flag
,
1122 * Create a vnode (makenode).
1135 struct vnode
*startvp
)
1137 struct vnode
*dvp
; /* ptr to parent dir vnode */
1138 struct vnode
*vp
= NULL
;
1143 enum symfollow follow
;
1144 int estale_retry
= 0;
1146 ASSERT((vap
->va_mask
& (VATTR_TYPE
|VATTR_MODE
)) == (VATTR_TYPE
|VATTR_MODE
));
1148 /* symlink interpretation */
1149 if ((flag
& FNOFOLLOW
) || excl
== EXCL
)
1153 flag
&= ~(FNOFOLLOW
|FNOLINKS
);
1158 * If new object is a file, call lower level to create it.
1159 * Note that it is up to the lower level to enforce exclusive
1160 * creation, if the file is already there.
1161 * This allows the lower level to do whatever
1162 * locking or protocol that is needed to prevent races.
1163 * If the new object is directory call lower level to make
1164 * the new directory, with "." and "..".
1166 if (error
= pn_get(pnamep
, seg
, &pn
))
1171 * lookup will find the parent directory for the vnode.
1172 * When it is done the pn holds the name of the entry
1174 * If this is a non-exclusive create we also find the node itself.
1176 error
= lookuppnat(&pn
, NULL
, follow
, &dvp
,
1177 (excl
== EXCL
) ? NULLVPP
: vpp
, startvp
);
1180 if ((error
== ESTALE
) && fs_need_estale_retry(estale_retry
++))
1182 if (why
== CRMKDIR
&& error
== EINVAL
)
1183 error
= EEXIST
; /* SVID */
1188 vap
->va_mode
&= ~VSVTX
;
1191 * If default ACLs are defined for the directory don't apply the
1192 * umask if umask is passed.
1199 vsec
.vsa_aclcnt
= 0;
1200 vsec
.vsa_aclentp
= NULL
;
1201 vsec
.vsa_dfaclcnt
= 0;
1202 vsec
.vsa_dfaclentp
= NULL
;
1203 vsec
.vsa_mask
= VSA_DFACLCNT
;
1204 error
= fop_getsecattr(dvp
, &vsec
, 0, CRED(), NULL
);
1206 * If error is ENOSYS then treat it as no error
1207 * Don't want to force all file systems to support
1208 * aclent_t style of ACL's.
1210 if (error
== ENOSYS
)
1218 * Apply the umask if no default ACLs.
1220 if (vsec
.vsa_dfaclcnt
== 0)
1221 vap
->va_mode
&= ~umask
;
1224 * fop_getsecattr() may have allocated memory for
1225 * ACLs we didn't request, so double-check and
1226 * free it if necessary.
1228 if (vsec
.vsa_aclcnt
&& vsec
.vsa_aclentp
!= NULL
)
1229 kmem_free((caddr_t
)vsec
.vsa_aclentp
,
1230 vsec
.vsa_aclcnt
* sizeof (aclent_t
));
1231 if (vsec
.vsa_dfaclcnt
&& vsec
.vsa_dfaclentp
!= NULL
)
1232 kmem_free((caddr_t
)vsec
.vsa_dfaclentp
,
1233 vsec
.vsa_dfaclcnt
* sizeof (aclent_t
));
1238 * In general we want to generate EROFS if the file system is
1239 * readonly. However, POSIX (IEEE Std. 1003.1) section 5.3.1
1240 * documents the open system call, and it says that O_CREAT has no
1241 * effect if the file already exists. Bug 1119649 states
1242 * that open(path, O_CREAT, ...) fails when attempting to open an
1243 * existing file on a read only file system. Thus, the first part
1244 * of the following if statement has 3 checks:
1245 * if the file exists &&
1246 * it is being open with write access &&
1247 * the file system is read only
1248 * then generate EROFS
1250 if ((*vpp
!= NULL
&& (mode
& VWRITE
) && ISROFILE(*vpp
)) ||
1251 (*vpp
== NULL
&& dvp
->v_vfsp
->vfs_flag
& VFS_RDONLY
)) {
1255 } else if (excl
== NONEXCL
&& *vpp
!= NULL
) {
1259 * File already exists. If a mandatory lock has been
1260 * applied, return error.
1263 if (fop_realvp(vp
, &rvp
, NULL
) != 0)
1265 if ((vap
->va_mask
& VATTR_SIZE
) && nbl_need_check(vp
)) {
1266 nbl_start_crit(vp
, RW_READER
);
1269 if (rvp
->v_filocks
!= NULL
|| rvp
->v_shrlocks
!= NULL
) {
1270 vattr
.va_mask
= VATTR_MODE
|VATTR_SIZE
;
1271 if (error
= fop_getattr(vp
, &vattr
, 0, CRED(), NULL
)) {
1274 if (MANDLOCK(vp
, vattr
.va_mode
)) {
1279 * File cannot be truncated if non-blocking mandatory
1280 * locks are currently on the file.
1282 if ((vap
->va_mask
& VATTR_SIZE
) && in_crit
) {
1286 offset
= vap
->va_size
> vattr
.va_size
?
1287 vattr
.va_size
: vap
->va_size
;
1288 length
= vap
->va_size
> vattr
.va_size
?
1289 vap
->va_size
- vattr
.va_size
:
1290 vattr
.va_size
- vap
->va_size
;
1291 if (nbl_conflict(vp
, NBL_WRITE
, offset
,
1300 * If the file is the root of a VFS, we've crossed a
1301 * mount point and the "containing" directory that we
1302 * acquired above (dvp) is irrelevant because it's in
1303 * a different file system. We apply fop_create to the
1304 * target itself instead of to the containing directory
1305 * and supply a null path name to indicate (conventionally)
1306 * the node itself as the "component" of interest.
1308 * The call to fop_create() is necessary to ensure
1309 * that the appropriate permission checks are made,
1310 * i.e. EISDIR, EACCES, etc. We already know that vpp
1311 * exists since we are in the else condition where this
1314 if (vp
->v_flag
& VROOT
) {
1315 ASSERT(why
!= CRMKDIR
);
1316 error
= fop_create(vp
, "", vap
, excl
, mode
, vpp
,
1317 CRED(), flag
, NULL
, NULL
);
1319 * If the create succeeded, it will have created a
1320 * new reference on a new vnode (*vpp) in the child
1321 * file system, so we want to drop our reference on
1322 * the old (vp) upon exit.
1330 * Call mkdir() if specified, otherwise create().
1332 int must_be_dir
= pn_fixslash(&pn
); /* trailing '/'? */
1336 * N.B., if vn_createat() ever requests
1337 * case-insensitive behavior then it will need
1338 * to be passed to fop_mkdir(). fop_create()
1339 * will already get it via "flag"
1341 error
= fop_mkdir(dvp
, pn
.pn_path
, vap
, vpp
, CRED(),
1343 else if (!must_be_dir
)
1344 error
= fop_create(dvp
, pn
.pn_path
, vap
,
1345 excl
, mode
, vpp
, CRED(), flag
, NULL
, NULL
);
1363 * The following clause was added to handle a problem
1364 * with NFS consistency. It is possible that a lookup
1365 * of the file to be created succeeded, but the file
1366 * itself doesn't actually exist on the server. This
1367 * is chiefly due to the DNLC containing an entry for
1368 * the file which has been removed on the server. In
1369 * this case, we just start over. If there was some
1370 * other cause for the ESTALE error, then the lookup
1371 * of the file will fail and the error will be returned
1372 * above instead of looping around from here.
1374 if ((error
== ESTALE
) && fs_need_estale_retry(estale_retry
++))
1380 vn_link(char *from
, char *to
, enum uio_seg seg
)
1382 return (vn_linkat(NULL
, from
, NO_FOLLOW
, NULL
, to
, seg
));
1386 vn_linkat(vnode_t
*fstartvp
, char *from
, enum symfollow follow
,
1387 vnode_t
*tstartvp
, char *to
, enum uio_seg seg
)
1389 struct vnode
*fvp
; /* from vnode ptr */
1390 struct vnode
*tdvp
; /* to directory vnode ptr */
1395 int estale_retry
= 0;
1399 if (error
= pn_get(to
, seg
, &pn
))
1401 if (error
= lookupnameat(from
, seg
, follow
, NULLVPP
, &fvp
, fstartvp
))
1403 if (error
= lookuppnat(&pn
, NULL
, NO_FOLLOW
, &tdvp
, NULLVPP
, tstartvp
))
1406 * Make sure both source vnode and target directory vnode are
1407 * in the same vfs and that it is writeable.
1409 vattr
.va_mask
= VATTR_FSID
;
1410 if (error
= fop_getattr(fvp
, &vattr
, 0, CRED(), NULL
))
1412 fsid
= vattr
.va_fsid
;
1413 vattr
.va_mask
= VATTR_FSID
;
1414 if (error
= fop_getattr(tdvp
, &vattr
, 0, CRED(), NULL
))
1416 if (fsid
!= vattr
.va_fsid
) {
1420 if (tdvp
->v_vfsp
->vfs_flag
& VFS_RDONLY
) {
1427 (void) pn_fixslash(&pn
);
1428 error
= fop_link(tdvp
, fvp
, pn
.pn_path
, CRED(), NULL
, 0);
1435 if ((error
== ESTALE
) && fs_need_estale_retry(estale_retry
++))
1441 vn_rename(char *from
, char *to
, enum uio_seg seg
)
1443 return (vn_renameat(NULL
, from
, NULL
, to
, seg
));
1447 vn_renameat(vnode_t
*fdvp
, char *fname
, vnode_t
*tdvp
,
1448 char *tname
, enum uio_seg seg
)
1452 struct pathname fpn
; /* from pathname */
1453 struct pathname tpn
; /* to pathname */
1455 int in_crit_src
, in_crit_targ
;
1456 vnode_t
*fromvp
, *fvp
;
1457 vnode_t
*tovp
, *targvp
;
1458 int estale_retry
= 0;
1461 fvp
= fromvp
= tovp
= targvp
= NULL
;
1462 in_crit_src
= in_crit_targ
= 0;
1464 * Get to and from pathnames.
1466 if (error
= pn_get(fname
, seg
, &fpn
))
1468 if (error
= pn_get(tname
, seg
, &tpn
)) {
1474 * First we need to resolve the correct directories
1475 * The passed in directories may only be a starting point,
1476 * but we need the real directories the file(s) live in.
1477 * For example the fname may be something like usr/lib/sparc
1478 * and we were passed in the / directory, but we need to
1479 * use the lib directory for the rename.
1483 * Lookup to and from directories.
1485 if (error
= lookuppnat(&fpn
, NULL
, NO_FOLLOW
, &fromvp
, &fvp
, fdvp
)) {
1490 * Make sure there is an entry.
1497 if (error
= lookuppnat(&tpn
, NULL
, NO_FOLLOW
, &tovp
, &targvp
, tdvp
)) {
1502 * Make sure both the from vnode directory and the to directory
1503 * are in the same vfs and the to directory is writable.
1504 * We check fsid's, not vfs pointers, so loopback fs works.
1506 if (fromvp
!= tovp
) {
1507 vattr
.va_mask
= VATTR_FSID
;
1508 if (error
= fop_getattr(fromvp
, &vattr
, 0, CRED(), NULL
))
1510 fsid
= vattr
.va_fsid
;
1511 vattr
.va_mask
= VATTR_FSID
;
1512 if (error
= fop_getattr(tovp
, &vattr
, 0, CRED(), NULL
))
1514 if (fsid
!= vattr
.va_fsid
) {
1520 if (tovp
->v_vfsp
->vfs_flag
& VFS_RDONLY
) {
1526 * Make sure "from" vp is not a mount point.
1527 * Note, lookup did traverse() already, so
1528 * we'll be looking at the mounted FS root.
1529 * (but allow files like mnttab)
1531 if ((fvp
->v_flag
& VROOT
) != 0 && fvp
->v_type
== VDIR
) {
1536 if (targvp
&& (fvp
!= targvp
)) {
1537 nbl_start_crit(targvp
, RW_READER
);
1539 if (nbl_conflict(targvp
, NBL_REMOVE
, 0, 0, 0, NULL
)) {
1545 if (nbl_need_check(fvp
)) {
1546 nbl_start_crit(fvp
, RW_READER
);
1548 if (nbl_conflict(fvp
, NBL_RENAME
, 0, 0, 0, NULL
)) {
1557 (void) pn_fixslash(&tpn
);
1558 error
= fop_rename(fromvp
, fpn
.pn_path
, tovp
, tpn
.pn_path
, CRED(),
1567 nbl_end_crit(targvp
);
1576 if ((error
== ESTALE
) && fs_need_estale_retry(estale_retry
++))
1582 * Remove a file or directory.
1585 vn_remove(char *fnamep
, enum uio_seg seg
, enum rm dirflag
)
1587 return (vn_removeat(NULL
, fnamep
, seg
, dirflag
));
1591 vn_removeat(vnode_t
*startvp
, char *fnamep
, enum uio_seg seg
, enum rm dirflag
)
1593 struct vnode
*vp
; /* entry vnode */
1594 struct vnode
*dvp
; /* ptr to parent dir vnode */
1595 struct vnode
*coveredvp
;
1596 struct pathname pn
; /* name of entry */
1600 struct vfs
*dvfsp
; /* ptr to parent dir vfs */
1602 int estale_retry
= 0;
1605 if (error
= pn_get(fnamep
, seg
, &pn
))
1608 if (error
= lookuppnat(&pn
, NULL
, NO_FOLLOW
, &dvp
, &vp
, startvp
)) {
1610 if ((error
== ESTALE
) && fs_need_estale_retry(estale_retry
++))
1616 * Make sure there is an entry.
1624 dvfsp
= dvp
->v_vfsp
;
1627 * If the named file is the root of a mounted filesystem, fail,
1628 * unless it's marked unlinkable. In that case, unmount the
1629 * filesystem and proceed to unlink the covered vnode. (If the
1630 * covered vnode is a directory, use rmdir instead of unlink,
1631 * to avoid file system corruption.)
1633 if (vp
->v_flag
& VROOT
) {
1634 if ((vfsp
->vfs_flag
& VFS_UNLINKABLE
) == 0) {
1640 * Namefs specific code starts here.
1643 if (dirflag
== RMDIRECTORY
) {
1645 * User called rmdir(2) on a file that has
1646 * been namefs mounted on top of. Since
1647 * namefs doesn't allow directories to
1648 * be mounted on other files we know
1649 * vp is not of type VDIR so fail to operation.
1656 * If VROOT is still set after grabbing vp->v_lock,
1657 * noone has finished nm_unmount so far and coveredvp
1659 * If we manage to grab vn_vfswlock(coveredvp) before releasing
1660 * vp->v_lock, any race window is eliminated.
1663 mutex_enter(&vp
->v_lock
);
1664 if ((vp
->v_flag
& VROOT
) == 0) {
1665 /* Someone beat us to the unmount */
1666 mutex_exit(&vp
->v_lock
);
1671 coveredvp
= vfsp
->vfs_vnodecovered
;
1674 * Note: Implementation of vn_vfswlock shows that ordering of
1675 * v_lock / vn_vfswlock is not an issue here.
1677 error
= vn_vfswlock(coveredvp
);
1678 mutex_exit(&vp
->v_lock
);
1685 error
= dounmount(vfsp
, 0, CRED());
1688 * Unmounted the namefs file system; now get
1689 * the object it was mounted over.
1693 * If namefs was mounted over a directory, then
1694 * we want to use rmdir() instead of unlink().
1696 if (vp
->v_type
== VDIR
)
1697 dirflag
= RMDIRECTORY
;
1704 * Make sure filesystem is writeable.
1705 * We check the parent directory's vfs in case this is an lofs vnode.
1707 if (dvfsp
&& dvfsp
->vfs_flag
& VFS_RDONLY
) {
1715 * If there is the possibility of an nbmand share reservation, make
1716 * sure it's okay to remove the file. Keep a reference to the
1717 * vnode, so that we can exit the nbl critical region after
1718 * calling fop_remove.
1719 * If there is no possibility of an nbmand share reservation,
1720 * release the vnode reference now. Filesystems like NFS may
1721 * behave differently if there is an extra reference, so get rid of
1722 * this one. Fortunately, we can't have nbmand mounts on NFS
1725 if (nbl_need_check(vp
)) {
1726 nbl_start_crit(vp
, RW_READER
);
1728 if (nbl_conflict(vp
, NBL_REMOVE
, 0, 0, 0, NULL
)) {
1737 if (dirflag
== RMDIRECTORY
) {
1739 * Caller is using rmdir(2), which can only be applied to
1742 if (vtype
!= VDIR
) {
1746 proc_t
*pp
= curproc
;
1748 mutex_enter(&pp
->p_lock
);
1749 cwd
= PTOU(pp
)->u_cdir
;
1751 mutex_exit(&pp
->p_lock
);
1752 error
= fop_rmdir(dvp
, pn
.pn_path
, cwd
, CRED(),
1758 * Unlink(2) can be applied to anything.
1760 error
= fop_remove(dvp
, pn
.pn_path
, CRED(), NULL
, 0);
1773 if ((error
== ESTALE
) && fs_need_estale_retry(estale_retry
++))
1779 * Utility function to compare equality of vnodes.
1780 * Compare the underlying real vnodes, if there are underlying vnodes.
1781 * This is a more thorough comparison than the VN_CMP() macro provides.
1784 vn_compare(vnode_t
*vp1
, vnode_t
*vp2
)
1788 if (vp1
!= NULL
&& fop_realvp(vp1
, &realvp
, NULL
) == 0)
1790 if (vp2
!= NULL
&& fop_realvp(vp2
, &realvp
, NULL
) == 0)
1792 return (VN_CMP(vp1
, vp2
));
1796 * The number of locks to hash into. This value must be a power
1797 * of 2 minus 1 and should probably also be prime.
1799 #define NUM_BUCKETS 1023
1801 struct vn_vfslocks_bucket
{
1803 vn_vfslocks_entry_t
*vb_list
;
1804 char pad
[64 - sizeof (kmutex_t
) - sizeof (void *)];
1808 * Total number of buckets will be NUM_BUCKETS + 1 .
1811 #pragma align 64(vn_vfslocks_buckets)
1812 static struct vn_vfslocks_bucket vn_vfslocks_buckets
[NUM_BUCKETS
+ 1];
1814 #define VN_VFSLOCKS_SHIFT 9
1816 #define VN_VFSLOCKS_HASH(vfsvpptr) \
1817 ((((intptr_t)(vfsvpptr)) >> VN_VFSLOCKS_SHIFT) & NUM_BUCKETS)
1820 * vn_vfslocks_getlock() uses an HASH scheme to generate
1821 * rwstlock using vfs/vnode pointer passed to it.
1823 * vn_vfslocks_rele() releases a reference in the
1824 * HASH table which allows the entry allocated by
1825 * vn_vfslocks_getlock() to be freed at a later
1826 * stage when the refcount drops to zero.
1829 vn_vfslocks_entry_t
*
1830 vn_vfslocks_getlock(void *vfsvpptr
)
1832 struct vn_vfslocks_bucket
*bp
;
1833 vn_vfslocks_entry_t
*vep
;
1834 vn_vfslocks_entry_t
*tvep
;
1836 ASSERT(vfsvpptr
!= NULL
);
1837 bp
= &vn_vfslocks_buckets
[VN_VFSLOCKS_HASH(vfsvpptr
)];
1839 mutex_enter(&bp
->vb_lock
);
1840 for (vep
= bp
->vb_list
; vep
!= NULL
; vep
= vep
->ve_next
) {
1841 if (vep
->ve_vpvfs
== vfsvpptr
) {
1843 mutex_exit(&bp
->vb_lock
);
1847 mutex_exit(&bp
->vb_lock
);
1848 vep
= kmem_alloc(sizeof (*vep
), KM_SLEEP
);
1849 rwst_init(&vep
->ve_lock
, NULL
, RW_DEFAULT
, NULL
);
1850 vep
->ve_vpvfs
= (char *)vfsvpptr
;
1852 mutex_enter(&bp
->vb_lock
);
1853 for (tvep
= bp
->vb_list
; tvep
!= NULL
; tvep
= tvep
->ve_next
) {
1854 if (tvep
->ve_vpvfs
== vfsvpptr
) {
1856 mutex_exit(&bp
->vb_lock
);
1859 * There is already an entry in the hash
1860 * destroy what we just allocated.
1862 rwst_destroy(&vep
->ve_lock
);
1863 kmem_free(vep
, sizeof (*vep
));
1867 vep
->ve_next
= bp
->vb_list
;
1869 mutex_exit(&bp
->vb_lock
);
1874 vn_vfslocks_rele(vn_vfslocks_entry_t
*vepent
)
1876 struct vn_vfslocks_bucket
*bp
;
1877 vn_vfslocks_entry_t
*vep
;
1878 vn_vfslocks_entry_t
*pvep
;
1880 ASSERT(vepent
!= NULL
);
1881 ASSERT(vepent
->ve_vpvfs
!= NULL
);
1883 bp
= &vn_vfslocks_buckets
[VN_VFSLOCKS_HASH(vepent
->ve_vpvfs
)];
1885 mutex_enter(&bp
->vb_lock
);
1886 vepent
->ve_refcnt
--;
1888 if ((int32_t)vepent
->ve_refcnt
< 0)
1889 cmn_err(CE_PANIC
, "vn_vfslocks_rele: refcount negative");
1891 if (vepent
->ve_refcnt
== 0) {
1892 for (vep
= bp
->vb_list
; vep
!= NULL
; vep
= vep
->ve_next
) {
1893 if (vep
->ve_vpvfs
== vepent
->ve_vpvfs
) {
1894 if (bp
->vb_list
== vep
)
1895 bp
->vb_list
= vep
->ve_next
;
1898 pvep
->ve_next
= vep
->ve_next
;
1900 mutex_exit(&bp
->vb_lock
);
1901 rwst_destroy(&vep
->ve_lock
);
1902 kmem_free(vep
, sizeof (*vep
));
1907 cmn_err(CE_PANIC
, "vn_vfslocks_rele: vp/vfs not found");
1909 mutex_exit(&bp
->vb_lock
);
1913 * vn_vfswlock_wait is used to implement a lock which is logically a writers
1914 * lock protecting the v_vfsmountedhere field.
1915 * vn_vfswlock_wait has been modified to be similar to vn_vfswlock,
1916 * except that it blocks to acquire the lock VVFSLOCK.
1918 * traverse() and routines re-implementing part of traverse (e.g. autofs)
1919 * need to hold this lock. mount(), vn_rename(), vn_remove() and so on
1920 * need the non-blocking version of the writers lock i.e. vn_vfswlock
1923 vn_vfswlock_wait(vnode_t
*vp
)
1926 vn_vfslocks_entry_t
*vpvfsentry
;
1929 vpvfsentry
= vn_vfslocks_getlock(vp
);
1930 retval
= rwst_enter_sig(&vpvfsentry
->ve_lock
, RW_WRITER
);
1932 if (retval
== EINTR
) {
1933 vn_vfslocks_rele(vpvfsentry
);
1940 vn_vfsrlock_wait(vnode_t
*vp
)
1943 vn_vfslocks_entry_t
*vpvfsentry
;
1946 vpvfsentry
= vn_vfslocks_getlock(vp
);
1947 retval
= rwst_enter_sig(&vpvfsentry
->ve_lock
, RW_READER
);
1949 if (retval
== EINTR
) {
1950 vn_vfslocks_rele(vpvfsentry
);
1959 * vn_vfswlock is used to implement a lock which is logically a writers lock
1960 * protecting the v_vfsmountedhere field.
1963 vn_vfswlock(vnode_t
*vp
)
1965 vn_vfslocks_entry_t
*vpvfsentry
;
1968 * If vp is NULL then somebody is trying to lock the covered vnode
1969 * of /. (vfs_vnodecovered is NULL for /). This situation will
1970 * only happen when unmounting /. Since that operation will fail
1971 * anyway, return EBUSY here instead of in VFS_UNMOUNT.
1976 vpvfsentry
= vn_vfslocks_getlock(vp
);
1978 if (rwst_tryenter(&vpvfsentry
->ve_lock
, RW_WRITER
))
1981 vn_vfslocks_rele(vpvfsentry
);
1986 vn_vfsrlock(vnode_t
*vp
)
1988 vn_vfslocks_entry_t
*vpvfsentry
;
1991 * If vp is NULL then somebody is trying to lock the covered vnode
1992 * of /. (vfs_vnodecovered is NULL for /). This situation will
1993 * only happen when unmounting /. Since that operation will fail
1994 * anyway, return EBUSY here instead of in VFS_UNMOUNT.
1999 vpvfsentry
= vn_vfslocks_getlock(vp
);
2001 if (rwst_tryenter(&vpvfsentry
->ve_lock
, RW_READER
))
2004 vn_vfslocks_rele(vpvfsentry
);
2009 vn_vfsunlock(vnode_t
*vp
)
2011 vn_vfslocks_entry_t
*vpvfsentry
;
2014 * ve_refcnt needs to be decremented twice.
2015 * 1. To release refernce after a call to vn_vfslocks_getlock()
2016 * 2. To release the reference from the locking routines like
2017 * vn_vfsrlock/vn_vfswlock etc,.
2019 vpvfsentry
= vn_vfslocks_getlock(vp
);
2020 vn_vfslocks_rele(vpvfsentry
);
2022 rwst_exit(&vpvfsentry
->ve_lock
);
2023 vn_vfslocks_rele(vpvfsentry
);
2027 vn_vfswlock_held(vnode_t
*vp
)
2030 vn_vfslocks_entry_t
*vpvfsentry
;
2034 vpvfsentry
= vn_vfslocks_getlock(vp
);
2035 held
= rwst_lock_held(&vpvfsentry
->ve_lock
, RW_WRITER
);
2037 vn_vfslocks_rele(vpvfsentry
);
2048 vn_cache_constructor(void *buf
, void *cdrarg
, int kmflags
)
2054 mutex_init(&vp
->v_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
2055 mutex_init(&vp
->v_vsd_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
2056 cv_init(&vp
->v_cv
, NULL
, CV_DEFAULT
, NULL
);
2057 rw_init(&vp
->v_nbllock
, NULL
, RW_DEFAULT
, NULL
);
2058 vp
->v_femhead
= NULL
; /* Must be done before vn_reinit() */
2059 vp
->v_path
= vn_vpath_empty
;
2060 vp
->v_path_stamp
= 0;
2061 vp
->v_mpssdata
= NULL
;
2063 vp
->v_fopdata
= NULL
;
2065 vmobject_init(&vp
->v_object
, vp
);
2072 vn_cache_destructor(void *buf
, void *cdrarg
)
2078 vmobject_fini(&vp
->v_object
);
2080 rw_destroy(&vp
->v_nbllock
);
2081 cv_destroy(&vp
->v_cv
);
2082 mutex_destroy(&vp
->v_vsd_lock
);
2083 mutex_destroy(&vp
->v_lock
);
2087 vn_create_cache(void)
2090 ASSERT((1 << VNODE_ALIGN_LOG2
) ==
2091 P2ROUNDUP(sizeof (struct vnode
), VNODE_ALIGN
));
2092 vn_cache
= kmem_cache_create("vn_cache", sizeof (struct vnode
),
2093 VNODE_ALIGN
, vn_cache_constructor
, vn_cache_destructor
, NULL
, NULL
,
2098 vn_destroy_cache(void)
2100 kmem_cache_destroy(vn_cache
);
2104 * Used by file systems when fs-specific nodes (e.g., ufs inodes) are
2105 * cached by the file system and vnodes remain associated.
2108 vn_recycle(vnode_t
*vp
)
2110 ASSERT(!vn_has_cached_data(vp
));
2111 VERIFY(vp
->v_path
!= NULL
);
2114 * XXX - This really belongs in vn_reinit(), but we have some issues
2115 * with the counts. Best to have it here for clean initialization.
2119 vp
->v_mmap_read
= 0;
2120 vp
->v_mmap_write
= 0;
2123 * If FEM was in use, make sure everything gets cleaned up
2124 * NOTE: vp->v_femhead is initialized to NULL in the vnode
2127 if (vp
->v_femhead
) {
2128 /* XXX - There should be a free_femhead() that does all this */
2129 ASSERT(vp
->v_femhead
->femh_list
== NULL
);
2130 mutex_destroy(&vp
->v_femhead
->femh_lock
);
2131 kmem_free(vp
->v_femhead
, sizeof (*(vp
->v_femhead
)));
2132 vp
->v_femhead
= NULL
;
2134 if (vp
->v_path
!= vn_vpath_empty
) {
2135 kmem_free(vp
->v_path
, strlen(vp
->v_path
) + 1);
2136 vp
->v_path
= vn_vpath_empty
;
2138 vp
->v_path_stamp
= 0;
2140 if (vp
->v_fopdata
!= NULL
) {
2143 vp
->v_mpssdata
= NULL
;
2148 * Used to reset the vnode fields including those that are directly accessible
2149 * as well as those which require an accessor function.
2151 * Does not initialize:
2152 * synchronization objects: v_lock, v_vsd_lock, v_nbllock, v_cv
2153 * v_data (since FS-nodes and vnodes point to each other and should
2154 * be updated simultaneously)
2155 * v_op (in case someone needs to make a VOP call on this object)
2158 vn_reinit(vnode_t
*vp
)
2161 vp
->v_count_dnlc
= 0;
2163 vp
->v_stream
= NULL
;
2164 vp
->v_vfsmountedhere
= NULL
;
2169 vp
->v_filocks
= NULL
;
2170 vp
->v_shrlocks
= NULL
;
2171 VERIFY(!vn_has_cached_data(vp
));
2173 vp
->v_locality
= NULL
;
2174 vp
->v_xattrdir
= NULL
;
2177 * In a few specific instances, vn_reinit() is used to initialize
2178 * locally defined vnode_t instances. Lacking the construction offered
2179 * by vn_alloc(), these vnodes require v_path initialization.
2181 if (vp
->v_path
== NULL
) {
2182 vp
->v_path
= vn_vpath_empty
;
2185 /* Handles v_femhead, v_path, and the r/w/map counts */
2190 vn_alloc(int kmflag
)
2194 vp
= kmem_cache_alloc(vn_cache
, kmflag
);
2197 vp
->v_femhead
= NULL
; /* Must be done before vn_reinit() */
2198 vp
->v_fopdata
= NULL
;
2206 vn_free(vnode_t
*vp
)
2208 ASSERT(vp
->v_shrlocks
== NULL
);
2209 ASSERT(vp
->v_filocks
== NULL
);
2212 * Some file systems call vn_free() with v_count of zero,
2213 * some with v_count of 1. In any case, the value should
2214 * never be anything else.
2216 ASSERT((vp
->v_count
== 0) || (vp
->v_count
== 1));
2217 ASSERT(vp
->v_count_dnlc
== 0);
2218 VERIFY(vp
->v_path
!= NULL
);
2219 if (vp
->v_path
!= vn_vpath_empty
) {
2220 kmem_free(vp
->v_path
, strlen(vp
->v_path
) + 1);
2221 vp
->v_path
= vn_vpath_empty
;
2224 /* If FEM was in use, make sure everything gets cleaned up */
2225 if (vp
->v_femhead
) {
2226 /* XXX - There should be a free_femhead() that does all this */
2227 ASSERT(vp
->v_femhead
->femh_list
== NULL
);
2228 mutex_destroy(&vp
->v_femhead
->femh_lock
);
2229 kmem_free(vp
->v_femhead
, sizeof (*(vp
->v_femhead
)));
2230 vp
->v_femhead
= NULL
;
2233 if (vp
->v_fopdata
!= NULL
) {
2236 vp
->v_mpssdata
= NULL
;
2238 kmem_cache_free(vn_cache
, vp
);
2242 * vnode status changes, should define better states than 1, 0.
2245 vn_reclaim(vnode_t
*vp
)
2247 vfs_t
*vfsp
= vp
->v_vfsp
;
2250 vfsp
->vfs_implp
== NULL
|| vfsp
->vfs_femhead
== NULL
) {
2253 (void) VFS_VNSTATE(vfsp
, vp
, VNTRANS_RECLAIMED
);
2257 vn_idle(vnode_t
*vp
)
2259 vfs_t
*vfsp
= vp
->v_vfsp
;
2262 vfsp
->vfs_implp
== NULL
|| vfsp
->vfs_femhead
== NULL
) {
2265 (void) VFS_VNSTATE(vfsp
, vp
, VNTRANS_IDLED
);
2268 vn_exists(vnode_t
*vp
)
2270 vfs_t
*vfsp
= vp
->v_vfsp
;
2273 vfsp
->vfs_implp
== NULL
|| vfsp
->vfs_femhead
== NULL
) {
2276 (void) VFS_VNSTATE(vfsp
, vp
, VNTRANS_EXISTS
);
2280 vn_invalid(vnode_t
*vp
)
2282 vfs_t
*vfsp
= vp
->v_vfsp
;
2285 vfsp
->vfs_implp
== NULL
|| vfsp
->vfs_femhead
== NULL
) {
2288 (void) VFS_VNSTATE(vfsp
, vp
, VNTRANS_DESTROYED
);
2291 /* Vnode event notification */
2294 vnevent_support(vnode_t
*vp
, caller_context_t
*ct
)
2299 return (fop_vnevent(vp
, VE_SUPPORT
, NULL
, NULL
, ct
));
2303 vnevent_rename_src(vnode_t
*vp
, vnode_t
*dvp
, char *name
, caller_context_t
*ct
)
2305 if (vp
== NULL
|| vp
->v_femhead
== NULL
) {
2308 (void) fop_vnevent(vp
, VE_RENAME_SRC
, dvp
, name
, ct
);
2312 vnevent_rename_dest(vnode_t
*vp
, vnode_t
*dvp
, char *name
,
2313 caller_context_t
*ct
)
2315 if (vp
== NULL
|| vp
->v_femhead
== NULL
) {
2318 (void) fop_vnevent(vp
, VE_RENAME_DEST
, dvp
, name
, ct
);
2322 vnevent_rename_dest_dir(vnode_t
*vp
, caller_context_t
*ct
)
2324 if (vp
== NULL
|| vp
->v_femhead
== NULL
) {
2327 (void) fop_vnevent(vp
, VE_RENAME_DEST_DIR
, NULL
, NULL
, ct
);
2331 vnevent_remove(vnode_t
*vp
, vnode_t
*dvp
, char *name
, caller_context_t
*ct
)
2333 if (vp
== NULL
|| vp
->v_femhead
== NULL
) {
2336 (void) fop_vnevent(vp
, VE_REMOVE
, dvp
, name
, ct
);
2340 vnevent_rmdir(vnode_t
*vp
, vnode_t
*dvp
, char *name
, caller_context_t
*ct
)
2342 if (vp
== NULL
|| vp
->v_femhead
== NULL
) {
2345 (void) fop_vnevent(vp
, VE_RMDIR
, dvp
, name
, ct
);
2349 vnevent_pre_rename_src(vnode_t
*vp
, vnode_t
*dvp
, char *name
,
2350 caller_context_t
*ct
)
2352 if (vp
== NULL
|| vp
->v_femhead
== NULL
) {
2355 (void) fop_vnevent(vp
, VE_PRE_RENAME_SRC
, dvp
, name
, ct
);
2359 vnevent_pre_rename_dest(vnode_t
*vp
, vnode_t
*dvp
, char *name
,
2360 caller_context_t
*ct
)
2362 if (vp
== NULL
|| vp
->v_femhead
== NULL
) {
2365 (void) fop_vnevent(vp
, VE_PRE_RENAME_DEST
, dvp
, name
, ct
);
2369 vnevent_pre_rename_dest_dir(vnode_t
*vp
, vnode_t
*nvp
, char *name
,
2370 caller_context_t
*ct
)
2372 if (vp
== NULL
|| vp
->v_femhead
== NULL
) {
2375 (void) fop_vnevent(vp
, VE_PRE_RENAME_DEST_DIR
, nvp
, name
, ct
);
2379 vnevent_create(vnode_t
*vp
, caller_context_t
*ct
)
2381 if (vp
== NULL
|| vp
->v_femhead
== NULL
) {
2384 (void) fop_vnevent(vp
, VE_CREATE
, NULL
, NULL
, ct
);
2388 vnevent_link(vnode_t
*vp
, caller_context_t
*ct
)
2390 if (vp
== NULL
|| vp
->v_femhead
== NULL
) {
2393 (void) fop_vnevent(vp
, VE_LINK
, NULL
, NULL
, ct
);
2397 vnevent_mountedover(vnode_t
*vp
, caller_context_t
*ct
)
2399 if (vp
== NULL
|| vp
->v_femhead
== NULL
) {
2402 (void) fop_vnevent(vp
, VE_MOUNTEDOVER
, NULL
, NULL
, ct
);
2406 vnevent_truncate(vnode_t
*vp
, caller_context_t
*ct
)
2408 if (vp
== NULL
|| vp
->v_femhead
== NULL
) {
2411 (void) fop_vnevent(vp
, VE_TRUNCATE
, NULL
, NULL
, ct
);
2419 vn_is_readonly(vnode_t
*vp
)
2421 return (vp
->v_vfsp
->vfs_flag
& VFS_RDONLY
);
2425 vn_has_flocks(vnode_t
*vp
)
2427 return (vp
->v_filocks
!= NULL
);
2431 vn_has_mandatory_locks(vnode_t
*vp
, int mode
)
2433 return ((vp
->v_filocks
!= NULL
) && (MANDLOCK(vp
, mode
)));
2437 vn_has_cached_data(vnode_t
*vp
)
2439 return (!list_is_empty(&vp
->v_object
.list
));
2443 * Return 0 if the vnode in question shouldn't be permitted into a zone via
2447 vn_can_change_zones(vnode_t
*vp
)
2453 if (nfs_global_client_only
!= 0)
2457 * We always want to look at the underlying vnode if there is one.
2459 if (fop_realvp(vp
, &rvp
, NULL
) != 0)
2462 * Some pseudo filesystems (including doorfs) don't actually register
2463 * their vfsops_t, so the following may return NULL; we happily let
2464 * such vnodes switch zones.
2466 vswp
= vfs_getvfsswbyvfsops(vfs_getops(rvp
->v_vfsp
));
2468 if (vswp
->vsw_flag
& VSW_NOTZONESAFE
)
2470 vfs_unrefvfssw(vswp
);
2476 * Return nonzero if the vnode is a mount point, zero if not.
2479 vn_ismntpt(vnode_t
*vp
)
2481 return (vp
->v_vfsmountedhere
!= NULL
);
2484 /* Retrieve the vfs (if any) mounted on this vnode */
2486 vn_mountedvfs(vnode_t
*vp
)
2488 return (vp
->v_vfsmountedhere
);
2492 * Return nonzero if the vnode is referenced by the dnlc, zero if not.
2495 vn_in_dnlc(vnode_t
*vp
)
2497 return (vp
->v_count_dnlc
> 0);
2501 * vn_has_other_opens() checks whether a particular file is opened by more than
2502 * just the caller and whether the open is for read and/or write.
2503 * This routine is for calling after the caller has already called fop_open()
2504 * and the caller wishes to know if they are the only one with it open for
2505 * the mode(s) specified.
2507 * Vnode counts are only kept on regular files (v_type=VREG).
2510 vn_has_other_opens(struct vnode
*vp
, v_mode_t mode
)
2516 if (vp
->v_wrcnt
> 1)
2520 if ((vp
->v_rdcnt
> 1) || (vp
->v_wrcnt
> 1))
2524 if ((vp
->v_rdcnt
> 1) && (vp
->v_wrcnt
> 1))
2528 if (vp
->v_rdcnt
> 1)
2537 * vn_is_opened() checks whether a particular file is opened and
2538 * whether the open is for read and/or write.
2540 * Vnode counts are only kept on regular files (v_type=VREG).
2542 bool vn_is_opened(struct vnode
*vp
, v_mode_t mode
)
2552 if (vp
->v_rdcnt
&& vp
->v_wrcnt
)
2556 if (vp
->v_rdcnt
|| vp
->v_wrcnt
)
/*
 * vn_is_mapped() checks whether a particular file is mapped and whether
 * the file is mapped read and/or write.
 */
bool
vn_is_mapped(struct vnode *vp, v_mode_t mode)
{
	ASSERT(vp != NULL);

#if SIZEOF_VOIDP == 4
	/*
	 * The atomic_add_64_nv functions force atomicity in the
	 * case of 32 bit architectures. Otherwise the 64 bit values
	 * require two fetches. The value of the fields may be
	 * (potentially) changed between the first fetch and the
	 * second.
	 */
	switch (mode) {
	case V_WRITE:
		if (atomic_add_64_nv((&(vp->v_mmap_write)), 0))
			return (true);
		break;
	case V_RDANDWR:
		if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) &&
		    (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
			return (true);
		break;
	case V_RDORWR:
		if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) ||
		    (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
			return (true);
		break;
	case V_RDONLY:
		if (atomic_add_64_nv((&(vp->v_mmap_read)), 0))
			return (true);
		break;
	default:
		break;
	}
#else
	switch (mode) {
	case V_WRITE:
		if (vp->v_mmap_write)
			return (true);
		break;
	case V_RDANDWR:
		if (vp->v_mmap_read && vp->v_mmap_write)
			return (true);
		break;
	case V_RDORWR:
		if (vp->v_mmap_read || vp->v_mmap_write)
			return (true);
		break;
	case V_RDONLY:
		if (vp->v_mmap_read)
			return (true);
		break;
	default:
		break;
	}
#endif

	return (false);
}
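/*
 * A minimal sketch of the atomic-read idiom used above, assuming a 32-bit
 * build: adding zero with atomic_add_64_nv() returns the (unchanged) value
 * while guaranteeing that the two 32-bit halves are fetched consistently.
 *
 *	static uint64_t
 *	read64_atomically(volatile uint64_t *p)
 *	{
 *		return (atomic_add_64_nv(p, 0));
 *	}
 *
 * On 64-bit kernels a plain load of a naturally aligned uint64_t is already
 * atomic, which is why the second set of cases reads the fields directly.
 */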
/*
 * Set the operations vector for a vnode.
 */
void
vn_setops(struct vnode *vnode, const struct vnodeops *ops)
{
	vnode->v_op = ops;
}

/*
 * Retrieve the operations vector for a vnode.
 */
const struct vnodeops *
vn_getops(struct vnode *vnode)
{
	return (vnode->v_op);
}

/*
 * Returns non-zero (1) if the vnodeops matches that of the vnode.
 * Returns zero (0) if not.
 */
int
vn_matchops(struct vnode *vp, const struct vnodeops *vnodeops)
{
	return (vn_getops(vp) == vnodeops);
}

/*
 * fs_new_caller_id() needs to return a unique ID on a given local system.
 * The IDs do not need to survive across reboots. These are primarily
 * used so that (FEM) monitors can detect particular callers (such as
 * the NFS server) to a given vnode/vfs operation.
 */
u_longlong_t
fs_new_caller_id()
{
	static uint64_t next_caller_id = 0LL; /* First call returns 1 */

	return ((u_longlong_t)atomic_inc_64_nv(&next_caller_id));
}
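/*
 * Hedged usage sketch: a FEM monitor that wants to ignore operations it
 * initiated itself might stash an ID from fs_new_caller_id() and compare it
 * with the cc_caller_id field of the caller_context_t passed to incoming
 * operations.  Treat this fragment as illustrative rather than normative.
 *
 *	static u_longlong_t my_id;
 *
 *	my_id = fs_new_caller_id();
 *	...
 *	if (ct != NULL && ct->cc_caller_id == my_id)
 *		return (0);	... skip our own re-entrant call ...
 */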
/*
 * The value stored in v_path is relative to rootdir, located in the global
 * zone. Zones or chroot environments which reside deeper inside the VFS
 * hierarchy will have a relative view of MAXPATHLEN since they are unaware of
 * what lies below their perceived root. In order to keep v_path usable for
 * these child environments, its allocations are allowed to exceed MAXPATHLEN.
 *
 * An upper bound of max_vnode_path is placed upon v_path allocations to
 * prevent the system from going too wild at the behest of pathological
 * behavior from the operator.
 */
size_t max_vnode_path = 4 * MAXPATHLEN;

static void
vn_clearpath(vnode_t *vp, hrtime_t compare_stamp)
{
	char *buf;

	mutex_enter(&vp->v_lock);
	/*
	 * If the snapshot of v_path_stamp passed in via compare_stamp does not
	 * match the present value on the vnode, it indicates that subsequent
	 * changes have occurred. The v_path value is not cleared in this case
	 * since the new value may be valid.
	 */
	if (compare_stamp != 0 && vp->v_path_stamp != compare_stamp) {
		mutex_exit(&vp->v_lock);
		return;
	}
	buf = vp->v_path;
	vp->v_path = vn_vpath_empty;
	vp->v_path_stamp = 0;
	mutex_exit(&vp->v_lock);
	if (buf != vn_vpath_empty) {
		kmem_free(buf, strlen(buf) + 1);
	}
}
static void
vn_setpath_common(vnode_t *pvp, vnode_t *vp, const char *name, size_t len,
    boolean_t is_rename)
{
	char *buf, *oldbuf;
	hrtime_t pstamp;
	size_t baselen, buflen = 0;

	/* Handle the vn_setpath_str case. */
	if (pvp == NULL) {
		if (len + 1 > max_vnode_path) {
			DTRACE_PROBE4(vn__setpath__too__long, vnode_t *, pvp,
			    vnode_t *, vp, char *, name, size_t, len + 1);
			return;
		}
		buf = kmem_alloc(len + 1, KM_SLEEP);
		bcopy(name, buf, len);
		buf[len] = '\0';

		mutex_enter(&vp->v_lock);
		oldbuf = vp->v_path;
		vp->v_path = buf;
		vp->v_path_stamp = gethrtime();
		mutex_exit(&vp->v_lock);
		if (oldbuf != vn_vpath_empty) {
			kmem_free(oldbuf, strlen(oldbuf) + 1);
		}
		return;
	}

	/* Take snapshot of parent dir */
	mutex_enter(&pvp->v_lock);

	if ((pvp->v_flag & VTRAVERSE) != 0) {
		/*
		 * When the parent vnode has VTRAVERSE set in its flags, normal
		 * assumptions about v_path calculation no longer apply. The
		 * primary situation where this occurs is via the VFS tricks
		 * which procfs plays in order to allow /proc/PID/(root|cwd) to
		 * yield meaningful results.
		 *
		 * When this flag is set, v_path on the child must not be
		 * updated since the calculated value is likely to be
		 * incorrect, given the current context.
		 */
		mutex_exit(&pvp->v_lock);
		return;
	}

retrybuf:
	if (pvp->v_path == vn_vpath_empty) {
		/*
		 * Without v_path from the parent directory, generating a child
		 * path from the name is impossible.
		 */
		if (len > 0) {
			pstamp = pvp->v_path_stamp;
			mutex_exit(&pvp->v_lock);
			vn_clearpath(vp, pstamp);
			return;
		}

		/*
		 * The only feasible case here is where a NUL lookup is being
		 * performed on rootdir prior to its v_path being populated.
		 */
		ASSERT(pvp->v_path_stamp == 0);
		baselen = 0;
		pstamp = 0;
	} else {
		pstamp = pvp->v_path_stamp;
		baselen = strlen(pvp->v_path);
		/* ignore a trailing slash if present */
		if (pvp->v_path[baselen - 1] == '/') {
			/* This should only be the case for rootdir */
			ASSERT(baselen == 1 && pvp == rootdir);
			baselen--;
		}
	}
	mutex_exit(&pvp->v_lock);

	if (buflen != 0) {
		/* Free the existing (mis-sized) buffer in case of retry */
		kmem_free(buf, buflen);
	}
	/* base, '/', name and trailing NUL */
	buflen = baselen + len + 2;
	if (buflen > max_vnode_path) {
		DTRACE_PROBE4(vn__setpath__too__long, vnode_t *, pvp,
		    vnode_t *, vp, char *, name, size_t, buflen);
		return;
	}
	buf = kmem_alloc(buflen, KM_SLEEP);

	mutex_enter(&pvp->v_lock);
	if (pvp->v_path_stamp != pstamp) {
		size_t vlen;

		/*
		 * Since v_path_stamp changed on the parent, it is likely that
		 * v_path has been altered as well. If the length does not
		 * exactly match what was previously measured, the buffer
		 * allocation must be repeated for proper sizing.
		 */
		if (pvp->v_path == vn_vpath_empty) {
			/* Give up if the parent lacks v_path */
			mutex_exit(&pvp->v_lock);
			kmem_free(buf, buflen);
			return;
		}
		vlen = strlen(pvp->v_path);
		if (pvp->v_path[vlen - 1] == '/') {
			vlen--;
		}
		if (vlen != baselen) {
			goto retrybuf;
		}
	}
	bcopy(pvp->v_path, buf, baselen);
	mutex_exit(&pvp->v_lock);

	buf[baselen] = '/';
	baselen++;
	bcopy(name, &buf[baselen], len + 1);

	mutex_enter(&vp->v_lock);
	if (vp->v_path_stamp == 0) {
		/* never-visited vnode can inherit stamp from parent */
		ASSERT(vp->v_path == vn_vpath_empty);
		vp->v_path_stamp = pstamp;
		vp->v_path = buf;
		mutex_exit(&vp->v_lock);
	} else if (vp->v_path_stamp < pstamp || is_rename) {
		/*
		 * Install the updated path and stamp, ensuring that the v_path
		 * pointer is valid at all times for dtrace.
		 */
		oldbuf = vp->v_path;
		vp->v_path = buf;
		vp->v_path_stamp = gethrtime();
		mutex_exit(&vp->v_lock);
		kmem_free(oldbuf, strlen(oldbuf) + 1);
	} else {
		/*
		 * If the timestamp matches or is greater, it means another
		 * thread performed the update first while locks were dropped
		 * here to make the allocation. We defer to the newer value.
		 */
		mutex_exit(&vp->v_lock);
		kmem_free(buf, buflen);
	}
	ASSERT(MUTEX_NOT_HELD(&vp->v_lock));
}
void
vn_updatepath(vnode_t *pvp, vnode_t *vp, const char *name)
{
	size_t len;

	/*
	 * If the parent is older or empty, there's nothing further to do.
	 */
	if (pvp->v_path == vn_vpath_empty ||
	    pvp->v_path_stamp <= vp->v_path_stamp) {
		return;
	}

	/*
	 * Given the lack of appropriate context, meaningful updates to v_path
	 * cannot be made during lookups for the '.' or '..' entries.
	 */
	len = strlen(name);
	if (len == 0 || (len == 1 && name[0] == '.') ||
	    (len == 2 && name[0] == '.' && name[1] == '.')) {
		return;
	}

	vn_setpath_common(pvp, vp, name, len, B_FALSE);
}

/*
 * Given a starting vnode and a path, updates the path in the target vnode in
 * a safe manner. If the vnode already has path information embedded, then the
 * cached path is left untouched.
 */
/* ARGSUSED */
void
vn_setpath(vnode_t *rootvp, vnode_t *pvp, vnode_t *vp, const char *name,
    size_t len)
{
	vn_setpath_common(pvp, vp, name, len, B_FALSE);
}

/*
 * Sets the path to the vnode to be the given string, regardless of current
 * context. The string must be a complete path from rootdir. This is only used
 * by fsop_root() for setting the path based on the mountpoint.
 */
void
vn_setpath_str(vnode_t *vp, const char *str, size_t len)
{
	vn_setpath_common(NULL, vp, str, len, B_FALSE);
}

/*
 * Called from within filesystem's vop_rename() to handle renames once the
 * target vnode is available.
 */
void
vn_renamepath(vnode_t *pvp, vnode_t *vp, const char *name, size_t len)
{
	vn_setpath_common(pvp, vp, name, len, B_TRUE);
}
/*
 * Similar to vn_setpath_str(), this function sets the path of the destination
 * vnode to be the same as the source vnode.
 */
void
vn_copypath(struct vnode *src, struct vnode *dst)
{
	char *buf;
	hrtime_t stamp;
	size_t buflen;

	mutex_enter(&src->v_lock);
	if (src->v_path == vn_vpath_empty) {
		mutex_exit(&src->v_lock);
		return;
	}
	buflen = strlen(src->v_path) + 1;
	mutex_exit(&src->v_lock);

	buf = kmem_alloc(buflen, KM_SLEEP);

	mutex_enter(&src->v_lock);
	if (src->v_path == vn_vpath_empty ||
	    strlen(src->v_path) + 1 != buflen) {
		mutex_exit(&src->v_lock);
		kmem_free(buf, buflen);
		return;
	}
	bcopy(src->v_path, buf, buflen);
	stamp = src->v_path_stamp;
	mutex_exit(&src->v_lock);

	mutex_enter(&dst->v_lock);
	if (dst->v_path != vn_vpath_empty) {
		mutex_exit(&dst->v_lock);
		kmem_free(buf, buflen);
		return;
	}
	dst->v_path = buf;
	dst->v_path_stamp = stamp;
	mutex_exit(&dst->v_lock);
}
/*
 * XXX Private interface for segvn routines that handle vnode
 * large page segments.
 *
 * return 1 if vp's file system fop_pageio() implementation
 * can be safely used instead of fop_getpage() for handling
 * pagefaults against regular non swap files. fop_pageio()
 * interface is considered safe here if its implementation
 * is very close to fop_getpage() implementation.
 * e.g. It zeroes out the part of the page beyond EOF. Doesn't
 * panic if there are file holes but instead returns an error.
 * Doesn't assume file won't be changed by user writes, etc.
 *
 * return 0 otherwise.
 *
 * For now allow segvn to only use fop_pageio() with ufs and nfs.
 */
int
vn_vmpss_usepageio(vnode_t *vp)
{
	vfs_t *vfsp = vp->v_vfsp;
	char *fsname = vfssw[vfsp->vfs_fstype].vsw_name;
	char *pageio_ok_fss[] = {"ufs", "nfs", NULL};
	char **fsok = pageio_ok_fss;

	if (fsname == NULL) {
		return (0);
	}

	for (; *fsok; fsok++) {
		if (strcmp(*fsok, fsname) == 0) {
			return (1);
		}
	}
	return (0);
}
/* VOP_XXX() macros call the corresponding fop_xxx() function */
int
fop_open(vnode_t **vpp, int mode, cred_t *cr, caller_context_t *ct)
{
	int ret;
	vnode_t *vp = *vpp;

	VN_HOLD(vp);
	/*
	 * Adding to the vnode counts before calling open
	 * avoids the need for a mutex. It circumvents a race
	 * condition where a query made on the vnode counts results in a
	 * false negative. The inquirer goes away believing the file is
	 * not open when there is an open on the file already under way.
	 *
	 * The counts are meant to prevent NFS from granting a delegation
	 * when it would be dangerous to do so.
	 *
	 * The vnode counts are only kept on regular files
	 */
	if ((*vpp)->v_type == VREG) {
		if (mode & FREAD)
			atomic_inc_32(&(*vpp)->v_rdcnt);
		if (mode & FWRITE)
			atomic_inc_32(&(*vpp)->v_wrcnt);
	}

	VOPXID_MAP_CR(vp, cr);

	ret = fop_open_dispatch(vpp, mode, cr, ct, true);

	if (ret) {
		/*
		 * Use the saved vp just in case the vnode ptr got trashed
		 * by the open.
		 */
		VOPSTATS_UPDATE(vp, open);
		if ((vp->v_type == VREG) && (mode & FREAD))
			atomic_dec_32(&vp->v_rdcnt);
		if ((vp->v_type == VREG) && (mode & FWRITE))
			atomic_dec_32(&vp->v_wrcnt);
	} else {
		/*
		 * Some filesystems will return a different vnode,
		 * but the same path was still used to open it.
		 * So if we do change the vnode and need to
		 * copy over the path, do so here, rather than special
		 * casing each filesystem. Adjust the vnode counts to
		 * reflect the vnode switch.
		 */
		VOPSTATS_UPDATE(*vpp, open);
		if (*vpp != vp && *vpp != NULL) {
			vn_copypath(vp, *vpp);
			if (((*vpp)->v_type == VREG) && (mode & FREAD))
				atomic_inc_32(&(*vpp)->v_rdcnt);
			if ((vp->v_type == VREG) && (mode & FREAD))
				atomic_dec_32(&vp->v_rdcnt);
			if (((*vpp)->v_type == VREG) && (mode & FWRITE))
				atomic_inc_32(&(*vpp)->v_wrcnt);
			if ((vp->v_type == VREG) && (mode & FWRITE))
				atomic_dec_32(&vp->v_wrcnt);
		}
	}
	VN_RELE(vp);
	return (ret);
}
int
fop_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
    caller_context_t *ct)
{
	int err;

	VOPXID_MAP_CR(vp, cr);

	err = fop_close_dispatch(vp, flag, count, offset, cr, ct, true);

	VOPSTATS_UPDATE(vp, close);
	/*
	 * Check passed in count to handle possible dups. Vnode counts are only
	 * kept on regular files
	 */
	if ((vp->v_type == VREG) && (count == 1)) {
		if (flag & FREAD) {
			ASSERT(vp->v_rdcnt > 0);
			atomic_dec_32(&vp->v_rdcnt);
		}
		if (flag & FWRITE) {
			ASSERT(vp->v_wrcnt > 0);
			atomic_dec_32(&vp->v_wrcnt);
		}
	}
	return (err);
}

int
fop_read(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr,
    caller_context_t *ct)
{
	int err;
	ssize_t resid_start = uiop->uio_resid;

	VOPXID_MAP_CR(vp, cr);

	err = fop_read_dispatch(vp, uiop, ioflag, cr, ct, true);

	VOPSTATS_UPDATE_IO(vp, read,
	    read_bytes, (resid_start - uiop->uio_resid));
	return (err);
}

int
fop_write(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr,
    caller_context_t *ct)
{
	int err;
	ssize_t resid_start = uiop->uio_resid;

	VOPXID_MAP_CR(vp, cr);

	err = fop_write_dispatch(vp, uiop, ioflag, cr, ct, true);

	VOPSTATS_UPDATE_IO(vp, write,
	    write_bytes, (resid_start - uiop->uio_resid));
	return (err);
}
int
fop_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr,
    int *rvalp, caller_context_t *ct)
{
	int err;

	VOPXID_MAP_CR(vp, cr);

	err = fop_ioctl_dispatch(vp, cmd, arg, flag, cr, rvalp, ct, true);

	VOPSTATS_UPDATE(vp, ioctl);
	return (err);
}

int
fop_setfl(vnode_t *vp, int oflags, int nflags, cred_t *cr,
    caller_context_t *ct)
{
	int err;

	VOPXID_MAP_CR(vp, cr);

	err = fop_setfl_dispatch(vp, oflags, nflags, cr, ct, true);

	VOPSTATS_UPDATE(vp, setfl);
	return (err);
}

int
fop_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
    caller_context_t *ct)
{
	int err;

	VOPXID_MAP_CR(vp, cr);

	/*
	 * If this file system doesn't understand the xvattr extensions
	 * then turn off the xvattr bit.
	 */
	if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
		vap->va_mask &= ~VATTR_XVATTR;
	}

	/*
	 * We're only allowed to skip the ACL check iff we used a 32 bit
	 * ACE mask with fop_access() to determine permissions.
	 */
	if ((flags & ATTR_NOACLCHECK) &&
	    vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0)
		return (EINVAL);

	err = fop_getattr_dispatch(vp, vap, flags, cr, ct, true);

	VOPSTATS_UPDATE(vp, getattr);
	return (err);
}

int
fop_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
    caller_context_t *ct)
{
	int err;

	VOPXID_MAP_CR(vp, cr);

	/*
	 * If this file system doesn't understand the xvattr extensions
	 * then turn off the xvattr bit.
	 */
	if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
		vap->va_mask &= ~VATTR_XVATTR;
	}

	/*
	 * We're only allowed to skip the ACL check iff we used a 32 bit
	 * ACE mask with fop_access() to determine permissions.
	 */
	if ((flags & ATTR_NOACLCHECK) &&
	    vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0)
		return (EINVAL);

	err = fop_setattr_dispatch(vp, vap, flags, cr, ct, true);

	VOPSTATS_UPDATE(vp, setattr);
	return (err);
}
int
fop_access(vnode_t *vp, int mode, int flags, cred_t *cr,
    caller_context_t *ct)
{
	int err;

	if ((flags & V_ACE_MASK) &&
	    vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
		return (EINVAL);
	}

	VOPXID_MAP_CR(vp, cr);

	err = fop_access_dispatch(vp, mode, flags, cr, ct, true);

	VOPSTATS_UPDATE(vp, access);
	return (err);
}

int
fop_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, int flags,
    vnode_t *rdir, cred_t *cr, caller_context_t *ct,
    int *deflags,		/* Returned per-dirent flags */
    pathname_t *ppnp)		/* Returned case-preserved name in directory */
{
	int ret;

	/*
	 * If this file system doesn't support case-insensitive access
	 * and said access is requested, fail quickly. It is required
	 * that if the vfs supports case-insensitive lookup, it also
	 * supports extended dirent flags.
	 */
	if (flags & FIGNORECASE &&
	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
		return (EINVAL);

	VOPXID_MAP_CR(dvp, cr);

	if ((flags & LOOKUP_XATTR) && (flags & LOOKUP_HAVE_SYSATTR_DIR) == 0) {
		ret = xattr_dir_lookup(dvp, vpp, flags, cr);
	} else {
		ret = fop_lookup_dispatch(dvp, nm, vpp, pnp, flags, rdir, cr,
		    ct, deflags, ppnp, true);
	}

	if (ret == 0 && *vpp) {
		VOPSTATS_UPDATE(*vpp, lookup);
		vn_updatepath(dvp, *vpp, nm);
	}

	return (ret);
}
int
fop_create(vnode_t *dvp, char *name, vattr_t *vap, vcexcl_t excl, int mode,
    vnode_t **vpp, cred_t *cr, int flags, caller_context_t *ct,
    vsecattr_t *vsecp)		/* ACL to set during create */
{
	int ret;

	if (vsecp != NULL &&
	    vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
		return (EINVAL);
	}
	/*
	 * If this file system doesn't support case-insensitive access
	 * and said access is requested, fail quickly.
	 */
	if (flags & FIGNORECASE &&
	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
		return (EINVAL);

	VOPXID_MAP_CR(dvp, cr);

	ret = fop_create_dispatch(dvp, name, vap, excl, mode, vpp, cr, flags,
	    ct, vsecp, true);

	if (ret == 0 && *vpp) {
		VOPSTATS_UPDATE(*vpp, create);
		vn_updatepath(dvp, *vpp, name);
	}

	return (ret);
}

int
fop_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct,
    int flags)
{
	int err;

	/*
	 * If this file system doesn't support case-insensitive access
	 * and said access is requested, fail quickly.
	 */
	if (flags & FIGNORECASE &&
	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
		return (EINVAL);

	VOPXID_MAP_CR(dvp, cr);

	err = fop_remove_dispatch(dvp, nm, cr, ct, flags, true);

	VOPSTATS_UPDATE(dvp, remove);
	return (err);
}

int
fop_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr,
    caller_context_t *ct, int flags)
{
	int err;

	/*
	 * If the target file system doesn't support case-insensitive access
	 * and said access is requested, fail quickly.
	 */
	if (flags & FIGNORECASE &&
	    (vfs_has_feature(tdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
	    vfs_has_feature(tdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
		return (EINVAL);

	VOPXID_MAP_CR(tdvp, cr);

	err = fop_link_dispatch(tdvp, svp, tnm, cr, ct, flags, true);

	VOPSTATS_UPDATE(tdvp, link);
	return (err);
}
int
fop_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr,
    caller_context_t *ct, int flags)
{
	int err;

	/*
	 * If the file system involved does not support
	 * case-insensitive access and said access is requested, fail
	 * quickly.
	 */
	if (flags & FIGNORECASE &&
	    ((vfs_has_feature(sdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
	    vfs_has_feature(sdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0)))
		return (EINVAL);

	VOPXID_MAP_CR(tdvp, cr);

	err = fop_rename_dispatch(sdvp, snm, tdvp, tnm, cr, ct, flags, true);

	VOPSTATS_UPDATE(sdvp, rename);
	return (err);
}

int
fop_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp,
    cred_t *cr, caller_context_t *ct, int flags,
    vsecattr_t *vsecp)		/* ACL to set during create */
{
	int ret;

	if (vsecp != NULL &&
	    vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
		return (EINVAL);
	}
	/*
	 * If this file system doesn't support case-insensitive access
	 * and said access is requested, fail quickly.
	 */
	if (flags & FIGNORECASE &&
	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
		return (EINVAL);

	VOPXID_MAP_CR(dvp, cr);

	ret = fop_mkdir_dispatch(dvp, dirname, vap, vpp, cr, ct, flags, vsecp,
	    true);

	if (ret == 0 && *vpp) {
		VOPSTATS_UPDATE(*vpp, mkdir);
		vn_updatepath(dvp, *vpp, dirname);
	}

	return (ret);
}

int
fop_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
    caller_context_t *ct, int flags)
{
	int err;

	/*
	 * If this file system doesn't support case-insensitive access
	 * and said access is requested, fail quickly.
	 */
	if (flags & FIGNORECASE &&
	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
		return (EINVAL);

	VOPXID_MAP_CR(dvp, cr);

	err = fop_rmdir_dispatch(dvp, nm, cdir, cr, ct, flags, true);

	VOPSTATS_UPDATE(dvp, rmdir);
	return (err);
}
int
fop_readdir(vnode_t *vp, uio_t *uiop, cred_t *cr, int *eofp,
    caller_context_t *ct, int flags)
{
	int err;
	ssize_t resid_start = uiop->uio_resid;

	/*
	 * If this file system doesn't support retrieving directory
	 * entry flags and said access is requested, fail quickly.
	 */
	if (flags & V_RDDIR_ENTFLAGS &&
	    vfs_has_feature(vp->v_vfsp, VFSFT_DIRENTFLAGS) == 0)
		return (EINVAL);

	VOPXID_MAP_CR(vp, cr);

	err = fop_readdir_dispatch(vp, uiop, cr, eofp, ct, flags, true);

	VOPSTATS_UPDATE_IO(vp, readdir,
	    readdir_bytes, (resid_start - uiop->uio_resid));
	return (err);
}

int
fop_symlink(vnode_t *dvp, char *linkname, vattr_t *vap, char *target,
    cred_t *cr, caller_context_t *ct, int flags)
{
	int err;
	xvattr_t xvattr;

	/*
	 * If this file system doesn't support case-insensitive access
	 * and said access is requested, fail quickly.
	 */
	if (flags & FIGNORECASE &&
	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
		return (EINVAL);

	VOPXID_MAP_CR(dvp, cr);

	/* check for reparse point */
	if ((vfs_has_feature(dvp->v_vfsp, VFSFT_REPARSE)) &&
	    (strncmp(target, FS_REPARSE_TAG_STR,
	    strlen(FS_REPARSE_TAG_STR)) == 0)) {
		if (!fs_reparse_mark(target, vap, &xvattr))
			vap = (vattr_t *)&xvattr;
	}

	err = fop_symlink_dispatch(dvp, linkname, vap, target, cr, ct, flags,
	    true);

	VOPSTATS_UPDATE(dvp, symlink);
	return (err);
}
int
fop_readlink(vnode_t *vp, uio_t *uiop, cred_t *cr, caller_context_t *ct)
{
	int err;

	VOPXID_MAP_CR(vp, cr);

	err = fop_readlink_dispatch(vp, uiop, cr, ct, true);

	VOPSTATS_UPDATE(vp, readlink);
	return (err);
}

int
fop_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
{
	int err;

	VOPXID_MAP_CR(vp, cr);

	err = fop_fsync_dispatch(vp, syncflag, cr, ct, true);

	VOPSTATS_UPDATE(vp, fsync);
	return (err);
}

void
fop_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
{
	/* Need to update stats before vop call since we may lose the vnode */
	VOPSTATS_UPDATE(vp, inactive);

	VOPXID_MAP_CR(vp, cr);

	fop_inactive_dispatch(vp, cr, ct, true);
}

int
fop_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
{
	int err;

	err = fop_fid_dispatch(vp, fidp, ct, true);

	VOPSTATS_UPDATE(vp, fid);
	return (err);
}

int
fop_rwlock(vnode_t *vp, int write_lock, caller_context_t *ct)
{
	int ret;

	ret = fop_rwlock_dispatch(vp, write_lock, ct, true);

	VOPSTATS_UPDATE(vp, rwlock);
	return (ret);
}

void
fop_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ct)
{
	fop_rwunlock_dispatch(vp, write_lock, ct, true);

	VOPSTATS_UPDATE(vp, rwunlock);
}

int
fop_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
{
	int err;

	err = fop_seek_dispatch(vp, ooff, noffp, ct, true);

	VOPSTATS_UPDATE(vp, seek);
	return (err);
}

int
fop_cmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct)
{
	int err;

	err = fop_cmp_dispatch(vp1, vp2, ct, true);

	VOPSTATS_UPDATE(vp1, cmp);
	return (err);
}
int
fop_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
    struct flk_callback *flk_cbp, cred_t *cr, caller_context_t *ct)
{
	int err;

	VOPXID_MAP_CR(vp, cr);

	err = fop_frlock_dispatch(vp, cmd, bfp, flag, offset, flk_cbp, cr,
	    ct, true);

	VOPSTATS_UPDATE(vp, frlock);
	return (err);
}

int
fop_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
    cred_t *cr, caller_context_t *ct)
{
	int err;

	VOPXID_MAP_CR(vp, cr);

	err = fop_space_dispatch(vp, cmd, bfp, flag, offset, cr, ct, true);

	VOPSTATS_UPDATE(vp, space);
	return (err);
}

int
fop_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct)
{
	int err;

	err = fop_realvp_dispatch(vp, vpp, ct, true);

	VOPSTATS_UPDATE(vp, realvp);
	return (err);
}

int
fop_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
    page_t **plarr, size_t plsz, struct seg *seg, caddr_t addr,
    enum seg_rw rw, cred_t *cr, caller_context_t *ct)
{
	int err;

	VOPXID_MAP_CR(vp, cr);

	err = fop_getpage_dispatch(vp, off, len, protp, plarr, plsz, seg,
	    addr, rw, cr, ct, true);

	VOPSTATS_UPDATE(vp, getpage);
	return (err);
}

int
fop_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
    caller_context_t *ct)
{
	int err;

	VOPXID_MAP_CR(vp, cr);

	err = fop_putpage_dispatch(vp, off, len, flags, cr, ct, true);

	VOPSTATS_UPDATE(vp, putpage);
	return (err);
}

int
fop_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, size_t len,
    uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
    caller_context_t *ct)
{
	int err;

	VOPXID_MAP_CR(vp, cr);

	err = fop_map_dispatch(vp, off, as, addrp, len, prot, maxprot,
	    flags, cr, ct, true);

	VOPSTATS_UPDATE(vp, map);
	return (err);
}
int
fop_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, size_t len,
    uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
    caller_context_t *ct)
{
	int error;
	u_longlong_t delta;

	VOPXID_MAP_CR(vp, cr);

	error = fop_addmap_dispatch(vp, off, as, addr, len, prot, maxprot,
	    flags, cr, ct, true);

	if ((!error) && (vp->v_type == VREG)) {
		delta = (u_longlong_t)btopr(len);
		/*
		 * If file is declared MAP_PRIVATE, it can't be written back
		 * even if open for write. Handle as read.
		 */
		if (flags & MAP_PRIVATE) {
			atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
			    (int64_t)delta);
		} else {
			/*
			 * atomic_add_64 forces the fetch of a 64 bit value to
			 * be atomic on 32 bit machines
			 */
			if (maxprot & PROT_WRITE)
				atomic_add_64(
				    (uint64_t *)(&(vp->v_mmap_write)),
				    (int64_t)delta);
			if (maxprot & PROT_READ)
				atomic_add_64(
				    (uint64_t *)(&(vp->v_mmap_read)),
				    (int64_t)delta);
			if (maxprot & PROT_EXEC)
				atomic_add_64(
				    (uint64_t *)(&(vp->v_mmap_read)),
				    (int64_t)delta);
		}
	}
	VOPSTATS_UPDATE(vp, addmap);
	return (error);
}

int
fop_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, size_t len,
    uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
    caller_context_t *ct)
{
	int error;
	u_longlong_t delta;

	VOPXID_MAP_CR(vp, cr);

	error = fop_delmap_dispatch(vp, off, as, addr, len, prot, maxprot,
	    flags, cr, ct, true);

	/*
	 * NFS calls into delmap twice, the first time
	 * it simply establishes a callback mechanism and returns EAGAIN
	 * while the real work is being done upon the second invocation.
	 * We have to detect this here and only decrement the counts upon
	 * the second delmap request.
	 */
	if ((error != EAGAIN) && (vp->v_type == VREG)) {
		delta = (u_longlong_t)btopr(len);

		if (flags & MAP_PRIVATE) {
			atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
			    (int64_t)(-delta));
		} else {
			/*
			 * atomic_add_64 forces the fetch of a 64 bit value
			 * to be atomic on 32 bit machines
			 */
			if (maxprot & PROT_WRITE)
				atomic_add_64(
				    (uint64_t *)(&(vp->v_mmap_write)),
				    (int64_t)(-delta));
			if (maxprot & PROT_READ)
				atomic_add_64(
				    (uint64_t *)(&(vp->v_mmap_read)),
				    (int64_t)(-delta));
			if (maxprot & PROT_EXEC)
				atomic_add_64(
				    (uint64_t *)(&(vp->v_mmap_read)),
				    (int64_t)(-delta));
		}
	}
	VOPSTATS_UPDATE(vp, delmap);
	return (error);
}
int
fop_poll(vnode_t *vp, short events, int anyyet, short *reventsp,
    struct pollhead **phpp, caller_context_t *ct)
{
	int err;

	err = fop_poll_dispatch(vp, events, anyyet, reventsp, phpp, ct, true);

	VOPSTATS_UPDATE(vp, poll);
	return (err);
}

int
fop_dump(vnode_t *vp, caddr_t addr, offset_t lbdn, offset_t dblks,
    caller_context_t *ct)
{
	int err;

	/* ensure lbdn and dblks can be passed safely to bdev_dump */
	if ((lbdn != (daddr_t)lbdn) || (dblks != (int)dblks))
		return (EINVAL);

	err = fop_dump_dispatch(vp, addr, lbdn, dblks, ct, true);

	VOPSTATS_UPDATE(vp, dump);
	return (err);
}

int
fop_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
    caller_context_t *ct)
{
	int err;

	VOPXID_MAP_CR(vp, cr);

	err = fop_pathconf_dispatch(vp, cmd, valp, cr, ct, true);

	VOPSTATS_UPDATE(vp, pathconf);
	return (err);
}

int
fop_pageio(vnode_t *vp, struct page *pp, u_offset_t io_off, size_t io_len,
    int flags, cred_t *cr, caller_context_t *ct)
{
	int err;

	VOPXID_MAP_CR(vp, cr);

	err = fop_pageio_dispatch(vp, pp, io_off, io_len, flags, cr, ct, true);

	VOPSTATS_UPDATE(vp, pageio);
	return (err);
}

int
fop_dumpctl(vnode_t *vp, int action, offset_t *blkp, caller_context_t *ct)
{
	int err;

	err = fop_dumpctl_dispatch(vp, action, blkp, ct, true);

	VOPSTATS_UPDATE(vp, dumpctl);
	return (err);
}

void
fop_dispose(vnode_t *vp, page_t *pp, int flag, int dn, cred_t *cr,
    caller_context_t *ct)
{
	/* Must do stats first since it's possible to lose the vnode */
	VOPSTATS_UPDATE(vp, dispose);

	VOPXID_MAP_CR(vp, cr);

	fop_dispose_dispatch(vp, pp, flag, dn, cr, ct, true);
}
int
fop_setsecattr(vnode_t *vp, vsecattr_t *vsap, int flag, cred_t *cr,
    caller_context_t *ct)
{
	int err;

	VOPXID_MAP_CR(vp, cr);

	/*
	 * We're only allowed to skip the ACL check iff we used a 32 bit
	 * ACE mask with fop_access() to determine permissions.
	 */
	if ((flag & ATTR_NOACLCHECK) &&
	    vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
		return (EINVAL);
	}

	err = fop_setsecattr_dispatch(vp, vsap, flag, cr, ct, true);

	VOPSTATS_UPDATE(vp, setsecattr);
	return (err);
}

int
fop_getsecattr(vnode_t *vp, vsecattr_t *vsap, int flag, cred_t *cr,
    caller_context_t *ct)
{
	int err;

	/*
	 * We're only allowed to skip the ACL check iff we used a 32 bit
	 * ACE mask with fop_access() to determine permissions.
	 */
	if ((flag & ATTR_NOACLCHECK) &&
	    vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
		return (EINVAL);
	}

	VOPXID_MAP_CR(vp, cr);

	err = fop_getsecattr_dispatch(vp, vsap, flag, cr, ct, true);

	VOPSTATS_UPDATE(vp, getsecattr);
	return (err);
}

int
fop_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr,
    caller_context_t *ct)
{
	int err;

	VOPXID_MAP_CR(vp, cr);

	err = fop_shrlock_dispatch(vp, cmd, shr, flag, cr, ct, true);

	VOPSTATS_UPDATE(vp, shrlock);
	return (err);
}

int
fop_vnevent(vnode_t *vp, vnevent_t vnevent, vnode_t *dvp, char *fnm,
    caller_context_t *ct)
{
	int err;

	err = fop_vnevent_dispatch(vp, vnevent, dvp, fnm, ct, true);

	VOPSTATS_UPDATE(vp, vnevent);
	return (err);
}
int
fop_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *uiop, cred_t *cr,
    caller_context_t *ct)
{
	int err;

	if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
		return (ENOTSUP);

	err = fop_reqzcbuf_dispatch(vp, ioflag, uiop, cr, ct, true);

	VOPSTATS_UPDATE(vp, reqzcbuf);
	return (err);
}

int
fop_retzcbuf(vnode_t *vp, xuio_t *uiop, cred_t *cr, caller_context_t *ct)
{
	int err;

	if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
		return (ENOTSUP);

	err = fop_retzcbuf_dispatch(vp, uiop, cr, ct, true);

	VOPSTATS_UPDATE(vp, retzcbuf);
	return (err);
}
/*
 * Default destructor
 * Needed because NULL destructor means that the key is unused
 */
/* ARGSUSED */
void
vsd_defaultdestructor(void *value)
{
}

/*
 * Create a key (index into per vnode array)
 * Locks out vsd_create, vsd_destroy, and vsd_free
 * May allocate memory with lock held
 */
void
vsd_create(uint_t *keyp, void (*destructor)(void *))
{
	int i;
	uint_t nkeys;

	/*
	 * if key is allocated, do nothing
	 */
	mutex_enter(&vsd_lock);
	if (*keyp) {
		mutex_exit(&vsd_lock);
		return;
	}
	/*
	 * find an unused key
	 */
	if (destructor == NULL)
		destructor = vsd_defaultdestructor;

	for (i = 0; i < vsd_nkeys; ++i)
		if (vsd_destructor[i] == NULL)
			break;

	/*
	 * if no unused keys, increase the size of the destructor array
	 */
	if (i == vsd_nkeys) {
		if ((nkeys = (vsd_nkeys << 1)) == 0)
			nkeys = 1;
		vsd_destructor =
		    (void (**)(void *))vsd_realloc((void *)vsd_destructor,
		    (size_t)(vsd_nkeys * sizeof (void (*)(void *))),
		    (size_t)(nkeys * sizeof (void (*)(void *))));
		vsd_nkeys = nkeys;
	}

	/*
	 * allocate the next available unused key
	 */
	vsd_destructor[i] = destructor;
	*keyp = i + 1;

	/* create vsd_list, if it doesn't exist */
	if (vsd_list == NULL) {
		vsd_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
		list_create(vsd_list, sizeof (struct vsd_node),
		    offsetof(struct vsd_node, vs_nodes));
	}

	mutex_exit(&vsd_lock);
}
/*
 * Destroy a key
 *
 * Assumes that the caller is preventing vsd_set and vsd_get
 * Locks out vsd_create, vsd_destroy, and vsd_free
 * May free memory with lock held
 */
void
vsd_destroy(uint_t *keyp)
{
	uint_t key;
	struct vsd_node *vsd;

	/*
	 * protect the key namespace and our destructor lists
	 */
	mutex_enter(&vsd_lock);
	key = *keyp;
	*keyp = 0;

	ASSERT(key <= vsd_nkeys);

	/*
	 * if the key is valid
	 */
	if (key != 0) {
		uint_t k = key - 1;
		/*
		 * for every vnode with VSD, call key's destructor
		 */
		for (vsd = list_head(vsd_list); vsd != NULL;
		    vsd = list_next(vsd_list, vsd)) {
			/*
			 * no VSD for key in this vnode
			 */
			if (key > vsd->vs_nkeys)
				continue;
			/*
			 * call destructor for key
			 */
			if (vsd->vs_value[k] && vsd_destructor[k])
				(*vsd_destructor[k])(vsd->vs_value[k]);
			/*
			 * reset value for key
			 */
			vsd->vs_value[k] = NULL;
		}
		/*
		 * actually free the key (NULL destructor == unused)
		 */
		vsd_destructor[k] = NULL;
	}

	mutex_exit(&vsd_lock);
}

/*
 * Quickly return the per vnode value that was stored with the specified key
 * Assumes the caller is protecting key from vsd_create and vsd_destroy
 * Assumes the caller is holding v_vsd_lock to protect the vsd.
 */
void *
vsd_get(vnode_t *vp, uint_t key)
{
	struct vsd_node *vsd;

	ASSERT(vp != NULL);
	ASSERT(mutex_owned(&vp->v_vsd_lock));

	vsd = vp->v_vsd;

	if (key && vsd != NULL && key <= vsd->vs_nkeys)
		return (vsd->vs_value[key - 1]);
	return (NULL);
}
/*
 * Set a per vnode value indexed with the specified key
 * Assumes the caller is holding v_vsd_lock to protect the vsd.
 */
int
vsd_set(vnode_t *vp, uint_t key, void *value)
{
	struct vsd_node *vsd;

	ASSERT(vp != NULL);
	ASSERT(mutex_owned(&vp->v_vsd_lock));

	if (key == 0)
		return (EINVAL);

	vsd = vp->v_vsd;
	if (vsd == NULL)
		vsd = vp->v_vsd = kmem_zalloc(sizeof (*vsd), KM_SLEEP);

	/*
	 * If the vsd was just allocated, vs_nkeys will be 0, so the following
	 * code won't happen and we will continue down and allocate space for
	 * the vs_value array.
	 * If the caller is replacing one value with another, then it is up
	 * to the caller to free/rele/destroy the previous value (if needed).
	 */
	if (key <= vsd->vs_nkeys) {
		vsd->vs_value[key - 1] = value;
		return (0);
	}

	ASSERT(key <= vsd_nkeys);

	if (vsd->vs_nkeys == 0) {
		mutex_enter(&vsd_lock);	/* lock out vsd_destroy() */
		/*
		 * Link onto list of all VSD nodes.
		 */
		list_insert_head(vsd_list, vsd);
		mutex_exit(&vsd_lock);
	}

	/*
	 * Allocate vnode local storage and set the value for key
	 */
	vsd->vs_value = vsd_realloc(vsd->vs_value,
	    vsd->vs_nkeys * sizeof (void *),
	    key * sizeof (void *));
	vsd->vs_nkeys = key;
	vsd->vs_value[key - 1] = value;

	return (0);
}
/*
 * Release all the vnode related storage
 *
 * Called from vn_free() to run the destructor function for each vsd
 * Locks out vsd_create and vsd_destroy
 * Assumes that the destructor *DOES NOT* use vsd
 */
void
vsd_free(vnode_t *vp)
{
	int i;
	struct vsd_node *vsd = vp->v_vsd;

	if (vsd == NULL)
		return;

	if (vsd->vs_nkeys == 0) {
		kmem_free(vsd, sizeof (*vsd));
		vp->v_vsd = NULL;
		return;
	}

	/*
	 * lock out vsd_create and vsd_destroy, call
	 * the destructor, and mark the value as destroyed.
	 */
	mutex_enter(&vsd_lock);

	for (i = 0; i < vsd->vs_nkeys; i++) {
		if (vsd->vs_value[i] && vsd_destructor[i])
			(*vsd_destructor[i])(vsd->vs_value[i]);
		vsd->vs_value[i] = NULL;
	}

	/*
	 * remove from linked list of VSD nodes
	 */
	list_remove(vsd_list, vsd);

	mutex_exit(&vsd_lock);

	/*
	 * free up the VSD
	 */
	kmem_free(vsd->vs_value, vsd->vs_nkeys * sizeof (void *));
	kmem_free(vsd, sizeof (struct vsd_node));
	vp->v_vsd = NULL;
}

/*
 * realloc
 */
static void *
vsd_realloc(void *old, size_t osize, size_t nsize)
{
	void *new;

	new = kmem_zalloc(nsize, KM_SLEEP);
	if (old) {
		bcopy(old, new, osize);
		kmem_free(old, osize);
	}
	return (new);
}
/*
 * Setup the extensible system attribute for creating a reparse point.
 * The symlink data 'target' is validated for proper format of a reparse
 * string and a check also made to make sure the symlink data does not
 * point to an existing file.
 *
 * return 0 if ok else -1.
 */
static int
fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr)
{
	xoptattr_t *xoap;

	if ((!target) || (!vap) || (!xvattr))
		return (-1);

	/* validate reparse string */
	if (reparse_validate((const char *)target))
		return (-1);

	xva_init(xvattr);
	xvattr->xva_vattr = *vap;
	xvattr->xva_vattr.va_mask |= VATTR_XVATTR;
	xoap = xva_getxoptattr(xvattr);
	ASSERT(xoap);
	XVA_SET_REQ(xvattr, XAT_REPARSE);
	xoap->xoa_reparse = 1;

	return (0);
}

/*
 * Function to check whether a symlink is a reparse point.
 * Return B_TRUE if it is a reparse point, else return B_FALSE
 */
boolean_t
vn_is_reparse(vnode_t *vp, cred_t *cr, caller_context_t *ct)
{
	xvattr_t xvattr;
	xoptattr_t *xoap;

	if ((vp->v_type != VLNK) ||
	    !(vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR)))
		return (B_FALSE);

	xva_init(&xvattr);
	xoap = xva_getxoptattr(&xvattr);
	ASSERT(xoap);
	XVA_SET_REQ(&xvattr, XAT_REPARSE);

	if (fop_getattr(vp, &xvattr.xva_vattr, 0, cr, ct))
		return (B_FALSE);

	if ((!(xvattr.xva_vattr.va_mask & VATTR_XVATTR)) ||
	    (!(XVA_ISSET_RTN(&xvattr, XAT_REPARSE))))
		return (B_FALSE);

	return (xoap->xoa_reparse ? B_TRUE : B_FALSE);
}