dmake: do not set MAKEFLAGS=k
[unleashed/tickless.git] / kernel / fs / vnode.c
blobaa9647c2ba977a0e81e77b72b69896ba1139cd5c
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
23 * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2017, Joyent, Inc.
25 * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
26 * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
29 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
30 /* All Rights Reserved */
33 * University Copyright- Copyright (c) 1982, 1986, 1988
34 * The Regents of the University of California
35 * All Rights Reserved
37 * University Acknowledgment- Portions of this document are derived from
38 * software developed by the University of California, Berkeley, and its
39 * contributors.
42 #include <sys/types.h>
43 #include <sys/param.h>
44 #include <sys/t_lock.h>
45 #include <sys/errno.h>
46 #include <sys/cred.h>
47 #include <sys/user.h>
48 #include <sys/uio.h>
49 #include <sys/file.h>
50 #include <sys/pathname.h>
51 #include <sys/vfs.h>
52 #include <sys/vnode.h>
53 #include <sys/vnode_dispatch.h>
54 #include <sys/rwstlock.h>
55 #include <sys/fem.h>
56 #include <sys/stat.h>
57 #include <sys/mode.h>
58 #include <sys/conf.h>
59 #include <sys/sysmacros.h>
60 #include <sys/cmn_err.h>
61 #include <sys/systm.h>
62 #include <sys/kmem.h>
63 #include <sys/debug.h>
64 #include <c2/audit.h>
65 #include <sys/acl.h>
66 #include <sys/nbmlock.h>
67 #include <sys/fcntl.h>
68 #include <sys/fs_subr.h>
69 #include <sys/taskq.h>
70 #include <sys/fs_reparse.h>
71 #include <sys/time.h>
72 #include <sys/sdt.h>
74 /* Determine if this vnode is a file that is read-only */
75 #define ISROFILE(vp) \
76 ((vp)->v_type != VCHR && (vp)->v_type != VBLK && \
77 (vp)->v_type != VFIFO && vn_is_readonly(vp))
79 /* Tunable via /etc/system; used only by admin/install */
80 int nfs_global_client_only;
83 * Array of vopstats_t for per-FS-type vopstats. This array has the same
84 * number of entries as, and is parallel to, the vfssw table. (Arguably, it could
85 * be part of the vfssw table.) Once it's initialized, it's accessed using
86 * the same fstype index that is used to index into the vfssw table.
88 vopstats_t **vopstats_fstype;
90 /* vopstats initialization template used for fast initialization via bcopy() */
91 static vopstats_t *vs_templatep;
93 /* Kmem cache handle for vsk_anchor_t allocations */
94 kmem_cache_t *vsk_anchor_cache;
96 /* file events cleanup routine */
97 extern void free_fopdata(vnode_t *);
100 * Root of AVL tree for the kstats associated with vopstats. Lock protects
101 * updates to vskstat_tree.
103 avl_tree_t vskstat_tree;
104 kmutex_t vskstat_tree_lock;
106 /* Global variable which enables/disables the vopstats collection */
107 int vopstats_enabled = 1;
109 /* Global used for empty/invalid v_path */
110 char *vn_vpath_empty = "";
113 * forward declarations for internal vnode specific data (vsd)
115 static void *vsd_realloc(void *, size_t, size_t);
118 * forward declarations for reparse point functions
120 static int fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr);
123 * VSD -- VNODE SPECIFIC DATA
124 * The v_data pointer is typically used by a file system to store a
125 * pointer to the file system's private node (e.g. ufs inode, nfs rnode).
126 * However, there are times when additional project private data needs
127 * to be stored separately from the data (node) pointed to by v_data.
128 * This additional data could be stored by the file system itself or
129 * by a completely different kernel entity. VSD provides a way for
130 * callers to obtain a key and store a pointer to private data associated
131 * with a vnode.
133 * Callers are responsible for protecting the vsd by holding v_vsd_lock
134 * for calls to vsd_set() and vsd_get().
138 * vsd_lock protects:
139 * vsd_nkeys - creation and deletion of vsd keys
140 * vsd_list - insertion and deletion of vsd_node in the vsd_list
141 * vsd_destructor - adding and removing destructors to the list
143 static kmutex_t vsd_lock;
144 static uint_t vsd_nkeys; /* size of destructor array */
145 /* list of vsd_node's */
146 static list_t *vsd_list = NULL;
147 /* per-key destructor funcs */
148 static void (**vsd_destructor)(void *);
/*
 * The following is the common set of actions needed to update the
 * vopstats structure from a vnode op.  Both VOPSTATS_UPDATE() and
 * VOPSTATS_UPDATE_IO() do almost the same thing, except for the
 * recording of the bytes transferred.  Since the code is similar
 * but small, it is nearly a duplicate.  Consequently any changes
 * to one may need to be reflected in the other.
 *
 * Macro arguments:
 *	vp	    - Pointer to the vnode
 *	counter	    - Partial member name; "n<counter>" is the count that
 *		      is incremented and "fsinfo_<counter>" names the
 *		      DTrace probe that is fired
 *	bytecounter - Member name of the byte total to update
 *	bytesval    - Number of bytes to add to the byte total
 *
 * Locals declared inside the macros:
 *	vfsp	    - (vp)->v_vfsp; nothing is recorded unless it is
 *		      non-NULL, has vfs_implp set, has VFS_STATS set in
 *		      vfs_flag, and the vnode is not of type VBAD
 *	vsp	    - First the per-vfs vopstats (vfs_vopstats), then the
 *		      per-fstype vopstats (vfs_fstypevsp) when non-NULL
 *	stataddr    - Address of the per-vfs counter handed to the probe
 *
 * Note that bytesval is expanded more than once, so it must not have
 * side effects.
 */
#define	VOPSTATS_UPDATE(vp, counter) {					\
	vfs_t *vfsp = (vp)->v_vfsp;					\
	if (vfsp && vfsp->vfs_implp &&					\
	    (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) {	\
		vopstats_t *vsp = &vfsp->vfs_vopstats;			\
		uint64_t *stataddr = &(vsp->n##counter.value.ui64);	\
		extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
		    size_t, uint64_t *);				\
		__dtrace_probe___fsinfo_##counter(vp, 0, stataddr);	\
		(*stataddr)++;						\
		if ((vsp = vfsp->vfs_fstypevsp) != NULL) {		\
			vsp->n##counter.value.ui64++;			\
		}							\
	}								\
}

#define	VOPSTATS_UPDATE_IO(vp, counter, bytecounter, bytesval) {	\
	vfs_t *vfsp = (vp)->v_vfsp;					\
	if (vfsp && vfsp->vfs_implp &&					\
	    (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) {	\
		vopstats_t *vsp = &vfsp->vfs_vopstats;			\
		uint64_t *stataddr = &(vsp->n##counter.value.ui64);	\
		extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
		    size_t, uint64_t *);				\
		__dtrace_probe___fsinfo_##counter(vp, (bytesval), stataddr); \
		(*stataddr)++;						\
		vsp->bytecounter.value.ui64 += (bytesval);		\
		if ((vsp = vfsp->vfs_fstypevsp) != NULL) {		\
			vsp->n##counter.value.ui64++;			\
			vsp->bytecounter.value.ui64 += (bytesval);	\
		}							\
	}								\
}
/*
 * Replace cr with crgetmapped(cr) when the vnode's file system does not
 * advertise XID support (VFS_XID clear in vfs_flag).  A vnode with no
 * vfs is left alone; perhaps that case should be mapped as well?
 */
#define	VOPXID_MAP_CR(vp, cr)	{					\
	vfs_t *vfsp = (vp)->v_vfsp;					\
	if (vfsp != NULL && !(vfsp->vfs_flag & VFS_XID))		\
		cr = crgetmapped(cr);					\
	}
211 * Convert stat(2) formats to vnode types and vice versa. (Knows about
212 * numerical order of S_IFMT and vnode types.)
214 enum vtype iftovt_tab[] = {
215 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
216 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
219 ushort_t vttoif_tab[] = {
220 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFIFO,
221 S_IFDOOR, 0, S_IFSOCK, S_IFPORT, 0
225 * The system vnode cache.
228 kmem_cache_t *vn_cache;
231 /* Extensible attribute (xva) routines. */
234 * Zero out the structure, set the size of the requested/returned bitmaps,
235 * set AT_XVATTR in the embedded vattr_t's va_mask, and set up the pointer
236 * to the returned attributes array.
238 void
239 xva_init(xvattr_t *xvap)
241 bzero(xvap, sizeof (xvattr_t));
242 xvap->xva_mapsize = XVA_MAPSIZE;
243 xvap->xva_magic = XVA_MAGIC;
244 xvap->xva_vattr.va_mask = AT_XVATTR;
245 xvap->xva_rtnattrmapp = &(xvap->xva_rtnattrmap)[0];
249 * If AT_XVATTR is set, returns a pointer to the embedded xoptattr_t
250 * structure. Otherwise, returns NULL.
252 xoptattr_t *
253 xva_getxoptattr(xvattr_t *xvap)
255 xoptattr_t *xoap = NULL;
256 if (xvap->xva_vattr.va_mask & AT_XVATTR)
257 xoap = &xvap->xva_xoptattrs;
258 return (xoap);
262 * Used by the AVL routines to compare two vsk_anchor_t structures in the tree.
263 * We use the f_fsid reported by VFS_STATVFS() since we use that for the
264 * kstat name.
266 static int
267 vska_compar(const void *n1, const void *n2)
269 int ret;
270 ulong_t p1 = ((vsk_anchor_t *)n1)->vsk_fsid;
271 ulong_t p2 = ((vsk_anchor_t *)n2)->vsk_fsid;
273 if (p1 < p2) {
274 ret = -1;
275 } else if (p1 > p2) {
276 ret = 1;
277 } else {
278 ret = 0;
281 return (ret);
285 * Used to create a single template which will be bcopy()ed to a newly
286 * allocated vsanchor_combo_t structure in new_vsanchor(), below.
288 static vopstats_t *
289 create_vopstats_template()
291 vopstats_t *vsp;
293 vsp = kmem_alloc(sizeof (vopstats_t), KM_SLEEP);
294 bzero(vsp, sizeof (*vsp)); /* Start fresh */
296 /* fop_open */
297 kstat_named_init(&vsp->nopen, "nopen", KSTAT_DATA_UINT64);
298 /* fop_close */
299 kstat_named_init(&vsp->nclose, "nclose", KSTAT_DATA_UINT64);
300 /* fop_read I/O */
301 kstat_named_init(&vsp->nread, "nread", KSTAT_DATA_UINT64);
302 kstat_named_init(&vsp->read_bytes, "read_bytes", KSTAT_DATA_UINT64);
303 /* fop_write I/O */
304 kstat_named_init(&vsp->nwrite, "nwrite", KSTAT_DATA_UINT64);
305 kstat_named_init(&vsp->write_bytes, "write_bytes", KSTAT_DATA_UINT64);
306 /* fop_ioctl */
307 kstat_named_init(&vsp->nioctl, "nioctl", KSTAT_DATA_UINT64);
308 /* fop_setfl */
309 kstat_named_init(&vsp->nsetfl, "nsetfl", KSTAT_DATA_UINT64);
310 /* fop_getattr */
311 kstat_named_init(&vsp->ngetattr, "ngetattr", KSTAT_DATA_UINT64);
312 /* fop_setattr */
313 kstat_named_init(&vsp->nsetattr, "nsetattr", KSTAT_DATA_UINT64);
314 /* fop_access */
315 kstat_named_init(&vsp->naccess, "naccess", KSTAT_DATA_UINT64);
316 /* fop_lookup */
317 kstat_named_init(&vsp->nlookup, "nlookup", KSTAT_DATA_UINT64);
318 /* fop_create */
319 kstat_named_init(&vsp->ncreate, "ncreate", KSTAT_DATA_UINT64);
320 /* fop_remove */
321 kstat_named_init(&vsp->nremove, "nremove", KSTAT_DATA_UINT64);
322 /* fop_link */
323 kstat_named_init(&vsp->nlink, "nlink", KSTAT_DATA_UINT64);
324 /* fop_rename */
325 kstat_named_init(&vsp->nrename, "nrename", KSTAT_DATA_UINT64);
326 /* fop_mkdir */
327 kstat_named_init(&vsp->nmkdir, "nmkdir", KSTAT_DATA_UINT64);
328 /* fop_rmdir */
329 kstat_named_init(&vsp->nrmdir, "nrmdir", KSTAT_DATA_UINT64);
330 /* fop_readdir I/O */
331 kstat_named_init(&vsp->nreaddir, "nreaddir", KSTAT_DATA_UINT64);
332 kstat_named_init(&vsp->readdir_bytes, "readdir_bytes",
333 KSTAT_DATA_UINT64);
334 /* fop_symlink */
335 kstat_named_init(&vsp->nsymlink, "nsymlink", KSTAT_DATA_UINT64);
336 /* fop_readlink */
337 kstat_named_init(&vsp->nreadlink, "nreadlink", KSTAT_DATA_UINT64);
338 /* fop_fsync */
339 kstat_named_init(&vsp->nfsync, "nfsync", KSTAT_DATA_UINT64);
340 /* fop_inactive */
341 kstat_named_init(&vsp->ninactive, "ninactive", KSTAT_DATA_UINT64);
342 /* fop_fid */
343 kstat_named_init(&vsp->nfid, "nfid", KSTAT_DATA_UINT64);
344 /* fop_rwlock */
345 kstat_named_init(&vsp->nrwlock, "nrwlock", KSTAT_DATA_UINT64);
346 /* fop_rwunlock */
347 kstat_named_init(&vsp->nrwunlock, "nrwunlock", KSTAT_DATA_UINT64);
348 /* fop_seek */
349 kstat_named_init(&vsp->nseek, "nseek", KSTAT_DATA_UINT64);
350 /* fop_cmp */
351 kstat_named_init(&vsp->ncmp, "ncmp", KSTAT_DATA_UINT64);
352 /* fop_frlock */
353 kstat_named_init(&vsp->nfrlock, "nfrlock", KSTAT_DATA_UINT64);
354 /* fop_space */
355 kstat_named_init(&vsp->nspace, "nspace", KSTAT_DATA_UINT64);
356 /* fop_realvp */
357 kstat_named_init(&vsp->nrealvp, "nrealvp", KSTAT_DATA_UINT64);
358 /* fop_getpage */
359 kstat_named_init(&vsp->ngetpage, "ngetpage", KSTAT_DATA_UINT64);
360 /* fop_putpage */
361 kstat_named_init(&vsp->nputpage, "nputpage", KSTAT_DATA_UINT64);
362 /* fop_map */
363 kstat_named_init(&vsp->nmap, "nmap", KSTAT_DATA_UINT64);
364 /* fop_addmap */
365 kstat_named_init(&vsp->naddmap, "naddmap", KSTAT_DATA_UINT64);
366 /* fop_delmap */
367 kstat_named_init(&vsp->ndelmap, "ndelmap", KSTAT_DATA_UINT64);
368 /* fop_poll */
369 kstat_named_init(&vsp->npoll, "npoll", KSTAT_DATA_UINT64);
370 /* fop_dump */
371 kstat_named_init(&vsp->ndump, "ndump", KSTAT_DATA_UINT64);
372 /* fop_pathconf */
373 kstat_named_init(&vsp->npathconf, "npathconf", KSTAT_DATA_UINT64);
374 /* fop_pageio */
375 kstat_named_init(&vsp->npageio, "npageio", KSTAT_DATA_UINT64);
376 /* fop_dumpctl */
377 kstat_named_init(&vsp->ndumpctl, "ndumpctl", KSTAT_DATA_UINT64);
378 /* fop_dispose */
379 kstat_named_init(&vsp->ndispose, "ndispose", KSTAT_DATA_UINT64);
380 /* fop_setsecattr */
381 kstat_named_init(&vsp->nsetsecattr, "nsetsecattr", KSTAT_DATA_UINT64);
382 /* fop_getsecattr */
383 kstat_named_init(&vsp->ngetsecattr, "ngetsecattr", KSTAT_DATA_UINT64);
384 /* fop_shrlock */
385 kstat_named_init(&vsp->nshrlock, "nshrlock", KSTAT_DATA_UINT64);
386 /* fop_vnevent */
387 kstat_named_init(&vsp->nvnevent, "nvnevent", KSTAT_DATA_UINT64);
388 /* fop_reqzcbuf */
389 kstat_named_init(&vsp->nreqzcbuf, "nreqzcbuf", KSTAT_DATA_UINT64);
390 /* fop_retzcbuf */
391 kstat_named_init(&vsp->nretzcbuf, "nretzcbuf", KSTAT_DATA_UINT64);
393 return (vsp);
397 * Creates a kstat structure associated with a vopstats structure.
399 kstat_t *
400 new_vskstat(char *ksname, vopstats_t *vsp)
402 kstat_t *ksp;
404 if (!vopstats_enabled) {
405 return (NULL);
408 ksp = kstat_create("unix", 0, ksname, "misc", KSTAT_TYPE_NAMED,
409 sizeof (vopstats_t)/sizeof (kstat_named_t),
410 KSTAT_FLAG_VIRTUAL|KSTAT_FLAG_WRITABLE);
411 if (ksp) {
412 ksp->ks_data = vsp;
413 kstat_install(ksp);
416 return (ksp);
420 * Called from vfsinit() to initialize the support mechanisms for vopstats
422 void
423 vopstats_startup()
425 if (!vopstats_enabled)
426 return;
429 * Creates the AVL tree which holds per-vfs vopstat anchors. This
430 * is necessary since we need to check if a kstat exists before we
431 * attempt to create it. Also, initialize its lock.
433 avl_create(&vskstat_tree, vska_compar, sizeof (vsk_anchor_t),
434 offsetof(vsk_anchor_t, vsk_node));
435 mutex_init(&vskstat_tree_lock, NULL, MUTEX_DEFAULT, NULL);
437 vsk_anchor_cache = kmem_cache_create("vsk_anchor_cache",
438 sizeof (vsk_anchor_t), sizeof (uintptr_t), NULL, NULL, NULL,
439 NULL, NULL, 0);
442 * Set up the array of pointers for the vopstats-by-FS-type.
443 * The entries will be allocated/initialized as each file system
444 * goes through modload/mod_installfs.
446 vopstats_fstype = (vopstats_t **)kmem_zalloc(
447 (sizeof (vopstats_t *) * nfstype), KM_SLEEP);
449 /* Set up the global vopstats initialization template */
450 vs_templatep = create_vopstats_template();
454 * We need to have the all of the counters zeroed.
455 * The initialization of the vopstats_t includes on the order of
456 * 50 calls to kstat_named_init(). Rather that do that on every call,
457 * we do it once in a template (vs_templatep) then bcopy it over.
459 void
460 initialize_vopstats(vopstats_t *vsp)
462 if (vsp == NULL)
463 return;
465 bcopy(vs_templatep, vsp, sizeof (vopstats_t));
469 * If possible, determine which vopstats by fstype to use and
470 * return a pointer to the caller.
472 vopstats_t *
473 get_fstype_vopstats(vfs_t *vfsp, struct vfssw *vswp)
475 int fstype = 0; /* Index into vfssw[] */
476 vopstats_t *vsp = NULL;
478 if (vfsp == NULL || (vfsp->vfs_flag & VFS_STATS) == 0 ||
479 !vopstats_enabled)
480 return (NULL);
482 * Set up the fstype. We go to so much trouble because all versions
483 * of NFS use the same fstype in their vfs even though they have
484 * distinct entries in the vfssw[] table.
485 * NOTE: A special vfs (e.g., EIO_vfs) may not have an entry.
487 if (vswp) {
488 fstype = vswp - vfssw; /* Gets us the index */
489 } else {
490 fstype = vfsp->vfs_fstype;
494 * Point to the per-fstype vopstats. The only valid values are
495 * non-zero positive values less than the number of vfssw[] table
496 * entries.
498 if (fstype > 0 && fstype < nfstype) {
499 vsp = vopstats_fstype[fstype];
502 return (vsp);
506 * Generate a kstat name, create the kstat structure, and allocate a
507 * vsk_anchor_t to hold it together. Return the pointer to the vsk_anchor_t
508 * to the caller. This must only be called from a mount.
510 vsk_anchor_t *
511 get_vskstat_anchor(vfs_t *vfsp)
513 char kstatstr[KSTAT_STRLEN]; /* kstat name for vopstats */
514 statvfs64_t statvfsbuf; /* Needed to find f_fsid */
515 vsk_anchor_t *vskp = NULL; /* vfs <--> kstat anchor */
516 kstat_t *ksp; /* Ptr to new kstat */
517 avl_index_t where; /* Location in the AVL tree */
519 if (vfsp == NULL || vfsp->vfs_implp == NULL ||
520 (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
521 return (NULL);
523 /* Need to get the fsid to build a kstat name */
524 if (VFS_STATVFS(vfsp, &statvfsbuf) == 0) {
525 /* Create a name for our kstats based on fsid */
526 (void) snprintf(kstatstr, KSTAT_STRLEN, "%s%lx",
527 VOPSTATS_STR, statvfsbuf.f_fsid);
529 /* Allocate and initialize the vsk_anchor_t */
530 vskp = kmem_cache_alloc(vsk_anchor_cache, KM_SLEEP);
531 bzero(vskp, sizeof (*vskp));
532 vskp->vsk_fsid = statvfsbuf.f_fsid;
534 mutex_enter(&vskstat_tree_lock);
535 if (avl_find(&vskstat_tree, vskp, &where) == NULL) {
536 avl_insert(&vskstat_tree, vskp, where);
537 mutex_exit(&vskstat_tree_lock);
540 * Now that we've got the anchor in the AVL
541 * tree, we can create the kstat.
543 ksp = new_vskstat(kstatstr, &vfsp->vfs_vopstats);
544 if (ksp) {
545 vskp->vsk_ksp = ksp;
547 } else {
548 /* Oops, found one! Release memory and lock. */
549 mutex_exit(&vskstat_tree_lock);
550 kmem_cache_free(vsk_anchor_cache, vskp);
551 vskp = NULL;
554 return (vskp);
558 * We're in the process of tearing down the vfs and need to cleanup
559 * the data structures associated with the vopstats. Must only be called
560 * from dounmount().
562 void
563 teardown_vopstats(vfs_t *vfsp)
565 vsk_anchor_t *vskap;
566 avl_index_t where;
568 if (vfsp == NULL || vfsp->vfs_implp == NULL ||
569 (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
570 return;
572 /* This is a safe check since VFS_STATS must be set (see above) */
573 if ((vskap = vfsp->vfs_vskap) == NULL)
574 return;
576 /* Whack the pointer right away */
577 vfsp->vfs_vskap = NULL;
579 /* Lock the tree, remove the node, and delete the kstat */
580 mutex_enter(&vskstat_tree_lock);
581 if (avl_find(&vskstat_tree, vskap, &where)) {
582 avl_remove(&vskstat_tree, vskap);
585 if (vskap->vsk_ksp) {
586 kstat_delete(vskap->vsk_ksp);
588 mutex_exit(&vskstat_tree_lock);
590 kmem_cache_free(vsk_anchor_cache, vskap);
594 * Read or write a vnode. Called from kernel code.
597 vn_rdwr(
598 enum uio_rw rw,
599 struct vnode *vp,
600 caddr_t base,
601 ssize_t len,
602 offset_t offset,
603 enum uio_seg seg,
604 int ioflag,
605 rlim64_t ulimit, /* meaningful only if rw is UIO_WRITE */
606 cred_t *cr,
607 ssize_t *residp)
609 struct uio uio;
610 struct iovec iov;
611 int error;
612 int in_crit = 0;
614 if (rw == UIO_WRITE && ISROFILE(vp))
615 return (EROFS);
617 if (len < 0)
618 return (EIO);
620 VOPXID_MAP_CR(vp, cr);
622 iov.iov_base = base;
623 iov.iov_len = len;
624 uio.uio_iov = &iov;
625 uio.uio_iovcnt = 1;
626 uio.uio_loffset = offset;
627 uio.uio_segflg = (short)seg;
628 uio.uio_resid = len;
629 uio.uio_llimit = ulimit;
632 * We have to enter the critical region before calling fop_rwlock
633 * to avoid a deadlock with ufs.
635 if (nbl_need_check(vp)) {
636 int svmand;
638 nbl_start_crit(vp, RW_READER);
639 in_crit = 1;
640 error = nbl_svmand(vp, cr, &svmand);
641 if (error != 0)
642 goto done;
643 if (nbl_conflict(vp, rw == UIO_WRITE ? NBL_WRITE : NBL_READ,
644 uio.uio_offset, uio.uio_resid, svmand, NULL)) {
645 error = EACCES;
646 goto done;
650 (void) fop_rwlock(vp,
651 rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL);
652 if (rw == UIO_WRITE) {
653 uio.uio_fmode = FWRITE;
654 uio.uio_extflg = UIO_COPY_DEFAULT;
655 error = fop_write(vp, &uio, ioflag, cr, NULL);
656 } else {
657 uio.uio_fmode = FREAD;
658 uio.uio_extflg = UIO_COPY_CACHED;
659 error = fop_read(vp, &uio, ioflag, cr, NULL);
661 fop_rwunlock(vp,
662 rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL);
663 if (residp)
664 *residp = uio.uio_resid;
665 else if (uio.uio_resid)
666 error = EIO;
668 done:
669 if (in_crit)
670 nbl_end_crit(vp);
671 return (error);
675 * Release a vnode. Call fop_inactive on last reference or
676 * decrement reference count.
678 * To avoid race conditions, the v_count is left at 1 for
679 * the call to fop_inactive. This prevents another thread
680 * from reclaiming and releasing the vnode *before* the
681 * fop_inactive routine has a chance to destroy the vnode.
682 * We can't have more than 1 thread calling fop_inactive
683 * on a vnode.
685 void
686 vn_rele(vnode_t *vp)
688 VERIFY(vp->v_count > 0);
689 mutex_enter(&vp->v_lock);
690 if (vp->v_count == 1) {
691 mutex_exit(&vp->v_lock);
692 fop_inactive(vp, CRED(), NULL);
693 return;
695 VN_RELE_LOCKED(vp);
696 mutex_exit(&vp->v_lock);
700 * Release a vnode referenced by the DNLC. Multiple DNLC references are treated
701 * as a single reference, so v_count is not decremented until the last DNLC hold
702 * is released. This makes it possible to distinguish vnodes that are referenced
703 * only by the DNLC.
705 void
706 vn_rele_dnlc(vnode_t *vp)
708 VERIFY((vp->v_count > 0) && (vp->v_count_dnlc > 0));
709 mutex_enter(&vp->v_lock);
710 if (--vp->v_count_dnlc == 0) {
711 if (vp->v_count == 1) {
712 mutex_exit(&vp->v_lock);
713 fop_inactive(vp, CRED(), NULL);
714 return;
716 VN_RELE_LOCKED(vp);
718 mutex_exit(&vp->v_lock);
722 * Like vn_rele() except that it clears v_stream under v_lock.
723 * This is used by sockfs when it dismantles the association between
724 * the sockfs node and the vnode in the underlying file system.
725 * v_lock has to be held to prevent a thread coming through the lookupname
726 * path from accessing a stream head that is going away.
728 void
729 vn_rele_stream(vnode_t *vp)
731 VERIFY(vp->v_count > 0);
732 mutex_enter(&vp->v_lock);
733 vp->v_stream = NULL;
734 if (vp->v_count == 1) {
735 mutex_exit(&vp->v_lock);
736 fop_inactive(vp, CRED(), NULL);
737 return;
739 VN_RELE_LOCKED(vp);
740 mutex_exit(&vp->v_lock);
743 static void
744 vn_rele_inactive(vnode_t *vp)
746 fop_inactive(vp, CRED(), NULL);
750 * Like vn_rele() except if we are going to call fop_inactive() then do it
751 * asynchronously using a taskq. This can avoid deadlocks caused by re-entering
752 * the file system as a result of releasing the vnode. Note, file systems
753 * already have to handle the race where the vnode is incremented before the
754 * inactive routine is called and does its locking.
756 * Warning: Excessive use of this routine can lead to performance problems.
757 * This is because taskqs throttle back allocation if too many are created.
759 void
760 vn_rele_async(vnode_t *vp, taskq_t *taskq)
762 VERIFY(vp->v_count > 0);
763 mutex_enter(&vp->v_lock);
764 if (vp->v_count == 1) {
765 mutex_exit(&vp->v_lock);
766 VERIFY(taskq_dispatch(taskq, (task_func_t *)vn_rele_inactive,
767 vp, TQ_SLEEP) != (uintptr_t)NULL);
768 return;
770 VN_RELE_LOCKED(vp);
771 mutex_exit(&vp->v_lock);
775 vn_open(
776 char *pnamep,
777 enum uio_seg seg,
778 int filemode,
779 int createmode,
780 struct vnode **vpp,
781 enum create crwhy,
782 mode_t umask)
784 return (vn_openat(pnamep, seg, filemode, createmode, vpp, crwhy,
785 umask, NULL, -1));
790 * Open/create a vnode.
791 * This may be callable by the kernel, the only known use
792 * of user context being that the current user credentials
793 * are used for permissions. crwhy is defined iff filemode & FCREAT.
796 vn_openat(
797 char *pnamep,
798 enum uio_seg seg,
799 int filemode,
800 int createmode,
801 struct vnode **vpp,
802 enum create crwhy,
803 mode_t umask,
804 struct vnode *startvp,
805 int fd)
807 struct vnode *vp;
808 int mode;
809 int accessflags;
810 int error;
811 int in_crit = 0;
812 int open_done = 0;
813 int shrlock_done = 0;
814 struct vattr vattr;
815 enum symfollow follow;
816 int estale_retry = 0;
817 struct shrlock shr;
818 struct shr_locowner shr_own;
820 if (filemode & FSEARCH)
821 filemode |= FDIRECTORY;
823 mode = 0;
824 accessflags = 0;
825 if (filemode & FREAD)
826 mode |= VREAD;
827 if (filemode & (FWRITE|FTRUNC))
828 mode |= VWRITE;
829 if (filemode & (FSEARCH|FEXEC|FXATTRDIROPEN))
830 mode |= VEXEC;
832 /* symlink interpretation */
833 if (filemode & FNOFOLLOW)
834 follow = NO_FOLLOW;
835 else
836 follow = FOLLOW;
838 if (filemode & FAPPEND)
839 accessflags |= V_APPEND;
841 top:
842 if (filemode & FCREAT && !(filemode & FDIRECTORY)) {
843 enum vcexcl excl;
845 /* Wish to create a file. */
846 vattr.va_type = VREG;
847 vattr.va_mode = createmode;
848 vattr.va_mask = AT_TYPE|AT_MODE;
849 if (filemode & FTRUNC) {
850 vattr.va_size = 0;
851 vattr.va_mask |= AT_SIZE;
853 if (filemode & FEXCL)
854 excl = EXCL;
855 else
856 excl = NONEXCL;
858 if (error =
859 vn_createat(pnamep, seg, &vattr, excl, mode, &vp, crwhy,
860 (filemode & ~(FTRUNC|FEXCL)), umask, startvp))
861 return (error);
862 } else {
863 /* Wish to open a file. Just look it up. */
864 if (error = lookupnameat(pnamep, seg, follow,
865 NULLVPP, &vp, startvp)) {
866 if ((error == ESTALE) &&
867 fs_need_estale_retry(estale_retry++))
868 goto top;
869 return (error);
873 * Get the attributes to check whether file is large.
874 * We do this only if the FOFFMAX flag is not set and
875 * only for regular files.
878 if (!(filemode & FOFFMAX) && (vp->v_type == VREG)) {
879 vattr.va_mask = AT_SIZE;
880 if ((error = fop_getattr(vp, &vattr, 0,
881 CRED(), NULL))) {
882 goto out;
884 if (vattr.va_size > (uoff_t)MAXOFF32_T) {
886 * Large File API - regular open fails
887 * if FOFFMAX flag is set in file mode
889 error = EOVERFLOW;
890 goto out;
894 * Can't write directories, active texts, or
895 * read-only filesystems. Can't truncate files
896 * on which mandatory locking is in effect.
898 if (filemode & (FWRITE|FTRUNC)) {
900 * Allow writable directory if VDIROPEN flag is set.
902 if (vp->v_type == VDIR && !(vp->v_flag & VDIROPEN)) {
903 error = EISDIR;
904 goto out;
906 if (ISROFILE(vp)) {
907 error = EROFS;
908 goto out;
911 * Can't truncate files on which
912 * sysv mandatory locking is in effect.
914 if (filemode & FTRUNC) {
915 vnode_t *rvp;
917 if (fop_realvp(vp, &rvp, NULL) != 0)
918 rvp = vp;
919 if (rvp->v_filocks != NULL) {
920 vattr.va_mask = AT_MODE;
921 if ((error = fop_getattr(vp,
922 &vattr, 0, CRED(), NULL)) == 0 &&
923 MANDLOCK(vp, vattr.va_mode))
924 error = EAGAIN;
927 if (error)
928 goto out;
931 * Check permissions.
933 if (error = fop_access(vp, mode, accessflags, CRED(), NULL))
934 goto out;
936 * Require FDIRECTORY to return a directory.
937 * Require FEXEC to return a regular file.
939 if ((filemode & FDIRECTORY) && vp->v_type != VDIR) {
940 error = ENOTDIR;
941 goto out;
943 if ((filemode & FEXEC) && vp->v_type != VREG) {
944 error = ENOEXEC; /* XXX: error code? */
945 goto out;
950 * Do remaining checks for FNOFOLLOW and FNOLINKS.
952 if ((filemode & FNOFOLLOW) && vp->v_type == VLNK) {
953 error = ELOOP;
954 goto out;
956 if (filemode & FNOLINKS) {
957 vattr.va_mask = AT_NLINK;
958 if ((error = fop_getattr(vp, &vattr, 0, CRED(), NULL))) {
959 goto out;
961 if (vattr.va_nlink != 1) {
962 error = EMLINK;
963 goto out;
968 * Opening a socket corresponding to the AF_UNIX pathname
969 * in the filesystem name space is not supported.
970 * However, VSOCK nodes in namefs are supported in order
971 * to make fattach work for sockets.
973 * XXX This uses fop_realvp to distinguish between
974 * an unopened namefs node (where fop_realvp returns a
975 * different VSOCK vnode) and a VSOCK created by vn_create
976 * in some file system (where fop_realvp would never return
977 * a different vnode).
979 if (vp->v_type == VSOCK) {
980 struct vnode *nvp;
982 error = fop_realvp(vp, &nvp, NULL);
983 if (error != 0 || nvp == NULL || nvp == vp ||
984 nvp->v_type != VSOCK) {
985 error = EOPNOTSUPP;
986 goto out;
990 if ((vp->v_type == VREG) && nbl_need_check(vp)) {
991 /* get share reservation */
992 shr.s_access = 0;
993 if (filemode & FWRITE)
994 shr.s_access |= F_WRACC;
995 if (filemode & FREAD)
996 shr.s_access |= F_RDACC;
997 shr.s_deny = 0;
998 shr.s_sysid = 0;
999 shr.s_pid = ttoproc(curthread)->p_pid;
1000 shr_own.sl_pid = shr.s_pid;
1001 shr_own.sl_id = fd;
1002 shr.s_own_len = sizeof (shr_own);
1003 shr.s_owner = (caddr_t)&shr_own;
1004 error = fop_shrlock(vp, F_SHARE_NBMAND, &shr, filemode, CRED(),
1005 NULL);
1006 if (error)
1007 goto out;
1008 shrlock_done = 1;
1010 /* nbmand conflict check if truncating file */
1011 if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
1012 nbl_start_crit(vp, RW_READER);
1013 in_crit = 1;
1015 vattr.va_mask = AT_SIZE;
1016 if (error = fop_getattr(vp, &vattr, 0, CRED(), NULL))
1017 goto out;
1018 if (nbl_conflict(vp, NBL_WRITE, 0, vattr.va_size, 0,
1019 NULL)) {
1020 error = EACCES;
1021 goto out;
1027 * Do opening protocol.
1029 error = fop_open(&vp, filemode, CRED(), NULL);
1030 if (error)
1031 goto out;
1032 open_done = 1;
1035 * Truncate if required.
1037 if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
1038 vattr.va_size = 0;
1039 vattr.va_mask = AT_SIZE;
1040 if ((error = fop_setattr(vp, &vattr, 0, CRED(), NULL)) != 0)
1041 goto out;
1043 out:
1044 ASSERT(vp->v_count > 0);
1046 if (in_crit) {
1047 nbl_end_crit(vp);
1048 in_crit = 0;
1050 if (error) {
1051 if (open_done) {
1052 (void) fop_close(vp, filemode, 1, (offset_t)0, CRED(),
1053 NULL);
1054 open_done = 0;
1055 shrlock_done = 0;
1057 if (shrlock_done) {
1058 (void) fop_shrlock(vp, F_UNSHARE, &shr, 0, CRED(),
1059 NULL);
1060 shrlock_done = 0;
1064 * The following clause was added to handle a problem
1065 * with NFS consistency. It is possible that a lookup
1066 * of the file to be opened succeeded, but the file
1067 * itself doesn't actually exist on the server. This
1068 * is chiefly due to the DNLC containing an entry for
1069 * the file which has been removed on the server. In
1070 * this case, we just start over. If there was some
1071 * other cause for the ESTALE error, then the lookup
1072 * of the file will fail and the error will be returned
1073 * above instead of looping around from here.
1075 VN_RELE(vp);
1076 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1077 goto top;
1078 } else
1079 *vpp = vp;
1080 return (error);
1084 * The following two accessor functions are for the NFSv4 server. Since there
1085 * is no fop_open_UP/DOWNGRADE we need a way for the NFS server to keep the
1086 * vnode open counts correct when a client "upgrades" an open or does an
1087 * open_downgrade. In NFS, an upgrade or downgrade can not only change the
1088 * open mode (add or subtract read or write), but also change the share/deny
1089 * modes. However, share reservations are not integrated with OPEN, yet, so
1090 * we need to handle each separately. These functions are cleaner than having
1091 * the NFS server manipulate the counts directly, however, nobody else should
1092 * use these functions.
1094 void
1095 vn_open_upgrade(
1096 vnode_t *vp,
1097 int filemode)
1099 ASSERT(vp->v_type == VREG);
1101 if (filemode & FREAD)
1102 atomic_inc_32(&vp->v_rdcnt);
1103 if (filemode & FWRITE)
1104 atomic_inc_32(&vp->v_wrcnt);
1108 void
1109 vn_open_downgrade(
1110 vnode_t *vp,
1111 int filemode)
1113 ASSERT(vp->v_type == VREG);
1115 if (filemode & FREAD) {
1116 ASSERT(vp->v_rdcnt > 0);
1117 atomic_dec_32(&vp->v_rdcnt);
1119 if (filemode & FWRITE) {
1120 ASSERT(vp->v_wrcnt > 0);
1121 atomic_dec_32(&vp->v_wrcnt);
1127 vn_create(
1128 char *pnamep,
1129 enum uio_seg seg,
1130 struct vattr *vap,
1131 enum vcexcl excl,
1132 int mode,
1133 struct vnode **vpp,
1134 enum create why,
1135 int flag,
1136 mode_t umask)
1138 return (vn_createat(pnamep, seg, vap, excl, mode, vpp, why, flag,
1139 umask, NULL));
1143 * Create a vnode (makenode).
1146 vn_createat(
1147 char *pnamep,
1148 enum uio_seg seg,
1149 struct vattr *vap,
1150 enum vcexcl excl,
1151 int mode,
1152 struct vnode **vpp,
1153 enum create why,
1154 int flag,
1155 mode_t umask,
1156 struct vnode *startvp)
1158 struct vnode *dvp; /* ptr to parent dir vnode */
1159 struct vnode *vp = NULL;
1160 struct pathname pn;
1161 int error;
1162 int in_crit = 0;
1163 struct vattr vattr;
1164 enum symfollow follow;
1165 int estale_retry = 0;
1166 uint32_t auditing = AU_AUDITING();
1168 ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
1170 /* symlink interpretation */
1171 if ((flag & FNOFOLLOW) || excl == EXCL)
1172 follow = NO_FOLLOW;
1173 else
1174 follow = FOLLOW;
1175 flag &= ~(FNOFOLLOW|FNOLINKS);
1177 top:
1179 * Lookup directory.
1180 * If new object is a file, call lower level to create it.
1181 * Note that it is up to the lower level to enforce exclusive
1182 * creation, if the file is already there.
1183 * This allows the lower level to do whatever
1184 * locking or protocol that is needed to prevent races.
1185 * If the new object is directory call lower level to make
1186 * the new directory, with "." and "..".
1188 if (error = pn_get(pnamep, seg, &pn))
1189 return (error);
1190 if (auditing)
1191 audit_vncreate_start();
1192 dvp = NULL;
1193 *vpp = NULL;
1195 * lookup will find the parent directory for the vnode.
1196 * When it is done the pn holds the name of the entry
1197 * in the directory.
1198 * If this is a non-exclusive create we also find the node itself.
1200 error = lookuppnat(&pn, NULL, follow, &dvp,
1201 (excl == EXCL) ? NULLVPP : vpp, startvp);
1202 if (error) {
1203 pn_free(&pn);
1204 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1205 goto top;
1206 if (why == CRMKDIR && error == EINVAL)
1207 error = EEXIST; /* SVID */
1208 return (error);
1211 if (why != CRMKNOD)
1212 vap->va_mode &= ~VSVTX;
1215 * If default ACLs are defined for the directory don't apply the
1216 * umask if umask is passed.
1219 if (umask) {
1221 vsecattr_t vsec;
1223 vsec.vsa_aclcnt = 0;
1224 vsec.vsa_aclentp = NULL;
1225 vsec.vsa_dfaclcnt = 0;
1226 vsec.vsa_dfaclentp = NULL;
1227 vsec.vsa_mask = VSA_DFACLCNT;
1228 error = fop_getsecattr(dvp, &vsec, 0, CRED(), NULL);
1230 * If error is ENOSYS then treat it as no error
1231 * Don't want to force all file systems to support
1232 * aclent_t style of ACL's.
1234 if (error == ENOSYS)
1235 error = 0;
1236 if (error) {
1237 if (*vpp != NULL)
1238 VN_RELE(*vpp);
1239 goto out;
1240 } else {
1242 * Apply the umask if no default ACLs.
1244 if (vsec.vsa_dfaclcnt == 0)
1245 vap->va_mode &= ~umask;
1248 * fop_getsecattr() may have allocated memory for
1249 * ACLs we didn't request, so double-check and
1250 * free it if necessary.
1252 if (vsec.vsa_aclcnt && vsec.vsa_aclentp != NULL)
1253 kmem_free((caddr_t)vsec.vsa_aclentp,
1254 vsec.vsa_aclcnt * sizeof (aclent_t));
1255 if (vsec.vsa_dfaclcnt && vsec.vsa_dfaclentp != NULL)
1256 kmem_free((caddr_t)vsec.vsa_dfaclentp,
1257 vsec.vsa_dfaclcnt * sizeof (aclent_t));
1262 * In general we want to generate EROFS if the file system is
1263 * readonly. However, POSIX (IEEE Std. 1003.1) section 5.3.1
1264 * documents the open system call, and it says that O_CREAT has no
1265 * effect if the file already exists. Bug 1119649 states
1266 * that open(path, O_CREAT, ...) fails when attempting to open an
1267 * existing file on a read only file system. Thus, the first part
1268 * of the following if statement has 3 checks:
1269 * if the file exists &&
1270 * it is being open with write access &&
1271 * the file system is read only
1272 * then generate EROFS
1274 if ((*vpp != NULL && (mode & VWRITE) && ISROFILE(*vpp)) ||
1275 (*vpp == NULL && dvp->v_vfsp->vfs_flag & VFS_RDONLY)) {
1276 if (*vpp)
1277 VN_RELE(*vpp);
1278 error = EROFS;
1279 } else if (excl == NONEXCL && *vpp != NULL) {
1280 vnode_t *rvp;
1283 * File already exists. If a mandatory lock has been
1284 * applied, return error.
1286 vp = *vpp;
1287 if (fop_realvp(vp, &rvp, NULL) != 0)
1288 rvp = vp;
1289 if ((vap->va_mask & AT_SIZE) && nbl_need_check(vp)) {
1290 nbl_start_crit(vp, RW_READER);
1291 in_crit = 1;
1293 if (rvp->v_filocks != NULL || rvp->v_shrlocks != NULL) {
1294 vattr.va_mask = AT_MODE|AT_SIZE;
1295 if (error = fop_getattr(vp, &vattr, 0, CRED(), NULL)) {
1296 goto out;
1298 if (MANDLOCK(vp, vattr.va_mode)) {
1299 error = EAGAIN;
1300 goto out;
1303 * File cannot be truncated if non-blocking mandatory
1304 * locks are currently on the file.
1306 if ((vap->va_mask & AT_SIZE) && in_crit) {
1307 uoff_t offset;
1308 ssize_t length;
1310 offset = vap->va_size > vattr.va_size ?
1311 vattr.va_size : vap->va_size;
1312 length = vap->va_size > vattr.va_size ?
1313 vap->va_size - vattr.va_size :
1314 vattr.va_size - vap->va_size;
1315 if (nbl_conflict(vp, NBL_WRITE, offset,
1316 length, 0, NULL)) {
1317 error = EACCES;
1318 goto out;
1324 * If the file is the root of a VFS, we've crossed a
1325 * mount point and the "containing" directory that we
1326 * acquired above (dvp) is irrelevant because it's in
1327 * a different file system. We apply fop_create to the
1328 * target itself instead of to the containing directory
1329 * and supply a null path name to indicate (conventionally)
1330 * the node itself as the "component" of interest.
1332 * The call to fop_create() is necessary to ensure
1333 * that the appropriate permission checks are made,
1334 * i.e. EISDIR, EACCES, etc. We already know that vpp
1335 * exists since we are in the else condition where this
1336 * was checked.
1338 if (vp->v_flag & VROOT) {
1339 ASSERT(why != CRMKDIR);
1340 error = fop_create(vp, "", vap, excl, mode, vpp,
1341 CRED(), flag, NULL, NULL);
1343 * If the create succeeded, it will have created a
1344 * new reference on a new vnode (*vpp) in the child
1345 * file system, so we want to drop our reference on
1346 * the old (vp) upon exit.
1348 goto out;
1352 * Large File API - non-large open (FOFFMAX flag not set)
1353 * of regular file fails if the file size exceeds MAXOFF32_T.
1355 if (why != CRMKDIR &&
1356 !(flag & FOFFMAX) &&
1357 (vp->v_type == VREG)) {
1358 vattr.va_mask = AT_SIZE;
1359 if ((error = fop_getattr(vp, &vattr, 0,
1360 CRED(), NULL))) {
1361 goto out;
1363 if ((vattr.va_size > (uoff_t)MAXOFF32_T)) {
1364 error = EOVERFLOW;
1365 goto out;
1370 if (error == 0) {
1372 * Call mkdir() if specified, otherwise create().
1374 int must_be_dir = pn_fixslash(&pn); /* trailing '/'? */
1376 if (why == CRMKDIR)
1378 * N.B., if vn_createat() ever requests
1379 * case-insensitive behavior then it will need
1380 * to be passed to fop_mkdir(). fop_create()
1381 * will already get it via "flag"
1383 error = fop_mkdir(dvp, pn.pn_path, vap, vpp, CRED(),
1384 NULL, 0, NULL);
1385 else if (!must_be_dir)
1386 error = fop_create(dvp, pn.pn_path, vap,
1387 excl, mode, vpp, CRED(), flag, NULL, NULL);
1388 else
1389 error = ENOTDIR;
1392 out:
1394 if (auditing)
1395 audit_vncreate_finish(*vpp, error);
1396 if (in_crit) {
1397 nbl_end_crit(vp);
1398 in_crit = 0;
1400 if (vp != NULL) {
1401 VN_RELE(vp);
1402 vp = NULL;
1404 pn_free(&pn);
1405 VN_RELE(dvp);
1407 * The following clause was added to handle a problem
1408 * with NFS consistency. It is possible that a lookup
1409 * of the file to be created succeeded, but the file
1410 * itself doesn't actually exist on the server. This
1411 * is chiefly due to the DNLC containing an entry for
1412 * the file which has been removed on the server. In
1413 * this case, we just start over. If there was some
1414 * other cause for the ESTALE error, then the lookup
1415 * of the file will fail and the error will be returned
1416 * above instead of looping around from here.
1418 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1419 goto top;
1420 return (error);
1424 vn_link(char *from, char *to, enum uio_seg seg)
1426 return (vn_linkat(NULL, from, NO_FOLLOW, NULL, to, seg));
1430 vn_linkat(vnode_t *fstartvp, char *from, enum symfollow follow,
1431 vnode_t *tstartvp, char *to, enum uio_seg seg)
1433 struct vnode *fvp; /* from vnode ptr */
1434 struct vnode *tdvp; /* to directory vnode ptr */
1435 struct pathname pn;
1436 int error;
1437 struct vattr vattr;
1438 dev_t fsid;
1439 int estale_retry = 0;
1440 uint32_t auditing = AU_AUDITING();
1442 top:
1443 fvp = tdvp = NULL;
1444 if (error = pn_get(to, seg, &pn))
1445 return (error);
1446 if (auditing && fstartvp != NULL)
1447 audit_setfsat_path(1);
1448 if (error = lookupnameat(from, seg, follow, NULLVPP, &fvp, fstartvp))
1449 goto out;
1450 if (auditing && tstartvp != NULL)
1451 audit_setfsat_path(3);
1452 if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &tdvp, NULLVPP, tstartvp))
1453 goto out;
1455 * Make sure both source vnode and target directory vnode are
1456 * in the same vfs and that it is writeable.
1458 vattr.va_mask = AT_FSID;
1459 if (error = fop_getattr(fvp, &vattr, 0, CRED(), NULL))
1460 goto out;
1461 fsid = vattr.va_fsid;
1462 vattr.va_mask = AT_FSID;
1463 if (error = fop_getattr(tdvp, &vattr, 0, CRED(), NULL))
1464 goto out;
1465 if (fsid != vattr.va_fsid) {
1466 error = EXDEV;
1467 goto out;
1469 if (tdvp->v_vfsp->vfs_flag & VFS_RDONLY) {
1470 error = EROFS;
1471 goto out;
1474 * Do the link.
1476 (void) pn_fixslash(&pn);
1477 error = fop_link(tdvp, fvp, pn.pn_path, CRED(), NULL, 0);
1478 out:
1479 pn_free(&pn);
1480 if (fvp)
1481 VN_RELE(fvp);
1482 if (tdvp)
1483 VN_RELE(tdvp);
1484 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1485 goto top;
1486 return (error);
1490 vn_rename(char *from, char *to, enum uio_seg seg)
1492 return (vn_renameat(NULL, from, NULL, to, seg));
1496 vn_renameat(vnode_t *fdvp, char *fname, vnode_t *tdvp,
1497 char *tname, enum uio_seg seg)
1499 int error;
1500 struct vattr vattr;
1501 struct pathname fpn; /* from pathname */
1502 struct pathname tpn; /* to pathname */
1503 dev_t fsid;
1504 int in_crit_src, in_crit_targ;
1505 vnode_t *fromvp, *fvp;
1506 vnode_t *tovp, *targvp;
1507 int estale_retry = 0;
1508 uint32_t auditing = AU_AUDITING();
1510 top:
1511 fvp = fromvp = tovp = targvp = NULL;
1512 in_crit_src = in_crit_targ = 0;
1514 * Get to and from pathnames.
1516 if (error = pn_get(fname, seg, &fpn))
1517 return (error);
1518 if (error = pn_get(tname, seg, &tpn)) {
1519 pn_free(&fpn);
1520 return (error);
1524 * First we need to resolve the correct directories
1525 * The passed in directories may only be a starting point,
1526 * but we need the real directories the file(s) live in.
1527 * For example the fname may be something like usr/lib/sparc
1528 * and we were passed in the / directory, but we need to
1529 * use the lib directory for the rename.
1532 if (auditing && fdvp != NULL)
1533 audit_setfsat_path(1);
1535 * Lookup to and from directories.
1537 if (error = lookuppnat(&fpn, NULL, NO_FOLLOW, &fromvp, &fvp, fdvp)) {
1538 goto out;
1542 * Make sure there is an entry.
1544 if (fvp == NULL) {
1545 error = ENOENT;
1546 goto out;
1549 if (auditing && tdvp != NULL)
1550 audit_setfsat_path(3);
1551 if (error = lookuppnat(&tpn, NULL, NO_FOLLOW, &tovp, &targvp, tdvp)) {
1552 goto out;
1556 * Make sure both the from vnode directory and the to directory
1557 * are in the same vfs and the to directory is writable.
1558 * We check fsid's, not vfs pointers, so loopback fs works.
1560 if (fromvp != tovp) {
1561 vattr.va_mask = AT_FSID;
1562 if (error = fop_getattr(fromvp, &vattr, 0, CRED(), NULL))
1563 goto out;
1564 fsid = vattr.va_fsid;
1565 vattr.va_mask = AT_FSID;
1566 if (error = fop_getattr(tovp, &vattr, 0, CRED(), NULL))
1567 goto out;
1568 if (fsid != vattr.va_fsid) {
1569 error = EXDEV;
1570 goto out;
1574 if (tovp->v_vfsp->vfs_flag & VFS_RDONLY) {
1575 error = EROFS;
1576 goto out;
1580 * Make sure "from" vp is not a mount point.
1581 * Note, lookup did traverse() already, so
1582 * we'll be looking at the mounted FS root.
1583 * (but allow files like mnttab)
1585 if ((fvp->v_flag & VROOT) != 0 && fvp->v_type == VDIR) {
1586 error = EBUSY;
1587 goto out;
1590 if (targvp && (fvp != targvp)) {
1591 nbl_start_crit(targvp, RW_READER);
1592 in_crit_targ = 1;
1593 if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
1594 error = EACCES;
1595 goto out;
1599 if (nbl_need_check(fvp)) {
1600 nbl_start_crit(fvp, RW_READER);
1601 in_crit_src = 1;
1602 if (nbl_conflict(fvp, NBL_RENAME, 0, 0, 0, NULL)) {
1603 error = EACCES;
1604 goto out;
1609 * Do the rename.
1611 (void) pn_fixslash(&tpn);
1612 error = fop_rename(fromvp, fpn.pn_path, tovp, tpn.pn_path, CRED(),
1613 NULL, 0);
1615 out:
1616 pn_free(&fpn);
1617 pn_free(&tpn);
1618 if (in_crit_src)
1619 nbl_end_crit(fvp);
1620 if (in_crit_targ)
1621 nbl_end_crit(targvp);
1622 if (fromvp)
1623 VN_RELE(fromvp);
1624 if (tovp)
1625 VN_RELE(tovp);
1626 if (targvp)
1627 VN_RELE(targvp);
1628 if (fvp)
1629 VN_RELE(fvp);
1630 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1631 goto top;
1632 return (error);
1636 * Remove a file or directory.
1639 vn_remove(char *fnamep, enum uio_seg seg, enum rm dirflag)
1641 return (vn_removeat(NULL, fnamep, seg, dirflag));
1645 vn_removeat(vnode_t *startvp, char *fnamep, enum uio_seg seg, enum rm dirflag)
1647 struct vnode *vp; /* entry vnode */
1648 struct vnode *dvp; /* ptr to parent dir vnode */
1649 struct vnode *coveredvp;
1650 struct pathname pn; /* name of entry */
1651 enum vtype vtype;
1652 int error;
1653 struct vfs *vfsp;
1654 struct vfs *dvfsp; /* ptr to parent dir vfs */
1655 int in_crit = 0;
1656 int estale_retry = 0;
1658 top:
1659 if (error = pn_get(fnamep, seg, &pn))
1660 return (error);
1661 dvp = vp = NULL;
1662 if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &dvp, &vp, startvp)) {
1663 pn_free(&pn);
1664 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1665 goto top;
1666 return (error);
1670 * Make sure there is an entry.
1672 if (vp == NULL) {
1673 error = ENOENT;
1674 goto out;
1677 vfsp = vp->v_vfsp;
1678 dvfsp = dvp->v_vfsp;
1681 * If the named file is the root of a mounted filesystem, fail,
1682 * unless it's marked unlinkable. In that case, unmount the
1683 * filesystem and proceed to unlink the covered vnode. (If the
1684 * covered vnode is a directory, use rmdir instead of unlink,
1685 * to avoid file system corruption.)
1687 if (vp->v_flag & VROOT) {
1688 if ((vfsp->vfs_flag & VFS_UNLINKABLE) == 0) {
1689 error = EBUSY;
1690 goto out;
1694 * Namefs specific code starts here.
1697 if (dirflag == RMDIRECTORY) {
1699 * User called rmdir(2) on a file that has
1700 * been namefs mounted on top of. Since
1701 * namefs doesn't allow directories to
1702 * be mounted on other files we know
1703 * vp is not of type VDIR so fail to operation.
1705 error = ENOTDIR;
1706 goto out;
1710 * If VROOT is still set after grabbing vp->v_lock,
1711 * noone has finished nm_unmount so far and coveredvp
1712 * is valid.
1713 * If we manage to grab vn_vfswlock(coveredvp) before releasing
1714 * vp->v_lock, any race window is eliminated.
1717 mutex_enter(&vp->v_lock);
1718 if ((vp->v_flag & VROOT) == 0) {
1719 /* Someone beat us to the unmount */
1720 mutex_exit(&vp->v_lock);
1721 error = EBUSY;
1722 goto out;
1724 vfsp = vp->v_vfsp;
1725 coveredvp = vfsp->vfs_vnodecovered;
1726 ASSERT(coveredvp);
1728 * Note: Implementation of vn_vfswlock shows that ordering of
1729 * v_lock / vn_vfswlock is not an issue here.
1731 error = vn_vfswlock(coveredvp);
1732 mutex_exit(&vp->v_lock);
1734 if (error)
1735 goto out;
1737 VN_HOLD(coveredvp);
1738 VN_RELE(vp);
1739 error = dounmount(vfsp, 0, CRED());
1742 * Unmounted the namefs file system; now get
1743 * the object it was mounted over.
1745 vp = coveredvp;
1747 * If namefs was mounted over a directory, then
1748 * we want to use rmdir() instead of unlink().
1750 if (vp->v_type == VDIR)
1751 dirflag = RMDIRECTORY;
1753 if (error)
1754 goto out;
1758 * Make sure filesystem is writeable.
1759 * We check the parent directory's vfs in case this is an lofs vnode.
1761 if (dvfsp && dvfsp->vfs_flag & VFS_RDONLY) {
1762 error = EROFS;
1763 goto out;
1766 vtype = vp->v_type;
1769 * If there is the possibility of an nbmand share reservation, make
1770 * sure it's okay to remove the file. Keep a reference to the
1771 * vnode, so that we can exit the nbl critical region after
1772 * calling fop_remove.
1773 * If there is no possibility of an nbmand share reservation,
1774 * release the vnode reference now. Filesystems like NFS may
1775 * behave differently if there is an extra reference, so get rid of
1776 * this one. Fortunately, we can't have nbmand mounts on NFS
1777 * filesystems.
1779 if (nbl_need_check(vp)) {
1780 nbl_start_crit(vp, RW_READER);
1781 in_crit = 1;
1782 if (nbl_conflict(vp, NBL_REMOVE, 0, 0, 0, NULL)) {
1783 error = EACCES;
1784 goto out;
1786 } else {
1787 VN_RELE(vp);
1788 vp = NULL;
1791 if (dirflag == RMDIRECTORY) {
1793 * Caller is using rmdir(2), which can only be applied to
1794 * directories.
1796 if (vtype != VDIR) {
1797 error = ENOTDIR;
1798 } else {
1799 vnode_t *cwd;
1800 proc_t *pp = curproc;
1802 mutex_enter(&pp->p_lock);
1803 cwd = PTOU(pp)->u_cdir;
1804 VN_HOLD(cwd);
1805 mutex_exit(&pp->p_lock);
1806 error = fop_rmdir(dvp, pn.pn_path, cwd, CRED(),
1807 NULL, 0);
1808 VN_RELE(cwd);
1810 } else {
1812 * Unlink(2) can be applied to anything.
1814 error = fop_remove(dvp, pn.pn_path, CRED(), NULL, 0);
1817 out:
1818 pn_free(&pn);
1819 if (in_crit) {
1820 nbl_end_crit(vp);
1821 in_crit = 0;
1823 if (vp != NULL)
1824 VN_RELE(vp);
1825 if (dvp != NULL)
1826 VN_RELE(dvp);
1827 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1828 goto top;
1829 return (error);
1833 * Utility function to compare equality of vnodes.
1834 * Compare the underlying real vnodes, if there are underlying vnodes.
1835 * This is a more thorough comparison than the VN_CMP() macro provides.
1838 vn_compare(vnode_t *vp1, vnode_t *vp2)
1840 vnode_t *realvp;
1842 if (vp1 != NULL && fop_realvp(vp1, &realvp, NULL) == 0)
1843 vp1 = realvp;
1844 if (vp2 != NULL && fop_realvp(vp2, &realvp, NULL) == 0)
1845 vp2 = realvp;
1846 return (VN_CMP(vp1, vp2));
1850 * The number of locks to hash into. This value must be a power
1851 * of 2 minus 1 and should probably also be prime.
1853 #define NUM_BUCKETS 1023
1855 struct vn_vfslocks_bucket {
1856 kmutex_t vb_lock;
1857 vn_vfslocks_entry_t *vb_list;
1858 char pad[64 - sizeof (kmutex_t) - sizeof (void *)];
1862 * Total number of buckets will be NUM_BUCKETS + 1 .
1865 #pragma align 64(vn_vfslocks_buckets)
1866 static struct vn_vfslocks_bucket vn_vfslocks_buckets[NUM_BUCKETS + 1];
1868 #define VN_VFSLOCKS_SHIFT 9
1870 #define VN_VFSLOCKS_HASH(vfsvpptr) \
1871 ((((intptr_t)(vfsvpptr)) >> VN_VFSLOCKS_SHIFT) & NUM_BUCKETS)
1874 * vn_vfslocks_getlock() uses an HASH scheme to generate
1875 * rwstlock using vfs/vnode pointer passed to it.
1877 * vn_vfslocks_rele() releases a reference in the
1878 * HASH table which allows the entry allocated by
1879 * vn_vfslocks_getlock() to be freed at a later
1880 * stage when the refcount drops to zero.
1883 vn_vfslocks_entry_t *
1884 vn_vfslocks_getlock(void *vfsvpptr)
1886 struct vn_vfslocks_bucket *bp;
1887 vn_vfslocks_entry_t *vep;
1888 vn_vfslocks_entry_t *tvep;
1890 ASSERT(vfsvpptr != NULL);
1891 bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vfsvpptr)];
1893 mutex_enter(&bp->vb_lock);
1894 for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
1895 if (vep->ve_vpvfs == vfsvpptr) {
1896 vep->ve_refcnt++;
1897 mutex_exit(&bp->vb_lock);
1898 return (vep);
1901 mutex_exit(&bp->vb_lock);
1902 vep = kmem_alloc(sizeof (*vep), KM_SLEEP);
1903 rwst_init(&vep->ve_lock, NULL, RW_DEFAULT, NULL);
1904 vep->ve_vpvfs = (char *)vfsvpptr;
1905 vep->ve_refcnt = 1;
1906 mutex_enter(&bp->vb_lock);
1907 for (tvep = bp->vb_list; tvep != NULL; tvep = tvep->ve_next) {
1908 if (tvep->ve_vpvfs == vfsvpptr) {
1909 tvep->ve_refcnt++;
1910 mutex_exit(&bp->vb_lock);
1913 * There is already an entry in the hash
1914 * destroy what we just allocated.
1916 rwst_destroy(&vep->ve_lock);
1917 kmem_free(vep, sizeof (*vep));
1918 return (tvep);
1921 vep->ve_next = bp->vb_list;
1922 bp->vb_list = vep;
1923 mutex_exit(&bp->vb_lock);
1924 return (vep);
1927 void
1928 vn_vfslocks_rele(vn_vfslocks_entry_t *vepent)
1930 struct vn_vfslocks_bucket *bp;
1931 vn_vfslocks_entry_t *vep;
1932 vn_vfslocks_entry_t *pvep;
1934 ASSERT(vepent != NULL);
1935 ASSERT(vepent->ve_vpvfs != NULL);
1937 bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vepent->ve_vpvfs)];
1939 mutex_enter(&bp->vb_lock);
1940 vepent->ve_refcnt--;
1942 if ((int32_t)vepent->ve_refcnt < 0)
1943 cmn_err(CE_PANIC, "vn_vfslocks_rele: refcount negative");
1945 if (vepent->ve_refcnt == 0) {
1946 for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
1947 if (vep->ve_vpvfs == vepent->ve_vpvfs) {
1948 if (bp->vb_list == vep)
1949 bp->vb_list = vep->ve_next;
1950 else {
1951 /* LINTED */
1952 pvep->ve_next = vep->ve_next;
1954 mutex_exit(&bp->vb_lock);
1955 rwst_destroy(&vep->ve_lock);
1956 kmem_free(vep, sizeof (*vep));
1957 return;
1959 pvep = vep;
1961 cmn_err(CE_PANIC, "vn_vfslocks_rele: vp/vfs not found");
1963 mutex_exit(&bp->vb_lock);
1967 * vn_vfswlock_wait is used to implement a lock which is logically a writers
1968 * lock protecting the v_vfsmountedhere field.
1969 * vn_vfswlock_wait has been modified to be similar to vn_vfswlock,
1970 * except that it blocks to acquire the lock VVFSLOCK.
1972 * traverse() and routines re-implementing part of traverse (e.g. autofs)
1973 * need to hold this lock. mount(), vn_rename(), vn_remove() and so on
1974 * need the non-blocking version of the writers lock i.e. vn_vfswlock
1977 vn_vfswlock_wait(vnode_t *vp)
1979 int retval;
1980 vn_vfslocks_entry_t *vpvfsentry;
1981 ASSERT(vp != NULL);
1983 vpvfsentry = vn_vfslocks_getlock(vp);
1984 retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_WRITER);
1986 if (retval == EINTR) {
1987 vn_vfslocks_rele(vpvfsentry);
1988 return (EINTR);
1990 return (retval);
1994 vn_vfsrlock_wait(vnode_t *vp)
1996 int retval;
1997 vn_vfslocks_entry_t *vpvfsentry;
1998 ASSERT(vp != NULL);
2000 vpvfsentry = vn_vfslocks_getlock(vp);
2001 retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_READER);
2003 if (retval == EINTR) {
2004 vn_vfslocks_rele(vpvfsentry);
2005 return (EINTR);
2008 return (retval);
2013 * vn_vfswlock is used to implement a lock which is logically a writers lock
2014 * protecting the v_vfsmountedhere field.
2017 vn_vfswlock(vnode_t *vp)
2019 vn_vfslocks_entry_t *vpvfsentry;
2022 * If vp is NULL then somebody is trying to lock the covered vnode
2023 * of /. (vfs_vnodecovered is NULL for /). This situation will
2024 * only happen when unmounting /. Since that operation will fail
2025 * anyway, return EBUSY here instead of in VFS_UNMOUNT.
2027 if (vp == NULL)
2028 return (EBUSY);
2030 vpvfsentry = vn_vfslocks_getlock(vp);
2032 if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER))
2033 return (0);
2035 vn_vfslocks_rele(vpvfsentry);
2036 return (EBUSY);
2040 vn_vfsrlock(vnode_t *vp)
2042 vn_vfslocks_entry_t *vpvfsentry;
2045 * If vp is NULL then somebody is trying to lock the covered vnode
2046 * of /. (vfs_vnodecovered is NULL for /). This situation will
2047 * only happen when unmounting /. Since that operation will fail
2048 * anyway, return EBUSY here instead of in VFS_UNMOUNT.
2050 if (vp == NULL)
2051 return (EBUSY);
2053 vpvfsentry = vn_vfslocks_getlock(vp);
2055 if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER))
2056 return (0);
2058 vn_vfslocks_rele(vpvfsentry);
2059 return (EBUSY);
2062 void
2063 vn_vfsunlock(vnode_t *vp)
2065 vn_vfslocks_entry_t *vpvfsentry;
2068 * ve_refcnt needs to be decremented twice.
2069 * 1. To release refernce after a call to vn_vfslocks_getlock()
2070 * 2. To release the reference from the locking routines like
2071 * vn_vfsrlock/vn_vfswlock etc,.
2073 vpvfsentry = vn_vfslocks_getlock(vp);
2074 vn_vfslocks_rele(vpvfsentry);
2076 rwst_exit(&vpvfsentry->ve_lock);
2077 vn_vfslocks_rele(vpvfsentry);
2081 vn_vfswlock_held(vnode_t *vp)
2083 int held;
2084 vn_vfslocks_entry_t *vpvfsentry;
2086 ASSERT(vp != NULL);
2088 vpvfsentry = vn_vfslocks_getlock(vp);
2089 held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER);
2091 vn_vfslocks_rele(vpvfsentry);
2092 return (held);
2097 * Vnode cache.
2100 /* ARGSUSED */
2101 static int
2102 vn_cache_constructor(void *buf, void *cdrarg, int kmflags)
2104 struct vnode *vp;
2106 vp = buf;
2108 mutex_init(&vp->v_lock, NULL, MUTEX_DEFAULT, NULL);
2109 mutex_init(&vp->v_vsd_lock, NULL, MUTEX_DEFAULT, NULL);
2110 cv_init(&vp->v_cv, NULL, CV_DEFAULT, NULL);
2111 rw_init(&vp->v_nbllock, NULL, RW_DEFAULT, NULL);
2112 vp->v_femhead = NULL; /* Must be done before vn_reinit() */
2113 vp->v_path = vn_vpath_empty;
2114 vp->v_path_stamp = 0;
2115 vp->v_mpssdata = NULL;
2116 vp->v_vsd = NULL;
2117 vp->v_fopdata = NULL;
2119 vmobject_init(&vp->v_object, vp);
2121 return (0);
2124 /* ARGSUSED */
2125 static void
2126 vn_cache_destructor(void *buf, void *cdrarg)
2128 struct vnode *vp;
2130 vp = buf;
2132 vmobject_fini(&vp->v_object);
2134 rw_destroy(&vp->v_nbllock);
2135 cv_destroy(&vp->v_cv);
2136 mutex_destroy(&vp->v_vsd_lock);
2137 mutex_destroy(&vp->v_lock);
2140 void
2141 vn_create_cache(void)
2143 /* LINTED */
2144 ASSERT((1 << VNODE_ALIGN_LOG2) ==
2145 P2ROUNDUP(sizeof (struct vnode), VNODE_ALIGN));
2146 vn_cache = kmem_cache_create("vn_cache", sizeof (struct vnode),
2147 VNODE_ALIGN, vn_cache_constructor, vn_cache_destructor, NULL, NULL,
2148 NULL, 0);
2151 void
2152 vn_destroy_cache(void)
2154 kmem_cache_destroy(vn_cache);
2158 * Used by file systems when fs-specific nodes (e.g., ufs inodes) are
2159 * cached by the file system and vnodes remain associated.
2161 void
2162 vn_recycle(vnode_t *vp)
2164 ASSERT(!vn_has_cached_data(vp));
2165 VERIFY(vp->v_path != NULL);
2168 * XXX - This really belongs in vn_reinit(), but we have some issues
2169 * with the counts. Best to have it here for clean initialization.
2171 vp->v_rdcnt = 0;
2172 vp->v_wrcnt = 0;
2173 vp->v_mmap_read = 0;
2174 vp->v_mmap_write = 0;
2177 * If FEM was in use, make sure everything gets cleaned up
2178 * NOTE: vp->v_femhead is initialized to NULL in the vnode
2179 * constructor.
2181 if (vp->v_femhead) {
2182 /* XXX - There should be a free_femhead() that does all this */
2183 ASSERT(vp->v_femhead->femh_list == NULL);
2184 mutex_destroy(&vp->v_femhead->femh_lock);
2185 kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
2186 vp->v_femhead = NULL;
2188 if (vp->v_path != vn_vpath_empty) {
2189 kmem_free(vp->v_path, strlen(vp->v_path) + 1);
2190 vp->v_path = vn_vpath_empty;
2192 vp->v_path_stamp = 0;
2194 if (vp->v_fopdata != NULL) {
2195 free_fopdata(vp);
2197 vp->v_mpssdata = NULL;
2198 vsd_free(vp);
2202 * Used to reset the vnode fields including those that are directly accessible
2203 * as well as those which require an accessor function.
2205 * Does not initialize:
2206 * synchronization objects: v_lock, v_vsd_lock, v_nbllock, v_cv
2207 * v_data (since FS-nodes and vnodes point to each other and should
2208 * be updated simultaneously)
2209 * v_op (in case someone needs to make a VOP call on this object)
2211 void
2212 vn_reinit(vnode_t *vp)
2214 vp->v_count = 1;
2215 vp->v_count_dnlc = 0;
2216 vp->v_vfsp = NULL;
2217 vp->v_stream = NULL;
2218 vp->v_vfsmountedhere = NULL;
2219 vp->v_flag = 0;
2220 vp->v_type = VNON;
2221 vp->v_rdev = NODEV;
2223 vp->v_filocks = NULL;
2224 vp->v_shrlocks = NULL;
2225 VERIFY(!vn_has_cached_data(vp));
2227 vp->v_locality = NULL;
2228 vp->v_xattrdir = NULL;
2231 * In a few specific instances, vn_reinit() is used to initialize
2232 * locally defined vnode_t instances. Lacking the construction offered
2233 * by vn_alloc(), these vnodes require v_path initialization.
2235 if (vp->v_path == NULL) {
2236 vp->v_path = vn_vpath_empty;
2239 /* Handles v_femhead, v_path, and the r/w/map counts */
2240 vn_recycle(vp);
2243 vnode_t *
2244 vn_alloc(int kmflag)
2246 vnode_t *vp;
2248 vp = kmem_cache_alloc(vn_cache, kmflag);
2250 if (vp != NULL) {
2251 vp->v_femhead = NULL; /* Must be done before vn_reinit() */
2252 vp->v_fopdata = NULL;
2253 vn_reinit(vp);
2256 return (vp);
2259 void
2260 vn_free(vnode_t *vp)
2262 ASSERT(vp->v_shrlocks == NULL);
2263 ASSERT(vp->v_filocks == NULL);
2266 * Some file systems call vn_free() with v_count of zero,
2267 * some with v_count of 1. In any case, the value should
2268 * never be anything else.
2270 ASSERT((vp->v_count == 0) || (vp->v_count == 1));
2271 ASSERT(vp->v_count_dnlc == 0);
2272 VERIFY(vp->v_path != NULL);
2273 if (vp->v_path != vn_vpath_empty) {
2274 kmem_free(vp->v_path, strlen(vp->v_path) + 1);
2275 vp->v_path = vn_vpath_empty;
2278 /* If FEM was in use, make sure everything gets cleaned up */
2279 if (vp->v_femhead) {
2280 /* XXX - There should be a free_femhead() that does all this */
2281 ASSERT(vp->v_femhead->femh_list == NULL);
2282 mutex_destroy(&vp->v_femhead->femh_lock);
2283 kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
2284 vp->v_femhead = NULL;
2287 if (vp->v_fopdata != NULL) {
2288 free_fopdata(vp);
2290 vp->v_mpssdata = NULL;
2291 vsd_free(vp);
2292 kmem_cache_free(vn_cache, vp);
2296 * vnode status changes, should define better states than 1, 0.
2298 void
2299 vn_reclaim(vnode_t *vp)
2301 vfs_t *vfsp = vp->v_vfsp;
2303 if (vfsp == NULL ||
2304 vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2305 return;
2307 (void) VFS_VNSTATE(vfsp, vp, VNTRANS_RECLAIMED);
2310 void
2311 vn_idle(vnode_t *vp)
2313 vfs_t *vfsp = vp->v_vfsp;
2315 if (vfsp == NULL ||
2316 vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2317 return;
2319 (void) VFS_VNSTATE(vfsp, vp, VNTRANS_IDLED);
2321 void
2322 vn_exists(vnode_t *vp)
2324 vfs_t *vfsp = vp->v_vfsp;
2326 if (vfsp == NULL ||
2327 vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2328 return;
2330 (void) VFS_VNSTATE(vfsp, vp, VNTRANS_EXISTS);
2333 void
2334 vn_invalid(vnode_t *vp)
2336 vfs_t *vfsp = vp->v_vfsp;
2338 if (vfsp == NULL ||
2339 vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2340 return;
2342 (void) VFS_VNSTATE(vfsp, vp, VNTRANS_DESTROYED);
2345 /* Vnode event notification */
2348 vnevent_support(vnode_t *vp, caller_context_t *ct)
2350 if (vp == NULL)
2351 return (EINVAL);
2353 return (fop_vnevent(vp, VE_SUPPORT, NULL, NULL, ct));
2356 void
2357 vnevent_rename_src(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2359 if (vp == NULL || vp->v_femhead == NULL) {
2360 return;
2362 (void) fop_vnevent(vp, VE_RENAME_SRC, dvp, name, ct);
2365 void
2366 vnevent_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
2367 caller_context_t *ct)
2369 if (vp == NULL || vp->v_femhead == NULL) {
2370 return;
2372 (void) fop_vnevent(vp, VE_RENAME_DEST, dvp, name, ct);
2375 void
2376 vnevent_rename_dest_dir(vnode_t *vp, caller_context_t *ct)
2378 if (vp == NULL || vp->v_femhead == NULL) {
2379 return;
2381 (void) fop_vnevent(vp, VE_RENAME_DEST_DIR, NULL, NULL, ct);
2384 void
2385 vnevent_remove(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2387 if (vp == NULL || vp->v_femhead == NULL) {
2388 return;
2390 (void) fop_vnevent(vp, VE_REMOVE, dvp, name, ct);
2393 void
2394 vnevent_rmdir(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2396 if (vp == NULL || vp->v_femhead == NULL) {
2397 return;
2399 (void) fop_vnevent(vp, VE_RMDIR, dvp, name, ct);
2402 void
2403 vnevent_pre_rename_src(vnode_t *vp, vnode_t *dvp, char *name,
2404 caller_context_t *ct)
2406 if (vp == NULL || vp->v_femhead == NULL) {
2407 return;
2409 (void) fop_vnevent(vp, VE_PRE_RENAME_SRC, dvp, name, ct);
2412 void
2413 vnevent_pre_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
2414 caller_context_t *ct)
2416 if (vp == NULL || vp->v_femhead == NULL) {
2417 return;
2419 (void) fop_vnevent(vp, VE_PRE_RENAME_DEST, dvp, name, ct);
2422 void
2423 vnevent_pre_rename_dest_dir(vnode_t *vp, vnode_t *nvp, char *name,
2424 caller_context_t *ct)
2426 if (vp == NULL || vp->v_femhead == NULL) {
2427 return;
2429 (void) fop_vnevent(vp, VE_PRE_RENAME_DEST_DIR, nvp, name, ct);
2432 void
2433 vnevent_create(vnode_t *vp, caller_context_t *ct)
2435 if (vp == NULL || vp->v_femhead == NULL) {
2436 return;
2438 (void) fop_vnevent(vp, VE_CREATE, NULL, NULL, ct);
2441 void
2442 vnevent_link(vnode_t *vp, caller_context_t *ct)
2444 if (vp == NULL || vp->v_femhead == NULL) {
2445 return;
2447 (void) fop_vnevent(vp, VE_LINK, NULL, NULL, ct);
2450 void
2451 vnevent_mountedover(vnode_t *vp, caller_context_t *ct)
2453 if (vp == NULL || vp->v_femhead == NULL) {
2454 return;
2456 (void) fop_vnevent(vp, VE_MOUNTEDOVER, NULL, NULL, ct);
2459 void
2460 vnevent_truncate(vnode_t *vp, caller_context_t *ct)
2462 if (vp == NULL || vp->v_femhead == NULL) {
2463 return;
2465 (void) fop_vnevent(vp, VE_TRUNCATE, NULL, NULL, ct);
2469 * Vnode accessors.
2473 vn_is_readonly(vnode_t *vp)
2475 return (vp->v_vfsp->vfs_flag & VFS_RDONLY);
2479 vn_has_flocks(vnode_t *vp)
2481 return (vp->v_filocks != NULL);
2485 vn_has_mandatory_locks(vnode_t *vp, int mode)
2487 return ((vp->v_filocks != NULL) && (MANDLOCK(vp, mode)));
2491 vn_has_cached_data(vnode_t *vp)
2493 return (!list_is_empty(&vp->v_object.list));
2497 * Return 0 if the vnode in question shouldn't be permitted into a zone via
2498 * zone_enter(2).
2501 vn_can_change_zones(vnode_t *vp)
2503 struct vfssw *vswp;
2504 int allow = 1;
2505 vnode_t *rvp;
2507 if (nfs_global_client_only != 0)
2508 return (1);
2511 * We always want to look at the underlying vnode if there is one.
2513 if (fop_realvp(vp, &rvp, NULL) != 0)
2514 rvp = vp;
2516 * Some pseudo filesystems (including doorfs) don't actually register
2517 * their vfsops_t, so the following may return NULL; we happily let
2518 * such vnodes switch zones.
2520 vswp = vfs_getvfsswbyvfsops(vfs_getops(rvp->v_vfsp));
2521 if (vswp != NULL) {
2522 if (vswp->vsw_flag & VSW_NOTZONESAFE)
2523 allow = 0;
2524 vfs_unrefvfssw(vswp);
2526 return (allow);
2530 * Return nonzero if the vnode is a mount point, zero if not.
2533 vn_ismntpt(vnode_t *vp)
2535 return (vp->v_vfsmountedhere != NULL);
2538 /* Retrieve the vfs (if any) mounted on this vnode */
2539 vfs_t *
2540 vn_mountedvfs(vnode_t *vp)
2542 return (vp->v_vfsmountedhere);
2546 * Return nonzero if the vnode is referenced by the dnlc, zero if not.
2549 vn_in_dnlc(vnode_t *vp)
2551 return (vp->v_count_dnlc > 0);
2555 * vn_has_other_opens() checks whether a particular file is opened by more than
2556 * just the caller and whether the open is for read and/or write.
2557 * This routine is for calling after the caller has already called fop_open()
2558 * and the caller wishes to know if they are the only one with it open for
2559 * the mode(s) specified.
2561 * Vnode counts are only kept on regular files (v_type=VREG).
2564 vn_has_other_opens(
2565 vnode_t *vp,
2566 v_mode_t mode)
2569 ASSERT(vp != NULL);
2571 switch (mode) {
2572 case V_WRITE:
2573 if (vp->v_wrcnt > 1)
2574 return (V_TRUE);
2575 break;
2576 case V_RDORWR:
2577 if ((vp->v_rdcnt > 1) || (vp->v_wrcnt > 1))
2578 return (V_TRUE);
2579 break;
2580 case V_RDANDWR:
2581 if ((vp->v_rdcnt > 1) && (vp->v_wrcnt > 1))
2582 return (V_TRUE);
2583 break;
2584 case V_READ:
2585 if (vp->v_rdcnt > 1)
2586 return (V_TRUE);
2587 break;
2590 return (V_FALSE);
2594 * vn_is_opened() checks whether a particular file is opened and
2595 * whether the open is for read and/or write.
2597 * Vnode counts are only kept on regular files (v_type=VREG).
2600 vn_is_opened(
2601 vnode_t *vp,
2602 v_mode_t mode)
2605 ASSERT(vp != NULL);
2607 switch (mode) {
2608 case V_WRITE:
2609 if (vp->v_wrcnt)
2610 return (V_TRUE);
2611 break;
2612 case V_RDANDWR:
2613 if (vp->v_rdcnt && vp->v_wrcnt)
2614 return (V_TRUE);
2615 break;
2616 case V_RDORWR:
2617 if (vp->v_rdcnt || vp->v_wrcnt)
2618 return (V_TRUE);
2619 break;
2620 case V_READ:
2621 if (vp->v_rdcnt)
2622 return (V_TRUE);
2623 break;
2626 return (V_FALSE);
2630 * vn_is_mapped() checks whether a particular file is mapped and whether
2631 * the file is mapped read and/or write.
2634 vn_is_mapped(
2635 vnode_t *vp,
2636 v_mode_t mode)
2639 ASSERT(vp != NULL);
2641 #if !defined(_LP64)
2642 switch (mode) {
2644 * The atomic_add_64_nv functions force atomicity in the
2645 * case of 32 bit architectures. Otherwise the 64 bit values
2646 * require two fetches. The value of the fields may be
2647 * (potentially) changed between the first fetch and the
2648 * second
2650 case V_WRITE:
2651 if (atomic_add_64_nv((&(vp->v_mmap_write)), 0))
2652 return (V_TRUE);
2653 break;
2654 case V_RDANDWR:
2655 if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) &&
2656 (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
2657 return (V_TRUE);
2658 break;
2659 case V_RDORWR:
2660 if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) ||
2661 (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
2662 return (V_TRUE);
2663 break;
2664 case V_READ:
2665 if (atomic_add_64_nv((&(vp->v_mmap_read)), 0))
2666 return (V_TRUE);
2667 break;
2669 #else
2670 switch (mode) {
2671 case V_WRITE:
2672 if (vp->v_mmap_write)
2673 return (V_TRUE);
2674 break;
2675 case V_RDANDWR:
2676 if (vp->v_mmap_read && vp->v_mmap_write)
2677 return (V_TRUE);
2678 break;
2679 case V_RDORWR:
2680 if (vp->v_mmap_read || vp->v_mmap_write)
2681 return (V_TRUE);
2682 break;
2683 case V_READ:
2684 if (vp->v_mmap_read)
2685 return (V_TRUE);
2686 break;
2688 #endif
2690 return (V_FALSE);
2694 * Set the operations vector for a vnode.
2696 void
2697 vn_setops(struct vnode *vnode, const struct vnodeops *ops)
2699 vnode->v_op = ops;
2703 * Retrieve the operations vector for a vnode
2705 const struct vnodeops *
2706 vn_getops(struct vnode *vnode)
2708 return vnode->v_op;
/*
 * Returns non-zero (1) if the vnodeops matches that of the vnode.
 * Returns zero (0) if not.  The comparison is by pointer identity.
 */
int
vn_matchops(struct vnode *vp, const struct vnodeops *vnodeops)
{
	const struct vnodeops *current = vn_getops(vp);

	return ((current == vnodeops) ? 1 : 0);
}
2722 * fs_new_caller_id() needs to return a unique ID on a given local system.
2723 * The IDs do not need to survive across reboots. These are primarily
2724 * used so that (FEM) monitors can detect particular callers (such as
2725 * the NFS server) to a given vnode/vfs operation.
2727 u_longlong_t
2728 fs_new_caller_id()
2730 static uint64_t next_caller_id = 0LL; /* First call returns 1 */
2732 return ((u_longlong_t)atomic_inc_64_nv(&next_caller_id));
/*
 * The value stored in v_path is relative to rootdir, located in the global
 * zone.  Zones or chroot environments which reside deeper inside the VFS
 * hierarchy will have a relative view of MAXPATHLEN since they are unaware
 * of what lies below their perceived root.  In order to keep v_path usable
 * for these child environments, its allocations are allowed to exceed
 * MAXPATHLEN.
 *
 * An upper bound of max_vnode_path is placed upon v_path allocations to
 * prevent the system from going too wild at the behest of pathological
 * behavior from the operator.
 */
size_t max_vnode_path = MAXPATHLEN * 4;
2749 void
2750 vn_clearpath(vnode_t *vp, hrtime_t compare_stamp)
2752 char *buf;
2754 mutex_enter(&vp->v_lock);
2756 * If the snapshot of v_path_stamp passed in via compare_stamp does not
2757 * match the present value on the vnode, it indicates that subsequent
2758 * changes have occurred. The v_path value is not cleared in this case
2759 * since the new value may be valid.
2761 if (compare_stamp != 0 && vp->v_path_stamp != compare_stamp) {
2762 mutex_exit(&vp->v_lock);
2763 return;
2765 buf = vp->v_path;
2766 vp->v_path = vn_vpath_empty;
2767 vp->v_path_stamp = 0;
2768 mutex_exit(&vp->v_lock);
2769 if (buf != vn_vpath_empty) {
2770 kmem_free(buf, strlen(buf) + 1);
2774 static void
2775 vn_setpath_common(vnode_t *pvp, vnode_t *vp, const char *name, size_t len,
2776 boolean_t is_rename)
2778 char *buf, *oldbuf;
2779 hrtime_t pstamp;
2780 size_t baselen, buflen = 0;
2782 /* Handle the vn_setpath_str case. */
2783 if (pvp == NULL) {
2784 if (len + 1 > max_vnode_path) {
2785 DTRACE_PROBE4(vn__setpath__too__long, vnode_t *, pvp,
2786 vnode_t *, vp, char *, name, size_t, len + 1);
2787 return;
2789 buf = kmem_alloc(len + 1, KM_SLEEP);
2790 bcopy(name, buf, len);
2791 buf[len] = '\0';
2793 mutex_enter(&vp->v_lock);
2794 oldbuf = vp->v_path;
2795 vp->v_path = buf;
2796 vp->v_path_stamp = gethrtime();
2797 mutex_exit(&vp->v_lock);
2798 if (oldbuf != vn_vpath_empty) {
2799 kmem_free(oldbuf, strlen(oldbuf) + 1);
2801 return;
2804 /* Take snapshot of parent dir */
2805 mutex_enter(&pvp->v_lock);
2807 if ((pvp->v_flag & VTRAVERSE) != 0) {
2809 * When the parent vnode has VTRAVERSE set in its flags, normal
2810 * assumptions about v_path calculation no longer apply. The
2811 * primary situation where this occurs is via the VFS tricks
2812 * which procfs plays in order to allow /proc/PID/(root|cwd) to
2813 * yield meaningful results.
2815 * When this flag is set, v_path on the child must not be
2816 * updated since the calculated value is likely to be
2817 * incorrect, given the current context.
2819 mutex_exit(&pvp->v_lock);
2820 return;
2823 retrybuf:
2824 if (pvp->v_path == vn_vpath_empty) {
2826 * Without v_path from the parent directory, generating a child
2827 * path from the name is impossible.
2829 if (len > 0) {
2830 pstamp = pvp->v_path_stamp;
2831 mutex_exit(&pvp->v_lock);
2832 vn_clearpath(vp, pstamp);
2833 return;
2837 * The only feasible case here is where a NUL lookup is being
2838 * performed on rootdir prior to its v_path being populated.
2840 ASSERT(pvp->v_path_stamp == 0);
2841 baselen = 0;
2842 pstamp = 0;
2843 } else {
2844 pstamp = pvp->v_path_stamp;
2845 baselen = strlen(pvp->v_path);
2846 /* ignore a trailing slash if present */
2847 if (pvp->v_path[baselen - 1] == '/') {
2848 /* This should only the be case for rootdir */
2849 ASSERT(baselen == 1 && pvp == rootdir);
2850 baselen--;
2853 mutex_exit(&pvp->v_lock);
2855 if (buflen != 0) {
2856 /* Free the existing (mis-sized) buffer in case of retry */
2857 kmem_free(buf, buflen);
2859 /* base, '/', name and trailing NUL */
2860 buflen = baselen + len + 2;
2861 if (buflen > max_vnode_path) {
2862 DTRACE_PROBE4(vn__setpath_too__long, vnode_t *, pvp,
2863 vnode_t *, vp, char *, name, size_t, buflen);
2864 return;
2866 buf = kmem_alloc(buflen, KM_SLEEP);
2868 mutex_enter(&pvp->v_lock);
2869 if (pvp->v_path_stamp != pstamp) {
2870 size_t vlen;
2873 * Since v_path_stamp changed on the parent, it is likely that
2874 * v_path has been altered as well. If the length does not
2875 * exactly match what was previously measured, the buffer
2876 * allocation must be repeated for proper sizing.
2878 if (pvp->v_path == vn_vpath_empty) {
2879 /* Give up if parent lack v_path */
2880 mutex_exit(&pvp->v_lock);
2881 kmem_free(buf, buflen);
2882 return;
2884 vlen = strlen(pvp->v_path);
2885 if (pvp->v_path[vlen - 1] == '/') {
2886 vlen--;
2888 if (vlen != baselen) {
2889 goto retrybuf;
2892 bcopy(pvp->v_path, buf, baselen);
2893 mutex_exit(&pvp->v_lock);
2895 buf[baselen] = '/';
2896 baselen++;
2897 bcopy(name, &buf[baselen], len + 1);
2899 mutex_enter(&vp->v_lock);
2900 if (vp->v_path_stamp == 0) {
2901 /* never-visited vnode can inherit stamp from parent */
2902 ASSERT(vp->v_path == vn_vpath_empty);
2903 vp->v_path_stamp = pstamp;
2904 vp->v_path = buf;
2905 mutex_exit(&vp->v_lock);
2906 } else if (vp->v_path_stamp < pstamp || is_rename) {
2908 * Install the updated path and stamp, ensuring that the v_path
2909 * pointer is valid at all times for dtrace.
2911 oldbuf = vp->v_path;
2912 vp->v_path = buf;
2913 vp->v_path_stamp = gethrtime();
2914 mutex_exit(&vp->v_lock);
2915 kmem_free(oldbuf, strlen(oldbuf) + 1);
2916 } else {
2918 * If the timestamp matches or is greater, it means another
2919 * thread performed the update first while locks were dropped
2920 * here to make the allocation. We defer to the newer value.
2922 mutex_exit(&vp->v_lock);
2923 kmem_free(buf, buflen);
2925 ASSERT(MUTEX_NOT_HELD(&vp->v_lock));
2928 void
2929 vn_updatepath(vnode_t *pvp, vnode_t *vp, const char *name)
2931 size_t len;
2934 * If the parent is older or empty, there's nothing further to do.
2936 if (pvp->v_path == vn_vpath_empty ||
2937 pvp->v_path_stamp <= vp->v_path_stamp) {
2938 return;
2942 * Given the lack of appropriate context, meaningful updates to v_path
2943 * cannot be made for during lookups for the '.' or '..' entries.
2945 len = strlen(name);
2946 if (len == 0 || (len == 1 && name[0] == '.') ||
2947 (len == 2 && name[0] == '.' && name[1] == '.')) {
2948 return;
2951 vn_setpath_common(pvp, vp, name, len, B_FALSE);
2955 * Given a starting vnode and a path, updates the path in the target vnode in
2956 * a safe manner. If the vnode already has path information embedded, then the
2957 * cached path is left untouched.
2959 /* ARGSUSED */
2960 void
2961 vn_setpath(vnode_t *rootvp, vnode_t *pvp, vnode_t *vp, const char *name,
2962 size_t len)
2964 vn_setpath_common(pvp, vp, name, len, B_FALSE);
2968 * Sets the path to the vnode to be the given string, regardless of current
2969 * context. The string must be a complete path from rootdir. This is only used
2970 * by fsop_root() for setting the path based on the mountpoint.
2972 void
2973 vn_setpath_str(vnode_t *vp, const char *str, size_t len)
2975 vn_setpath_common(NULL, vp, str, len, B_FALSE);
2979 * Called from within filesystem's vop_rename() to handle renames once the
2980 * target vnode is available.
2982 void
2983 vn_renamepath(vnode_t *pvp, vnode_t *vp, const char *name, size_t len)
2985 vn_setpath_common(pvp, vp, name, len, B_TRUE);
2989 * Similar to vn_setpath_str(), this function sets the path of the destination
2990 * vnode to the be the same as the source vnode.
2992 void
2993 vn_copypath(struct vnode *src, struct vnode *dst)
2995 char *buf;
2996 hrtime_t stamp;
2997 size_t buflen;
2999 mutex_enter(&src->v_lock);
3000 if (src->v_path == vn_vpath_empty) {
3001 mutex_exit(&src->v_lock);
3002 return;
3004 buflen = strlen(src->v_path) + 1;
3005 mutex_exit(&src->v_lock);
3007 buf = kmem_alloc(buflen, KM_SLEEP);
3009 mutex_enter(&src->v_lock);
3010 if (src->v_path == vn_vpath_empty ||
3011 strlen(src->v_path) + 1 != buflen) {
3012 mutex_exit(&src->v_lock);
3013 kmem_free(buf, buflen);
3014 return;
3016 bcopy(src->v_path, buf, buflen);
3017 stamp = src->v_path_stamp;
3018 mutex_exit(&src->v_lock);
3020 mutex_enter(&dst->v_lock);
3021 if (dst->v_path != vn_vpath_empty) {
3022 mutex_exit(&dst->v_lock);
3023 kmem_free(buf, buflen);
3024 return;
3026 dst->v_path = buf;
3027 dst->v_path_stamp = stamp;
3028 mutex_exit(&dst->v_lock);
3033 * XXX Private interface for segvn routines that handle vnode
3034 * large page segments.
3036 * return 1 if vp's file system fop_pageio() implementation
3037 * can be safely used instead of fop_getpage() for handling
3038 * pagefaults against regular non swap files. fop_pageio()
3039 * interface is considered safe here if its implementation
3040 * is very close to fop_getpage() implementation.
3041 * e.g. It zero's out the part of the page beyond EOF. Doesn't
3042 * panic if there're file holes but instead returns an error.
3043 * Doesn't assume file won't be changed by user writes, etc.
3045 * return 0 otherwise.
3047 * For now allow segvn to only use fop_pageio() with ufs and nfs.
3050 vn_vmpss_usepageio(vnode_t *vp)
3052 vfs_t *vfsp = vp->v_vfsp;
3053 char *fsname = vfssw[vfsp->vfs_fstype].vsw_name;
3054 char *pageio_ok_fss[] = {"ufs", "nfs", NULL};
3055 char **fsok = pageio_ok_fss;
3057 if (fsname == NULL) {
3058 return (0);
3061 for (; *fsok; fsok++) {
3062 if (strcmp(*fsok, fsname) == 0) {
3063 return (1);
3066 return (0);
3069 /* VOP_XXX() macros call the corresponding fop_xxx() function */
3072 fop_open(
3073 vnode_t **vpp,
3074 int mode,
3075 cred_t *cr,
3076 caller_context_t *ct)
3078 int ret;
3079 vnode_t *vp = *vpp;
3081 VN_HOLD(vp);
3083 * Adding to the vnode counts before calling open
3084 * avoids the need for a mutex. It circumvents a race
3085 * condition where a query made on the vnode counts results in a
3086 * false negative. The inquirer goes away believing the file is
3087 * not open when there is an open on the file already under way.
3089 * The counts are meant to prevent NFS from granting a delegation
3090 * when it would be dangerous to do so.
3092 * The vnode counts are only kept on regular files
3094 if ((*vpp)->v_type == VREG) {
3095 if (mode & FREAD)
3096 atomic_inc_32(&(*vpp)->v_rdcnt);
3097 if (mode & FWRITE)
3098 atomic_inc_32(&(*vpp)->v_wrcnt);
3101 VOPXID_MAP_CR(vp, cr);
3103 ret = fop_open_dispatch(vpp, mode, cr, ct, true);
3105 if (ret) {
3107 * Use the saved vp just in case the vnode ptr got trashed
3108 * by the error.
3110 VOPSTATS_UPDATE(vp, open);
3111 if ((vp->v_type == VREG) && (mode & FREAD))
3112 atomic_dec_32(&vp->v_rdcnt);
3113 if ((vp->v_type == VREG) && (mode & FWRITE))
3114 atomic_dec_32(&vp->v_wrcnt);
3115 } else {
3117 * Some filesystems will return a different vnode,
3118 * but the same path was still used to open it.
3119 * So if we do change the vnode and need to
3120 * copy over the path, do so here, rather than special
3121 * casing each filesystem. Adjust the vnode counts to
3122 * reflect the vnode switch.
3124 VOPSTATS_UPDATE(*vpp, open);
3125 if (*vpp != vp && *vpp != NULL) {
3126 vn_copypath(vp, *vpp);
3127 if (((*vpp)->v_type == VREG) && (mode & FREAD))
3128 atomic_inc_32(&(*vpp)->v_rdcnt);
3129 if ((vp->v_type == VREG) && (mode & FREAD))
3130 atomic_dec_32(&vp->v_rdcnt);
3131 if (((*vpp)->v_type == VREG) && (mode & FWRITE))
3132 atomic_inc_32(&(*vpp)->v_wrcnt);
3133 if ((vp->v_type == VREG) && (mode & FWRITE))
3134 atomic_dec_32(&vp->v_wrcnt);
3137 VN_RELE(vp);
3138 return (ret);
3142 fop_close(
3143 vnode_t *vp,
3144 int flag,
3145 int count,
3146 offset_t offset,
3147 cred_t *cr,
3148 caller_context_t *ct)
3150 int err;
3152 VOPXID_MAP_CR(vp, cr);
3154 err = fop_close_dispatch(vp, flag, count, offset, cr, ct, true);
3156 VOPSTATS_UPDATE(vp, close);
3158 * Check passed in count to handle possible dups. Vnode counts are only
3159 * kept on regular files
3161 if ((vp->v_type == VREG) && (count == 1)) {
3162 if (flag & FREAD) {
3163 ASSERT(vp->v_rdcnt > 0);
3164 atomic_dec_32(&vp->v_rdcnt);
3166 if (flag & FWRITE) {
3167 ASSERT(vp->v_wrcnt > 0);
3168 atomic_dec_32(&vp->v_wrcnt);
3171 return (err);
3175 fop_read(
3176 vnode_t *vp,
3177 uio_t *uiop,
3178 int ioflag,
3179 cred_t *cr,
3180 caller_context_t *ct)
3182 int err;
3183 ssize_t resid_start = uiop->uio_resid;
3185 VOPXID_MAP_CR(vp, cr);
3187 err = fop_read_dispatch(vp, uiop, ioflag, cr, ct, true);
3189 VOPSTATS_UPDATE_IO(vp, read,
3190 read_bytes, (resid_start - uiop->uio_resid));
3191 return (err);
3195 fop_write(
3196 vnode_t *vp,
3197 uio_t *uiop,
3198 int ioflag,
3199 cred_t *cr,
3200 caller_context_t *ct)
3202 int err;
3203 ssize_t resid_start = uiop->uio_resid;
3205 VOPXID_MAP_CR(vp, cr);
3207 err = fop_write_dispatch(vp, uiop, ioflag, cr, ct, true);
3209 VOPSTATS_UPDATE_IO(vp, write,
3210 write_bytes, (resid_start - uiop->uio_resid));
3211 return (err);
3215 fop_ioctl(
3216 vnode_t *vp,
3217 int cmd,
3218 intptr_t arg,
3219 int flag,
3220 cred_t *cr,
3221 int *rvalp,
3222 caller_context_t *ct)
3224 int err;
3226 VOPXID_MAP_CR(vp, cr);
3228 err = fop_ioctl_dispatch(vp, cmd, arg, flag, cr, rvalp, ct, true);
3230 VOPSTATS_UPDATE(vp, ioctl);
3231 return (err);
3235 fop_setfl(
3236 vnode_t *vp,
3237 int oflags,
3238 int nflags,
3239 cred_t *cr,
3240 caller_context_t *ct)
3242 int err;
3244 VOPXID_MAP_CR(vp, cr);
3246 err = fop_setfl_dispatch(vp, oflags, nflags, cr, ct, true);
3248 VOPSTATS_UPDATE(vp, setfl);
3249 return (err);
3253 fop_getattr(
3254 vnode_t *vp,
3255 vattr_t *vap,
3256 int flags,
3257 cred_t *cr,
3258 caller_context_t *ct)
3260 int err;
3262 VOPXID_MAP_CR(vp, cr);
3265 * If this file system doesn't understand the xvattr extensions
3266 * then turn off the xvattr bit.
3268 if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
3269 vap->va_mask &= ~AT_XVATTR;
3273 * We're only allowed to skip the ACL check iff we used a 32 bit
3274 * ACE mask with fop_access() to determine permissions.
3276 if ((flags & ATTR_NOACLCHECK) &&
3277 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0)
3278 return (EINVAL);
3280 err = fop_getattr_dispatch(vp, vap, flags, cr, ct, true);
3282 VOPSTATS_UPDATE(vp, getattr);
3283 return (err);
3287 fop_setattr(
3288 vnode_t *vp,
3289 vattr_t *vap,
3290 int flags,
3291 cred_t *cr,
3292 caller_context_t *ct)
3294 int err;
3296 VOPXID_MAP_CR(vp, cr);
3299 * If this file system doesn't understand the xvattr extensions
3300 * then turn off the xvattr bit.
3302 if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
3303 vap->va_mask &= ~AT_XVATTR;
3307 * We're only allowed to skip the ACL check iff we used a 32 bit
3308 * ACE mask with fop_access() to determine permissions.
3310 if ((flags & ATTR_NOACLCHECK) &&
3311 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0)
3312 return (EINVAL);
3314 err = fop_setattr_dispatch(vp, vap, flags, cr, ct, true);
3316 VOPSTATS_UPDATE(vp, setattr);
3317 return (err);
3321 fop_access(
3322 vnode_t *vp,
3323 int mode,
3324 int flags,
3325 cred_t *cr,
3326 caller_context_t *ct)
3328 int err;
3330 if ((flags & V_ACE_MASK) &&
3331 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
3332 return (EINVAL);
3335 VOPXID_MAP_CR(vp, cr);
3337 err = fop_access_dispatch(vp, mode, flags, cr, ct, true);
3339 VOPSTATS_UPDATE(vp, access);
3340 return (err);
3344 fop_lookup(
3345 vnode_t *dvp,
3346 char *nm,
3347 vnode_t **vpp,
3348 pathname_t *pnp,
3349 int flags,
3350 vnode_t *rdir,
3351 cred_t *cr,
3352 caller_context_t *ct,
3353 int *deflags, /* Returned per-dirent flags */
3354 pathname_t *ppnp) /* Returned case-preserved name in directory */
3356 int ret;
3359 * If this file system doesn't support case-insensitive access
3360 * and said access is requested, fail quickly. It is required
3361 * that if the vfs supports case-insensitive lookup, it also
3362 * supports extended dirent flags.
3364 if (flags & FIGNORECASE &&
3365 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3366 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3367 return (EINVAL);
3369 VOPXID_MAP_CR(dvp, cr);
3371 if ((flags & LOOKUP_XATTR) && (flags & LOOKUP_HAVE_SYSATTR_DIR) == 0) {
3372 ret = xattr_dir_lookup(dvp, vpp, flags, cr);
3373 } else {
3374 ret = fop_lookup_dispatch(dvp, nm, vpp, pnp, flags, rdir, cr,
3375 ct, deflags, ppnp, true);
3378 if (ret == 0 && *vpp) {
3379 VOPSTATS_UPDATE(*vpp, lookup);
3380 vn_updatepath(dvp, *vpp, nm);
3383 return (ret);
3387 fop_create(
3388 vnode_t *dvp,
3389 char *name,
3390 vattr_t *vap,
3391 vcexcl_t excl,
3392 int mode,
3393 vnode_t **vpp,
3394 cred_t *cr,
3395 int flags,
3396 caller_context_t *ct,
3397 vsecattr_t *vsecp) /* ACL to set during create */
3399 int ret;
3401 if (vsecp != NULL &&
3402 vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
3403 return (EINVAL);
3406 * If this file system doesn't support case-insensitive access
3407 * and said access is requested, fail quickly.
3409 if (flags & FIGNORECASE &&
3410 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3411 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3412 return (EINVAL);
3414 VOPXID_MAP_CR(dvp, cr);
3416 ret = fop_create_dispatch(dvp, name, vap, excl, mode, vpp, cr, flags,
3417 ct, vsecp, true);
3419 if (ret == 0 && *vpp) {
3420 VOPSTATS_UPDATE(*vpp, create);
3421 vn_updatepath(dvp, *vpp, name);
3424 return (ret);
3428 fop_remove(
3429 vnode_t *dvp,
3430 char *nm,
3431 cred_t *cr,
3432 caller_context_t *ct,
3433 int flags)
3435 int err;
3438 * If this file system doesn't support case-insensitive access
3439 * and said access is requested, fail quickly.
3441 if (flags & FIGNORECASE &&
3442 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3443 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3444 return (EINVAL);
3446 VOPXID_MAP_CR(dvp, cr);
3448 err = fop_remove_dispatch(dvp, nm, cr, ct, flags, true);
3450 VOPSTATS_UPDATE(dvp, remove);
3451 return (err);
3455 fop_link(
3456 vnode_t *tdvp,
3457 vnode_t *svp,
3458 char *tnm,
3459 cred_t *cr,
3460 caller_context_t *ct,
3461 int flags)
3463 int err;
3466 * If the target file system doesn't support case-insensitive access
3467 * and said access is requested, fail quickly.
3469 if (flags & FIGNORECASE &&
3470 (vfs_has_feature(tdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3471 vfs_has_feature(tdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3472 return (EINVAL);
3474 VOPXID_MAP_CR(tdvp, cr);
3476 err = fop_link_dispatch(tdvp, svp, tnm, cr, ct, flags, true);
3478 VOPSTATS_UPDATE(tdvp, link);
3479 return (err);
3483 fop_rename(
3484 vnode_t *sdvp,
3485 char *snm,
3486 vnode_t *tdvp,
3487 char *tnm,
3488 cred_t *cr,
3489 caller_context_t *ct,
3490 int flags)
3492 int err;
3495 * If the file system involved does not support
3496 * case-insensitive access and said access is requested, fail
3497 * quickly.
3499 if (flags & FIGNORECASE &&
3500 ((vfs_has_feature(sdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3501 vfs_has_feature(sdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0)))
3502 return (EINVAL);
3504 VOPXID_MAP_CR(tdvp, cr);
3506 err = fop_rename_dispatch(sdvp, snm, tdvp, tnm, cr, ct, flags, true);
3508 VOPSTATS_UPDATE(sdvp, rename);
3509 return (err);
3513 fop_mkdir(
3514 vnode_t *dvp,
3515 char *dirname,
3516 vattr_t *vap,
3517 vnode_t **vpp,
3518 cred_t *cr,
3519 caller_context_t *ct,
3520 int flags,
3521 vsecattr_t *vsecp) /* ACL to set during create */
3523 int ret;
3525 if (vsecp != NULL &&
3526 vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
3527 return (EINVAL);
3530 * If this file system doesn't support case-insensitive access
3531 * and said access is requested, fail quickly.
3533 if (flags & FIGNORECASE &&
3534 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3535 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3536 return (EINVAL);
3538 VOPXID_MAP_CR(dvp, cr);
3540 ret = fop_mkdir_dispatch(dvp, dirname, vap, vpp, cr, ct, flags, vsecp,
3541 true);
3543 if (ret == 0 && *vpp) {
3544 VOPSTATS_UPDATE(*vpp, mkdir);
3545 vn_updatepath(dvp, *vpp, dirname);
3548 return (ret);
3552 fop_rmdir(
3553 vnode_t *dvp,
3554 char *nm,
3555 vnode_t *cdir,
3556 cred_t *cr,
3557 caller_context_t *ct,
3558 int flags)
3560 int err;
3563 * If this file system doesn't support case-insensitive access
3564 * and said access is requested, fail quickly.
3566 if (flags & FIGNORECASE &&
3567 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3568 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3569 return (EINVAL);
3571 VOPXID_MAP_CR(dvp, cr);
3573 err = fop_rmdir_dispatch(dvp, nm, cdir, cr, ct, flags, true);
3575 VOPSTATS_UPDATE(dvp, rmdir);
3576 return (err);
3580 fop_readdir(
3581 vnode_t *vp,
3582 uio_t *uiop,
3583 cred_t *cr,
3584 int *eofp,
3585 caller_context_t *ct,
3586 int flags)
3588 int err;
3589 ssize_t resid_start = uiop->uio_resid;
3592 * If this file system doesn't support retrieving directory
3593 * entry flags and said access is requested, fail quickly.
3595 if (flags & V_RDDIR_ENTFLAGS &&
3596 vfs_has_feature(vp->v_vfsp, VFSFT_DIRENTFLAGS) == 0)
3597 return (EINVAL);
3599 VOPXID_MAP_CR(vp, cr);
3601 err = fop_readdir_dispatch(vp, uiop, cr, eofp, ct, flags, true);
3603 VOPSTATS_UPDATE_IO(vp, readdir,
3604 readdir_bytes, (resid_start - uiop->uio_resid));
3605 return (err);
3609 fop_symlink(
3610 vnode_t *dvp,
3611 char *linkname,
3612 vattr_t *vap,
3613 char *target,
3614 cred_t *cr,
3615 caller_context_t *ct,
3616 int flags)
3618 int err;
3619 xvattr_t xvattr;
3622 * If this file system doesn't support case-insensitive access
3623 * and said access is requested, fail quickly.
3625 if (flags & FIGNORECASE &&
3626 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3627 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3628 return (EINVAL);
3630 VOPXID_MAP_CR(dvp, cr);
3632 /* check for reparse point */
3633 if ((vfs_has_feature(dvp->v_vfsp, VFSFT_REPARSE)) &&
3634 (strncmp(target, FS_REPARSE_TAG_STR,
3635 strlen(FS_REPARSE_TAG_STR)) == 0)) {
3636 if (!fs_reparse_mark(target, vap, &xvattr))
3637 vap = (vattr_t *)&xvattr;
3640 err = fop_symlink_dispatch(dvp, linkname, vap, target, cr, ct, flags,
3641 true);
3643 VOPSTATS_UPDATE(dvp, symlink);
3644 return (err);
3648 fop_readlink(
3649 vnode_t *vp,
3650 uio_t *uiop,
3651 cred_t *cr,
3652 caller_context_t *ct)
3654 int err;
3656 VOPXID_MAP_CR(vp, cr);
3658 err = fop_readlink_dispatch(vp, uiop, cr, ct, true);
3660 VOPSTATS_UPDATE(vp, readlink);
3661 return (err);
3665 fop_fsync(
3666 vnode_t *vp,
3667 int syncflag,
3668 cred_t *cr,
3669 caller_context_t *ct)
3671 int err;
3673 VOPXID_MAP_CR(vp, cr);
3675 err = fop_fsync_dispatch(vp, syncflag, cr, ct, true);
3677 VOPSTATS_UPDATE(vp, fsync);
3678 return (err);
3681 void
3682 fop_inactive(
3683 vnode_t *vp,
3684 cred_t *cr,
3685 caller_context_t *ct)
3687 /* Need to update stats before vop call since we may lose the vnode */
3688 VOPSTATS_UPDATE(vp, inactive);
3690 VOPXID_MAP_CR(vp, cr);
3692 fop_inactive_dispatch(vp, cr, ct, true);
3696 fop_fid(
3697 vnode_t *vp,
3698 fid_t *fidp,
3699 caller_context_t *ct)
3701 int err;
3703 err = fop_fid_dispatch(vp, fidp, ct, true);
3705 VOPSTATS_UPDATE(vp, fid);
3706 return (err);
3710 fop_rwlock(
3711 vnode_t *vp,
3712 int write_lock,
3713 caller_context_t *ct)
3715 int ret;
3717 ret = fop_rwlock_dispatch(vp, write_lock, ct, true);
3719 VOPSTATS_UPDATE(vp, rwlock);
3720 return (ret);
3723 void
3724 fop_rwunlock(
3725 vnode_t *vp,
3726 int write_lock,
3727 caller_context_t *ct)
3729 fop_rwunlock_dispatch(vp, write_lock, ct, true);
3731 VOPSTATS_UPDATE(vp, rwunlock);
3735 fop_seek(
3736 vnode_t *vp,
3737 offset_t ooff,
3738 offset_t *noffp,
3739 caller_context_t *ct)
3741 int err;
3743 err = fop_seek_dispatch(vp, ooff, noffp, ct, true);
3745 VOPSTATS_UPDATE(vp, seek);
3746 return (err);
3750 fop_cmp(
3751 vnode_t *vp1,
3752 vnode_t *vp2,
3753 caller_context_t *ct)
3755 int err;
3757 err = fop_cmp_dispatch(vp1, vp2, ct, true);
3759 VOPSTATS_UPDATE(vp1, cmp);
3760 return (err);
3764 fop_frlock(
3765 vnode_t *vp,
3766 int cmd,
3767 flock64_t *bfp,
3768 int flag,
3769 offset_t offset,
3770 struct flk_callback *flk_cbp,
3771 cred_t *cr,
3772 caller_context_t *ct)
3774 int err;
3776 VOPXID_MAP_CR(vp, cr);
3778 err = fop_frlock_dispatch(vp, cmd, bfp, flag, offset, flk_cbp, cr,
3779 ct, true);
3781 VOPSTATS_UPDATE(vp, frlock);
3782 return (err);
3786 fop_space(
3787 vnode_t *vp,
3788 int cmd,
3789 flock64_t *bfp,
3790 int flag,
3791 offset_t offset,
3792 cred_t *cr,
3793 caller_context_t *ct)
3795 int err;
3797 VOPXID_MAP_CR(vp, cr);
3799 err = fop_space_dispatch(vp, cmd, bfp, flag, offset, cr, ct, true);
3801 VOPSTATS_UPDATE(vp, space);
3802 return (err);
3806 fop_realvp(
3807 vnode_t *vp,
3808 vnode_t **vpp,
3809 caller_context_t *ct)
3811 int err;
3813 err = fop_realvp_dispatch(vp, vpp, ct, true);
3815 VOPSTATS_UPDATE(vp, realvp);
3816 return (err);
3820 fop_getpage(
3821 vnode_t *vp,
3822 offset_t off,
3823 size_t len,
3824 uint_t *protp,
3825 page_t **plarr,
3826 size_t plsz,
3827 struct seg *seg,
3828 caddr_t addr,
3829 enum seg_rw rw,
3830 cred_t *cr,
3831 caller_context_t *ct)
3833 int err;
3835 VOPXID_MAP_CR(vp, cr);
3837 err = fop_getpage_dispatch(vp, off, len, protp, plarr, plsz, seg,
3838 addr, rw, cr, ct, true);
3840 VOPSTATS_UPDATE(vp, getpage);
3841 return (err);
3845 fop_putpage(
3846 vnode_t *vp,
3847 offset_t off,
3848 size_t len,
3849 int flags,
3850 cred_t *cr,
3851 caller_context_t *ct)
3853 int err;
3855 VOPXID_MAP_CR(vp, cr);
3857 err = fop_putpage_dispatch(vp, off, len, flags, cr, ct, true);
3859 VOPSTATS_UPDATE(vp, putpage);
3860 return (err);
3864 fop_map(
3865 vnode_t *vp,
3866 offset_t off,
3867 struct as *as,
3868 caddr_t *addrp,
3869 size_t len,
3870 uchar_t prot,
3871 uchar_t maxprot,
3872 uint_t flags,
3873 cred_t *cr,
3874 caller_context_t *ct)
3876 int err;
3878 VOPXID_MAP_CR(vp, cr);
3880 err = fop_map_dispatch(vp, off, as, addrp, len, prot, maxprot,
3881 flags, cr, ct, true);
3883 VOPSTATS_UPDATE(vp, map);
3884 return (err);
3888 fop_addmap(
3889 vnode_t *vp,
3890 offset_t off,
3891 struct as *as,
3892 caddr_t addr,
3893 size_t len,
3894 uchar_t prot,
3895 uchar_t maxprot,
3896 uint_t flags,
3897 cred_t *cr,
3898 caller_context_t *ct)
3900 int error;
3901 u_longlong_t delta;
3903 VOPXID_MAP_CR(vp, cr);
3905 error = fop_addmap_dispatch(vp, off, as, addr, len, prot, maxprot,
3906 flags, cr, ct, true);
3908 if ((!error) && (vp->v_type == VREG)) {
3909 delta = (u_longlong_t)btopr(len);
3911 * If file is declared MAP_PRIVATE, it can't be written back
3912 * even if open for write. Handle as read.
3914 if (flags & MAP_PRIVATE) {
3915 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
3916 (int64_t)delta);
3917 } else {
3919 * atomic_add_64 forces the fetch of a 64 bit value to
3920 * be atomic on 32 bit machines
3922 if (maxprot & PROT_WRITE)
3923 atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
3924 (int64_t)delta);
3925 if (maxprot & PROT_READ)
3926 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
3927 (int64_t)delta);
3928 if (maxprot & PROT_EXEC)
3929 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
3930 (int64_t)delta);
3933 VOPSTATS_UPDATE(vp, addmap);
3934 return (error);
3938 fop_delmap(
3939 vnode_t *vp,
3940 offset_t off,
3941 struct as *as,
3942 caddr_t addr,
3943 size_t len,
3944 uint_t prot,
3945 uint_t maxprot,
3946 uint_t flags,
3947 cred_t *cr,
3948 caller_context_t *ct)
3950 int error;
3951 u_longlong_t delta;
3953 VOPXID_MAP_CR(vp, cr);
3955 error = fop_delmap_dispatch(vp, off, as, addr, len, prot, maxprot,
3956 flags, cr, ct, true);
3959 * NFS calls into delmap twice, the first time
3960 * it simply establishes a callback mechanism and returns EAGAIN
3961 * while the real work is being done upon the second invocation.
3962 * We have to detect this here and only decrement the counts upon
3963 * the second delmap request.
3965 if ((error != EAGAIN) && (vp->v_type == VREG)) {
3967 delta = (u_longlong_t)btopr(len);
3969 if (flags & MAP_PRIVATE) {
3970 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
3971 (int64_t)(-delta));
3972 } else {
3974 * atomic_add_64 forces the fetch of a 64 bit value
3975 * to be atomic on 32 bit machines
3977 if (maxprot & PROT_WRITE)
3978 atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
3979 (int64_t)(-delta));
3980 if (maxprot & PROT_READ)
3981 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
3982 (int64_t)(-delta));
3983 if (maxprot & PROT_EXEC)
3984 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
3985 (int64_t)(-delta));
3988 VOPSTATS_UPDATE(vp, delmap);
3989 return (error);
3994 fop_poll(
3995 vnode_t *vp,
3996 short events,
3997 int anyyet,
3998 short *reventsp,
3999 struct pollhead **phpp,
4000 caller_context_t *ct)
4002 int err;
4004 err = fop_poll_dispatch(vp, events, anyyet, reventsp, phpp, ct, true);
4006 VOPSTATS_UPDATE(vp, poll);
4007 return (err);
4011 fop_dump(
4012 vnode_t *vp,
4013 caddr_t addr,
4014 offset_t lbdn,
4015 offset_t dblks,
4016 caller_context_t *ct)
4018 int err;
4020 /* ensure lbdn and dblks can be passed safely to bdev_dump */
4021 if ((lbdn != (daddr_t)lbdn) || (dblks != (int)dblks))
4022 return (EIO);
4024 err = fop_dump_dispatch(vp, addr, lbdn, dblks, ct, true);
4026 VOPSTATS_UPDATE(vp, dump);
4027 return (err);
4031 fop_pathconf(
4032 vnode_t *vp,
4033 int cmd,
4034 ulong_t *valp,
4035 cred_t *cr,
4036 caller_context_t *ct)
4038 int err;
4040 VOPXID_MAP_CR(vp, cr);
4042 err = fop_pathconf_dispatch(vp, cmd, valp, cr, ct, true);
4044 VOPSTATS_UPDATE(vp, pathconf);
4045 return (err);
4049 fop_pageio(
4050 vnode_t *vp,
4051 struct page *pp,
4052 uoff_t io_off,
4053 size_t io_len,
4054 int flags,
4055 cred_t *cr,
4056 caller_context_t *ct)
4058 int err;
4060 VOPXID_MAP_CR(vp, cr);
4062 err = fop_pageio_dispatch(vp, pp, io_off, io_len, flags, cr, ct, true);
4064 VOPSTATS_UPDATE(vp, pageio);
4065 return (err);
4069 fop_dumpctl(
4070 vnode_t *vp,
4071 int action,
4072 offset_t *blkp,
4073 caller_context_t *ct)
4075 int err;
4077 err = fop_dumpctl_dispatch(vp, action, blkp, ct, true);
4079 VOPSTATS_UPDATE(vp, dumpctl);
4080 return (err);
4083 void
4084 fop_dispose(
4085 vnode_t *vp,
4086 page_t *pp,
4087 int flag,
4088 int dn,
4089 cred_t *cr,
4090 caller_context_t *ct)
4092 /* Must do stats first since it's possible to lose the vnode */
4093 VOPSTATS_UPDATE(vp, dispose);
4095 VOPXID_MAP_CR(vp, cr);
4097 fop_dispose_dispatch(vp, pp, flag, dn, cr, ct, true);
4101 fop_setsecattr(
4102 vnode_t *vp,
4103 vsecattr_t *vsap,
4104 int flag,
4105 cred_t *cr,
4106 caller_context_t *ct)
4108 int err;
4110 VOPXID_MAP_CR(vp, cr);
4113 * We're only allowed to skip the ACL check iff we used a 32 bit
4114 * ACE mask with fop_access() to determine permissions.
4116 if ((flag & ATTR_NOACLCHECK) &&
4117 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
4118 return (EINVAL);
4121 err = fop_setsecattr_dispatch(vp, vsap, flag, cr, ct, true);
4123 VOPSTATS_UPDATE(vp, setsecattr);
4124 return (err);
4128 fop_getsecattr(
4129 vnode_t *vp,
4130 vsecattr_t *vsap,
4131 int flag,
4132 cred_t *cr,
4133 caller_context_t *ct)
4135 int err;
4138 * We're only allowed to skip the ACL check iff we used a 32 bit
4139 * ACE mask with fop_access() to determine permissions.
4141 if ((flag & ATTR_NOACLCHECK) &&
4142 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
4143 return (EINVAL);
4146 VOPXID_MAP_CR(vp, cr);
4148 err = fop_getsecattr_dispatch(vp, vsap, flag, cr, ct, true);
4150 VOPSTATS_UPDATE(vp, getsecattr);
4151 return (err);
4155 fop_shrlock(
4156 vnode_t *vp,
4157 int cmd,
4158 struct shrlock *shr,
4159 int flag,
4160 cred_t *cr,
4161 caller_context_t *ct)
4163 int err;
4165 VOPXID_MAP_CR(vp, cr);
4167 err = fop_shrlock_dispatch(vp, cmd, shr, flag, cr, ct, true);
4169 VOPSTATS_UPDATE(vp, shrlock);
4170 return (err);
4174 fop_vnevent(vnode_t *vp, vnevent_t vnevent, vnode_t *dvp, char *fnm,
4175 caller_context_t *ct)
4177 int err;
4179 err = fop_vnevent_dispatch(vp, vnevent, dvp, fnm, ct, true);
4181 VOPSTATS_UPDATE(vp, vnevent);
4182 return (err);
4186 fop_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *uiop, cred_t *cr,
4187 caller_context_t *ct)
4189 int err;
4191 if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
4192 return (ENOTSUP);
4194 err = fop_reqzcbuf_dispatch(vp, ioflag, uiop, cr, ct, true);
4196 VOPSTATS_UPDATE(vp, reqzcbuf);
4197 return (err);
4201 fop_retzcbuf(vnode_t *vp, xuio_t *uiop, cred_t *cr, caller_context_t *ct)
4203 int err;
4205 if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
4206 return (ENOTSUP);
4208 err = fop_retzcbuf_dispatch(vp, uiop, cr, ct, true);
4210 VOPSTATS_UPDATE(vp, retzcbuf);
4211 return (err);
/*
 * Do-nothing destructor.
 *
 * A NULL entry in the destructor table marks a key as unused, so
 * vsd_create() installs this function for callers that pass no destructor
 * of their own.
 */
/* ARGSUSED */
void
vsd_defaultdestructor(void *value)
{
}
4224 * Create a key (index into per vnode array)
4225 * Locks out vsd_create, vsd_destroy, and vsd_free
4226 * May allocate memory with lock held
4228 void
4229 vsd_create(uint_t *keyp, void (*destructor)(void *))
4231 int i;
4232 uint_t nkeys;
4235 * if key is allocated, do nothing
4237 mutex_enter(&vsd_lock);
4238 if (*keyp) {
4239 mutex_exit(&vsd_lock);
4240 return;
4243 * find an unused key
4245 if (destructor == NULL)
4246 destructor = vsd_defaultdestructor;
4248 for (i = 0; i < vsd_nkeys; ++i)
4249 if (vsd_destructor[i] == NULL)
4250 break;
4253 * if no unused keys, increase the size of the destructor array
4255 if (i == vsd_nkeys) {
4256 if ((nkeys = (vsd_nkeys << 1)) == 0)
4257 nkeys = 1;
4258 vsd_destructor =
4259 (void (**)(void *))vsd_realloc((void *)vsd_destructor,
4260 (size_t)(vsd_nkeys * sizeof (void (*)(void *))),
4261 (size_t)(nkeys * sizeof (void (*)(void *))));
4262 vsd_nkeys = nkeys;
4266 * allocate the next available unused key
4268 vsd_destructor[i] = destructor;
4269 *keyp = i + 1;
4271 /* create vsd_list, if it doesn't exist */
4272 if (vsd_list == NULL) {
4273 vsd_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
4274 list_create(vsd_list, sizeof (struct vsd_node),
4275 offsetof(struct vsd_node, vs_nodes));
4278 mutex_exit(&vsd_lock);
4282 * Destroy a key
4284 * Assumes that the caller is preventing vsd_set and vsd_get
4285 * Locks out vsd_create, vsd_destroy, and vsd_free
4286 * May free memory with lock held
4288 void
4289 vsd_destroy(uint_t *keyp)
4291 uint_t key;
4292 struct vsd_node *vsd;
4295 * protect the key namespace and our destructor lists
4297 mutex_enter(&vsd_lock);
4298 key = *keyp;
4299 *keyp = 0;
4301 ASSERT(key <= vsd_nkeys);
4304 * if the key is valid
4306 if (key != 0) {
4307 uint_t k = key - 1;
4309 * for every vnode with VSD, call key's destructor
4311 for (vsd = list_head(vsd_list); vsd != NULL;
4312 vsd = list_next(vsd_list, vsd)) {
4314 * no VSD for key in this vnode
4316 if (key > vsd->vs_nkeys)
4317 continue;
4319 * call destructor for key
4321 if (vsd->vs_value[k] && vsd_destructor[k])
4322 (*vsd_destructor[k])(vsd->vs_value[k]);
4324 * reset value for key
4326 vsd->vs_value[k] = NULL;
4329 * actually free the key (NULL destructor == unused)
4331 vsd_destructor[k] = NULL;
4334 mutex_exit(&vsd_lock);
4338 * Quickly return the per vnode value that was stored with the specified key
4339 * Assumes the caller is protecting key from vsd_create and vsd_destroy
4340 * Assumes the caller is holding v_vsd_lock to protect the vsd.
4342 void *
4343 vsd_get(vnode_t *vp, uint_t key)
4345 struct vsd_node *vsd;
4347 ASSERT(vp != NULL);
4348 ASSERT(mutex_owned(&vp->v_vsd_lock));
4350 vsd = vp->v_vsd;
4352 if (key && vsd != NULL && key <= vsd->vs_nkeys)
4353 return (vsd->vs_value[key - 1]);
4354 return (NULL);
4358 * Set a per vnode value indexed with the specified key
4359 * Assumes the caller is holding v_vsd_lock to protect the vsd.
4362 vsd_set(vnode_t *vp, uint_t key, void *value)
4364 struct vsd_node *vsd;
4366 ASSERT(vp != NULL);
4367 ASSERT(mutex_owned(&vp->v_vsd_lock));
4369 if (key == 0)
4370 return (EINVAL);
4372 vsd = vp->v_vsd;
4373 if (vsd == NULL)
4374 vsd = vp->v_vsd = kmem_zalloc(sizeof (*vsd), KM_SLEEP);
4377 * If the vsd was just allocated, vs_nkeys will be 0, so the following
4378 * code won't happen and we will continue down and allocate space for
4379 * the vs_value array.
4380 * If the caller is replacing one value with another, then it is up
4381 * to the caller to free/rele/destroy the previous value (if needed).
4383 if (key <= vsd->vs_nkeys) {
4384 vsd->vs_value[key - 1] = value;
4385 return (0);
4388 ASSERT(key <= vsd_nkeys);
4390 if (vsd->vs_nkeys == 0) {
4391 mutex_enter(&vsd_lock); /* lock out vsd_destroy() */
4393 * Link onto list of all VSD nodes.
4395 list_insert_head(vsd_list, vsd);
4396 mutex_exit(&vsd_lock);
4400 * Allocate vnode local storage and set the value for key
4402 vsd->vs_value = vsd_realloc(vsd->vs_value,
4403 vsd->vs_nkeys * sizeof (void *),
4404 key * sizeof (void *));
4405 vsd->vs_nkeys = key;
4406 vsd->vs_value[key - 1] = value;
4408 return (0);
4412 * Called from vn_free() to run the destructor function for each vsd
4413 * Locks out vsd_create and vsd_destroy
4414 * Assumes that the destructor *DOES NOT* use vsd
4416 void
4417 vsd_free(vnode_t *vp)
4419 int i;
4420 struct vsd_node *vsd = vp->v_vsd;
4422 if (vsd == NULL)
4423 return;
4425 if (vsd->vs_nkeys == 0) {
4426 kmem_free(vsd, sizeof (*vsd));
4427 vp->v_vsd = NULL;
4428 return;
4432 * lock out vsd_create and vsd_destroy, call
4433 * the destructor, and mark the value as destroyed.
4435 mutex_enter(&vsd_lock);
4437 for (i = 0; i < vsd->vs_nkeys; i++) {
4438 if (vsd->vs_value[i] && vsd_destructor[i])
4439 (*vsd_destructor[i])(vsd->vs_value[i]);
4440 vsd->vs_value[i] = NULL;
4444 * remove from linked list of VSD nodes
4446 list_remove(vsd_list, vsd);
4448 mutex_exit(&vsd_lock);
4451 * free up the VSD
4453 kmem_free(vsd->vs_value, vsd->vs_nkeys * sizeof (void *));
4454 kmem_free(vsd, sizeof (struct vsd_node));
4455 vp->v_vsd = NULL;
4459 * realloc
4461 static void *
4462 vsd_realloc(void *old, size_t osize, size_t nsize)
4464 void *new;
4466 new = kmem_zalloc(nsize, KM_SLEEP);
4467 if (old) {
4468 bcopy(old, new, osize);
4469 kmem_free(old, osize);
4471 return (new);
4475 * Setup the extensible system attribute for creating a reparse point.
4476 * The symlink data 'target' is validated for proper format of a reparse
4477 * string and a check also made to make sure the symlink data does not
4478 * point to an existing file.
4480 * return 0 if ok else -1.
4482 static int
4483 fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr)
4485 xoptattr_t *xoap;
4487 if ((!target) || (!vap) || (!xvattr))
4488 return (-1);
4490 /* validate reparse string */
4491 if (reparse_validate((const char *)target))
4492 return (-1);
4494 xva_init(xvattr);
4495 xvattr->xva_vattr = *vap;
4496 xvattr->xva_vattr.va_mask |= AT_XVATTR;
4497 xoap = xva_getxoptattr(xvattr);
4498 ASSERT(xoap);
4499 XVA_SET_REQ(xvattr, XAT_REPARSE);
4500 xoap->xoa_reparse = 1;
4502 return (0);
4506 * Function to check whether a symlink is a reparse point.
4507 * Return B_TRUE if it is a reparse point, else return B_FALSE
4509 boolean_t
4510 vn_is_reparse(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4512 xvattr_t xvattr;
4513 xoptattr_t *xoap;
4515 if ((vp->v_type != VLNK) ||
4516 !(vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR)))
4517 return (B_FALSE);
4519 xva_init(&xvattr);
4520 xoap = xva_getxoptattr(&xvattr);
4521 ASSERT(xoap);
4522 XVA_SET_REQ(&xvattr, XAT_REPARSE);
4524 if (fop_getattr(vp, &xvattr.xva_vattr, 0, cr, ct))
4525 return (B_FALSE);
4527 if ((!(xvattr.xva_vattr.va_mask & AT_XVATTR)) ||
4528 (!(XVA_ISSET_RTN(&xvattr, XAT_REPARSE))))
4529 return (B_FALSE);
4531 return (xoap->xoa_reparse ? B_TRUE : B_FALSE);