Merge remote-tracking branch 'origin/master'
[unleashed/lotheac.git] / kernel / fs / vnode.c
blobf2ff5d8336a36e11fb8c23b5fe239ed44fa52fcf
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
23 * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2017, Joyent, Inc.
25 * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
26 * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
29 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
30 /* All Rights Reserved */
33 * University Copyright- Copyright (c) 1982, 1986, 1988
34 * The Regents of the University of California
35 * All Rights Reserved
37 * University Acknowledgment- Portions of this document are derived from
38 * software developed by the University of California, Berkeley, and its
39 * contributors.
42 #include <sys/types.h>
43 #include <sys/param.h>
44 #include <sys/t_lock.h>
45 #include <sys/errno.h>
46 #include <sys/cred.h>
47 #include <sys/user.h>
48 #include <sys/uio.h>
49 #include <sys/file.h>
50 #include <sys/pathname.h>
51 #include <sys/atomic.h>
52 #include <sys/vfs.h>
53 #include <sys/vnode.h>
54 #include <sys/vnode_dispatch.h>
55 #include <sys/rwstlock.h>
56 #include <sys/fem.h>
57 #include <sys/stat.h>
58 #include <sys/mode.h>
59 #include <sys/conf.h>
60 #include <sys/sysmacros.h>
61 #include <sys/cmn_err.h>
62 #include <sys/systm.h>
63 #include <sys/kmem.h>
64 #include <sys/debug.h>
65 #include <sys/acl.h>
66 #include <sys/nbmlock.h>
67 #include <sys/fcntl.h>
68 #include <sys/fs_subr.h>
69 #include <sys/taskq.h>
70 #include <sys/fs_reparse.h>
71 #include <sys/time.h>
72 #include <sys/sdt.h>
/*
 * Evaluates to non-zero when vp is not a character device, block device
 * or FIFO and vn_is_readonly() reports the vnode as read-only.
 */
#define	ISROFILE(vp)	\
	(!((vp)->v_type == VCHR || (vp)->v_type == VBLK || \
	(vp)->v_type == VFIFO) && vn_is_readonly(vp))
79 /* Tunable via /etc/system; used only by admin/install */
80 int nfs_global_client_only;
83 * Array of vopstats_t for per-FS-type vopstats. This array has the same
84 * number of entries as and parallel to the vfssw table. (Arguably, it could
85 * be part of the vfssw table.) Once it's initialized, it's accessed using
86 * the same fstype index that is used to index into the vfssw table.
88 vopstats_t **vopstats_fstype;
90 /* vopstats initialization template used for fast initialization via bcopy() */
91 static vopstats_t *vs_templatep;
93 /* Kmem cache handle for vsk_anchor_t allocations */
94 kmem_cache_t *vsk_anchor_cache;
96 /* file events cleanup routine */
97 extern void free_fopdata(vnode_t *);
100 * Root of AVL tree for the kstats associated with vopstats. Lock protects
101 * updates to vsktat_tree.
103 avl_tree_t vskstat_tree;
104 kmutex_t vskstat_tree_lock;
106 /* Global variable which enables/disables the vopstats collection */
107 int vopstats_enabled = 1;
109 /* Global used for empty/invalid v_path */
110 char *vn_vpath_empty = "";
113 * forward declarations for internal vnode specific data (vsd)
115 static void *vsd_realloc(void *, size_t, size_t);
118 * forward declarations for reparse point functions
120 static int fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr);
123 * VSD -- VNODE SPECIFIC DATA
124 * The v_data pointer is typically used by a file system to store a
125 * pointer to the file system's private node (e.g. ufs inode, nfs rnode).
126 * However, there are times when additional project private data needs
127 * to be stored separately from the data (node) pointed to by v_data.
128 * This additional data could be stored by the file system itself or
129 * by a completely different kernel entity. VSD provides a way for
130 * callers to obtain a key and store a pointer to private data associated
131 * with a vnode.
133 * Callers are responsible for protecting the vsd by holding v_vsd_lock
134 * for calls to vsd_set() and vsd_get().
138 * vsd_lock protects:
139 * vsd_nkeys - creation and deletion of vsd keys
140 * vsd_list - insertion and deletion of vsd_node in the vsd_list
141 * vsd_destructor - adding and removing destructors to the list
143 static kmutex_t vsd_lock;
144 static uint_t vsd_nkeys; /* size of destructor array */
145 /* list of vsd_node's */
146 static list_t *vsd_list = NULL;
147 /* per-key destructor funcs */
148 static void (**vsd_destructor)(void *);
/*
 * Common actions needed to update the vopstats structures from a vnode op.
 * VOPSTATS_UPDATE() and VOPSTATS_UPDATE_IO() are near duplicates; the only
 * difference is that the latter also records the bytes transferred.  Any
 * change made to one probably has to be mirrored in the other.
 *
 *	vp	- pointer to the vnode
 *	counter	- partial member name; "n<counter>" is the vopstats member
 *		  that is incremented
 *
 * Nothing is counted unless the vnode's vfs exists, has a vfs_implp, has
 * VFS_STATS set, and the vnode is not VBAD.  The per-vfs counter is
 * bumped (after firing the matching fsinfo DTrace probe) and then, when
 * vfs_fstypevsp is set, the per-fstype counter as well.
 */
#define	VOPSTATS_UPDATE(vp, counter)	{				\
	vfs_t *vfsp = (vp)->v_vfsp;					\
	if (vfsp != NULL && vfsp->vfs_implp != NULL &&			\
	    (vfsp->vfs_flag & VFS_STATS) != 0 &&			\
	    (vp)->v_type != VBAD) {					\
		vopstats_t *vsp = &vfsp->vfs_vopstats;			\
		uint64_t *stataddr = &(vsp->n##counter.value.ui64);	\
		extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
		    size_t, uint64_t *);				\
		__dtrace_probe___fsinfo_##counter(vp, 0, stataddr);	\
		(*stataddr)++;						\
		vsp = vfsp->vfs_fstypevsp;				\
		if (vsp != NULL) {					\
			vsp->n##counter.value.ui64++;			\
		}							\
	}								\
}
/*
 * I/O flavour of the vopstats update: same gating as the plain counter
 * update, but in addition to incrementing "n<counter>" it adds bytesval
 * to the "bytecounter" member, in both the per-vfs and (when present)
 * the per-fstype vopstats.  bytesval is also handed to the DTrace probe.
 *
 *	vp		- pointer to the vnode
 *	counter		- partial member name for the operation count
 *	bytecounter	- member name for the byte count
 *	bytesval	- number of bytes to add
 */
#define	VOPSTATS_UPDATE_IO(vp, counter, bytecounter, bytesval)	{	\
	vfs_t *vfsp = (vp)->v_vfsp;					\
	if (vfsp != NULL && vfsp->vfs_implp != NULL &&			\
	    (vfsp->vfs_flag & VFS_STATS) != 0 &&			\
	    (vp)->v_type != VBAD) {					\
		vopstats_t *vsp = &vfsp->vfs_vopstats;			\
		uint64_t *stataddr = &(vsp->n##counter.value.ui64);	\
		extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
		    size_t, uint64_t *);				\
		__dtrace_probe___fsinfo_##counter(vp, bytesval, stataddr); \
		(*stataddr)++;						\
		vsp->bytecounter.value.ui64 += bytesval;		\
		vsp = vfsp->vfs_fstypevsp;				\
		if (vsp != NULL) {					\
			vsp->n##counter.value.ui64++;			\
			vsp->bytecounter.value.ui64 += bytesval;	\
		}							\
	}								\
}
/*
 * If the filesystem does not support XIDs (VFS_XID is clear in vfs_flag),
 * replace cr with the credential returned by crgetmapped().
 * If the vfsp is NULL, perhaps we should also map?
 */
#define	VOPXID_MAP_CR(vp, cr)	{					\
	vfs_t *vfsp = (vp)->v_vfsp;					\
	if (vfsp != NULL && !(vfsp->vfs_flag & VFS_XID))		\
		cr = crgetmapped(cr);					\
	}
211 * Convert stat(2) formats to vnode types and vice versa. (Knows about
212 * numerical order of S_IFMT and vnode types.)
214 enum vtype iftovt_tab[] = {
215 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
216 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
219 ushort_t vttoif_tab[] = {
220 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFIFO,
221 S_IFDOOR, 0, S_IFSOCK, S_IFPORT, 0
225 * The system vnode cache.
228 kmem_cache_t *vn_cache;
231 /* Extensible attribute (xva) routines. */
234 * Zero out the structure, set the size of the requested/returned bitmaps,
235 * set VATTR_XVATTR in the embedded vattr_t's va_mask, and set up the pointer
236 * to the returned attributes array.
238 void
239 xva_init(xvattr_t *xvap)
241 bzero(xvap, sizeof (xvattr_t));
242 xvap->xva_mapsize = XVA_MAPSIZE;
243 xvap->xva_magic = XVA_MAGIC;
244 xvap->xva_vattr.va_mask = VATTR_XVATTR;
245 xvap->xva_rtnattrmapp = &(xvap->xva_rtnattrmap)[0];
249 * If VATTR_XVATTR is set, returns a pointer to the embedded xoptattr_t
250 * structure. Otherwise, returns NULL.
252 xoptattr_t *
253 xva_getxoptattr(xvattr_t *xvap)
255 xoptattr_t *xoap = NULL;
256 if (xvap->xva_vattr.va_mask & VATTR_XVATTR)
257 xoap = &xvap->xva_xoptattrs;
258 return (xoap);
262 * Used by the AVL routines to compare two vsk_anchor_t structures in the tree.
263 * We use the f_fsid reported by VFS_STATVFS() since we use that for the
264 * kstat name.
266 static int
267 vska_compar(const void *n1, const void *n2)
269 int ret;
270 ulong_t p1 = ((vsk_anchor_t *)n1)->vsk_fsid;
271 ulong_t p2 = ((vsk_anchor_t *)n2)->vsk_fsid;
273 if (p1 < p2) {
274 ret = -1;
275 } else if (p1 > p2) {
276 ret = 1;
277 } else {
278 ret = 0;
281 return (ret);
285 * Used to create a single template which will be bcopy()ed to a newly
286 * allocated vsanchor_combo_t structure in new_vsanchor(), below.
288 static vopstats_t *
289 create_vopstats_template()
291 vopstats_t *vsp;
293 vsp = kmem_alloc(sizeof (vopstats_t), KM_SLEEP);
294 bzero(vsp, sizeof (*vsp)); /* Start fresh */
296 /* fop_open */
297 kstat_named_init(&vsp->nopen, "nopen", KSTAT_DATA_UINT64);
298 /* fop_close */
299 kstat_named_init(&vsp->nclose, "nclose", KSTAT_DATA_UINT64);
300 /* fop_read I/O */
301 kstat_named_init(&vsp->nread, "nread", KSTAT_DATA_UINT64);
302 kstat_named_init(&vsp->read_bytes, "read_bytes", KSTAT_DATA_UINT64);
303 /* fop_write I/O */
304 kstat_named_init(&vsp->nwrite, "nwrite", KSTAT_DATA_UINT64);
305 kstat_named_init(&vsp->write_bytes, "write_bytes", KSTAT_DATA_UINT64);
306 /* fop_ioctl */
307 kstat_named_init(&vsp->nioctl, "nioctl", KSTAT_DATA_UINT64);
308 /* fop_setfl */
309 kstat_named_init(&vsp->nsetfl, "nsetfl", KSTAT_DATA_UINT64);
310 /* fop_getattr */
311 kstat_named_init(&vsp->ngetattr, "ngetattr", KSTAT_DATA_UINT64);
312 /* fop_setattr */
313 kstat_named_init(&vsp->nsetattr, "nsetattr", KSTAT_DATA_UINT64);
314 /* fop_access */
315 kstat_named_init(&vsp->naccess, "naccess", KSTAT_DATA_UINT64);
316 /* fop_lookup */
317 kstat_named_init(&vsp->nlookup, "nlookup", KSTAT_DATA_UINT64);
318 /* fop_create */
319 kstat_named_init(&vsp->ncreate, "ncreate", KSTAT_DATA_UINT64);
320 /* fop_remove */
321 kstat_named_init(&vsp->nremove, "nremove", KSTAT_DATA_UINT64);
322 /* fop_link */
323 kstat_named_init(&vsp->nlink, "nlink", KSTAT_DATA_UINT64);
324 /* fop_rename */
325 kstat_named_init(&vsp->nrename, "nrename", KSTAT_DATA_UINT64);
326 /* fop_mkdir */
327 kstat_named_init(&vsp->nmkdir, "nmkdir", KSTAT_DATA_UINT64);
328 /* fop_rmdir */
329 kstat_named_init(&vsp->nrmdir, "nrmdir", KSTAT_DATA_UINT64);
330 /* fop_readdir I/O */
331 kstat_named_init(&vsp->nreaddir, "nreaddir", KSTAT_DATA_UINT64);
332 kstat_named_init(&vsp->readdir_bytes, "readdir_bytes",
333 KSTAT_DATA_UINT64);
334 /* fop_symlink */
335 kstat_named_init(&vsp->nsymlink, "nsymlink", KSTAT_DATA_UINT64);
336 /* fop_readlink */
337 kstat_named_init(&vsp->nreadlink, "nreadlink", KSTAT_DATA_UINT64);
338 /* fop_fsync */
339 kstat_named_init(&vsp->nfsync, "nfsync", KSTAT_DATA_UINT64);
340 /* fop_inactive */
341 kstat_named_init(&vsp->ninactive, "ninactive", KSTAT_DATA_UINT64);
342 /* fop_fid */
343 kstat_named_init(&vsp->nfid, "nfid", KSTAT_DATA_UINT64);
344 /* fop_rwlock */
345 kstat_named_init(&vsp->nrwlock, "nrwlock", KSTAT_DATA_UINT64);
346 /* fop_rwunlock */
347 kstat_named_init(&vsp->nrwunlock, "nrwunlock", KSTAT_DATA_UINT64);
348 /* fop_seek */
349 kstat_named_init(&vsp->nseek, "nseek", KSTAT_DATA_UINT64);
350 /* fop_cmp */
351 kstat_named_init(&vsp->ncmp, "ncmp", KSTAT_DATA_UINT64);
352 /* fop_frlock */
353 kstat_named_init(&vsp->nfrlock, "nfrlock", KSTAT_DATA_UINT64);
354 /* fop_space */
355 kstat_named_init(&vsp->nspace, "nspace", KSTAT_DATA_UINT64);
356 /* fop_realvp */
357 kstat_named_init(&vsp->nrealvp, "nrealvp", KSTAT_DATA_UINT64);
358 /* fop_getpage */
359 kstat_named_init(&vsp->ngetpage, "ngetpage", KSTAT_DATA_UINT64);
360 /* fop_putpage */
361 kstat_named_init(&vsp->nputpage, "nputpage", KSTAT_DATA_UINT64);
362 /* fop_map */
363 kstat_named_init(&vsp->nmap, "nmap", KSTAT_DATA_UINT64);
364 /* fop_addmap */
365 kstat_named_init(&vsp->naddmap, "naddmap", KSTAT_DATA_UINT64);
366 /* fop_delmap */
367 kstat_named_init(&vsp->ndelmap, "ndelmap", KSTAT_DATA_UINT64);
368 /* fop_poll */
369 kstat_named_init(&vsp->npoll, "npoll", KSTAT_DATA_UINT64);
370 /* fop_dump */
371 kstat_named_init(&vsp->ndump, "ndump", KSTAT_DATA_UINT64);
372 /* fop_pathconf */
373 kstat_named_init(&vsp->npathconf, "npathconf", KSTAT_DATA_UINT64);
374 /* fop_pageio */
375 kstat_named_init(&vsp->npageio, "npageio", KSTAT_DATA_UINT64);
376 /* fop_dumpctl */
377 kstat_named_init(&vsp->ndumpctl, "ndumpctl", KSTAT_DATA_UINT64);
378 /* fop_dispose */
379 kstat_named_init(&vsp->ndispose, "ndispose", KSTAT_DATA_UINT64);
380 /* fop_setsecattr */
381 kstat_named_init(&vsp->nsetsecattr, "nsetsecattr", KSTAT_DATA_UINT64);
382 /* fop_getsecattr */
383 kstat_named_init(&vsp->ngetsecattr, "ngetsecattr", KSTAT_DATA_UINT64);
384 /* fop_shrlock */
385 kstat_named_init(&vsp->nshrlock, "nshrlock", KSTAT_DATA_UINT64);
386 /* fop_vnevent */
387 kstat_named_init(&vsp->nvnevent, "nvnevent", KSTAT_DATA_UINT64);
388 /* fop_reqzcbuf */
389 kstat_named_init(&vsp->nreqzcbuf, "nreqzcbuf", KSTAT_DATA_UINT64);
390 /* fop_retzcbuf */
391 kstat_named_init(&vsp->nretzcbuf, "nretzcbuf", KSTAT_DATA_UINT64);
393 return (vsp);
397 * Creates a kstat structure associated with a vopstats structure.
399 kstat_t *
400 new_vskstat(char *ksname, vopstats_t *vsp)
402 kstat_t *ksp;
404 if (!vopstats_enabled) {
405 return (NULL);
408 ksp = kstat_create("unix", 0, ksname, "misc", KSTAT_TYPE_NAMED,
409 sizeof (vopstats_t)/sizeof (kstat_named_t),
410 KSTAT_FLAG_VIRTUAL|KSTAT_FLAG_WRITABLE);
411 if (ksp) {
412 ksp->ks_data = vsp;
413 kstat_install(ksp);
416 return (ksp);
420 * Called from vfsinit() to initialize the support mechanisms for vopstats
422 void
423 vopstats_startup()
425 if (!vopstats_enabled)
426 return;
429 * Creates the AVL tree which holds per-vfs vopstat anchors. This
430 * is necessary since we need to check if a kstat exists before we
431 * attempt to create it. Also, initialize its lock.
433 avl_create(&vskstat_tree, vska_compar, sizeof (vsk_anchor_t),
434 offsetof(vsk_anchor_t, vsk_node));
435 mutex_init(&vskstat_tree_lock, NULL, MUTEX_DEFAULT, NULL);
437 vsk_anchor_cache = kmem_cache_create("vsk_anchor_cache",
438 sizeof (vsk_anchor_t), sizeof (uintptr_t), NULL, NULL, NULL,
439 NULL, NULL, 0);
442 * Set up the array of pointers for the vopstats-by-FS-type.
443 * The entries will be allocated/initialized as each file system
444 * goes through modload/mod_installfs.
446 vopstats_fstype = (vopstats_t **)kmem_zalloc(
447 (sizeof (vopstats_t *) * nfstype), KM_SLEEP);
449 /* Set up the global vopstats initialization template */
450 vs_templatep = create_vopstats_template();
454 * We need to have the all of the counters zeroed.
455 * The initialization of the vopstats_t includes on the order of
456 * 50 calls to kstat_named_init(). Rather that do that on every call,
457 * we do it once in a template (vs_templatep) then bcopy it over.
459 void
460 initialize_vopstats(vopstats_t *vsp)
462 if (vsp == NULL)
463 return;
465 bcopy(vs_templatep, vsp, sizeof (vopstats_t));
469 * If possible, determine which vopstats by fstype to use and
470 * return a pointer to the caller.
472 vopstats_t *
473 get_fstype_vopstats(vfs_t *vfsp, struct vfssw *vswp)
475 int fstype = 0; /* Index into vfssw[] */
476 vopstats_t *vsp = NULL;
478 if (vfsp == NULL || (vfsp->vfs_flag & VFS_STATS) == 0 ||
479 !vopstats_enabled)
480 return (NULL);
482 * Set up the fstype. We go to so much trouble because all versions
483 * of NFS use the same fstype in their vfs even though they have
484 * distinct entries in the vfssw[] table.
485 * NOTE: A special vfs (e.g., EIO_vfs) may not have an entry.
487 if (vswp) {
488 fstype = vswp - vfssw; /* Gets us the index */
489 } else {
490 fstype = vfsp->vfs_fstype;
494 * Point to the per-fstype vopstats. The only valid values are
495 * non-zero positive values less than the number of vfssw[] table
496 * entries.
498 if (fstype > 0 && fstype < nfstype) {
499 vsp = vopstats_fstype[fstype];
502 return (vsp);
506 * Generate a kstat name, create the kstat structure, and allocate a
507 * vsk_anchor_t to hold it together. Return the pointer to the vsk_anchor_t
508 * to the caller. This must only be called from a mount.
510 vsk_anchor_t *
511 get_vskstat_anchor(vfs_t *vfsp)
513 char kstatstr[KSTAT_STRLEN]; /* kstat name for vopstats */
514 statvfs64_t statvfsbuf; /* Needed to find f_fsid */
515 vsk_anchor_t *vskp = NULL; /* vfs <--> kstat anchor */
516 kstat_t *ksp; /* Ptr to new kstat */
517 avl_index_t where; /* Location in the AVL tree */
519 if (vfsp == NULL || vfsp->vfs_implp == NULL ||
520 (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
521 return (NULL);
523 /* Need to get the fsid to build a kstat name */
524 if (VFS_STATVFS(vfsp, &statvfsbuf) == 0) {
525 /* Create a name for our kstats based on fsid */
526 (void) snprintf(kstatstr, KSTAT_STRLEN, "%s%lx",
527 VOPSTATS_STR, statvfsbuf.f_fsid);
529 /* Allocate and initialize the vsk_anchor_t */
530 vskp = kmem_cache_alloc(vsk_anchor_cache, KM_SLEEP);
531 bzero(vskp, sizeof (*vskp));
532 vskp->vsk_fsid = statvfsbuf.f_fsid;
534 mutex_enter(&vskstat_tree_lock);
535 if (avl_find(&vskstat_tree, vskp, &where) == NULL) {
536 avl_insert(&vskstat_tree, vskp, where);
537 mutex_exit(&vskstat_tree_lock);
540 * Now that we've got the anchor in the AVL
541 * tree, we can create the kstat.
543 ksp = new_vskstat(kstatstr, &vfsp->vfs_vopstats);
544 if (ksp) {
545 vskp->vsk_ksp = ksp;
547 } else {
548 /* Oops, found one! Release memory and lock. */
549 mutex_exit(&vskstat_tree_lock);
550 kmem_cache_free(vsk_anchor_cache, vskp);
551 vskp = NULL;
554 return (vskp);
558 * We're in the process of tearing down the vfs and need to cleanup
559 * the data structures associated with the vopstats. Must only be called
560 * from dounmount().
562 void
563 teardown_vopstats(vfs_t *vfsp)
565 vsk_anchor_t *vskap;
566 avl_index_t where;
568 if (vfsp == NULL || vfsp->vfs_implp == NULL ||
569 (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
570 return;
572 /* This is a safe check since VFS_STATS must be set (see above) */
573 if ((vskap = vfsp->vfs_vskap) == NULL)
574 return;
576 /* Whack the pointer right away */
577 vfsp->vfs_vskap = NULL;
579 /* Lock the tree, remove the node, and delete the kstat */
580 mutex_enter(&vskstat_tree_lock);
581 if (avl_find(&vskstat_tree, vskap, &where)) {
582 avl_remove(&vskstat_tree, vskap);
585 if (vskap->vsk_ksp) {
586 kstat_delete(vskap->vsk_ksp);
588 mutex_exit(&vskstat_tree_lock);
590 kmem_cache_free(vsk_anchor_cache, vskap);
594 * Read or write a vnode. Called from kernel code.
597 vn_rdwr(
598 enum uio_rw rw,
599 struct vnode *vp,
600 caddr_t base,
601 ssize_t len,
602 offset_t offset,
603 enum uio_seg seg,
604 int ioflag,
605 rlim_t ulimit, /* meaningful only if rw is UIO_WRITE */
606 cred_t *cr,
607 ssize_t *residp)
609 struct uio uio;
610 struct iovec iov;
611 int error;
612 int in_crit = 0;
614 if (rw == UIO_WRITE && ISROFILE(vp))
615 return (EROFS);
617 if (len < 0)
618 return (EIO);
620 VOPXID_MAP_CR(vp, cr);
622 iov.iov_base = base;
623 iov.iov_len = len;
624 uio.uio_iov = &iov;
625 uio.uio_iovcnt = 1;
626 uio.uio_loffset = offset;
627 uio.uio_segflg = (short)seg;
628 uio.uio_resid = len;
629 uio.uio_llimit = ulimit;
632 * We have to enter the critical region before calling fop_rwlock
633 * to avoid a deadlock with ufs.
635 if (nbl_need_check(vp)) {
636 int svmand;
638 nbl_start_crit(vp, RW_READER);
639 in_crit = 1;
640 error = nbl_svmand(vp, cr, &svmand);
641 if (error != 0)
642 goto done;
643 if (nbl_conflict(vp, rw == UIO_WRITE ? NBL_WRITE : NBL_READ,
644 uio.uio_offset, uio.uio_resid, svmand, NULL)) {
645 error = EACCES;
646 goto done;
650 (void) fop_rwlock(vp,
651 rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL);
652 if (rw == UIO_WRITE) {
653 uio.uio_fmode = FWRITE;
654 uio.uio_extflg = UIO_COPY_DEFAULT;
655 error = fop_write(vp, &uio, ioflag, cr, NULL);
656 } else {
657 uio.uio_fmode = FREAD;
658 uio.uio_extflg = UIO_COPY_CACHED;
659 error = fop_read(vp, &uio, ioflag, cr, NULL);
661 fop_rwunlock(vp,
662 rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL);
663 if (residp)
664 *residp = uio.uio_resid;
665 else if (uio.uio_resid)
666 error = EIO;
668 done:
669 if (in_crit)
670 nbl_end_crit(vp);
671 return (error);
675 * Release a vnode. Call fop_inactive on last reference or
676 * decrement reference count.
678 * To avoid race conditions, the v_count is left at 1 for
679 * the call to fop_inactive. This prevents another thread
680 * from reclaiming and releasing the vnode *before* the
681 * fop_inactive routine has a chance to destroy the vnode.
682 * We can't have more than 1 thread calling fop_inactive
683 * on a vnode.
685 void
686 vn_rele(vnode_t *vp)
688 VERIFY(vp->v_count > 0);
689 mutex_enter(&vp->v_lock);
690 if (vp->v_count == 1) {
691 mutex_exit(&vp->v_lock);
692 fop_inactive(vp, CRED(), NULL);
693 return;
695 VN_RELE_LOCKED(vp);
696 mutex_exit(&vp->v_lock);
700 * Release a vnode referenced by the DNLC. Multiple DNLC references are treated
701 * as a single reference, so v_count is not decremented until the last DNLC hold
702 * is released. This makes it possible to distinguish vnodes that are referenced
703 * only by the DNLC.
705 void
706 vn_rele_dnlc(vnode_t *vp)
708 VERIFY((vp->v_count > 0) && (vp->v_count_dnlc > 0));
709 mutex_enter(&vp->v_lock);
710 if (--vp->v_count_dnlc == 0) {
711 if (vp->v_count == 1) {
712 mutex_exit(&vp->v_lock);
713 fop_inactive(vp, CRED(), NULL);
714 return;
716 VN_RELE_LOCKED(vp);
718 mutex_exit(&vp->v_lock);
722 * Like vn_rele() except that it clears v_stream under v_lock.
723 * This is used by sockfs when it dismantles the association between
724 * the sockfs node and the vnode in the underlying file system.
725 * v_lock has to be held to prevent a thread coming through the lookupname
726 * path from accessing a stream head that is going away.
728 void
729 vn_rele_stream(vnode_t *vp)
731 VERIFY(vp->v_count > 0);
732 mutex_enter(&vp->v_lock);
733 vp->v_stream = NULL;
734 if (vp->v_count == 1) {
735 mutex_exit(&vp->v_lock);
736 fop_inactive(vp, CRED(), NULL);
737 return;
739 VN_RELE_LOCKED(vp);
740 mutex_exit(&vp->v_lock);
743 static void
744 vn_rele_inactive(vnode_t *vp)
746 fop_inactive(vp, CRED(), NULL);
750 * Like vn_rele() except if we are going to call fop_inactive() then do it
751 * asynchronously using a taskq. This can avoid deadlocks caused by re-entering
752 * the file system as a result of releasing the vnode. Note, file systems
753 * already have to handle the race where the vnode is incremented before the
754 * inactive routine is called and does its locking.
756 * Warning: Excessive use of this routine can lead to performance problems.
757 * This is because taskqs throttle back allocation if too many are created.
759 void
760 vn_rele_async(vnode_t *vp, taskq_t *taskq)
762 VERIFY(vp->v_count > 0);
763 mutex_enter(&vp->v_lock);
764 if (vp->v_count == 1) {
765 mutex_exit(&vp->v_lock);
766 VERIFY(taskq_dispatch(taskq, (task_func_t *)vn_rele_inactive,
767 vp, TQ_SLEEP) != (uintptr_t)NULL);
768 return;
770 VN_RELE_LOCKED(vp);
771 mutex_exit(&vp->v_lock);
775 vn_open(
776 char *pnamep,
777 enum uio_seg seg,
778 int filemode,
779 int createmode,
780 struct vnode **vpp,
781 enum create crwhy,
782 mode_t umask)
784 return (vn_openat(pnamep, seg, filemode, createmode, vpp, crwhy,
785 umask, NULL, -1));
790 * Open/create a vnode.
791 * This may be callable by the kernel, the only known use
792 * of user context being that the current user credentials
793 * are used for permissions. crwhy is defined iff filemode & FCREAT.
796 vn_openat(
797 char *pnamep,
798 enum uio_seg seg,
799 int filemode,
800 int createmode,
801 struct vnode **vpp,
802 enum create crwhy,
803 mode_t umask,
804 struct vnode *startvp,
805 int fd)
807 struct vnode *vp;
808 int mode;
809 int accessflags;
810 int error;
811 int in_crit = 0;
812 int open_done = 0;
813 int shrlock_done = 0;
814 struct vattr vattr;
815 enum symfollow follow;
816 int estale_retry = 0;
817 struct shrlock shr;
818 struct shr_locowner shr_own;
820 if (filemode & FSEARCH)
821 filemode |= FDIRECTORY;
823 mode = 0;
824 accessflags = 0;
825 if (filemode & FREAD)
826 mode |= VREAD;
827 if (filemode & (FWRITE|FTRUNC))
828 mode |= VWRITE;
829 if (filemode & (FSEARCH|FEXEC|FXATTRDIROPEN))
830 mode |= VEXEC;
832 /* symlink interpretation */
833 if (filemode & FNOFOLLOW)
834 follow = NO_FOLLOW;
835 else
836 follow = FOLLOW;
838 if (filemode & FAPPEND)
839 accessflags |= V_APPEND;
841 top:
842 if (filemode & FCREAT && !(filemode & FDIRECTORY)) {
843 enum vcexcl excl;
845 /* Wish to create a file. */
846 vattr.va_type = VREG;
847 vattr.va_mode = createmode;
848 vattr.va_mask = VATTR_TYPE|VATTR_MODE;
849 if (filemode & FTRUNC) {
850 vattr.va_size = 0;
851 vattr.va_mask |= VATTR_SIZE;
853 if (filemode & FEXCL)
854 excl = EXCL;
855 else
856 excl = NONEXCL;
858 if (error =
859 vn_createat(pnamep, seg, &vattr, excl, mode, &vp, crwhy,
860 (filemode & ~(FTRUNC|FEXCL)), umask, startvp))
861 return (error);
862 } else {
863 /* Wish to open a file. Just look it up. */
864 if (error = lookupnameat(pnamep, seg, follow,
865 NULLVPP, &vp, startvp)) {
866 if ((error == ESTALE) &&
867 fs_need_estale_retry(estale_retry++))
868 goto top;
869 return (error);
873 * Can't write directories, active texts, or
874 * read-only filesystems. Can't truncate files
875 * on which mandatory locking is in effect.
877 if (filemode & (FWRITE|FTRUNC)) {
879 * Allow writable directory if VDIROPEN flag is set.
881 if (vp->v_type == VDIR && !(vp->v_flag & VDIROPEN)) {
882 error = EISDIR;
883 goto out;
885 if (ISROFILE(vp)) {
886 error = EROFS;
887 goto out;
890 * Can't truncate files on which
891 * sysv mandatory locking is in effect.
893 if (filemode & FTRUNC) {
894 vnode_t *rvp;
896 if (fop_realvp(vp, &rvp, NULL) != 0)
897 rvp = vp;
898 if (rvp->v_filocks != NULL) {
899 vattr.va_mask = VATTR_MODE;
900 if ((error = fop_getattr(vp,
901 &vattr, 0, CRED(), NULL)) == 0 &&
902 MANDLOCK(vp, vattr.va_mode))
903 error = EAGAIN;
906 if (error)
907 goto out;
910 * Check permissions.
912 if (error = fop_access(vp, mode, accessflags, CRED(), NULL))
913 goto out;
915 * Require FDIRECTORY to return a directory.
916 * Require FEXEC to return a regular file.
918 if ((filemode & FDIRECTORY) && vp->v_type != VDIR) {
919 error = ENOTDIR;
920 goto out;
922 if ((filemode & FEXEC) && vp->v_type != VREG) {
923 error = ENOEXEC; /* XXX: error code? */
924 goto out;
929 * Do remaining checks for FNOFOLLOW and FNOLINKS.
931 if ((filemode & FNOFOLLOW) && vp->v_type == VLNK) {
932 error = ELOOP;
933 goto out;
935 if (filemode & FNOLINKS) {
936 vattr.va_mask = VATTR_NLINK;
937 if ((error = fop_getattr(vp, &vattr, 0, CRED(), NULL))) {
938 goto out;
940 if (vattr.va_nlink != 1) {
941 error = EMLINK;
942 goto out;
947 * Opening a socket corresponding to the AF_UNIX pathname
948 * in the filesystem name space is not supported.
949 * However, VSOCK nodes in namefs are supported in order
950 * to make fattach work for sockets.
952 * XXX This uses fop_realvp to distinguish between
953 * an unopened namefs node (where fop_realvp returns a
954 * different VSOCK vnode) and a VSOCK created by vn_create
955 * in some file system (where fop_realvp would never return
956 * a different vnode).
958 if (vp->v_type == VSOCK) {
959 struct vnode *nvp;
961 error = fop_realvp(vp, &nvp, NULL);
962 if (error != 0 || nvp == NULL || nvp == vp ||
963 nvp->v_type != VSOCK) {
964 error = EOPNOTSUPP;
965 goto out;
969 if ((vp->v_type == VREG) && nbl_need_check(vp)) {
970 /* get share reservation */
971 shr.s_access = 0;
972 if (filemode & FWRITE)
973 shr.s_access |= F_WRACC;
974 if (filemode & FREAD)
975 shr.s_access |= F_RDACC;
976 shr.s_deny = 0;
977 shr.s_sysid = 0;
978 shr.s_pid = ttoproc(curthread)->p_pid;
979 shr_own.sl_pid = shr.s_pid;
980 shr_own.sl_id = fd;
981 shr.s_own_len = sizeof (shr_own);
982 shr.s_owner = (caddr_t)&shr_own;
983 error = fop_shrlock(vp, F_SHARE_NBMAND, &shr, filemode, CRED(),
984 NULL);
985 if (error)
986 goto out;
987 shrlock_done = 1;
989 /* nbmand conflict check if truncating file */
990 if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
991 nbl_start_crit(vp, RW_READER);
992 in_crit = 1;
994 vattr.va_mask = VATTR_SIZE;
995 if (error = fop_getattr(vp, &vattr, 0, CRED(), NULL))
996 goto out;
997 if (nbl_conflict(vp, NBL_WRITE, 0, vattr.va_size, 0,
998 NULL)) {
999 error = EACCES;
1000 goto out;
1006 * Do opening protocol.
1008 error = fop_open(&vp, filemode, CRED(), NULL);
1009 if (error)
1010 goto out;
1011 open_done = 1;
1014 * Truncate if required.
1016 if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
1017 vattr.va_size = 0;
1018 vattr.va_mask = VATTR_SIZE;
1019 if ((error = fop_setattr(vp, &vattr, 0, CRED(), NULL)) != 0)
1020 goto out;
1022 out:
1023 ASSERT(vp->v_count > 0);
1025 if (in_crit) {
1026 nbl_end_crit(vp);
1027 in_crit = 0;
1029 if (error) {
1030 if (open_done) {
1031 (void) fop_close(vp, filemode, 1, 0, CRED(),
1032 NULL);
1033 open_done = 0;
1034 shrlock_done = 0;
1036 if (shrlock_done) {
1037 (void) fop_shrlock(vp, F_UNSHARE, &shr, 0, CRED(),
1038 NULL);
1039 shrlock_done = 0;
1043 * The following clause was added to handle a problem
1044 * with NFS consistency. It is possible that a lookup
1045 * of the file to be opened succeeded, but the file
1046 * itself doesn't actually exist on the server. This
1047 * is chiefly due to the DNLC containing an entry for
1048 * the file which has been removed on the server. In
1049 * this case, we just start over. If there was some
1050 * other cause for the ESTALE error, then the lookup
1051 * of the file will fail and the error will be returned
1052 * above instead of looping around from here.
1054 VN_RELE(vp);
1055 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1056 goto top;
1057 } else
1058 *vpp = vp;
1059 return (error);
1063 * The following two accessor functions are for the NFSv4 server. Since there
1064 * is no fop_open_UP/DOWNGRADE we need a way for the NFS server to keep the
1065 * vnode open counts correct when a client "upgrades" an open or does an
1066 * open_downgrade. In NFS, an upgrade or downgrade can not only change the
1067 * open mode (add or subtract read or write), but also change the share/deny
1068 * modes. However, share reservations are not integrated with OPEN, yet, so
1069 * we need to handle each separately. These functions are cleaner than having
1070 * the NFS server manipulate the counts directly, however, nobody else should
1071 * use these functions.
1073 void
1074 vn_open_upgrade(
1075 vnode_t *vp,
1076 int filemode)
1078 ASSERT(vp->v_type == VREG);
1080 if (filemode & FREAD)
1081 atomic_inc_32(&vp->v_rdcnt);
1082 if (filemode & FWRITE)
1083 atomic_inc_32(&vp->v_wrcnt);
1087 void
1088 vn_open_downgrade(
1089 vnode_t *vp,
1090 int filemode)
1092 ASSERT(vp->v_type == VREG);
1094 if (filemode & FREAD) {
1095 ASSERT(vp->v_rdcnt > 0);
1096 atomic_dec_32(&vp->v_rdcnt);
1098 if (filemode & FWRITE) {
1099 ASSERT(vp->v_wrcnt > 0);
1100 atomic_dec_32(&vp->v_wrcnt);
1106 vn_create(
1107 char *pnamep,
1108 enum uio_seg seg,
1109 struct vattr *vap,
1110 enum vcexcl excl,
1111 int mode,
1112 struct vnode **vpp,
1113 enum create why,
1114 int flag,
1115 mode_t umask)
1117 return (vn_createat(pnamep, seg, vap, excl, mode, vpp, why, flag,
1118 umask, NULL));
1122 * Create a vnode (makenode).
1125 vn_createat(
1126 char *pnamep,
1127 enum uio_seg seg,
1128 struct vattr *vap,
1129 enum vcexcl excl,
1130 int mode,
1131 struct vnode **vpp,
1132 enum create why,
1133 int flag,
1134 mode_t umask,
1135 struct vnode *startvp)
1137 struct vnode *dvp; /* ptr to parent dir vnode */
1138 struct vnode *vp = NULL;
1139 struct pathname pn;
1140 int error;
1141 int in_crit = 0;
1142 struct vattr vattr;
1143 enum symfollow follow;
1144 int estale_retry = 0;
1146 ASSERT((vap->va_mask & (VATTR_TYPE|VATTR_MODE)) == (VATTR_TYPE|VATTR_MODE));
1148 /* symlink interpretation */
1149 if ((flag & FNOFOLLOW) || excl == EXCL)
1150 follow = NO_FOLLOW;
1151 else
1152 follow = FOLLOW;
1153 flag &= ~(FNOFOLLOW|FNOLINKS);
1155 top:
1157 * Lookup directory.
1158 * If new object is a file, call lower level to create it.
1159 * Note that it is up to the lower level to enforce exclusive
1160 * creation, if the file is already there.
1161 * This allows the lower level to do whatever
1162 * locking or protocol that is needed to prevent races.
1163 * If the new object is directory call lower level to make
1164 * the new directory, with "." and "..".
1166 if (error = pn_get(pnamep, seg, &pn))
1167 return (error);
1168 dvp = NULL;
1169 *vpp = NULL;
1171 * lookup will find the parent directory for the vnode.
1172 * When it is done the pn holds the name of the entry
1173 * in the directory.
1174 * If this is a non-exclusive create we also find the node itself.
1176 error = lookuppnat(&pn, NULL, follow, &dvp,
1177 (excl == EXCL) ? NULLVPP : vpp, startvp);
1178 if (error) {
1179 pn_free(&pn);
1180 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1181 goto top;
1182 if (why == CRMKDIR && error == EINVAL)
1183 error = EEXIST; /* SVID */
1184 return (error);
1187 if (why != CRMKNOD)
1188 vap->va_mode &= ~VSVTX;
1191 * If default ACLs are defined for the directory don't apply the
1192 * umask if umask is passed.
1195 if (umask) {
1197 vsecattr_t vsec;
1199 vsec.vsa_aclcnt = 0;
1200 vsec.vsa_aclentp = NULL;
1201 vsec.vsa_dfaclcnt = 0;
1202 vsec.vsa_dfaclentp = NULL;
1203 vsec.vsa_mask = VSA_DFACLCNT;
1204 error = fop_getsecattr(dvp, &vsec, 0, CRED(), NULL);
1206 * If error is ENOSYS then treat it as no error
1207 * Don't want to force all file systems to support
1208 * aclent_t style of ACL's.
1210 if (error == ENOSYS)
1211 error = 0;
1212 if (error) {
1213 if (*vpp != NULL)
1214 VN_RELE(*vpp);
1215 goto out;
1216 } else {
1218 * Apply the umask if no default ACLs.
1220 if (vsec.vsa_dfaclcnt == 0)
1221 vap->va_mode &= ~umask;
1224 * fop_getsecattr() may have allocated memory for
1225 * ACLs we didn't request, so double-check and
1226 * free it if necessary.
1228 if (vsec.vsa_aclcnt && vsec.vsa_aclentp != NULL)
1229 kmem_free((caddr_t)vsec.vsa_aclentp,
1230 vsec.vsa_aclcnt * sizeof (aclent_t));
1231 if (vsec.vsa_dfaclcnt && vsec.vsa_dfaclentp != NULL)
1232 kmem_free((caddr_t)vsec.vsa_dfaclentp,
1233 vsec.vsa_dfaclcnt * sizeof (aclent_t));
1238 * In general we want to generate EROFS if the file system is
1239 * readonly. However, POSIX (IEEE Std. 1003.1) section 5.3.1
1240 * documents the open system call, and it says that O_CREAT has no
1241 * effect if the file already exists. Bug 1119649 states
1242 * that open(path, O_CREAT, ...) fails when attempting to open an
1243 * existing file on a read only file system. Thus, the first part
1244 * of the following if statement has 3 checks:
1245 * if the file exists &&
1246 * it is being open with write access &&
1247 * the file system is read only
1248 * then generate EROFS
1250 if ((*vpp != NULL && (mode & VWRITE) && ISROFILE(*vpp)) ||
1251 (*vpp == NULL && dvp->v_vfsp->vfs_flag & VFS_RDONLY)) {
1252 if (*vpp)
1253 VN_RELE(*vpp);
1254 error = EROFS;
1255 } else if (excl == NONEXCL && *vpp != NULL) {
1256 vnode_t *rvp;
1259 * File already exists. If a mandatory lock has been
1260 * applied, return error.
1262 vp = *vpp;
1263 if (fop_realvp(vp, &rvp, NULL) != 0)
1264 rvp = vp;
1265 if ((vap->va_mask & VATTR_SIZE) && nbl_need_check(vp)) {
1266 nbl_start_crit(vp, RW_READER);
1267 in_crit = 1;
1269 if (rvp->v_filocks != NULL || rvp->v_shrlocks != NULL) {
1270 vattr.va_mask = VATTR_MODE|VATTR_SIZE;
1271 if (error = fop_getattr(vp, &vattr, 0, CRED(), NULL)) {
1272 goto out;
1274 if (MANDLOCK(vp, vattr.va_mode)) {
1275 error = EAGAIN;
1276 goto out;
1279 * File cannot be truncated if non-blocking mandatory
1280 * locks are currently on the file.
1282 if ((vap->va_mask & VATTR_SIZE) && in_crit) {
1283 uoff_t offset;
1284 ssize_t length;
1286 offset = vap->va_size > vattr.va_size ?
1287 vattr.va_size : vap->va_size;
1288 length = vap->va_size > vattr.va_size ?
1289 vap->va_size - vattr.va_size :
1290 vattr.va_size - vap->va_size;
1291 if (nbl_conflict(vp, NBL_WRITE, offset,
1292 length, 0, NULL)) {
1293 error = EACCES;
1294 goto out;
1300 * If the file is the root of a VFS, we've crossed a
1301 * mount point and the "containing" directory that we
1302 * acquired above (dvp) is irrelevant because it's in
1303 * a different file system. We apply fop_create to the
1304 * target itself instead of to the containing directory
1305 * and supply a null path name to indicate (conventionally)
1306 * the node itself as the "component" of interest.
1308 * The call to fop_create() is necessary to ensure
1309 * that the appropriate permission checks are made,
1310 * i.e. EISDIR, EACCES, etc. We already know that vpp
1311 * exists since we are in the else condition where this
1312 * was checked.
1314 if (vp->v_flag & VROOT) {
1315 ASSERT(why != CRMKDIR);
1316 error = fop_create(vp, "", vap, excl, mode, vpp,
1317 CRED(), flag, NULL, NULL);
1319 * If the create succeeded, it will have created a
1320 * new reference on a new vnode (*vpp) in the child
1321 * file system, so we want to drop our reference on
1322 * the old (vp) upon exit.
1324 goto out;
1328 if (error == 0) {
1330 * Call mkdir() if specified, otherwise create().
1332 int must_be_dir = pn_fixslash(&pn); /* trailing '/'? */
1334 if (why == CRMKDIR)
1336 * N.B., if vn_createat() ever requests
1337 * case-insensitive behavior then it will need
1338 * to be passed to fop_mkdir(). fop_create()
1339 * will already get it via "flag"
1341 error = fop_mkdir(dvp, pn.pn_path, vap, vpp, CRED(),
1342 NULL, 0, NULL);
1343 else if (!must_be_dir)
1344 error = fop_create(dvp, pn.pn_path, vap,
1345 excl, mode, vpp, CRED(), flag, NULL, NULL);
1346 else
1347 error = ENOTDIR;
1350 out:
1352 if (in_crit) {
1353 nbl_end_crit(vp);
1354 in_crit = 0;
1356 if (vp != NULL) {
1357 VN_RELE(vp);
1358 vp = NULL;
1360 pn_free(&pn);
1361 VN_RELE(dvp);
1363 * The following clause was added to handle a problem
1364 * with NFS consistency. It is possible that a lookup
1365 * of the file to be created succeeded, but the file
1366 * itself doesn't actually exist on the server. This
1367 * is chiefly due to the DNLC containing an entry for
1368 * the file which has been removed on the server. In
1369 * this case, we just start over. If there was some
1370 * other cause for the ESTALE error, then the lookup
1371 * of the file will fail and the error will be returned
1372 * above instead of looping around from here.
1374 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1375 goto top;
1376 return (error);
1380 vn_link(char *from, char *to, enum uio_seg seg)
1382 return (vn_linkat(NULL, from, NO_FOLLOW, NULL, to, seg));
1386 vn_linkat(vnode_t *fstartvp, char *from, enum symfollow follow,
1387 vnode_t *tstartvp, char *to, enum uio_seg seg)
1389 struct vnode *fvp; /* from vnode ptr */
1390 struct vnode *tdvp; /* to directory vnode ptr */
1391 struct pathname pn;
1392 int error;
1393 struct vattr vattr;
1394 dev_t fsid;
1395 int estale_retry = 0;
1397 top:
1398 fvp = tdvp = NULL;
1399 if (error = pn_get(to, seg, &pn))
1400 return (error);
1401 if (error = lookupnameat(from, seg, follow, NULLVPP, &fvp, fstartvp))
1402 goto out;
1403 if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &tdvp, NULLVPP, tstartvp))
1404 goto out;
1406 * Make sure both source vnode and target directory vnode are
1407 * in the same vfs and that it is writeable.
1409 vattr.va_mask = VATTR_FSID;
1410 if (error = fop_getattr(fvp, &vattr, 0, CRED(), NULL))
1411 goto out;
1412 fsid = vattr.va_fsid;
1413 vattr.va_mask = VATTR_FSID;
1414 if (error = fop_getattr(tdvp, &vattr, 0, CRED(), NULL))
1415 goto out;
1416 if (fsid != vattr.va_fsid) {
1417 error = EXDEV;
1418 goto out;
1420 if (tdvp->v_vfsp->vfs_flag & VFS_RDONLY) {
1421 error = EROFS;
1422 goto out;
1425 * Do the link.
1427 (void) pn_fixslash(&pn);
1428 error = fop_link(tdvp, fvp, pn.pn_path, CRED(), NULL, 0);
1429 out:
1430 pn_free(&pn);
1431 if (fvp)
1432 VN_RELE(fvp);
1433 if (tdvp)
1434 VN_RELE(tdvp);
1435 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1436 goto top;
1437 return (error);
1441 vn_rename(char *from, char *to, enum uio_seg seg)
1443 return (vn_renameat(NULL, from, NULL, to, seg));
1447 vn_renameat(vnode_t *fdvp, char *fname, vnode_t *tdvp,
1448 char *tname, enum uio_seg seg)
1450 int error;
1451 struct vattr vattr;
1452 struct pathname fpn; /* from pathname */
1453 struct pathname tpn; /* to pathname */
1454 dev_t fsid;
1455 int in_crit_src, in_crit_targ;
1456 vnode_t *fromvp, *fvp;
1457 vnode_t *tovp, *targvp;
1458 int estale_retry = 0;
1460 top:
1461 fvp = fromvp = tovp = targvp = NULL;
1462 in_crit_src = in_crit_targ = 0;
1464 * Get to and from pathnames.
1466 if (error = pn_get(fname, seg, &fpn))
1467 return (error);
1468 if (error = pn_get(tname, seg, &tpn)) {
1469 pn_free(&fpn);
1470 return (error);
1474 * First we need to resolve the correct directories
1475 * The passed in directories may only be a starting point,
1476 * but we need the real directories the file(s) live in.
1477 * For example the fname may be something like usr/lib/sparc
1478 * and we were passed in the / directory, but we need to
1479 * use the lib directory for the rename.
1483 * Lookup to and from directories.
1485 if (error = lookuppnat(&fpn, NULL, NO_FOLLOW, &fromvp, &fvp, fdvp)) {
1486 goto out;
1490 * Make sure there is an entry.
1492 if (fvp == NULL) {
1493 error = ENOENT;
1494 goto out;
1497 if (error = lookuppnat(&tpn, NULL, NO_FOLLOW, &tovp, &targvp, tdvp)) {
1498 goto out;
1502 * Make sure both the from vnode directory and the to directory
1503 * are in the same vfs and the to directory is writable.
1504 * We check fsid's, not vfs pointers, so loopback fs works.
1506 if (fromvp != tovp) {
1507 vattr.va_mask = VATTR_FSID;
1508 if (error = fop_getattr(fromvp, &vattr, 0, CRED(), NULL))
1509 goto out;
1510 fsid = vattr.va_fsid;
1511 vattr.va_mask = VATTR_FSID;
1512 if (error = fop_getattr(tovp, &vattr, 0, CRED(), NULL))
1513 goto out;
1514 if (fsid != vattr.va_fsid) {
1515 error = EXDEV;
1516 goto out;
1520 if (tovp->v_vfsp->vfs_flag & VFS_RDONLY) {
1521 error = EROFS;
1522 goto out;
1526 * Make sure "from" vp is not a mount point.
1527 * Note, lookup did traverse() already, so
1528 * we'll be looking at the mounted FS root.
1529 * (but allow files like mnttab)
1531 if ((fvp->v_flag & VROOT) != 0 && fvp->v_type == VDIR) {
1532 error = EBUSY;
1533 goto out;
1536 if (targvp && (fvp != targvp)) {
1537 nbl_start_crit(targvp, RW_READER);
1538 in_crit_targ = 1;
1539 if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
1540 error = EACCES;
1541 goto out;
1545 if (nbl_need_check(fvp)) {
1546 nbl_start_crit(fvp, RW_READER);
1547 in_crit_src = 1;
1548 if (nbl_conflict(fvp, NBL_RENAME, 0, 0, 0, NULL)) {
1549 error = EACCES;
1550 goto out;
1555 * Do the rename.
1557 (void) pn_fixslash(&tpn);
1558 error = fop_rename(fromvp, fpn.pn_path, tovp, tpn.pn_path, CRED(),
1559 NULL, 0);
1561 out:
1562 pn_free(&fpn);
1563 pn_free(&tpn);
1564 if (in_crit_src)
1565 nbl_end_crit(fvp);
1566 if (in_crit_targ)
1567 nbl_end_crit(targvp);
1568 if (fromvp)
1569 VN_RELE(fromvp);
1570 if (tovp)
1571 VN_RELE(tovp);
1572 if (targvp)
1573 VN_RELE(targvp);
1574 if (fvp)
1575 VN_RELE(fvp);
1576 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1577 goto top;
1578 return (error);
1582 * Remove a file or directory.
1585 vn_remove(char *fnamep, enum uio_seg seg, enum rm dirflag)
1587 return (vn_removeat(NULL, fnamep, seg, dirflag));
1591 vn_removeat(vnode_t *startvp, char *fnamep, enum uio_seg seg, enum rm dirflag)
1593 struct vnode *vp; /* entry vnode */
1594 struct vnode *dvp; /* ptr to parent dir vnode */
1595 struct vnode *coveredvp;
1596 struct pathname pn; /* name of entry */
1597 enum vtype vtype;
1598 int error;
1599 struct vfs *vfsp;
1600 struct vfs *dvfsp; /* ptr to parent dir vfs */
1601 int in_crit = 0;
1602 int estale_retry = 0;
1604 top:
1605 if (error = pn_get(fnamep, seg, &pn))
1606 return (error);
1607 dvp = vp = NULL;
1608 if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &dvp, &vp, startvp)) {
1609 pn_free(&pn);
1610 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1611 goto top;
1612 return (error);
1616 * Make sure there is an entry.
1618 if (vp == NULL) {
1619 error = ENOENT;
1620 goto out;
1623 vfsp = vp->v_vfsp;
1624 dvfsp = dvp->v_vfsp;
1627 * If the named file is the root of a mounted filesystem, fail,
1628 * unless it's marked unlinkable. In that case, unmount the
1629 * filesystem and proceed to unlink the covered vnode. (If the
1630 * covered vnode is a directory, use rmdir instead of unlink,
1631 * to avoid file system corruption.)
1633 if (vp->v_flag & VROOT) {
1634 if ((vfsp->vfs_flag & VFS_UNLINKABLE) == 0) {
1635 error = EBUSY;
1636 goto out;
1640 * Namefs specific code starts here.
1643 if (dirflag == RMDIRECTORY) {
1645 * User called rmdir(2) on a file that has
1646 * been namefs mounted on top of. Since
1647 * namefs doesn't allow directories to
1648 * be mounted on other files we know
1649 * vp is not of type VDIR so fail to operation.
1651 error = ENOTDIR;
1652 goto out;
1656 * If VROOT is still set after grabbing vp->v_lock,
1657 * noone has finished nm_unmount so far and coveredvp
1658 * is valid.
1659 * If we manage to grab vn_vfswlock(coveredvp) before releasing
1660 * vp->v_lock, any race window is eliminated.
1663 mutex_enter(&vp->v_lock);
1664 if ((vp->v_flag & VROOT) == 0) {
1665 /* Someone beat us to the unmount */
1666 mutex_exit(&vp->v_lock);
1667 error = EBUSY;
1668 goto out;
1670 vfsp = vp->v_vfsp;
1671 coveredvp = vfsp->vfs_vnodecovered;
1672 ASSERT(coveredvp);
1674 * Note: Implementation of vn_vfswlock shows that ordering of
1675 * v_lock / vn_vfswlock is not an issue here.
1677 error = vn_vfswlock(coveredvp);
1678 mutex_exit(&vp->v_lock);
1680 if (error)
1681 goto out;
1683 VN_HOLD(coveredvp);
1684 VN_RELE(vp);
1685 error = dounmount(vfsp, 0, CRED());
1688 * Unmounted the namefs file system; now get
1689 * the object it was mounted over.
1691 vp = coveredvp;
1693 * If namefs was mounted over a directory, then
1694 * we want to use rmdir() instead of unlink().
1696 if (vp->v_type == VDIR)
1697 dirflag = RMDIRECTORY;
1699 if (error)
1700 goto out;
1704 * Make sure filesystem is writeable.
1705 * We check the parent directory's vfs in case this is an lofs vnode.
1707 if (dvfsp && dvfsp->vfs_flag & VFS_RDONLY) {
1708 error = EROFS;
1709 goto out;
1712 vtype = vp->v_type;
1715 * If there is the possibility of an nbmand share reservation, make
1716 * sure it's okay to remove the file. Keep a reference to the
1717 * vnode, so that we can exit the nbl critical region after
1718 * calling fop_remove.
1719 * If there is no possibility of an nbmand share reservation,
1720 * release the vnode reference now. Filesystems like NFS may
1721 * behave differently if there is an extra reference, so get rid of
1722 * this one. Fortunately, we can't have nbmand mounts on NFS
1723 * filesystems.
1725 if (nbl_need_check(vp)) {
1726 nbl_start_crit(vp, RW_READER);
1727 in_crit = 1;
1728 if (nbl_conflict(vp, NBL_REMOVE, 0, 0, 0, NULL)) {
1729 error = EACCES;
1730 goto out;
1732 } else {
1733 VN_RELE(vp);
1734 vp = NULL;
1737 if (dirflag == RMDIRECTORY) {
1739 * Caller is using rmdir(2), which can only be applied to
1740 * directories.
1742 if (vtype != VDIR) {
1743 error = ENOTDIR;
1744 } else {
1745 vnode_t *cwd;
1746 proc_t *pp = curproc;
1748 mutex_enter(&pp->p_lock);
1749 cwd = PTOU(pp)->u_cdir;
1750 VN_HOLD(cwd);
1751 mutex_exit(&pp->p_lock);
1752 error = fop_rmdir(dvp, pn.pn_path, cwd, CRED(),
1753 NULL, 0);
1754 VN_RELE(cwd);
1756 } else {
1758 * Unlink(2) can be applied to anything.
1760 error = fop_remove(dvp, pn.pn_path, CRED(), NULL, 0);
1763 out:
1764 pn_free(&pn);
1765 if (in_crit) {
1766 nbl_end_crit(vp);
1767 in_crit = 0;
1769 if (vp != NULL)
1770 VN_RELE(vp);
1771 if (dvp != NULL)
1772 VN_RELE(dvp);
1773 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1774 goto top;
1775 return (error);
1779 * Utility function to compare equality of vnodes.
1780 * Compare the underlying real vnodes, if there are underlying vnodes.
1781 * This is a more thorough comparison than the VN_CMP() macro provides.
1784 vn_compare(vnode_t *vp1, vnode_t *vp2)
1786 vnode_t *realvp;
1788 if (vp1 != NULL && fop_realvp(vp1, &realvp, NULL) == 0)
1789 vp1 = realvp;
1790 if (vp2 != NULL && fop_realvp(vp2, &realvp, NULL) == 0)
1791 vp2 = realvp;
1792 return (VN_CMP(vp1, vp2));
1796 * The number of locks to hash into. This value must be a power
1797 * of 2 minus 1 and should probably also be prime.
1799 #define NUM_BUCKETS 1023
1801 struct vn_vfslocks_bucket {
1802 kmutex_t vb_lock;
1803 vn_vfslocks_entry_t *vb_list;
1804 char pad[64 - sizeof (kmutex_t) - sizeof (void *)];
1808 * Total number of buckets will be NUM_BUCKETS + 1 .
1811 #pragma align 64(vn_vfslocks_buckets)
1812 static struct vn_vfslocks_bucket vn_vfslocks_buckets[NUM_BUCKETS + 1];
1814 #define VN_VFSLOCKS_SHIFT 9
1816 #define VN_VFSLOCKS_HASH(vfsvpptr) \
1817 ((((intptr_t)(vfsvpptr)) >> VN_VFSLOCKS_SHIFT) & NUM_BUCKETS)
1820 * vn_vfslocks_getlock() uses an HASH scheme to generate
1821 * rwstlock using vfs/vnode pointer passed to it.
1823 * vn_vfslocks_rele() releases a reference in the
1824 * HASH table which allows the entry allocated by
1825 * vn_vfslocks_getlock() to be freed at a later
1826 * stage when the refcount drops to zero.
1829 vn_vfslocks_entry_t *
1830 vn_vfslocks_getlock(void *vfsvpptr)
1832 struct vn_vfslocks_bucket *bp;
1833 vn_vfslocks_entry_t *vep;
1834 vn_vfslocks_entry_t *tvep;
1836 ASSERT(vfsvpptr != NULL);
1837 bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vfsvpptr)];
1839 mutex_enter(&bp->vb_lock);
1840 for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
1841 if (vep->ve_vpvfs == vfsvpptr) {
1842 vep->ve_refcnt++;
1843 mutex_exit(&bp->vb_lock);
1844 return (vep);
1847 mutex_exit(&bp->vb_lock);
1848 vep = kmem_alloc(sizeof (*vep), KM_SLEEP);
1849 rwst_init(&vep->ve_lock, NULL, RW_DEFAULT, NULL);
1850 vep->ve_vpvfs = (char *)vfsvpptr;
1851 vep->ve_refcnt = 1;
1852 mutex_enter(&bp->vb_lock);
1853 for (tvep = bp->vb_list; tvep != NULL; tvep = tvep->ve_next) {
1854 if (tvep->ve_vpvfs == vfsvpptr) {
1855 tvep->ve_refcnt++;
1856 mutex_exit(&bp->vb_lock);
1859 * There is already an entry in the hash
1860 * destroy what we just allocated.
1862 rwst_destroy(&vep->ve_lock);
1863 kmem_free(vep, sizeof (*vep));
1864 return (tvep);
1867 vep->ve_next = bp->vb_list;
1868 bp->vb_list = vep;
1869 mutex_exit(&bp->vb_lock);
1870 return (vep);
1873 void
1874 vn_vfslocks_rele(vn_vfslocks_entry_t *vepent)
1876 struct vn_vfslocks_bucket *bp;
1877 vn_vfslocks_entry_t *vep;
1878 vn_vfslocks_entry_t *pvep;
1880 ASSERT(vepent != NULL);
1881 ASSERT(vepent->ve_vpvfs != NULL);
1883 bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vepent->ve_vpvfs)];
1885 mutex_enter(&bp->vb_lock);
1886 vepent->ve_refcnt--;
1888 if ((int32_t)vepent->ve_refcnt < 0)
1889 cmn_err(CE_PANIC, "vn_vfslocks_rele: refcount negative");
1891 if (vepent->ve_refcnt == 0) {
1892 for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
1893 if (vep->ve_vpvfs == vepent->ve_vpvfs) {
1894 if (bp->vb_list == vep)
1895 bp->vb_list = vep->ve_next;
1896 else {
1897 /* LINTED */
1898 pvep->ve_next = vep->ve_next;
1900 mutex_exit(&bp->vb_lock);
1901 rwst_destroy(&vep->ve_lock);
1902 kmem_free(vep, sizeof (*vep));
1903 return;
1905 pvep = vep;
1907 cmn_err(CE_PANIC, "vn_vfslocks_rele: vp/vfs not found");
1909 mutex_exit(&bp->vb_lock);
1913 * vn_vfswlock_wait is used to implement a lock which is logically a writers
1914 * lock protecting the v_vfsmountedhere field.
1915 * vn_vfswlock_wait has been modified to be similar to vn_vfswlock,
1916 * except that it blocks to acquire the lock VVFSLOCK.
1918 * traverse() and routines re-implementing part of traverse (e.g. autofs)
1919 * need to hold this lock. mount(), vn_rename(), vn_remove() and so on
1920 * need the non-blocking version of the writers lock i.e. vn_vfswlock
1923 vn_vfswlock_wait(vnode_t *vp)
1925 int retval;
1926 vn_vfslocks_entry_t *vpvfsentry;
1927 ASSERT(vp != NULL);
1929 vpvfsentry = vn_vfslocks_getlock(vp);
1930 retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_WRITER);
1932 if (retval == EINTR) {
1933 vn_vfslocks_rele(vpvfsentry);
1934 return (EINTR);
1936 return (retval);
1940 vn_vfsrlock_wait(vnode_t *vp)
1942 int retval;
1943 vn_vfslocks_entry_t *vpvfsentry;
1944 ASSERT(vp != NULL);
1946 vpvfsentry = vn_vfslocks_getlock(vp);
1947 retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_READER);
1949 if (retval == EINTR) {
1950 vn_vfslocks_rele(vpvfsentry);
1951 return (EINTR);
1954 return (retval);
1959 * vn_vfswlock is used to implement a lock which is logically a writers lock
1960 * protecting the v_vfsmountedhere field.
1963 vn_vfswlock(vnode_t *vp)
1965 vn_vfslocks_entry_t *vpvfsentry;
1968 * If vp is NULL then somebody is trying to lock the covered vnode
1969 * of /. (vfs_vnodecovered is NULL for /). This situation will
1970 * only happen when unmounting /. Since that operation will fail
1971 * anyway, return EBUSY here instead of in VFS_UNMOUNT.
1973 if (vp == NULL)
1974 return (EBUSY);
1976 vpvfsentry = vn_vfslocks_getlock(vp);
1978 if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER))
1979 return (0);
1981 vn_vfslocks_rele(vpvfsentry);
1982 return (EBUSY);
1986 vn_vfsrlock(vnode_t *vp)
1988 vn_vfslocks_entry_t *vpvfsentry;
1991 * If vp is NULL then somebody is trying to lock the covered vnode
1992 * of /. (vfs_vnodecovered is NULL for /). This situation will
1993 * only happen when unmounting /. Since that operation will fail
1994 * anyway, return EBUSY here instead of in VFS_UNMOUNT.
1996 if (vp == NULL)
1997 return (EBUSY);
1999 vpvfsentry = vn_vfslocks_getlock(vp);
2001 if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER))
2002 return (0);
2004 vn_vfslocks_rele(vpvfsentry);
2005 return (EBUSY);
2008 void
2009 vn_vfsunlock(vnode_t *vp)
2011 vn_vfslocks_entry_t *vpvfsentry;
2014 * ve_refcnt needs to be decremented twice.
2015 * 1. To release refernce after a call to vn_vfslocks_getlock()
2016 * 2. To release the reference from the locking routines like
2017 * vn_vfsrlock/vn_vfswlock etc,.
2019 vpvfsentry = vn_vfslocks_getlock(vp);
2020 vn_vfslocks_rele(vpvfsentry);
2022 rwst_exit(&vpvfsentry->ve_lock);
2023 vn_vfslocks_rele(vpvfsentry);
2027 vn_vfswlock_held(vnode_t *vp)
2029 int held;
2030 vn_vfslocks_entry_t *vpvfsentry;
2032 ASSERT(vp != NULL);
2034 vpvfsentry = vn_vfslocks_getlock(vp);
2035 held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER);
2037 vn_vfslocks_rele(vpvfsentry);
2038 return (held);
2043 * Vnode cache.
2046 /* ARGSUSED */
2047 static int
2048 vn_cache_constructor(void *buf, void *cdrarg, int kmflags)
2050 struct vnode *vp;
2052 vp = buf;
2054 mutex_init(&vp->v_lock, NULL, MUTEX_DEFAULT, NULL);
2055 mutex_init(&vp->v_vsd_lock, NULL, MUTEX_DEFAULT, NULL);
2056 cv_init(&vp->v_cv, NULL, CV_DEFAULT, NULL);
2057 rw_init(&vp->v_nbllock, NULL, RW_DEFAULT, NULL);
2058 vp->v_femhead = NULL; /* Must be done before vn_reinit() */
2059 vp->v_path = vn_vpath_empty;
2060 vp->v_path_stamp = 0;
2061 vp->v_mpssdata = NULL;
2062 vp->v_vsd = NULL;
2063 vp->v_fopdata = NULL;
2065 vmobject_init(&vp->v_object, vp);
2067 return (0);
2070 /* ARGSUSED */
2071 static void
2072 vn_cache_destructor(void *buf, void *cdrarg)
2074 struct vnode *vp;
2076 vp = buf;
2078 vmobject_fini(&vp->v_object);
2080 rw_destroy(&vp->v_nbllock);
2081 cv_destroy(&vp->v_cv);
2082 mutex_destroy(&vp->v_vsd_lock);
2083 mutex_destroy(&vp->v_lock);
2086 void
2087 vn_create_cache(void)
2089 /* LINTED */
2090 ASSERT((1 << VNODE_ALIGN_LOG2) ==
2091 P2ROUNDUP(sizeof (struct vnode), VNODE_ALIGN));
2092 vn_cache = kmem_cache_create("vn_cache", sizeof (struct vnode),
2093 VNODE_ALIGN, vn_cache_constructor, vn_cache_destructor, NULL, NULL,
2094 NULL, 0);
2097 void
2098 vn_destroy_cache(void)
2100 kmem_cache_destroy(vn_cache);
2104 * Used by file systems when fs-specific nodes (e.g., ufs inodes) are
2105 * cached by the file system and vnodes remain associated.
2107 void
2108 vn_recycle(vnode_t *vp)
2110 ASSERT(!vn_has_cached_data(vp));
2111 VERIFY(vp->v_path != NULL);
2114 * XXX - This really belongs in vn_reinit(), but we have some issues
2115 * with the counts. Best to have it here for clean initialization.
2117 vp->v_rdcnt = 0;
2118 vp->v_wrcnt = 0;
2119 vp->v_mmap_read = 0;
2120 vp->v_mmap_write = 0;
2123 * If FEM was in use, make sure everything gets cleaned up
2124 * NOTE: vp->v_femhead is initialized to NULL in the vnode
2125 * constructor.
2127 if (vp->v_femhead) {
2128 /* XXX - There should be a free_femhead() that does all this */
2129 ASSERT(vp->v_femhead->femh_list == NULL);
2130 mutex_destroy(&vp->v_femhead->femh_lock);
2131 kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
2132 vp->v_femhead = NULL;
2134 if (vp->v_path != vn_vpath_empty) {
2135 kmem_free(vp->v_path, strlen(vp->v_path) + 1);
2136 vp->v_path = vn_vpath_empty;
2138 vp->v_path_stamp = 0;
2140 if (vp->v_fopdata != NULL) {
2141 free_fopdata(vp);
2143 vp->v_mpssdata = NULL;
2144 vsd_free(vp);
2148 * Used to reset the vnode fields including those that are directly accessible
2149 * as well as those which require an accessor function.
2151 * Does not initialize:
2152 * synchronization objects: v_lock, v_vsd_lock, v_nbllock, v_cv
2153 * v_data (since FS-nodes and vnodes point to each other and should
2154 * be updated simultaneously)
2155 * v_op (in case someone needs to make a VOP call on this object)
2157 void
2158 vn_reinit(vnode_t *vp)
2160 vp->v_count = 1;
2161 vp->v_count_dnlc = 0;
2162 vp->v_vfsp = NULL;
2163 vp->v_stream = NULL;
2164 vp->v_vfsmountedhere = NULL;
2165 vp->v_flag = 0;
2166 vp->v_type = VNON;
2167 vp->v_rdev = NODEV;
2169 vp->v_filocks = NULL;
2170 vp->v_shrlocks = NULL;
2171 VERIFY(!vn_has_cached_data(vp));
2173 vp->v_locality = NULL;
2174 vp->v_xattrdir = NULL;
2177 * In a few specific instances, vn_reinit() is used to initialize
2178 * locally defined vnode_t instances. Lacking the construction offered
2179 * by vn_alloc(), these vnodes require v_path initialization.
2181 if (vp->v_path == NULL) {
2182 vp->v_path = vn_vpath_empty;
2185 /* Handles v_femhead, v_path, and the r/w/map counts */
2186 vn_recycle(vp);
2189 vnode_t *
2190 vn_alloc(int kmflag)
2192 vnode_t *vp;
2194 vp = kmem_cache_alloc(vn_cache, kmflag);
2196 if (vp != NULL) {
2197 vp->v_femhead = NULL; /* Must be done before vn_reinit() */
2198 vp->v_fopdata = NULL;
2199 vn_reinit(vp);
2202 return (vp);
2205 void
2206 vn_free(vnode_t *vp)
2208 ASSERT(vp->v_shrlocks == NULL);
2209 ASSERT(vp->v_filocks == NULL);
2212 * Some file systems call vn_free() with v_count of zero,
2213 * some with v_count of 1. In any case, the value should
2214 * never be anything else.
2216 ASSERT((vp->v_count == 0) || (vp->v_count == 1));
2217 ASSERT(vp->v_count_dnlc == 0);
2218 VERIFY(vp->v_path != NULL);
2219 if (vp->v_path != vn_vpath_empty) {
2220 kmem_free(vp->v_path, strlen(vp->v_path) + 1);
2221 vp->v_path = vn_vpath_empty;
2224 /* If FEM was in use, make sure everything gets cleaned up */
2225 if (vp->v_femhead) {
2226 /* XXX - There should be a free_femhead() that does all this */
2227 ASSERT(vp->v_femhead->femh_list == NULL);
2228 mutex_destroy(&vp->v_femhead->femh_lock);
2229 kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
2230 vp->v_femhead = NULL;
2233 if (vp->v_fopdata != NULL) {
2234 free_fopdata(vp);
2236 vp->v_mpssdata = NULL;
2237 vsd_free(vp);
2238 kmem_cache_free(vn_cache, vp);
2242 * vnode status changes, should define better states than 1, 0.
2244 void
2245 vn_reclaim(vnode_t *vp)
2247 vfs_t *vfsp = vp->v_vfsp;
2249 if (vfsp == NULL ||
2250 vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2251 return;
2253 (void) VFS_VNSTATE(vfsp, vp, VNTRANS_RECLAIMED);
2256 void
2257 vn_idle(vnode_t *vp)
2259 vfs_t *vfsp = vp->v_vfsp;
2261 if (vfsp == NULL ||
2262 vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2263 return;
2265 (void) VFS_VNSTATE(vfsp, vp, VNTRANS_IDLED);
2267 void
2268 vn_exists(vnode_t *vp)
2270 vfs_t *vfsp = vp->v_vfsp;
2272 if (vfsp == NULL ||
2273 vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2274 return;
2276 (void) VFS_VNSTATE(vfsp, vp, VNTRANS_EXISTS);
2279 void
2280 vn_invalid(vnode_t *vp)
2282 vfs_t *vfsp = vp->v_vfsp;
2284 if (vfsp == NULL ||
2285 vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2286 return;
2288 (void) VFS_VNSTATE(vfsp, vp, VNTRANS_DESTROYED);
2291 /* Vnode event notification */
2294 vnevent_support(vnode_t *vp, caller_context_t *ct)
2296 if (vp == NULL)
2297 return (EINVAL);
2299 return (fop_vnevent(vp, VE_SUPPORT, NULL, NULL, ct));
2302 void
2303 vnevent_rename_src(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2305 if (vp == NULL || vp->v_femhead == NULL) {
2306 return;
2308 (void) fop_vnevent(vp, VE_RENAME_SRC, dvp, name, ct);
2311 void
2312 vnevent_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
2313 caller_context_t *ct)
2315 if (vp == NULL || vp->v_femhead == NULL) {
2316 return;
2318 (void) fop_vnevent(vp, VE_RENAME_DEST, dvp, name, ct);
2321 void
2322 vnevent_rename_dest_dir(vnode_t *vp, caller_context_t *ct)
2324 if (vp == NULL || vp->v_femhead == NULL) {
2325 return;
2327 (void) fop_vnevent(vp, VE_RENAME_DEST_DIR, NULL, NULL, ct);
2330 void
2331 vnevent_remove(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2333 if (vp == NULL || vp->v_femhead == NULL) {
2334 return;
2336 (void) fop_vnevent(vp, VE_REMOVE, dvp, name, ct);
2339 void
2340 vnevent_rmdir(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2342 if (vp == NULL || vp->v_femhead == NULL) {
2343 return;
2345 (void) fop_vnevent(vp, VE_RMDIR, dvp, name, ct);
2348 void
2349 vnevent_pre_rename_src(vnode_t *vp, vnode_t *dvp, char *name,
2350 caller_context_t *ct)
2352 if (vp == NULL || vp->v_femhead == NULL) {
2353 return;
2355 (void) fop_vnevent(vp, VE_PRE_RENAME_SRC, dvp, name, ct);
2358 void
2359 vnevent_pre_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
2360 caller_context_t *ct)
2362 if (vp == NULL || vp->v_femhead == NULL) {
2363 return;
2365 (void) fop_vnevent(vp, VE_PRE_RENAME_DEST, dvp, name, ct);
2368 void
2369 vnevent_pre_rename_dest_dir(vnode_t *vp, vnode_t *nvp, char *name,
2370 caller_context_t *ct)
2372 if (vp == NULL || vp->v_femhead == NULL) {
2373 return;
2375 (void) fop_vnevent(vp, VE_PRE_RENAME_DEST_DIR, nvp, name, ct);
2378 void
2379 vnevent_create(vnode_t *vp, caller_context_t *ct)
2381 if (vp == NULL || vp->v_femhead == NULL) {
2382 return;
2384 (void) fop_vnevent(vp, VE_CREATE, NULL, NULL, ct);
2387 void
2388 vnevent_link(vnode_t *vp, caller_context_t *ct)
2390 if (vp == NULL || vp->v_femhead == NULL) {
2391 return;
2393 (void) fop_vnevent(vp, VE_LINK, NULL, NULL, ct);
2396 void
2397 vnevent_mountedover(vnode_t *vp, caller_context_t *ct)
2399 if (vp == NULL || vp->v_femhead == NULL) {
2400 return;
2402 (void) fop_vnevent(vp, VE_MOUNTEDOVER, NULL, NULL, ct);
2405 void
2406 vnevent_truncate(vnode_t *vp, caller_context_t *ct)
2408 if (vp == NULL || vp->v_femhead == NULL) {
2409 return;
2411 (void) fop_vnevent(vp, VE_TRUNCATE, NULL, NULL, ct);
2415 * Vnode accessors.
2419 vn_is_readonly(vnode_t *vp)
2421 return (vp->v_vfsp->vfs_flag & VFS_RDONLY);
2425 vn_has_flocks(vnode_t *vp)
2427 return (vp->v_filocks != NULL);
2431 vn_has_mandatory_locks(vnode_t *vp, int mode)
2433 return ((vp->v_filocks != NULL) && (MANDLOCK(vp, mode)));
2437 vn_has_cached_data(vnode_t *vp)
2439 return (!list_is_empty(&vp->v_object.list));
2443 * Return 0 if the vnode in question shouldn't be permitted into a zone via
2444 * zone_enter(2).
2447 vn_can_change_zones(vnode_t *vp)
2449 struct vfssw *vswp;
2450 int allow = 1;
2451 vnode_t *rvp;
2453 if (nfs_global_client_only != 0)
2454 return (1);
2457 * We always want to look at the underlying vnode if there is one.
2459 if (fop_realvp(vp, &rvp, NULL) != 0)
2460 rvp = vp;
2462 * Some pseudo filesystems (including doorfs) don't actually register
2463 * their vfsops_t, so the following may return NULL; we happily let
2464 * such vnodes switch zones.
2466 vswp = vfs_getvfsswbyvfsops(vfs_getops(rvp->v_vfsp));
2467 if (vswp != NULL) {
2468 if (vswp->vsw_flag & VSW_NOTZONESAFE)
2469 allow = 0;
2470 vfs_unrefvfssw(vswp);
2472 return (allow);
2476 * Return nonzero if the vnode is a mount point, zero if not.
2479 vn_ismntpt(vnode_t *vp)
2481 return (vp->v_vfsmountedhere != NULL);
2484 /* Retrieve the vfs (if any) mounted on this vnode */
2485 vfs_t *
2486 vn_mountedvfs(vnode_t *vp)
2488 return (vp->v_vfsmountedhere);
2492 * Return nonzero if the vnode is referenced by the dnlc, zero if not.
2495 vn_in_dnlc(vnode_t *vp)
2497 return (vp->v_count_dnlc > 0);
2501 * vn_has_other_opens() checks whether a particular file is opened by more than
2502 * just the caller and whether the open is for read and/or write.
2503 * This routine is for calling after the caller has already called fop_open()
2504 * and the caller wishes to know if they are the only one with it open for
2505 * the mode(s) specified.
2507 * Vnode counts are only kept on regular files (v_type=VREG).
2509 bool
2510 vn_has_other_opens(struct vnode *vp, v_mode_t mode)
2512 ASSERT(vp != NULL);
2514 switch (mode) {
2515 case V_WRITE:
2516 if (vp->v_wrcnt > 1)
2517 return true;
2518 break;
2519 case V_RDORWR:
2520 if ((vp->v_rdcnt > 1) || (vp->v_wrcnt > 1))
2521 return true;
2522 break;
2523 case V_RDANDWR:
2524 if ((vp->v_rdcnt > 1) && (vp->v_wrcnt > 1))
2525 return true;
2526 break;
2527 case V_READ:
2528 if (vp->v_rdcnt > 1)
2529 return true;
2530 break;
2533 return false;
2537 * vn_is_opened() checks whether a particular file is opened and
2538 * whether the open is for read and/or write.
2540 * Vnode counts are only kept on regular files (v_type=VREG).
2542 bool vn_is_opened(struct vnode *vp, v_mode_t mode)
2544 ASSERT(vp != NULL);
2546 switch (mode) {
2547 case V_WRITE:
2548 if (vp->v_wrcnt)
2549 return true;
2550 break;
2551 case V_RDANDWR:
2552 if (vp->v_rdcnt && vp->v_wrcnt)
2553 return true;
2554 break;
2555 case V_RDORWR:
2556 if (vp->v_rdcnt || vp->v_wrcnt)
2557 return true;
2558 break;
2559 case V_READ:
2560 if (vp->v_rdcnt)
2561 return true;
2562 break;
2565 return false;
2569 * vn_is_mapped() checks whether a particular file is mapped and whether
2570 * the file is mapped read and/or write.
2572 bool vn_is_mapped(struct vnode *vp, v_mode_t mode)
2574 ASSERT(vp != NULL);
2576 #if !defined(_LP64)
2577 switch (mode) {
2579 * The atomic_add_64_nv functions force atomicity in the
2580 * case of 32 bit architectures. Otherwise the 64 bit values
2581 * require two fetches. The value of the fields may be
2582 * (potentially) changed between the first fetch and the
2583 * second
2585 case V_WRITE:
2586 if (atomic_add_64_nv((&(vp->v_mmap_write)), 0))
2587 return true;
2588 break;
2589 case V_RDANDWR:
2590 if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) &&
2591 (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
2592 return true;
2593 break;
2594 case V_RDORWR:
2595 if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) ||
2596 (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
2597 return true;
2598 break;
2599 case V_READ:
2600 if (atomic_add_64_nv((&(vp->v_mmap_read)), 0))
2601 return true;
2602 break;
2604 #else
2605 switch (mode) {
2606 case V_WRITE:
2607 if (vp->v_mmap_write)
2608 return true;
2609 break;
2610 case V_RDANDWR:
2611 if (vp->v_mmap_read && vp->v_mmap_write)
2612 return true;
2613 break;
2614 case V_RDORWR:
2615 if (vp->v_mmap_read || vp->v_mmap_write)
2616 return true;
2617 break;
2618 case V_READ:
2619 if (vp->v_mmap_read)
2620 return true;
2621 break;
2623 #endif
2625 return false;
2629 * Set the operations vector for a vnode.
2631 void
2632 vn_setops(struct vnode *vnode, const struct vnodeops *ops)
2634 vnode->v_op = ops;
2638 * Retrieve the operations vector for a vnode
2640 const struct vnodeops *
2641 vn_getops(struct vnode *vnode)
2643 return vnode->v_op;
/*
 * Returns non-zero (1) if vnodeops is the operations vector currently
 * installed on vp (pointer comparison).  Returns zero (0) if not.
 */
int
vn_matchops(struct vnode *vp, const struct vnodeops *vnodeops)
{
	const struct vnodeops *installed = vn_getops(vp);

	return ((installed == vnodeops) ? 1 : 0);
}
2657 * fs_new_caller_id() needs to return a unique ID on a given local system.
2658 * The IDs do not need to survive across reboots. These are primarily
2659 * used so that (FEM) monitors can detect particular callers (such as
2660 * the NFS server) to a given vnode/vfs operation.
2662 u_longlong_t
2663 fs_new_caller_id()
2665 static uint64_t next_caller_id = 0LL; /* First call returns 1 */
2667 return ((u_longlong_t)atomic_inc_64_nv(&next_caller_id));
2671 * The value stored in v_path is relative to rootdir, located in the global
2672 * zone. Zones or chroot environments which reside deeper inside the VFS
2673 * hierarchy will have a relative view of MAXPATHLEN since they are unaware of
2674 * what lies below their perceived root. In order to keep v_path usable for
2675 * these child environments, its allocations are allowed to exceed MAXPATHLEN.
2677 * An upper bound of max_vnode_path is placed upon v_path allocations to
2678 * prevent the system from going too wild at the behest of pathological
2679 * behavior from the operator.
2681 size_t max_vnode_path = 4 * MAXPATHLEN;
2684 void
2685 vn_clearpath(vnode_t *vp, hrtime_t compare_stamp)
2687 char *buf;
2689 mutex_enter(&vp->v_lock);
2691 * If the snapshot of v_path_stamp passed in via compare_stamp does not
2692 * match the present value on the vnode, it indicates that subsequent
2693 * changes have occurred. The v_path value is not cleared in this case
2694 * since the new value may be valid.
2696 if (compare_stamp != 0 && vp->v_path_stamp != compare_stamp) {
2697 mutex_exit(&vp->v_lock);
2698 return;
2700 buf = vp->v_path;
2701 vp->v_path = vn_vpath_empty;
2702 vp->v_path_stamp = 0;
2703 mutex_exit(&vp->v_lock);
2704 if (buf != vn_vpath_empty) {
2705 kmem_free(buf, strlen(buf) + 1);
2709 static void
2710 vn_setpath_common(vnode_t *pvp, vnode_t *vp, const char *name, size_t len,
2711 boolean_t is_rename)
2713 char *buf, *oldbuf;
2714 hrtime_t pstamp;
2715 size_t baselen, buflen = 0;
2717 /* Handle the vn_setpath_str case. */
2718 if (pvp == NULL) {
2719 if (len + 1 > max_vnode_path) {
2720 DTRACE_PROBE4(vn__setpath__too__long, vnode_t *, pvp,
2721 vnode_t *, vp, char *, name, size_t, len + 1);
2722 return;
2724 buf = kmem_alloc(len + 1, KM_SLEEP);
2725 bcopy(name, buf, len);
2726 buf[len] = '\0';
2728 mutex_enter(&vp->v_lock);
2729 oldbuf = vp->v_path;
2730 vp->v_path = buf;
2731 vp->v_path_stamp = gethrtime();
2732 mutex_exit(&vp->v_lock);
2733 if (oldbuf != vn_vpath_empty) {
2734 kmem_free(oldbuf, strlen(oldbuf) + 1);
2736 return;
2739 /* Take snapshot of parent dir */
2740 mutex_enter(&pvp->v_lock);
2742 if ((pvp->v_flag & VTRAVERSE) != 0) {
2744 * When the parent vnode has VTRAVERSE set in its flags, normal
2745 * assumptions about v_path calculation no longer apply. The
2746 * primary situation where this occurs is via the VFS tricks
2747 * which procfs plays in order to allow /proc/PID/(root|cwd) to
2748 * yield meaningful results.
2750 * When this flag is set, v_path on the child must not be
2751 * updated since the calculated value is likely to be
2752 * incorrect, given the current context.
2754 mutex_exit(&pvp->v_lock);
2755 return;
2758 retrybuf:
2759 if (pvp->v_path == vn_vpath_empty) {
2761 * Without v_path from the parent directory, generating a child
2762 * path from the name is impossible.
2764 if (len > 0) {
2765 pstamp = pvp->v_path_stamp;
2766 mutex_exit(&pvp->v_lock);
2767 vn_clearpath(vp, pstamp);
2768 return;
2772 * The only feasible case here is where a NUL lookup is being
2773 * performed on rootdir prior to its v_path being populated.
2775 ASSERT(pvp->v_path_stamp == 0);
2776 baselen = 0;
2777 pstamp = 0;
2778 } else {
2779 pstamp = pvp->v_path_stamp;
2780 baselen = strlen(pvp->v_path);
2781 /* ignore a trailing slash if present */
2782 if (pvp->v_path[baselen - 1] == '/') {
2783 /* This should only the be case for rootdir */
2784 ASSERT(baselen == 1 && pvp == rootdir);
2785 baselen--;
2788 mutex_exit(&pvp->v_lock);
2790 if (buflen != 0) {
2791 /* Free the existing (mis-sized) buffer in case of retry */
2792 kmem_free(buf, buflen);
2794 /* base, '/', name and trailing NUL */
2795 buflen = baselen + len + 2;
2796 if (buflen > max_vnode_path) {
2797 DTRACE_PROBE4(vn__setpath_too__long, vnode_t *, pvp,
2798 vnode_t *, vp, char *, name, size_t, buflen);
2799 return;
2801 buf = kmem_alloc(buflen, KM_SLEEP);
2803 mutex_enter(&pvp->v_lock);
2804 if (pvp->v_path_stamp != pstamp) {
2805 size_t vlen;
2808 * Since v_path_stamp changed on the parent, it is likely that
2809 * v_path has been altered as well. If the length does not
2810 * exactly match what was previously measured, the buffer
2811 * allocation must be repeated for proper sizing.
2813 if (pvp->v_path == vn_vpath_empty) {
2814 /* Give up if parent lack v_path */
2815 mutex_exit(&pvp->v_lock);
2816 kmem_free(buf, buflen);
2817 return;
2819 vlen = strlen(pvp->v_path);
2820 if (pvp->v_path[vlen - 1] == '/') {
2821 vlen--;
2823 if (vlen != baselen) {
2824 goto retrybuf;
2827 bcopy(pvp->v_path, buf, baselen);
2828 mutex_exit(&pvp->v_lock);
2830 buf[baselen] = '/';
2831 baselen++;
2832 bcopy(name, &buf[baselen], len + 1);
2834 mutex_enter(&vp->v_lock);
2835 if (vp->v_path_stamp == 0) {
2836 /* never-visited vnode can inherit stamp from parent */
2837 ASSERT(vp->v_path == vn_vpath_empty);
2838 vp->v_path_stamp = pstamp;
2839 vp->v_path = buf;
2840 mutex_exit(&vp->v_lock);
2841 } else if (vp->v_path_stamp < pstamp || is_rename) {
2843 * Install the updated path and stamp, ensuring that the v_path
2844 * pointer is valid at all times for dtrace.
2846 oldbuf = vp->v_path;
2847 vp->v_path = buf;
2848 vp->v_path_stamp = gethrtime();
2849 mutex_exit(&vp->v_lock);
2850 kmem_free(oldbuf, strlen(oldbuf) + 1);
2851 } else {
2853 * If the timestamp matches or is greater, it means another
2854 * thread performed the update first while locks were dropped
2855 * here to make the allocation. We defer to the newer value.
2857 mutex_exit(&vp->v_lock);
2858 kmem_free(buf, buflen);
2860 ASSERT(MUTEX_NOT_HELD(&vp->v_lock));
2863 void
2864 vn_updatepath(vnode_t *pvp, vnode_t *vp, const char *name)
2866 size_t len;
2869 * If the parent is older or empty, there's nothing further to do.
2871 if (pvp->v_path == vn_vpath_empty ||
2872 pvp->v_path_stamp <= vp->v_path_stamp) {
2873 return;
2877 * Given the lack of appropriate context, meaningful updates to v_path
2878 * cannot be made for during lookups for the '.' or '..' entries.
2880 len = strlen(name);
2881 if (len == 0 || (len == 1 && name[0] == '.') ||
2882 (len == 2 && name[0] == '.' && name[1] == '.')) {
2883 return;
2886 vn_setpath_common(pvp, vp, name, len, B_FALSE);
2890 * Given a starting vnode and a path, updates the path in the target vnode in
2891 * a safe manner. If the vnode already has path information embedded, then the
2892 * cached path is left untouched.
2894 /* ARGSUSED */
2895 void
2896 vn_setpath(vnode_t *rootvp, vnode_t *pvp, vnode_t *vp, const char *name,
2897 size_t len)
2899 vn_setpath_common(pvp, vp, name, len, B_FALSE);
2903 * Sets the path to the vnode to be the given string, regardless of current
2904 * context. The string must be a complete path from rootdir. This is only used
2905 * by fsop_root() for setting the path based on the mountpoint.
2907 void
2908 vn_setpath_str(vnode_t *vp, const char *str, size_t len)
2910 vn_setpath_common(NULL, vp, str, len, B_FALSE);
2914 * Called from within filesystem's vop_rename() to handle renames once the
2915 * target vnode is available.
2917 void
2918 vn_renamepath(vnode_t *pvp, vnode_t *vp, const char *name, size_t len)
2920 vn_setpath_common(pvp, vp, name, len, B_TRUE);
2924 * Similar to vn_setpath_str(), this function sets the path of the destination
2925 * vnode to the be the same as the source vnode.
2927 void
2928 vn_copypath(struct vnode *src, struct vnode *dst)
2930 char *buf;
2931 hrtime_t stamp;
2932 size_t buflen;
2934 mutex_enter(&src->v_lock);
2935 if (src->v_path == vn_vpath_empty) {
2936 mutex_exit(&src->v_lock);
2937 return;
2939 buflen = strlen(src->v_path) + 1;
2940 mutex_exit(&src->v_lock);
2942 buf = kmem_alloc(buflen, KM_SLEEP);
2944 mutex_enter(&src->v_lock);
2945 if (src->v_path == vn_vpath_empty ||
2946 strlen(src->v_path) + 1 != buflen) {
2947 mutex_exit(&src->v_lock);
2948 kmem_free(buf, buflen);
2949 return;
2951 bcopy(src->v_path, buf, buflen);
2952 stamp = src->v_path_stamp;
2953 mutex_exit(&src->v_lock);
2955 mutex_enter(&dst->v_lock);
2956 if (dst->v_path != vn_vpath_empty) {
2957 mutex_exit(&dst->v_lock);
2958 kmem_free(buf, buflen);
2959 return;
2961 dst->v_path = buf;
2962 dst->v_path_stamp = stamp;
2963 mutex_exit(&dst->v_lock);
2968 * XXX Private interface for segvn routines that handle vnode
2969 * large page segments.
2971 * return 1 if vp's file system fop_pageio() implementation
2972 * can be safely used instead of fop_getpage() for handling
2973 * pagefaults against regular non swap files. fop_pageio()
2974 * interface is considered safe here if its implementation
2975 * is very close to fop_getpage() implementation.
2976 * e.g. It zero's out the part of the page beyond EOF. Doesn't
2977 * panic if there're file holes but instead returns an error.
2978 * Doesn't assume file won't be changed by user writes, etc.
2980 * return 0 otherwise.
2982 * For now allow segvn to only use fop_pageio() with ufs and nfs.
2985 vn_vmpss_usepageio(vnode_t *vp)
2987 vfs_t *vfsp = vp->v_vfsp;
2988 char *fsname = vfssw[vfsp->vfs_fstype].vsw_name;
2989 char *pageio_ok_fss[] = {"ufs", "nfs", NULL};
2990 char **fsok = pageio_ok_fss;
2992 if (fsname == NULL) {
2993 return (0);
2996 for (; *fsok; fsok++) {
2997 if (strcmp(*fsok, fsname) == 0) {
2998 return (1);
3001 return (0);
3004 /* VOP_XXX() macros call the corresponding fop_xxx() function */
3007 fop_open(
3008 vnode_t **vpp,
3009 int mode,
3010 cred_t *cr,
3011 caller_context_t *ct)
3013 int ret;
3014 vnode_t *vp = *vpp;
3016 VN_HOLD(vp);
3018 * Adding to the vnode counts before calling open
3019 * avoids the need for a mutex. It circumvents a race
3020 * condition where a query made on the vnode counts results in a
3021 * false negative. The inquirer goes away believing the file is
3022 * not open when there is an open on the file already under way.
3024 * The counts are meant to prevent NFS from granting a delegation
3025 * when it would be dangerous to do so.
3027 * The vnode counts are only kept on regular files
3029 if ((*vpp)->v_type == VREG) {
3030 if (mode & FREAD)
3031 atomic_inc_32(&(*vpp)->v_rdcnt);
3032 if (mode & FWRITE)
3033 atomic_inc_32(&(*vpp)->v_wrcnt);
3036 VOPXID_MAP_CR(vp, cr);
3038 ret = fop_open_dispatch(vpp, mode, cr, ct, true);
3040 if (ret) {
3042 * Use the saved vp just in case the vnode ptr got trashed
3043 * by the error.
3045 VOPSTATS_UPDATE(vp, open);
3046 if ((vp->v_type == VREG) && (mode & FREAD))
3047 atomic_dec_32(&vp->v_rdcnt);
3048 if ((vp->v_type == VREG) && (mode & FWRITE))
3049 atomic_dec_32(&vp->v_wrcnt);
3050 } else {
3052 * Some filesystems will return a different vnode,
3053 * but the same path was still used to open it.
3054 * So if we do change the vnode and need to
3055 * copy over the path, do so here, rather than special
3056 * casing each filesystem. Adjust the vnode counts to
3057 * reflect the vnode switch.
3059 VOPSTATS_UPDATE(*vpp, open);
3060 if (*vpp != vp && *vpp != NULL) {
3061 vn_copypath(vp, *vpp);
3062 if (((*vpp)->v_type == VREG) && (mode & FREAD))
3063 atomic_inc_32(&(*vpp)->v_rdcnt);
3064 if ((vp->v_type == VREG) && (mode & FREAD))
3065 atomic_dec_32(&vp->v_rdcnt);
3066 if (((*vpp)->v_type == VREG) && (mode & FWRITE))
3067 atomic_inc_32(&(*vpp)->v_wrcnt);
3068 if ((vp->v_type == VREG) && (mode & FWRITE))
3069 atomic_dec_32(&vp->v_wrcnt);
3072 VN_RELE(vp);
3073 return (ret);
3077 fop_close(
3078 vnode_t *vp,
3079 int flag,
3080 int count,
3081 offset_t offset,
3082 cred_t *cr,
3083 caller_context_t *ct)
3085 int err;
3087 VOPXID_MAP_CR(vp, cr);
3089 err = fop_close_dispatch(vp, flag, count, offset, cr, ct, true);
3091 VOPSTATS_UPDATE(vp, close);
3093 * Check passed in count to handle possible dups. Vnode counts are only
3094 * kept on regular files
3096 if ((vp->v_type == VREG) && (count == 1)) {
3097 if (flag & FREAD) {
3098 ASSERT(vp->v_rdcnt > 0);
3099 atomic_dec_32(&vp->v_rdcnt);
3101 if (flag & FWRITE) {
3102 ASSERT(vp->v_wrcnt > 0);
3103 atomic_dec_32(&vp->v_wrcnt);
3106 return (err);
3110 fop_read(
3111 vnode_t *vp,
3112 uio_t *uiop,
3113 int ioflag,
3114 cred_t *cr,
3115 caller_context_t *ct)
3117 int err;
3118 ssize_t resid_start = uiop->uio_resid;
3120 VOPXID_MAP_CR(vp, cr);
3122 err = fop_read_dispatch(vp, uiop, ioflag, cr, ct, true);
3124 VOPSTATS_UPDATE_IO(vp, read,
3125 read_bytes, (resid_start - uiop->uio_resid));
3126 return (err);
3130 fop_write(
3131 vnode_t *vp,
3132 uio_t *uiop,
3133 int ioflag,
3134 cred_t *cr,
3135 caller_context_t *ct)
3137 int err;
3138 ssize_t resid_start = uiop->uio_resid;
3140 VOPXID_MAP_CR(vp, cr);
3142 err = fop_write_dispatch(vp, uiop, ioflag, cr, ct, true);
3144 VOPSTATS_UPDATE_IO(vp, write,
3145 write_bytes, (resid_start - uiop->uio_resid));
3146 return (err);
3150 fop_ioctl(
3151 vnode_t *vp,
3152 int cmd,
3153 intptr_t arg,
3154 int flag,
3155 cred_t *cr,
3156 int *rvalp,
3157 caller_context_t *ct)
3159 int err;
3161 VOPXID_MAP_CR(vp, cr);
3163 err = fop_ioctl_dispatch(vp, cmd, arg, flag, cr, rvalp, ct, true);
3165 VOPSTATS_UPDATE(vp, ioctl);
3166 return (err);
3170 fop_setfl(
3171 vnode_t *vp,
3172 int oflags,
3173 int nflags,
3174 cred_t *cr,
3175 caller_context_t *ct)
3177 int err;
3179 VOPXID_MAP_CR(vp, cr);
3181 err = fop_setfl_dispatch(vp, oflags, nflags, cr, ct, true);
3183 VOPSTATS_UPDATE(vp, setfl);
3184 return (err);
3188 fop_getattr(
3189 vnode_t *vp,
3190 vattr_t *vap,
3191 int flags,
3192 cred_t *cr,
3193 caller_context_t *ct)
3195 int err;
3197 VOPXID_MAP_CR(vp, cr);
3200 * If this file system doesn't understand the xvattr extensions
3201 * then turn off the xvattr bit.
3203 if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
3204 vap->va_mask &= ~VATTR_XVATTR;
3208 * We're only allowed to skip the ACL check iff we used a 32 bit
3209 * ACE mask with fop_access() to determine permissions.
3211 if ((flags & ATTR_NOACLCHECK) &&
3212 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0)
3213 return (EINVAL);
3215 err = fop_getattr_dispatch(vp, vap, flags, cr, ct, true);
3217 VOPSTATS_UPDATE(vp, getattr);
3218 return (err);
3222 fop_setattr(
3223 vnode_t *vp,
3224 vattr_t *vap,
3225 int flags,
3226 cred_t *cr,
3227 caller_context_t *ct)
3229 int err;
3231 VOPXID_MAP_CR(vp, cr);
3234 * If this file system doesn't understand the xvattr extensions
3235 * then turn off the xvattr bit.
3237 if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
3238 vap->va_mask &= ~VATTR_XVATTR;
3242 * We're only allowed to skip the ACL check iff we used a 32 bit
3243 * ACE mask with fop_access() to determine permissions.
3245 if ((flags & ATTR_NOACLCHECK) &&
3246 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0)
3247 return (EINVAL);
3249 err = fop_setattr_dispatch(vp, vap, flags, cr, ct, true);
3251 VOPSTATS_UPDATE(vp, setattr);
3252 return (err);
3256 fop_access(
3257 vnode_t *vp,
3258 int mode,
3259 int flags,
3260 cred_t *cr,
3261 caller_context_t *ct)
3263 int err;
3265 if ((flags & V_ACE_MASK) &&
3266 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
3267 return (EINVAL);
3270 VOPXID_MAP_CR(vp, cr);
3272 err = fop_access_dispatch(vp, mode, flags, cr, ct, true);
3274 VOPSTATS_UPDATE(vp, access);
3275 return (err);
3279 fop_lookup(
3280 vnode_t *dvp,
3281 char *nm,
3282 vnode_t **vpp,
3283 pathname_t *pnp,
3284 int flags,
3285 vnode_t *rdir,
3286 cred_t *cr,
3287 caller_context_t *ct,
3288 int *deflags, /* Returned per-dirent flags */
3289 pathname_t *ppnp) /* Returned case-preserved name in directory */
3291 int ret;
3294 * If this file system doesn't support case-insensitive access
3295 * and said access is requested, fail quickly. It is required
3296 * that if the vfs supports case-insensitive lookup, it also
3297 * supports extended dirent flags.
3299 if (flags & FIGNORECASE &&
3300 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3301 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3302 return (EINVAL);
3304 VOPXID_MAP_CR(dvp, cr);
3306 if ((flags & LOOKUP_XATTR) && (flags & LOOKUP_HAVE_SYSATTR_DIR) == 0) {
3307 ret = xattr_dir_lookup(dvp, vpp, flags, cr);
3308 } else {
3309 ret = fop_lookup_dispatch(dvp, nm, vpp, pnp, flags, rdir, cr,
3310 ct, deflags, ppnp, true);
3313 if (ret == 0 && *vpp) {
3314 VOPSTATS_UPDATE(*vpp, lookup);
3315 vn_updatepath(dvp, *vpp, nm);
3318 return (ret);
3322 fop_create(
3323 vnode_t *dvp,
3324 char *name,
3325 vattr_t *vap,
3326 vcexcl_t excl,
3327 int mode,
3328 vnode_t **vpp,
3329 cred_t *cr,
3330 int flags,
3331 caller_context_t *ct,
3332 vsecattr_t *vsecp) /* ACL to set during create */
3334 int ret;
3336 if (vsecp != NULL &&
3337 vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
3338 return (EINVAL);
3341 * If this file system doesn't support case-insensitive access
3342 * and said access is requested, fail quickly.
3344 if (flags & FIGNORECASE &&
3345 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3346 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3347 return (EINVAL);
3349 VOPXID_MAP_CR(dvp, cr);
3351 ret = fop_create_dispatch(dvp, name, vap, excl, mode, vpp, cr, flags,
3352 ct, vsecp, true);
3354 if (ret == 0 && *vpp) {
3355 VOPSTATS_UPDATE(*vpp, create);
3356 vn_updatepath(dvp, *vpp, name);
3359 return (ret);
3363 fop_remove(
3364 vnode_t *dvp,
3365 char *nm,
3366 cred_t *cr,
3367 caller_context_t *ct,
3368 int flags)
3370 int err;
3373 * If this file system doesn't support case-insensitive access
3374 * and said access is requested, fail quickly.
3376 if (flags & FIGNORECASE &&
3377 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3378 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3379 return (EINVAL);
3381 VOPXID_MAP_CR(dvp, cr);
3383 err = fop_remove_dispatch(dvp, nm, cr, ct, flags, true);
3385 VOPSTATS_UPDATE(dvp, remove);
3386 return (err);
3390 fop_link(
3391 vnode_t *tdvp,
3392 vnode_t *svp,
3393 char *tnm,
3394 cred_t *cr,
3395 caller_context_t *ct,
3396 int flags)
3398 int err;
3401 * If the target file system doesn't support case-insensitive access
3402 * and said access is requested, fail quickly.
3404 if (flags & FIGNORECASE &&
3405 (vfs_has_feature(tdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3406 vfs_has_feature(tdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3407 return (EINVAL);
3409 VOPXID_MAP_CR(tdvp, cr);
3411 err = fop_link_dispatch(tdvp, svp, tnm, cr, ct, flags, true);
3413 VOPSTATS_UPDATE(tdvp, link);
3414 return (err);
3418 fop_rename(
3419 vnode_t *sdvp,
3420 char *snm,
3421 vnode_t *tdvp,
3422 char *tnm,
3423 cred_t *cr,
3424 caller_context_t *ct,
3425 int flags)
3427 int err;
3430 * If the file system involved does not support
3431 * case-insensitive access and said access is requested, fail
3432 * quickly.
3434 if (flags & FIGNORECASE &&
3435 ((vfs_has_feature(sdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3436 vfs_has_feature(sdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0)))
3437 return (EINVAL);
3439 VOPXID_MAP_CR(tdvp, cr);
3441 err = fop_rename_dispatch(sdvp, snm, tdvp, tnm, cr, ct, flags, true);
3443 VOPSTATS_UPDATE(sdvp, rename);
3444 return (err);
3448 fop_mkdir(
3449 vnode_t *dvp,
3450 char *dirname,
3451 vattr_t *vap,
3452 vnode_t **vpp,
3453 cred_t *cr,
3454 caller_context_t *ct,
3455 int flags,
3456 vsecattr_t *vsecp) /* ACL to set during create */
3458 int ret;
3460 if (vsecp != NULL &&
3461 vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
3462 return (EINVAL);
3465 * If this file system doesn't support case-insensitive access
3466 * and said access is requested, fail quickly.
3468 if (flags & FIGNORECASE &&
3469 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3470 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3471 return (EINVAL);
3473 VOPXID_MAP_CR(dvp, cr);
3475 ret = fop_mkdir_dispatch(dvp, dirname, vap, vpp, cr, ct, flags, vsecp,
3476 true);
3478 if (ret == 0 && *vpp) {
3479 VOPSTATS_UPDATE(*vpp, mkdir);
3480 vn_updatepath(dvp, *vpp, dirname);
3483 return (ret);
3487 fop_rmdir(
3488 vnode_t *dvp,
3489 char *nm,
3490 vnode_t *cdir,
3491 cred_t *cr,
3492 caller_context_t *ct,
3493 int flags)
3495 int err;
3498 * If this file system doesn't support case-insensitive access
3499 * and said access is requested, fail quickly.
3501 if (flags & FIGNORECASE &&
3502 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3503 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3504 return (EINVAL);
3506 VOPXID_MAP_CR(dvp, cr);
3508 err = fop_rmdir_dispatch(dvp, nm, cdir, cr, ct, flags, true);
3510 VOPSTATS_UPDATE(dvp, rmdir);
3511 return (err);
3515 fop_readdir(
3516 vnode_t *vp,
3517 uio_t *uiop,
3518 cred_t *cr,
3519 int *eofp,
3520 caller_context_t *ct,
3521 int flags)
3523 int err;
3524 ssize_t resid_start = uiop->uio_resid;
3527 * If this file system doesn't support retrieving directory
3528 * entry flags and said access is requested, fail quickly.
3530 if (flags & V_RDDIR_ENTFLAGS &&
3531 vfs_has_feature(vp->v_vfsp, VFSFT_DIRENTFLAGS) == 0)
3532 return (EINVAL);
3534 VOPXID_MAP_CR(vp, cr);
3536 err = fop_readdir_dispatch(vp, uiop, cr, eofp, ct, flags, true);
3538 VOPSTATS_UPDATE_IO(vp, readdir,
3539 readdir_bytes, (resid_start - uiop->uio_resid));
3540 return (err);
3544 fop_symlink(
3545 vnode_t *dvp,
3546 char *linkname,
3547 vattr_t *vap,
3548 char *target,
3549 cred_t *cr,
3550 caller_context_t *ct,
3551 int flags)
3553 int err;
3554 xvattr_t xvattr;
3557 * If this file system doesn't support case-insensitive access
3558 * and said access is requested, fail quickly.
3560 if (flags & FIGNORECASE &&
3561 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3562 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3563 return (EINVAL);
3565 VOPXID_MAP_CR(dvp, cr);
3567 /* check for reparse point */
3568 if ((vfs_has_feature(dvp->v_vfsp, VFSFT_REPARSE)) &&
3569 (strncmp(target, FS_REPARSE_TAG_STR,
3570 strlen(FS_REPARSE_TAG_STR)) == 0)) {
3571 if (!fs_reparse_mark(target, vap, &xvattr))
3572 vap = (vattr_t *)&xvattr;
3575 err = fop_symlink_dispatch(dvp, linkname, vap, target, cr, ct, flags,
3576 true);
3578 VOPSTATS_UPDATE(dvp, symlink);
3579 return (err);
3583 fop_readlink(
3584 vnode_t *vp,
3585 uio_t *uiop,
3586 cred_t *cr,
3587 caller_context_t *ct)
3589 int err;
3591 VOPXID_MAP_CR(vp, cr);
3593 err = fop_readlink_dispatch(vp, uiop, cr, ct, true);
3595 VOPSTATS_UPDATE(vp, readlink);
3596 return (err);
3600 fop_fsync(
3601 vnode_t *vp,
3602 int syncflag,
3603 cred_t *cr,
3604 caller_context_t *ct)
3606 int err;
3608 VOPXID_MAP_CR(vp, cr);
3610 err = fop_fsync_dispatch(vp, syncflag, cr, ct, true);
3612 VOPSTATS_UPDATE(vp, fsync);
3613 return (err);
3616 void
3617 fop_inactive(
3618 vnode_t *vp,
3619 cred_t *cr,
3620 caller_context_t *ct)
3622 /* Need to update stats before vop call since we may lose the vnode */
3623 VOPSTATS_UPDATE(vp, inactive);
3625 VOPXID_MAP_CR(vp, cr);
3627 fop_inactive_dispatch(vp, cr, ct, true);
3631 fop_fid(
3632 vnode_t *vp,
3633 fid_t *fidp,
3634 caller_context_t *ct)
3636 int err;
3638 err = fop_fid_dispatch(vp, fidp, ct, true);
3640 VOPSTATS_UPDATE(vp, fid);
3641 return (err);
3645 fop_rwlock(
3646 vnode_t *vp,
3647 int write_lock,
3648 caller_context_t *ct)
3650 int ret;
3652 ret = fop_rwlock_dispatch(vp, write_lock, ct, true);
3654 VOPSTATS_UPDATE(vp, rwlock);
3655 return (ret);
3658 void
3659 fop_rwunlock(
3660 vnode_t *vp,
3661 int write_lock,
3662 caller_context_t *ct)
3664 fop_rwunlock_dispatch(vp, write_lock, ct, true);
3666 VOPSTATS_UPDATE(vp, rwunlock);
3670 fop_seek(
3671 vnode_t *vp,
3672 offset_t ooff,
3673 offset_t *noffp,
3674 caller_context_t *ct)
3676 int err;
3678 err = fop_seek_dispatch(vp, ooff, noffp, ct, true);
3680 VOPSTATS_UPDATE(vp, seek);
3681 return (err);
3685 fop_cmp(
3686 vnode_t *vp1,
3687 vnode_t *vp2,
3688 caller_context_t *ct)
3690 int err;
3692 err = fop_cmp_dispatch(vp1, vp2, ct, true);
3694 VOPSTATS_UPDATE(vp1, cmp);
3695 return (err);
3699 fop_frlock(
3700 vnode_t *vp,
3701 int cmd,
3702 flock64_t *bfp,
3703 int flag,
3704 offset_t offset,
3705 struct flk_callback *flk_cbp,
3706 cred_t *cr,
3707 caller_context_t *ct)
3709 int err;
3711 VOPXID_MAP_CR(vp, cr);
3713 err = fop_frlock_dispatch(vp, cmd, bfp, flag, offset, flk_cbp, cr,
3714 ct, true);
3716 VOPSTATS_UPDATE(vp, frlock);
3717 return (err);
3721 fop_space(
3722 vnode_t *vp,
3723 int cmd,
3724 flock64_t *bfp,
3725 int flag,
3726 offset_t offset,
3727 cred_t *cr,
3728 caller_context_t *ct)
3730 int err;
3732 VOPXID_MAP_CR(vp, cr);
3734 err = fop_space_dispatch(vp, cmd, bfp, flag, offset, cr, ct, true);
3736 VOPSTATS_UPDATE(vp, space);
3737 return (err);
3741 fop_realvp(
3742 vnode_t *vp,
3743 vnode_t **vpp,
3744 caller_context_t *ct)
3746 int err;
3748 err = fop_realvp_dispatch(vp, vpp, ct, true);
3750 VOPSTATS_UPDATE(vp, realvp);
3751 return (err);
3755 fop_getpage(
3756 vnode_t *vp,
3757 offset_t off,
3758 size_t len,
3759 uint_t *protp,
3760 page_t **plarr,
3761 size_t plsz,
3762 struct seg *seg,
3763 caddr_t addr,
3764 enum seg_rw rw,
3765 cred_t *cr,
3766 caller_context_t *ct)
3768 int err;
3770 VOPXID_MAP_CR(vp, cr);
3772 err = fop_getpage_dispatch(vp, off, len, protp, plarr, plsz, seg,
3773 addr, rw, cr, ct, true);
3775 VOPSTATS_UPDATE(vp, getpage);
3776 return (err);
3780 fop_putpage(
3781 vnode_t *vp,
3782 offset_t off,
3783 size_t len,
3784 int flags,
3785 cred_t *cr,
3786 caller_context_t *ct)
3788 int err;
3790 VOPXID_MAP_CR(vp, cr);
3792 err = fop_putpage_dispatch(vp, off, len, flags, cr, ct, true);
3794 VOPSTATS_UPDATE(vp, putpage);
3795 return (err);
3799 fop_map(
3800 vnode_t *vp,
3801 offset_t off,
3802 struct as *as,
3803 caddr_t *addrp,
3804 size_t len,
3805 uchar_t prot,
3806 uchar_t maxprot,
3807 uint_t flags,
3808 cred_t *cr,
3809 caller_context_t *ct)
3811 int err;
3813 VOPXID_MAP_CR(vp, cr);
3815 err = fop_map_dispatch(vp, off, as, addrp, len, prot, maxprot,
3816 flags, cr, ct, true);
3818 VOPSTATS_UPDATE(vp, map);
3819 return (err);
3823 fop_addmap(
3824 vnode_t *vp,
3825 offset_t off,
3826 struct as *as,
3827 caddr_t addr,
3828 size_t len,
3829 uchar_t prot,
3830 uchar_t maxprot,
3831 uint_t flags,
3832 cred_t *cr,
3833 caller_context_t *ct)
3835 int error;
3836 u_longlong_t delta;
3838 VOPXID_MAP_CR(vp, cr);
3840 error = fop_addmap_dispatch(vp, off, as, addr, len, prot, maxprot,
3841 flags, cr, ct, true);
3843 if ((!error) && (vp->v_type == VREG)) {
3844 delta = (u_longlong_t)btopr(len);
3846 * If file is declared MAP_PRIVATE, it can't be written back
3847 * even if open for write. Handle as read.
3849 if (flags & MAP_PRIVATE) {
3850 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
3851 (int64_t)delta);
3852 } else {
3854 * atomic_add_64 forces the fetch of a 64 bit value to
3855 * be atomic on 32 bit machines
3857 if (maxprot & PROT_WRITE)
3858 atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
3859 (int64_t)delta);
3860 if (maxprot & PROT_READ)
3861 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
3862 (int64_t)delta);
3863 if (maxprot & PROT_EXEC)
3864 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
3865 (int64_t)delta);
3868 VOPSTATS_UPDATE(vp, addmap);
3869 return (error);
3873 fop_delmap(
3874 vnode_t *vp,
3875 offset_t off,
3876 struct as *as,
3877 caddr_t addr,
3878 size_t len,
3879 uint_t prot,
3880 uint_t maxprot,
3881 uint_t flags,
3882 cred_t *cr,
3883 caller_context_t *ct)
3885 int error;
3886 u_longlong_t delta;
3888 VOPXID_MAP_CR(vp, cr);
3890 error = fop_delmap_dispatch(vp, off, as, addr, len, prot, maxprot,
3891 flags, cr, ct, true);
3894 * NFS calls into delmap twice, the first time
3895 * it simply establishes a callback mechanism and returns EAGAIN
3896 * while the real work is being done upon the second invocation.
3897 * We have to detect this here and only decrement the counts upon
3898 * the second delmap request.
3900 if ((error != EAGAIN) && (vp->v_type == VREG)) {
3902 delta = (u_longlong_t)btopr(len);
3904 if (flags & MAP_PRIVATE) {
3905 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
3906 (int64_t)(-delta));
3907 } else {
3909 * atomic_add_64 forces the fetch of a 64 bit value
3910 * to be atomic on 32 bit machines
3912 if (maxprot & PROT_WRITE)
3913 atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
3914 (int64_t)(-delta));
3915 if (maxprot & PROT_READ)
3916 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
3917 (int64_t)(-delta));
3918 if (maxprot & PROT_EXEC)
3919 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
3920 (int64_t)(-delta));
3923 VOPSTATS_UPDATE(vp, delmap);
3924 return (error);
3929 fop_poll(
3930 vnode_t *vp,
3931 short events,
3932 int anyyet,
3933 short *reventsp,
3934 struct pollhead **phpp,
3935 caller_context_t *ct)
3937 int err;
3939 err = fop_poll_dispatch(vp, events, anyyet, reventsp, phpp, ct, true);
3941 VOPSTATS_UPDATE(vp, poll);
3942 return (err);
3946 fop_dump(
3947 vnode_t *vp,
3948 caddr_t addr,
3949 offset_t lbdn,
3950 offset_t dblks,
3951 caller_context_t *ct)
3953 int err;
3955 /* ensure lbdn and dblks can be passed safely to bdev_dump */
3956 if ((lbdn != (daddr_t)lbdn) || (dblks != (int)dblks))
3957 return (EIO);
3959 err = fop_dump_dispatch(vp, addr, lbdn, dblks, ct, true);
3961 VOPSTATS_UPDATE(vp, dump);
3962 return (err);
3966 fop_pathconf(
3967 vnode_t *vp,
3968 int cmd,
3969 ulong_t *valp,
3970 cred_t *cr,
3971 caller_context_t *ct)
3973 int err;
3975 VOPXID_MAP_CR(vp, cr);
3977 err = fop_pathconf_dispatch(vp, cmd, valp, cr, ct, true);
3979 VOPSTATS_UPDATE(vp, pathconf);
3980 return (err);
3984 fop_pageio(
3985 vnode_t *vp,
3986 struct page *pp,
3987 uoff_t io_off,
3988 size_t io_len,
3989 int flags,
3990 cred_t *cr,
3991 caller_context_t *ct)
3993 int err;
3995 VOPXID_MAP_CR(vp, cr);
3997 err = fop_pageio_dispatch(vp, pp, io_off, io_len, flags, cr, ct, true);
3999 VOPSTATS_UPDATE(vp, pageio);
4000 return (err);
4004 fop_dumpctl(
4005 vnode_t *vp,
4006 int action,
4007 offset_t *blkp,
4008 caller_context_t *ct)
4010 int err;
4012 err = fop_dumpctl_dispatch(vp, action, blkp, ct, true);
4014 VOPSTATS_UPDATE(vp, dumpctl);
4015 return (err);
4018 void
4019 fop_dispose(
4020 vnode_t *vp,
4021 page_t *pp,
4022 int flag,
4023 int dn,
4024 cred_t *cr,
4025 caller_context_t *ct)
4027 /* Must do stats first since it's possible to lose the vnode */
4028 VOPSTATS_UPDATE(vp, dispose);
4030 VOPXID_MAP_CR(vp, cr);
4032 fop_dispose_dispatch(vp, pp, flag, dn, cr, ct, true);
4036 fop_setsecattr(
4037 vnode_t *vp,
4038 vsecattr_t *vsap,
4039 int flag,
4040 cred_t *cr,
4041 caller_context_t *ct)
4043 int err;
4045 VOPXID_MAP_CR(vp, cr);
4048 * We're only allowed to skip the ACL check iff we used a 32 bit
4049 * ACE mask with fop_access() to determine permissions.
4051 if ((flag & ATTR_NOACLCHECK) &&
4052 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
4053 return (EINVAL);
4056 err = fop_setsecattr_dispatch(vp, vsap, flag, cr, ct, true);
4058 VOPSTATS_UPDATE(vp, setsecattr);
4059 return (err);
4063 fop_getsecattr(
4064 vnode_t *vp,
4065 vsecattr_t *vsap,
4066 int flag,
4067 cred_t *cr,
4068 caller_context_t *ct)
4070 int err;
4073 * We're only allowed to skip the ACL check iff we used a 32 bit
4074 * ACE mask with fop_access() to determine permissions.
4076 if ((flag & ATTR_NOACLCHECK) &&
4077 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
4078 return (EINVAL);
4081 VOPXID_MAP_CR(vp, cr);
4083 err = fop_getsecattr_dispatch(vp, vsap, flag, cr, ct, true);
4085 VOPSTATS_UPDATE(vp, getsecattr);
4086 return (err);
4090 fop_shrlock(
4091 vnode_t *vp,
4092 int cmd,
4093 struct shrlock *shr,
4094 int flag,
4095 cred_t *cr,
4096 caller_context_t *ct)
4098 int err;
4100 VOPXID_MAP_CR(vp, cr);
4102 err = fop_shrlock_dispatch(vp, cmd, shr, flag, cr, ct, true);
4104 VOPSTATS_UPDATE(vp, shrlock);
4105 return (err);
4109 fop_vnevent(vnode_t *vp, vnevent_t vnevent, vnode_t *dvp, char *fnm,
4110 caller_context_t *ct)
4112 int err;
4114 err = fop_vnevent_dispatch(vp, vnevent, dvp, fnm, ct, true);
4116 VOPSTATS_UPDATE(vp, vnevent);
4117 return (err);
4121 fop_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *uiop, cred_t *cr,
4122 caller_context_t *ct)
4124 int err;
4126 if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
4127 return (ENOTSUP);
4129 err = fop_reqzcbuf_dispatch(vp, ioflag, uiop, cr, ct, true);
4131 VOPSTATS_UPDATE(vp, reqzcbuf);
4132 return (err);
4136 fop_retzcbuf(vnode_t *vp, xuio_t *uiop, cred_t *cr, caller_context_t *ct)
4138 int err;
4140 if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
4141 return (ENOTSUP);
4143 err = fop_retzcbuf_dispatch(vp, uiop, cr, ct, true);
4145 VOPSTATS_UPDATE(vp, retzcbuf);
4146 return (err);
/*
 * Default (no-op) VSD destructor.
 *
 * A NULL entry in the destructor table marks a key as unused, so keys
 * created without a destructor are given this placeholder instead.
 */
/* ARGSUSED */
void
vsd_defaultdestructor(void *value)
{
}
4159 * Create a key (index into per vnode array)
4160 * Locks out vsd_create, vsd_destroy, and vsd_free
4161 * May allocate memory with lock held
4163 void
4164 vsd_create(uint_t *keyp, void (*destructor)(void *))
4166 int i;
4167 uint_t nkeys;
4170 * if key is allocated, do nothing
4172 mutex_enter(&vsd_lock);
4173 if (*keyp) {
4174 mutex_exit(&vsd_lock);
4175 return;
4178 * find an unused key
4180 if (destructor == NULL)
4181 destructor = vsd_defaultdestructor;
4183 for (i = 0; i < vsd_nkeys; ++i)
4184 if (vsd_destructor[i] == NULL)
4185 break;
4188 * if no unused keys, increase the size of the destructor array
4190 if (i == vsd_nkeys) {
4191 if ((nkeys = (vsd_nkeys << 1)) == 0)
4192 nkeys = 1;
4193 vsd_destructor =
4194 (void (**)(void *))vsd_realloc((void *)vsd_destructor,
4195 (size_t)(vsd_nkeys * sizeof (void (*)(void *))),
4196 (size_t)(nkeys * sizeof (void (*)(void *))));
4197 vsd_nkeys = nkeys;
4201 * allocate the next available unused key
4203 vsd_destructor[i] = destructor;
4204 *keyp = i + 1;
4206 /* create vsd_list, if it doesn't exist */
4207 if (vsd_list == NULL) {
4208 vsd_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
4209 list_create(vsd_list, sizeof (struct vsd_node),
4210 offsetof(struct vsd_node, vs_nodes));
4213 mutex_exit(&vsd_lock);
4217 * Destroy a key
4219 * Assumes that the caller is preventing vsd_set and vsd_get
4220 * Locks out vsd_create, vsd_destroy, and vsd_free
4221 * May free memory with lock held
4223 void
4224 vsd_destroy(uint_t *keyp)
4226 uint_t key;
4227 struct vsd_node *vsd;
4230 * protect the key namespace and our destructor lists
4232 mutex_enter(&vsd_lock);
4233 key = *keyp;
4234 *keyp = 0;
4236 ASSERT(key <= vsd_nkeys);
4239 * if the key is valid
4241 if (key != 0) {
4242 uint_t k = key - 1;
4244 * for every vnode with VSD, call key's destructor
4246 for (vsd = list_head(vsd_list); vsd != NULL;
4247 vsd = list_next(vsd_list, vsd)) {
4249 * no VSD for key in this vnode
4251 if (key > vsd->vs_nkeys)
4252 continue;
4254 * call destructor for key
4256 if (vsd->vs_value[k] && vsd_destructor[k])
4257 (*vsd_destructor[k])(vsd->vs_value[k]);
4259 * reset value for key
4261 vsd->vs_value[k] = NULL;
4264 * actually free the key (NULL destructor == unused)
4266 vsd_destructor[k] = NULL;
4269 mutex_exit(&vsd_lock);
4273 * Quickly return the per vnode value that was stored with the specified key
4274 * Assumes the caller is protecting key from vsd_create and vsd_destroy
4275 * Assumes the caller is holding v_vsd_lock to protect the vsd.
4277 void *
4278 vsd_get(vnode_t *vp, uint_t key)
4280 struct vsd_node *vsd;
4282 ASSERT(vp != NULL);
4283 ASSERT(mutex_owned(&vp->v_vsd_lock));
4285 vsd = vp->v_vsd;
4287 if (key && vsd != NULL && key <= vsd->vs_nkeys)
4288 return (vsd->vs_value[key - 1]);
4289 return (NULL);
4293 * Set a per vnode value indexed with the specified key
4294 * Assumes the caller is holding v_vsd_lock to protect the vsd.
4297 vsd_set(vnode_t *vp, uint_t key, void *value)
4299 struct vsd_node *vsd;
4301 ASSERT(vp != NULL);
4302 ASSERT(mutex_owned(&vp->v_vsd_lock));
4304 if (key == 0)
4305 return (EINVAL);
4307 vsd = vp->v_vsd;
4308 if (vsd == NULL)
4309 vsd = vp->v_vsd = kmem_zalloc(sizeof (*vsd), KM_SLEEP);
4312 * If the vsd was just allocated, vs_nkeys will be 0, so the following
4313 * code won't happen and we will continue down and allocate space for
4314 * the vs_value array.
4315 * If the caller is replacing one value with another, then it is up
4316 * to the caller to free/rele/destroy the previous value (if needed).
4318 if (key <= vsd->vs_nkeys) {
4319 vsd->vs_value[key - 1] = value;
4320 return (0);
4323 ASSERT(key <= vsd_nkeys);
4325 if (vsd->vs_nkeys == 0) {
4326 mutex_enter(&vsd_lock); /* lock out vsd_destroy() */
4328 * Link onto list of all VSD nodes.
4330 list_insert_head(vsd_list, vsd);
4331 mutex_exit(&vsd_lock);
4335 * Allocate vnode local storage and set the value for key
4337 vsd->vs_value = vsd_realloc(vsd->vs_value,
4338 vsd->vs_nkeys * sizeof (void *),
4339 key * sizeof (void *));
4340 vsd->vs_nkeys = key;
4341 vsd->vs_value[key - 1] = value;
4343 return (0);
4347 * Called from vn_free() to run the destructor function for each vsd
4348 * Locks out vsd_create and vsd_destroy
4349 * Assumes that the destructor *DOES NOT* use vsd
4351 void
4352 vsd_free(vnode_t *vp)
4354 int i;
4355 struct vsd_node *vsd = vp->v_vsd;
4357 if (vsd == NULL)
4358 return;
4360 if (vsd->vs_nkeys == 0) {
4361 kmem_free(vsd, sizeof (*vsd));
4362 vp->v_vsd = NULL;
4363 return;
4367 * lock out vsd_create and vsd_destroy, call
4368 * the destructor, and mark the value as destroyed.
4370 mutex_enter(&vsd_lock);
4372 for (i = 0; i < vsd->vs_nkeys; i++) {
4373 if (vsd->vs_value[i] && vsd_destructor[i])
4374 (*vsd_destructor[i])(vsd->vs_value[i]);
4375 vsd->vs_value[i] = NULL;
4379 * remove from linked list of VSD nodes
4381 list_remove(vsd_list, vsd);
4383 mutex_exit(&vsd_lock);
4386 * free up the VSD
4388 kmem_free(vsd->vs_value, vsd->vs_nkeys * sizeof (void *));
4389 kmem_free(vsd, sizeof (struct vsd_node));
4390 vp->v_vsd = NULL;
4394 * realloc
4396 static void *
4397 vsd_realloc(void *old, size_t osize, size_t nsize)
4399 void *new;
4401 new = kmem_zalloc(nsize, KM_SLEEP);
4402 if (old) {
4403 bcopy(old, new, osize);
4404 kmem_free(old, osize);
4406 return (new);
4410 * Setup the extensible system attribute for creating a reparse point.
4411 * The symlink data 'target' is validated for proper format of a reparse
4412 * string and a check also made to make sure the symlink data does not
4413 * point to an existing file.
4415 * return 0 if ok else -1.
4417 static int
4418 fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr)
4420 xoptattr_t *xoap;
4422 if ((!target) || (!vap) || (!xvattr))
4423 return (-1);
4425 /* validate reparse string */
4426 if (reparse_validate((const char *)target))
4427 return (-1);
4429 xva_init(xvattr);
4430 xvattr->xva_vattr = *vap;
4431 xvattr->xva_vattr.va_mask |= VATTR_XVATTR;
4432 xoap = xva_getxoptattr(xvattr);
4433 ASSERT(xoap);
4434 XVA_SET_REQ(xvattr, XAT_REPARSE);
4435 xoap->xoa_reparse = 1;
4437 return (0);
4441 * Function to check whether a symlink is a reparse point.
4442 * Return B_TRUE if it is a reparse point, else return B_FALSE
4444 boolean_t
4445 vn_is_reparse(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4447 xvattr_t xvattr;
4448 xoptattr_t *xoap;
4450 if ((vp->v_type != VLNK) ||
4451 !(vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR)))
4452 return (B_FALSE);
4454 xva_init(&xvattr);
4455 xoap = xva_getxoptattr(&xvattr);
4456 ASSERT(xoap);
4457 XVA_SET_REQ(&xvattr, XAT_REPARSE);
4459 if (fop_getattr(vp, &xvattr.xva_vattr, 0, cr, ct))
4460 return (B_FALSE);
4462 if ((!(xvattr.xva_vattr.va_mask & VATTR_XVATTR)) ||
4463 (!(XVA_ISSET_RTN(&xvattr, XAT_REPARSE))))
4464 return (B_FALSE);
4466 return (xoap->xoa_reparse ? B_TRUE : B_FALSE);