4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2017 by Delphix. All rights reserved.
26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
27 /* All Rights Reserved */
31 * This file supports the vfs operations for the NAMEFS file system.
34 #include <sys/types.h>
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/debug.h>
38 #include <sys/errno.h>
40 #include <sys/inline.h>
44 #include <sys/statvfs.h>
45 #include <sys/mount.h>
46 #include <sys/sysmacros.h>
49 #include <sys/vfs_opreg.h>
50 #include <sys/vnode.h>
53 #include <sys/signal.h>
57 #include <sys/fs/namenode.h>
58 #include <sys/stream.h>
59 #include <sys/strsubr.h>
60 #include <sys/cmn_err.h>
61 #include <sys/modctl.h>
62 #include <fs/fs_subr.h>
63 #include <sys/policy.h>
65 #include <sys/fs/sdev_impl.h>
67 #define NM_INOQUANT (64 * 1024)
70 * Define global data structures.
74 struct namenode
*nm_filevp_hash
[NM_FILEVP_HASH_SIZE
];
78 static vmem_t
*nm_inoarena
; /* vmem arena to allocate inode no's from */
79 static kmutex_t nm_inolock
;
81 vfsops_t
*namefs_vfsops
;
83 * Functions to allocate node id's starting from 1. Based on vmem routines.
84 * The vmem arena is extended in NM_INOQUANT chunks.
87 namenodeno_alloc(void)
91 mutex_enter(&nm_inolock
);
92 nno
= (uint64_t)(uintptr_t)
93 vmem_alloc(nm_inoarena
, 1, VM_NOSLEEP
+ VM_FIRSTFIT
);
95 (void) vmem_add(nm_inoarena
, (void *)(vmem_size(nm_inoarena
,
96 VMEM_ALLOC
| VMEM_FREE
) + 1), NM_INOQUANT
, VM_SLEEP
);
97 nno
= (uint64_t)(uintptr_t)
98 vmem_alloc(nm_inoarena
, 1, VM_SLEEP
+ VM_FIRSTFIT
);
101 mutex_exit(&nm_inolock
);
102 ASSERT32(nno
<= ULONG_MAX
);
107 namenodeno_init(void)
109 nm_inoarena
= vmem_create("namefs_inodes", (void *)1, NM_INOQUANT
, 1,
110 NULL
, NULL
, NULL
, 1, VM_SLEEP
);
111 mutex_init(&nm_inolock
, NULL
, MUTEX_DEFAULT
, NULL
);
115 namenodeno_free(uint64_t nn
)
117 void *vaddr
= (void *)(uintptr_t)nn
;
119 ASSERT32((uint64_t)(uintptr_t)vaddr
== nn
);
121 mutex_enter(&nm_inolock
);
122 vmem_free(nm_inoarena
, vaddr
, 1);
123 mutex_exit(&nm_inolock
);
127 * Insert a namenode into the nm_filevp_hash table.
129 * Each link has a unique namenode with a unique nm_mountvp field.
130 * The nm_filevp field of the namenode need not be unique, since a
131 * file descriptor may be mounted to multiple nodes at the same time.
132 * We hash on nm_filevp since that's what discriminates the searches
133 * in namefind() and nm_umountall().
136 nameinsert(struct namenode
*nodep
)
138 struct namenode
**bucket
;
140 ASSERT(MUTEX_HELD(&ntable_lock
));
142 bucket
= NM_FILEVP_HASH(nodep
->nm_filevp
);
143 nodep
->nm_nextp
= *bucket
;
148 * Remove a namenode from the hash table, if present.
151 nameremove(struct namenode
*nodep
)
153 struct namenode
*np
, **npp
;
155 ASSERT(MUTEX_HELD(&ntable_lock
));
157 for (npp
= NM_FILEVP_HASH(nodep
->nm_filevp
); (np
= *npp
) != NULL
;
158 npp
= &np
->nm_nextp
) {
167 * Search for a namenode that has a nm_filevp == vp and nm_mountpt == mnt.
168 * If mnt is NULL, return the first link with nm_filevp of vp.
169 * Returns namenode pointer on success, NULL on failure.
172 namefind(vnode_t
*vp
, vnode_t
*mnt
)
176 ASSERT(MUTEX_HELD(&ntable_lock
));
177 for (np
= *NM_FILEVP_HASH(vp
); np
!= NULL
; np
= np
->nm_nextp
)
178 if (np
->nm_filevp
== vp
&&
179 (mnt
== NULL
|| np
->nm_mountpt
== mnt
))
185 * Force the unmounting of a file descriptor from ALL of the nodes
186 * that it was mounted to.
187 * At the present time, the only usage for this routine is in the
188 * event one end of a pipe was mounted. At the time the unmounted
189 * end gets closed down, the mounted end is forced to be unmounted.
191 * This routine searches the namenode hash list for all namenodes
192 * that have a nm_filevp field equal to vp. Each time one is found,
193 * the dounmount() routine is called. This causes the nm_unmount()
194 * routine to be called and thus, the file descriptor is unmounted
197 * At the start of this routine, the reference count for vp is
198 * incremented to protect the vnode from being released in the
199 * event the mount was the only thing keeping the vnode active.
200 * If that is the case, the VOP_CLOSE operation is applied to
201 * the vnode, prior to it being released.
204 nm_umountall(vnode_t
*vp
, cred_t
*crp
)
207 struct namenode
*nodep
;
212 * For each namenode that is associated with the file:
213 * If the v_vfsp field is not namevfs, dounmount it. Otherwise,
214 * it was created in nm_open() and will be released in time.
215 * The following loop replicates some code from namefind(). That
216 * routine can't be used as is since the list isn't strictly
217 * consumed as it is traversed.
219 mutex_enter(&ntable_lock
);
220 nodep
= *NM_FILEVP_HASH(vp
);
222 if (nodep
->nm_filevp
== vp
&&
223 (vfsp
= NMTOV(nodep
)->v_vfsp
) != NULL
&&
224 vfsp
!= &namevfs
&& (NMTOV(nodep
)->v_flag
& VROOT
)) {
227 * If the vn_vfswlock fails, skip the vfs since
228 * somebody else may be unmounting it.
230 if (vn_vfswlock(vfsp
->vfs_vnodecovered
)) {
232 nodep
= nodep
->nm_nextp
;
237 * Can't hold ntable_lock across call to dounmount()
238 * because nm_unmount tries to acquire it. This means
239 * there is a window where another mount of vp can
240 * happen so it is possible that after nm_unmountall
241 * there are still some mounts. This situation existed
242 * without MT locking because dounmount can sleep
243 * so another mount could happen during that time.
244 * This situation is unlikely and doesn't really cause
247 mutex_exit(&ntable_lock
);
248 if ((error
= dounmount(vfsp
, 0, crp
)) != 0)
250 mutex_enter(&ntable_lock
);
252 * Since we dropped the ntable_lock, we
253 * have to start over from the beginning.
254 * If for some reason dounmount() fails,
255 * starting over from the beginning means that we will keep on
256 * trying unless another thread unmounts it for us.
258 nodep
= *NM_FILEVP_HASH(vp
);
260 nodep
= nodep
->nm_nextp
;
262 mutex_exit(&ntable_lock
);
267 * Force the unmounting of a file descriptor from ALL of the nodes
268 * that it was mounted to. XXX: fifo_close() calls this routine.
270 * nm_umountall() may return EBUSY.
271 * nm_unmountall() will keep on trying until it succeeds.
274 nm_unmountall(vnode_t
*vp
, cred_t
*crp
)
279 * nm_umountall() returns only if it succeeds or
280 * returns with error EBUSY. If EBUSY, that means
281 * it cannot acquire the lock on the covered vnode,
282 * and we will keep on trying.
285 error
= nm_umountall(vp
, crp
);
288 delay(1); /* yield cpu briefly, then try again */
294 * Mount a file descriptor onto the node in the file system.
295 * Create a new vnode, update the attributes with info from the
296 * file descriptor and the mount point. The mask, mode, uid, gid,
297 * atime, mtime and ctime are taken from the mountpt. Link count is
298 * set to one, the file system id is namedev and nodeid is unique
299 * for each mounted object. Other attributes are taken from mount point.
300 * Make sure user is owner (or root) with write permissions on mount point.
301 * Hash the new vnode and return 0.
302 * Upon entry to this routine, the file descriptor is in the
303 * fd field of a struct namefd. Copy that structure from user
304 * space and retrieve the file descriptor.
307 nm_mount(vfs_t
*vfsp
, vnode_t
*mvp
, struct mounta
*uap
, cred_t
*crp
)
309 struct namefd namefdp
;
310 struct vnode
*filevp
; /* file descriptor vnode */
312 struct vnode
*newvp
; /* vnode representing this mount */
313 struct vnode
*rvp
; /* realvp (if any) for the mountpt */
314 struct namenode
*nodep
; /* namenode for this mount */
315 struct vattr filevattr
; /* attributes of file desc. */
316 struct vattr
*vattrp
; /* attributes of this mount */
318 char *resource_nodetype
;
323 * Get the file descriptor from user space.
324 * Make sure the file descriptor is valid and has an
325 * associated file pointer.
326 * If so, extract the vnode from the file pointer.
328 if (uap
->datalen
!= sizeof (struct namefd
))
331 if (copyin(uap
->dataptr
, &namefdp
, uap
->datalen
))
334 if ((fp
= getf(namefdp
.fd
)) == NULL
)
338 * If the mount point already has something mounted
339 * on it, disallow this mount. (This restriction may
340 * be removed in a later release).
341 * Or unmount has completed but the namefs ROOT vnode
342 * count has not decremented to zero, disallow this mount.
345 mutex_enter(&mvp
->v_lock
);
346 if ((mvp
->v_flag
& VROOT
) ||
347 vfs_matchops(mvp
->v_vfsp
, namefs_vfsops
)) {
348 mutex_exit(&mvp
->v_lock
);
349 releasef(namefdp
.fd
);
352 mutex_exit(&mvp
->v_lock
);
355 * Cannot allow users to fattach() in /dev/pts or /dev/vt.
356 * First, there is no need for doing so and secondly
357 * we cannot allow arbitrary users to park on a node in
358 * /dev/pts or /dev/vt.
361 if (vn_matchops(mvp
, spec_getvnodeops()) &&
362 VOP_REALVP(mvp
, &rvp
, NULL
) == 0 && rvp
&&
363 (vn_matchops(rvp
, devpts_getvnodeops()) ||
364 vn_matchops(rvp
, devvt_getvnodeops()))) {
365 releasef(namefdp
.fd
);
369 filevp
= fp
->f_vnode
;
370 if (filevp
->v_type
== VDIR
|| filevp
->v_type
== VPORT
) {
371 releasef(namefdp
.fd
);
376 * If the fd being mounted refers to neither a door nor a stream,
377 * make sure the caller is privileged.
379 if (filevp
->v_type
!= VDOOR
&& filevp
->v_stream
== NULL
) {
380 if (secpolicy_fs_mount(crp
, filevp
, vfsp
) != 0) {
381 /* fd is neither a stream nor a door */
382 releasef(namefdp
.fd
);
388 * Make sure the file descriptor is not the root of some
390 * If it's not, create a reference and allocate a namenode
391 * to represent this mount request.
393 if (filevp
->v_flag
& VROOT
) {
394 releasef(namefdp
.fd
);
398 nodep
= kmem_zalloc(sizeof (struct namenode
), KM_SLEEP
);
400 mutex_init(&nodep
->nm_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
401 vattrp
= &nodep
->nm_vattr
;
402 vattrp
->va_mask
= AT_ALL
;
403 if (error
= VOP_GETATTR(mvp
, vattrp
, 0, crp
, NULL
))
406 filevattr
.va_mask
= AT_ALL
;
407 if (error
= VOP_GETATTR(filevp
, &filevattr
, 0, crp
, NULL
))
410 * Make sure the user is the owner of the mount point
411 * or has sufficient privileges.
413 if (error
= secpolicy_vnode_owner(crp
, vattrp
->va_uid
))
417 * Make sure the user has write permissions on the
418 * mount point (or has sufficient privileges).
420 if (secpolicy_vnode_access2(crp
, mvp
, vattrp
->va_uid
, vattrp
->va_mode
,
427 * If the file descriptor has file/record locking, don't
428 * allow the mount to succeed.
430 if (vn_has_flocks(filevp
)) {
436 * Initialize the namenode.
438 if (filevp
->v_stream
) {
439 struct stdata
*stp
= filevp
->v_stream
;
440 mutex_enter(&stp
->sd_lock
);
441 stp
->sd_flag
|= STRMOUNT
;
442 mutex_exit(&stp
->sd_lock
);
444 nodep
->nm_filevp
= filevp
;
445 mutex_enter(&fp
->f_tlock
);
447 mutex_exit(&fp
->f_tlock
);
449 releasef(namefdp
.fd
);
450 nodep
->nm_filep
= fp
;
451 nodep
->nm_mountpt
= mvp
;
454 * The attributes for the mounted file descriptor were initialized
455 * above by applying VOP_GETATTR to the mount point. Some of
456 * the fields of the attributes structure will be overwritten
457 * by the attributes from the file descriptor.
459 vattrp
->va_type
= filevattr
.va_type
;
460 vattrp
->va_fsid
= namedev
;
461 vattrp
->va_nodeid
= namenodeno_alloc();
462 vattrp
->va_nlink
= 1;
463 vattrp
->va_size
= filevattr
.va_size
;
464 vattrp
->va_rdev
= filevattr
.va_rdev
;
465 vattrp
->va_blksize
= filevattr
.va_blksize
;
466 vattrp
->va_nblocks
= filevattr
.va_nblocks
;
470 * Initialize new vnode structure for the mounted file descriptor.
472 nodep
->nm_vnode
= vn_alloc(KM_SLEEP
);
473 newvp
= NMTOV(nodep
);
475 newvp
->v_flag
= filevp
->v_flag
| VROOT
| VNOMAP
| VNOSWAP
;
476 vn_setops(newvp
, nm_vnodeops
);
477 newvp
->v_vfsp
= vfsp
;
478 newvp
->v_stream
= filevp
->v_stream
;
479 newvp
->v_type
= filevp
->v_type
;
480 newvp
->v_rdev
= filevp
->v_rdev
;
481 newvp
->v_data
= (caddr_t
)nodep
;
486 * Initialize the vfs structure.
488 vfsp
->vfs_vnodecovered
= NULL
;
489 vfsp
->vfs_flag
|= VFS_UNLINKABLE
;
490 vfsp
->vfs_bsize
= 1024;
491 vfsp
->vfs_fstype
= namefstype
;
492 vfs_make_fsid(&vfsp
->vfs_fsid
, namedev
, namefstype
);
493 vfsp
->vfs_data
= (caddr_t
)nodep
;
494 vfsp
->vfs_dev
= namedev
;
495 vfsp
->vfs_bcount
= 0;
498 * Set the name we mounted from.
500 switch (filevp
->v_type
) {
501 case VPROC
: /* VOP_GETATTR() translates this to VREG */
502 case VREG
: resource_nodetype
= "file"; break;
503 case VDIR
: resource_nodetype
= "directory"; break;
504 case VBLK
: resource_nodetype
= "device"; break;
505 case VCHR
: resource_nodetype
= "device"; break;
506 case VLNK
: resource_nodetype
= "link"; break;
507 case VFIFO
: resource_nodetype
= "fifo"; break;
508 case VDOOR
: resource_nodetype
= "door"; break;
509 case VSOCK
: resource_nodetype
= "socket"; break;
510 default: resource_nodetype
= "resource"; break;
513 #define RESOURCE_NAME_SZ 128 /* Maximum length of the resource name */
514 resource_name
= kmem_alloc(RESOURCE_NAME_SZ
, KM_SLEEP
);
515 svfsp
= kmem_alloc(sizeof (statvfs64_t
), KM_SLEEP
);
517 error
= VFS_STATVFS(filevp
->v_vfsp
, svfsp
);
519 (void) snprintf(resource_name
, RESOURCE_NAME_SZ
,
520 "unspecified_%s_%s", svfsp
->f_basetype
, resource_nodetype
);
522 (void) snprintf(resource_name
, RESOURCE_NAME_SZ
,
523 "unspecified_%s", resource_nodetype
);
526 vfs_setresource(vfsp
, resource_name
, 0);
528 kmem_free(svfsp
, sizeof (statvfs64_t
));
529 kmem_free(resource_name
, RESOURCE_NAME_SZ
);
530 #undef RESOURCE_NAME_SZ
533 * Insert the namenode.
535 mutex_enter(&ntable_lock
);
537 mutex_exit(&ntable_lock
);
540 releasef(namefdp
.fd
);
541 kmem_free(nodep
, sizeof (struct namenode
));
546 * Unmount a file descriptor from a node in the file system.
547 * If the user is not the owner of the file and is not privileged,
548 * the request is denied.
549 * Otherwise, remove the namenode from the hash list.
550 * If the mounted file descriptor was that of a stream and this
551 * was the last mount of the stream, turn off the STRMOUNT flag.
552 * If the rootvp is referenced other than through the mount,
553 * nm_inactive will clean up.
556 nm_unmount(vfs_t
*vfsp
, int flag
, cred_t
*crp
)
558 struct namenode
*nodep
= (struct namenode
*)vfsp
->vfs_data
;
559 vnode_t
*vp
, *thisvp
;
560 struct file
*fp
= NULL
;
562 ASSERT((nodep
->nm_flag
& NMNMNT
) == 0);
565 * Forced unmount is not supported by this file system,
566 * so ENOTSUP is returned.
568 if (flag
& MS_FORCE
) {
572 vp
= nodep
->nm_filevp
;
573 mutex_enter(&nodep
->nm_lock
);
574 if (secpolicy_vnode_owner(crp
, nodep
->nm_vattr
.va_uid
) != 0) {
575 mutex_exit(&nodep
->nm_lock
);
579 mutex_exit(&nodep
->nm_lock
);
581 mutex_enter(&ntable_lock
);
583 thisvp
= NMTOV(nodep
);
584 mutex_enter(&thisvp
->v_lock
);
585 VN_RELE_LOCKED(thisvp
);
586 if (thisvp
->v_count
== 0) {
587 fp
= nodep
->nm_filep
;
588 mutex_exit(&thisvp
->v_lock
);
592 namenodeno_free(nodep
->nm_vattr
.va_nodeid
);
593 kmem_free(nodep
, sizeof (struct namenode
));
595 thisvp
->v_flag
&= ~VROOT
;
596 mutex_exit(&thisvp
->v_lock
);
598 if (namefind(vp
, NULLVP
) == NULL
&& vp
->v_stream
) {
599 struct stdata
*stp
= vp
->v_stream
;
600 mutex_enter(&stp
->sd_lock
);
601 stp
->sd_flag
&= ~STRMOUNT
;
602 mutex_exit(&stp
->sd_lock
);
604 mutex_exit(&ntable_lock
);
611 * Create a reference to the root of a mounted file descriptor.
612 * This routine is called from lookupname() in the event a path
613 * is being searched that has a mounted file descriptor in it.
616 nm_root(vfs_t
*vfsp
, vnode_t
**vpp
)
618 struct namenode
*nodep
= (struct namenode
*)vfsp
->vfs_data
;
619 struct vnode
*vp
= NMTOV(nodep
);
627 * Return in sp the status of this file system.
630 nm_statvfs(vfs_t
*vfsp
, struct statvfs64
*sp
)
634 bzero(sp
, sizeof (*sp
));
637 (void) cmpldev(&d32
, vfsp
->vfs_dev
);
639 (void) strcpy(sp
->f_basetype
, vfssw
[vfsp
->vfs_fstype
].vsw_name
);
640 sp
->f_flag
= vf_to_stf(vfsp
->vfs_flag
);
645 * Since this file system has no disk blocks of its own, apply
646 * the VOP_FSYNC operation on the mounted file descriptor.
649 nm_sync(vfs_t
*vfsp
, short flag
, cred_t
*crp
)
651 struct namenode
*nodep
;
656 nodep
= (struct namenode
*)vfsp
->vfs_data
;
657 if (flag
& SYNC_CLOSE
)
658 return (nm_umountall(nodep
->nm_filevp
, crp
));
660 return (VOP_FSYNC(nodep
->nm_filevp
, FSYNC
, crp
, NULL
));
664 * File system initialization routine. Save the file system type,
665 * establish a file system device number and initialize nm_filevp_hash[].
668 nameinit(int fstype
, char *name
)
670 static const fs_operation_def_t nm_vfsops_template
[] = {
671 VFSNAME_MOUNT
, { .vfs_mount
= nm_mount
},
672 VFSNAME_UNMOUNT
, { .vfs_unmount
= nm_unmount
},
673 VFSNAME_ROOT
, { .vfs_root
= nm_root
},
674 VFSNAME_STATVFS
, { .vfs_statvfs
= nm_statvfs
},
675 VFSNAME_SYNC
, { .vfs_sync
= nm_sync
},
678 static const fs_operation_def_t nm_dummy_vfsops_template
[] = {
679 VFSNAME_STATVFS
, { .vfs_statvfs
= nm_statvfs
},
680 VFSNAME_SYNC
, { .vfs_sync
= nm_sync
},
685 vfsops_t
*dummy_vfsops
;
687 error
= vfs_setfsops(fstype
, nm_vfsops_template
, &namefs_vfsops
);
689 cmn_err(CE_WARN
, "nameinit: bad vfs ops template");
693 error
= vfs_makefsops(nm_dummy_vfsops_template
, &dummy_vfsops
);
695 (void) vfs_freevfsops_by_type(fstype
);
696 cmn_err(CE_WARN
, "nameinit: bad dummy vfs ops template");
700 error
= vn_make_ops(name
, nm_vnodeops_template
, &nm_vnodeops
);
702 (void) vfs_freevfsops_by_type(fstype
);
703 vfs_freevfsops(dummy_vfsops
);
704 cmn_err(CE_WARN
, "nameinit: bad vnode ops template");
710 if ((dev
= getudev()) == (major_t
)-1) {
711 cmn_err(CE_WARN
, "nameinit: can't get unique device");
714 mutex_init(&ntable_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
715 namedev
= makedevice(dev
, 0);
716 bzero(nm_filevp_hash
, sizeof (nm_filevp_hash
));
717 vfs_setops(&namevfs
, dummy_vfsops
);
718 namevfs
.vfs_vnodecovered
= NULL
;
719 namevfs
.vfs_bsize
= 1024;
720 namevfs
.vfs_fstype
= namefstype
;
721 vfs_make_fsid(&namevfs
.vfs_fsid
, namedev
, namefstype
);
722 namevfs
.vfs_dev
= namedev
;
726 static mntopts_t nm_mntopts
= {
731 static vfsdef_t vfw
= {
735 VSW_HASPROTO
| VSW_ZMOUNT
,
740 * Module linkage information for the kernel.
742 static struct modlfs modlfs
= {
743 &mod_fsops
, "filesystem for namefs", &vfw
746 static struct modlinkage modlinkage
= {
747 MODREV_1
, (void *)&modlfs
, NULL
754 return (mod_install(&modlinkage
));
764 _info(struct modinfo
*modinfop
)
766 return (mod_info(&modlinkage
, modinfop
));