4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2015 Joyent, Inc.
26 #include <sys/types.h>
27 #include <sys/param.h>
28 #include <sys/sysmacros.h>
31 #include <sys/pathname.h>
33 #include <sys/vnode.h>
37 #include <sys/errno.h>
38 #include <sys/cmn_err.h>
40 #include <sys/statvfs.h>
41 #include <sys/mount.h>
42 #include <sys/debug.h>
43 #include <sys/systm.h>
44 #include <sys/mntent.h>
45 #include <sys/fs_subr.h>
48 #include <sys/model.h>
49 #include <sys/policy.h>
51 #include <sys/fs/swapnode.h>
52 #include <sys/fs/tmp.h>
53 #include <sys/fs/tmpnode.h>
/* Filesystem type index for tmpfs; recorded by tmpfsinit(). */
static int tmpfsfstype;

/*
 * tmpfs vfs operations.
 */
static int tmpfsinit(int fstype, char *name);
static int tmp_mount(struct vfs *vfsp, struct vnode *mvp,
    struct mounta *uap, struct cred *cr);
static int tmp_unmount(struct vfs *vfsp, int flag, struct cred *cr);
static int tmp_root(struct vfs *vfsp, struct vnode **vpp);
static int tmp_statvfs(struct vfs *vfsp, struct statvfs64 *sbp);
static int tmp_vget(struct vfs *vfsp, struct vnode **vpp, struct fid *fidp);
69 * Loadable module wrapper
71 #include <sys/modctl.h>
73 static mntopts_t tmpfs_proto_opttbl
;
75 static vfsdef_t vfw
= {
79 VSW_HASPROTO
|VSW_CANREMOUNT
|VSW_STATS
|VSW_ZMOUNT
,
84 * in-kernel mnttab options
86 static char *xattr_cancel
[] = { MNTOPT_NOXATTR
, NULL
};
87 static char *noxattr_cancel
[] = { MNTOPT_XATTR
, NULL
};
89 static mntopt_t tmpfs_options
[] = {
90 /* Option name Cancel Opt Arg Flags Data */
91 { MNTOPT_XATTR
, xattr_cancel
, NULL
, MO_DEFAULT
, NULL
},
92 { MNTOPT_NOXATTR
, noxattr_cancel
, NULL
, 0, NULL
},
93 { "size", NULL
, "0", MO_HASVALUE
, NULL
},
94 { "mode", NULL
, NULL
, MO_HASVALUE
, NULL
},
98 static mntopts_t tmpfs_proto_opttbl
= {
99 sizeof (tmpfs_options
) / sizeof (mntopt_t
),
104 * Module linkage information
106 static struct modlfs modlfs
= {
107 &mod_fsops
, "filesystem for tmpfs", &vfw
110 static struct modlinkage modlinkage
= {
111 MODREV_1
, &modlfs
, NULL
117 return (mod_install(&modlinkage
));
125 error
= mod_remove(&modlinkage
);
129 * Tear down the operations vectors
131 (void) vfs_freevfsops_by_type(tmpfsfstype
);
136 _info(struct modinfo
*modinfop
)
138 return (mod_info(&modlinkage
, modinfop
));
142 * The following are patchable variables limiting the amount of system
143 * resources tmpfs can use.
145 * tmpfs_maxkmem limits the amount of kernel kmem_alloc memory
146 * tmpfs can use for its data structures (e.g. tmpnodes, directory entries)
147 * It is not determined by setting a hard limit but rather as a percentage of
148 * physical memory which is determined when tmpfs is first used in the system.
150 * tmpfs_minfree is the minimum amount of swap space that tmpfs leaves for
151 * the rest of the system. In other words, if the amount of free swap space
152 * in the system (i.e. anoninfo.ani_free) drops below tmpfs_minfree, tmpfs
153 * anon allocations will fail.
155 * There is also a per mount limit on the amount of swap space
156 * (tmount.tm_anonmax) settable via a mount option.
158 size_t tmpfs_maxkmem
= 0;
159 size_t tmpfs_minfree
= 0;
160 size_t tmp_kmemspace
; /* bytes of kernel heap used by all tmpfs */
162 static major_t tmpfs_major
;
163 static minor_t tmpfs_minor
;
164 static kmutex_t tmpfs_minor_lock
;
166 static const struct vfsops tmp_vfsops
= {
167 .vfs_mount
= tmp_mount
,
168 .vfs_unmount
= tmp_unmount
,
169 .vfs_root
= tmp_root
,
170 .vfs_statvfs
= tmp_statvfs
,
171 .vfs_vget
= tmp_vget
,
175 * initialize global tmpfs locks and such
176 * called when loading tmpfs module
179 tmpfsinit(int fstype
, char *name
)
182 extern void tmpfs_hash_init();
185 tmpfsfstype
= fstype
;
186 ASSERT(tmpfsfstype
!= 0);
188 error
= vfs_setfsops(fstype
, &tmp_vfsops
);
190 cmn_err(CE_WARN
, "tmpfsinit: bad fstype");
195 * tmpfs_minfree doesn't need to be some function of configured
196 * swap space since it really is an absolute limit of swap space
197 * which still allows other processes to execute.
199 if (tmpfs_minfree
== 0) {
203 tmpfs_minfree
= btopr(TMPMINFREE
);
207 * The maximum amount of kernel memory tmpfs can allocate is
208 * 1/TMPMAXFRACKMEM of kmem_maxavail(), but never less than PAGESIZE
210 if (tmpfs_maxkmem
== 0)
211 tmpfs_maxkmem
= MAX(PAGESIZE
, kmem_maxavail() / TMPMAXFRACKMEM
);
213 if ((tmpfs_major
= getudev()) == (major_t
)-1) {
214 cmn_err(CE_WARN
, "tmpfsinit: Can't get unique device number.");
217 mutex_init(&tmpfs_minor_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
222 tmp_mount(vfs_t
*vfsp
, vnode_t
*mvp
, struct mounta
*uap
, cred_t
*cr
)
224 struct tmount
*tm
= NULL
;
231 boolean_t mode_arg
= B_FALSE
;
232 mode_t root_mode
= 0777;
235 if ((error
= secpolicy_fs_mount(cr
, mvp
, vfsp
)) != 0)
238 if (mvp
->v_type
!= VDIR
)
241 mutex_enter(&mvp
->v_lock
);
242 if ((uap
->flags
& MS_REMOUNT
) == 0 && (uap
->flags
& MS_OVERLAY
) == 0 &&
243 (mvp
->v_count
!= 1 || (mvp
->v_flag
& VROOT
))) {
244 mutex_exit(&mvp
->v_lock
);
247 mutex_exit(&mvp
->v_lock
);
250 * Having the resource be anything but "swap" doesn't make sense.
252 vfs_setresource(vfsp
, "swap", 0);
255 * now look for options we understand...
258 /* tmpfs doesn't support read-only mounts */
259 if (vfs_optionisset(vfsp
, MNTOPT_RO
, NULL
)) {
265 * tm_anonmax is set according to the mount arguments
266 * if any. Otherwise, it is set to a maximum value.
268 if (vfs_optionisset(vfsp
, "size", &argstr
)) {
269 if ((error
= tmp_convnum(argstr
, &anonmax
)) != 0)
276 * The "mode" mount argument allows the operator to override the
277 * permissions of the root of the tmpfs mount.
279 if (vfs_optionisset(vfsp
, "mode", &argstr
)) {
280 if ((error
= tmp_convmode(argstr
, &root_mode
)) != 0) {
286 if (error
= pn_get(uap
->dir
,
287 (uap
->flags
& MS_SYSSPACE
) ? UIO_SYSSPACE
: UIO_USERSPACE
, &dpn
))
290 if (uap
->flags
& MS_REMOUNT
) {
291 tm
= (struct tmount
*)VFSTOTM(vfsp
);
294 * If we change the size so it's less than what is currently
295 * being used, we allow that. The file system will simply be
296 * full until enough files have been removed to get below the
299 mutex_enter(&tm
->tm_contents
);
300 tm
->tm_anonmax
= anonmax
;
301 mutex_exit(&tm
->tm_contents
);
305 if ((tm
= tmp_memalloc(sizeof (struct tmount
), 0)) == NULL
) {
312 * find an available minor device number for this mount
314 mutex_enter(&tmpfs_minor_lock
);
316 tmpfs_minor
= (tmpfs_minor
+ 1) & L_MAXMIN32
;
317 tm
->tm_dev
= makedevice(tmpfs_major
, tmpfs_minor
);
318 } while (vfs_devismounted(tm
->tm_dev
));
319 mutex_exit(&tmpfs_minor_lock
);
322 * Set but don't bother entering the mutex
323 * (tmount not on mount list yet)
325 mutex_init(&tm
->tm_contents
, NULL
, MUTEX_DEFAULT
, NULL
);
326 mutex_init(&tm
->tm_renamelck
, NULL
, MUTEX_DEFAULT
, NULL
);
329 tm
->tm_anonmax
= anonmax
;
331 vfsp
->vfs_data
= (caddr_t
)tm
;
332 vfsp
->vfs_fstype
= tmpfsfstype
;
333 vfsp
->vfs_dev
= tm
->tm_dev
;
334 vfsp
->vfs_bsize
= PAGESIZE
;
335 vfsp
->vfs_flag
|= VFS_NOTRUNC
;
336 vfs_make_fsid(&vfsp
->vfs_fsid
, tm
->tm_dev
, tmpfsfstype
);
337 tm
->tm_mntpath
= tmp_memalloc(dpn
.pn_pathlen
+ 1, TMP_MUSTHAVE
);
338 (void) strcpy(tm
->tm_mntpath
, dpn
.pn_path
);
341 * allocate and initialize root tmpnode structure
343 bzero(&rattr
, sizeof (struct vattr
));
344 rattr
.va_mode
= (mode_t
)(S_IFDIR
| root_mode
);
345 rattr
.va_type
= VDIR
;
347 tp
= tmp_memalloc(sizeof (struct tmpnode
), TMP_MUSTHAVE
);
348 tmpnode_init(tm
, tp
, &rattr
, cr
);
351 * Get the mode, uid, and gid from the underlying mount point.
353 rattr
.va_mask
= AT_MODE
|AT_UID
|AT_GID
; /* Hint to getattr */
354 got_attrs
= fop_getattr(mvp
, &rattr
, 0, cr
, NULL
);
356 rw_enter(&tp
->tn_rwlock
, RW_WRITER
);
357 TNTOV(tp
)->v_flag
|= VROOT
;
360 * If the getattr succeeded, use its results. Otherwise allow
361 * the previously set hardwired defaults to prevail.
363 if (got_attrs
== 0) {
366 * Only use the underlying mount point for the
367 * mode if the "mode" mount argument was not
370 tp
->tn_mode
= rattr
.va_mode
;
372 tp
->tn_uid
= rattr
.va_uid
;
373 tp
->tn_gid
= rattr
.va_gid
;
377 * initialize linked list of tmpnodes so that the back pointer of
378 * the root tmpnode always points to the last one on the list
379 * and the forward pointer of the last node is null
384 tm
->tm_rootnode
= tp
;
388 rw_exit(&tp
->tn_rwlock
);
395 vfs_set_feature(vfsp
, VFSFT_SYSATTR_VIEWS
);
401 tmp_unmount(struct vfs
*vfsp
, int flag
, struct cred
*cr
)
403 struct tmount
*tm
= (struct tmount
*)VFSTOTM(vfsp
);
404 struct tmpnode
*tnp
, *cancel
;
408 if ((error
= secpolicy_fs_unmount(cr
, vfsp
)) != 0)
412 * Forced unmount is not supported by this file system,
413 * so ENOTSUP is returned.
418 mutex_enter(&tm
->tm_contents
);
421 * If there are no open files, only the root node should have
423 * With tm_contents held, nothing can be added or removed.
424 * There may be some dirty pages. To prevent fsflush from
425 * disrupting the unmount, put a hold on each node while scanning.
426 * If we find a previously referenced node, undo the holds we have
427 * placed and fail EBUSY.
429 tnp
= tm
->tm_rootnode
;
430 if (TNTOV(tnp
)->v_count
> 1) {
431 mutex_exit(&tm
->tm_contents
);
435 for (tnp
= tnp
->tn_forw
; tnp
; tnp
= tnp
->tn_forw
) {
436 if ((vp
= TNTOV(tnp
))->v_count
> 0) {
437 cancel
= tm
->tm_rootnode
->tn_forw
;
438 while (cancel
!= tnp
) {
440 ASSERT(vp
->v_count
> 0);
442 cancel
= cancel
->tn_forw
;
444 mutex_exit(&tm
->tm_contents
);
451 * We can drop the mutex now because no one can find this mount
453 mutex_exit(&tm
->tm_contents
);
456 * Free all kmemalloc'd and anonalloc'd memory associated with
457 * this filesystem. To do this, we go through the file list twice,
458 * once to remove all the directory entries, and then to remove
459 * all the files. We do this because there is useful code in
460 * tmpnode_free which assumes that the directory entry has been
461 * removed before the file.
464 * Remove all directory entries
466 for (tnp
= tm
->tm_rootnode
; tnp
; tnp
= tnp
->tn_forw
) {
467 rw_enter(&tnp
->tn_rwlock
, RW_WRITER
);
468 if (tnp
->tn_type
== VDIR
)
470 if (tnp
->tn_vnode
->v_flag
& V_XATTRDIR
) {
472 * Account for implicit attrdir reference.
474 ASSERT(tnp
->tn_nlink
> 0);
475 DECR_COUNT(&tnp
->tn_nlink
, &tnp
->tn_tlock
);
477 rw_exit(&tnp
->tn_rwlock
);
480 ASSERT(tm
->tm_rootnode
);
483 * All links are gone, v_count is keeping nodes in place.
484 * VN_RELE should make the node disappear, unless somebody
485 * is holding pages against it. Nap and retry until it disappears.
487 * We re-acquire the lock to prevent others who have a HOLD on
488 * a tmpnode via its pages or anon slots from blowing it away
489 * (in tmp_inactive) while we're trying to get to it here. Once
490 * we have a HOLD on it we know it'll stick around.
493 mutex_enter(&tm
->tm_contents
);
495 * Remove all the files (except the rootnode) backwards.
497 while ((tnp
= tm
->tm_rootnode
->tn_back
) != tm
->tm_rootnode
) {
498 mutex_exit(&tm
->tm_contents
);
500 * Inhibit tmp_inactive from touching attribute directory
501 * as all nodes will be released here.
502 * Note we handled the link count in pass 2 above.
504 rw_enter(&tnp
->tn_rwlock
, RW_WRITER
);
505 tnp
->tn_xattrdp
= NULL
;
506 rw_exit(&tnp
->tn_rwlock
);
509 mutex_enter(&tm
->tm_contents
);
511 * It's still there after the RELE. Someone else like pageout
512 * has a hold on it so wait a bit and then try again - we know
513 * they'll give it up soon.
515 if (tnp
== tm
->tm_rootnode
->tn_back
) {
517 mutex_exit(&tm
->tm_contents
);
519 mutex_enter(&tm
->tm_contents
);
522 mutex_exit(&tm
->tm_contents
);
524 tm
->tm_rootnode
->tn_xattrdp
= NULL
;
525 VN_RELE(TNTOV(tm
->tm_rootnode
));
527 ASSERT(tm
->tm_mntpath
);
529 tmp_memfree(tm
->tm_mntpath
, strlen(tm
->tm_mntpath
) + 1);
531 ASSERT(tm
->tm_anonmem
== 0);
533 mutex_destroy(&tm
->tm_contents
);
534 mutex_destroy(&tm
->tm_renamelck
);
535 tmp_memfree(tm
, sizeof (struct tmount
));
541 * return root tmpnode for given vfs
544 tmp_root(struct vfs
*vfsp
, struct vnode
**vpp
)
546 struct tmount
*tm
= (struct tmount
*)VFSTOTM(vfsp
);
547 struct tmpnode
*tp
= tm
->tm_rootnode
;
559 tmp_statvfs(struct vfs
*vfsp
, struct statvfs64
*sbp
)
561 struct tmount
*tm
= (struct tmount
*)VFSTOTM(vfsp
);
568 * The file system may have been mounted by the global zone on
569 * behalf of the non-global zone. In that case, the tmount zone_id
570 * will be the global zone. We still want to show the swap cap inside
571 * the zone in this case, even though the file system was mounted by
574 if (curproc
->p_zone
->zone_id
!= GLOBAL_ZONEUNIQID
)
575 zp
= curproc
->p_zone
;
577 zp
= tm
->tm_vfsp
->vfs_zone
;
580 eff_zid
= GLOBAL_ZONEUNIQID
;
582 eff_zid
= zp
->zone_id
;
584 sbp
->f_bsize
= PAGESIZE
;
585 sbp
->f_frsize
= PAGESIZE
;
588 * Find the amount of available physical and memory swap
590 mutex_enter(&anoninfo_lock
);
591 ASSERT(k_anoninfo
.ani_max
>= k_anoninfo
.ani_phys_resv
);
592 blocks
= (ulong_t
)CURRENT_TOTAL_AVAILABLE_SWAP
;
593 mutex_exit(&anoninfo_lock
);
596 * If tm_anonmax for this mount is less than the available swap space
597 * (minus the amount tmpfs can't use), use that instead
599 if (blocks
> tmpfs_minfree
)
600 sbp
->f_bfree
= MIN(blocks
- tmpfs_minfree
,
601 tm
->tm_anonmax
- tm
->tm_anonmem
);
605 sbp
->f_bavail
= sbp
->f_bfree
;
608 * Total number of blocks is what's available plus what's been used
610 sbp
->f_blocks
= (fsblkcnt64_t
)(sbp
->f_bfree
+ tm
->tm_anonmem
);
612 if (eff_zid
!= GLOBAL_ZONEUNIQID
&&
613 zp
->zone_max_swap_ctl
!= UINT64_MAX
) {
615 * If the fs is used by a non-global zone with a swap cap,
616 * then report the capped size.
618 rctl_qty_t cap
, used
;
619 pgcnt_t pgcap
, pgused
;
621 mutex_enter(&zp
->zone_mem_lock
);
622 cap
= zp
->zone_max_swap_ctl
;
623 used
= zp
->zone_max_swap
;
624 mutex_exit(&zp
->zone_mem_lock
);
629 sbp
->f_bfree
= MIN(pgcap
- pgused
, sbp
->f_bfree
);
630 sbp
->f_bavail
= sbp
->f_bfree
;
631 sbp
->f_blocks
= MIN(pgcap
, sbp
->f_blocks
);
635 * The maximum number of files available is approximately the number
636 * of tmpnodes we can allocate from the remaining kernel memory
637 * available to tmpfs. This is fairly inaccurate since it doesn't
638 * take into account the names stored in the directory entries.
640 if (tmpfs_maxkmem
> tmp_kmemspace
)
641 sbp
->f_ffree
= (tmpfs_maxkmem
- tmp_kmemspace
) /
642 (sizeof (struct tmpnode
) + sizeof (struct tdirent
));
646 sbp
->f_files
= tmpfs_maxkmem
/
647 (sizeof (struct tmpnode
) + sizeof (struct tdirent
));
648 sbp
->f_favail
= (fsfilcnt64_t
)(sbp
->f_ffree
);
649 (void) cmpldev(&d32
, vfsp
->vfs_dev
);
651 (void) strcpy(sbp
->f_basetype
, vfssw
[tmpfsfstype
].vsw_name
);
652 (void) strncpy(sbp
->f_fstr
, tm
->tm_mntpath
, sizeof (sbp
->f_fstr
));
654 * ensure null termination
656 sbp
->f_fstr
[sizeof (sbp
->f_fstr
) - 1] = '\0';
657 sbp
->f_flag
= vf_to_stf(vfsp
->vfs_flag
);
658 sbp
->f_namemax
= MAXNAMELEN
- 1;
663 tmp_vget(struct vfs
*vfsp
, struct vnode
**vpp
, struct fid
*fidp
)
666 struct tmount
*tm
= (struct tmount
*)VFSTOTM(vfsp
);
667 struct tmpnode
*tp
= NULL
;
669 tfid
= (struct tfid
*)fidp
;
672 mutex_enter(&tm
->tm_contents
);
673 for (tp
= tm
->tm_rootnode
; tp
; tp
= tp
->tn_forw
) {
674 mutex_enter(&tp
->tn_tlock
);
675 if (tp
->tn_nodeid
== tfid
->tfid_ino
) {
677 * If the gen numbers don't match we know the
678 * file won't be found since only one tmpnode
679 * can have this number at a time.
681 if (tp
->tn_gen
!= tfid
->tfid_gen
|| tp
->tn_nlink
== 0) {
682 mutex_exit(&tp
->tn_tlock
);
683 mutex_exit(&tm
->tm_contents
);
686 *vpp
= (struct vnode
*)TNTOV(tp
);
690 if ((tp
->tn_mode
& S_ISVTX
) &&
691 !(tp
->tn_mode
& (S_IXUSR
| S_IFDIR
))) {
692 mutex_enter(&(*vpp
)->v_lock
);
693 (*vpp
)->v_flag
|= VISSWAP
;
694 mutex_exit(&(*vpp
)->v_lock
);
696 mutex_exit(&tp
->tn_tlock
);
697 mutex_exit(&tm
->tm_contents
);
700 mutex_exit(&tp
->tn_tlock
);
702 mutex_exit(&tm
->tm_contents
);