/*
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2015, Joyent, Inc. All rights reserved.
 * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
 * Copyright 2016 RackTop Systems.
 * Copyright (c) 2017 by Delphix. All rights reserved.
 */
#include <sys/types.h>
#include <sys/param.h>
#include <sys/t_lock.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/vfs_opreg.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/errno.h>
#include <sys/dirent.h>
#include <sys/pathname.h>
#include <sys/vmsystm.h>
#include <sys/fs/tmp.h>
#include <sys/fs/tmpnode.h>
#include <vm/seg_vn.h>
#include <vm/seg_map.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/vtrace.h>
#include <sys/policy.h>
#include <fs/fs_subr.h>
static int	tmp_getapage(struct vnode *, u_offset_t, size_t, uint_t *,
	page_t **, size_t, struct seg *, caddr_t, enum seg_rw, struct cred *);
static int	tmp_putapage(struct vnode *, page_t *, u_offset_t *, size_t *,
tmp_open(struct vnode **vpp, int flag, struct cred *cred, caller_context_t *ct)
	/*
	 * swapon to a tmpfs file is not supported so access
	 * is denied on open if VISSWAP is set.
	 */
	if ((*vpp)->v_flag & VISSWAP)
	caller_context_t *ct)
	cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
	cleanshares(vp, ttoproc(curthread)->p_pid);
/*
 * wrtmp does the real work of write requests for tmpfs.
 */
	struct caller_context *ct)
	pgcnt_t pageoffset;	/* offset in pages */
	ulong_t segmap_offset;	/* pagesize byte offset into segmap */
	caddr_t base;		/* base of segmap */
	ssize_t bytes;		/* bytes to uiomove */
	pfn_t pagenumber;	/* offset in pages into tmp file */
	int pagecreate;		/* == 1 if we allocated a page */
	rlim64_t limit = uio->uio_llimit;
	long oresid = uio->uio_resid;
	long tn_size_changed = 0;

	ASSERT(vp->v_type == VREG);

	TRACE_1(TR_FAC_TMPFS, TR_TMPFS_RWTMP_START,
	    "tmp_wrtmp_start:vp %p", vp);

	ASSERT(RW_WRITE_HELD(&tp->tn_contents));
	ASSERT(RW_WRITE_HELD(&tp->tn_rwlock));

	if (MANDLOCK(vp, tp->tn_mode)) {
		rw_exit(&tp->tn_contents);
		/*
		 * tmp_getattr ends up being called by chklock
		 */
		error = chklock(vp, FWRITE, uio->uio_loffset, uio->uio_resid,
		rw_enter(&tp->tn_contents, RW_WRITER);
			TRACE_2(TR_FAC_TMPFS, TR_TMPFS_RWTMP_END,
			    "tmp_wrtmp_end:vp %p error %d", vp, error);

	if (uio->uio_loffset < 0)

	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)

	if (uio->uio_loffset >= limit) {
		proc_t *p = ttoproc(curthread);

		mutex_enter(&p->p_lock);
		(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], p->p_rctls,
		    p, RCA_UNSAFE_SIGINFO);
		mutex_exit(&p->p_lock);

	if (uio->uio_loffset >= MAXOFF_T) {
		TRACE_2(TR_FAC_TMPFS, TR_TMPFS_RWTMP_END,
		    "tmp_wrtmp_end:vp %p error %d", vp, EINVAL);

	if (uio->uio_resid == 0) {
		TRACE_2(TR_FAC_TMPFS, TR_TMPFS_RWTMP_END,
		    "tmp_wrtmp_end:vp %p error %d", vp, 0);

	if (limit > MAXOFF_T)

		offset = (long)uio->uio_offset;
		pageoffset = offset & PAGEOFFSET;
		/*
		 * A maximum of PAGESIZE bytes of data is transferred
		 * each pass through this loop
		 */
		bytes = MIN(PAGESIZE - pageoffset, uio->uio_resid);
		if (offset + bytes >= limit) {
			if (offset >= limit) {
			bytes = limit - offset;
		pagenumber = btop(offset);

		/*
		 * delta is the amount of anonymous memory
		 * to reserve for the file.
		 * We always reserve in pagesize increments so
		 * unless we're extending the file into a new page,
		 * we don't need to call tmp_resv.
		 */
		delta = offset + bytes -
		    P2ROUNDUP_TYPED(tp->tn_size, PAGESIZE, u_offset_t);
		if (tmp_resv(tm, tp, delta, pagecreate)) {
			/*
			 * Log file system full in the zone that owns
			 * the tmpfs mount, as well as in the global
			 */
			zcmn_err(tm->tm_vfsp->vfs_zone->zone_id,
			    CE_WARN, "%s: File system full, "
			    "swap space limit exceeded",
			if (tm->tm_vfsp->vfs_zone->zone_id !=
				vfs_t *vfs = tm->tm_vfsp;

				zcmn_err(GLOBAL_ZONEID,
				    CE_WARN, "%s: File system full, "
				    "swap space limit exceeded",
				    vfs->vfs_vnodecovered->v_path);
		tmpnode_growmap(tp, (ulong_t)offset + bytes);

		/* grow the file to the new length */
		if (offset + bytes > tp->tn_size) {
			old_tn_size = tp->tn_size;
			/*
			 * Postpone updating tp->tn_size until uiomove() is
			 */
			new_tn_size = offset + bytes;
		if (bytes == PAGESIZE) {
			/*
			 * Writing whole page so reading from disk
			 */
			/*
			 * If writing past EOF or filling in a hole
			 * we need to allocate an anon slot.
			 */
			if (anon_get_ptr(tp->tn_anon, pagenumber) == NULL) {
				(void) anon_set_ptr(tp->tn_anon, pagenumber,
				    anon_alloc(vp, ptob(pagenumber)), ANON_SLEEP);

		/*
		 * We have to drop the contents lock to allow the VM
		 * system to reacquire it in tmp_getpage()
		 */
		rw_exit(&tp->tn_contents);

		/*
		 * Touch the page and fault it in if it is not in core
		 * before segmap_getmapflt or vpm_data_copy can lock it.
		 * This is to avoid the deadlock if the buffer is mapped
		 * to the same file through mmap which we want to write.
		 */
		uio_prefaultpages((long)bytes, uio);
			/*
			 * Copy data. If new pages are created, part of
			 * the page that is not written will be initialized
			 */
			error = vpm_data_copy(vp, offset, bytes, uio,
			    !pagecreate, &newpage, 1, S_WRITE);
			/* Get offset within the segmap mapping */
			segmap_offset = (offset & PAGEMASK) & MAXBOFFSET;
			base = segmap_getmapflt(segkmap, vp,
			    (offset & MAXBMASK), PAGESIZE, !pagecreate,

		if (!vpm_enable && pagecreate) {
			/*
			 * segmap_pagecreate() returns 1 if it calls
			 * page_create_va() to allocate any pages.
			 */
			newpage = segmap_pagecreate(segkmap,
			    base + segmap_offset, (size_t)PAGESIZE, 0);
			/*
			 * Clear from the beginning of the page to the starting
			 * offset of the data.
			 */
				(void) kzero(base + segmap_offset,
		error = uiomove(base + segmap_offset + pageoffset,
		    (long)bytes, UIO_WRITE, uio);

		if (!vpm_enable && pagecreate &&
		    uio->uio_offset < P2ROUNDUP(offset + bytes, PAGESIZE)) {
			long zoffset;	/* zero from offset into page */
			/*
			 * We created pages w/o initializing them completely,
			 * thus we need to zero the part that wasn't set up.
			 * This happens on most EOF write cases and if
			 * we had some sort of error during the uiomove.
			 */
			nmoved = uio->uio_offset - offset;
			ASSERT((nmoved + pageoffset) <= PAGESIZE);

			/*
			 * Zero from the end of data in the page to the
			 */
			if ((zoffset = pageoffset + nmoved) < PAGESIZE)
				(void) kzero(base + segmap_offset + zoffset,
				    (size_t)PAGESIZE - zoffset);

		/*
		 * Unlock the pages which have been allocated by
		 * page_create_va() in segmap_pagecreate()
		 */
		if (!vpm_enable && newpage) {
			segmap_pageunlock(segkmap, base + segmap_offset,
			    (size_t)PAGESIZE, S_WRITE);

			/*
			 * If we failed on a write, we must
			 * be sure to invalidate any pages that may have
			 */
				(void) vpm_sync_pages(vp, offset, PAGESIZE,
				(void) segmap_release(segkmap, base, SM_INVAL);
				error = vpm_sync_pages(vp, offset, PAGESIZE,
				error = segmap_release(segkmap, base, 0);
		/*
		 * Re-acquire contents lock.
		 */
		rw_enter(&tp->tn_contents, RW_WRITER);

			tp->tn_size = new_tn_size;

		/*
		 * If the uiomove failed, fix up tn_size.
		 */
			if (tn_size_changed) {
				/*
				 * The uiomove failed, and we
				 * allocated blocks, so get rid
				 * of them.
				 */
				(void) tmpnode_trunc(tm, tp,
				    (ulong_t)old_tn_size);
		/*
		 * XXX - Can this be out of the loop?
		 */
		if ((tp->tn_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) &&
		    (tp->tn_mode & (S_ISUID | S_ISGID)) &&
		    secpolicy_vnode_setid_retain(cr,
		    (tp->tn_mode & S_ISUID) != 0 && tp->tn_uid == 0)) {
			/*
			 * Clear Set-UID & Set-GID bits on
			 * successful write if not privileged
			 * and at least one of the execute bits
			 * is set. If we always clear Set-GID,
			 * mandatory file and record locking is
			 */
			tp->tn_mode &= ~(S_ISUID | S_ISGID);
	} while (error == 0 && uio->uio_resid > 0 && bytes != 0);

	/*
	 * If we've already done a partial-write, terminate
	 * the write but return no error.
	 */
	if (oresid != uio->uio_resid)
	TRACE_2(TR_FAC_TMPFS, TR_TMPFS_RWTMP_END,
	    "tmp_wrtmp_end:vp %p error %d", vp, error);
/*
 * rdtmp does the real work of read requests for tmpfs.
 */
	struct caller_context *ct)
	ulong_t pageoffset;	/* offset in tmpfs file (uio_offset) */
	ulong_t segmap_offset;	/* pagesize byte offset into segmap */
	caddr_t base;		/* base of segmap */
	ssize_t bytes;		/* bytes to uiomove */
	long oresid = uio->uio_resid;

	TRACE_1(TR_FAC_TMPFS, TR_TMPFS_RWTMP_START, "tmp_rdtmp_start:vp %p",

	ASSERT(RW_LOCK_HELD(&tp->tn_contents));

	if (MANDLOCK(vp, tp->tn_mode)) {
		rw_exit(&tp->tn_contents);
		/*
		 * tmp_getattr ends up being called by chklock
		 */
		error = chklock(vp, FREAD, uio->uio_loffset, uio->uio_resid,
		rw_enter(&tp->tn_contents, RW_READER);
			TRACE_2(TR_FAC_TMPFS, TR_TMPFS_RWTMP_END,
			    "tmp_rdtmp_end:vp %p error %d", vp, error);

	ASSERT(tp->tn_type == VREG);

	if (uio->uio_loffset >= MAXOFF_T) {
		TRACE_2(TR_FAC_TMPFS, TR_TMPFS_RWTMP_END,
		    "tmp_rdtmp_end:vp %p error %d", vp, EINVAL);
	if (uio->uio_loffset < 0)
	if (uio->uio_resid == 0) {
		TRACE_2(TR_FAC_TMPFS, TR_TMPFS_RWTMP_END,
		    "tmp_rdtmp_end:vp %p error %d", vp, 0);

		offset = uio->uio_offset;
		pageoffset = offset & PAGEOFFSET;
		bytes = MIN(PAGESIZE - pageoffset, uio->uio_resid);
		diff = tp->tn_size - offset;
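		/*
		 * Example: with 4K pages, a read of 10000 bytes starting
		 * at offset 6000 is handled one page at a time; this pass
		 * moves at most 2192 bytes (up to the page boundary at
		 * 8192).  For an 8000-byte file, diff = 2000 further
		 * limits the transfer so the read stops at EOF.
		 */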
		/*
		 * We have to drop the contents lock to allow the VM system
		 * to reacquire it in tmp_getpage() should the uiomove cause a
		 */
		rw_exit(&tp->tn_contents);

			error = vpm_data_copy(vp, offset, bytes, uio, 1, NULL,
			segmap_offset = (offset & PAGEMASK) & MAXBOFFSET;
			base = segmap_getmapflt(segkmap, vp, offset & MAXBMASK,
			error = uiomove(base + segmap_offset + pageoffset,
			    (long)bytes, UIO_READ, uio);

				(void) vpm_sync_pages(vp, offset, PAGESIZE, 0);
				(void) segmap_release(segkmap, base, 0);
				error = vpm_sync_pages(vp, offset, PAGESIZE,
				error = segmap_release(segkmap, base, 0);

		/*
		 * Re-acquire contents lock.
		 */
		rw_enter(&tp->tn_contents, RW_READER);

	} while (error == 0 && uio->uio_resid > 0);

	gethrestime(&tp->tn_atime);

	/*
	 * If we've already done a partial read, terminate
	 * the read but return no error.
	 */
	if (oresid != uio->uio_resid)
	TRACE_2(TR_FAC_TMPFS, TR_TMPFS_RWTMP_END,
	    "tmp_rdtmp_end:vp %x error %d", vp, error);
tmp_read(struct vnode *vp, struct uio *uiop, int ioflag, cred_t *cred,
    struct caller_context *ct)
	struct tmpnode *tp = (struct tmpnode *)VTOTN(vp);
	struct tmount *tm = (struct tmount *)VTOTM(vp);

	/*
	 * We don't currently support reading non-regular files
	 */
	if (vp->v_type == VDIR)
	if (vp->v_type != VREG)
	/*
	 * tmp_rwlock should have already been called from layers above
	 */
	ASSERT(RW_READ_HELD(&tp->tn_rwlock));

	rw_enter(&tp->tn_contents, RW_READER);

	error = rdtmp(tm, tp, uiop, ct);

	rw_exit(&tp->tn_contents);
tmp_write(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cred,
    struct caller_context *ct)
	struct tmpnode *tp = (struct tmpnode *)VTOTN(vp);
	struct tmount *tm = (struct tmount *)VTOTM(vp);

	/*
	 * We don't currently support writing to non-regular files
	 */
	if (vp->v_type != VREG)
		return (EINVAL);	/* XXX EISDIR? */

	/*
	 * tmp_rwlock should have already been called from layers above
	 */
	ASSERT(RW_WRITE_HELD(&tp->tn_rwlock));

	rw_enter(&tp->tn_contents, RW_WRITER);

	if (ioflag & FAPPEND) {
		/*
		 * In append mode start at end of file.
		 */
		uiop->uio_loffset = tp->tn_size;

	error = wrtmp(tm, tp, uiop, cred, ct);

	rw_exit(&tp->tn_contents);
	caller_context_t *ct)

	caller_context_t *ct)
	struct tmpnode *tp = (struct tmpnode *)VTOTN(vp);

	/*
	 * A special case to handle the root tnode on a diskless nfs
	 * client who may have had its uid and gid inherited
	 * from an nfs vnode with nobody ownership.  Likely the
	 * root filesystem. After nfs is fully functional the uid/gid
	 * may be mappable so ask again.
	 * vfsp can't get unmounted because we hold vp.
	 */
	if (vp->v_flag & VROOT &&
	    (mvp = vp->v_vfsp->vfs_vnodecovered) != NULL) {
		mutex_enter(&tp->tn_tlock);
		if (tp->tn_uid == UID_NOBODY || tp->tn_gid == GID_NOBODY) {
			mutex_exit(&tp->tn_tlock);
			bzero(&va, sizeof (struct vattr));
			va.va_mask = AT_UID|AT_GID;
			attrs = VOP_GETATTR(mvp, &va, 0, cred, ct);
			mutex_exit(&tp->tn_tlock);
	mutex_enter(&tp->tn_tlock);
			tp->tn_uid = va.va_uid;
			tp->tn_gid = va.va_gid;
	vap->va_type = vp->v_type;
	vap->va_mode = tp->tn_mode & MODEMASK;
	vap->va_uid = tp->tn_uid;
	vap->va_gid = tp->tn_gid;
	vap->va_fsid = tp->tn_fsid;
	vap->va_nodeid = (ino64_t)tp->tn_nodeid;
	vap->va_nlink = tp->tn_nlink;
	vap->va_size = (u_offset_t)tp->tn_size;
	vap->va_atime = tp->tn_atime;
	vap->va_mtime = tp->tn_mtime;
	vap->va_ctime = tp->tn_ctime;
	vap->va_blksize = PAGESIZE;
	vap->va_rdev = tp->tn_rdev;
	vap->va_seq = tp->tn_seq;

	/*
	 * XXX Holes are not taken into account.  We could take the time to
	 * run through the anon array looking for allocated slots...
	 */
	vap->va_nblocks = (fsblkcnt64_t)btodb(ptob(btopr(vap->va_size)));
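	/*
	 * Example: a 6000-byte file with 4K pages and 512-byte disk blocks
	 * reports btodb(ptob(btopr(6000))) = btodb(8192) = 16 blocks, i.e.
	 * va_nblocks reflects whole reserved pages, not the byte size.
	 */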
	mutex_exit(&tp->tn_tlock);
	caller_context_t *ct)
	struct tmount *tm = (struct tmount *)VTOTM(vp);
	struct tmpnode *tp = (struct tmpnode *)VTOTN(vp);

	/*
	 * Cannot set these attributes
	 */
	if ((vap->va_mask & AT_NOSET) || (vap->va_mask & AT_XVATTR))

	mutex_enter(&tp->tn_tlock);

	/*
	 * Change file access modes. Must be owner or have sufficient
	 */
	error = secpolicy_vnode_setattr(cred, vp, vap, get, flags, tmp_taccess,

	if (mask & AT_MODE) {
		get->va_mode &= S_IFMT;
		get->va_mode |= vap->va_mode & ~S_IFMT;

		get->va_uid = vap->va_uid;
		get->va_gid = vap->va_gid;
		get->va_atime = vap->va_atime;
		get->va_mtime = vap->va_mtime;

	if (mask & (AT_UID | AT_GID | AT_MODE | AT_MTIME))
		gethrestime(&tp->tn_ctime);

	if (mask & AT_SIZE) {
		ASSERT(vp->v_type != VDIR);

		/* Don't support large files. */
		if (vap->va_size > MAXOFF_T) {
		mutex_exit(&tp->tn_tlock);

		rw_enter(&tp->tn_rwlock, RW_WRITER);
		rw_enter(&tp->tn_contents, RW_WRITER);
		error = tmpnode_trunc(tm, tp, (ulong_t)vap->va_size);
		rw_exit(&tp->tn_contents);
		rw_exit(&tp->tn_rwlock);

		if (error == 0 && vap->va_size == 0)
			vnevent_truncate(vp, ct);
	mutex_exit(&tp->tn_tlock);
	caller_context_t *ct)
	struct tmpnode *tp = (struct tmpnode *)VTOTN(vp);

	mutex_enter(&tp->tn_tlock);
	error = tmp_taccess(tp, mode, cred);
	mutex_exit(&tp->tn_tlock);
	struct pathname *pnp,
	caller_context_t *ct,
	struct tmpnode *tp = (struct tmpnode *)VTOTN(dvp);
	struct tmpnode *ntp = NULL;

	/* allow cd into @ dir */
	if (flags & LOOKUP_XATTR) {
		/*
		 * don't allow attributes if not mounted XATTR support
		 */
		if (!(dvp->v_vfsp->vfs_flag & VFS_XATTR))

		if (tp->tn_flags & ISXATTR)
			/* No attributes on attributes */

		rw_enter(&tp->tn_rwlock, RW_WRITER);
		if (tp->tn_xattrdp == NULL) {
			if (!(flags & CREATE_XATTR_DIR)) {
				rw_exit(&tp->tn_rwlock);

			/*
			 * No attribute directory exists for this
			 * node - create the attr dir as a side effect
			 */

			/*
			 * Make sure we have adequate permission...
			 */
			if ((error = tmp_taccess(tp, VWRITE, cred)) != 0) {
				rw_exit(&tp->tn_rwlock);

			xdp = tmp_memalloc(sizeof (struct tmpnode),
			tmpnode_init(tm, xdp, &tp->tn_attr, NULL);
			/*
			 * Fix-up fields unique to attribute directories.
			 */
			xdp->tn_flags = ISXATTR;
			if (tp->tn_type == VDIR) {
				xdp->tn_mode = tp->tn_attr.va_mode;
				if (tp->tn_attr.va_mode & 0040)
					xdp->tn_mode |= 0750;
				if (tp->tn_attr.va_mode & 0004)
					xdp->tn_mode |= 0705;
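				/*
				 * The derived mode mirrors the file's own
				 * readability: group read (0040) on the file
				 * yields 0750 on its attribute directory and
				 * other read (0004) yields 0705, so whoever
				 * may read the file may also list its
				 * extended attributes.
				 */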
			xdp->tn_vnode->v_type = VDIR;
			xdp->tn_vnode->v_flag |= V_XATTRDIR;
			tp->tn_xattrdp = xdp;

		VN_HOLD(tp->tn_xattrdp->tn_vnode);
		*vpp = TNTOV(tp->tn_xattrdp);
		rw_exit(&tp->tn_rwlock);

	/*
	 * Null component name is a synonym for directory being searched.
	 */
	error = tdirlookup(tp, nm, &ntp, cred);

		/*
		 * If vnode is a device return special vnode instead
		 */
		if (IS_DEVVP(*vpp)) {
			newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type,
	TRACE_4(TR_FAC_TMPFS, TR_TMPFS_LOOKUP,
	    "tmpfs lookup:vp %p name %s vpp %p error %d",
	    dvp, nm, vpp, error);
	enum vcexcl exclusive,
	caller_context_t *ct,
	struct tmpnode *parent;
	struct tmpnode *self;
	struct tmpnode *oldtp;

	parent = (struct tmpnode *)VTOTN(dvp);
	tm = (struct tmount *)VTOTM(dvp);

	/* device files not allowed in ext. attr dirs */
	if ((parent->tn_flags & ISXATTR) &&
	    (vap->va_type == VBLK || vap->va_type == VCHR ||
	    vap->va_type == VFIFO || vap->va_type == VDOOR ||
	    vap->va_type == VSOCK || vap->va_type == VPORT))

	if (vap->va_type == VREG && (vap->va_mode & VSVTX)) {
		/* Must be privileged to set sticky bit */
		if (secpolicy_vnode_stky_modify(cred))
			vap->va_mode &= ~VSVTX;
	} else if (vap->va_type == VNON) {

	/*
	 * Null component name is a synonym for directory being searched.
	 */
	error = tdirlookup(parent, nm, &oldtp, cred);

	if (error == 0) {	/* name found */
		boolean_t trunc = B_FALSE;

		rw_enter(&oldtp->tn_rwlock, RW_WRITER);

		/*
		 * if create/read-only an existing
		 * directory, allow it
		 */
		if (exclusive == EXCL)
		else if ((oldtp->tn_type == VDIR) && (mode & VWRITE))
			error = tmp_taccess(oldtp, mode, cred);

			rw_exit(&oldtp->tn_rwlock);
			tmpnode_rele(oldtp);
		*vpp = TNTOV(oldtp);
		if ((*vpp)->v_type == VREG && (vap->va_mask & AT_SIZE) &&
		    vap->va_size == 0) {
			rw_enter(&oldtp->tn_contents, RW_WRITER);
			(void) tmpnode_trunc(tm, oldtp, 0);
			rw_exit(&oldtp->tn_contents);
		rw_exit(&oldtp->tn_rwlock);
		if (IS_DEVVP(*vpp)) {
			struct vnode *newvp;

			newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type,
			if (newvp == NULL) {
			vnevent_create(*vpp, ct);

	if (error != ENOENT)

	rw_enter(&parent->tn_rwlock, RW_WRITER);
	error = tdirenter(tm, parent, nm, DE_CREATE,
	    (struct tmpnode *)NULL, (struct tmpnode *)NULL,
	    vap, &self, cred, ct);
	rw_exit(&parent->tn_rwlock);

	if (error == EEXIST) {
		/*
		 * This means that the file was created sometime
		 * after we checked and did not find it and when
		 * we went to create it.
		 * Since creat() is supposed to truncate a file
		 * that already exists go back to the beginning
		 * of the function. This time we will find it
		 * and go down the tmp_trunc() path
		 */
	if (!error && IS_DEVVP(*vpp)) {
		struct vnode *newvp;

		newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cred);
	TRACE_3(TR_FAC_TMPFS, TR_TMPFS_CREATE,
	    "tmpfs create:dvp %p nm %s vpp %p", dvp, nm, vpp);
	caller_context_t *ct,
	struct tmpnode *parent = (struct tmpnode *)VTOTN(dvp);
	struct tmpnode *tp = NULL;

	error = tdirlookup(parent, nm, &tp, cred);

	rw_enter(&parent->tn_rwlock, RW_WRITER);
	rw_enter(&tp->tn_rwlock, RW_WRITER);

	error = (tp->tn_type == VDIR) ? EPERM :
	    tdirdelete(parent, tp, nm, DR_REMOVE, cred);

	rw_exit(&tp->tn_rwlock);
	rw_exit(&parent->tn_rwlock);
	vnevent_remove(TNTOV(tp), dvp, nm, ct);

	TRACE_3(TR_FAC_TMPFS, TR_TMPFS_REMOVE,
	    "tmpfs remove:dvp %p nm %s error %d", dvp, nm, error);
	struct vnode *srcvp,
	caller_context_t *ct,
	struct tmpnode *parent;
	struct tmpnode *from;
	struct tmount *tm = (struct tmount *)VTOTM(dvp);
	struct tmpnode *found = NULL;
	struct vnode *realvp;

	if (VOP_REALVP(srcvp, &realvp, ct) == 0)

	parent = (struct tmpnode *)VTOTN(dvp);
	from = (struct tmpnode *)VTOTN(srcvp);

	if (srcvp->v_type == VDIR ||
	    (from->tn_uid != crgetuid(cred) && secpolicy_basic_link(cred)))

	/*
	 * Make sure link for extended attributes is valid
	 * We only support hard linking of xattr's in xattrdir to an xattrdir
	 */
	if ((from->tn_flags & ISXATTR) != (parent->tn_flags & ISXATTR))

	error = tdirlookup(parent, tnm, &found, cred);
		tmpnode_rele(found);

	if (error != ENOENT)

	rw_enter(&parent->tn_rwlock, RW_WRITER);
	error = tdirenter(tm, parent, tnm, DE_LINK, (struct tmpnode *)NULL,
	    from, NULL, (struct tmpnode **)NULL, cred, ct);
	rw_exit(&parent->tn_rwlock);

	vnevent_link(srcvp, ct);
	struct vnode *odvp,	/* source parent vnode */
	char *onm,		/* source name */
	struct vnode *ndvp,	/* destination parent vnode */
	char *nnm,		/* destination name */
	caller_context_t *ct,
	struct tmpnode *fromparent;
	struct tmpnode *toparent;
	struct tmpnode *fromtp = NULL;	/* source tmpnode */
	struct tmpnode *totp;		/* target tmpnode */
	struct tmount *tm = (struct tmount *)VTOTM(odvp);
	int samedir = 0;	/* set if odvp == ndvp */
	struct vnode *realvp;

	if (VOP_REALVP(ndvp, &realvp, ct) == 0)

	fromparent = (struct tmpnode *)VTOTN(odvp);
	toparent = (struct tmpnode *)VTOTN(ndvp);

	if ((fromparent->tn_flags & ISXATTR) != (toparent->tn_flags & ISXATTR))

	mutex_enter(&tm->tm_renamelck);

	/*
	 * Look up tmpnode of file we're supposed to rename.
	 */
	error = tdirlookup(fromparent, onm, &fromtp, cred);
		mutex_exit(&tm->tm_renamelck);

	/*
	 * Make sure we can delete the old (source) entry.  This
	 * requires write permission on the containing directory.  If
	 * that directory is "sticky" it requires further checks.
	 */
	if (((error = tmp_taccess(fromparent, VWRITE, cred)) != 0) ||
	    (error = tmp_sticky_remove_access(fromparent, fromtp, cred)) != 0)
	/*
	 * Check for renaming to or from '.' or '..' or that
	 * fromtp == fromparent
	 */
	if ((onm[0] == '.' &&
	    (onm[1] == '\0' || (onm[1] == '.' && onm[2] == '\0'))) ||
	    (nnm[0] == '.' &&
	    (nnm[1] == '\0' || (nnm[1] == '.' && nnm[2] == '\0'))) ||
	    (fromparent == fromtp)) {
	samedir = (fromparent == toparent);

	/*
	 * Make sure we can search and rename into the new
	 * (destination) directory.
	 */
		error = tmp_taccess(toparent, VEXEC|VWRITE, cred);

	if (tdirlookup(toparent, nnm, &totp, cred) == 0) {
		vnevent_pre_rename_dest(TNTOV(totp), ndvp, nnm, ct);

	/* Notify the target dir. if not the same as the source dir. */
		vnevent_pre_rename_dest_dir(ndvp, TNTOV(fromtp), nnm, ct);

	vnevent_pre_rename_src(TNTOV(fromtp), odvp, onm, ct);

	/*
	 * Link source to new target
	 */
	rw_enter(&toparent->tn_rwlock, RW_WRITER);
	error = tdirenter(tm, toparent, nnm, DE_RENAME,
	    fromparent, fromtp, (struct vattr *)NULL,
	    (struct tmpnode **)NULL, cred, ct);
	rw_exit(&toparent->tn_rwlock);

		/*
		 * ESAME isn't really an error; it indicates that the
		 * operation should not be done because the source and target
		 * are the same file, but that no error should be reported.
		 */

	/*
	 * Unlink from source.
	 */
	rw_enter(&fromparent->tn_rwlock, RW_WRITER);
	rw_enter(&fromtp->tn_rwlock, RW_WRITER);

	error = tdirdelete(fromparent, fromtp, onm, DR_RENAME, cred);

	/*
	 * The following handles the case where our source tmpnode was
	 * removed before we got to it.
	 *
	 * XXX We should also cleanup properly in the case where tdirdelete
	 * fails for some other reason.  Currently this case shouldn't happen.
	 */
	if (error == ENOENT)

	rw_exit(&fromtp->tn_rwlock);
	rw_exit(&fromparent->tn_rwlock);

		vnevent_rename_src(TNTOV(fromtp), odvp, onm, ct);
		/*
		 * vnevent_rename_dest is called in tdirenter().
		 * Notify the target dir if not same as source dir.
		 */
			vnevent_rename_dest_dir(ndvp, ct);

	tmpnode_rele(fromtp);
	mutex_exit(&tm->tm_renamelck);

	TRACE_5(TR_FAC_TMPFS, TR_TMPFS_RENAME,
	    "tmpfs rename:ovp %p onm %s nvp %p nnm %s error %d", odvp, onm,
	caller_context_t *ct,
	struct tmpnode *parent = (struct tmpnode *)VTOTN(dvp);
	struct tmpnode *self = NULL;
	struct tmount *tm = (struct tmount *)VTOTM(dvp);

	/* no new dirs allowed in xattr dirs */
	if (parent->tn_flags & ISXATTR)

	/*
	 * Might be dangling directory.  Catch it here,
	 * because a ENOENT return from tdirlookup() is
	 */
	if (parent->tn_nlink == 0)

	error = tdirlookup(parent, nm, &self, cred);

	if (error != ENOENT)

	rw_enter(&parent->tn_rwlock, RW_WRITER);
	error = tdirenter(tm, parent, nm, DE_MKDIR, (struct tmpnode *)NULL,
	    (struct tmpnode *)NULL, va, &self, cred, ct);
		rw_exit(&parent->tn_rwlock);
	rw_exit(&parent->tn_rwlock);
	caller_context_t *ct,
	struct tmpnode *parent = (struct tmpnode *)VTOTN(dvp);
	struct tmpnode *self = NULL;

	/*
	 * Return error when removing . and ..
	 */
	if (strcmp(nm, ".") == 0)
	if (strcmp(nm, "..") == 0)
		return (EEXIST); /* Should be ENOTEMPTY */
	error = tdirlookup(parent, nm, &self, cred);

	rw_enter(&parent->tn_rwlock, RW_WRITER);
	rw_enter(&self->tn_rwlock, RW_WRITER);

	if (vp == dvp || vp == cdir) {
	if (self->tn_type != VDIR) {

	mutex_enter(&self->tn_tlock);
	if (self->tn_nlink > 2) {
		mutex_exit(&self->tn_tlock);
	mutex_exit(&self->tn_tlock);

	if (vn_vfswlock(vp)) {
	if (vn_mountedvfs(vp) != NULL) {

	/*
	 * Check for an empty directory
	 * i.e. only includes entries for "." and ".."
	 */
	if (self->tn_dirents > 2) {
		error = EEXIST;		/* SIGH should be ENOTEMPTY */
		/*
		 * Update atime because checking tn_dirents is logically
		 * equivalent to reading the directory
		 */
		gethrestime(&self->tn_atime);

	error = tdirdelete(parent, self, nm, DR_RMDIR, cred);

	rw_exit(&self->tn_rwlock);
	rw_exit(&parent->tn_rwlock);
	vnevent_rmdir(TNTOV(self), dvp, nm, ct);
	caller_context_t *ct,
	struct tmpnode *tp = (struct tmpnode *)VTOTN(vp);
	struct tdirent *tdp;
	struct dirent64 *dp;
	ulong_t total_bytes_wanted;

	if (uiop->uio_loffset >= MAXOFF_T) {

	/*
	 * assuming system call has already called tmp_rwlock
	 */
	ASSERT(RW_READ_HELD(&tp->tn_rwlock));

	if (uiop->uio_iovcnt != 1)

	if (vp->v_type != VDIR)

	/*
	 * There's a window here where someone could have removed
	 * all the entries in the directory after we put a hold on the
	 * vnode but before we grabbed the rwlock.  Just return.
	 */
	if (tp->tn_dir == NULL) {
			panic("empty directory 0x%p", (void *)tp);

	/*
	 * Get space for multiple directory entries
	 */
	total_bytes_wanted = uiop->uio_iov->iov_len;
	bufsize = total_bytes_wanted + sizeof (struct dirent64);
	outbuf = kmem_alloc(bufsize, KM_SLEEP);

	dp = (struct dirent64 *)outbuf;

		namelen = strlen(tdp->td_name);	/* no +1 needed */
		offset = tdp->td_offset;
		if (offset >= uiop->uio_offset) {
			reclen = (int)DIRENT64_RECLEN(namelen);
			if (outcount + reclen > total_bytes_wanted) {
				/*
				 * Buffer too small for any entries.
				 */
			ASSERT(tdp->td_tmpnode != NULL);

			/* use strncpy(9f) to zero out uninitialized bytes */

			(void) strncpy(dp->d_name, tdp->td_name,
			    DIRENT64_NAMELEN(reclen));
			dp->d_reclen = (ushort_t)reclen;
			dp->d_ino = (ino64_t)tdp->td_tmpnode->tn_nodeid;
			dp->d_off = (offset_t)tdp->td_offset + 1;
			dp = (struct dirent64 *)
			    ((uintptr_t)dp + dp->d_reclen);
		ASSERT(outcount <= bufsize);
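		/*
		 * DIRENT64_RECLEN(namelen) rounds the fixed dirent64
		 * header plus the name and its terminating NUL up to an
		 * 8-byte boundary, so advancing dp by d_reclen packs
		 * successive, properly aligned entries into outbuf.
		 */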
	error = uiomove(outbuf, outcount, UIO_READ, uiop);

	/* If we reached the end of the list our offset */
	/* should now be just past the end. */

	uiop->uio_offset = offset;
	gethrestime(&tp->tn_atime);
	kmem_free(outbuf, bufsize);
	caller_context_t *ct,
	struct tmpnode *parent = (struct tmpnode *)VTOTN(dvp);
	struct tmpnode *self = (struct tmpnode *)NULL;
	struct tmount *tm = (struct tmount *)VTOTM(dvp);

	/* no symlinks allowed to files in xattr dirs */
	if (parent->tn_flags & ISXATTR)

	error = tdirlookup(parent, lnm, &self, cred);
		/*
		 * The entry already exists
		 */
		return (EEXIST);	/* was 0 */

	if (error != ENOENT) {

	rw_enter(&parent->tn_rwlock, RW_WRITER);
	error = tdirenter(tm, parent, lnm, DE_CREATE, (struct tmpnode *)NULL,
	    (struct tmpnode *)NULL, tva, &self, cred, ct);
	rw_exit(&parent->tn_rwlock);

	len = strlen(tnm) + 1;
	cp = tmp_memalloc(len, 0);
	(void) strcpy(cp, tnm);
	self->tn_symlink = cp;
	self->tn_size = len - 1;
	caller_context_t *ct)
	struct tmpnode *tp = (struct tmpnode *)VTOTN(vp);

	if (vp->v_type != VLNK)

	rw_enter(&tp->tn_rwlock, RW_READER);
	rw_enter(&tp->tn_contents, RW_READER);
	error = uiomove(tp->tn_symlink, tp->tn_size, UIO_READ, uiop);
	gethrestime(&tp->tn_atime);
	rw_exit(&tp->tn_contents);
	rw_exit(&tp->tn_rwlock);
	caller_context_t *ct)
tmp_inactive(struct vnode *vp, struct cred *cred, caller_context_t *ct)
	struct tmpnode *tp = (struct tmpnode *)VTOTN(vp);
	struct tmount *tm = (struct tmount *)VFSTOTM(vp->v_vfsp);

	rw_enter(&tp->tn_rwlock, RW_WRITER);
	mutex_enter(&tp->tn_tlock);
	mutex_enter(&vp->v_lock);
	ASSERT(vp->v_count >= 1);

	/*
	 * If we don't have the last hold or the link count is non-zero,
	 * there's little to do -- just drop our hold.
	 */
	if (vp->v_count > 1 || tp->tn_nlink != 0) {
		mutex_exit(&vp->v_lock);
		mutex_exit(&tp->tn_tlock);
		rw_exit(&tp->tn_rwlock);

	/*
	 * We have the last hold *and* the link count is zero, so this
	 * tmpnode is dead from the filesystem's viewpoint.  However,
	 * if the tmpnode has any pages associated with it (i.e. if it's
	 * a normal file with non-zero size), the tmpnode can still be
	 * discovered by pageout or fsflush via the page vnode pointers.
	 * In this case we must drop all our locks, truncate the tmpnode,
	 * and try the whole dance again.
	 */
	if (tp->tn_size != 0) {
		if (tp->tn_type == VREG) {
			mutex_exit(&vp->v_lock);
			mutex_exit(&tp->tn_tlock);
			rw_enter(&tp->tn_contents, RW_WRITER);
			(void) tmpnode_trunc(tm, tp, 0);
			rw_exit(&tp->tn_contents);
			ASSERT(tp->tn_size == 0);
			ASSERT(tp->tn_nblocks == 0);
		if (tp->tn_type == VLNK)
			tmp_memfree(tp->tn_symlink, tp->tn_size + 1);

	/*
	 * Remove normal file/dir's xattr dir and xattrs.
	 */
	if (tp->tn_xattrdp) {
		struct tmpnode *xtp = tp->tn_xattrdp;

		ASSERT(xtp->tn_flags & ISXATTR);
		rw_enter(&xtp->tn_rwlock, RW_WRITER);
		DECR_COUNT(&xtp->tn_nlink, &xtp->tn_tlock);
		tp->tn_xattrdp = NULL;
		rw_exit(&xtp->tn_rwlock);

	mutex_exit(&vp->v_lock);
	mutex_exit(&tp->tn_tlock);
	/* Here's our chance to send invalid event while we're between locks */
	vn_invalid(TNTOV(tp));
	mutex_enter(&tm->tm_contents);
	if (tp->tn_forw == NULL)
		tm->tm_rootnode->tn_back = tp->tn_back;
		tp->tn_forw->tn_back = tp->tn_back;
	tp->tn_back->tn_forw = tp->tn_forw;
	mutex_exit(&tm->tm_contents);
	rw_exit(&tp->tn_rwlock);
	rw_destroy(&tp->tn_rwlock);
	mutex_destroy(&tp->tn_tlock);
	tmp_memfree(tp, sizeof (struct tmpnode));
tmp_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ct)
	struct tmpnode *tp = (struct tmpnode *)VTOTN(vp);

	if (fidp->fid_len < (sizeof (struct tfid) - sizeof (ushort_t))) {
		fidp->fid_len = sizeof (struct tfid) - sizeof (ushort_t);

	tfid = (struct tfid *)fidp;
	bzero(tfid, sizeof (struct tfid));
	tfid->tfid_len = (int)sizeof (struct tfid) - sizeof (ushort_t);

	tfid->tfid_ino = tp->tn_nodeid;
	tfid->tfid_gen = tp->tn_gen;
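	/*
	 * The caller's fid must be at least as large as a struct tfid
	 * minus its length field; the handle itself carries only the
	 * node id and generation, which is enough for a later consumer
	 * (e.g. an NFS server) to re-identify the tmpnode.
	 */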
/*
 * Return all the pages from [off..off+len] in given file
 */
	caller_context_t *ct)
	struct tmpnode *tp = VTOTN(vp);
	anoff_t toff = (anoff_t)off;

	rw_enter(&tp->tn_contents, RW_READER);

	if (off + len > tp->tn_size + PAGEOFFSET) {

	/*
	 * Look for holes (no anon slot) in faulting range. If there are
	 * holes we have to switch to a write lock and fill them in. Swap
	 * space for holes was already reserved when the file was grown.
	 */
	if (non_anon(tp->tn_anon, btop(off), &tmpoff, &tlen)) {
		if (!rw_tryupgrade(&tp->tn_contents)) {
			rw_exit(&tp->tn_contents);
			rw_enter(&tp->tn_contents, RW_WRITER);
			/* Size may have changed when lock was dropped */
			if (off + len > tp->tn_size + PAGEOFFSET) {
		for (toff = (anoff_t)off; toff < (anoff_t)off + len;
			if (anon_get_ptr(tp->tn_anon, btop(toff)) == NULL) {
				/* XXX - may allocate mem w. write lock held */
				(void) anon_set_ptr(tp->tn_anon, btop(toff),
				    anon_alloc(vp, toff), ANON_SLEEP);
		rw_downgrade(&tp->tn_contents);

		err = pvn_getpages(tmp_getapage, vp, (u_offset_t)off, len, protp,
		    pl, plsz, seg, addr, rw, cr);

	rw_exit(&tp->tn_contents);
/*
 * Called from pvn_getpages to get a particular page.
 */
	if (pp = page_lookup(vp, off, rw == S_CREATE ? SE_EXCL : SE_SHARED)) {

		pp = page_create_va(vp, off, PAGESIZE,
		    PG_WAIT | PG_EXCL, seg, addr);
		/*
		 * Someone raced in and created the page after we did the
		 * lookup but before we did the create, so go back and
		 * try to look it up again.
		 */
		/*
		 * Fill page from backing store, if any. If none, then
		 * either this is a newly filled hole or page must have
		 * been unmodified and freed so just zero it out.
		 */
		err = swap_getphysname(vp, off, &pvp, &poff);
			panic("tmp_getapage: no anon slot vp %p "
			    "off %llx pp %p\n", (void *)vp, off, (void *)pp);
			flags = (pl == NULL ? B_ASYNC|B_READ : B_READ);
			err = VOP_PAGEIO(pvp, pp, (u_offset_t)poff, PAGESIZE,
			if (flags & B_ASYNC)
		} else if (rw != S_CREATE) {
			pagezero(pp, 0, PAGESIZE);
			pvn_read_done(pp, B_ERROR);
		pvn_plist_init(pp, pl, plsz, off, PAGESIZE, rw);
/*
 * Flags are composed of {B_INVAL, B_DIRTY B_FREE, B_DONTNEED}.
 * If len == 0, do from off to EOF.
 */
static int tmp_nopage = 0;	/* Don't do tmp_putpage's if set */

	register struct vnode *vp,
	caller_context_t *ct)
	register page_t *pp;
	struct tmpnode *tp = VTOTN(vp);

	ASSERT(vp->v_count != 0);

	if (vp->v_flag & VNOMAP)

	/*
	 * This being tmpfs, we don't ever do i/o unless we really
	 * have to (when we're low on memory and pageout calls us
	 * with B_ASYNC | B_FREE or the user explicitly asks for it with
	 *
	 * XXX to approximately track the mod time like ufs we should
	 * update the times here. The problem is, once someone does a
	 * store we never clear the mod bit and do i/o, thus fsflush
	 * will keep calling us every 30 seconds to do the i/o and we'll
	 * continually update the mod time. At least we update the mod
	 * time on the first store because this results in a call to getpage.
	 */
	if (flags != (B_ASYNC | B_FREE) && (flags & B_INVAL) == 0 &&
	    (flags & B_DONTNEED) == 0)
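	/*
	 * In effect only pageout's B_ASYNC | B_FREE pushes, explicit
	 * B_INVAL invalidations and B_DONTNEED requests do real work
	 * here; an ordinary writeback request returns early because
	 * tmpfs pages have no permanent backing store to keep in sync.
	 */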
	/*
	 * If this thread owns the lock, i.e., this thread grabbed it
	 * as writer somewhere above, then we don't need to grab the
	 * lock as reader in this routine.
	 */
	dolock = (rw_owner(&tp->tn_contents) != curthread);

	/*
	 * If this is pageout don't block on the lock as you could deadlock
	 * when freemem == 0 (another thread has the read lock and is blocked
	 * creating a page, and a third thread is waiting to get the writers
	 * lock - waiting writers priority blocks us from getting the read
	 * lock). Of course, if the only freeable pages are on this tmpnode
	 * we're hosed anyways. A better solution might be a new lock type.
	 * Note: ufs has the same problem.
	 */
	if (curproc == proc_pageout) {
		if (!rw_tryenter(&tp->tn_contents, RW_READER))
		rw_enter(&tp->tn_contents, RW_READER);

	if (!vn_has_cached_data(vp))

		if (curproc == proc_pageout) {
			panic("tmp: pageout can't block");

		/* Search the entire vp list for pages >= off. */
		err = pvn_vplist_dirty(vp, (u_offset_t)off, tmp_putapage,
		/*
		 * Loop over all offsets in the range [off...off + len]
		 * looking for pages to deal with.
		 */
		eoff = MIN(off + len, tp->tn_size);
		for (io_off = off; io_off < eoff; io_off += io_len) {
			/*
			 * If we are not invalidating, synchronously
			 * freeing or writing pages use the routine
			 * page_lookup_nowait() to prevent reclaiming
			 * them from the free list.
			 */
			if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
				pp = page_lookup(vp, io_off,
				    (flags & (B_INVAL | B_FREE)) ?
				    SE_EXCL : SE_SHARED);
				pp = page_lookup_nowait(vp, io_off,
				    (flags & B_FREE) ? SE_EXCL : SE_SHARED);

			if (pp == NULL || pvn_getdirty(pp, flags) == 0)
			err = tmp_putapage(vp, pp, &io_off, &io_len,

	/* If invalidating, verify all pages on vnode list are gone. */
	if (err == 0 && off == 0 && len == 0 &&
	    (flags & B_INVAL) && vn_has_cached_data(vp)) {
		panic("tmp_putpage: B_INVAL, pages not gone");

	if ((curproc == proc_pageout) || dolock)
		rw_exit(&tp->tn_contents);

	/*
	 * Only reason putapage is going to give us SE_NOSWAP as error
	 * is when we ask a page to be written to physical backing store
	 * and there is none. Ignore this because we might be dealing
	 * with a swap page which does not have any backing store
	 * on disk. In any other case we won't get this error over here.
	 */
	if (err == SE_NOSWAP)
long tmp_putpagecnt, tmp_pagespushed;

/*
 * Write out a single page.
 * For tmpfs this means choose a physical swap slot and write the page
 * out using VOP_PAGEIO. For performance, we attempt to kluster; i.e.,
 * we try to find a bunch of other dirty pages adjacent in the file
 * and a bunch of contiguous swap slots, and then write all the pages
 * out in a single i/o.
 */
	ulong_t klstart, kllen;
	page_t *pplist, *npplist;
	extern int klustsize;
	size_t pp_off, pp_len;

	ASSERT(PAGE_LOCKED(pp));

	/* Kluster in tmp_klustsize chunks */
	tmp_klustsize = klustsize;
	offset = pp->p_offset;
	klstart = (offset / tmp_klustsize) * tmp_klustsize;
	kllen = MIN(tmp_klustsize, tp->tn_size - klstart);
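	/*
	 * Example: with a 64K klustsize, a dirty page at file offset
	 * 0x23000 gives klstart = 0x20000 and kllen = 64K clamped to
	 * whatever of the file remains past klstart, so the kluster
	 * never extends beyond EOF.
	 */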
	/* Get a kluster of pages */
	pplist =
	    pvn_write_kluster(vp, pp, &tmpoff, &pp_len, klstart, kllen, flags);
	pp_off = (size_t)tmpoff;
	/*
	 * Get a cluster of physical offsets for the pages; the amount we
	 * get may be some subrange of what we ask for (io_off, io_len).
	 */
	err = swap_newphysname(vp, offset, &io_off, &io_len, &pvp, &pstart);
	ASSERT(err != SE_NOANON);	/* anon slot must have been filled */
		pvn_write_done(pplist, B_ERROR | B_WRITE | flags);
		/*
		 * If this routine is called as a result of segvn_sync
		 * operation and we have no physical swap then we can get an
		 * error here. In such case we would return SE_NOSWAP as error.
		 * At this point, we expect only SE_NOSWAP.
		 */
		ASSERT(err == SE_NOSWAP);
		if (flags & B_INVAL)

	ASSERT(pp_off <= io_off && io_off + io_len <= pp_off + pp_len);
	ASSERT(io_off <= offset && offset < io_off + io_len);

	/* Toss pages at front/rear that we couldn't get physical backing for */
	if (io_off != pp_off) {
		page_list_break(&pplist, &npplist, btop(io_off - pp_off));
		ASSERT(pplist->p_offset == pp_off);
		ASSERT(pplist->p_prev->p_offset == io_off - PAGESIZE);
		pvn_write_done(pplist, B_ERROR | B_WRITE | flags);
	if (io_off + io_len < pp_off + pp_len) {
		page_list_break(&pplist, &npplist, btop(io_len));
		ASSERT(npplist->p_offset == io_off + io_len);
		ASSERT(npplist->p_prev->p_offset == pp_off + pp_len - PAGESIZE);
		pvn_write_done(npplist, B_ERROR | B_WRITE | flags);

	ASSERT(pplist->p_offset == io_off);
	ASSERT(pplist->p_prev->p_offset == io_off + io_len - PAGESIZE);
	ASSERT(btopr(io_len) <= btopr(kllen));

	/* Do i/o on the remaining kluster */
	err = VOP_PAGEIO(pvp, pplist, (u_offset_t)pstart, io_len,
	    B_WRITE | flags, cr, NULL);

	if ((flags & B_ASYNC) == 0) {
		pvn_write_done(pplist, ((err) ? B_ERROR : 0) | B_WRITE | flags);

	tmp_pagespushed += btop(io_len);

	if (err && err != ENOMEM && err != SE_NOSWAP)
		cmn_err(CE_WARN, "tmp_putapage: err %d\n", err);
	caller_context_t *ct)
	struct segvn_crargs vn_a;
	struct tmpnode *tp = (struct tmpnode *)VTOTN(vp);

	if (vp->v_flag & VNOMAP)

	if (off < 0 || (offset_t)(off + len) < 0 ||
	    off > MAXOFF_T || (off + len) > MAXOFF_T)

	if (vp->v_type != VREG)

	/*
	 * Don't allow mapping to locked file
	 */
	if (vn_has_mandatory_locks(vp, tp->tn_mode)) {

	error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);

	vn_a.offset = (u_offset_t)off;
	vn_a.type = flags & MAP_TYPE;
	vn_a.maxprot = maxprot;
	vn_a.flags = flags & ~MAP_TYPE;
	vn_a.lgrp_mem_policy_flags = 0;

	error = as_map(as, *addrp, len, segvn_create, &vn_a);

/*
 * tmp_addmap and tmp_delmap can't be called since the vp
 * maintained in the segvn mapping is NULL.
 */
	caller_context_t *ct)

	caller_context_t *ct)
tmp_freesp(struct vnode *vp, struct flock64 *lp, int flag)
	register struct tmpnode *tp = VTOTN(vp);

	ASSERT(vp->v_type == VREG);
	ASSERT(lp->l_start >= 0);

	rw_enter(&tp->tn_rwlock, RW_WRITER);
	if (tp->tn_size == lp->l_start) {
		rw_exit(&tp->tn_rwlock);

	/*
	 * Check for any mandatory locks on the range
	 */
	if (MANDLOCK(vp, tp->tn_mode)) {

		save_start = lp->l_start;

		if (tp->tn_size < lp->l_start) {
			/*
			 * "Truncate up" case: need to make sure there
			 * is no lock beyond current end-of-file. To
			 * do so, we need to set l_start to the size
			 * of the file temporarily.
			 */
			lp->l_start = tp->tn_size;
		lp->l_type = F_WRLCK;
		lp->l_pid = ttoproc(curthread)->p_pid;
		i = (flag & (FNDELAY|FNONBLOCK)) ? 0 : SLPFLCK;
		if ((i = reclock(vp, lp, i, 0, lp->l_start, NULL)) != 0 ||
		    lp->l_type != F_UNLCK) {
			rw_exit(&tp->tn_rwlock);
			return (i ? i : EAGAIN);

		lp->l_start = save_start;
	VFSTOTM(vp->v_vfsp);
	rw_enter(&tp->tn_contents, RW_WRITER);
	error = tmpnode_trunc((struct tmount *)VFSTOTM(vp->v_vfsp),
	    tp, (ulong_t)lp->l_start);
	rw_exit(&tp->tn_contents);
	rw_exit(&tp->tn_rwlock);
	struct flock64 *bfp,
	caller_context_t *ct)

	if (cmd != F_FREESP)
	if ((error = convoff(vp, bfp, 0, (offset_t)offset)) == 0) {
		if ((bfp->l_start > MAXOFF_T) || (bfp->l_len > MAXOFF_T))
		error = tmp_freesp(vp, bfp, flag);

		if (error == 0 && bfp->l_start == 0)
			vnevent_truncate(vp, ct);
	caller_context_t *ct)
	return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
tmp_rwlock(struct vnode *vp, int write_lock, caller_context_t *ctp)
	struct tmpnode *tp = VTOTN(vp);

		rw_enter(&tp->tn_rwlock, RW_WRITER);
		rw_enter(&tp->tn_rwlock, RW_READER);
	return (write_lock);
tmp_rwunlock(struct vnode *vp, int write_lock, caller_context_t *ctp)
	struct tmpnode *tp = VTOTN(vp);

	rw_exit(&tp->tn_rwlock);
	caller_context_t *ct)
	struct tmpnode *tp = NULL;

	case _PC_XATTR_EXISTS:
		if (vp->v_vfsp->vfs_flag & VFS_XATTR) {
			*valp = 0;	/* assume no attributes */
			error = 0;	/* okay to ask */
			rw_enter(&tp->tn_rwlock, RW_READER);
			if (tp->tn_xattrdp) {
				rw_enter(&tp->tn_xattrdp->tn_rwlock, RW_READER);
				/* do not count "." and ".." */
				if (tp->tn_xattrdp->tn_dirents > 2)
				rw_exit(&tp->tn_xattrdp->tn_rwlock);
			rw_exit(&tp->tn_rwlock);

	case _PC_SATTR_ENABLED:
	case _PC_SATTR_EXISTS:
		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
		    (vp->v_type == VREG || vp->v_type == VDIR);

	case _PC_TIMESTAMP_RESOLUTION:
		/* nanosecond timestamp resolution */

		error = fs_pathconf(vp, cmd, valp, cr, ct);
struct vnodeops *tmp_vnodeops;

const fs_operation_def_t tmp_vnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = tmp_open },
	VOPNAME_CLOSE,		{ .vop_close = tmp_close },
	VOPNAME_READ,		{ .vop_read = tmp_read },
	VOPNAME_WRITE,		{ .vop_write = tmp_write },
	VOPNAME_IOCTL,		{ .vop_ioctl = tmp_ioctl },
	VOPNAME_GETATTR,	{ .vop_getattr = tmp_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = tmp_setattr },
	VOPNAME_ACCESS,		{ .vop_access = tmp_access },
	VOPNAME_LOOKUP,		{ .vop_lookup = tmp_lookup },
	VOPNAME_CREATE,		{ .vop_create = tmp_create },
	VOPNAME_REMOVE,		{ .vop_remove = tmp_remove },
	VOPNAME_LINK,		{ .vop_link = tmp_link },
	VOPNAME_RENAME,		{ .vop_rename = tmp_rename },
	VOPNAME_MKDIR,		{ .vop_mkdir = tmp_mkdir },
	VOPNAME_RMDIR,		{ .vop_rmdir = tmp_rmdir },
	VOPNAME_READDIR,	{ .vop_readdir = tmp_readdir },
	VOPNAME_SYMLINK,	{ .vop_symlink = tmp_symlink },
	VOPNAME_READLINK,	{ .vop_readlink = tmp_readlink },
	VOPNAME_FSYNC,		{ .vop_fsync = tmp_fsync },
	VOPNAME_INACTIVE,	{ .vop_inactive = tmp_inactive },
	VOPNAME_FID,		{ .vop_fid = tmp_fid },
	VOPNAME_RWLOCK,		{ .vop_rwlock = tmp_rwlock },
	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = tmp_rwunlock },
	VOPNAME_SEEK,		{ .vop_seek = tmp_seek },
	VOPNAME_SPACE,		{ .vop_space = tmp_space },
	VOPNAME_GETPAGE,	{ .vop_getpage = tmp_getpage },
	VOPNAME_PUTPAGE,	{ .vop_putpage = tmp_putpage },
	VOPNAME_MAP,		{ .vop_map = tmp_map },
	VOPNAME_ADDMAP,		{ .vop_addmap = tmp_addmap },
	VOPNAME_DELMAP,		{ .vop_delmap = tmp_delmap },
	VOPNAME_PATHCONF,	{ .vop_pathconf = tmp_pathconf },
	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },