 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2015, Joyent, Inc. All rights reserved.
 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
 * Copyright 2016 RackTop Systems.
 * Copyright (c) 2017 by Delphix. All rights reserved.
#include <sys/types.h>
#include <sys/param.h>
#include <sys/t_lock.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/errno.h>
#include <sys/dirent.h>
#include <sys/pathname.h>
#include <sys/vmsystm.h>
#include <sys/fs/tmp.h>
#include <sys/fs/tmpnode.h>
#include <vm/seg_vn.h>
#include <vm/seg_map.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/vtrace.h>
#include <sys/policy.h>
#include <sys/fs_subr.h>
static int tmp_getapage(struct vnode *, uoff_t, size_t, uint_t *,
    page_t **, size_t, struct seg *, caddr_t, enum seg_rw, struct cred *);
static int tmp_putapage(struct vnode *, page_t *, uoff_t *, size_t *,
tmp_open(struct vnode **vpp, int flag, struct cred *cred, caller_context_t *ct)

	/*
	 * swapon to a tmpfs file is not supported so access
	 * is denied on open if VISSWAP is set.
	 */
	if ((*vpp)->v_flag & VISSWAP)

    caller_context_t *ct)

	cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
	cleanshares(vp, ttoproc(curthread)->p_pid);
/*
 * wrtmp does the real work of write requests for tmpfs.
 */

    struct caller_context *ct)

	pgcnt_t pageoffset;	/* offset in pages */
	ulong_t segmap_offset;	/* pagesize byte offset into segmap */
	caddr_t base;		/* base of segmap */
	ssize_t bytes;		/* bytes to uiomove */
	pfn_t pagenumber;	/* offset in pages into tmp file */
	int pagecreate;		/* == 1 if we allocated a page */
	rlim64_t limit = uio->uio_llimit;
	long oresid = uio->uio_resid;
	long tn_size_changed = 0;

	ASSERT(vp->v_type == VREG);

	TRACE_1(TR_FAC_TMPFS, TR_TMPFS_RWTMP_START,
	    "tmp_wrtmp_start:vp %p", vp);

	ASSERT(RW_WRITE_HELD(&tp->tn_contents));
	ASSERT(RW_WRITE_HELD(&tp->tn_rwlock));

	if (MANDLOCK(vp, tp->tn_mode)) {
		rw_exit(&tp->tn_contents);
		/*
		 * tmp_getattr ends up being called by chklock
		 */
		error = chklock(vp, FWRITE, uio->uio_loffset, uio->uio_resid,
		rw_enter(&tp->tn_contents, RW_WRITER);
			TRACE_2(TR_FAC_TMPFS, TR_TMPFS_RWTMP_END,
			    "tmp_wrtmp_end:vp %p error %d", vp, error);

	if (uio->uio_loffset < 0)

	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)

	if (uio->uio_loffset >= limit) {
		proc_t *p = ttoproc(curthread);

		mutex_enter(&p->p_lock);
		(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], p->p_rctls,
		    p, RCA_UNSAFE_SIGINFO);
		mutex_exit(&p->p_lock);

	if (uio->uio_loffset >= MAXOFF_T) {
		TRACE_2(TR_FAC_TMPFS, TR_TMPFS_RWTMP_END,
		    "tmp_wrtmp_end:vp %p error %d", vp, EINVAL);

	if (uio->uio_resid == 0) {
		TRACE_2(TR_FAC_TMPFS, TR_TMPFS_RWTMP_END,
		    "tmp_wrtmp_end:vp %p error %d", vp, 0);

	if (limit > MAXOFF_T)

		offset = (long)uio->uio_offset;
		pageoffset = offset & PAGEOFFSET;
		/*
		 * A maximum of PAGESIZE bytes of data is transferred
		 * each pass through this loop
		 */
		bytes = MIN(PAGESIZE - pageoffset, uio->uio_resid);

		if (offset + bytes >= limit) {
			if (offset >= limit) {
			bytes = limit - offset;

		pagenumber = btop(offset);

		/*
		 * delta is the amount of anonymous memory
		 * to reserve for the file.
		 * We always reserve in pagesize increments so
		 * unless we're extending the file into a new page,
		 * we don't need to call tmp_resv.
		 */
		delta = offset + bytes -
		    P2ROUNDUP_TYPED(tp->tn_size, PAGESIZE, uoff_t);
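		/*
		 * Illustrative arithmetic (hypothetical numbers): with
		 * PAGESIZE 0x1000 and tn_size 0x1800, P2ROUNDUP_TYPED()
		 * yields 0x2000, the end of the last page already backed by
		 * reserved anonymous memory.  A write ending at 0x1c00 gives
		 * delta <= 0 (no new reservation needed), while one ending
		 * at 0x2400 gives delta 0x400, so tmp_resv() below has to
		 * reserve another page.
		 */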
		if (tmp_resv(tm, tp, delta, pagecreate)) {
			/*
			 * Log file system full in the zone that owns
			 * the tmpfs mount, as well as in the global
			 */
			zcmn_err(tm->tm_vfsp->vfs_zone->zone_id,
			    CE_WARN, "%s: File system full, "
			    "swap space limit exceeded",
			if (tm->tm_vfsp->vfs_zone->zone_id !=
				vfs_t *vfs = tm->tm_vfsp;

				zcmn_err(GLOBAL_ZONEID,
				    CE_WARN, "%s: File system full, "
				    "swap space limit exceeded",
				    vfs->vfs_vnodecovered->v_path);

		tmpnode_growmap(tp, (ulong_t)offset + bytes);

		/* grow the file to the new length */
		if (offset + bytes > tp->tn_size) {
			old_tn_size = tp->tn_size;
			/*
			 * Postpone updating tp->tn_size until uiomove() is
			 */
			new_tn_size = offset + bytes;

		if (bytes == PAGESIZE) {
			/*
			 * Writing whole page so reading from disk
			 */

			/*
			 * If writing past EOF or filling in a hole
			 * we need to allocate an anon slot.
			 */
			if (anon_get_ptr(tp->tn_anon, pagenumber) == NULL) {
				(void) anon_set_ptr(tp->tn_anon, pagenumber,
				    anon_alloc(vp, ptob(pagenumber)),
				    ANON_SLEEP);

		/*
		 * We have to drop the contents lock to allow the VM
		 * system to reacquire it in tmp_getpage()
		 */
		rw_exit(&tp->tn_contents);

		/*
		 * Touch the page and fault it in if it is not in core
		 * before segmap_getmapflt or vpm_data_copy can lock it.
		 * This is to avoid the deadlock if the buffer is mapped
		 * to the same file through mmap which we want to write.
		 */
		uio_prefaultpages((long)bytes, uio);
			/*
			 * Copy data. If new pages are created, part of
			 * the page that is not written will be initialized
			 */
			error = vpm_data_copy(vp, offset, bytes, uio,
			    !pagecreate, &newpage, 1, S_WRITE);

			/* Get offset within the segmap mapping */
			segmap_offset = (offset & PAGEMASK) & MAXBOFFSET;
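			/*
			 * A note on the masking above: offset & PAGEMASK
			 * rounds the file offset down to a page boundary, and
			 * masking that with MAXBOFFSET keeps only the byte
			 * offset within the MAXBSIZE-sized segmap window that
			 * segmap_getmapflt() is asked to map at
			 * (offset & MAXBMASK) below, so base + segmap_offset
			 * points at the start of the target page inside that
			 * window.
			 */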
			base = segmap_getmapflt(segkmap, vp,
			    (offset & MAXBMASK), PAGESIZE, !pagecreate,

			if (!vpm_enable && pagecreate) {
				/*
				 * segmap_pagecreate() returns 1 if it calls
				 * page_create_va() to allocate any pages.
				 */
				newpage = segmap_pagecreate(segkmap,
				    base + segmap_offset, (size_t)PAGESIZE, 0);
				/*
				 * Clear from the beginning of the page to the
				 * starting offset of the data.
				 */
				(void) kzero(base + segmap_offset,

			error = uiomove(base + segmap_offset + pageoffset,
			    (long)bytes, UIO_WRITE, uio);

			if (!vpm_enable && pagecreate &&
			    uio->uio_offset <
			    P2ROUNDUP(offset + bytes, PAGESIZE)) {
				long zoffset; /* zero from offset into page */
				/*
				 * We created pages w/o initializing them
				 * completely, thus we need to zero the part
				 * that wasn't set up.
				 * This happens on most EOF write cases and if
				 * we had some sort of error during the uiomove.
				 */
				nmoved = uio->uio_offset - offset;
				ASSERT((nmoved + pageoffset) <= PAGESIZE);

				/*
				 * Zero from the end of data in the page to the
				 */
				if ((zoffset = pageoffset + nmoved) < PAGESIZE)
					(void) kzero(base + segmap_offset +
					    zoffset, (size_t)PAGESIZE - zoffset);
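				/*
				 * Illustrative arithmetic (hypothetical
				 * numbers): if the write began at pageoffset
				 * 0x200 within the page and uiomove() advanced
				 * nmoved = 0x300 bytes before stopping,
				 * zoffset is 0x500 and the remaining
				 * PAGESIZE - 0x500 bytes of the newly created
				 * page are zeroed so no stale data is exposed.
				 */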
			/*
			 * Unlock the pages which have been allocated by
			 * page_create_va() in segmap_pagecreate()
			 */
			if (!vpm_enable && newpage) {
				segmap_pageunlock(segkmap,
				    base + segmap_offset,
				    (size_t)PAGESIZE, S_WRITE);

			/*
			 * If we failed on a write, we must
			 * be sure to invalidate any pages that may have
			 */
				(void) vpm_sync_pages(vp, offset, PAGESIZE,
				(void) segmap_release(segkmap, base, SM_INVAL);
				error = vpm_sync_pages(vp, offset, PAGESIZE,
				error = segmap_release(segkmap, base, 0);

		/*
		 * Re-acquire contents lock.
		 */
		rw_enter(&tp->tn_contents, RW_WRITER);

			tp->tn_size = new_tn_size;
		/*
		 * If the uiomove failed, fix up tn_size.
		 */
			if (tn_size_changed) {
				/*
				 * The uiomove failed, and we
				 * allocated blocks, so get rid
				 */
				(void) tmpnode_trunc(tm, tp,
				    (ulong_t)old_tn_size);

		/*
		 * XXX - Can this be out of the loop?
		 */
		if ((tp->tn_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) &&
		    (tp->tn_mode & (S_ISUID | S_ISGID)) &&
		    secpolicy_vnode_setid_retain(cr,
		    (tp->tn_mode & S_ISUID) != 0 && tp->tn_uid == 0)) {
			/*
			 * Clear Set-UID & Set-GID bits on
			 * successful write if not privileged
			 * and at least one of the execute bits
			 * is set. If we always clear Set-GID,
			 * mandatory file and record locking is
			 */
			tp->tn_mode &= ~(S_ISUID | S_ISGID);
	} while (error == 0 && uio->uio_resid > 0 && bytes != 0);

	/*
	 * If we've already done a partial-write, terminate
	 * the write but return no error.
	 */
	if (oresid != uio->uio_resid)

	TRACE_2(TR_FAC_TMPFS, TR_TMPFS_RWTMP_END,
	    "tmp_wrtmp_end:vp %p error %d", vp, error);
/*
 * rdtmp does the real work of read requests for tmpfs.
 */

    struct caller_context *ct)

	ulong_t pageoffset;	/* offset in tmpfs file (uio_offset) */
	ulong_t segmap_offset;	/* pagesize byte offset into segmap */
	caddr_t base;		/* base of segmap */
	ssize_t bytes;		/* bytes to uiomove */
	long oresid = uio->uio_resid;

	TRACE_1(TR_FAC_TMPFS, TR_TMPFS_RWTMP_START, "tmp_rdtmp_start:vp %p",

	ASSERT(RW_LOCK_HELD(&tp->tn_contents));

	if (MANDLOCK(vp, tp->tn_mode)) {
		rw_exit(&tp->tn_contents);
		/*
		 * tmp_getattr ends up being called by chklock
		 */
		error = chklock(vp, FREAD, uio->uio_loffset, uio->uio_resid,
		rw_enter(&tp->tn_contents, RW_READER);
			TRACE_2(TR_FAC_TMPFS, TR_TMPFS_RWTMP_END,
			    "tmp_rdtmp_end:vp %p error %d", vp, error);

	ASSERT(tp->tn_type == VREG);

	if (uio->uio_loffset >= MAXOFF_T) {
		TRACE_2(TR_FAC_TMPFS, TR_TMPFS_RWTMP_END,
		    "tmp_rdtmp_end:vp %p error %d", vp, EINVAL);

	if (uio->uio_loffset < 0)

	if (uio->uio_resid == 0) {
		TRACE_2(TR_FAC_TMPFS, TR_TMPFS_RWTMP_END,
		    "tmp_rdtmp_end:vp %p error %d", vp, 0);

		offset = uio->uio_offset;
		pageoffset = offset & PAGEOFFSET;
		bytes = MIN(PAGESIZE - pageoffset, uio->uio_resid);
		diff = tp->tn_size - offset;
		/*
		 * We have to drop the contents lock to allow the VM system
		 * to reacquire it in tmp_getpage() should the uiomove cause a
		 */
		rw_exit(&tp->tn_contents);

			error = vpm_data_copy(vp, offset, bytes, uio, 1, NULL,
			segmap_offset = (offset & PAGEMASK) & MAXBOFFSET;
			base = segmap_getmapflt(segkmap, vp, offset & MAXBMASK,

			error = uiomove(base + segmap_offset + pageoffset,
			    (long)bytes, UIO_READ, uio);

				(void) vpm_sync_pages(vp, offset, PAGESIZE, 0);
				(void) segmap_release(segkmap, base, 0);
				error = vpm_sync_pages(vp, offset, PAGESIZE,
				error = segmap_release(segkmap, base, 0);

		/*
		 * Re-acquire contents lock.
		 */
		rw_enter(&tp->tn_contents, RW_READER);

	} while (error == 0 && uio->uio_resid > 0);

	gethrestime(&tp->tn_atime);

	/*
	 * If we've already done a partial read, terminate
	 * the read but return no error.
	 */
	if (oresid != uio->uio_resid)

	TRACE_2(TR_FAC_TMPFS, TR_TMPFS_RWTMP_END,
	    "tmp_rdtmp_end:vp %x error %d", vp, error);
tmp_read(struct vnode *vp, struct uio *uiop, int ioflag, cred_t *cred,
    struct caller_context *ct)

	struct tmpnode *tp = (struct tmpnode *)VTOTN(vp);
	struct tmount *tm = (struct tmount *)VTOTM(vp);

	/*
	 * We don't currently support reading non-regular files
	 */
	if (vp->v_type == VDIR)
	if (vp->v_type != VREG)

	/*
	 * tmp_rwlock should have already been called from layers above
	 */
	ASSERT(RW_READ_HELD(&tp->tn_rwlock));

	rw_enter(&tp->tn_contents, RW_READER);

	error = rdtmp(tm, tp, uiop, ct);

	rw_exit(&tp->tn_contents);
tmp_write(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cred,
    struct caller_context *ct)

	struct tmpnode *tp = (struct tmpnode *)VTOTN(vp);
	struct tmount *tm = (struct tmount *)VTOTM(vp);

	/*
	 * We don't currently support writing to non-regular files
	 */
	if (vp->v_type != VREG)
		return (EINVAL);	/* XXX EISDIR? */

	/*
	 * tmp_rwlock should have already been called from layers above
	 */
	ASSERT(RW_WRITE_HELD(&tp->tn_rwlock));

	rw_enter(&tp->tn_contents, RW_WRITER);

	if (ioflag & FAPPEND) {
		/*
		 * In append mode start at end of file.
		 */
		uiop->uio_loffset = tp->tn_size;

	error = wrtmp(tm, tp, uiop, cred, ct);

	rw_exit(&tp->tn_contents);
    caller_context_t *ct)

    caller_context_t *ct)

	struct tmpnode *tp = (struct tmpnode *)VTOTN(vp);

	/*
	 * A special case to handle the root tnode on a diskless nfs
	 * client who may have had its uid and gid inherited
	 * from an nfs vnode with nobody ownership. Likely the
	 * root filesystem. After nfs is fully functional the uid/gid
	 * may be mappable so ask again.
	 * vfsp can't get unmounted because we hold vp.
	 */
	if (vp->v_flag & VROOT &&
	    (mvp = vp->v_vfsp->vfs_vnodecovered) != NULL) {
		mutex_enter(&tp->tn_tlock);
		if (tp->tn_uid == UID_NOBODY || tp->tn_gid == GID_NOBODY) {
			mutex_exit(&tp->tn_tlock);
			bzero(&va, sizeof (struct vattr));
			va.va_mask = AT_UID|AT_GID;
			attrs = fop_getattr(mvp, &va, 0, cred, ct);
			mutex_exit(&tp->tn_tlock);

	mutex_enter(&tp->tn_tlock);
		tp->tn_uid = va.va_uid;
		tp->tn_gid = va.va_gid;

	vap->va_type = vp->v_type;
	vap->va_mode = tp->tn_mode & MODEMASK;
	vap->va_uid = tp->tn_uid;
	vap->va_gid = tp->tn_gid;
	vap->va_fsid = tp->tn_fsid;
	vap->va_nodeid = (ino64_t)tp->tn_nodeid;
	vap->va_nlink = tp->tn_nlink;
	vap->va_size = (uoff_t)tp->tn_size;
	vap->va_atime = tp->tn_atime;
	vap->va_mtime = tp->tn_mtime;
	vap->va_ctime = tp->tn_ctime;
	vap->va_blksize = PAGESIZE;
	vap->va_rdev = tp->tn_rdev;
	vap->va_seq = tp->tn_seq;

	/*
	 * XXX Holes are not taken into account. We could take the time to
	 * run through the anon array looking for allocated slots...
	 */
	vap->va_nblocks = (fsblkcnt64_t)btodb(ptob(btopr(vap->va_size)));
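	/*
	 * For reference: btopr() rounds the byte size up to whole pages,
	 * ptob() converts that back to bytes, and btodb() expresses the
	 * result in 512-byte disk blocks, so (for example) a one-byte file
	 * on a 4K-page system reports 8 blocks here.
	 */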
	mutex_exit(&tp->tn_tlock);
    caller_context_t *ct)

	struct tmount *tm = (struct tmount *)VTOTM(vp);
	struct tmpnode *tp = (struct tmpnode *)VTOTN(vp);

	/*
	 * Cannot set these attributes
	 */
	if ((vap->va_mask & AT_NOSET) || (vap->va_mask & AT_XVATTR))

	mutex_enter(&tp->tn_tlock);

	/*
	 * Change file access modes. Must be owner or have sufficient
	 */
	error = secpolicy_vnode_setattr(cred, vp, vap, get, flags, tmp_taccess,

	if (mask & AT_MODE) {
		get->va_mode &= S_IFMT;
		get->va_mode |= vap->va_mode & ~S_IFMT;

		get->va_uid = vap->va_uid;
		get->va_gid = vap->va_gid;
		get->va_atime = vap->va_atime;
		get->va_mtime = vap->va_mtime;

	if (mask & (AT_UID | AT_GID | AT_MODE | AT_MTIME))
		gethrestime(&tp->tn_ctime);

	if (mask & AT_SIZE) {
		ASSERT(vp->v_type != VDIR);

		/* Don't support large files. */
		if (vap->va_size > MAXOFF_T) {
		mutex_exit(&tp->tn_tlock);

		rw_enter(&tp->tn_rwlock, RW_WRITER);
		rw_enter(&tp->tn_contents, RW_WRITER);
		error = tmpnode_trunc(tm, tp, (ulong_t)vap->va_size);
		rw_exit(&tp->tn_contents);
		rw_exit(&tp->tn_rwlock);

		if (error == 0 && vap->va_size == 0)
			vnevent_truncate(vp, ct);

	mutex_exit(&tp->tn_tlock);
    caller_context_t *ct)

	struct tmpnode *tp = (struct tmpnode *)VTOTN(vp);

	mutex_enter(&tp->tn_tlock);
	error = tmp_taccess(tp, mode, cred);
	mutex_exit(&tp->tn_tlock);
    struct pathname *pnp,
    caller_context_t *ct,

	struct tmpnode *tp = (struct tmpnode *)VTOTN(dvp);
	struct tmpnode *ntp = NULL;

	/* allow cd into @ dir */
	if (flags & LOOKUP_XATTR) {
		/*
		 * don't allow attributes if not mounted XATTR support
		 */
		if (!(dvp->v_vfsp->vfs_flag & VFS_XATTR))

		if (tp->tn_flags & ISXATTR)
			/* No attributes on attributes */

		rw_enter(&tp->tn_rwlock, RW_WRITER);
		if (tp->tn_xattrdp == NULL) {
			if (!(flags & CREATE_XATTR_DIR)) {
				rw_exit(&tp->tn_rwlock);

			/*
			 * No attribute directory exists for this
			 * node - create the attr dir as a side effect
			 */

			/*
			 * Make sure we have adequate permission...
			 */
			if ((error = tmp_taccess(tp, VWRITE, cred)) != 0) {
				rw_exit(&tp->tn_rwlock);

			xdp = tmp_memalloc(sizeof (struct tmpnode),
			tmpnode_init(tm, xdp, &tp->tn_attr, NULL);
			/*
			 * Fix-up fields unique to attribute directories.
			 */
			xdp->tn_flags = ISXATTR;
			if (tp->tn_type == VDIR) {
				xdp->tn_mode = tp->tn_attr.va_mode;
				if (tp->tn_attr.va_mode & 0040)
					xdp->tn_mode |= 0750;
				if (tp->tn_attr.va_mode & 0004)
					xdp->tn_mode |= 0705;
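				/*
				 * In other words, the hidden attribute
				 * directory is created owner-rwx, and group
				 * or "other" read permission on the
				 * underlying node grants the matching class
				 * read/search (r-x) access to it as well.
				 */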
			xdp->tn_vnode->v_type = VDIR;
			xdp->tn_vnode->v_flag |= V_XATTRDIR;

			tp->tn_xattrdp = xdp;

		VN_HOLD(tp->tn_xattrdp->tn_vnode);

		*vpp = TNTOV(tp->tn_xattrdp);
		rw_exit(&tp->tn_rwlock);

	/*
	 * Null component name is a synonym for directory being searched.
	 */

	error = tdirlookup(tp, nm, &ntp, cred);

		/*
		 * If vnode is a device return special vnode instead
		 */
		if (IS_DEVVP(*vpp)) {

			newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type,

	TRACE_4(TR_FAC_TMPFS, TR_TMPFS_LOOKUP,
	    "tmpfs lookup:vp %p name %s vpp %p error %d",
	    dvp, nm, vpp, error);
    enum vcexcl exclusive,
    caller_context_t *ct,

	struct tmpnode *parent;
	struct tmpnode *self;
	struct tmpnode *oldtp;

	parent = (struct tmpnode *)VTOTN(dvp);
	tm = (struct tmount *)VTOTM(dvp);

	/* device files not allowed in ext. attr dirs */
	if ((parent->tn_flags & ISXATTR) &&
	    (vap->va_type == VBLK || vap->va_type == VCHR ||
	    vap->va_type == VFIFO || vap->va_type == VDOOR ||
	    vap->va_type == VSOCK || vap->va_type == VPORT))

	if (vap->va_type == VREG && (vap->va_mode & VSVTX)) {
		/* Must be privileged to set sticky bit */
		if (secpolicy_vnode_stky_modify(cred))
			vap->va_mode &= ~VSVTX;
	} else if (vap->va_type == VNON) {

	/*
	 * Null component name is a synonym for directory being searched.
	 */

	error = tdirlookup(parent, nm, &oldtp, cred);

	if (error == 0) {	/* name found */
		boolean_t trunc = B_FALSE;

		rw_enter(&oldtp->tn_rwlock, RW_WRITER);

		/*
		 * if create/read-only an existing
		 * directory, allow it
		 */
		if (exclusive == EXCL)
		else if ((oldtp->tn_type == VDIR) && (mode & VWRITE))
			error = tmp_taccess(oldtp, mode, cred);

			rw_exit(&oldtp->tn_rwlock);
			tmpnode_rele(oldtp);

		*vpp = TNTOV(oldtp);
		if ((*vpp)->v_type == VREG && (vap->va_mask & AT_SIZE) &&
		    vap->va_size == 0) {
			rw_enter(&oldtp->tn_contents, RW_WRITER);
			(void) tmpnode_trunc(tm, oldtp, 0);
			rw_exit(&oldtp->tn_contents);

		rw_exit(&oldtp->tn_rwlock);
		if (IS_DEVVP(*vpp)) {
			struct vnode *newvp;

			newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type,
			if (newvp == NULL) {

		vnevent_create(*vpp, ct);

	if (error != ENOENT)

	rw_enter(&parent->tn_rwlock, RW_WRITER);
	error = tdirenter(tm, parent, nm, DE_CREATE, NULL, NULL, vap, &self,
	rw_exit(&parent->tn_rwlock);

	if (error == EEXIST) {
		/*
		 * This means that the file was created sometime
		 * after we checked and did not find it and when
		 * we went to create it.
		 * Since creat() is supposed to truncate a file
		 * that already exists go back to the beginning
		 * of the function. This time we will find it
		 * and go down the tmp_trunc() path
		 */
	if (!error && IS_DEVVP(*vpp)) {
		struct vnode *newvp;

		newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cred);

	TRACE_3(TR_FAC_TMPFS, TR_TMPFS_CREATE,
	    "tmpfs create:dvp %p nm %s vpp %p", dvp, nm, vpp);
    caller_context_t *ct,

	struct tmpnode *parent = (struct tmpnode *)VTOTN(dvp);
	struct tmpnode *tp = NULL;

	error = tdirlookup(parent, nm, &tp, cred);

	rw_enter(&parent->tn_rwlock, RW_WRITER);
	rw_enter(&tp->tn_rwlock, RW_WRITER);

	error = (tp->tn_type == VDIR) ? EPERM :
	    tdirdelete(parent, tp, nm, DR_REMOVE, cred);

	rw_exit(&tp->tn_rwlock);
	rw_exit(&parent->tn_rwlock);
	vnevent_remove(TNTOV(tp), dvp, nm, ct);

	TRACE_3(TR_FAC_TMPFS, TR_TMPFS_REMOVE,
	    "tmpfs remove:dvp %p nm %s error %d", dvp, nm, error);
    struct vnode *srcvp,
    caller_context_t *ct,

	struct tmpnode *parent;
	struct tmpnode *from;
	struct tmount *tm = (struct tmount *)VTOTM(dvp);
	struct tmpnode *found = NULL;
	struct vnode *realvp;

	if (fop_realvp(srcvp, &realvp, ct) == 0)

	parent = (struct tmpnode *)VTOTN(dvp);
	from = (struct tmpnode *)VTOTN(srcvp);

	if (srcvp->v_type == VDIR ||
	    (from->tn_uid != crgetuid(cred) && secpolicy_basic_link(cred)))

	/*
	 * Make sure link for extended attributes is valid
	 * We only support hard linking of xattr's in xattrdir to an xattrdir
	 */
	if ((from->tn_flags & ISXATTR) != (parent->tn_flags & ISXATTR))

	error = tdirlookup(parent, tnm, &found, cred);
		tmpnode_rele(found);

	if (error != ENOENT)

	rw_enter(&parent->tn_rwlock, RW_WRITER);
	error = tdirenter(tm, parent, tnm, DE_LINK, NULL,
	    from, NULL, (struct tmpnode **)NULL, cred, ct);
	rw_exit(&parent->tn_rwlock);

	vnevent_link(srcvp, ct);
    struct vnode *odvp,		/* source parent vnode */
    char *onm,			/* source name */
    struct vnode *ndvp,		/* destination parent vnode */
    char *nnm,			/* destination name */
    caller_context_t *ct,

	struct tmpnode *fromparent;
	struct tmpnode *toparent;
	struct tmpnode *fromtp = NULL;	/* source tmpnode */
	struct tmpnode *totp;		/* target tmpnode */
	struct tmount *tm = (struct tmount *)VTOTM(odvp);
	int samedir = 0;		/* set if odvp == ndvp */
	struct vnode *realvp;

	if (fop_realvp(ndvp, &realvp, ct) == 0)

	fromparent = (struct tmpnode *)VTOTN(odvp);
	toparent = (struct tmpnode *)VTOTN(ndvp);

	if ((fromparent->tn_flags & ISXATTR) != (toparent->tn_flags & ISXATTR))

	mutex_enter(&tm->tm_renamelck);

	/*
	 * Look up tmpnode of file we're supposed to rename.
	 */
	error = tdirlookup(fromparent, onm, &fromtp, cred);
		mutex_exit(&tm->tm_renamelck);

	/*
	 * Make sure we can delete the old (source) entry. This
	 * requires write permission on the containing directory. If
	 * that directory is "sticky" it requires further checks.
	 */
	if (((error = tmp_taccess(fromparent, VWRITE, cred)) != 0) ||
	    (error = tmp_sticky_remove_access(fromparent, fromtp, cred)) != 0)

	/*
	 * Check for renaming to or from '.' or '..' or that
	 * fromtp == fromparent
	 */
	if ((onm[0] == '.' &&
	    (onm[1] == '\0' || (onm[1] == '.' && onm[2] == '\0'))) ||
	    (nnm[1] == '\0' || (nnm[1] == '.' && nnm[2] == '\0'))) ||
	    (fromparent == fromtp)) {

	samedir = (fromparent == toparent);

	/*
	 * Make sure we can search and rename into the new
	 * (destination) directory.
	 */
	error = tmp_taccess(toparent, VEXEC|VWRITE, cred);

	if (tdirlookup(toparent, nnm, &totp, cred) == 0) {
		vnevent_pre_rename_dest(TNTOV(totp), ndvp, nnm, ct);

	/* Notify the target dir. if not the same as the source dir. */
		vnevent_pre_rename_dest_dir(ndvp, TNTOV(fromtp), nnm, ct);

	vnevent_pre_rename_src(TNTOV(fromtp), odvp, onm, ct);

	/*
	 * Link source to new target
	 */
	rw_enter(&toparent->tn_rwlock, RW_WRITER);
	error = tdirenter(tm, toparent, nnm, DE_RENAME,
	    fromparent, fromtp, NULL, (struct tmpnode **)NULL, cred, ct);
	rw_exit(&toparent->tn_rwlock);

		/*
		 * ESAME isn't really an error; it indicates that the
		 * operation should not be done because the source and target
		 * are the same file, but that no error should be reported.
		 */

	/*
	 * Unlink from source.
	 */
	rw_enter(&fromparent->tn_rwlock, RW_WRITER);
	rw_enter(&fromtp->tn_rwlock, RW_WRITER);

	error = tdirdelete(fromparent, fromtp, onm, DR_RENAME, cred);

	/*
	 * The following handles the case where our source tmpnode was
	 * removed before we got to it.
	 *
	 * XXX We should also cleanup properly in the case where tdirdelete
	 * fails for some other reason. Currently this case shouldn't happen.
	 */
	if (error == ENOENT)

	rw_exit(&fromtp->tn_rwlock);
	rw_exit(&fromparent->tn_rwlock);

	vnevent_rename_src(TNTOV(fromtp), odvp, onm, ct);
	/*
	 * vnevent_rename_dest is called in tdirenter().
	 * Notify the target dir if not same as source dir.
	 */
		vnevent_rename_dest_dir(ndvp, ct);

	tmpnode_rele(fromtp);
	mutex_exit(&tm->tm_renamelck);

	TRACE_5(TR_FAC_TMPFS, TR_TMPFS_RENAME,
	    "tmpfs rename:ovp %p onm %s nvp %p nnm %s error %d", odvp, onm,
    caller_context_t *ct,

	struct tmpnode *parent = (struct tmpnode *)VTOTN(dvp);
	struct tmpnode *self = NULL;
	struct tmount *tm = (struct tmount *)VTOTM(dvp);

	/* no new dirs allowed in xattr dirs */
	if (parent->tn_flags & ISXATTR)

	/*
	 * Might be dangling directory. Catch it here,
	 * because an ENOENT return from tdirlookup() is
	 */
	if (parent->tn_nlink == 0)

	error = tdirlookup(parent, nm, &self, cred);

	if (error != ENOENT)

	rw_enter(&parent->tn_rwlock, RW_WRITER);
	error = tdirenter(tm, parent, nm, DE_MKDIR, NULL,
	    NULL, va, &self, cred, ct);
		rw_exit(&parent->tn_rwlock);

	rw_exit(&parent->tn_rwlock);
*ct
,
1377 struct tmpnode
*parent
= (struct tmpnode
*)VTOTN(dvp
);
1378 struct tmpnode
*self
= NULL
;
1383 * Return error when removing . and ..
1385 if (strcmp(nm
, ".") == 0)
1387 if (strcmp(nm
, "..") == 0)
1388 return (EEXIST
); /* Should be ENOTEMPTY */
1389 error
= tdirlookup(parent
, nm
, &self
, cred
);
1393 rw_enter(&parent
->tn_rwlock
, RW_WRITER
);
1394 rw_enter(&self
->tn_rwlock
, RW_WRITER
);
1397 if (vp
== dvp
|| vp
== cdir
) {
1401 if (self
->tn_type
!= VDIR
) {
1406 mutex_enter(&self
->tn_tlock
);
1407 if (self
->tn_nlink
> 2) {
1408 mutex_exit(&self
->tn_tlock
);
1412 mutex_exit(&self
->tn_tlock
);
1414 if (vn_vfswlock(vp
)) {
1418 if (vn_mountedvfs(vp
) != NULL
) {
1424 * Check for an empty directory
1425 * i.e. only includes entries for "." and ".."
1427 if (self
->tn_dirents
> 2) {
1428 error
= EEXIST
; /* SIGH should be ENOTEMPTY */
1430 * Update atime because checking tn_dirents is logically
1431 * equivalent to reading the directory
1433 gethrestime(&self
->tn_atime
);
1437 error
= tdirdelete(parent
, self
, nm
, DR_RMDIR
, cred
);
1441 rw_exit(&self
->tn_rwlock
);
1442 rw_exit(&parent
->tn_rwlock
);
1443 vnevent_rmdir(TNTOV(self
), dvp
, nm
, ct
);
    caller_context_t *ct,

	struct tmpnode *tp = (struct tmpnode *)VTOTN(vp);
	struct tdirent *tdp;
	struct dirent64 *dp;
	ulong_t total_bytes_wanted;

	if (uiop->uio_loffset >= MAXOFF_T) {

	/*
	 * assuming system call has already called tmp_rwlock
	 */
	ASSERT(RW_READ_HELD(&tp->tn_rwlock));

	if (uiop->uio_iovcnt != 1)

	if (vp->v_type != VDIR)

	/*
	 * There's a window here where someone could have removed
	 * all the entries in the directory after we put a hold on the
	 * vnode but before we grabbed the rwlock. Just return.
	 */
	if (tp->tn_dir == NULL) {
			panic("empty directory 0x%p", (void *)tp);

	/*
	 * Get space for multiple directory entries
	 */
	total_bytes_wanted = uiop->uio_iov->iov_len;
	bufsize = total_bytes_wanted + sizeof (struct dirent64);
	outbuf = kmem_alloc(bufsize, KM_SLEEP);

	dp = (struct dirent64 *)outbuf;

		namelen = strlen(tdp->td_name);	/* no +1 needed */
		offset = tdp->td_offset;
		if (offset >= uiop->uio_offset) {
			reclen = (int)DIRENT64_RECLEN(namelen);
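			/*
			 * DIRENT64_RECLEN() accounts for the fixed dirent64
			 * header plus the name and its terminating NUL,
			 * rounded up so that successive entries packed into
			 * outbuf stay 8-byte aligned.
			 */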
			if (outcount + reclen > total_bytes_wanted) {
				/*
				 * Buffer too small for any entries.
				 */

			ASSERT(tdp->td_tmpnode != NULL);

			/* use strncpy(9f) to zero out uninitialized bytes */

			(void) strncpy(dp->d_name, tdp->td_name,
			    DIRENT64_NAMELEN(reclen));
			dp->d_reclen = (ushort_t)reclen;
			dp->d_ino = (ino64_t)tdp->td_tmpnode->tn_nodeid;
			dp->d_off = (offset_t)tdp->td_offset + 1;
			dp = (struct dirent64 *)
			    ((uintptr_t)dp + dp->d_reclen);

			ASSERT(outcount <= bufsize);

	error = uiomove(outbuf, outcount, UIO_READ, uiop);

	/* If we reached the end of the list our offset */
	/* should now be just past the end. */

	uiop->uio_offset = offset;
	gethrestime(&tp->tn_atime);
	kmem_free(outbuf, bufsize);
    caller_context_t *ct,

	struct tmpnode *parent = (struct tmpnode *)VTOTN(dvp);
	struct tmpnode *self = NULL;
	struct tmount *tm = (struct tmount *)VTOTM(dvp);

	/* no symlinks allowed to files in xattr dirs */
	if (parent->tn_flags & ISXATTR)

	error = tdirlookup(parent, lnm, &self, cred);
		/*
		 * The entry already exists
		 */
		return (EEXIST);	/* was 0 */

	if (error != ENOENT) {

	rw_enter(&parent->tn_rwlock, RW_WRITER);
	error = tdirenter(tm, parent, lnm, DE_CREATE, NULL,
	    NULL, tva, &self, cred, ct);
	rw_exit(&parent->tn_rwlock);

	len = strlen(tnm) + 1;
	cp = tmp_memalloc(len, 0);

	(void) strcpy(cp, tnm);
	self->tn_symlink = cp;
	self->tn_size = len - 1;
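	/*
	 * Note that tn_size for a symlink is the length of the target string
	 * without the terminating NUL; tmp_readlink() below copies exactly
	 * tn_size bytes out of tn_symlink.
	 */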
    caller_context_t *ct)

	struct tmpnode *tp = (struct tmpnode *)VTOTN(vp);

	if (vp->v_type != VLNK)

	rw_enter(&tp->tn_rwlock, RW_READER);
	rw_enter(&tp->tn_contents, RW_READER);
	error = uiomove(tp->tn_symlink, tp->tn_size, UIO_READ, uiop);
	gethrestime(&tp->tn_atime);
	rw_exit(&tp->tn_contents);
	rw_exit(&tp->tn_rwlock);

    caller_context_t *ct)
tmp_inactive(struct vnode *vp, struct cred *cred, caller_context_t *ct)

	struct tmpnode *tp = (struct tmpnode *)VTOTN(vp);
	struct tmount *tm = (struct tmount *)VFSTOTM(vp->v_vfsp);

	rw_enter(&tp->tn_rwlock, RW_WRITER);

	mutex_enter(&tp->tn_tlock);
	mutex_enter(&vp->v_lock);
	ASSERT(vp->v_count >= 1);

	/*
	 * If we don't have the last hold or the link count is non-zero,
	 * there's little to do -- just drop our hold.
	 */
	if (vp->v_count > 1 || tp->tn_nlink != 0) {
		mutex_exit(&vp->v_lock);
		mutex_exit(&tp->tn_tlock);
		rw_exit(&tp->tn_rwlock);

	/*
	 * We have the last hold *and* the link count is zero, so this
	 * tmpnode is dead from the filesystem's viewpoint. However,
	 * if the tmpnode has any pages associated with it (i.e. if it's
	 * a normal file with non-zero size), the tmpnode can still be
	 * discovered by pageout or fsflush via the page vnode pointers.
	 * In this case we must drop all our locks, truncate the tmpnode,
	 * and try the whole dance again.
	 */
	if (tp->tn_size != 0) {
		if (tp->tn_type == VREG) {
			mutex_exit(&vp->v_lock);
			mutex_exit(&tp->tn_tlock);
			rw_enter(&tp->tn_contents, RW_WRITER);
			(void) tmpnode_trunc(tm, tp, 0);
			rw_exit(&tp->tn_contents);
			ASSERT(tp->tn_size == 0);
			ASSERT(tp->tn_nblocks == 0);

		if (tp->tn_type == VLNK)
			tmp_memfree(tp->tn_symlink, tp->tn_size + 1);

	/*
	 * Remove normal file/dir's xattr dir and xattrs.
	 */
	if (tp->tn_xattrdp) {
		struct tmpnode *xtp = tp->tn_xattrdp;

		ASSERT(xtp->tn_flags & ISXATTR);
		rw_enter(&xtp->tn_rwlock, RW_WRITER);
		DECR_COUNT(&xtp->tn_nlink, &xtp->tn_tlock);
		tp->tn_xattrdp = NULL;
		rw_exit(&xtp->tn_rwlock);

	mutex_exit(&vp->v_lock);
	mutex_exit(&tp->tn_tlock);
	/* Here's our chance to send invalid event while we're between locks */
	vn_invalid(TNTOV(tp));
	mutex_enter(&tm->tm_contents);
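	/*
	 * Unlink the tmpnode from the per-mount list of tmpnodes (a doubly
	 * linked list rooted at tm_rootnode) before freeing it; if tn_forw
	 * is NULL this node is the tail, so the root's back pointer is
	 * updated instead.
	 */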
	if (tp->tn_forw == NULL)
		tm->tm_rootnode->tn_back = tp->tn_back;
	else
		tp->tn_forw->tn_back = tp->tn_back;
	tp->tn_back->tn_forw = tp->tn_forw;
	mutex_exit(&tm->tm_contents);
	rw_exit(&tp->tn_rwlock);
	rw_destroy(&tp->tn_rwlock);
	mutex_destroy(&tp->tn_tlock);

	tmp_memfree(tp, sizeof (struct tmpnode));
tmp_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ct)

	struct tmpnode *tp = (struct tmpnode *)VTOTN(vp);

	if (fidp->fid_len < (sizeof (struct tfid) - sizeof (ushort_t))) {
		fidp->fid_len = sizeof (struct tfid) - sizeof (ushort_t);

	tfid = (struct tfid *)fidp;
	bzero(tfid, sizeof (struct tfid));
	tfid->tfid_len = (int)sizeof (struct tfid) - sizeof (ushort_t);

	tfid->tfid_ino = tp->tn_nodeid;
	tfid->tfid_gen = tp->tn_gen;
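	/*
	 * The resulting fid packs just the node id and its generation
	 * number; consumers such as NFS hand it back later to re-derive the
	 * vnode, and the generation guards against a node id that has since
	 * been recycled for a different file.
	 */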
/*
 * Return all the pages from [off..off+len] in given file
 */

    caller_context_t *ct)

	struct tmpnode *tp = VTOTN(vp);
	anoff_t toff = (anoff_t)off;

	rw_enter(&tp->tn_contents, RW_READER);

	if (off + len > tp->tn_size + PAGEOFFSET) {

	/*
	 * Look for holes (no anon slot) in faulting range. If there are
	 * holes we have to switch to a write lock and fill them in. Swap
	 * space for holes was already reserved when the file was grown.
	 */
	if (non_anon(tp->tn_anon, btop(off), &tmpoff, &tlen)) {
		if (!rw_tryupgrade(&tp->tn_contents)) {
			rw_exit(&tp->tn_contents);
			rw_enter(&tp->tn_contents, RW_WRITER);
			/* Size may have changed when lock was dropped */
			if (off + len > tp->tn_size + PAGEOFFSET) {

		for (toff = (anoff_t)off; toff < (anoff_t)off + len;
			if (anon_get_ptr(tp->tn_anon, btop(toff)) == NULL) {
				/* XXX - may allocate mem w. write lock held */
				(void) anon_set_ptr(tp->tn_anon, btop(toff),
				    anon_alloc(vp, toff), ANON_SLEEP);

		rw_downgrade(&tp->tn_contents);

	err = pvn_getpages(tmp_getapage, vp, (uoff_t)off, len, protp,
	    pl, plsz, seg, addr, rw, cr);

	rw_exit(&tp->tn_contents);
/*
 * Called from pvn_getpages to get a particular page.
 */

	if (pp = page_lookup(&vp->v_object, off,
	    rw == S_CREATE ? SE_EXCL : SE_SHARED)) {

		pp = page_create_va(&vp->v_object, off, PAGESIZE,
		    PG_WAIT | PG_EXCL, seg, addr);
		/*
		 * Someone raced in and created the page after we did the
		 * lookup but before we did the create, so go back and
		 * try to look it up again.
		 */

		/*
		 * Fill page from backing store, if any. If none, then
		 * either this is a newly filled hole or page must have
		 * been unmodified and freed so just zero it out.
		 */
		err = swap_getphysname(vp, off, &pvp, &poff);
			panic("tmp_getapage: no anon slot vp %p "
			    "off %llx pp %p\n", (void *)vp, off, (void *)pp);

			flags = (pl == NULL ? B_ASYNC|B_READ : B_READ);
			err = fop_pageio(pvp, pp, (uoff_t)poff, PAGESIZE,
			if (flags & B_ASYNC)
		} else if (rw != S_CREATE) {
			pagezero(pp, 0, PAGESIZE);

			pvn_read_done(pp, B_ERROR);
		pvn_plist_init(pp, pl, plsz, off, PAGESIZE, rw);
/*
 * Flags are composed of {B_INVAL, B_DIRTY, B_FREE, B_DONTNEED}.
 * If len == 0, do from off to EOF.
 */
static int tmp_nopage = 0;	/* Don't do tmp_putpage's if set */
    register struct vnode *vp,
    caller_context_t *ct)

	register page_t *pp;
	struct tmpnode *tp = VTOTN(vp);

	ASSERT(vp->v_count != 0);

	if (vp->v_flag & VNOMAP)

	/*
	 * This being tmpfs, we don't ever do i/o unless we really
	 * have to (when we're low on memory and pageout calls us
	 * with B_ASYNC | B_FREE or the user explicitly asks for it with
	 *
	 * XXX to approximately track the mod time like ufs we should
	 * update the times here. The problem is, once someone does a
	 * store we never clear the mod bit and do i/o, thus fsflush
	 * will keep calling us every 30 seconds to do the i/o and we'll
	 * continually update the mod time. At least we update the mod
	 * time on the first store because this results in a call to getpage.
	 */
	if (flags != (B_ASYNC | B_FREE) && (flags & B_INVAL) == 0 &&
	    (flags & B_DONTNEED) == 0)

	/*
	 * If this thread owns the lock, i.e., this thread grabbed it
	 * as writer somewhere above, then we don't need to grab the
	 * lock as reader in this routine.
	 */
	dolock = (rw_owner(&tp->tn_contents) != curthread);

	/*
	 * If this is pageout don't block on the lock as you could deadlock
	 * when freemem == 0 (another thread has the read lock and is blocked
	 * creating a page, and a third thread is waiting to get the writers
	 * lock - waiting writers priority blocks us from getting the read
	 * lock). Of course, if the only freeable pages are on this tmpnode
	 * we're hosed anyways. A better solution might be a new lock type.
	 * Note: ufs has the same problem.
	 */
	if (curproc == proc_pageout) {
		if (!rw_tryenter(&tp->tn_contents, RW_READER))
		rw_enter(&tp->tn_contents, RW_READER);

	if (!vn_has_cached_data(vp))

		if (curproc == proc_pageout) {
			panic("tmp: pageout can't block");

		/* Search the entire vp list for pages >= off. */
		err = pvn_vplist_dirty(vp, (uoff_t)off, tmp_putapage,

		/*
		 * Loop over all offsets in the range [off...off + len]
		 * looking for pages to deal with.
		 */
		eoff = MIN(off + len, tp->tn_size);
		for (io_off = off; io_off < eoff; io_off += io_len) {
			/*
			 * If we are not invalidating, synchronously
			 * freeing or writing pages use the routine
			 * page_lookup_nowait() to prevent reclaiming
			 * them from the free list.
			 */
			if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
				pp = page_lookup(&vp->v_object, io_off,
				    (flags & (B_INVAL | B_FREE)) ?
				    SE_EXCL : SE_SHARED);
				pp = page_lookup_nowait(&vp->v_object,
				    (flags & B_FREE) ? SE_EXCL : SE_SHARED);

			if (pp == NULL || pvn_getdirty(pp, flags) == 0)

			err = tmp_putapage(vp, pp, &io_off, &io_len,

	/* If invalidating, verify all pages on vnode list are gone. */
	if (err == 0 && off == 0 && len == 0 &&
	    (flags & B_INVAL) && vn_has_cached_data(vp)) {
		panic("tmp_putpage: B_INVAL, pages not gone");

	if ((curproc == proc_pageout) || dolock)
		rw_exit(&tp->tn_contents);
	/*
	 * Only reason putapage is going to give us SE_NOSWAP as error
	 * is when we ask a page to be written to physical backing store
	 * and there is none. Ignore this because we might be dealing
	 * with a swap page which does not have any backing store
	 * on disk. In any other case we won't get this error over here.
	 */
	if (err == SE_NOSWAP)
long tmp_putpagecnt, tmp_pagespushed;
/*
 * Write out a single page.
 * For tmpfs this means choose a physical swap slot and write the page
 * out using fop_pageio. For performance, we attempt to kluster; i.e.,
 * we try to find a bunch of other dirty pages adjacent in the file
 * and a bunch of contiguous swap slots, and then write all the pages
 * out in a single i/o.
 */

	ulong_t klstart, kllen;
	page_t *pplist, *npplist;
	extern int klustsize;
	size_t pp_off, pp_len;

	ASSERT(PAGE_LOCKED(pp));

	/* Kluster in tmp_klustsize chunks */
	tmp_klustsize = klustsize;
	offset = pp->p_offset;
	klstart = (offset / tmp_klustsize) * tmp_klustsize;
	kllen = MIN(tmp_klustsize, tp->tn_size - klstart);
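	/*
	 * Illustrative arithmetic (hypothetical numbers): with a
	 * tmp_klustsize of 0x10000 and a dirty page at offset 0x25000,
	 * klstart becomes 0x20000, so the kluster window covers
	 * [0x20000, 0x20000 + kllen) and never extends past tn_size.
	 */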
	/* Get a kluster of pages */
	pvn_write_kluster(vp, pp, &tmpoff, &pp_len, klstart, kllen, flags);

	pp_off = (size_t)tmpoff;

	/*
	 * Get a cluster of physical offsets for the pages; the amount we
	 * get may be some subrange of what we ask for (io_off, io_len).
	 */
	err = swap_newphysname(vp, offset, &io_off, &io_len, &pvp, &pstart);
	ASSERT(err != SE_NOANON);	/* anon slot must have been filled */

		pvn_write_done(pplist, B_ERROR | B_WRITE | flags);
		/*
		 * If this routine is called as a result of segvn_sync
		 * operation and we have no physical swap then we can get an
		 * error here. In such case we would return SE_NOSWAP as error.
		 * At this point, we expect only SE_NOSWAP.
		 */
		ASSERT(err == SE_NOSWAP);
		if (flags & B_INVAL)

	ASSERT(pp_off <= io_off && io_off + io_len <= pp_off + pp_len);
	ASSERT(io_off <= offset && offset < io_off + io_len);

	/* Toss pages at front/rear that we couldn't get physical backing for */
	if (io_off != pp_off) {
		page_list_break(&pplist, &npplist, btop(io_off - pp_off));
		ASSERT(pplist->p_offset == pp_off);
		ASSERT(pplist->p_prev->p_offset == io_off - PAGESIZE);
		pvn_write_done(pplist, B_ERROR | B_WRITE | flags);

	if (io_off + io_len < pp_off + pp_len) {
		page_list_break(&pplist, &npplist, btop(io_len));
		ASSERT(npplist->p_offset == io_off + io_len);
		ASSERT(npplist->p_prev->p_offset == pp_off + pp_len - PAGESIZE);
		pvn_write_done(npplist, B_ERROR | B_WRITE | flags);

	ASSERT(pplist->p_offset == io_off);
	ASSERT(pplist->p_prev->p_offset == io_off + io_len - PAGESIZE);
	ASSERT(btopr(io_len) <= btopr(kllen));

	/* Do i/o on the remaining kluster */
	err = fop_pageio(pvp, pplist, (uoff_t)pstart, io_len,
	    B_WRITE | flags, cr, NULL);

	if ((flags & B_ASYNC) == 0) {
		pvn_write_done(pplist, ((err) ? B_ERROR : 0) | B_WRITE | flags);

	tmp_pagespushed += btop(io_len);

	if (err && err != ENOMEM && err != SE_NOSWAP)
		cmn_err(CE_WARN, "tmp_putapage: err %d\n", err);
    caller_context_t *ct)

	struct segvn_crargs vn_a;
	struct tmpnode *tp = (struct tmpnode *)VTOTN(vp);

	if (vp->v_flag & VNOMAP)

	if (off < 0 || (offset_t)(off + len) < 0 ||
	    off > MAXOFF_T || (off + len) > MAXOFF_T)

	if (vp->v_type != VREG)

	/*
	 * Don't allow mapping to locked file
	 */
	if (vn_has_mandatory_locks(vp, tp->tn_mode)) {

	error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);

	vn_a.offset = (uoff_t)off;
	vn_a.type = flags & MAP_TYPE;
	vn_a.maxprot = maxprot;
	vn_a.flags = flags & ~MAP_TYPE;
	vn_a.lgrp_mem_policy_flags = 0;

	error = as_map(as, *addrp, len, segvn_create, &vn_a);

/*
 * tmp_addmap and tmp_delmap can't be called since the vp
 * maintained in the segvn mapping is NULL.
 */
    caller_context_t *ct)

    caller_context_t *ct)

tmp_freesp(struct vnode *vp, struct flock64 *lp, int flag)

	register struct tmpnode *tp = VTOTN(vp);

	ASSERT(vp->v_type == VREG);
	ASSERT(lp->l_start >= 0);

	rw_enter(&tp->tn_rwlock, RW_WRITER);
	if (tp->tn_size == lp->l_start) {
		rw_exit(&tp->tn_rwlock);

	/*
	 * Check for any mandatory locks on the range
	 */
	if (MANDLOCK(vp, tp->tn_mode)) {

		save_start = lp->l_start;

		if (tp->tn_size < lp->l_start) {
			/*
			 * "Truncate up" case: need to make sure there
			 * is no lock beyond current end-of-file. To
			 * do so, we need to set l_start to the size
			 * of the file temporarily.
			 */
			lp->l_start = tp->tn_size;

		lp->l_type = F_WRLCK;
		lp->l_pid = ttoproc(curthread)->p_pid;
		i = (flag & (FNDELAY|FNONBLOCK)) ? 0 : SLPFLCK;
		if ((i = reclock(vp, lp, i, 0, lp->l_start, NULL)) != 0 ||
		    lp->l_type != F_UNLCK) {
			rw_exit(&tp->tn_rwlock);
			return (i ? i : EAGAIN);

		lp->l_start = save_start;

	VFSTOTM(vp->v_vfsp);

	rw_enter(&tp->tn_contents, RW_WRITER);
	error = tmpnode_trunc((struct tmount *)VFSTOTM(vp->v_vfsp),
	    tp, (ulong_t)lp->l_start);
	rw_exit(&tp->tn_contents);
	rw_exit(&tp->tn_rwlock);

    struct flock64 *bfp,
    caller_context_t *ct)

	if (cmd != F_FREESP)

	if ((error = convoff(vp, bfp, 0, (offset_t)offset)) == 0) {
		if ((bfp->l_start > MAXOFF_T) || (bfp->l_len > MAXOFF_T))

		error = tmp_freesp(vp, bfp, flag);

		if (error == 0 && bfp->l_start == 0)
			vnevent_truncate(vp, ct);
    caller_context_t *ct)

	return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);

tmp_rwlock(struct vnode *vp, int write_lock, caller_context_t *ctp)

	struct tmpnode *tp = VTOTN(vp);

		rw_enter(&tp->tn_rwlock, RW_WRITER);
		rw_enter(&tp->tn_rwlock, RW_READER);
	return (write_lock);

tmp_rwunlock(struct vnode *vp, int write_lock, caller_context_t *ctp)

	struct tmpnode *tp = VTOTN(vp);

	rw_exit(&tp->tn_rwlock);
    caller_context_t *ct)

	struct tmpnode *tp = NULL;

	case _PC_XATTR_EXISTS:
		if (vp->v_vfsp->vfs_flag & VFS_XATTR) {
			*valp = 0;	/* assume no attributes */
			error = 0;	/* okay to ask */

			rw_enter(&tp->tn_rwlock, RW_READER);
			if (tp->tn_xattrdp) {
				rw_enter(&tp->tn_xattrdp->tn_rwlock, RW_READER);
				/* do not count "." and ".." */
				if (tp->tn_xattrdp->tn_dirents > 2)
				rw_exit(&tp->tn_xattrdp->tn_rwlock);
			rw_exit(&tp->tn_rwlock);

	case _PC_SATTR_ENABLED:
	case _PC_SATTR_EXISTS:
		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
		    (vp->v_type == VREG || vp->v_type == VDIR);

	case _PC_TIMESTAMP_RESOLUTION:
		/* nanosecond timestamp resolution */

		error = fs_pathconf(vp, cmd, valp, cr, ct);
const struct vnodeops tmp_vnodeops = {
	.vnop_name = "tmpfs",
	.vop_open = tmp_open,
	.vop_close = tmp_close,
	.vop_read = tmp_read,
	.vop_write = tmp_write,
	.vop_ioctl = tmp_ioctl,
	.vop_getattr = tmp_getattr,
	.vop_setattr = tmp_setattr,
	.vop_access = tmp_access,
	.vop_lookup = tmp_lookup,
	.vop_create = tmp_create,
	.vop_remove = tmp_remove,
	.vop_link = tmp_link,
	.vop_rename = tmp_rename,
	.vop_mkdir = tmp_mkdir,
	.vop_rmdir = tmp_rmdir,
	.vop_readdir = tmp_readdir,
	.vop_symlink = tmp_symlink,
	.vop_readlink = tmp_readlink,
	.vop_fsync = tmp_fsync,
	.vop_inactive = tmp_inactive,
	.vop_rwlock = tmp_rwlock,
	.vop_rwunlock = tmp_rwunlock,
	.vop_seek = tmp_seek,
	.vop_space = tmp_space,
	.vop_getpage = tmp_getpage,
	.vop_putpage = tmp_putpage,
	.vop_addmap = tmp_addmap,
	.vop_delmap = tmp_delmap,
	.vop_pathconf = tmp_pathconf,
	.vop_vnevent = fs_vnevent_support,