/*
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2015, Joyent, Inc. All rights reserved.
 * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
 * Copyright 2016 RackTop Systems.
 * Copyright (c) 2017 by Delphix. All rights reserved.
 */
#include <sys/types.h>
#include <sys/param.h>
#include <sys/t_lock.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/vfs_opreg.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/errno.h>
#include <sys/dirent.h>
#include <sys/pathname.h>
#include <sys/vmsystm.h>
#include <sys/fs/tmp.h>
#include <sys/fs/tmpnode.h>
#include <vm/seg_vn.h>
#include <vm/seg_map.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/vtrace.h>
#include <sys/policy.h>
#include <fs/fs_subr.h>
static int	tmp_getapage(struct vnode *, u_offset_t, size_t, uint_t *,
	page_t **, size_t, struct seg *, caddr_t, enum seg_rw, struct cred *);
static int	tmp_putapage(struct vnode *, page_t *, u_offset_t *, size_t *,
tmp_open(struct vnode **vpp, int flag, struct cred *cred, caller_context_t *ct)
	/*
	 * swapon to a tmpfs file is not supported so access
	 * is denied on open if VISSWAP is set.
	 */
	if ((*vpp)->v_flag & VISSWAP)
	caller_context_t *ct)
	cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
	cleanshares(vp, ttoproc(curthread)->p_pid);
/*
 * wrtmp does the real work of write requests for tmpfs.
 */
	struct caller_context *ct)
	pgcnt_t pageoffset;	/* offset in pages */
	ulong_t segmap_offset;	/* pagesize byte offset into segmap */
	caddr_t base;		/* base of segmap */
	ssize_t bytes;		/* bytes to uiomove */
	pfn_t pagenumber;	/* offset in pages into tmp file */
	int pagecreate;		/* == 1 if we allocated a page */
	rlim64_t limit = uio->uio_llimit;
	long oresid = uio->uio_resid;
	long tn_size_changed = 0;

	ASSERT(vp->v_type == VREG);

	TRACE_1(TR_FAC_TMPFS, TR_TMPFS_RWTMP_START,
	    "tmp_wrtmp_start:vp %p", vp);

	ASSERT(RW_WRITE_HELD(&tp->tn_contents));
	ASSERT(RW_WRITE_HELD(&tp->tn_rwlock));

	if (MANDLOCK(vp, tp->tn_mode)) {
		rw_exit(&tp->tn_contents);
		/*
		 * tmp_getattr ends up being called by chklock
		 */
		error = chklock(vp, FWRITE, uio->uio_loffset, uio->uio_resid,
		rw_enter(&tp->tn_contents, RW_WRITER);
			TRACE_2(TR_FAC_TMPFS, TR_TMPFS_RWTMP_END,
			    "tmp_wrtmp_end:vp %p error %d", vp, error);

	if (uio->uio_loffset < 0)

	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)

	if (uio->uio_loffset >= limit) {
		proc_t *p = ttoproc(curthread);

		mutex_enter(&p->p_lock);
		(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], p->p_rctls,
		    p, RCA_UNSAFE_SIGINFO);
		mutex_exit(&p->p_lock);

	if (uio->uio_loffset >= MAXOFF_T) {
		TRACE_2(TR_FAC_TMPFS, TR_TMPFS_RWTMP_END,
		    "tmp_wrtmp_end:vp %p error %d", vp, EINVAL);

	if (uio->uio_resid == 0) {
		TRACE_2(TR_FAC_TMPFS, TR_TMPFS_RWTMP_END,
		    "tmp_wrtmp_end:vp %p error %d", vp, 0);

	if (limit > MAXOFF_T)

		offset = (long)uio->uio_offset;
		pageoffset = offset & PAGEOFFSET;
		/*
		 * A maximum of PAGESIZE bytes of data is transferred
		 * each pass through this loop
		 */
		bytes = MIN(PAGESIZE - pageoffset, uio->uio_resid);
		if (offset + bytes >= limit) {
			if (offset >= limit) {
			bytes = limit - offset;
		pagenumber = btop(offset);

		/*
		 * delta is the amount of anonymous memory
		 * to reserve for the file.
		 * We always reserve in pagesize increments so
		 * unless we're extending the file into a new page,
		 * we don't need to call tmp_resv.
		 */
		delta = offset + bytes -
		    P2ROUNDUP_TYPED(tp->tn_size, PAGESIZE, u_offset_t);
		if (tmp_resv(tm, tp, delta, pagecreate)) {
			/*
			 * Log file system full in the zone that owns
			 * the tmpfs mount, as well as in the global
			 */
			zcmn_err(tm->tm_vfsp->vfs_zone->zone_id,
			    CE_WARN, "%s: File system full, "
			    "swap space limit exceeded",
			if (tm->tm_vfsp->vfs_zone->zone_id !=
				vfs_t *vfs = tm->tm_vfsp;

				zcmn_err(GLOBAL_ZONEID,
				    CE_WARN, "%s: File system full, "
				    "swap space limit exceeded",
				    vfs->vfs_vnodecovered->v_path);
		tmpnode_growmap(tp, (ulong_t)offset + bytes);

		/* grow the file to the new length */
		if (offset + bytes > tp->tn_size) {
			old_tn_size = tp->tn_size;
			/*
			 * Postpone updating tp->tn_size until uiomove() is
			 */
			new_tn_size = offset + bytes;
		if (bytes == PAGESIZE) {
			/*
			 * Writing whole page so reading from disk
			 */
			/*
			 * If writing past EOF or filling in a hole
			 * we need to allocate an anon slot.
			 */
			if (anon_get_ptr(tp->tn_anon, pagenumber) == NULL) {
				(void) anon_set_ptr(tp->tn_anon, pagenumber,
				    anon_alloc(vp, ptob(pagenumber)), ANON_SLEEP);

		/*
		 * We have to drop the contents lock to allow the VM
		 * system to reacquire it in tmp_getpage()
		 */
		rw_exit(&tp->tn_contents);

		/*
		 * Touch the page and fault it in if it is not in core
		 * before segmap_getmapflt or vpm_data_copy can lock it.
		 * This is to avoid the deadlock if the buffer is mapped
		 * to the same file through mmap which we want to write.
		 */
		uio_prefaultpages((long)bytes, uio);
			/*
			 * Copy data. If new pages are created, part of
			 * the page that is not written will be initialized
			 */
			error = vpm_data_copy(vp, offset, bytes, uio,
			    !pagecreate, &newpage, 1, S_WRITE);
			/* Get offset within the segmap mapping */
			segmap_offset = (offset & PAGEMASK) & MAXBOFFSET;
			base = segmap_getmapflt(segkmap, vp,
			    (offset & MAXBMASK), PAGESIZE, !pagecreate,

		if (!vpm_enable && pagecreate) {
			/*
			 * segmap_pagecreate() returns 1 if it calls
			 * page_create_va() to allocate any pages.
			 */
			newpage = segmap_pagecreate(segkmap,
			    base + segmap_offset, (size_t)PAGESIZE, 0);
			/*
			 * Clear from the beginning of the page to the starting
			 * offset of the data.
			 */
				(void) kzero(base + segmap_offset,
		error = uiomove(base + segmap_offset + pageoffset,
		    (long)bytes, UIO_WRITE, uio);

		if (!vpm_enable && pagecreate &&
		    uio->uio_offset < P2ROUNDUP(offset + bytes, PAGESIZE)) {
			long zoffset;	/* zero from offset into page */
			/*
			 * We created pages w/o initializing them completely,
			 * thus we need to zero the part that wasn't set up.
			 * This happens on most EOF write cases and if
			 * we had some sort of error during the uiomove.
			 */
			nmoved = uio->uio_offset - offset;
			ASSERT((nmoved + pageoffset) <= PAGESIZE);

			/*
			 * Zero from the end of data in the page to the
			 */
			if ((zoffset = pageoffset + nmoved) < PAGESIZE)
				(void) kzero(base + segmap_offset + zoffset,
				    (size_t)PAGESIZE - zoffset);

		/*
		 * Unlock the pages which have been allocated by
		 * page_create_va() in segmap_pagecreate()
		 */
		if (!vpm_enable && newpage) {
			segmap_pageunlock(segkmap, base + segmap_offset,
			    (size_t)PAGESIZE, S_WRITE);

			/*
			 * If we failed on a write, we must
			 * be sure to invalidate any pages that may have
			 */
				(void) vpm_sync_pages(vp, offset, PAGESIZE,
				(void) segmap_release(segkmap, base, SM_INVAL);
				error = vpm_sync_pages(vp, offset, PAGESIZE,
				error = segmap_release(segkmap, base, 0);
		/*
		 * Re-acquire contents lock.
		 */
		rw_enter(&tp->tn_contents, RW_WRITER);

			tp->tn_size = new_tn_size;

		/*
		 * If the uiomove failed, fix up tn_size.
		 */
			if (tn_size_changed) {
				/*
				 * The uiomove failed, and we
				 * allocated blocks, so get rid
				 * of them.
				 */
				(void) tmpnode_trunc(tm, tp,
				    (ulong_t)old_tn_size);
		/*
		 * XXX - Can this be out of the loop?
		 */
		if ((tp->tn_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) &&
		    (tp->tn_mode & (S_ISUID | S_ISGID)) &&
		    secpolicy_vnode_setid_retain(cr,
		    (tp->tn_mode & S_ISUID) != 0 && tp->tn_uid == 0)) {
			/*
			 * Clear Set-UID & Set-GID bits on
			 * successful write if not privileged
			 * and at least one of the execute bits
			 * is set. If we always clear Set-GID,
			 * mandatory file and record locking is
			 */
			tp->tn_mode &= ~(S_ISUID | S_ISGID);
	} while (error == 0 && uio->uio_resid > 0 && bytes != 0);

	/*
	 * If we've already done a partial-write, terminate
	 * the write but return no error.
	 */
	if (oresid != uio->uio_resid)
	TRACE_2(TR_FAC_TMPFS, TR_TMPFS_RWTMP_END,
	    "tmp_wrtmp_end:vp %p error %d", vp, error);
/*
 * rdtmp does the real work of read requests for tmpfs.
 */
	struct caller_context *ct)
	ulong_t pageoffset;	/* offset in tmpfs file (uio_offset) */
	ulong_t segmap_offset;	/* pagesize byte offset into segmap */
	caddr_t base;		/* base of segmap */
	ssize_t bytes;		/* bytes to uiomove */
	long oresid = uio->uio_resid;

	TRACE_1(TR_FAC_TMPFS, TR_TMPFS_RWTMP_START, "tmp_rdtmp_start:vp %p",

	ASSERT(RW_LOCK_HELD(&tp->tn_contents));

	if (MANDLOCK(vp, tp->tn_mode)) {
		rw_exit(&tp->tn_contents);
		/*
		 * tmp_getattr ends up being called by chklock
		 */
		error = chklock(vp, FREAD, uio->uio_loffset, uio->uio_resid,
		rw_enter(&tp->tn_contents, RW_READER);
			TRACE_2(TR_FAC_TMPFS, TR_TMPFS_RWTMP_END,
			    "tmp_rdtmp_end:vp %p error %d", vp, error);

	ASSERT(tp->tn_type == VREG);

	if (uio->uio_loffset >= MAXOFF_T) {
		TRACE_2(TR_FAC_TMPFS, TR_TMPFS_RWTMP_END,
		    "tmp_rdtmp_end:vp %p error %d", vp, EINVAL);
	if (uio->uio_loffset < 0)
	if (uio->uio_resid == 0) {
		TRACE_2(TR_FAC_TMPFS, TR_TMPFS_RWTMP_END,
		    "tmp_rdtmp_end:vp %p error %d", vp, 0);

		offset = uio->uio_offset;
		pageoffset = offset & PAGEOFFSET;
		bytes = MIN(PAGESIZE - pageoffset, uio->uio_resid);
		diff = tp->tn_size - offset;
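		/*
		 * Example: with 4K pages, a read of 10000 bytes starting
		 * at offset 6000 is handled one page at a time; this pass
		 * moves at most 2192 bytes (up to the page boundary at
		 * 8192).  For an 8000-byte file, diff = 2000 further
		 * limits the transfer so the read stops at EOF.
		 */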
		/*
		 * We have to drop the contents lock to allow the VM system
		 * to reacquire it in tmp_getpage() should the uiomove cause a
		 */
		rw_exit(&tp->tn_contents);

			error = vpm_data_copy(vp, offset, bytes, uio, 1, NULL,
			segmap_offset = (offset & PAGEMASK) & MAXBOFFSET;
			base = segmap_getmapflt(segkmap, vp, offset & MAXBMASK,
			error = uiomove(base + segmap_offset + pageoffset,
			    (long)bytes, UIO_READ, uio);

				(void) vpm_sync_pages(vp, offset, PAGESIZE, 0);
				(void) segmap_release(segkmap, base, 0);
				error = vpm_sync_pages(vp, offset, PAGESIZE,
				error = segmap_release(segkmap, base, 0);

		/*
		 * Re-acquire contents lock.
		 */
		rw_enter(&tp->tn_contents, RW_READER);

	} while (error == 0 && uio->uio_resid > 0);

	gethrestime(&tp->tn_atime);

	/*
	 * If we've already done a partial read, terminate
	 * the read but return no error.
	 */
	if (oresid != uio->uio_resid)
	TRACE_2(TR_FAC_TMPFS, TR_TMPFS_RWTMP_END,
	    "tmp_rdtmp_end:vp %x error %d", vp, error);
tmp_read(struct vnode *vp, struct uio *uiop, int ioflag, cred_t *cred,
    struct caller_context *ct)
	struct tmpnode *tp = (struct tmpnode *)VTOTN(vp);
	struct tmount *tm = (struct tmount *)VTOTM(vp);

	/*
	 * We don't currently support reading non-regular files
	 */
	if (vp->v_type == VDIR)
	if (vp->v_type != VREG)
	/*
	 * tmp_rwlock should have already been called from layers above
	 */
	ASSERT(RW_READ_HELD(&tp->tn_rwlock));

	rw_enter(&tp->tn_contents, RW_READER);

	error = rdtmp(tm, tp, uiop, ct);

	rw_exit(&tp->tn_contents);
tmp_write(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cred,
    struct caller_context *ct)
	struct tmpnode *tp = (struct tmpnode *)VTOTN(vp);
	struct tmount *tm = (struct tmount *)VTOTM(vp);

	/*
	 * We don't currently support writing to non-regular files
	 */
	if (vp->v_type != VREG)
		return (EINVAL);	/* XXX EISDIR? */

	/*
	 * tmp_rwlock should have already been called from layers above
	 */
	ASSERT(RW_WRITE_HELD(&tp->tn_rwlock));

	rw_enter(&tp->tn_contents, RW_WRITER);

	if (ioflag & FAPPEND) {
		/*
		 * In append mode start at end of file.
		 */
		uiop->uio_loffset = tp->tn_size;

	error = wrtmp(tm, tp, uiop, cred, ct);

	rw_exit(&tp->tn_contents);
	caller_context_t *ct)

	caller_context_t *ct)
	struct tmpnode *tp = (struct tmpnode *)VTOTN(vp);

	/*
	 * A special case to handle the root tnode on a diskless nfs
	 * client who may have had its uid and gid inherited
	 * from an nfs vnode with nobody ownership.  Likely the
	 * root filesystem. After nfs is fully functional the uid/gid
	 * may be mappable so ask again.
	 * vfsp can't get unmounted because we hold vp.
	 */
	if (vp->v_flag & VROOT &&
	    (mvp = vp->v_vfsp->vfs_vnodecovered) != NULL) {
		mutex_enter(&tp->tn_tlock);
		if (tp->tn_uid == UID_NOBODY || tp->tn_gid == GID_NOBODY) {
			mutex_exit(&tp->tn_tlock);
			bzero(&va, sizeof (struct vattr));
			va.va_mask = AT_UID|AT_GID;
			attrs = VOP_GETATTR(mvp, &va, 0, cred, ct);
			mutex_exit(&tp->tn_tlock);
	mutex_enter(&tp->tn_tlock);
			tp->tn_uid = va.va_uid;
			tp->tn_gid = va.va_gid;
	vap->va_type = vp->v_type;
	vap->va_mode = tp->tn_mode & MODEMASK;
	vap->va_uid = tp->tn_uid;
	vap->va_gid = tp->tn_gid;
	vap->va_fsid = tp->tn_fsid;
	vap->va_nodeid = (ino64_t)tp->tn_nodeid;
	vap->va_nlink = tp->tn_nlink;
	vap->va_size = (u_offset_t)tp->tn_size;
	vap->va_atime = tp->tn_atime;
	vap->va_mtime = tp->tn_mtime;
	vap->va_ctime = tp->tn_ctime;
	vap->va_blksize = PAGESIZE;
	vap->va_rdev = tp->tn_rdev;
	vap->va_seq = tp->tn_seq;

	/*
	 * XXX Holes are not taken into account.  We could take the time to
	 * run through the anon array looking for allocated slots...
	 */
	vap->va_nblocks = (fsblkcnt64_t)btodb(ptob(btopr(vap->va_size)));
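	/*
	 * Example: a 6000-byte file with 4K pages and 512-byte disk blocks
	 * reports btodb(ptob(btopr(6000))) = btodb(8192) = 16 blocks, i.e.
	 * va_nblocks reflects whole reserved pages, not the byte size.
	 */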
	mutex_exit(&tp->tn_tlock);
	caller_context_t *ct)
	struct tmount *tm = (struct tmount *)VTOTM(vp);
	struct tmpnode *tp = (struct tmpnode *)VTOTN(vp);

	/*
	 * Cannot set these attributes
	 */
	if ((vap->va_mask & AT_NOSET) || (vap->va_mask & AT_XVATTR))

	mutex_enter(&tp->tn_tlock);

	/*
	 * Change file access modes. Must be owner or have sufficient
	 */
	error = secpolicy_vnode_setattr(cred, vp, vap, get, flags, tmp_taccess,

	if (mask & AT_MODE) {
		get->va_mode &= S_IFMT;
		get->va_mode |= vap->va_mode & ~S_IFMT;

		get->va_uid = vap->va_uid;
		get->va_gid = vap->va_gid;
		get->va_atime = vap->va_atime;
		get->va_mtime = vap->va_mtime;

	if (mask & (AT_UID | AT_GID | AT_MODE | AT_MTIME))
		gethrestime(&tp->tn_ctime);

	if (mask & AT_SIZE) {
		ASSERT(vp->v_type != VDIR);

		/* Don't support large files. */
		if (vap->va_size > MAXOFF_T) {
		mutex_exit(&tp->tn_tlock);

		rw_enter(&tp->tn_rwlock, RW_WRITER);
		rw_enter(&tp->tn_contents, RW_WRITER);
		error = tmpnode_trunc(tm, tp, (ulong_t)vap->va_size);
		rw_exit(&tp->tn_contents);
		rw_exit(&tp->tn_rwlock);

		if (error == 0 && vap->va_size == 0)
			vnevent_truncate(vp, ct);
	mutex_exit(&tp->tn_tlock);
	caller_context_t *ct)
	struct tmpnode *tp = (struct tmpnode *)VTOTN(vp);

	mutex_enter(&tp->tn_tlock);
	error = tmp_taccess(tp, mode, cred);
	mutex_exit(&tp->tn_tlock);
	struct pathname *pnp,
	caller_context_t *ct,
	struct tmpnode *tp = (struct tmpnode *)VTOTN(dvp);
	struct tmpnode *ntp = NULL;

	/* allow cd into @ dir */
	if (flags & LOOKUP_XATTR) {
		/*
		 * don't allow attributes if not mounted XATTR support
		 */
		if (!(dvp->v_vfsp->vfs_flag & VFS_XATTR))

		if (tp->tn_flags & ISXATTR)
			/* No attributes on attributes */

		rw_enter(&tp->tn_rwlock, RW_WRITER);
		if (tp->tn_xattrdp == NULL) {
			if (!(flags & CREATE_XATTR_DIR)) {
				rw_exit(&tp->tn_rwlock);

			/*
			 * No attribute directory exists for this
			 * node - create the attr dir as a side effect
			 */

			/*
			 * Make sure we have adequate permission...
			 */
			if ((error = tmp_taccess(tp, VWRITE, cred)) != 0) {
				rw_exit(&tp->tn_rwlock);

			xdp = tmp_memalloc(sizeof (struct tmpnode),
			tmpnode_init(tm, xdp, &tp->tn_attr, NULL);
			/*
			 * Fix-up fields unique to attribute directories.
			 */
			xdp->tn_flags = ISXATTR;
			if (tp->tn_type == VDIR) {
				xdp->tn_mode = tp->tn_attr.va_mode;
				if (tp->tn_attr.va_mode & 0040)
					xdp->tn_mode |= 0750;
				if (tp->tn_attr.va_mode & 0004)
					xdp->tn_mode |= 0705;
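				/*
				 * The derived mode mirrors the file's own
				 * readability: group read (0040) on the file
				 * yields 0750 on its attribute directory and
				 * other read (0004) yields 0705, so whoever
				 * may read the file may also list its
				 * extended attributes.
				 */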
			xdp->tn_vnode->v_type = VDIR;
			xdp->tn_vnode->v_flag |= V_XATTRDIR;
			tp->tn_xattrdp = xdp;

		VN_HOLD(tp->tn_xattrdp->tn_vnode);
		*vpp = TNTOV(tp->tn_xattrdp);
		rw_exit(&tp->tn_rwlock);

	/*
	 * Null component name is a synonym for directory being searched.
	 */
	error = tdirlookup(tp, nm, &ntp, cred);

		/*
		 * If vnode is a device return special vnode instead
		 */
		if (IS_DEVVP(*vpp)) {
			newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type,
	TRACE_4(TR_FAC_TMPFS, TR_TMPFS_LOOKUP,
	    "tmpfs lookup:vp %p name %s vpp %p error %d",
	    dvp, nm, vpp, error);
	enum vcexcl exclusive,
	caller_context_t *ct,
	struct tmpnode *parent;
	struct tmpnode *self;
	struct tmpnode *oldtp;

	parent = (struct tmpnode *)VTOTN(dvp);
	tm = (struct tmount *)VTOTM(dvp);

	/* device files not allowed in ext. attr dirs */
	if ((parent->tn_flags & ISXATTR) &&
	    (vap->va_type == VBLK || vap->va_type == VCHR ||
	    vap->va_type == VFIFO || vap->va_type == VDOOR ||
	    vap->va_type == VSOCK || vap->va_type == VPORT))

	if (vap->va_type == VREG && (vap->va_mode & VSVTX)) {
		/* Must be privileged to set sticky bit */
		if (secpolicy_vnode_stky_modify(cred))
			vap->va_mode &= ~VSVTX;
	} else if (vap->va_type == VNON) {

	/*
	 * Null component name is a synonym for directory being searched.
	 */
	error = tdirlookup(parent, nm, &oldtp, cred);

	if (error == 0) {	/* name found */
		boolean_t trunc = B_FALSE;

		rw_enter(&oldtp->tn_rwlock, RW_WRITER);

		/*
		 * if create/read-only an existing
		 * directory, allow it
		 */
		if (exclusive == EXCL)
		else if ((oldtp->tn_type == VDIR) && (mode & VWRITE))
			error = tmp_taccess(oldtp, mode, cred);

			rw_exit(&oldtp->tn_rwlock);
			tmpnode_rele(oldtp);
		*vpp = TNTOV(oldtp);
		if ((*vpp)->v_type == VREG && (vap->va_mask & AT_SIZE) &&
		    vap->va_size == 0) {
			rw_enter(&oldtp->tn_contents, RW_WRITER);
			(void) tmpnode_trunc(tm, oldtp, 0);
			rw_exit(&oldtp->tn_contents);
		rw_exit(&oldtp->tn_rwlock);
		if (IS_DEVVP(*vpp)) {
			struct vnode *newvp;

			newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type,
			if (newvp == NULL) {
			vnevent_create(*vpp, ct);

	if (error != ENOENT)

	rw_enter(&parent->tn_rwlock, RW_WRITER);
	error = tdirenter(tm, parent, nm, DE_CREATE,
	    (struct tmpnode *)NULL, (struct tmpnode *)NULL,
	    vap, &self, cred, ct);
	rw_exit(&parent->tn_rwlock);

	if (error == EEXIST) {
		/*
		 * This means that the file was created sometime
		 * after we checked and did not find it and when
		 * we went to create it.
		 * Since creat() is supposed to truncate a file
		 * that already exists go back to the beginning
		 * of the function. This time we will find it
		 * and go down the tmp_trunc() path
		 */
	if (!error && IS_DEVVP(*vpp)) {
		struct vnode *newvp;

		newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cred);
	TRACE_3(TR_FAC_TMPFS, TR_TMPFS_CREATE,
	    "tmpfs create:dvp %p nm %s vpp %p", dvp, nm, vpp);
	caller_context_t *ct,
	struct tmpnode *parent = (struct tmpnode *)VTOTN(dvp);
	struct tmpnode *tp = NULL;

	error = tdirlookup(parent, nm, &tp, cred);

	rw_enter(&parent->tn_rwlock, RW_WRITER);
	rw_enter(&tp->tn_rwlock, RW_WRITER);

	error = (tp->tn_type == VDIR) ? EPERM :
	    tdirdelete(parent, tp, nm, DR_REMOVE, cred);

	rw_exit(&tp->tn_rwlock);
	rw_exit(&parent->tn_rwlock);
	vnevent_remove(TNTOV(tp), dvp, nm, ct);

	TRACE_3(TR_FAC_TMPFS, TR_TMPFS_REMOVE,
	    "tmpfs remove:dvp %p nm %s error %d", dvp, nm, error);
	struct vnode *srcvp,
	caller_context_t *ct,
	struct tmpnode *parent;
	struct tmpnode *from;
	struct tmount *tm = (struct tmount *)VTOTM(dvp);
	struct tmpnode *found = NULL;
	struct vnode *realvp;

	if (VOP_REALVP(srcvp, &realvp, ct) == 0)

	parent = (struct tmpnode *)VTOTN(dvp);
	from = (struct tmpnode *)VTOTN(srcvp);

	if (srcvp->v_type == VDIR ||
	    (from->tn_uid != crgetuid(cred) && secpolicy_basic_link(cred)))

	/*
	 * Make sure link for extended attributes is valid
	 * We only support hard linking of xattr's in xattrdir to an xattrdir
	 */
	if ((from->tn_flags & ISXATTR) != (parent->tn_flags & ISXATTR))

	error = tdirlookup(parent, tnm, &found, cred);
		tmpnode_rele(found);

	if (error != ENOENT)

	rw_enter(&parent->tn_rwlock, RW_WRITER);
	error = tdirenter(tm, parent, tnm, DE_LINK, (struct tmpnode *)NULL,
	    from, NULL, (struct tmpnode **)NULL, cred, ct);
	rw_exit(&parent->tn_rwlock);

	vnevent_link(srcvp, ct);
	struct vnode *odvp,	/* source parent vnode */
	char *onm,		/* source name */
	struct vnode *ndvp,	/* destination parent vnode */
	char *nnm,		/* destination name */
	caller_context_t *ct,
	struct tmpnode *fromparent;
	struct tmpnode *toparent;
	struct tmpnode *fromtp = NULL;	/* source tmpnode */
	struct tmpnode *totp;		/* target tmpnode */
	struct tmount *tm = (struct tmount *)VTOTM(odvp);
	int samedir = 0;	/* set if odvp == ndvp */
	struct vnode *realvp;

	if (VOP_REALVP(ndvp, &realvp, ct) == 0)

	fromparent = (struct tmpnode *)VTOTN(odvp);
	toparent = (struct tmpnode *)VTOTN(ndvp);

	if ((fromparent->tn_flags & ISXATTR) != (toparent->tn_flags & ISXATTR))

	mutex_enter(&tm->tm_renamelck);

	/*
	 * Look up tmpnode of file we're supposed to rename.
	 */
	error = tdirlookup(fromparent, onm, &fromtp, cred);
		mutex_exit(&tm->tm_renamelck);

	/*
	 * Make sure we can delete the old (source) entry.  This
	 * requires write permission on the containing directory.  If
	 * that directory is "sticky" it requires further checks.
	 */
	if (((error = tmp_taccess(fromparent, VWRITE, cred)) != 0) ||
	    (error = tmp_sticky_remove_access(fromparent, fromtp, cred)) != 0)
	/*
	 * Check for renaming to or from '.' or '..' or that
	 * fromtp == fromparent
	 */
	if ((onm[0] == '.' &&
	    (onm[1] == '\0' || (onm[1] == '.' && onm[2] == '\0'))) ||
	    (nnm[0] == '.' &&
	    (nnm[1] == '\0' || (nnm[1] == '.' && nnm[2] == '\0'))) ||
	    (fromparent == fromtp)) {
	samedir = (fromparent == toparent);

	/*
	 * Make sure we can search and rename into the new
	 * (destination) directory.
	 */
		error = tmp_taccess(toparent, VEXEC|VWRITE, cred);

	if (tdirlookup(toparent, nnm, &totp, cred) == 0) {
		vnevent_pre_rename_dest(TNTOV(totp), ndvp, nnm, ct);

	/* Notify the target dir. if not the same as the source dir. */
		vnevent_pre_rename_dest_dir(ndvp, TNTOV(fromtp), nnm, ct);

	vnevent_pre_rename_src(TNTOV(fromtp), odvp, onm, ct);

	/*
	 * Link source to new target
	 */
	rw_enter(&toparent->tn_rwlock, RW_WRITER);
	error = tdirenter(tm, toparent, nnm, DE_RENAME,
	    fromparent, fromtp, (struct vattr *)NULL,
	    (struct tmpnode **)NULL, cred, ct);
	rw_exit(&toparent->tn_rwlock);

		/*
		 * ESAME isn't really an error; it indicates that the
		 * operation should not be done because the source and target
		 * are the same file, but that no error should be reported.
		 */

	/*
	 * Unlink from source.
	 */
	rw_enter(&fromparent->tn_rwlock, RW_WRITER);
	rw_enter(&fromtp->tn_rwlock, RW_WRITER);

	error = tdirdelete(fromparent, fromtp, onm, DR_RENAME, cred);

	/*
	 * The following handles the case where our source tmpnode was
	 * removed before we got to it.
	 *
	 * XXX We should also cleanup properly in the case where tdirdelete
	 * fails for some other reason.  Currently this case shouldn't happen.
	 */
	if (error == ENOENT)

	rw_exit(&fromtp->tn_rwlock);
	rw_exit(&fromparent->tn_rwlock);

		vnevent_rename_src(TNTOV(fromtp), odvp, onm, ct);
		/*
		 * vnevent_rename_dest is called in tdirenter().
		 * Notify the target dir if not same as source dir.
		 */
			vnevent_rename_dest_dir(ndvp, ct);

	tmpnode_rele(fromtp);
	mutex_exit(&tm->tm_renamelck);

	TRACE_5(TR_FAC_TMPFS, TR_TMPFS_RENAME,
	    "tmpfs rename:ovp %p onm %s nvp %p nnm %s error %d", odvp, onm,
	caller_context_t *ct,
	struct tmpnode *parent = (struct tmpnode *)VTOTN(dvp);
	struct tmpnode *self = NULL;
	struct tmount *tm = (struct tmount *)VTOTM(dvp);

	/* no new dirs allowed in xattr dirs */
	if (parent->tn_flags & ISXATTR)

	/*
	 * Might be dangling directory.  Catch it here,
	 * because a ENOENT return from tdirlookup() is
	 */
	if (parent->tn_nlink == 0)

	error = tdirlookup(parent, nm, &self, cred);

	if (error != ENOENT)

	rw_enter(&parent->tn_rwlock, RW_WRITER);
	error = tdirenter(tm, parent, nm, DE_MKDIR, (struct tmpnode *)NULL,
	    (struct tmpnode *)NULL, va, &self, cred, ct);
		rw_exit(&parent->tn_rwlock);
	rw_exit(&parent->tn_rwlock);
	caller_context_t *ct,
	struct tmpnode *parent = (struct tmpnode *)VTOTN(dvp);
	struct tmpnode *self = NULL;

	/*
	 * Return error when removing . and ..
	 */
	if (strcmp(nm, ".") == 0)
	if (strcmp(nm, "..") == 0)
		return (EEXIST); /* Should be ENOTEMPTY */
	error = tdirlookup(parent, nm, &self, cred);

	rw_enter(&parent->tn_rwlock, RW_WRITER);
	rw_enter(&self->tn_rwlock, RW_WRITER);

	if (vp == dvp || vp == cdir) {
	if (self->tn_type != VDIR) {

	mutex_enter(&self->tn_tlock);
	if (self->tn_nlink > 2) {
		mutex_exit(&self->tn_tlock);
	mutex_exit(&self->tn_tlock);

	if (vn_vfswlock(vp)) {
	if (vn_mountedvfs(vp) != NULL) {

	/*
	 * Check for an empty directory
	 * i.e. only includes entries for "." and ".."
	 */
	if (self->tn_dirents > 2) {
		error = EEXIST;		/* SIGH should be ENOTEMPTY */
		/*
		 * Update atime because checking tn_dirents is logically
		 * equivalent to reading the directory
		 */
		gethrestime(&self->tn_atime);

	error = tdirdelete(parent, self, nm, DR_RMDIR, cred);

	rw_exit(&self->tn_rwlock);
	rw_exit(&parent->tn_rwlock);
	vnevent_rmdir(TNTOV(self), dvp, nm, ct);
	caller_context_t *ct,
	struct tmpnode *tp = (struct tmpnode *)VTOTN(vp);
	struct tdirent *tdp;
	struct dirent64 *dp;
	ulong_t total_bytes_wanted;

	if (uiop->uio_loffset >= MAXOFF_T) {

	/*
	 * assuming system call has already called tmp_rwlock
	 */
	ASSERT(RW_READ_HELD(&tp->tn_rwlock));

	if (uiop->uio_iovcnt != 1)

	if (vp->v_type != VDIR)

	/*
	 * There's a window here where someone could have removed
	 * all the entries in the directory after we put a hold on the
	 * vnode but before we grabbed the rwlock.  Just return.
	 */
	if (tp->tn_dir == NULL) {
			panic("empty directory 0x%p", (void *)tp);

	/*
	 * Get space for multiple directory entries
	 */
	total_bytes_wanted = uiop->uio_iov->iov_len;
	bufsize = total_bytes_wanted + sizeof (struct dirent64);
	outbuf = kmem_alloc(bufsize, KM_SLEEP);

	dp = (struct dirent64 *)outbuf;

		namelen = strlen(tdp->td_name);	/* no +1 needed */
		offset = tdp->td_offset;
		if (offset >= uiop->uio_offset) {
			reclen = (int)DIRENT64_RECLEN(namelen);
			if (outcount + reclen > total_bytes_wanted) {
				/*
				 * Buffer too small for any entries.
				 */
			ASSERT(tdp->td_tmpnode != NULL);

			/* use strncpy(9f) to zero out uninitialized bytes */

			(void) strncpy(dp->d_name, tdp->td_name,
			    DIRENT64_NAMELEN(reclen));
			dp->d_reclen = (ushort_t)reclen;
			dp->d_ino = (ino64_t)tdp->td_tmpnode->tn_nodeid;
			dp->d_off = (offset_t)tdp->td_offset + 1;
			dp = (struct dirent64 *)
			    ((uintptr_t)dp + dp->d_reclen);
		ASSERT(outcount <= bufsize);
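		/*
		 * DIRENT64_RECLEN(namelen) rounds the fixed dirent64
		 * header plus the name and its terminating NUL up to an
		 * 8-byte boundary, so advancing dp by d_reclen packs
		 * successive, properly aligned entries into outbuf.
		 */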
	error = uiomove(outbuf, outcount, UIO_READ, uiop);

	/* If we reached the end of the list our offset */
	/* should now be just past the end. */

	uiop->uio_offset = offset;
	gethrestime(&tp->tn_atime);
	kmem_free(outbuf, bufsize);
	caller_context_t *ct,
	struct tmpnode *parent = (struct tmpnode *)VTOTN(dvp);
	struct tmpnode *self = (struct tmpnode *)NULL;
	struct tmount *tm = (struct tmount *)VTOTM(dvp);

	/* no symlinks allowed to files in xattr dirs */
	if (parent->tn_flags & ISXATTR)

	error = tdirlookup(parent, lnm, &self, cred);
		/*
		 * The entry already exists
		 */
		return (EEXIST);	/* was 0 */

	if (error != ENOENT) {

	rw_enter(&parent->tn_rwlock, RW_WRITER);
	error = tdirenter(tm, parent, lnm, DE_CREATE, (struct tmpnode *)NULL,
	    (struct tmpnode *)NULL, tva, &self, cred, ct);
	rw_exit(&parent->tn_rwlock);

	len = strlen(tnm) + 1;
	cp = tmp_memalloc(len, 0);
	(void) strcpy(cp, tnm);
	self->tn_symlink = cp;
	self->tn_size = len - 1;
	caller_context_t *ct)
	struct tmpnode *tp = (struct tmpnode *)VTOTN(vp);

	if (vp->v_type != VLNK)

	rw_enter(&tp->tn_rwlock, RW_READER);
	rw_enter(&tp->tn_contents, RW_READER);
	error = uiomove(tp->tn_symlink, tp->tn_size, UIO_READ, uiop);
	gethrestime(&tp->tn_atime);
	rw_exit(&tp->tn_contents);
	rw_exit(&tp->tn_rwlock);
	caller_context_t *ct)
tmp_inactive(struct vnode *vp, struct cred *cred, caller_context_t *ct)
	struct tmpnode *tp = (struct tmpnode *)VTOTN(vp);
	struct tmount *tm = (struct tmount *)VFSTOTM(vp->v_vfsp);

	rw_enter(&tp->tn_rwlock, RW_WRITER);
	mutex_enter(&tp->tn_tlock);
	mutex_enter(&vp->v_lock);
	ASSERT(vp->v_count >= 1);

	/*
	 * If we don't have the last hold or the link count is non-zero,
	 * there's little to do -- just drop our hold.
	 */
	if (vp->v_count > 1 || tp->tn_nlink != 0) {
		mutex_exit(&vp->v_lock);
		mutex_exit(&tp->tn_tlock);
		rw_exit(&tp->tn_rwlock);

	/*
	 * We have the last hold *and* the link count is zero, so this
	 * tmpnode is dead from the filesystem's viewpoint.  However,
	 * if the tmpnode has any pages associated with it (i.e. if it's
	 * a normal file with non-zero size), the tmpnode can still be
	 * discovered by pageout or fsflush via the page vnode pointers.
	 * In this case we must drop all our locks, truncate the tmpnode,
	 * and try the whole dance again.
	 */
	if (tp->tn_size != 0) {
		if (tp->tn_type == VREG) {
			mutex_exit(&vp->v_lock);
			mutex_exit(&tp->tn_tlock);
			rw_enter(&tp->tn_contents, RW_WRITER);
			(void) tmpnode_trunc(tm, tp, 0);
			rw_exit(&tp->tn_contents);
			ASSERT(tp->tn_size == 0);
			ASSERT(tp->tn_nblocks == 0);
		if (tp->tn_type == VLNK)
			tmp_memfree(tp->tn_symlink, tp->tn_size + 1);

	/*
	 * Remove normal file/dir's xattr dir and xattrs.
	 */
	if (tp->tn_xattrdp) {
		struct tmpnode *xtp = tp->tn_xattrdp;

		ASSERT(xtp->tn_flags & ISXATTR);
		rw_enter(&xtp->tn_rwlock, RW_WRITER);
		DECR_COUNT(&xtp->tn_nlink, &xtp->tn_tlock);
		tp->tn_xattrdp = NULL;
		rw_exit(&xtp->tn_rwlock);

	mutex_exit(&vp->v_lock);
	mutex_exit(&tp->tn_tlock);
	/* Here's our chance to send invalid event while we're between locks */
	vn_invalid(TNTOV(tp));
	mutex_enter(&tm->tm_contents);
	if (tp->tn_forw == NULL)
		tm->tm_rootnode->tn_back = tp->tn_back;
		tp->tn_forw->tn_back = tp->tn_back;
	tp->tn_back->tn_forw = tp->tn_forw;
	mutex_exit(&tm->tm_contents);
	rw_exit(&tp->tn_rwlock);
	rw_destroy(&tp->tn_rwlock);
	mutex_destroy(&tp->tn_tlock);
	tmp_memfree(tp, sizeof (struct tmpnode));
tmp_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ct)
	struct tmpnode *tp = (struct tmpnode *)VTOTN(vp);

	if (fidp->fid_len < (sizeof (struct tfid) - sizeof (ushort_t))) {
		fidp->fid_len = sizeof (struct tfid) - sizeof (ushort_t);

	tfid = (struct tfid *)fidp;
	bzero(tfid, sizeof (struct tfid));
	tfid->tfid_len = (int)sizeof (struct tfid) - sizeof (ushort_t);

	tfid->tfid_ino = tp->tn_nodeid;
	tfid->tfid_gen = tp->tn_gen;
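	/*
	 * The caller's fid must be at least as large as a struct tfid
	 * minus its length field; the handle itself carries only the
	 * node id and generation, which is enough for a later consumer
	 * (e.g. an NFS server) to re-identify the tmpnode.
	 */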
/*
 * Return all the pages from [off..off+len] in given file
 */
	caller_context_t *ct)
	struct tmpnode *tp = VTOTN(vp);
	anoff_t toff = (anoff_t)off;

	rw_enter(&tp->tn_contents, RW_READER);

	if (off + len > tp->tn_size + PAGEOFFSET) {

	/*
	 * Look for holes (no anon slot) in faulting range. If there are
	 * holes we have to switch to a write lock and fill them in. Swap
	 * space for holes was already reserved when the file was grown.
	 */
	if (non_anon(tp->tn_anon, btop(off), &tmpoff, &tlen)) {
		if (!rw_tryupgrade(&tp->tn_contents)) {
			rw_exit(&tp->tn_contents);
			rw_enter(&tp->tn_contents, RW_WRITER);
			/* Size may have changed when lock was dropped */
			if (off + len > tp->tn_size + PAGEOFFSET) {
		for (toff = (anoff_t)off; toff < (anoff_t)off + len;
			if (anon_get_ptr(tp->tn_anon, btop(toff)) == NULL) {
				/* XXX - may allocate mem w. write lock held */
				(void) anon_set_ptr(tp->tn_anon, btop(toff),
				    anon_alloc(vp, toff), ANON_SLEEP);
		rw_downgrade(&tp->tn_contents);

		err = pvn_getpages(tmp_getapage, vp, (u_offset_t)off, len, protp,
		    pl, plsz, seg, addr, rw, cr);

	rw_exit(&tp->tn_contents);
/*
 * Called from pvn_getpages to get a particular page.
 */
	if (pp = page_lookup(vp, off, rw == S_CREATE ? SE_EXCL : SE_SHARED)) {

		pp = page_create_va(vp, off, PAGESIZE,
		    PG_WAIT | PG_EXCL, seg, addr);
		/*
		 * Someone raced in and created the page after we did the
		 * lookup but before we did the create, so go back and
		 * try to look it up again.
		 */
		/*
		 * Fill page from backing store, if any. If none, then
		 * either this is a newly filled hole or page must have
		 * been unmodified and freed so just zero it out.
		 */
		err = swap_getphysname(vp, off, &pvp, &poff);
			panic("tmp_getapage: no anon slot vp %p "
			    "off %llx pp %p\n", (void *)vp, off, (void *)pp);
			flags = (pl == NULL ? B_ASYNC|B_READ : B_READ);
			err = VOP_PAGEIO(pvp, pp, (u_offset_t)poff, PAGESIZE,
			if (flags & B_ASYNC)
		} else if (rw != S_CREATE) {
			pagezero(pp, 0, PAGESIZE);
			pvn_read_done(pp, B_ERROR);
		pvn_plist_init(pp, pl, plsz, off, PAGESIZE, rw);
/*
 * Flags are composed of {B_INVAL, B_DIRTY B_FREE, B_DONTNEED}.
 * If len == 0, do from off to EOF.
 */
static int tmp_nopage = 0;	/* Don't do tmp_putpage's if set */

	register struct vnode *vp,
	caller_context_t *ct)
	register page_t *pp;
	struct tmpnode *tp = VTOTN(vp);

	ASSERT(vp->v_count != 0);

	if (vp->v_flag & VNOMAP)

	/*
	 * This being tmpfs, we don't ever do i/o unless we really
	 * have to (when we're low on memory and pageout calls us
	 * with B_ASYNC | B_FREE or the user explicitly asks for it with
	 *
	 * XXX to approximately track the mod time like ufs we should
	 * update the times here. The problem is, once someone does a
	 * store we never clear the mod bit and do i/o, thus fsflush
	 * will keep calling us every 30 seconds to do the i/o and we'll
	 * continually update the mod time. At least we update the mod
	 * time on the first store because this results in a call to getpage.
	 */
	if (flags != (B_ASYNC | B_FREE) && (flags & B_INVAL) == 0 &&
	    (flags & B_DONTNEED) == 0)
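	/*
	 * In effect only pageout's B_ASYNC | B_FREE pushes, explicit
	 * B_INVAL invalidations and B_DONTNEED requests do real work
	 * here; an ordinary writeback request returns early because
	 * tmpfs pages have no permanent backing store to keep in sync.
	 */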
	/*
	 * If this thread owns the lock, i.e., this thread grabbed it
	 * as writer somewhere above, then we don't need to grab the
	 * lock as reader in this routine.
	 */
	dolock = (rw_owner(&tp->tn_contents) != curthread);

	/*
	 * If this is pageout don't block on the lock as you could deadlock
	 * when freemem == 0 (another thread has the read lock and is blocked
	 * creating a page, and a third thread is waiting to get the writers
	 * lock - waiting writers priority blocks us from getting the read
	 * lock). Of course, if the only freeable pages are on this tmpnode
	 * we're hosed anyways. A better solution might be a new lock type.
	 * Note: ufs has the same problem.
	 */
	if (curproc == proc_pageout) {
		if (!rw_tryenter(&tp->tn_contents, RW_READER))
		rw_enter(&tp->tn_contents, RW_READER);

	if (!vn_has_cached_data(vp))

		if (curproc == proc_pageout) {
			panic("tmp: pageout can't block");

		/* Search the entire vp list for pages >= off. */
		err = pvn_vplist_dirty(vp, (u_offset_t)off, tmp_putapage,
		/*
		 * Loop over all offsets in the range [off...off + len]
		 * looking for pages to deal with.
		 */
		eoff = MIN(off + len, tp->tn_size);
		for (io_off = off; io_off < eoff; io_off += io_len) {
			/*
			 * If we are not invalidating, synchronously
			 * freeing or writing pages use the routine
			 * page_lookup_nowait() to prevent reclaiming
			 * them from the free list.
			 */
			if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
				pp = page_lookup(vp, io_off,
				    (flags & (B_INVAL | B_FREE)) ?
				    SE_EXCL : SE_SHARED);
				pp = page_lookup_nowait(vp, io_off,
				    (flags & B_FREE) ? SE_EXCL : SE_SHARED);

			if (pp == NULL || pvn_getdirty(pp, flags) == 0)
			err = tmp_putapage(vp, pp, &io_off, &io_len,

	/* If invalidating, verify all pages on vnode list are gone. */
	if (err == 0 && off == 0 && len == 0 &&
	    (flags & B_INVAL) && vn_has_cached_data(vp)) {
		panic("tmp_putpage: B_INVAL, pages not gone");

	if ((curproc == proc_pageout) || dolock)
		rw_exit(&tp->tn_contents);

	/*
	 * Only reason putapage is going to give us SE_NOSWAP as error
	 * is when we ask a page to be written to physical backing store
	 * and there is none. Ignore this because we might be dealing
	 * with a swap page which does not have any backing store
	 * on disk. In any other case we won't get this error over here.
	 */
	if (err == SE_NOSWAP)
long tmp_putpagecnt, tmp_pagespushed;

/*
 * Write out a single page.
 * For tmpfs this means choose a physical swap slot and write the page
 * out using VOP_PAGEIO. For performance, we attempt to kluster; i.e.,
 * we try to find a bunch of other dirty pages adjacent in the file
 * and a bunch of contiguous swap slots, and then write all the pages
 * out in a single i/o.
 */
	ulong_t klstart, kllen;
	page_t *pplist, *npplist;
	extern int klustsize;
	size_t pp_off, pp_len;

	ASSERT(PAGE_LOCKED(pp));

	/* Kluster in tmp_klustsize chunks */
	tmp_klustsize = klustsize;
	offset = pp->p_offset;
	klstart = (offset / tmp_klustsize) * tmp_klustsize;
	kllen = MIN(tmp_klustsize, tp->tn_size - klstart);
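	/*
	 * Example: with a 64K klustsize, a dirty page at file offset
	 * 0x23000 gives klstart = 0x20000 and kllen = 64K clamped to
	 * whatever of the file remains past klstart, so the kluster
	 * never extends beyond EOF.
	 */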
	/* Get a kluster of pages */
	pplist =
	    pvn_write_kluster(vp, pp, &tmpoff, &pp_len, klstart, kllen, flags);
	pp_off = (size_t)tmpoff;
	/*
	 * Get a cluster of physical offsets for the pages; the amount we
	 * get may be some subrange of what we ask for (io_off, io_len).
	 */
	err = swap_newphysname(vp, offset, &io_off, &io_len, &pvp, &pstart);
	ASSERT(err != SE_NOANON);	/* anon slot must have been filled */
		pvn_write_done(pplist, B_ERROR | B_WRITE | flags);
		/*
		 * If this routine is called as a result of segvn_sync
		 * operation and we have no physical swap then we can get an
		 * error here. In such case we would return SE_NOSWAP as error.
		 * At this point, we expect only SE_NOSWAP.
		 */
		ASSERT(err == SE_NOSWAP);
		if (flags & B_INVAL)

	ASSERT(pp_off <= io_off && io_off + io_len <= pp_off + pp_len);
	ASSERT(io_off <= offset && offset < io_off + io_len);

	/* Toss pages at front/rear that we couldn't get physical backing for */
	if (io_off != pp_off) {
		page_list_break(&pplist, &npplist, btop(io_off - pp_off));
		ASSERT(pplist->p_offset == pp_off);
		ASSERT(pplist->p_prev->p_offset == io_off - PAGESIZE);
		pvn_write_done(pplist, B_ERROR | B_WRITE | flags);
	if (io_off + io_len < pp_off + pp_len) {
		page_list_break(&pplist, &npplist, btop(io_len));
		ASSERT(npplist->p_offset == io_off + io_len);
		ASSERT(npplist->p_prev->p_offset == pp_off + pp_len - PAGESIZE);
		pvn_write_done(npplist, B_ERROR | B_WRITE | flags);

	ASSERT(pplist->p_offset == io_off);
	ASSERT(pplist->p_prev->p_offset == io_off + io_len - PAGESIZE);
	ASSERT(btopr(io_len) <= btopr(kllen));

	/* Do i/o on the remaining kluster */
	err = VOP_PAGEIO(pvp, pplist, (u_offset_t)pstart, io_len,
	    B_WRITE | flags, cr, NULL);

	if ((flags & B_ASYNC) == 0) {
		pvn_write_done(pplist, ((err) ? B_ERROR : 0) | B_WRITE | flags);

	tmp_pagespushed += btop(io_len);

	if (err && err != ENOMEM && err != SE_NOSWAP)
		cmn_err(CE_WARN, "tmp_putapage: err %d\n", err);
	caller_context_t *ct)
	struct segvn_crargs vn_a;
	struct tmpnode *tp = (struct tmpnode *)VTOTN(vp);

	if (vp->v_flag & VNOMAP)

	if (off < 0 || (offset_t)(off + len) < 0 ||
	    off > MAXOFF_T || (off + len) > MAXOFF_T)

	if (vp->v_type != VREG)

	/*
	 * Don't allow mapping to locked file
	 */
	if (vn_has_mandatory_locks(vp, tp->tn_mode)) {

	error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);

	vn_a.offset = (u_offset_t)off;
	vn_a.type = flags & MAP_TYPE;
	vn_a.maxprot = maxprot;
	vn_a.flags = flags & ~MAP_TYPE;
	vn_a.lgrp_mem_policy_flags = 0;

	error = as_map(as, *addrp, len, segvn_create, &vn_a);

/*
 * tmp_addmap and tmp_delmap can't be called since the vp
 * maintained in the segvn mapping is NULL.
 */
	caller_context_t *ct)

	caller_context_t *ct)
tmp_freesp(struct vnode *vp, struct flock64 *lp, int flag)
	register struct tmpnode *tp = VTOTN(vp);

	ASSERT(vp->v_type == VREG);
	ASSERT(lp->l_start >= 0);

	rw_enter(&tp->tn_rwlock, RW_WRITER);
	if (tp->tn_size == lp->l_start) {
		rw_exit(&tp->tn_rwlock);

	/*
	 * Check for any mandatory locks on the range
	 */
	if (MANDLOCK(vp, tp->tn_mode)) {

		save_start = lp->l_start;

		if (tp->tn_size < lp->l_start) {
			/*
			 * "Truncate up" case: need to make sure there
			 * is no lock beyond current end-of-file. To
			 * do so, we need to set l_start to the size
			 * of the file temporarily.
			 */
			lp->l_start = tp->tn_size;
		lp->l_type = F_WRLCK;
		lp->l_pid = ttoproc(curthread)->p_pid;
		i = (flag & (FNDELAY|FNONBLOCK)) ? 0 : SLPFLCK;
		if ((i = reclock(vp, lp, i, 0, lp->l_start, NULL)) != 0 ||
		    lp->l_type != F_UNLCK) {
			rw_exit(&tp->tn_rwlock);
			return (i ? i : EAGAIN);

		lp->l_start = save_start;
	VFSTOTM(vp->v_vfsp);
	rw_enter(&tp->tn_contents, RW_WRITER);
	error = tmpnode_trunc((struct tmount *)VFSTOTM(vp->v_vfsp),
	    tp, (ulong_t)lp->l_start);
	rw_exit(&tp->tn_contents);
	rw_exit(&tp->tn_rwlock);
	struct flock64 *bfp,
	caller_context_t *ct)

	if (cmd != F_FREESP)
	if ((error = convoff(vp, bfp, 0, (offset_t)offset)) == 0) {
		if ((bfp->l_start > MAXOFF_T) || (bfp->l_len > MAXOFF_T))
		error = tmp_freesp(vp, bfp, flag);

		if (error == 0 && bfp->l_start == 0)
			vnevent_truncate(vp, ct);
	caller_context_t *ct)
	return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
tmp_rwlock(struct vnode *vp, int write_lock, caller_context_t *ctp)
	struct tmpnode *tp = VTOTN(vp);

		rw_enter(&tp->tn_rwlock, RW_WRITER);
		rw_enter(&tp->tn_rwlock, RW_READER);
	return (write_lock);
tmp_rwunlock(struct vnode *vp, int write_lock, caller_context_t *ctp)
	struct tmpnode *tp = VTOTN(vp);

	rw_exit(&tp->tn_rwlock);
	caller_context_t *ct)
	struct tmpnode *tp = NULL;

	case _PC_XATTR_EXISTS:
		if (vp->v_vfsp->vfs_flag & VFS_XATTR) {
			*valp = 0;	/* assume no attributes */
			error = 0;	/* okay to ask */
			rw_enter(&tp->tn_rwlock, RW_READER);
			if (tp->tn_xattrdp) {
				rw_enter(&tp->tn_xattrdp->tn_rwlock, RW_READER);
				/* do not count "." and ".." */
				if (tp->tn_xattrdp->tn_dirents > 2)
				rw_exit(&tp->tn_xattrdp->tn_rwlock);
			rw_exit(&tp->tn_rwlock);

	case _PC_SATTR_ENABLED:
	case _PC_SATTR_EXISTS:
		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
		    (vp->v_type == VREG || vp->v_type == VDIR);

	case _PC_TIMESTAMP_RESOLUTION:
		/* nanosecond timestamp resolution */

		error = fs_pathconf(vp, cmd, valp, cr, ct);
struct vnodeops *tmp_vnodeops;

const fs_operation_def_t tmp_vnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = tmp_open },
	VOPNAME_CLOSE,		{ .vop_close = tmp_close },
	VOPNAME_READ,		{ .vop_read = tmp_read },
	VOPNAME_WRITE,		{ .vop_write = tmp_write },
	VOPNAME_IOCTL,		{ .vop_ioctl = tmp_ioctl },
	VOPNAME_GETATTR,	{ .vop_getattr = tmp_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = tmp_setattr },
	VOPNAME_ACCESS,		{ .vop_access = tmp_access },
	VOPNAME_LOOKUP,		{ .vop_lookup = tmp_lookup },
	VOPNAME_CREATE,		{ .vop_create = tmp_create },
	VOPNAME_REMOVE,		{ .vop_remove = tmp_remove },
	VOPNAME_LINK,		{ .vop_link = tmp_link },
	VOPNAME_RENAME,		{ .vop_rename = tmp_rename },
	VOPNAME_MKDIR,		{ .vop_mkdir = tmp_mkdir },
	VOPNAME_RMDIR,		{ .vop_rmdir = tmp_rmdir },
	VOPNAME_READDIR,	{ .vop_readdir = tmp_readdir },
	VOPNAME_SYMLINK,	{ .vop_symlink = tmp_symlink },
	VOPNAME_READLINK,	{ .vop_readlink = tmp_readlink },
	VOPNAME_FSYNC,		{ .vop_fsync = tmp_fsync },
	VOPNAME_INACTIVE,	{ .vop_inactive = tmp_inactive },
	VOPNAME_FID,		{ .vop_fid = tmp_fid },
	VOPNAME_RWLOCK,		{ .vop_rwlock = tmp_rwlock },
	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = tmp_rwunlock },
	VOPNAME_SEEK,		{ .vop_seek = tmp_seek },
	VOPNAME_SPACE,		{ .vop_space = tmp_space },
	VOPNAME_GETPAGE,	{ .vop_getpage = tmp_getpage },
	VOPNAME_PUTPAGE,	{ .vop_putpage = tmp_putpage },
	VOPNAME_MAP,		{ .vop_map = tmp_map },
	VOPNAME_ADDMAP,		{ .vop_addmap = tmp_addmap },
	VOPNAME_DELMAP,		{ .vop_delmap = tmp_delmap },
	VOPNAME_PATHCONF,	{ .vop_pathconf = tmp_pathconf },
	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },