external/cddl/osnet/dist/uts/common/fs/zfs/zfs_vnops.c (netbsd-mini2440.git)
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 /* Portions Copyright 2007 Jeremy Teo */
28 #include <sys/types.h>
29 #include <sys/param.h>
30 #include <sys/time.h>
31 #include <sys/systm.h>
32 #include <sys/sysmacros.h>
33 #include <sys/resource.h>
34 #include <sys/vfs.h>
35 #include <sys/vnode.h>
36 #include <sys/file.h>
37 #include <sys/stat.h>
38 #include <sys/kmem.h>
39 #include <sys/taskq.h>
40 #include <sys/uio.h>
41 #include <sys/atomic.h>
42 #include <sys/namei.h>
43 #include <sys/mman.h>
44 #include <sys/cmn_err.h>
45 #include <sys/errno.h>
46 #include <sys/unistd.h>
47 #include <sys/zfs_dir.h>
48 #include <sys/zfs_ioctl.h>
49 #include <sys/fs/zfs.h>
50 #include <sys/dmu.h>
51 #include <sys/spa.h>
52 #include <sys/txg.h>
53 #include <sys/dbuf.h>
54 #include <sys/zap.h>
55 #include <sys/dirent.h>
56 #include <sys/policy.h>
57 #include <sys/sunddi.h>
58 #include <sys/filio.h>
59 #include <sys/zfs_ctldir.h>
60 #include <sys/zfs_fuid.h>
61 #include <sys/zfs_vfsops.h>
62 #include <sys/dnlc.h>
63 #include <sys/zfs_rlock.h>
64 #include <sys/extdirent.h>
65 #include <sys/kidmap.h>
66 #include <sys/buf.h>
67 #include <sys/sched.h>
68 #include <sys/acl.h>
69 #include <sys/extattr.h>
71 #ifdef __NetBSD__
72 #include <miscfs/genfs/genfs.h>
73 #endif
76 * Programming rules.
78 * Each vnode op performs some logical unit of work. To do this, the ZPL must
79 * properly lock its in-core state, create a DMU transaction, do the work,
80 * record this work in the intent log (ZIL), commit the DMU transaction,
81 * and wait for the intent log to commit if it is a synchronous operation.
82 * Moreover, the vnode ops must work in both normal and log replay context.
83 * The ordering of events is important to avoid deadlocks and references
84 * to freed memory. The example below illustrates the following Big Rules:
86 * (1) A check must be made in each zfs thread for a mounted file system.
87 * This is done, while avoiding races, using ZFS_ENTER(zfsvfs).
88 * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes
89 * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros
90 * can return EIO from the calling function.
92 * (2) VN_RELE() should always be the last thing except for zil_commit()
93 * (if necessary) and ZFS_EXIT(). This is for 3 reasons:
94 * First, if it's the last reference, the vnode/znode
95 * can be freed, so the zp may point to freed memory. Second, the last
96 * reference will call zfs_zinactive(), which may induce a lot of work --
97 * pushing cached pages (which acquires range locks) and syncing out
98 * cached atime changes. Third, zfs_zinactive() may require a new tx,
99 * which could deadlock the system if you were already holding one.
100 * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
102 * (3) All range locks must be grabbed before calling dmu_tx_assign(),
103 * as they can span dmu_tx_assign() calls.
105 * (4) Always pass zfsvfs->z_assign as the second argument to dmu_tx_assign().
106 * In normal operation, this will be TXG_NOWAIT. During ZIL replay,
107 * it will be a specific txg. Either way, dmu_tx_assign() never blocks.
108 * This is critical because we don't want to block while holding locks.
109 * Note, in particular, that if a lock is sometimes acquired before
110 * the tx assigns, and sometimes after (e.g. z_lock), then failing to
111 * use a non-blocking assign can deadlock the system. The scenario:
113 * Thread A has grabbed a lock before calling dmu_tx_assign().
114 * Thread B is in an already-assigned tx, and blocks for this lock.
115 * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
116 * forever, because the previous txg can't quiesce until B's tx commits.
118 * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
119 * then drop all locks, call dmu_tx_wait(), and try again.
121 * (5) If the operation succeeded, generate the intent log entry for it
122 * before dropping locks. This ensures that the ordering of events
123 * in the intent log matches the order in which they actually occurred.
125 * (6) At the end of each vnode op, the DMU tx must always commit,
126 * regardless of whether there were any errors.
128 * (7) After dropping all locks, invoke zil_commit(zilog, seq, foid)
129 * to ensure that synchronous semantics are provided when necessary.
131 * In general, this is how things should be ordered in each vnode op:
133 * ZFS_ENTER(zfsvfs); // exit if unmounted
134 * top:
135 * zfs_dirent_lock(&dl, ...) // lock directory entry (may VN_HOLD())
136 * rw_enter(...); // grab any other locks you need
137 * tx = dmu_tx_create(...); // get DMU tx
138 * dmu_tx_hold_*(); // hold each object you might modify
139 * error = dmu_tx_assign(tx, zfsvfs->z_assign); // try to assign
140 * if (error) {
141 * rw_exit(...); // drop locks
142 * zfs_dirent_unlock(...); // unlock directory entry
143 * VN_RELE(...); // release held vnodes
144 * if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
145 * dmu_tx_wait(tx);
146 * dmu_tx_abort(tx);
147 * goto top;
149 * dmu_tx_abort(tx); // abort DMU tx
150 * ZFS_EXIT(zfsvfs); // finished in zfs
151 * return (error); // really out of space
153 * error = do_real_work(); // do whatever this VOP does
154 * if (error == 0)
155 * zfs_log_*(...); // on success, make ZIL entry
156 * dmu_tx_commit(tx); // commit DMU tx -- error or not
157 * rw_exit(...); // drop locks
158 * zfs_dirent_unlock(dl, 0); // unlock directory entry
159 * VN_RELE(...); // release held vnodes
160 * zil_commit(zilog, seq, foid); // synchronous when necessary
161 * ZFS_EXIT(zfsvfs); // finished in zfs
162 * return (error); // done, report error
165 /* ARGSUSED */
166 static int
167 zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
169 znode_t *zp = VTOZ(*vpp);
171 if ((flag & FWRITE) && (zp->z_phys->zp_flags & ZFS_APPENDONLY) &&
172 ((flag & FAPPEND) == 0)) {
173 return (EPERM);
176 if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
177 ZTOV(zp)->v_type == VREG &&
178 !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) &&
179 zp->z_phys->zp_size > 0)
180 if (fs_vscan(*vpp, cr, 0) != 0)
181 return (EACCES);
183 /* Keep a count of the synchronous opens in the znode */
184 if (flag & (FSYNC | FDSYNC))
185 atomic_inc_32(&zp->z_sync_cnt);
187 return (0);
190 /* ARGSUSED */
191 static int
192 zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
193 caller_context_t *ct)
195 znode_t *zp = VTOZ(vp);
197 dprintf("zfs_close called\n");
198 /* Decrement the synchronous opens in the znode */
199 if ((flag & (FSYNC | FDSYNC)) && (count == 1))
200 atomic_dec_32(&zp->z_sync_cnt);
203 * Clean up any locks held by this process on the vp.
205 cleanlocks(vp, ddi_get_pid(), 0);
206 cleanshares(vp, ddi_get_pid());
208 if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
209 ZTOV(zp)->v_type == VREG &&
210 !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) &&
211 zp->z_phys->zp_size > 0)
212 VERIFY(fs_vscan(vp, cr, 1) == 0);
214 return (0);
217 #ifdef PORT_NETBSD
219 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
220 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
222 static int
223 zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
225 znode_t *zp = VTOZ(vp);
226 uint64_t noff = (uint64_t)*off; /* new offset */
227 uint64_t file_sz;
228 int error;
229 boolean_t hole;
231 file_sz = zp->z_phys->zp_size;
232 if (noff >= file_sz) {
233 return (ENXIO);
236 if (cmd == _FIO_SEEK_HOLE)
237 hole = B_TRUE;
238 else
239 hole = B_FALSE;
241 error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
243 /* end of file? */
244 if ((error == ESRCH) || (noff > file_sz)) {
246 * Handle the virtual hole at the end of file.
248 if (hole) {
249 *off = file_sz;
250 return (0);
252 return (ENXIO);
255 if (noff < *off)
256 return (error);
257 *off = noff;
258 return (error);
260 #endif /* PORT_NETBSD */
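/*
 * Illustrative sketch only (not part of this file): how a userland
 * caller could drive the hole-seeking ioctls handled by zfs_holey()
 * via zfs_ioctl() below.  The helper name and error handling are
 * hypothetical; _FIO_SEEK_HOLE/_FIO_SEEK_DATA come from <sys/filio.h>.
 */
#if 0
#include <sys/ioctl.h>
#include <sys/filio.h>
#include <errno.h>

static int
seek_next_hole(int fd, offset_t *off)
{
	/* "off" is in/out: position to search from, start of hole out. */
	if (ioctl(fd, _FIO_SEEK_HOLE, off) == -1)
		return (errno);		/* ENXIO: no hole past *off */
	return (0);			/* *off now points at the hole */
}
#endif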
262 static int
263 zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
264 int *rvalp, caller_context_t *ct)
266 offset_t off;
267 int error;
268 zfsvfs_t *zfsvfs;
269 znode_t *zp;
271 switch (com) {
272 case _FIOFFS:
273 return (0);
276 * The following two ioctls are used by bfu. We fake them out
277 * here; this is necessary to avoid bfu errors.
279 case _FIOGDIO:
280 case _FIOSDIO:
281 return (0);
282 #ifdef PORT_NETBSD /* XXX NetBSD Do we support holes in files ? */
283 case _FIO_SEEK_DATA:
284 case _FIO_SEEK_HOLE:
285 if (ddi_copyin((void *)data, &off, sizeof (off), flag))
286 return (EFAULT);
288 zp = VTOZ(vp);
289 zfsvfs = zp->z_zfsvfs;
290 ZFS_ENTER(zfsvfs);
291 ZFS_VERIFY_ZP(zp);
293 /* offset parameter is in/out */
294 error = zfs_holey(vp, com, &off);
295 ZFS_EXIT(zfsvfs);
296 if (error)
297 return (error);
298 if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
299 return (EFAULT);
300 return (0);
301 #endif
304 return (ENOTTY);
307 #ifdef PORT_NETBSD
309 * When a file is memory mapped, we must keep the IO data synchronized
310 * between the DMU cache and the memory mapped pages. What this means:
312 * On Write: If we find a memory mapped page, we write to *both*
313 * the page and the dmu buffer.
315 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
316 * the file is memory mapped.
318 static int
319 mappedwrite(vnode_t *vp, int nbytes, uio_t *uio, dmu_tx_t *tx)
321 znode_t *zp = VTOZ(vp);
322 objset_t *os = zp->z_zfsvfs->z_os;
323 vm_object_t obj;
324 vm_page_t m;
325 struct sf_buf *sf;
326 int64_t start, off;
327 int len = nbytes;
328 int error = 0;
329 uint64_t dirbytes;
331 ASSERT(vp->v_mount != NULL);
332 obj = vp->v_object;
333 ASSERT(obj != NULL);
335 start = uio->uio_loffset;
336 off = start & PAGEOFFSET;
337 dirbytes = 0;
338 VM_OBJECT_LOCK(obj);
339 for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
340 uint64_t bytes = MIN(PAGESIZE - off, len);
341 uint64_t fsize;
343 again:
344 if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
345 vm_page_is_valid(m, (vm_offset_t)off, bytes)) {
346 uint64_t woff;
347 caddr_t va;
349 if (vm_page_sleep_if_busy(m, FALSE, "zfsmwb"))
350 goto again;
351 fsize = obj->un_pager.vnp.vnp_size;
352 vm_page_busy(m);
353 vm_page_lock_queues();
354 vm_page_undirty(m);
355 vm_page_unlock_queues();
356 VM_OBJECT_UNLOCK(obj);
357 if (dirbytes > 0) {
358 error = dmu_write_uio(os, zp->z_id, uio,
359 dirbytes, tx);
360 dirbytes = 0;
362 if (error == 0) {
363 sched_pin();
364 sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
365 va = (caddr_t)sf_buf_kva(sf);
366 woff = uio->uio_loffset - off;
367 error = uiomove(va + off, bytes, UIO_WRITE, uio);
369 * The uiomove() above could have been partially
370 * successful, which is why we call dmu_write()
371 * below unconditionally. The page was marked
372 * non-dirty above, and we would lose the changes
373 * without doing so. If the uiomove() failed
374 * entirely, we simply write what we had
375 * before one more time.
377 dmu_write(os, zp->z_id, woff,
378 MIN(PAGESIZE, fsize - woff), va, tx);
379 sf_buf_free(sf);
380 sched_unpin();
382 VM_OBJECT_LOCK(obj);
383 vm_page_wakeup(m);
384 } else {
385 if (__predict_false(obj->cache != NULL)) {
386 vm_page_cache_free(obj, OFF_TO_IDX(start),
387 OFF_TO_IDX(start) + 1);
389 dirbytes += bytes;
391 len -= bytes;
392 off = 0;
393 if (error)
394 break;
396 VM_OBJECT_UNLOCK(obj);
397 if (error == 0 && dirbytes > 0)
398 error = dmu_write_uio(os, zp->z_id, uio, dirbytes, tx);
399 return (error);
403 * When a file is memory mapped, we must keep the IO data synchronized
404 * between the DMU cache and the memory mapped pages. What this means:
406 * On Read: We "read" preferentially from memory mapped pages;
407 * otherwise we fall back to the dmu buffer.
409 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
410 * the file is memory mapped.
412 static int
413 mappedread(vnode_t *vp, int nbytes, uio_t *uio)
415 znode_t *zp = VTOZ(vp);
416 objset_t *os = zp->z_zfsvfs->z_os;
417 vm_object_t obj;
418 vm_page_t m;
419 struct sf_buf *sf;
420 int64_t start, off;
421 caddr_t va;
422 int len = nbytes;
423 int error = 0;
424 uint64_t dirbytes;
426 ASSERT(vp->v_mount != NULL);
427 obj = vp->v_object;
428 ASSERT(obj != NULL);
430 start = uio->uio_loffset;
431 off = start & PAGEOFFSET;
432 dirbytes = 0;
433 VM_OBJECT_LOCK(obj);
434 for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
435 uint64_t bytes = MIN(PAGESIZE - off, len);
437 again:
438 if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
439 vm_page_is_valid(m, (vm_offset_t)off, bytes)) {
440 if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb"))
441 goto again;
442 vm_page_busy(m);
443 VM_OBJECT_UNLOCK(obj);
444 if (dirbytes > 0) {
445 error = dmu_read_uio(os, zp->z_id, uio,
446 dirbytes);
447 dirbytes = 0;
449 if (error == 0) {
450 sched_pin();
451 sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
452 va = (caddr_t)sf_buf_kva(sf);
453 error = uiomove(va + off, bytes, UIO_READ, uio);
454 sf_buf_free(sf);
455 sched_unpin();
457 VM_OBJECT_LOCK(obj);
458 vm_page_wakeup(m);
459 } else if (m != NULL && uio->uio_segflg == UIO_NOCOPY) {
461 * The code below is here to make sendfile(2) work
462 * correctly with ZFS. As pointed out by ups@,
463 * sendfile(2) should be changed to use VOP_GETPAGES(),
464 * but that would pessimize sendfile/UFS performance,
465 * which is why this special case is handled in ZFS code.
467 if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb"))
468 goto again;
469 vm_page_busy(m);
470 VM_OBJECT_UNLOCK(obj);
471 if (dirbytes > 0) {
472 error = dmu_read_uio(os, zp->z_id, uio,
473 dirbytes);
474 dirbytes = 0;
476 if (error == 0) {
477 sched_pin();
478 sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
479 va = (caddr_t)sf_buf_kva(sf);
480 error = dmu_read(os, zp->z_id, start + off,
481 bytes, (void *)(va + off));
482 sf_buf_free(sf);
483 sched_unpin();
485 VM_OBJECT_LOCK(obj);
486 vm_page_wakeup(m);
487 if (error == 0)
488 uio->uio_resid -= bytes;
489 } else {
490 dirbytes += bytes;
492 len -= bytes;
493 off = 0;
494 if (error)
495 break;
497 VM_OBJECT_UNLOCK(obj);
498 if (error == 0 && dirbytes > 0)
499 error = dmu_read_uio(os, zp->z_id, uio, dirbytes);
500 return (error);
502 #endif /* PORT_NETBSD */
503 offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
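/*
 * Illustrative sketch only (hypothetical helper, not in the original):
 * how zfs_read() below walks a request in zfs_read_chunk_size pieces.
 * P2PHASE() trims the first chunk so that every subsequent chunk
 * starts on a chunk-size boundary.
 */
#if 0
static void
chunk_walk(offset_t off, ssize_t resid)
{
	while (resid > 0) {
		ssize_t nbytes = MIN(resid, zfs_read_chunk_size -
		    P2PHASE(off, zfs_read_chunk_size));
		/* e.g. off = 0x180000 with the default 1MB chunk gives
		 * a first nbytes of 512KB, then full 1MB chunks. */
		off += nbytes;
		resid -= nbytes;
	}
}
#endif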
506 * Read bytes from specified file into supplied buffer.
508 * IN: vp - vnode of file to be read from.
509 * uio - structure supplying read location, range info,
510 * and return buffer.
511 * ioflag - SYNC flags; used to provide FRSYNC semantics.
512 * cr - credentials of caller.
513 * ct - caller context
515 * OUT: uio - updated offset and range, buffer filled.
517 * RETURN: 0 if success
518 * error code if failure
520 * Side Effects:
521 * vp - atime updated if byte count > 0
523 /* ARGSUSED */
524 static int
525 zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
527 znode_t *zp = VTOZ(vp);
528 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
529 objset_t *os;
530 ssize_t n, nbytes;
531 int error;
532 rl_t *rl;
534 dprintf("zfs_read called\n");
536 ZFS_ENTER(zfsvfs);
537 ZFS_VERIFY_ZP(zp);
538 os = zfsvfs->z_os;
540 if (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) {
541 ZFS_EXIT(zfsvfs);
542 return (EACCES);
546 * Validate file offset
548 if (uio->uio_loffset < (offset_t)0) {
549 ZFS_EXIT(zfsvfs);
550 return (EINVAL);
554 * Fasttrack empty reads
556 if (uio->uio_resid == 0) {
557 ZFS_EXIT(zfsvfs);
558 return (0);
562 * Check for mandatory locks
564 if (MANDMODE((mode_t)zp->z_phys->zp_mode)) {
565 if (error = chklock(vp, FREAD,
566 uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
567 ZFS_EXIT(zfsvfs);
568 return (error);
573 * If we're in FRSYNC mode, sync out this znode before reading it.
575 if (ioflag & FRSYNC)
576 zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
579 * Lock the range against changes.
581 rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);
584 * If we are reading past end-of-file we can skip
585 * to the end; but we might still need to set atime.
587 if (uio->uio_loffset >= zp->z_phys->zp_size) {
588 error = 0;
589 goto out;
592 ASSERT(uio->uio_loffset < zp->z_phys->zp_size);
593 n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset);
595 while (n > 0) {
596 nbytes = MIN(n, zfs_read_chunk_size -
597 P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
599 // if (vn_has_cached_data(vp))
600 // error = mappedread(vp, nbytes, uio);
601 // else
602 error = dmu_read_uio(os, zp->z_id, uio, nbytes);
603 if (error) {
604 /* convert checksum errors into IO errors */
605 if (error == ECKSUM)
606 error = EIO;
607 break;
610 n -= nbytes;
613 out:
614 zfs_range_unlock(rl);
616 ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
617 ZFS_EXIT(zfsvfs);
618 return (error);
622 * Fault in the pages of the first n bytes specified by the uio structure.
623 * 1 byte in each page is touched and the uio struct is unmodified.
624 * Any error will exit this routine, as this is only a best-effort
625 * attempt to get the pages resident. This is a copy of ufs_trans_touch().
627 static void
628 zfs_prefault_write(ssize_t n, struct uio *uio)
630 struct iovec *iov;
631 ulong_t cnt, incr;
632 caddr_t p;
634 if (VMSPACE_IS_KERNEL_P(uio->uio_vmspace))
635 return;
637 iov = uio->uio_iov;
639 while (n) {
640 cnt = MIN(iov->iov_len, n);
641 if (cnt == 0) {
642 /* empty iov entry */
643 iov++;
644 continue;
646 n -= cnt;
648 * touch each page in this segment.
650 p = iov->iov_base;
651 while (cnt) {
652 if (fubyte(p) == -1)
653 return;
654 incr = MIN(cnt, PAGESIZE);
655 p += incr;
656 cnt -= incr;
659 * touch the last byte in case it straddles a page.
661 p--;
662 if (fubyte(p) == -1)
663 return;
664 iov++;
669 * Write the bytes to a file.
671 * IN: vp - vnode of file to be written to.
672 * uio - structure supplying write location, range info,
673 * and data buffer.
674 * ioflag - IO_APPEND flag set if in append mode.
675 * cr - credentials of caller.
676 * ct - caller context (NFS/CIFS fem monitor only)
678 * OUT: uio - updated offset and range.
680 * RETURN: 0 if success
681 * error code if failure
683 * Timestamps:
684 * vp - ctime|mtime updated if byte count > 0
686 /* ARGSUSED */
687 static int
688 zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
690 znode_t *zp = VTOZ(vp);
691 rlim64_t limit = MAXOFFSET_T;
692 ssize_t start_resid = uio->uio_resid;
693 ssize_t tx_bytes;
694 uint64_t end_size;
695 dmu_tx_t *tx;
696 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
697 zilog_t *zilog;
698 offset_t woff;
699 ssize_t n, nbytes;
700 rl_t *rl;
701 int max_blksz = zfsvfs->z_max_blksz;
702 uint64_t pflags;
703 int error;
705 dprintf("zfs_write called\n");
708 * Fasttrack empty write
710 n = start_resid;
711 if (n == 0)
712 return (0);
714 if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
715 limit = MAXOFFSET_T;
717 ZFS_ENTER(zfsvfs);
718 ZFS_VERIFY_ZP(zp);
721 * If immutable or not appending then return EPERM
723 pflags = zp->z_phys->zp_flags;
724 if ((pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
725 ((pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
726 (uio->uio_loffset < zp->z_phys->zp_size))) {
727 ZFS_EXIT(zfsvfs);
728 return (EPERM);
731 zilog = zfsvfs->z_log;
734 * Pre-fault the pages to ensure slow (e.g. NFS) pages
735 * don't hold up txg.
737 zfs_prefault_write(n, uio);
740 * If in append mode, set the io offset pointer to eof.
742 if (ioflag & IO_APPEND) {
744 * Range lock for a file append:
745 * The value for the start of range will be determined by
746 * zfs_range_lock() (to guarantee append semantics).
747 * If this write will cause the block size to increase,
748 * zfs_range_lock() will lock the entire file, so we must
749 * later reduce the range after we grow the block size.
751 rl = zfs_range_lock(zp, 0, n, RL_APPEND);
752 if (rl->r_len == UINT64_MAX) {
753 /* overlocked, zp_size can't change */
754 woff = uio->uio_loffset = zp->z_phys->zp_size;
755 } else {
756 woff = uio->uio_loffset = rl->r_off;
758 } else {
759 woff = uio->uio_loffset;
761 * Validate file offset
763 if (woff < 0) {
764 ZFS_EXIT(zfsvfs);
765 return (EINVAL);
769 * If we need to grow the block size then zfs_range_lock()
770 * will lock a wider range than we request here.
771 * Later after growing the block size we reduce the range.
773 rl = zfs_range_lock(zp, woff, n, RL_WRITER);
776 if (woff >= limit) {
777 zfs_range_unlock(rl);
778 ZFS_EXIT(zfsvfs);
779 return (EFBIG);
782 if ((woff + n) > limit || woff > (limit - n))
783 n = limit - woff;
786 * Check for mandatory locks
788 if (MANDMODE((mode_t)zp->z_phys->zp_mode) &&
789 (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
790 zfs_range_unlock(rl);
791 ZFS_EXIT(zfsvfs);
792 return (error);
794 end_size = MAX(zp->z_phys->zp_size, woff + n);
797 * Write the file in reasonable size chunks. Each chunk is written
798 * in a separate transaction; this keeps the intent log records small
799 * and allows us to do more fine-grained space accounting.
801 while (n > 0) {
803 * Start a transaction.
805 woff = uio->uio_loffset;
806 tx = dmu_tx_create(zfsvfs->z_os);
807 dmu_tx_hold_bonus(tx, zp->z_id);
808 dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
809 error = dmu_tx_assign(tx, zfsvfs->z_assign);
810 if (error) {
811 if (error == ERESTART &&
812 zfsvfs->z_assign == TXG_NOWAIT) {
813 dmu_tx_wait(tx);
814 dmu_tx_abort(tx);
815 continue;
817 dmu_tx_abort(tx);
818 break;
822 * If zfs_range_lock() over-locked we grow the blocksize
823 * and then reduce the lock range. This will only happen
824 * on the first iteration since zfs_range_reduce() will
825 * shrink down r_len to the appropriate size.
827 if (rl->r_len == UINT64_MAX) {
828 uint64_t new_blksz;
830 if (zp->z_blksz > max_blksz) {
831 ASSERT(!ISP2(zp->z_blksz));
832 new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
833 } else {
834 new_blksz = MIN(end_size, max_blksz);
836 zfs_grow_blocksize(zp, new_blksz, tx);
837 zfs_range_reduce(rl, woff, n);
841 * XXX - should we really limit each write to z_max_blksz?
842 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
844 nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
846 if (woff + nbytes > zp->z_phys->zp_size)
847 uvm_vnp_setsize(vp, woff + nbytes);
849 rw_enter(&zp->z_map_lock, RW_READER);
851 tx_bytes = uio->uio_resid;
852 if (vn_has_cached_data(vp)) {
853 rw_exit(&zp->z_map_lock);
854 // error = mappedwrite(vp, nbytes, uio, tx);
855 } else {
856 error = dmu_write_uio(zfsvfs->z_os, zp->z_id,
857 uio, nbytes, tx);
858 rw_exit(&zp->z_map_lock);
860 tx_bytes -= uio->uio_resid;
863 * If we made no progress, we're done. If we made even
864 * partial progress, update the znode and ZIL accordingly.
866 if (tx_bytes == 0) {
867 dmu_tx_commit(tx);
868 ASSERT(error != 0);
869 break;
873 * Clear Set-UID/Set-GID bits on successful write if not
874 * privileged and at least one of the execute bits is set.
876 * It would be nice to do this after all writes have
877 * been done, but that would still expose the ISUID/ISGID
878 * to another app after the partial write is committed.
880 * Note: we don't call zfs_fuid_map_id() here because
881 * user 0 is not an ephemeral uid.
883 mutex_enter(&zp->z_acl_lock);
884 if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) |
885 (S_IXUSR >> 6))) != 0 &&
886 (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 &&
887 secpolicy_vnode_setid_retain(cr, (zp->z_phys->zp_mode & S_ISUID) != 0 && zp->z_phys->zp_uid == 0) != 0) {
888 zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID);
890 mutex_exit(&zp->z_acl_lock);
893 * Update time stamp. NOTE: This marks the bonus buffer as
894 * dirty, so we don't have to do it again for zp_size.
896 zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
899 * Update the file size (zp_size) if it has changed;
900 * account for possible concurrent updates.
902 while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset)
903 (void) atomic_cas_64(&zp->z_phys->zp_size, end_size,
904 uio->uio_loffset);
905 zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
906 dmu_tx_commit(tx);
908 if (error != 0)
909 break;
910 ASSERT(tx_bytes == nbytes);
911 n -= nbytes;
914 zfs_range_unlock(rl);
917 * If we're in replay mode, or we made no progress, return error.
918 * Otherwise, it's at least a partial write, so it's successful.
920 if (zfsvfs->z_assign >= TXG_INITIAL || uio->uio_resid == start_resid) {
921 ZFS_EXIT(zfsvfs);
922 return (error);
925 if (ioflag & (FSYNC | FDSYNC))
926 zil_commit(zilog, zp->z_last_itx, zp->z_id);
928 ZFS_EXIT(zfsvfs);
930 return (0);
933 void
934 zfs_get_done(dmu_buf_t *db, void *vzgd)
936 zgd_t *zgd = (zgd_t *)vzgd;
937 rl_t *rl = zgd->zgd_rl;
938 vnode_t *vp = ZTOV(rl->r_zp);
939 int vfslocked;
941 dmu_buf_rele(db, vzgd);
942 zfs_range_unlock(rl);
944 * Release the vnode asynchronously as we currently have the
945 * txg stopped from syncing.
947 vrele(vp);
948 zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
949 kmem_free(zgd, sizeof (zgd_t));
953 * Get data to generate a TX_WRITE intent log record.
956 zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
958 zfsvfs_t *zfsvfs = arg;
959 objset_t *os = zfsvfs->z_os;
960 znode_t *zp;
961 uint64_t off = lr->lr_offset;
962 dmu_buf_t *db;
963 rl_t *rl;
964 zgd_t *zgd;
965 int dlen = lr->lr_length; /* length of user data */
966 int error = 0;
968 ASSERT(zio);
969 ASSERT(dlen != 0);
972 * Nothing to do if the file has been removed
974 if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0)
975 return (ENOENT);
976 if (zp->z_unlinked) {
978 * Release the vnode asynchronously as we currently have the
979 * txg stopped from syncing.
981 vrele(ZTOV(zp));
983 return (ENOENT);
987 * Write records come in two flavors: immediate and indirect.
988 * For small writes it's cheaper to store the data with the
989 * log record (immediate); for large writes it's cheaper to
990 * sync the data and get a pointer to it (indirect) so that
991 * we don't have to write the data twice.
993 if (buf != NULL) { /* immediate write */
994 rl = zfs_range_lock(zp, off, dlen, RL_READER);
995 /* test for truncation needs to be done while range locked */
996 if (off >= zp->z_phys->zp_size) {
997 error = ENOENT;
998 goto out;
1000 VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf));
1001 } else { /* indirect write */
1002 uint64_t boff; /* block starting offset */
1005 * Have to lock the whole block to ensure when it's
1006 * written out and its checksum is being calculated
1007 * that no one can change the data. We need to re-check
1008 * blocksize after we get the lock in case it's changed!
1010 for (;;) {
1011 if (ISP2(zp->z_blksz)) {
1012 boff = P2ALIGN_TYPED(off, zp->z_blksz,
1013 uint64_t);
1014 } else {
1015 boff = 0;
1017 dlen = zp->z_blksz;
1018 rl = zfs_range_lock(zp, boff, dlen, RL_READER);
1019 if (zp->z_blksz == dlen)
1020 break;
1021 zfs_range_unlock(rl);
1023 /* test for truncation needs to be done while range locked */
1024 if (off >= zp->z_phys->zp_size) {
1025 error = ENOENT;
1026 goto out;
1028 zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP);
1029 zgd->zgd_rl = rl;
1030 zgd->zgd_zilog = zfsvfs->z_log;
1031 zgd->zgd_bp = &lr->lr_blkptr;
1032 VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db));
1033 ASSERT(boff == db->db_offset);
1034 lr->lr_blkoff = off - boff;
1035 error = dmu_sync(zio, db, &lr->lr_blkptr,
1036 lr->lr_common.lrc_txg, zfs_get_done, zgd);
1037 ASSERT((error && error != EINPROGRESS) ||
1038 lr->lr_length <= zp->z_blksz);
1039 if (error == 0)
1040 zil_add_block(zfsvfs->z_log, &lr->lr_blkptr);
1042 * If we get EINPROGRESS, then we need to wait for a
1043 * write IO initiated by dmu_sync() to complete before
1044 * we can release this dbuf. We will finish everything
1045 * up in the zfs_get_done() callback.
1047 if (error == EINPROGRESS)
1048 return (0);
1049 dmu_buf_rele(db, zgd);
1050 kmem_free(zgd, sizeof (zgd_t));
1052 out:
1053 zfs_range_unlock(rl);
1055 * Release the vnode asynchronously as we currently have the
1056 * txg stopped from syncing.
1058 vrele(ZTOV(zp));
1059 return (error);
1062 /*ARGSUSED*/
1063 static int
1064 zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
1065 caller_context_t *ct)
1067 znode_t *zp = VTOZ(vp);
1068 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1069 int error;
1071 ZFS_ENTER(zfsvfs);
1072 ZFS_VERIFY_ZP(zp);
1074 if (flag & V_ACE_MASK)
1075 error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
1076 else
1077 error = zfs_zaccess_rwx(zp, mode, flag, cr);
1079 ZFS_EXIT(zfsvfs);
1080 return (error);
1084 * Lookup an entry in a directory, or an extended attribute directory.
1085 * If it exists, return a held vnode reference for it.
1087 * IN: dvp - vnode of directory to search.
1088 * nm - name of entry to lookup.
1089 * pnp - full pathname to lookup [UNUSED].
1090 * flags - LOOKUP_XATTR set if looking for an attribute.
1091 * rdir - root directory vnode [UNUSED].
1092 * cr - credentials of caller.
1093 * ct - caller context
1094 * direntflags - directory lookup flags
1095 * realpnp - returned pathname.
1097 * OUT: vpp - vnode of located entry, NULL if not found.
1099 * RETURN: 0 if success
1100 * error code if failure
1102 * Timestamps:
1103 * NA
1105 /* ARGSUSED */
1106 static int
1107 zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
1108 int nameiop, cred_t *cr, int flags)
1110 znode_t *zdp = VTOZ(dvp);
1111 zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
1112 int error;
1113 int *direntflags = NULL;
1114 void *realpnp = NULL;
1116 ZFS_ENTER(zfsvfs);
1117 ZFS_VERIFY_ZP(zdp);
1119 *vpp = NULL;
1120 dprintf("zfs_lookup called %s\n", nm);
1121 if (flags & LOOKUP_XATTR) {
1122 #ifdef TODO
1124 * If the xattr property is off, refuse the lookup request.
1126 if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
1127 ZFS_EXIT(zfsvfs);
1128 return (EINVAL);
1130 #endif
1133 * We don't allow recursive attributes.
1134 * Maybe someday we will.
1136 if (zdp->z_phys->zp_flags & ZFS_XATTR) {
1137 ZFS_EXIT(zfsvfs);
1138 return (EINVAL);
1141 if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
1142 ZFS_EXIT(zfsvfs);
1143 return (error);
1147 * Do we have permission to get into attribute directory?
1149 if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
1150 B_FALSE, cr)) {
1151 VN_RELE(*vpp);
1152 *vpp = NULL;
1155 ZFS_EXIT(zfsvfs);
1156 return (error);
1159 if (dvp->v_type != VDIR) {
1160 ZFS_EXIT(zfsvfs);
1161 return (ENOTDIR);
1165 * Check accessibility of directory.
1167 if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
1168 ZFS_EXIT(zfsvfs);
1169 return (error);
1173 * Before tediously performing a linear scan of the directory,
1174 * check the name cache to see if the directory/name pair
1175 * we are looking for is known already.
1178 if ((error = cache_lookup(dvp, vpp, cnp)) >= 0) {
1179 ZFS_EXIT(zfsvfs);
1180 return (error);
1183 if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
1184 NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1185 ZFS_EXIT(zfsvfs);
1186 return (EILSEQ);
1189 error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp);
1190 if (error == 0) {
1192 * Convert device special files
1194 if (IS_DEVVP(*vpp)) {
1195 vnode_t *svp;
1197 svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
1198 VN_RELE(*vpp);
1199 if (svp == NULL)
1200 error = ENOSYS;
1201 else
1202 *vpp = svp;
1206 ZFS_EXIT(zfsvfs);
1208 /* Translate errors and add SAVENAME when needed. */
1209 if (cnp->cn_flags & ISLASTCN) {
1210 switch (nameiop) {
1211 case CREATE:
1212 case RENAME:
1213 if (error == ENOENT) {
1214 error = EJUSTRETURN;
1215 cnp->cn_flags |= SAVENAME;
1216 break;
1218 /* FALLTHROUGH */
1219 case DELETE:
1220 if (error == 0)
1221 cnp->cn_flags |= SAVENAME;
1222 break;
1226 if (error == 0 && (nm[0] != '.' || nm[1] != '\0')) {
1227 int ltype = 0;
1229 if (cnp->cn_flags & ISDOTDOT) {
1230 ltype = VOP_ISLOCKED(dvp);
1231 VOP_UNLOCK(dvp, 0);
1233 error = vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
1234 if (cnp->cn_flags & ISDOTDOT)
1235 vn_lock(dvp, ltype | LK_RETRY);
1236 if (error != 0) {
1237 VN_RELE(*vpp);
1238 *vpp = NULL;
1239 return (error);
1244 * Insert name into cache if appropriate.
1246 if ((cnp->cn_flags & MAKEENTRY) == 0){
1247 return (error);
1249 switch (error) {
1250 case 0:
1251 cache_enter(dvp, *vpp, cnp);
1252 break;
1253 case ENOENT:
1254 if (nameiop != CREATE)
1255 cache_enter(dvp, *vpp, cnp);
1256 break;
1257 default:
1258 break;
1260 return (error);
1264 * Attempt to create a new entry in a directory. If the entry
1265 * already exists, truncate the file if permissible, else return
1266 * an error. Return the vp of the created or trunc'd file.
1268 * IN: dvp - vnode of directory to put new file entry in.
1269 * name - name of new file entry.
1270 * vap - attributes of new file.
1271 * excl - flag indicating exclusive or non-exclusive mode.
1272 * mode - mode to open file with.
1273 * cr - credentials of caller.
1274 * flag - large file flag [UNUSED].
1275 * ct - caller context
1276 * vsecp - ACL to be set
1278 * OUT: vpp - vnode of created or trunc'd entry.
1280 * RETURN: 0 if success
1281 * error code if failure
1283 * Timestamps:
1284 * dvp - ctime|mtime updated if new entry created
1285 * vp - ctime|mtime always, atime if new
1288 /* ARGSUSED */
1289 static int
1290 zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
1291 vnode_t **vpp, cred_t *cr)
1293 znode_t *zp, *dzp = VTOZ(dvp);
1294 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
1295 zilog_t *zilog;
1296 objset_t *os;
1297 zfs_dirlock_t *dl;
1298 dmu_tx_t *tx;
1299 int error;
1300 zfs_acl_t *aclp = NULL;
1301 zfs_fuid_info_t *fuidp = NULL;
1302 void *vsecp = NULL;
1303 int flag = 0;
1305 dprintf("zfs_create called\n");
1307 * If we have an ephemeral id, ACL, or XVATTR then
1308 * make sure the file system is at the proper version
1311 if (zfsvfs->z_use_fuids == B_FALSE &&
1312 (vsecp || (vap->va_mask & AT_XVATTR) ||
1313 IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))))
1314 return (EINVAL);
1316 ZFS_ENTER(zfsvfs);
1317 ZFS_VERIFY_ZP(dzp);
1318 os = zfsvfs->z_os;
1319 zilog = zfsvfs->z_log;
1321 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
1322 NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1323 ZFS_EXIT(zfsvfs);
1324 return (EILSEQ);
1327 if (vap->va_mask & AT_XVATTR) {
1328 if ((error = secpolicy_xvattr((xvattr_t *)vap,
1329 crgetuid(cr), cr, vap->va_type)) != 0) {
1330 ZFS_EXIT(zfsvfs);
1331 return (error);
1334 top:
1335 *vpp = NULL;
1337 if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
1338 vap->va_mode &= ~S_ISVTX;
1340 if (*name == '\0') {
1342 * Null component name refers to the directory itself.
1344 VN_HOLD(dvp);
1345 zp = dzp;
1346 dl = NULL;
1347 error = 0;
1348 } else {
1349 /* possible VN_HOLD(zp) */
1350 int zflg = 0;
1352 if (flag & FIGNORECASE)
1353 zflg |= ZCILOOK;
1355 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1356 NULL, NULL);
1357 if (error) {
1358 if (strcmp(name, "..") == 0)
1359 error = EISDIR;
1360 ZFS_EXIT(zfsvfs);
1361 if (aclp)
1362 zfs_acl_free(aclp);
1363 return (error);
1366 if (vsecp && aclp == NULL) {
1367 error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, &aclp);
1368 if (error) {
1369 ZFS_EXIT(zfsvfs);
1370 if (dl)
1371 zfs_dirent_unlock(dl, 0);
1372 return (error);
1376 if (zp == NULL) {
1377 uint64_t txtype;
1380 * Create a new file object and update the directory
1381 * to reference it.
1383 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
1384 goto out;
1388 * We only support the creation of regular files in
1389 * extended attribute directories.
1391 if ((dzp->z_phys->zp_flags & ZFS_XATTR) &&
1392 (vap->va_type != VREG)) {
1393 error = EINVAL;
1394 goto out;
1397 tx = dmu_tx_create(os);
1398 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
1399 if ((aclp && aclp->z_has_fuids) || IS_EPHEMERAL(crgetuid(cr)) ||
1400 IS_EPHEMERAL(crgetgid(cr))) {
1401 if (zfsvfs->z_fuid_obj == 0) {
1402 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
1403 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
1404 FUID_SIZE_ESTIMATE(zfsvfs));
1405 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ,
1406 FALSE, NULL);
1407 } else {
1408 dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
1409 dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
1410 FUID_SIZE_ESTIMATE(zfsvfs));
1413 dmu_tx_hold_bonus(tx, dzp->z_id);
1414 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
1415 if ((dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) || aclp) {
1416 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1417 0, SPA_MAXBLOCKSIZE);
1419 error = dmu_tx_assign(tx, zfsvfs->z_assign);
1420 if (error) {
1421 zfs_dirent_unlock(dl, 0);
1422 if (error == ERESTART &&
1423 zfsvfs->z_assign == TXG_NOWAIT) {
1424 dmu_tx_wait(tx);
1425 dmu_tx_abort(tx);
1426 goto top;
1428 dmu_tx_abort(tx);
1429 ZFS_EXIT(zfsvfs);
1430 if (aclp)
1431 zfs_acl_free(aclp);
1432 return (error);
1434 zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, aclp, &fuidp);
1435 (void) zfs_link_create(dl, zp, tx, ZNEW);
1436 txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
1437 if (flag & FIGNORECASE)
1438 txtype |= TX_CI;
1439 zfs_log_create(zilog, tx, txtype, dzp, zp, name,
1440 vsecp, fuidp, vap);
1441 if (fuidp)
1442 zfs_fuid_info_free(fuidp);
1443 dmu_tx_commit(tx);
1444 } else {
1445 int aflags = (flag & FAPPEND) ? V_APPEND : 0;
1448 * A directory entry already exists for this name.
1451 * Can't truncate an existing file if in exclusive mode.
1453 if (excl == EXCL) {
1454 error = EEXIST;
1455 goto out;
1458 * Can't open a directory for writing.
1460 if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
1461 error = EISDIR;
1462 goto out;
1465 * Verify requested access to file.
1467 if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
1468 goto out;
1471 mutex_enter(&dzp->z_lock);
1472 dzp->z_seq++;
1473 mutex_exit(&dzp->z_lock);
1476 * Truncate regular files if requested.
1478 if ((ZTOV(zp)->v_type == VREG) &&
1479 (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
1480 /* we can't hold any locks when calling zfs_freesp() */
1481 zfs_dirent_unlock(dl, 0);
1482 dl = NULL;
1483 error = zfs_freesp(zp, 0, 0, mode, TRUE);
1484 if (error == 0) {
1485 vnevent_create(ZTOV(zp), NULL);
1489 out:
1490 if (dl)
1491 zfs_dirent_unlock(dl, 0);
1493 if (error) {
1494 if (zp)
1495 VN_RELE(ZTOV(zp));
1496 } else {
1497 *vpp = ZTOV(zp);
1499 * If vnode is for a device return a specfs vnode instead.
1501 if (IS_DEVVP(*vpp)) {
1502 struct vnode *svp;
1504 svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
1505 VN_RELE(*vpp);
1506 if (svp == NULL) {
1507 error = ENOSYS;
1509 *vpp = svp;
1512 if (aclp)
1513 zfs_acl_free(aclp);
1515 ZFS_EXIT(zfsvfs);
1516 return (error);
1520 * Remove an entry from a directory.
1522 * IN: dvp - vnode of directory to remove entry from.
1523 * name - name of entry to remove.
1524 * cr - credentials of caller.
1525 * ct - caller context
1526 * flags - case flags
1528 * RETURN: 0 if success
1529 * error code if failure
1531 * Timestamps:
1532 * dvp - ctime|mtime
1533 * vp - ctime (if nlink > 0)
1535 /*ARGSUSED*/
1536 static int
1537 zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
1538 int flags)
1540 znode_t *zp, *dzp = VTOZ(dvp);
1541 znode_t *xzp = NULL;
1542 vnode_t *vp;
1543 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
1544 zilog_t *zilog;
1545 uint64_t acl_obj, xattr_obj;
1546 zfs_dirlock_t *dl;
1547 dmu_tx_t *tx;
1548 boolean_t may_delete_now, delete_now = FALSE;
1549 boolean_t unlinked, toobig = FALSE;
1550 uint64_t txtype;
1551 pathname_t *realnmp = NULL;
1552 pathname_t realnm;
1553 int error;
1554 int zflg = ZEXISTS;
1556 dprintf("zfs_remove called\n");
1558 ZFS_ENTER(zfsvfs);
1559 ZFS_VERIFY_ZP(dzp);
1560 zilog = zfsvfs->z_log;
1562 if (flags & FIGNORECASE) {
1563 zflg |= ZCILOOK;
1564 pn_alloc(&realnm);
1565 realnmp = &realnm;
1568 top:
1570 * Attempt to lock directory; fail if entry doesn't exist.
1572 if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1573 NULL, realnmp)) {
1574 if (realnmp)
1575 pn_free(realnmp);
1576 ZFS_EXIT(zfsvfs);
1577 return (error);
1580 vp = ZTOV(zp);
1582 if (error = zfs_zaccess_delete(dzp, zp, cr)) {
1583 goto out;
1587 * Need to use rmdir for removing directories.
1589 if (vp->v_type == VDIR) {
1590 error = EPERM;
1591 goto out;
1594 vnevent_remove(vp, dvp, name, ct);
1596 if (realnmp)
1597 dnlc_remove(dvp, realnmp->pn_buf);
1598 else
1599 dnlc_remove(dvp, name);
1601 may_delete_now = FALSE;
1604 * We may delete the znode now, or we may put it in the unlinked set;
1605 * it depends on whether we're the last link, and on whether there are
1606 * other holds on the vnode. So we dmu_tx_hold() the right things to
1607 * allow for either case.
1609 tx = dmu_tx_create(zfsvfs->z_os);
1610 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1611 dmu_tx_hold_bonus(tx, zp->z_id);
1612 if (may_delete_now) {
1613 toobig =
1614 zp->z_phys->zp_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT;
1615 /* if the file is too big, only hold_free a token amount */
1616 dmu_tx_hold_free(tx, zp->z_id, 0,
1617 (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
1620 /* are there any extended attributes? */
1621 if ((xattr_obj = zp->z_phys->zp_xattr) != 0) {
1622 /* XXX - do we need this if we are deleting? */
1623 dmu_tx_hold_bonus(tx, xattr_obj);
1626 /* are there any additional acls */
1627 if ((acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj) != 0 &&
1628 may_delete_now)
1629 dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
1631 /* charge as an update -- would be nice not to charge at all */
1632 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1634 error = dmu_tx_assign(tx, zfsvfs->z_assign);
1635 if (error) {
1636 zfs_dirent_unlock(dl, 0);
1637 VN_RELE(vp);
1638 if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
1639 dmu_tx_wait(tx);
1640 dmu_tx_abort(tx);
1641 goto top;
1643 if (realnmp)
1644 pn_free(realnmp);
1645 dmu_tx_abort(tx);
1646 ZFS_EXIT(zfsvfs);
1647 return (error);
1651 * Remove the directory entry.
1653 error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
1655 if (error) {
1656 dmu_tx_commit(tx);
1657 goto out;
1660 if (0 && unlinked) {
1661 KASSERT(0); /* NetBSD: must not happen now */
1662 VI_LOCK(vp);
1663 delete_now = may_delete_now && !toobig &&
1664 vp->v_count == 1 && !vn_has_cached_data(vp) &&
1665 zp->z_phys->zp_xattr == xattr_obj &&
1666 zp->z_phys->zp_acl.z_acl_extern_obj == acl_obj;
1667 VI_UNLOCK(vp);
1670 if (delete_now) {
1671 KASSERT(0); /* NetBSD: must not happen now */
1672 if (zp->z_phys->zp_xattr) {
1673 error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp);
1674 ASSERT3U(error, ==, 0);
1675 ASSERT3U(xzp->z_phys->zp_links, ==, 2);
1676 dmu_buf_will_dirty(xzp->z_dbuf, tx);
1677 mutex_enter(&xzp->z_lock);
1678 xzp->z_unlinked = 1;
1679 xzp->z_phys->zp_links = 0;
1680 mutex_exit(&xzp->z_lock);
1681 zfs_unlinked_add(xzp, tx);
1682 zp->z_phys->zp_xattr = 0; /* probably unnecessary */
1684 mutex_enter(&zp->z_lock);
1685 VI_LOCK(vp);
1686 vp->v_count--;
1687 ASSERT3U(vp->v_count, ==, 0);
1688 VI_UNLOCK(vp);
1689 mutex_exit(&zp->z_lock);
1690 zfs_znode_delete(zp, tx);
1691 } else if (unlinked) {
1692 zfs_unlinked_add(zp, tx);
1695 txtype = TX_REMOVE;
1696 if (flags & FIGNORECASE)
1697 txtype |= TX_CI;
1698 zfs_log_remove(zilog, tx, txtype, dzp, name);
1700 dmu_tx_commit(tx);
1701 out:
1702 if (realnmp)
1703 pn_free(realnmp);
1705 zfs_dirent_unlock(dl, 0);
1707 if (!delete_now) {
1708 VN_RELE(vp);
1709 } else if (xzp) {
1710 /* this rele is delayed to prevent nesting transactions */
1711 VN_RELE(ZTOV(xzp));
1714 ZFS_EXIT(zfsvfs);
1715 return (error);
1719 * Create a new directory and insert it into dvp using the name
1720 * provided. Return a pointer to the inserted directory.
1722 * IN: dvp - vnode of directory to add subdir to.
1723 * dirname - name of new directory.
1724 * vap - attributes of new directory.
1725 * cr - credentials of caller.
1726 * ct - caller context
1727 * vsecp - ACL to be set
1729 * OUT: vpp - vnode of created directory.
1731 * RETURN: 0 if success
1732 * error code if failure
1734 * Timestamps:
1735 * dvp - ctime|mtime updated
1736 * vp - ctime|mtime|atime updated
1738 /*ARGSUSED*/
1739 static int
1740 zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
1741 caller_context_t *ct, int flags, vsecattr_t *vsecp)
1743 znode_t *zp, *dzp = VTOZ(dvp);
1744 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
1745 zilog_t *zilog;
1746 zfs_dirlock_t *dl;
1747 uint64_t txtype;
1748 dmu_tx_t *tx;
1749 int error;
1750 zfs_acl_t *aclp = NULL;
1751 zfs_fuid_info_t *fuidp = NULL;
1752 int zf = ZNEW;
1754 ASSERT(vap->va_type == VDIR);
1757 * If we have an ephemeral id, ACL, or XVATTR then
1758 * make sure the file system is at the proper version
1761 if (zfsvfs->z_use_fuids == B_FALSE &&
1762 (vsecp || (vap->va_mask & AT_XVATTR) || IS_EPHEMERAL(crgetuid(cr))||
1763 IS_EPHEMERAL(crgetgid(cr))))
1764 return (EINVAL);
1766 ZFS_ENTER(zfsvfs);
1767 ZFS_VERIFY_ZP(dzp);
1768 zilog = zfsvfs->z_log;
1770 if (dzp->z_phys->zp_flags & ZFS_XATTR) {
1771 ZFS_EXIT(zfsvfs);
1772 return (EINVAL);
1775 if (zfsvfs->z_utf8 && u8_validate(dirname,
1776 strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1777 ZFS_EXIT(zfsvfs);
1778 return (EILSEQ);
1780 if (flags & FIGNORECASE)
1781 zf |= ZCILOOK;
1783 if (vap->va_mask & AT_XVATTR)
1784 if ((error = secpolicy_xvattr((xvattr_t *)vap,
1785 crgetuid(cr), cr, vap->va_type)) != 0) {
1786 ZFS_EXIT(zfsvfs);
1787 return (error);
1791 * First make sure the new directory doesn't exist.
1793 top:
1794 *vpp = NULL;
1796 if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
1797 NULL, NULL)) {
1798 ZFS_EXIT(zfsvfs);
1799 return (error);
1802 if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
1803 zfs_dirent_unlock(dl, 0);
1804 ZFS_EXIT(zfsvfs);
1805 return (error);
1808 if (vsecp && aclp == NULL) {
1809 error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, &aclp);
1810 if (error) {
1811 zfs_dirent_unlock(dl, 0);
1812 ZFS_EXIT(zfsvfs);
1813 return (error);
1817 * Add a new entry to the directory.
1819 tx = dmu_tx_create(zfsvfs->z_os);
1820 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
1821 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
1822 if ((aclp && aclp->z_has_fuids) || IS_EPHEMERAL(crgetuid(cr)) ||
1823 IS_EPHEMERAL(crgetgid(cr))) {
1824 if (zfsvfs->z_fuid_obj == 0) {
1825 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
1826 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
1827 FUID_SIZE_ESTIMATE(zfsvfs));
1828 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
1829 } else {
1830 dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
1831 dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
1832 FUID_SIZE_ESTIMATE(zfsvfs));
1835 if ((dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) || aclp)
1836 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1837 0, SPA_MAXBLOCKSIZE);
1838 error = dmu_tx_assign(tx, zfsvfs->z_assign);
1839 if (error) {
1840 zfs_dirent_unlock(dl, 0);
1841 if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
1842 dmu_tx_wait(tx);
1843 dmu_tx_abort(tx);
1844 goto top;
1846 dmu_tx_abort(tx);
1847 ZFS_EXIT(zfsvfs);
1848 if (aclp)
1849 zfs_acl_free(aclp);
1850 return (error);
1854 * Create new node.
1856 zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, aclp, &fuidp);
1858 if (aclp)
1859 zfs_acl_free(aclp);
1862 * Now put new name in parent dir.
1864 (void) zfs_link_create(dl, zp, tx, ZNEW);
1866 *vpp = ZTOV(zp);
1868 txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
1869 if (flags & FIGNORECASE)
1870 txtype |= TX_CI;
1871 zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, fuidp, vap);
1873 if (fuidp)
1874 zfs_fuid_info_free(fuidp);
1875 dmu_tx_commit(tx);
1877 zfs_dirent_unlock(dl, 0);
1879 ZFS_EXIT(zfsvfs);
1880 return (0);
1884 * Remove a directory subdir entry. If the current working
1885 * directory is the same as the subdir to be removed, the
1886 * remove will fail.
1888 * IN: dvp - vnode of directory to remove from.
1889 * name - name of directory to be removed.
1890 * cwd - vnode of current working directory.
1891 * cr - credentials of caller.
1892 * ct - caller context
1893 * flags - case flags
1895 * RETURN: 0 if success
1896 * error code if failure
1898 * Timestamps:
1899 * dvp - ctime|mtime updated
1901 /*ARGSUSED*/
1902 static int
1903 zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
1904 caller_context_t *ct, int flags)
1906 znode_t *dzp = VTOZ(dvp);
1907 znode_t *zp;
1908 vnode_t *vp;
1909 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
1910 zilog_t *zilog;
1911 zfs_dirlock_t *dl;
1912 dmu_tx_t *tx;
1913 int error;
1914 int zflg = ZEXISTS;
1916 ZFS_ENTER(zfsvfs);
1917 ZFS_VERIFY_ZP(dzp);
1918 zilog = zfsvfs->z_log;
1920 if (flags & FIGNORECASE)
1921 zflg |= ZCILOOK;
1922 top:
1923 zp = NULL;
1926 * Attempt to lock directory; fail if entry doesn't exist.
1928 if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1929 NULL, NULL)) {
1930 ZFS_EXIT(zfsvfs);
1931 return (error);
1934 vp = ZTOV(zp);
1936 if (error = zfs_zaccess_delete(dzp, zp, cr)) {
1937 goto out;
1940 if (vp->v_type != VDIR) {
1941 error = ENOTDIR;
1942 goto out;
1945 if (vp == cwd) {
1946 error = EINVAL;
1947 goto out;
1950 vnevent_rmdir(vp, dvp, name, ct);
1953 * Grab a lock on the parent pointer to make sure we play well
1954 * with the treewalk and directory rename code.
1956 rw_enter(&zp->z_parent_lock, RW_WRITER);
1958 tx = dmu_tx_create(zfsvfs->z_os);
1959 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1960 dmu_tx_hold_bonus(tx, zp->z_id);
1961 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1962 error = dmu_tx_assign(tx, zfsvfs->z_assign);
1963 if (error) {
1964 rw_exit(&zp->z_parent_lock);
1965 rw_exit(&zp->z_name_lock);
1966 zfs_dirent_unlock(dl, 0);
1967 VN_RELE(vp);
1968 if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
1969 dmu_tx_wait(tx);
1970 dmu_tx_abort(tx);
1971 goto top;
1973 dmu_tx_abort(tx);
1974 ZFS_EXIT(zfsvfs);
1975 return (error);
1978 /* Purge cache entries, while still holding locks. */
1979 cache_purge(dvp);
1980 cache_purge(vp);
1982 error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
1984 if (error == 0) {
1985 uint64_t txtype = TX_RMDIR;
1986 if (flags & FIGNORECASE)
1987 txtype |= TX_CI;
1988 zfs_log_remove(zilog, tx, txtype, dzp, name);
1991 dmu_tx_commit(tx);
1993 rw_exit(&zp->z_parent_lock);
1994 rw_exit(&zp->z_name_lock);
1995 out:
1996 zfs_dirent_unlock(dl, 0);
1998 VN_RELE(vp);
2000 ZFS_EXIT(zfsvfs);
2001 return (error);
2005 * Read as many directory entries as will fit into the provided
2006 * buffer from the given directory cursor position (specified in
2007 * the uio structure).
2009 * IN: vp - vnode of directory to read.
2010 * uio - structure supplying read location, range info,
2011 * and return buffer.
2012 * cr - credentials of caller.
2013 * ct - caller context
2014 * flags - case flags
2016 * OUT: uio - updated offset and range, buffer filled.
2017 * eofp - set to true if end-of-file detected.
2019 * RETURN: 0 if success
2020 * error code if failure
2022 * Timestamps:
2023 * vp - atime updated
2025 * Note that the low 4 bits of the cookie returned by zap are always zero.
2026 * This allows us to use the low range for "special" directory entries:
2027 * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem,
2028 * we use the offset 2 for the '.zfs' directory.
2030 /* ARGSUSED */
2031 static int
2032 zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies)
2034 znode_t *zp = VTOZ(vp);
2035 iovec_t *iovp;
2036 edirent_t *eodp;
2037 dirent64_t *odp;
2038 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2039 objset_t *os;
2040 caddr_t outbuf;
2041 size_t bufsize;
2042 zap_cursor_t zc;
2043 zap_attribute_t zap;
2044 uint_t bytes_wanted;
2045 uint64_t offset; /* must be unsigned; checks for < 1 */
2046 int local_eof;
2047 int outcount;
2048 int error;
2049 uint8_t prefetch;
2050 boolean_t check_sysattrs;
2051 uint8_t type;
2052 int ncooks;
2053 u_long *cooks = NULL;
2054 int flags = 0;
2056 dprintf("zfs_readdir called\n");
2058 ZFS_ENTER(zfsvfs);
2059 ZFS_VERIFY_ZP(zp);
2062 * If we are not given an eof variable,
2063 * use a local one.
2065 if (eofp == NULL)
2066 eofp = &local_eof;
2069 * Check for valid iov_len.
2071 if (uio->uio_iov->iov_len <= 0) {
2072 ZFS_EXIT(zfsvfs);
2073 return (EINVAL);
2077 * Quit if directory has been removed (posix)
2079 if ((*eofp = zp->z_unlinked) != 0) {
2080 ZFS_EXIT(zfsvfs);
2081 return (0);
2084 error = 0;
2085 os = zfsvfs->z_os;
2086 offset = uio->uio_loffset;
2087 prefetch = zp->z_zn_prefetch;
2090 * Initialize the iterator cursor.
2092 if (offset <= 3) {
2094 * Start iteration from the beginning of the directory.
2096 zap_cursor_init(&zc, os, zp->z_id);
2097 } else {
2099 * The offset is a serialized cursor.
2101 zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
2105 * Get space to change directory entries into fs independent format.
2107 iovp = uio->uio_iov;
2108 bytes_wanted = iovp->iov_len;
2109 if (!VMSPACE_IS_KERNEL_P(uio->uio_vmspace) || uio->uio_iovcnt != 1) {
2110 bufsize = bytes_wanted;
2111 outbuf = kmem_alloc(bufsize, KM_SLEEP);
2112 memset(outbuf, 0, bufsize);
2113 odp = (struct dirent64 *)outbuf;
2114 } else {
2115 bufsize = bytes_wanted;
2116 odp = (struct dirent64 *)iovp->iov_base;
2118 eodp = (struct edirent *)odp;
2120 if (ncookies != NULL) {
2122 * Minimum entry size is the dirent size plus 1 byte for a file name.
2124 ncooks = uio->uio_resid / _DIRENT_MINSIZE(odp);
2125 // sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
2126 cooks = kmem_alloc(ncooks * sizeof(u_long), KM_SLEEP);
2128 memset(cooks, 0, ncooks * sizeof(u_long));
2129 *cookies = cooks;
2130 *ncookies = ncooks;
2134 * If this VFS supports the system attribute view interface; and
2135 * we're looking at an extended attribute directory; and we care
2136 * about normalization conflicts on this vfs; then we must check
2137 * for normalization conflicts with the sysattr name space.
2139 #ifdef TODO
2140 check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
2141 (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
2142 (flags & V_RDDIR_ENTFLAGS);
2143 #else
2144 check_sysattrs = 0;
2145 #endif
2148 * Transform to file-system independent format
2150 outcount = 0;
2151 while (outcount < bytes_wanted) {
2152 ino64_t objnum;
2153 ushort_t reclen;
2154 off64_t *next;
2157 * Special case `.', `..', and `.zfs'.
2159 if (offset == 0) {
2160 (void) strcpy(zap.za_name, ".");
2161 zap.za_normalization_conflict = 0;
2162 objnum = zp->z_id;
2163 type = DT_DIR;
2164 } else if (offset == 1) {
2165 (void) strcpy(zap.za_name, "..");
2166 zap.za_normalization_conflict = 0;
2167 objnum = zp->z_phys->zp_parent;
2168 type = DT_DIR;
2169 } else if (offset == 2 && zfs_show_ctldir(zp)) {
2170 (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
2171 zap.za_normalization_conflict = 0;
2172 objnum = ZFSCTL_INO_ROOT;
2173 type = DT_DIR;
2174 } else {
2176 * Grab next entry.
2178 if (error = zap_cursor_retrieve(&zc, &zap)) {
2179 if ((*eofp = (error == ENOENT)) != 0)
2180 break;
2181 else
2182 goto update;
2185 if (zap.za_integer_length != 8 ||
2186 zap.za_num_integers != 1) {
2187 cmn_err(CE_WARN, "zap_readdir: bad directory "
2188 "entry, obj = %lld, offset = %lld\n",
2189 (u_longlong_t)zp->z_id,
2190 (u_longlong_t)offset);
2191 error = ENXIO;
2192 goto update;
2195 objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
2197 * Mac OS X can extract the object type here, e.g.:
2198 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2200 type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2202 if (check_sysattrs && !zap.za_normalization_conflict) {
2203 #ifdef TODO
2204 zap.za_normalization_conflict =
2205 xattr_sysattr_casechk(zap.za_name);
2206 #else
2207 panic("%s:%u: TODO", __func__, __LINE__);
2208 #endif
2212 if (flags & V_RDDIR_ENTFLAGS)
2213 reclen = EDIRENT_RECLEN(strlen(zap.za_name));
2214 else
2215 reclen = _DIRENT_RECLEN(odp, strlen(zap.za_name));
2218 * Will this entry fit in the buffer?
2220 if (outcount + reclen > bufsize) {
2222 * Did we manage to fit anything in the buffer?
2224 if (!outcount) {
2225 error = EINVAL;
2226 goto update;
2228 break;
2230 if (flags & V_RDDIR_ENTFLAGS) {
2232 * Add extended flag entry:
2234 eodp->ed_ino = objnum;
2235 eodp->ed_reclen = reclen;
2236 /* NOTE: ed_off is the offset for the *next* entry */
2237 next = &(eodp->ed_off);
2238 eodp->ed_eflags = zap.za_normalization_conflict ?
2239 ED_CASE_CONFLICT : 0;
2240 (void) strncpy(eodp->ed_name, zap.za_name,
2241 EDIRENT_NAMELEN(reclen));
2242 eodp = (edirent_t *)((intptr_t)eodp + reclen);
2243 } else {
2244 /*
2245 * Add normal entry:
2246 */
2247 odp->d_ino = objnum;
2248 odp->d_reclen = reclen;
2249 odp->d_namlen = strlen(zap.za_name);
2250 (void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
2251 odp->d_type = type;
2252 odp = (dirent64_t *)((intptr_t)odp + reclen);
2253 }
2254 outcount += reclen;
2256 KASSERT(outcount <= bufsize);
2258 /* Prefetch znode */
2259 if (prefetch)
2260 dmu_prefetch(os, objnum, 0, 0);
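/*
 * A zero-length dmu_prefetch() starts an asynchronous read of the
 * block holding this entry's dnode, so a later lookup of the name
 * should not have to block on metadata I/O.
 */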
2262 /*
2263 * Move to the next entry, fill in the previous offset.
2264 */
2265 if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
2266 zap_cursor_advance(&zc);
2267 offset = zap_cursor_serialize(&zc);
2268 } else {
2269 offset += 1;
2270 }
2272 if (cooks != NULL) {
2273 *cooks++ = offset;
2274 ncooks--;
2275 KASSERT(ncooks >= 0);
2276 }
2277 }
2278 zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
2280 /* Subtract unused cookies */
2281 if (ncookies != NULL)
2282 *ncookies -= ncooks;
2284 if (VMSPACE_IS_KERNEL_P(uio->uio_vmspace) && uio->uio_iovcnt == 1) {
2285 iovp->iov_base += outcount;
2286 iovp->iov_len -= outcount;
2287 uio->uio_resid -= outcount;
2288 } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
2289 /*
2290 * Reset the pointer.
2291 */
2292 offset = uio->uio_loffset;
2293 }
2295 update:
2296 zap_cursor_fini(&zc);
2297 if (!VMSPACE_IS_KERNEL_P(uio->uio_vmspace) || uio->uio_iovcnt != 1)
2298 kmem_free(outbuf, bufsize);
2300 if (error == ENOENT)
2301 error = 0;
2303 ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
2305 uio->uio_loffset = offset;
2306 ZFS_EXIT(zfsvfs);
2307 if (error != 0 && cookies != NULL) {
2308 kmem_free(*cookies, ncooks * sizeof(u_long));
2309 *cookies = NULL;
2310 *ncookies = 0;
2311 }
2312 return (error);
2313 }
2315 ulong_t zfs_fsync_sync_cnt = 4;
2317 static int
2318 zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
2319 {
2320 znode_t *zp = VTOZ(vp);
2321 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2322 int error;
2324 error = 0;
2326 dprintf("zfs_fsync called vp %p -- zfsvfs %p\n", vp, zfsvfs);
2327 (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
2329 ZFS_ENTER(zfsvfs);
2330 ZFS_VERIFY_ZP(zp);
2331 /*
2332 * NetBSD: if the sync is from reclaim or from ioflush,
2333 * push dirty atime now. No need to lock: in the reclaim
2334 * case, everything is single threaded and for ioflush this
2335 * is a lazy writeback.
2336 *
2337 * XXXNETBSD: in the ioflush case, we don't want to push anything
2338 * to disk immediately. We just want to queue the update so it
2339 * will happen "soon". Check that this is the case, otherwise
2340 * zfs will perform poorly.
2341 */
2342 if (zp->z_atime_dirty && zp->z_unlinked == 0 &&
2343 (syncflag & (FSYNC_RECLAIM | FSYNC_LAZY)) != 0) {
2344 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
2346 dmu_tx_hold_bonus(tx, zp->z_id);
2347 error = dmu_tx_assign(tx, TXG_WAIT);
2348 if (error) {
2349 dmu_tx_abort(tx);
2350 } else {
2351 dmu_buf_will_dirty(zp->z_dbuf, tx);
2352 mutex_enter(&zp->z_lock);
2353 zp->z_atime_dirty = 0;
2354 mutex_exit(&zp->z_lock);
2355 dmu_tx_commit(tx);
2356 }
2357 }
2358 zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
2359 ZFS_EXIT(zfsvfs);
2360 return (0);
2361 }
2364 /*
2365 * Get the requested file attributes and place them in the provided
2366 * vattr structure.
2368 * IN: vp - vnode of file.
2369 * vap - va_mask identifies requested attributes.
2370 * If AT_XVATTR set, then optional attrs are requested
2371 * flags - ATTR_NOACLCHECK (CIFS server context)
2372 * cr - credentials of caller.
2373 * ct - caller context
2375 * OUT: vap - attribute values.
2377 * RETURN: 0 (always succeeds)
2378 */
2379 /* ARGSUSED */
2380 static int
2381 zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2382 caller_context_t *ct)
2383 {
2384 znode_t *zp = VTOZ(vp);
2385 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2386 znode_phys_t *pzp;
2387 int error = 0;
2388 uint32_t blksize;
2389 u_longlong_t nblocks;
2390 uint64_t links;
2391 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
2392 xoptattr_t *xoap = NULL;
2393 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2395 dprintf("zfs_getattr called\n");
2397 ZFS_ENTER(zfsvfs);
2398 ZFS_VERIFY_ZP(zp);
2399 pzp = zp->z_phys;
2401 mutex_enter(&zp->z_lock);
2403 /*
2404 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
2405 * Also, if we are the owner don't bother, since owner should
2406 * always be allowed to read basic attributes of file.
2407 */
2408 if (!(pzp->zp_flags & ZFS_ACL_TRIVIAL) &&
2409 (pzp->zp_uid != crgetuid(cr))) {
2410 if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
2411 skipaclchk, cr)) {
2412 mutex_exit(&zp->z_lock);
2413 ZFS_EXIT(zfsvfs);
2414 return (error);
2415 }
2416 }
2418 /*
2419 * Return all attributes. It's cheaper to provide the answer
2420 * than to determine whether we were asked the question.
2421 */
2423 vap->va_type = IFTOVT(pzp->zp_mode);
2424 vap->va_mode = pzp->zp_mode & ~S_IFMT;
2425 zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
2426 vap->va_nodeid = zp->z_id;
2427 if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
2428 links = pzp->zp_links + 1;
2429 else
2430 links = pzp->zp_links;
2431 vap->va_nlink = MIN(links, UINT32_MAX); /* nlink_t limit! */
2432 vap->va_size = pzp->zp_size;
2433 vap->va_fsid = vp->v_mount->mnt_stat.f_fsidx.__fsid_val[0];
2434 // vap->va_fsid = 0;
2435 vap->va_rdev = zfs_cmpldev(pzp->zp_rdev);
2436 vap->va_seq = zp->z_seq;
2437 vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */
2439 /*
2440 * Add in any requested optional attributes and the create time.
2441 * Also set the corresponding bits in the returned attribute bitmap.
2442 */
2443 if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
2444 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
2445 xoap->xoa_archive =
2446 ((pzp->zp_flags & ZFS_ARCHIVE) != 0);
2447 XVA_SET_RTN(xvap, XAT_ARCHIVE);
2448 }
2450 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
2451 xoap->xoa_readonly =
2452 ((pzp->zp_flags & ZFS_READONLY) != 0);
2453 XVA_SET_RTN(xvap, XAT_READONLY);
2454 }
2456 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
2457 xoap->xoa_system =
2458 ((pzp->zp_flags & ZFS_SYSTEM) != 0);
2459 XVA_SET_RTN(xvap, XAT_SYSTEM);
2460 }
2462 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
2463 xoap->xoa_hidden =
2464 ((pzp->zp_flags & ZFS_HIDDEN) != 0);
2465 XVA_SET_RTN(xvap, XAT_HIDDEN);
2466 }
2468 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2469 xoap->xoa_nounlink =
2470 ((pzp->zp_flags & ZFS_NOUNLINK) != 0);
2471 XVA_SET_RTN(xvap, XAT_NOUNLINK);
2472 }
2474 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2475 xoap->xoa_immutable =
2476 ((pzp->zp_flags & ZFS_IMMUTABLE) != 0);
2477 XVA_SET_RTN(xvap, XAT_IMMUTABLE);
2478 }
2480 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2481 xoap->xoa_appendonly =
2482 ((pzp->zp_flags & ZFS_APPENDONLY) != 0);
2483 XVA_SET_RTN(xvap, XAT_APPENDONLY);
2484 }
2486 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2487 xoap->xoa_nodump =
2488 ((pzp->zp_flags & ZFS_NODUMP) != 0);
2489 XVA_SET_RTN(xvap, XAT_NODUMP);
2490 }
2492 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
2493 xoap->xoa_opaque =
2494 ((pzp->zp_flags & ZFS_OPAQUE) != 0);
2495 XVA_SET_RTN(xvap, XAT_OPAQUE);
2496 }
2498 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2499 xoap->xoa_av_quarantined =
2500 ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0);
2501 XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
2502 }
2504 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2505 xoap->xoa_av_modified =
2506 ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0);
2507 XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
2508 }
2510 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
2511 vp->v_type == VREG &&
2512 (pzp->zp_flags & ZFS_BONUS_SCANSTAMP)) {
2513 size_t len;
2514 dmu_object_info_t doi;
2516 /*
2517 * Only VREG files have anti-virus scanstamps, so we
2518 * won't conflict with symlinks in the bonus buffer.
2519 */
2520 dmu_object_info_from_db(zp->z_dbuf, &doi);
2521 len = sizeof (xoap->xoa_av_scanstamp) +
2522 sizeof (znode_phys_t);
2523 if (len <= doi.doi_bonus_size) {
2524 /*
2525 * pzp points to the start of the
2526 * znode_phys_t. pzp + 1 points to the
2527 * first byte after the znode_phys_t.
2528 */
2529 (void) memcpy(xoap->xoa_av_scanstamp,
2530 pzp + 1,
2531 sizeof (xoap->xoa_av_scanstamp));
2532 XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
2533 }
2534 }
2536 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
2537 ZFS_TIME_DECODE(&xoap->xoa_createtime, pzp->zp_crtime);
2538 XVA_SET_RTN(xvap, XAT_CREATETIME);
2539 }
2540 }
2542 ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime);
2543 ZFS_TIME_DECODE(&vap->va_mtime, pzp->zp_mtime);
2544 ZFS_TIME_DECODE(&vap->va_ctime, pzp->zp_ctime);
2545 ZFS_TIME_DECODE(&vap->va_birthtime, pzp->zp_crtime);
2547 mutex_exit(&zp->z_lock);
2549 dmu_object_size_from_db(zp->z_dbuf, &blksize, &nblocks);
2550 vap->va_blksize = blksize;
2551 vap->va_bytes = nblocks << 9; /* nblocks * 512 */
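/*
 * va_bytes counts 512-byte blocks actually allocated on disk; for
 * sparse or compressed files this can be well below va_size.
 */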
2553 if (zp->z_blksz == 0) {
2554 /*
2555 * Block size hasn't been set; suggest maximal I/O transfers.
2556 */
2557 vap->va_blksize = zfsvfs->z_max_blksz;
2558 }
2560 ZFS_EXIT(zfsvfs);
2561 return (0);
2562 }
2564 /*
2565 * Set the file attributes to the values contained in the
2566 * vattr structure.
2568 * IN: vp - vnode of file to be modified.
2569 * vap - new attribute values.
2570 * If AT_XVATTR set, then optional attrs are being set
2571 * flags - ATTR_UTIME set if non-default time values provided.
2572 * - ATTR_NOACLCHECK (CIFS context only).
2573 * cr - credentials of caller.
2574 * ct - caller context
2576 * RETURN: 0 if success
2577 * error code if failure
2579 * Timestamps:
2580 * vp - ctime updated, mtime updated if size changed.
2581 */
2582 /* ARGSUSED */
2583 static int
2584 zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2585 caller_context_t *ct)
2586 {
2587 znode_t *zp = VTOZ(vp);
2588 znode_phys_t *pzp;
2589 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2590 zilog_t *zilog;
2591 dmu_tx_t *tx;
2592 vattr_t oldva;
2593 uint_t mask = vap->va_mask;
2594 uint_t saved_mask;
2595 int trim_mask = 0;
2596 uint64_t new_mode;
2597 znode_t *attrzp;
2598 int need_policy = FALSE;
2599 int err;
2600 zfs_fuid_info_t *fuidp = NULL;
2601 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
2602 xoptattr_t *xoap;
2603 zfs_acl_t *aclp = NULL;
2604 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2606 dprintf("zfs_setattr called\n");
2608 if (mask == 0)
2609 return (0);
2611 if (mask & AT_NOSET)
2612 return (EINVAL);
2614 ZFS_ENTER(zfsvfs);
2615 ZFS_VERIFY_ZP(zp);
2617 pzp = zp->z_phys;
2618 zilog = zfsvfs->z_log;
2620 /*
2621 * Make sure that if we have ephemeral uid/gid or xvattr specified
2622 * that the file system is at the proper version level
2623 */
2625 if (zfsvfs->z_use_fuids == B_FALSE &&
2626 (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
2627 ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
2628 (mask & AT_XVATTR))) {
2629 ZFS_EXIT(zfsvfs);
2630 return (EINVAL);
2631 }
2633 if (mask & AT_SIZE && vp->v_type == VDIR) {
2634 ZFS_EXIT(zfsvfs);
2635 return (EISDIR);
2636 }
2638 if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
2639 ZFS_EXIT(zfsvfs);
2640 return (EINVAL);
2641 }
2643 /*
2644 * If this is an xvattr_t, then get a pointer to the structure of
2645 * optional attributes. If this is NULL, then we have a vattr_t.
2646 */
2647 xoap = xva_getxoptattr(xvap);
2649 /*
2650 * Immutable files can only alter immutable bit and atime
2651 */
2652 if ((pzp->zp_flags & ZFS_IMMUTABLE) &&
2653 ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
2654 ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
2655 ZFS_EXIT(zfsvfs);
2656 return (EPERM);
2657 }
2659 if ((mask & AT_SIZE) && (pzp->zp_flags & ZFS_READONLY)) {
2660 ZFS_EXIT(zfsvfs);
2661 return (EPERM);
2662 }
2664 /*
2665 * Verify timestamps don't overflow 32 bits.
2666 * ZFS can handle large timestamps, but 32bit syscalls can't
2667 * handle times greater than 2039. This check should be removed
2668 * once large timestamps are fully supported.
2669 */
2670 if (mask & (AT_ATIME | AT_MTIME)) {
2671 if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2672 ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2673 ZFS_EXIT(zfsvfs);
2674 return (EOVERFLOW);
2675 }
2676 }
2678 top:
2679 attrzp = NULL;
2681 if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
2682 ZFS_EXIT(zfsvfs);
2683 return (EROFS);
2684 }
2686 /*
2687 * First validate permissions
2688 */
2689 if (mask & AT_SIZE) {
2690 err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr);
2691 if (err) {
2692 ZFS_EXIT(zfsvfs);
2693 return (err);
2694 }
2695 /*
2696 * XXX - Note, we are not providing any open
2697 * mode flags here (like FNDELAY), so we may
2698 * block if there are locks present... this
2699 * should be addressed in openat().
2700 */
2701 /* XXX - would it be OK to generate a log record here? */
2702 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
2703 if (err) {
2704 ZFS_EXIT(zfsvfs);
2705 return (err);
2706 }
2707 }
2709 if (mask & (AT_ATIME|AT_MTIME) ||
2710 ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
2711 XVA_ISSET_REQ(xvap, XAT_READONLY) ||
2712 XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
2713 XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
2714 XVA_ISSET_REQ(xvap, XAT_SYSTEM))))
2715 need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
2716 skipaclchk, cr);
2718 if (mask & (AT_UID|AT_GID)) {
2719 int idmask = (mask & (AT_UID|AT_GID));
2720 int take_owner;
2721 int take_group;
2723 /*
2724 * NOTE: even if a new mode is being set,
2725 * we may clear S_ISUID/S_ISGID bits.
2726 */
2728 if (!(mask & AT_MODE))
2729 vap->va_mode = pzp->zp_mode;
2731 /*
2732 * Take ownership or chgrp to group we are a member of
2733 */
2735 take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
2736 take_group = (mask & AT_GID) &&
2737 zfs_groupmember(zfsvfs, vap->va_gid, cr);
2739 /*
2740 * If both AT_UID and AT_GID are set then take_owner and
2741 * take_group must both be set in order to allow taking
2742 * ownership.
2744 * Otherwise, send the check through secpolicy_vnode_setattr()
2745 */
2748 if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
2749 ((idmask == AT_UID) && take_owner) ||
2750 ((idmask == AT_GID) && take_group)) {
2751 if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
2752 skipaclchk, cr) == 0) {
2753 /*
2754 * Remove setuid/setgid for non-privileged users
2755 */
2756 secpolicy_setid_clear(vap, cr);
2757 trim_mask = (mask & (AT_UID|AT_GID));
2758 } else {
2759 need_policy = TRUE;
2760 }
2761 } else {
2762 need_policy = TRUE;
2763 }
2764 }
2766 mutex_enter(&zp->z_lock);
2767 oldva.va_mode = pzp->zp_mode;
2768 zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
2769 if (mask & AT_XVATTR) {
2770 if ((need_policy == FALSE) &&
2771 (XVA_ISSET_REQ(xvap, XAT_APPENDONLY) &&
2772 xoap->xoa_appendonly !=
2773 ((pzp->zp_flags & ZFS_APPENDONLY) != 0)) ||
2774 (XVA_ISSET_REQ(xvap, XAT_NOUNLINK) &&
2775 xoap->xoa_nounlink !=
2776 ((pzp->zp_flags & ZFS_NOUNLINK) != 0)) ||
2777 (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE) &&
2778 xoap->xoa_immutable !=
2779 ((pzp->zp_flags & ZFS_IMMUTABLE) != 0)) ||
2780 (XVA_ISSET_REQ(xvap, XAT_NODUMP) &&
2781 xoap->xoa_nodump !=
2782 ((pzp->zp_flags & ZFS_NODUMP) != 0)) ||
2783 (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED) &&
2784 xoap->xoa_av_modified !=
2785 ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0)) ||
2786 ((XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED) &&
2787 ((vp->v_type != VREG && xoap->xoa_av_quarantined) ||
2788 xoap->xoa_av_quarantined !=
2789 ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0)))) ||
2790 (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ||
2791 (XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
2792 need_policy = TRUE;
2793 }
2794 }
2795 mutex_exit(&zp->z_lock);
2797 if (mask & AT_MODE) {
2798 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
2799 err = secpolicy_setid_setsticky_clear(vp, vap,
2800 &oldva, cr);
2801 if (err) {
2802 ZFS_EXIT(zfsvfs);
2803 return (err);
2804 }
2805 trim_mask |= AT_MODE;
2806 } else {
2807 need_policy = TRUE;
2808 }
2809 }
2811 if (need_policy) {
2812 /*
2813 * If trim_mask is set then take ownership
2814 * has been granted or write_acl is present and user
2815 * has the ability to modify mode. In that case remove
2816 * UID|GID and or MODE from mask so that
2817 * secpolicy_vnode_setattr() doesn't revoke it.
2818 */
2820 if (trim_mask) {
2821 saved_mask = vap->va_mask;
2822 vap->va_mask &= ~trim_mask;
2823 }
2824 err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
2825 (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
2826 if (err) {
2827 ZFS_EXIT(zfsvfs);
2828 return (err);
2829 }
2831 if (trim_mask)
2832 vap->va_mask |= saved_mask;
2833 }
2834 /*
2835 * secpolicy_vnode_setattr, or take ownership may have
2836 * changed va_mask
2837 */
2838 mask = vap->va_mask;
2840 tx = dmu_tx_create(zfsvfs->z_os);
2841 dmu_tx_hold_bonus(tx, zp->z_id);
2842 if (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
2843 ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid))) {
2844 if (zfsvfs->z_fuid_obj == 0) {
2845 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
2846 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
2847 FUID_SIZE_ESTIMATE(zfsvfs));
2848 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
2849 } else {
2850 dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
2851 dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
2852 FUID_SIZE_ESTIMATE(zfsvfs));
2853 }
2854 }
2856 if (mask & AT_MODE) {
2857 uint64_t pmode = pzp->zp_mode;
2859 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
2861 if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) {
2862 dmu_tx_abort(tx);
2863 ZFS_EXIT(zfsvfs);
2864 return (err);
2865 }
2866 if (pzp->zp_acl.z_acl_extern_obj) {
2867 /* Are we upgrading ACL from old V0 format to new V1 */
2868 if (zfsvfs->z_version <= ZPL_VERSION_FUID &&
2869 pzp->zp_acl.z_acl_version ==
2870 ZFS_ACL_VERSION_INITIAL) {
2871 dmu_tx_hold_free(tx,
2872 pzp->zp_acl.z_acl_extern_obj, 0,
2873 DMU_OBJECT_END);
2874 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2875 0, aclp->z_acl_bytes);
2876 } else {
2877 dmu_tx_hold_write(tx,
2878 pzp->zp_acl.z_acl_extern_obj, 0,
2879 aclp->z_acl_bytes);
2880 }
2881 } else if (aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2882 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2883 0, aclp->z_acl_bytes);
2884 }
2885 }
2887 if ((mask & (AT_UID | AT_GID)) && pzp->zp_xattr != 0) {
2888 err = zfs_zget(zp->z_zfsvfs, pzp->zp_xattr, &attrzp);
2889 if (err) {
2890 dmu_tx_abort(tx);
2891 ZFS_EXIT(zfsvfs);
2892 if (aclp)
2893 zfs_acl_free(aclp);
2894 return (err);
2895 }
2896 dmu_tx_hold_bonus(tx, attrzp->z_id);
2897 }
2899 err = dmu_tx_assign(tx, zfsvfs->z_assign);
2900 if (err) {
2901 if (attrzp)
2902 VN_RELE(ZTOV(attrzp));
2904 if (aclp) {
2905 zfs_acl_free(aclp);
2906 aclp = NULL;
2907 }
2909 if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
2910 dmu_tx_wait(tx);
2911 dmu_tx_abort(tx);
2912 goto top;
2913 }
2914 dmu_tx_abort(tx);
2915 ZFS_EXIT(zfsvfs);
2916 return (err);
2917 }
2919 dmu_buf_will_dirty(zp->z_dbuf, tx);
2921 /*
2922 * Set each attribute requested.
2923 * We group settings according to the locks they need to acquire.
2925 * Note: you cannot set ctime directly, although it will be
2926 * updated as a side-effect of calling this function.
2927 */
2929 mutex_enter(&zp->z_lock);
2931 if (mask & AT_MODE) {
2932 mutex_enter(&zp->z_acl_lock);
2933 zp->z_phys->zp_mode = new_mode;
2934 err = zfs_aclset_common(zp, aclp, cr, &fuidp, tx);
2935 ASSERT3U(err, ==, 0);
2936 mutex_exit(&zp->z_acl_lock);
2937 }
2939 if (attrzp)
2940 mutex_enter(&attrzp->z_lock);
2942 if (mask & AT_UID) {
2943 pzp->zp_uid = zfs_fuid_create(zfsvfs,
2944 vap->va_uid, cr, ZFS_OWNER, tx, &fuidp);
2945 if (attrzp) {
2946 attrzp->z_phys->zp_uid = zfs_fuid_create(zfsvfs,
2947 vap->va_uid, cr, ZFS_OWNER, tx, &fuidp);
2948 }
2949 }
2951 if (mask & AT_GID) {
2952 pzp->zp_gid = zfs_fuid_create(zfsvfs, vap->va_gid,
2953 cr, ZFS_GROUP, tx, &fuidp);
2954 if (attrzp)
2955 attrzp->z_phys->zp_gid = zfs_fuid_create(zfsvfs,
2956 vap->va_gid, cr, ZFS_GROUP, tx, &fuidp);
2957 }
2959 if (aclp)
2960 zfs_acl_free(aclp);
2962 if (attrzp)
2963 mutex_exit(&attrzp->z_lock);
2965 if (mask & AT_ATIME)
2966 ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
2968 if (mask & AT_MTIME)
2969 ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
2971 /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
2972 if (mask & AT_SIZE)
2973 zfs_time_stamper_locked(zp, CONTENT_MODIFIED, tx);
2974 else if (mask != 0)
2975 zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
2976 /*
2977 * Do this after setting timestamps to prevent timestamp
2978 * update from toggling bit
2979 */
2981 if (xoap && (mask & AT_XVATTR)) {
2982 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
2983 size_t len;
2984 dmu_object_info_t doi;
2986 ASSERT(vp->v_type == VREG);
2988 /* Grow the bonus buffer if necessary. */
2989 dmu_object_info_from_db(zp->z_dbuf, &doi);
2990 len = sizeof (xoap->xoa_av_scanstamp) +
2991 sizeof (znode_phys_t);
2992 if (len > doi.doi_bonus_size)
2993 VERIFY(dmu_set_bonus(zp->z_dbuf, len, tx) == 0);
2994 }
2995 zfs_xvattr_set(zp, xvap);
2996 }
2998 if (mask != 0)
2999 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
3001 if (fuidp)
3002 zfs_fuid_info_free(fuidp);
3003 mutex_exit(&zp->z_lock);
3005 if (attrzp)
3006 VN_RELE(ZTOV(attrzp));
3008 dmu_tx_commit(tx);
3010 ZFS_EXIT(zfsvfs);
3011 return (err);
3012 }
3014 typedef struct zfs_zlock {
3015 krwlock_t *zl_rwlock; /* lock we acquired */
3016 znode_t *zl_znode; /* znode we held */
3017 struct zfs_zlock *zl_next; /* next in list */
3018 } zfs_zlock_t;
3020 /*
3021 * Drop locks and release vnodes that were held by zfs_rename_lock().
3022 */
3023 static void
3024 zfs_rename_unlock(zfs_zlock_t **zlpp)
3025 {
3026 zfs_zlock_t *zl;
3028 while ((zl = *zlpp) != NULL) {
3029 if (zl->zl_znode != NULL)
3030 VN_RELE(ZTOV(zl->zl_znode));
3031 rw_exit(zl->zl_rwlock);
3032 *zlpp = zl->zl_next;
3033 kmem_free(zl, sizeof (*zl));
3034 }
3035 }
3037 /*
3038 * Search back through the directory tree, using the ".." entries.
3039 * Lock each directory in the chain to prevent concurrent renames.
3040 * Fail any attempt to move a directory into one of its own descendants.
3041 * XXX - z_parent_lock can overlap with map or grow locks
3042 */
3043 static int
3044 zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
3045 {
3046 zfs_zlock_t *zl;
3047 znode_t *zp = tdzp;
3048 uint64_t rootid = zp->z_zfsvfs->z_root;
3049 uint64_t *oidp = &zp->z_id;
3050 krwlock_t *rwlp = &szp->z_parent_lock;
3051 krw_t rw = RW_WRITER;
3053 /*
3054 * First pass write-locks szp and compares to zp->z_id.
3055 * Later passes read-lock zp and compare to zp->z_parent.
3056 */
3057 do {
3058 if (!rw_tryenter(rwlp, rw)) {
3059 /*
3060 * Another thread is renaming in this path.
3061 * Note that if we are a WRITER, we don't have any
3062 * parent_locks held yet.
3063 */
3064 if (rw == RW_READER && zp->z_id > szp->z_id) {
3065 /*
3066 * Drop our locks and restart
3067 */
3068 zfs_rename_unlock(&zl);
3069 *zlpp = NULL;
3070 zp = tdzp;
3071 oidp = &zp->z_id;
3072 rwlp = &szp->z_parent_lock;
3073 rw = RW_WRITER;
3074 continue;
3075 } else {
3076 /*
3077 * Wait for other thread to drop its locks
3078 */
3079 rw_enter(rwlp, rw);
3080 }
3081 }
3083 zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
3084 zl->zl_rwlock = rwlp;
3085 zl->zl_znode = NULL;
3086 zl->zl_next = *zlpp;
3087 *zlpp = zl;
3089 if (*oidp == szp->z_id) /* We're a descendant of szp */
3090 return (EINVAL);
3092 if (*oidp == rootid) /* We've hit the top */
3093 return (0);
3095 if (rw == RW_READER) { /* i.e. not the first pass */
3096 int error = zfs_zget(zp->z_zfsvfs, *oidp, &zp);
3097 if (error)
3098 return (error);
3099 zl->zl_znode = zp;
3100 }
3101 oidp = &zp->z_phys->zp_parent;
3102 rwlp = &zp->z_parent_lock;
3103 rw = RW_READER;
3105 } while (zp->z_id != sdzp->z_id);
3107 return (0);
3108 }
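/*
 * Example: for the /usr/a/b -> /usr/a/b/c/d move rejected above, tdzp
 * is c and the ".." walk reaches b (szp) before the root or sdzp, so
 * the descendant check returns EINVAL and the cycle is refused.
 */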
3110 /*
3111 * Move an entry from the provided source directory to the target
3112 * directory. Change the entry name as indicated.
3114 * IN: sdvp - Source directory containing the "old entry".
3115 * snm - Old entry name.
3116 * tdvp - Target directory to contain the "new entry".
3117 * tnm - New entry name.
3118 * cr - credentials of caller.
3119 * ct - caller context
3120 * flags - case flags
3122 * RETURN: 0 if success
3123 * error code if failure
3125 * Timestamps:
3126 * sdvp,tdvp - ctime|mtime updated
3127 */
3128 /* XXX NetBSD: There is a significant problem with dirent locking during a
3129 * rename of files that are in the same dir, where zfs_dirent_lock is called
3130 * twice on the same lock, which panics a LOCKDEBUG kernel. Locking twice is
3131 * not needed. The proper solution is to add a new flag to zfs_dirent_lock
3132 * that disables the rw_enter in it. Renaming of files in the same dir is
3133 * considered broken on LOCKDEBUG kernels on NetBSD for now.
3134 */
3135 /*ARGSUSED*/
3136 static int
3137 zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr,
3138 caller_context_t *ct, int flags)
3139 {
3140 znode_t *tdzp, *szp, *tzp;
3141 znode_t *sdzp = VTOZ(sdvp);
3142 zfsvfs_t *zfsvfs = sdzp->z_zfsvfs;
3143 zilog_t *zilog;
3144 vnode_t *realvp;
3145 zfs_dirlock_t *sdl, *tdl;
3146 dmu_tx_t *tx;
3147 zfs_zlock_t *zl;
3148 int cmp, serr, terr;
3149 int error = 0;
3150 int zflg = 0;
3151 int samedir = 0;
3153 tdl = NULL;
3154 sdl = NULL;
3156 dprintf("zfs_rename called\n");
3158 ZFS_ENTER(zfsvfs);
3159 ZFS_VERIFY_ZP(sdzp);
3160 zilog = zfsvfs->z_log;
3162 /*
3163 * Make sure we have the real vp for the target directory.
3164 */
3165 if (VOP_REALVP(tdvp, &realvp, ct) == 0)
3166 tdvp = realvp;
3168 if (tdvp->v_vfsp != sdvp->v_vfsp) {
3169 ZFS_EXIT(zfsvfs);
3170 return (EXDEV);
3171 }
3173 tdzp = VTOZ(tdvp);
3174 ZFS_VERIFY_ZP(tdzp);
3175 if (zfsvfs->z_utf8 && u8_validate(tnm,
3176 strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3177 ZFS_EXIT(zfsvfs);
3178 return (EILSEQ);
3179 }
3181 if (flags & FIGNORECASE)
3182 zflg |= ZCILOOK;
3184 top:
3185 szp = NULL;
3186 tzp = NULL;
3187 zl = NULL;
3189 /*
3190 * This is to prevent the creation of links into attribute space
3191 * by renaming a linked file into/out of an attribute directory.
3192 * See the comment in zfs_link() for why this is considered bad.
3193 */
3194 if ((tdzp->z_phys->zp_flags & ZFS_XATTR) !=
3195 (sdzp->z_phys->zp_flags & ZFS_XATTR)) {
3196 ZFS_EXIT(zfsvfs);
3197 return (EINVAL);
3198 }
3200 /*
3201 * Lock source and target directory entries. To prevent deadlock,
3202 * a lock ordering must be defined. We lock the directory with
3203 * the smallest object id first, or if it's a tie, the one with
3204 * the lexically first name.
3205 */
3206 if (sdzp->z_id < tdzp->z_id) {
3207 cmp = -1;
3208 } else if (sdzp->z_id > tdzp->z_id) {
3209 cmp = 1;
3210 } else {
3211 /*
3212 * First compare the two name arguments without
3213 * considering any case folding.
3214 */
3215 int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);
3217 cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
3218 ASSERT(error == 0 || !zfsvfs->z_utf8);
3219 if (cmp == 0) {
3220 /*
3221 * POSIX: "If the old argument and the new argument
3222 * both refer to links to the same existing file,
3223 * the rename() function shall return successfully
3224 * and perform no other action."
3225 */
3226 ZFS_EXIT(zfsvfs);
3227 return (0);
3228 }
3229 /*
3230 * If the file system is case-folding, then we may
3231 * have some more checking to do. A case-folding file
3232 * system is either supporting mixed case sensitivity
3233 * access or is completely case-insensitive. Note
3234 * that the file system is always case preserving.
3236 * In mixed sensitivity mode case sensitive behavior
3237 * is the default. FIGNORECASE must be used to
3238 * explicitly request case insensitive behavior.
3240 * If the source and target names provided differ only
3241 * by case (e.g., a request to rename 'tim' to 'Tim'),
3242 * we will treat this as a special case in the
3243 * case-insensitive mode: as long as the source name
3244 * is an exact match, we will allow this to proceed as
3245 * a name-change request.
3246 */
3247 if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
3248 (zfsvfs->z_case == ZFS_CASE_MIXED &&
3249 flags & FIGNORECASE)) &&
3250 u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
3251 &error) == 0) {
3252 /*
3253 * case preserving rename request, require exact
3254 * name matches
3255 */
3256 zflg |= ZCIEXACT;
3257 zflg &= ~ZCILOOK;
3258 }
3259 }
3261 if (cmp < 0) {
3263 serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
3264 ZEXISTS | zflg, NULL, NULL);
3265 if ((serr == 0) && (sdzp == tdzp)) {
3266 /*
3267 * If renaming within the one directory we must
3268 * be careful not to recursively acquire locks.
3269 */
3270 zflg |= ZSAMEDIR;
3271 }
3272 terr = zfs_dirent_lock(&tdl,
3273 tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
3274 } else {
3275 terr = zfs_dirent_lock(&tdl,
3276 tdzp, tnm, &tzp, zflg, NULL, NULL);
3278 if ((terr == 0) && (sdzp == tdzp)) {
3279 /*
3280 * If renaming within the one directory we must
3281 * be careful not to recursively acquire locks.
3282 */
3283 zflg |= ZSAMEDIR;
3284 }
3285 serr = zfs_dirent_lock(&sdl,
3286 sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
3287 NULL, NULL);
3288 }
3290 if (serr) {
3291 /*
3292 * Source entry invalid or not there.
3293 */
3294 if (!terr) {
3295 zfs_dirent_unlock(tdl, 0);
3296 if (tzp)
3297 VN_RELE(ZTOV(tzp));
3298 }
3299 if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0)
3300 serr = EINVAL;
3301 ZFS_EXIT(zfsvfs);
3302 return (serr);
3303 }
3304 if (terr) {
3305 if (sdl != NULL)
3306 zfs_dirent_unlock(sdl, 0);
3307 VN_RELE(ZTOV(szp));
3308 if (strcmp(tnm, "..") == 0)
3309 terr = EINVAL;
3310 ZFS_EXIT(zfsvfs);
3311 return (terr);
3312 }
3314 /*
3315 * Must have write access at the source to remove the old entry
3316 * and write access at the target to create the new entry.
3317 * Note that if target and source are the same, this can be
3318 * done in a single check.
3319 */
3321 if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
3322 goto out;
3324 if (ZTOV(szp)->v_type == VDIR) {
3325 /*
3326 * Check to make sure rename is valid.
3327 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
3328 */
3329 if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
3330 goto out;
3331 }
3333 /*
3334 * Does target exist?
3335 */
3336 if (tzp) {
3337 /*
3338 * Source and target must be the same type.
3339 */
3340 if (ZTOV(szp)->v_type == VDIR) {
3341 if (ZTOV(tzp)->v_type != VDIR) {
3342 error = ENOTDIR;
3343 goto out;
3344 }
3345 } else {
3346 if (ZTOV(tzp)->v_type == VDIR) {
3347 error = EISDIR;
3348 goto out;
3349 }
3350 }
3351 /*
3352 * POSIX dictates that when the source and target
3353 * entries refer to the same file object, rename
3354 * must do nothing and exit without error.
3355 */
3356 if (szp->z_id == tzp->z_id) {
3357 error = 0;
3358 goto out;
3359 }
3360 }
3362 vnevent_rename_src(ZTOV(szp), sdvp, snm, ct);
3363 if (tzp)
3364 vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct);
3366 /*
3367 * notify the target directory if it is not the same
3368 * as source directory.
3369 */
3370 if (tdvp != sdvp) {
3371 vnevent_rename_dest_dir(tdvp, ct);
3372 }
3374 tx = dmu_tx_create(zfsvfs->z_os);
3375 dmu_tx_hold_bonus(tx, szp->z_id); /* nlink changes */
3376 dmu_tx_hold_bonus(tx, sdzp->z_id); /* nlink changes */
3377 dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
3378 dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
3379 if (sdzp != tdzp)
3380 dmu_tx_hold_bonus(tx, tdzp->z_id); /* nlink changes */
3381 if (tzp)
3382 dmu_tx_hold_bonus(tx, tzp->z_id); /* parent changes */
3383 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
3384 error = dmu_tx_assign(tx, zfsvfs->z_assign);
3385 if (error) {
3386 if (zl != NULL)
3387 zfs_rename_unlock(&zl);
3388 zfs_dirent_unlock(sdl, zflg);
3389 zfs_dirent_unlock(tdl, 0);
3390 VN_RELE(ZTOV(szp));
3391 if (tzp)
3392 VN_RELE(ZTOV(tzp));
3393 if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
3394 dmu_tx_wait(tx);
3395 dmu_tx_abort(tx);
3396 goto top;
3397 }
3398 dmu_tx_abort(tx);
3399 ZFS_EXIT(zfsvfs);
3400 return (error);
3401 }
3403 if (tzp) /* Attempt to remove the existing target */
3404 error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);
3406 if (error == 0) {
3407 error = zfs_link_create(tdl, szp, tx, ZRENAMING);
3408 if (error == 0) {
3409 szp->z_phys->zp_flags |= ZFS_AV_MODIFIED;
3411 error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
3412 ASSERT(error == 0);
3414 zfs_log_rename(zilog, tx,
3415 TX_RENAME | (flags & FIGNORECASE ? TX_CI : 0),
3416 sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp);
3418 /* Update path information for the target vnode */
3419 vn_renamepath(tdvp, ZTOV(szp), tnm, strlen(tnm));
3420 }
3421 if (error == 0) {
3422 /* Purge cache entries, while still holding locks. */
3423 cache_purge(sdvp);
3424 cache_purge(tdvp);
3425 }
3426 }
3428 dmu_tx_commit(tx);
3429 out:
3430 if (zl != NULL)
3431 zfs_rename_unlock(&zl);
3433 zfs_dirent_unlock(sdl, zflg);
3434 zfs_dirent_unlock(tdl, 0);
3436 VN_RELE(ZTOV(szp));
3437 if (tzp)
3438 VN_RELE(ZTOV(tzp));
3440 ZFS_EXIT(zfsvfs);
3442 return (error);
3443 }
3445 /*
3446 * Insert the indicated symbolic reference entry into the directory.
3448 * IN: dvp - Directory to contain new symbolic link.
3449 * link - Name for new symlink entry.
3450 * vap - Attributes of new entry.
3451 * target - Target path of new symlink.
3452 * cr - credentials of caller.
3453 * ct - caller context
3454 * flags - case flags
3456 * RETURN: 0 if success
3457 * error code if failure
3459 * Timestamps:
3460 * dvp - ctime|mtime updated
3461 */
3462 /*ARGSUSED*/
3463 static int
3464 zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
3465 cred_t *cr)
3466 {
3467 znode_t *zp, *dzp = VTOZ(dvp);
3468 zfs_dirlock_t *dl;
3469 dmu_tx_t *tx;
3470 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
3471 zilog_t *zilog;
3472 int len = strlen(link);
3473 int error;
3474 int zflg = ZNEW;
3475 zfs_fuid_info_t *fuidp = NULL;
3476 int flags = 0;
3478 ASSERT(vap->va_type == VLNK);
3480 ZFS_ENTER(zfsvfs);
3481 ZFS_VERIFY_ZP(dzp);
3482 zilog = zfsvfs->z_log;
3484 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
3485 NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3486 ZFS_EXIT(zfsvfs);
3487 return (EILSEQ);
3488 }
3489 if (flags & FIGNORECASE)
3490 zflg |= ZCILOOK;
3491 top:
3492 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
3493 ZFS_EXIT(zfsvfs);
3494 return (error);
3495 }
3497 if (len > MAXPATHLEN) {
3498 ZFS_EXIT(zfsvfs);
3499 return (ENAMETOOLONG);
3500 }
3502 /*
3503 * Attempt to lock directory; fail if entry already exists.
3504 */
3505 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
3506 if (error) {
3507 ZFS_EXIT(zfsvfs);
3508 return (error);
3509 }
3511 tx = dmu_tx_create(zfsvfs->z_os);
3512 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
3513 dmu_tx_hold_bonus(tx, dzp->z_id);
3514 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
3515 if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
3516 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE);
3517 if (IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))) {
3518 if (zfsvfs->z_fuid_obj == 0) {
3519 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
3520 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
3521 FUID_SIZE_ESTIMATE(zfsvfs));
3522 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
3523 } else {
3524 dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
3525 dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
3526 FUID_SIZE_ESTIMATE(zfsvfs));
3527 }
3528 }
3529 error = dmu_tx_assign(tx, zfsvfs->z_assign);
3530 if (error) {
3531 zfs_dirent_unlock(dl, 0);
3532 if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
3533 dmu_tx_wait(tx);
3534 dmu_tx_abort(tx);
3535 goto top;
3536 }
3537 dmu_tx_abort(tx);
3538 ZFS_EXIT(zfsvfs);
3539 return (error);
3540 }
3542 dmu_buf_will_dirty(dzp->z_dbuf, tx);
3544 /*
3545 * Create a new object for the symlink.
3546 * Put the link content into bonus buffer if it will fit;
3547 * otherwise, store it just like any other file data.
3548 */
3549 if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) {
3550 zfs_mknode(dzp, vap, tx, cr, 0, &zp, len, NULL, &fuidp);
3551 if (len != 0)
3552 bcopy(link, zp->z_phys + 1, len);
3553 } else {
3554 dmu_buf_t *dbp;
3556 zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, NULL, &fuidp);
3557 /*
3558 * Nothing can access the znode yet so no locking needed
3559 * for growing the znode's blocksize.
3560 */
3561 zfs_grow_blocksize(zp, len, tx);
3563 VERIFY(0 == dmu_buf_hold(zfsvfs->z_os,
3564 zp->z_id, 0, FTAG, &dbp));
3565 dmu_buf_will_dirty(dbp, tx);
3567 ASSERT3U(len, <=, dbp->db_size);
3568 bcopy(link, dbp->db_data, len);
3569 dmu_buf_rele(dbp, FTAG);
3570 }
3571 zp->z_phys->zp_size = len;
3573 /*
3574 * Insert the new object into the directory.
3575 */
3576 (void) zfs_link_create(dl, zp, tx, ZNEW);
3577 out:
3578 if (error == 0) {
3579 uint64_t txtype = TX_SYMLINK;
3580 if (flags & FIGNORECASE)
3581 txtype |= TX_CI;
3582 zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
3583 *vpp = ZTOV(zp);
3584 }
3585 if (fuidp)
3586 zfs_fuid_info_free(fuidp);
3588 dmu_tx_commit(tx);
3590 zfs_dirent_unlock(dl, 0);
3592 ZFS_EXIT(zfsvfs);
3593 return (error);
3594 }
3596 /*
3597 * Return, in the buffer contained in the provided uio structure,
3598 * the symbolic path referred to by vp.
3600 * IN: vp - vnode of symbolic link.
3601 * uio - structure to contain the link path.
3602 * cr - credentials of caller.
3603 * ct - caller context
3605 * OUT: uio - structure to contain the link path.
3607 * RETURN: 0 if success
3608 * error code if failure
3610 * Timestamps:
3611 * vp - atime updated
3612 */
3613 /* ARGSUSED */
3614 static int
3615 zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
3616 {
3617 znode_t *zp = VTOZ(vp);
3618 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
3619 size_t bufsz;
3620 int error;
3622 ZFS_ENTER(zfsvfs);
3623 ZFS_VERIFY_ZP(zp);
3625 bufsz = (size_t)zp->z_phys->zp_size;
3626 if (bufsz + sizeof (znode_phys_t) <= zp->z_dbuf->db_size) {
3627 error = uiomove(zp->z_phys + 1,
3628 MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
3629 } else {
3630 dmu_buf_t *dbp;
3631 error = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0, FTAG, &dbp);
3632 if (error) {
3633 ZFS_EXIT(zfsvfs);
3634 return (error);
3635 }
3636 error = uiomove(dbp->db_data,
3637 MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
3638 dmu_buf_rele(dbp, FTAG);
3639 }
3641 ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
3642 ZFS_EXIT(zfsvfs);
3643 return (error);
3644 }
3646 /*
3647 * Insert a new entry into directory tdvp referencing svp.
3649 * IN: tdvp - Directory to contain new entry.
3650 * svp - vnode of new entry.
3651 * name - name of new entry.
3652 * cr - credentials of caller.
3653 * ct - caller context
3655 * RETURN: 0 if success
3656 * error code if failure
3658 * Timestamps:
3659 * tdvp - ctime|mtime updated
3660 * svp - ctime updated
3661 */
3662 /* ARGSUSED */
3663 static int
3664 zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
3665 caller_context_t *ct, int flags)
3666 {
3667 znode_t *dzp = VTOZ(tdvp);
3668 znode_t *tzp, *szp;
3669 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
3670 zilog_t *zilog;
3671 zfs_dirlock_t *dl;
3672 dmu_tx_t *tx;
3673 vnode_t *realvp;
3674 int error;
3675 int zf = ZNEW;
3676 uid_t owner;
3678 ASSERT(tdvp->v_type == VDIR);
3680 ZFS_ENTER(zfsvfs);
3681 ZFS_VERIFY_ZP(dzp);
3682 zilog = zfsvfs->z_log;
3684 if (VOP_REALVP(svp, &realvp, ct) == 0)
3685 svp = realvp;
3687 if (svp->v_vfsp != tdvp->v_vfsp) {
3688 ZFS_EXIT(zfsvfs);
3689 return (EXDEV);
3690 }
3691 szp = VTOZ(svp);
3692 ZFS_VERIFY_ZP(szp);
3694 if (zfsvfs->z_utf8 && u8_validate(name,
3695 strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3696 ZFS_EXIT(zfsvfs);
3697 return (EILSEQ);
3698 }
3699 if (flags & FIGNORECASE)
3700 zf |= ZCILOOK;
3702 top:
3703 /*
3704 * We do not support links between attributes and non-attributes
3705 * because of the potential security risk of creating links
3706 * into "normal" file space in order to circumvent restrictions
3707 * imposed in attribute space.
3708 */
3709 if ((szp->z_phys->zp_flags & ZFS_XATTR) !=
3710 (dzp->z_phys->zp_flags & ZFS_XATTR)) {
3711 ZFS_EXIT(zfsvfs);
3712 return (EINVAL);
3713 }
3715 /*
3716 * POSIX dictates that we return EPERM here.
3717 * Better choices include ENOTSUP or EISDIR.
3718 */
3719 if (svp->v_type == VDIR) {
3720 ZFS_EXIT(zfsvfs);
3721 return (EPERM);
3722 }
3724 owner = zfs_fuid_map_id(zfsvfs, szp->z_phys->zp_uid, cr, ZFS_OWNER);
3725 if (owner != crgetuid(cr) &&
3726 secpolicy_basic_link(cr) != 0) {
3727 ZFS_EXIT(zfsvfs);
3728 return (EPERM);
3729 }
3731 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
3732 ZFS_EXIT(zfsvfs);
3733 return (error);
3734 }
3736 /*
3737 * Attempt to lock directory; fail if entry already exists.
3738 */
3739 error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL);
3740 if (error) {
3741 ZFS_EXIT(zfsvfs);
3742 return (error);
3743 }
3745 tx = dmu_tx_create(zfsvfs->z_os);
3746 dmu_tx_hold_bonus(tx, szp->z_id);
3747 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
3748 error = dmu_tx_assign(tx, zfsvfs->z_assign);
3749 if (error) {
3750 zfs_dirent_unlock(dl, 0);
3751 if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
3752 dmu_tx_wait(tx);
3753 dmu_tx_abort(tx);
3754 goto top;
3755 }
3756 dmu_tx_abort(tx);
3757 ZFS_EXIT(zfsvfs);
3758 return (error);
3759 }
3761 error = zfs_link_create(dl, szp, tx, 0);
3763 if (error == 0) {
3764 uint64_t txtype = TX_LINK;
3765 if (flags & FIGNORECASE)
3766 txtype |= TX_CI;
3767 zfs_log_link(zilog, tx, txtype, dzp, szp, name);
3768 }
3770 dmu_tx_commit(tx);
3772 zfs_dirent_unlock(dl, 0);
3774 if (error == 0) {
3775 vnevent_link(svp, ct);
3776 }
3778 ZFS_EXIT(zfsvfs);
3779 return (error);
3780 }
3782 /*ARGSUSED*/
3784 /* CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid)); */
3785 /* CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid)); */
3787 /*ARGSUSED*/
3788 static int
3789 zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
3790 {
3791 znode_t *zp = VTOZ(vp);
3792 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
3793 uint32_t gen;
3794 uint64_t object = zp->z_id;
3795 zfid_short_t *zfid;
3796 int size, i;
3798 ZFS_ENTER(zfsvfs);
3799 ZFS_VERIFY_ZP(zp);
3800 gen = (uint32_t)zp->z_gen;
3802 size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
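/*
 * z_parent differs from zfsvfs for snapshots automounted under .zfs;
 * those use the long fid so that the objset id stored below keeps
 * file handles unique across a file system and its snapshots.
 */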
3803 fidp->fid_len = size;
3805 zfid = (zfid_short_t *)fidp;
3807 zfid->zf_len = size;
3809 for (i = 0; i < sizeof (zfid->zf_object); i++)
3810 zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
3812 /* Must have a non-zero generation number to distinguish from .zfs */
3813 if (gen == 0)
3814 gen = 1;
3815 for (i = 0; i < sizeof (zfid->zf_gen); i++)
3816 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
3818 if (size == LONG_FID_LEN) {
3819 uint64_t objsetid = dmu_objset_id(zfsvfs->z_os);
3820 zfid_long_t *zlfid;
3822 zlfid = (zfid_long_t *)fidp;
3824 for (i = 0; i < sizeof (zlfid->zf_setid); i++)
3825 zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
3827 /* XXX - this should be the generation number for the objset */
3828 for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
3829 zlfid->zf_setgen[i] = 0;
3830 }
3832 ZFS_EXIT(zfsvfs);
3833 return (0);
3834 }
3836 static int
3837 zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
3838 caller_context_t *ct)
3839 {
3840 znode_t *zp, *xzp;
3841 zfsvfs_t *zfsvfs;
3842 zfs_dirlock_t *dl;
3843 int error;
3845 switch (cmd) {
3846 case _PC_LINK_MAX:
3847 *valp = INT_MAX;
3848 return (0);
3850 case _PC_FILESIZEBITS:
3851 *valp = 64;
3852 return (0);
3854 #if 0
3855 case _PC_XATTR_EXISTS:
3856 zp = VTOZ(vp);
3857 zfsvfs = zp->z_zfsvfs;
3858 ZFS_ENTER(zfsvfs);
3859 ZFS_VERIFY_ZP(zp);
3860 *valp = 0;
3861 error = zfs_dirent_lock(&dl, zp, "", &xzp,
3862 ZXATTR | ZEXISTS | ZSHARED, NULL, NULL);
3863 if (error == 0) {
3864 zfs_dirent_unlock(dl, 0);
3865 if (!zfs_dirempty(xzp))
3866 *valp = 1;
3867 VN_RELE(ZTOV(xzp));
3868 } else if (error == ENOENT) {
3869 /*
3870 * If there aren't extended attributes, it's the
3871 * same as having zero of them.
3872 */
3873 error = 0;
3874 }
3875 ZFS_EXIT(zfsvfs);
3876 return (error);
3877 #endif
3879 case _PC_ACL_EXTENDED:
3880 *valp = 0; /* TODO */
3881 return (0);
3883 case _PC_MIN_HOLE_SIZE:
3884 *valp = (int)SPA_MINBLOCKSIZE;
3885 return (0);
3887 default:
3888 return (EOPNOTSUPP);
3889 }
3890 }
3892 static int
3893 zfs_netbsd_open(struct vop_open_args *ap)
3894 {
3895 vnode_t *vp = ap->a_vp;
3896 znode_t *zp = VTOZ(vp);
3897 int error;
3899 error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL);
3901 return (error);
3902 }
3904 static int
3905 zfs_netbsd_close(struct vop_close_args *ap)
3906 {
3908 return (zfs_close(ap->a_vp, ap->a_fflag, 0, 0, ap->a_cred, NULL));
3909 }
3911 static int
3912 zfs_netbsd_ioctl(struct vop_ioctl_args *ap)
3913 {
3915 return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
3916 ap->a_fflag, ap->a_cred, NULL, NULL));
3917 }
3920 static int
3921 zfs_netbsd_read(struct vop_read_args *ap)
3922 {
3924 return (zfs_read(ap->a_vp, ap->a_uio, ap->a_ioflag, ap->a_cred, NULL));
3925 }
3927 static int
3928 zfs_netbsd_write(struct vop_write_args *ap)
3929 {
3931 return (zfs_write(ap->a_vp, ap->a_uio, ap->a_ioflag, ap->a_cred, NULL));
3932 }
3934 static int
3935 zfs_netbsd_access(struct vop_access_args *ap)
3936 {
3938 /*
3939 * ZFS itself only knows about VREAD, VWRITE and VEXEC; the rest
3940 * we have to handle by calling vaccess().
3941 */
3942 if ((ap->a_mode & ~(VREAD|VWRITE|VEXEC)) != 0) {
3943 vnode_t *vp = ap->a_vp;
3944 znode_t *zp = VTOZ(vp);
3945 znode_phys_t *zphys = zp->z_phys;
3947 return (vaccess(vp->v_type, zphys->zp_mode, zphys->zp_uid,
3948 zphys->zp_gid, ap->a_mode, ap->a_cred));
3949 }
3951 return (zfs_access(ap->a_vp, ap->a_mode, 0, ap->a_cred, NULL));
3952 }
3954 static int
3955 zfs_netbsd_lookup(struct vop_lookup_args *ap)
3956 {
3957 struct componentname *cnp = ap->a_cnp;
3958 char nm[NAME_MAX + 1];
3959 int err;
3961 ASSERT(cnp->cn_namelen < sizeof(nm));
3962 strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm)));
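/*
 * cn_nameptr points into the full remaining pathname and is not
 * NUL-terminated at the component boundary, while zfs_lookup()
 * expects a NUL-terminated name; hence the bounded copy above.
 */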
3964 err = zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
3965 cnp->cn_cred, 0);
3967 return err;
3968 }
3970 static int
3971 zfs_netbsd_create(struct vop_create_args *ap)
3972 {
3973 struct componentname *cnp = ap->a_cnp;
3974 vattr_t *vap = ap->a_vap;
3975 int mode;
3977 ASSERT(cnp->cn_flags & SAVENAME);
3979 vattr_init_mask(vap);
3980 mode = vap->va_mode & ALLPERMS;
3982 return (zfs_create(ap->a_dvp, (char *)cnp->cn_nameptr, vap, !EXCL, mode,
3983 ap->a_vpp, cnp->cn_cred));
3984 }
3986 static int
3987 zfs_netbsd_remove(struct vop_remove_args *ap)
3988 {
3990 ASSERT(ap->a_cnp->cn_flags & SAVENAME);
3992 return (zfs_remove(ap->a_dvp, (char *)ap->a_cnp->cn_nameptr,
3993 ap->a_cnp->cn_cred, NULL, 0));
3994 }
3996 static int
3997 zfs_netbsd_mkdir(struct vop_mkdir_args *ap)
3998 {
3999 vattr_t *vap = ap->a_vap;
4001 ASSERT(ap->a_cnp->cn_flags & SAVENAME);
4003 vattr_init_mask(vap);
4005 return (zfs_mkdir(ap->a_dvp, (char *)ap->a_cnp->cn_nameptr, vap, ap->a_vpp,
4006 ap->a_cnp->cn_cred, NULL, 0, NULL));
4007 }
4009 static int
4010 zfs_netbsd_rmdir(struct vop_rmdir_args *ap)
4011 {
4012 struct componentname *cnp = ap->a_cnp;
4014 ASSERT(cnp->cn_flags & SAVENAME);
4016 return (zfs_rmdir(ap->a_dvp, (char *)cnp->cn_nameptr, NULL, cnp->cn_cred, NULL, 0));
4017 }
4019 static int
4020 zfs_netbsd_readdir(struct vop_readdir_args *ap)
4021 {
4023 return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag,
4024 ap->a_ncookies, (u_long **)ap->a_cookies));
4025 }
4027 static int
4028 zfs_netbsd_fsync(struct vop_fsync_args *ap)
4029 {
4031 return (zfs_fsync(ap->a_vp, ap->a_flags, ap->a_cred, NULL));
4032 }
4034 static int
4035 zfs_netbsd_getattr(struct vop_getattr_args *ap)
4036 {
4037 vattr_t *vap = ap->a_vap;
4038 xvattr_t xvap;
4039 u_long fflags = 0;
4040 int error;
4042 xva_init(&xvap);
4043 xvap.xva_vattr = *vap;
4044 xvap.xva_vattr.va_mask |= AT_XVATTR;
4046 /* Convert chflags into ZFS-type flags. */
4047 /* XXX: what about SF_SETTABLE? */
4048 XVA_SET_REQ(&xvap, XAT_IMMUTABLE);
4049 XVA_SET_REQ(&xvap, XAT_APPENDONLY);
4050 XVA_SET_REQ(&xvap, XAT_NOUNLINK);
4051 XVA_SET_REQ(&xvap, XAT_NODUMP);
4052 error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL);
4053 if (error != 0)
4054 return (error);
4056 /* Convert ZFS xattr into chflags. */
4057 #define FLAG_CHECK(fflag, xflag, xfield) do { \
4058 if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0) \
4059 fflags |= (fflag); \
4060 } while (0)
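/*
 * For example, the first FLAG_CHECK below adds SF_IMMUTABLE to fflags
 * only when ZFS returned XAT_IMMUTABLE and the attribute is set.
 */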
4061 FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE,
4062 xvap.xva_xoptattrs.xoa_immutable);
4063 FLAG_CHECK(SF_APPEND, XAT_APPENDONLY,
4064 xvap.xva_xoptattrs.xoa_appendonly);
4065 FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK,
4066 xvap.xva_xoptattrs.xoa_nounlink);
4067 FLAG_CHECK(UF_NODUMP, XAT_NODUMP,
4068 xvap.xva_xoptattrs.xoa_nodump);
4069 #undef FLAG_CHECK
4070 *vap = xvap.xva_vattr;
4071 vap->va_flags = fflags;
4072 return (0);
4073 }
4075 static int
4076 zfs_netbsd_setattr(struct vop_setattr_args *ap)
4077 {
4078 vnode_t *vp = ap->a_vp;
4079 vattr_t *vap = ap->a_vap;
4080 cred_t *cred = ap->a_cred;
4081 xvattr_t xvap;
4082 u_long fflags;
4083 uint64_t zflags;
4085 vattr_init_mask(vap);
4086 vap->va_mask &= ~AT_NOSET;
4088 xva_init(&xvap);
4089 xvap.xva_vattr = *vap;
4091 zflags = VTOZ(vp)->z_phys->zp_flags;
4093 if (vap->va_flags != VNOVAL) {
4094 int error;
4096 fflags = vap->va_flags;
4097 if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_NODUMP)) != 0)
4098 return (EOPNOTSUPP);
4099 /*
4100 * Callers may only modify the file flags on objects they
4101 * have VADMIN rights for.
4102 */
4103 if ((error = VOP_ACCESS(vp, VWRITE, cred)) != 0)
4104 return (error);
4105 /*
4106 * Unprivileged processes are not permitted to unset system
4107 * flags, or modify flags if any system flags are set.
4108 * Privileged non-jail processes may not modify system flags
4109 * if securelevel > 0 and any existing system flags are set.
4110 * Privileged jail processes behave like privileged non-jail
4111 * processes if the security.jail.chflags_allowed sysctl is
4112 * non-zero; otherwise, they behave like unprivileged
4113 * processes.
4114 */
4115 if (kauth_authorize_system(cred, KAUTH_SYSTEM_CHSYSFLAGS, 0,
4116 NULL, NULL, NULL) != 0) {
4118 if (zflags &
4119 (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
4120 return (EPERM);
4121 }
4122 if (fflags &
4123 (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) {
4124 return (EPERM);
4125 }
4126 }
4128 #define FLAG_CHANGE(fflag, zflag, xflag, xfield) do { \
4129 if (((fflags & (fflag)) && !(zflags & (zflag))) || \
4130 ((zflags & (zflag)) && !(fflags & (fflag)))) { \
4131 XVA_SET_REQ(&xvap, (xflag)); \
4132 (xfield) = ((fflags & (fflag)) != 0); \
4134 } while (0)
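/*
 * FLAG_CHANGE requests an attribute update only when the chflags bit
 * and the current ZFS flag disagree, so flags that are not actually
 * changing never dirty the znode.
 */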
4135 /* Convert chflags into ZFS-type flags. */
4136 /* XXX: what about SF_SETTABLE? */
4137 FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
4138 xvap.xva_xoptattrs.xoa_immutable);
4139 FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
4140 xvap.xva_xoptattrs.xoa_appendonly);
4141 FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
4142 xvap.xva_xoptattrs.xoa_nounlink);
4143 FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
4144 xvap.xva_xoptattrs.xoa_nodump);
4145 #undef FLAG_CHANGE
4146 }
4147 return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL));
4148 }
4150 static int
4151 zfs_netbsd_rename(ap)
4152 struct vop_rename_args /* {
4153 struct vnode *a_fdvp;
4154 struct vnode *a_fvp;
4155 struct componentname *a_fcnp;
4156 struct vnode *a_tdvp;
4157 struct vnode *a_tvp;
4158 struct componentname *a_tcnp;
4159 } */ *ap;
4160 {
4161 vnode_t *fdvp = ap->a_fdvp;
4162 vnode_t *fvp = ap->a_fvp;
4163 vnode_t *tdvp = ap->a_tdvp;
4164 vnode_t *tvp = ap->a_tvp;
4165 int error;
4167 ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
4168 ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));
4170 error = zfs_rename(fdvp, (char *)ap->a_fcnp->cn_nameptr, tdvp,
4171 (char *)ap->a_tcnp->cn_nameptr, ap->a_fcnp->cn_cred, NULL, 0);
4173 if (tdvp == tvp)
4174 VN_RELE(tdvp);
4175 else
4176 VN_URELE(tdvp);
4177 if (tvp)
4178 VN_URELE(tvp);
4179 VN_RELE(fdvp);
4180 VN_RELE(fvp);
4182 return (error);
4183 }
4185 static int
4186 zfs_netbsd_symlink(struct vop_symlink_args *ap)
4187 {
4188 struct componentname *cnp = ap->a_cnp;
4189 vattr_t *vap = ap->a_vap;
4191 ASSERT(cnp->cn_flags & SAVENAME);
4193 vap->va_type = VLNK; /* NetBSD: the syscall only sets va_mode. */
4194 vattr_init_mask(vap);
4196 return (zfs_symlink(ap->a_dvp, ap->a_vpp, (char *)cnp->cn_nameptr, vap,
4197 ap->a_target, cnp->cn_cred));
4198 }
4200 static int
4201 zfs_netbsd_readlink(struct vop_readlink_args *ap)
4202 {
4204 return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL));
4205 }
4207 static int
4208 zfs_netbsd_link(struct vop_link_args *ap)
4209 {
4210 struct componentname *cnp = ap->a_cnp;
4212 ASSERT(cnp->cn_flags & SAVENAME);
4214 return (zfs_link(ap->a_dvp, ap->a_vp, (char *)cnp->cn_nameptr, cnp->cn_cred, NULL, 0));
4215 }
4217 static int
4218 zfs_netbsd_inactive(struct vop_inactive_args *ap)
4219 {
4220 vnode_t *vp = ap->a_vp;
4221 znode_t *zp = VTOZ(vp);
4223 /*
4224 * NetBSD: nothing to do here, other than indicate if the
4225 * vnode should be reclaimed. No need to lock, if we race
4226 * vrele() will call us again.
4227 */
4228 *ap->a_recycle = (zp->z_unlinked != 0);
4229 VOP_UNLOCK(vp, 0);
4230 return (0);
4231 }
4233 /*
4234 * Destroy znode from taskq thread without ZFS_OBJ_MUTEX held.
4235 */
4236 static void
4237 zfs_reclaim_deferred(void *arg, int pending)
4238 {
4239 znode_t *zp = arg;
4240 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4241 uint64_t z_id = zp->z_id;
4243 /*
4244 * Don't allow a zfs_zget() while we're trying to release this znode.
4245 */
4246 ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);
4248 /* No need to call ZFS_OBJ_HOLD_EXIT; zfs_zinactive does that for us. */
4249 zfs_zinactive(zp);
4250 }
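/*
 * Dispatching through the system taskq lets ZFS_OBJ_HOLD_ENTER run
 * from a thread that cannot already hold the object mutex, avoiding
 * the self-deadlock zfs_netbsd_reclaim() works around below.
 */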
4253 static int
4254 zfs_netbsd_reclaim(struct vop_reclaim_args *ap)
4255 {
4256 vnode_t *vp = ap->a_vp;
4257 znode_t *zp = VTOZ(vp);
4258 zfsvfs_t *zfsvfs;
4259 int locked;
4261 locked = 0;
4263 ASSERT(zp != NULL);
4264 KASSERT(!vn_has_cached_data(vp));
4266 zfsvfs = zp->z_zfsvfs;
4268 mutex_enter(&zp->z_lock);
4269 ASSERT(zp->z_phys);
4271 // dprintf("destroying znode %p -- vnode %p -- zp->z_buf = %p\n", zp, ZTOV(zp), zp->z_dbuf);
4272 // rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
4273 genfs_node_destroy(vp);
4274 cache_purge(vp);
4276 if (zp->z_dbuf == NULL) {
4277 /*
4278 * The fs has been unmounted, or we did a
4279 * suspend/resume and this file no longer exists.
4280 */
4281 rw_exit(&zfsvfs->z_teardown_inactive_lock);
4282 mutex_exit(&zp->z_lock);
4283 zfs_znode_free(zp);
4284 return (0);
4285 }
4286 mutex_exit(&zp->z_lock);
4288 mutex_enter(&zp->z_lock);
4289 if (!zp->z_unlinked) {
4290 /*
4291 * XXX Hack: because ZFS_OBJ_MUTEX is held we can't call zfs_zinactive
4292 * now; defer zfs_zinactive to another thread which doesn't hold this mutex.
4293 */
4294 locked = MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)) ? 2 :
4295 ZFS_OBJ_HOLD_TRYENTER(zfsvfs, zp->z_id);
4296 if (locked == 0) {
4297 /*
4298 * Lock can't be obtained due to deadlock possibility,
4299 * so defer znode destruction.
4300 */
4301 taskq_dispatch(system_taskq, zfs_reclaim_deferred, zp, 0);
4302 } else {
4303 zfs_znode_dmu_fini(zp);
4304 /* If the ZFS_OBJ_MUTEX was already held before we tried to take it,
4305 * it belongs to our caller, so we must not release it here. */
4306 if (locked == 1)
4307 ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id);
4308 zfs_znode_free(zp);
4309 }
4310 } else
4311 mutex_exit(&zp->z_lock);
4313 ZTOV(zp) = NULL;
4314 vp->v_data = NULL; /* v_data must be NULL for a cleaned vnode. */
4316 return (0);
4317 }
4319 static int
4320 zfs_netbsd_fid(struct vop_fid_args *ap)
4321 {
4323 return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL));
4324 }
4326 static int
4327 zfs_netbsd_pathconf(struct vop_pathconf_args *ap)
4328 {
4329 ulong_t val;
4330 int error;
4332 error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->l_cred, NULL);
4333 if (error == 0)
4334 *ap->a_retval = val;
4335 else if (error == EOPNOTSUPP) {
4336 switch (ap->a_name) {
4337 case _PC_NAME_MAX:
4338 *ap->a_retval = NAME_MAX;
4339 return (0);
4340 case _PC_PATH_MAX:
4341 *ap->a_retval = PATH_MAX;
4342 return (0);
4343 case _PC_LINK_MAX:
4344 *ap->a_retval = LINK_MAX;
4345 return (0);
4346 case _PC_MAX_CANON:
4347 *ap->a_retval = MAX_CANON;
4348 return (0);
4349 case _PC_MAX_INPUT:
4350 *ap->a_retval = MAX_INPUT;
4351 return (0);
4352 case _PC_PIPE_BUF:
4353 *ap->a_retval = PIPE_BUF;
4354 return (0);
4355 case _PC_CHOWN_RESTRICTED:
4356 *ap->a_retval = 1;
4357 return (0);
4358 case _PC_VDISABLE:
4359 *ap->a_retval = _POSIX_VDISABLE;
4360 return (0);
4361 default:
4362 return (EINVAL);
4363 }
4364 /* NOTREACHED */
4365 }
4366 return (error);
4367 }
4369 static int
4370 zfs_netbsd_lock(struct vop_lock_args *ap)
4371 {
4372 struct vnode *vp = ap->a_vp;
4373 int flags = ap->a_flags;
4375 if ((flags & LK_INTERLOCK) != 0) {
4376 mutex_exit(&vp->v_interlock);
4377 }
4379 return 0;
4380 }
4382 static int
4383 zfs_netbsd_unlock(void *v)
4384 {
4386 return 0;
4387 }
4389 static int
4390 zfs_netbsd_getpages(void *v)
4391 {
4392 struct vnode *vp = ((struct vop_getpages_args *)v)->a_vp;
4393 voff_t offset = ((struct vop_getpages_args *)v)->a_offset;
4394 struct vm_page **m = ((struct vop_getpages_args *)v)->a_m;
4395 int *count = ((struct vop_getpages_args *)v)->a_count;
4396 int centeridx = ((struct vop_getpages_args *)v)->a_centeridx;
4397 vm_prot_t access_type = ((struct vop_getpages_args *)v)->a_access_type;
4398 int advice = ((struct vop_getpages_args *)v)->a_advice;
4399 int flags = ((struct vop_getpages_args *)v)->a_flags;
4401 int error;
4403 error = 0;
4405 KASSERT(!vn_has_cached_data(vp));
4406 mutex_exit(&vp->v_interlock);
4408 return error;
4409 }
4412 static int
4413 zfs_netbsd_putpages(void *v)
4414 {
4415 struct vnode *vp = ((struct vop_putpages_args *)v)->a_vp;
4416 voff_t offlo = ((struct vop_putpages_args *)v)->a_offlo;
4417 voff_t offhi = ((struct vop_putpages_args *)v)->a_offhi;
4418 int flags = ((struct vop_putpages_args *)v)->a_flags;
4419 znode_t *zp = VTOZ(vp);
4421 int error;
4423 dprintf("putpages entry %p -- zfsvfs %p\n", vp, zp->z_zfsvfs);
4424 error = genfs_putpages(v);
4425 dprintf("putpages exit %p -- zfsvfs %p\n", vp, zp->z_zfsvfs);
4427 return error;
4428 }
4430 #define zfs_netbsd_seek genfs_seek
4431 #define zfs_netbsd_mmap genfs_mmap
4432 #define zfs_netbsd_getpages genfs_compat_getpages
4433 //#define zfs_netbsd_putpages genfs_putpages
4434 #define zfs_netbsd_islocked genfs_islocked
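/*
 * The genfs defines above plug generic implementations into the
 * vnodeop table below; note that the zfs_netbsd_getpages define also
 * shadows the local zfs_netbsd_getpages() function, so the table
 * entry resolves to genfs_compat_getpages.
 */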
4436 int (**zfs_vnodeop_p)(void *);
4437 const struct vnodeopv_entry_desc zfs_vnodeop_entries[] = {
4438 { &vop_default_desc, vn_default_error },
4439 { &vop_lookup_desc, zfs_netbsd_lookup },
4440 { &vop_create_desc, zfs_netbsd_create },
4441 { &vop_open_desc, zfs_netbsd_open },
4442 { &vop_close_desc, zfs_netbsd_close },
4443 { &vop_access_desc, zfs_netbsd_access },
4444 { &vop_getattr_desc, zfs_netbsd_getattr },
4445 { &vop_setattr_desc, zfs_netbsd_setattr },
4446 { &vop_read_desc, zfs_netbsd_read },
4447 { &vop_write_desc, zfs_netbsd_write },
4448 { &vop_ioctl_desc, zfs_netbsd_ioctl },
4449 { &vop_fsync_desc, zfs_netbsd_fsync },
4450 { &vop_remove_desc, zfs_netbsd_remove },
4451 { &vop_link_desc, zfs_netbsd_link },
4452 { &vop_lock_desc, zfs_netbsd_lock },
4453 { &vop_unlock_desc, zfs_netbsd_unlock },
4454 { &vop_rename_desc, zfs_netbsd_rename },
4455 { &vop_mkdir_desc, zfs_netbsd_mkdir },
4456 { &vop_rmdir_desc, zfs_netbsd_rmdir },
4457 { &vop_symlink_desc, zfs_netbsd_symlink },
4458 { &vop_readdir_desc, zfs_netbsd_readdir },
4459 { &vop_readlink_desc, zfs_netbsd_readlink },
4460 { &vop_inactive_desc, zfs_netbsd_inactive },
4461 { &vop_reclaim_desc, zfs_netbsd_reclaim },
4462 { &vop_pathconf_desc, zfs_netbsd_pathconf },
4463 { &vop_seek_desc, zfs_netbsd_seek },
4464 { &vop_getpages_desc, zfs_netbsd_getpages },
4465 { &vop_putpages_desc, zfs_netbsd_putpages },
4466 { &vop_mmap_desc, zfs_netbsd_mmap },
4467 { &vop_islocked_desc, zfs_netbsd_islocked },
4468 #ifdef notyet
4469 { &vop_advlock_desc, zfs_netbsd_advlock },
4470 { &vop_fcntl_desc, zfs_netbsd_fcntl },
4471 { &vop_bmap_desc, zfs_netbsd_bmap },
4472 { &vop_strategy_desc, zfs_netbsd_strategy },
4473 { &vop_print_desc, zfs_netbsd_print },
4474 { &vop_bwrite_desc, zfs_netbsd_bwrite },
4475 #endif
4476 { NULL, NULL }
4477 };
4479 const struct vnodeopv_desc zfs_vnodeop_opv_desc =
4480 { &zfs_vnodeop_p, zfs_vnodeop_entries };
4482 #if 0
4483 struct vop_vector zfs_vnodeops;
4484 struct vop_vector zfs_fifoops;
4488 struct vop_vector zfs_vnodeops = {
4489 .vop_default = &default_vnodeops,
4490 .vop_inactive = zfs_netbsd_inactive,
4491 .vop_reclaim = zfs_netbsd_reclaim,
4492 .vop_access = zfs_netbsd_access,
4493 .vop_lookup = zfs_netbsd_lookup,
4494 .vop_getattr = zfs_netbsd_getattr,
4495 .vop_setattr = zfs_netbsd_setattr,
4496 .vop_create = zfs_netbsd_create,
4497 .vop_mknod = zfs_netbsd_create,
4498 .vop_mkdir = zfs_netbsd_mkdir,
4499 .vop_readdir = zfs_netbsd_readdir,
4500 .vop_fsync = zfs_netbsd_fsync,
4501 .vop_open = zfs_netbsd_open,
4502 .vop_close = zfs_netbsd_close,
4503 .vop_rmdir = zfs_netbsd_rmdir,
4504 .vop_ioctl = zfs_netbsd_ioctl,
4505 .vop_link = zfs_netbsd_link,
4506 .vop_lock = zfs_netbsd_lock,
4507 .vop_unlock = zfs_netbsd_unlock,
4508 .vop_symlink = zfs_netbsd_symlink,
4509 .vop_readlink = zfs_netbsd_readlink,
4510 .vop_read = zfs_netbsd_read,
4511 .vop_write = zfs_netbsd_write,
4512 .vop_remove = zfs_netbsd_remove,
4513 .vop_rename = zfs_netbsd_rename,
4514 .vop_pathconf = zfs_netbsd_pathconf,
4515 .vop_bmap = VOP_EOPNOTSUPP,
4516 .vop_fid = zfs_netbsd_fid,
4517 .vop_getextattr = zfs_getextattr,
4518 .vop_deleteextattr = zfs_deleteextattr,
4519 .vop_setextattr = zfs_setextattr,
4520 .vop_listextattr = zfs_listextattr,
4521 #ifdef notyet
4522 .vop_getacl = zfs_netbsd_getacl,
4523 .vop_setacl = zfs_netbsd_setacl,
4524 .vop_aclcheck = zfs_netbsd_aclcheck,
4525 #endif
4526 };
4528 struct vop_vector zfs_fifoops = {
4529 .vop_default = &fifo_specops,
4530 .vop_fsync = VOP_PANIC,
4531 .vop_access = zfs_netbsd_access,
4532 .vop_getattr = zfs_netbsd_getattr,
4533 .vop_inactive = zfs_netbsd_inactive,
4534 .vop_read = VOP_PANIC,
4535 .vop_reclaim = zfs_netbsd_reclaim,
4536 .vop_setattr = zfs_netbsd_setattr,
4537 .vop_write = VOP_PANIC,
4538 .vop_fid = zfs_netbsd_fid,
4539 #ifdef notyet
4540 .vop_getacl = zfs_netbsd_getacl,
4541 .vop_setacl = zfs_netbsd_setacl,
4542 .vop_aclcheck = zfs_netbsd_aclcheck,
4543 #endif
4544 };
4545 #endif