module/os/freebsd/zfs/zfs_vnops_os.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or https://opensource.org/licenses/CDDL-1.0.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  25  * Copyright (c) 2014 Integros [integros.com]
  26  * Copyright 2017 Nexenta Systems, Inc.
  27  */
  28
  29 /* Portions Copyright 2007 Jeremy Teo */
  30 /* Portions Copyright 2010 Robert Milkowski */
  31
  32 #include <sys/param.h>
  33 #include <sys/time.h>
  34 #include <sys/systm.h>
  35 #include <sys/sysmacros.h>
  36 #include <sys/resource.h>
  37 #include <security/mac/mac_framework.h>
  38 #include <sys/vfs.h>
  39 #include <sys/endian.h>
  40 #include <sys/vm.h>
  41 #include <sys/vnode.h>
  42 #include <sys/smr.h>
  43 #include <sys/dirent.h>
  44 #include <sys/file.h>
  45 #include <sys/stat.h>
  46 #include <sys/kmem.h>
  47 #include <sys/taskq.h>
  48 #include <sys/uio.h>
  49 #include <sys/atomic.h>
  50 #include <sys/namei.h>
  51 #include <sys/mman.h>
  52 #include <sys/cmn_err.h>
  53 #include <sys/kdb.h>
  54 #include <sys/sysproto.h>
  55 #include <sys/errno.h>
  56 #include <sys/unistd.h>
  57 #include <sys/zfs_dir.h>
  58 #include <sys/zfs_ioctl.h>
  59 #include <sys/fs/zfs.h>
  60 #include <sys/dmu.h>
  61 #include <sys/dmu_objset.h>
  62 #include <sys/spa.h>
  63 #include <sys/txg.h>
  64 #include <sys/dbuf.h>
  65 #include <sys/zap.h>
  66 #include <sys/sa.h>
  67 #include <sys/policy.h>
  68 #include <sys/sunddi.h>
  69 #include <sys/filio.h>
  70 #include <sys/sid.h>
  71 #include <sys/zfs_ctldir.h>
  72 #include <sys/zfs_fuid.h>
  73 #include <sys/zfs_quota.h>
  74 #include <sys/zfs_sa.h>
  75 #include <sys/zfs_rlock.h>
  76 #include <sys/bio.h>
  77 #include <sys/buf.h>
  78 #include <sys/sched.h>
  79 #include <sys/acl.h>
  80 #include <sys/vmmeter.h>
  81 #include <vm/vm_param.h>
  82 #include <sys/zil.h>
  83 #include <sys/zfs_vnops.h>
  84 #include <sys/module.h>
  85 #include <sys/sysent.h>
  86 #include <sys/dmu_impl.h>
  87 #include <sys/brt.h>
  88 #include <sys/zfeature.h>
  89
  90 #include <vm/vm_object.h>
  91
  92 #include <sys/extattr.h>
  93 #include <sys/priv.h>
  94
  95 #ifndef VN_OPEN_INVFS
  96 #define VN_OPEN_INVFS   0x0
  97 #endif
  98
  99 VFS_SMR_DECLARE;
 100
 101 #ifdef DEBUG_VFS_LOCKS
 102 #define VNCHECKREF(vp)                            \
 103         VNASSERT((vp)->v_holdcnt > 0 && (vp)->v_usecount > 0, vp,       \
 104             ("%s: wrong ref counts", __func__));
 105 #else
 106 #define VNCHECKREF(vp)
 107 #endif
 108
 109 #if __FreeBSD_version >= 1400045
 110 typedef uint64_t cookie_t;
 111 #else
 112 typedef ulong_t cookie_t;
 113 #endif
 114
 115 /*
 116  * Programming rules.
 117  *
 118  * Each vnode op performs some logical unit of work.  To do this, the ZPL must
 119  * properly lock its in-core state, create a DMU transaction, do the work,
 120  * record this work in the intent log (ZIL), commit the DMU transaction,
 121  * and wait for the intent log to commit if it is a synchronous operation.
 122  * Moreover, the vnode ops must work in both normal and log replay context.
 123  * The ordering of events is important to avoid deadlocks and references
 124  * to freed memory.  The example below illustrates the following Big Rules:
 125  *
 126  *  (1) A check must be made in each zfs thread for a mounted file system.
 127  *      This is done avoiding races using zfs_enter(zfsvfs).
 128  *      A zfs_exit(zfsvfs) is needed before all returns.  Any znodes
 129  *      must be checked with zfs_verify_zp(zp).  Both of these macros
 130  *      can return EIO from the calling function.
 131  *
 132  *  (2) VN_RELE() should always be the last thing except for zil_commit()
 133  *      (if necessary) and zfs_exit(). This is for 3 reasons:
 134  *      First, if it's the last reference, the vnode/znode
 135  *      can be freed, so the zp may point to freed memory.  Second, the last
 136  *      reference will call zfs_zinactive(), which may induce a lot of work --
 137  *      pushing cached pages (which acquires range locks) and syncing out
 138  *      cached atime changes.  Third, zfs_zinactive() may require a new tx,
 139  *      which could deadlock the system if you were already holding one.
 140  *      If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
 141  *
 142  *  (3) All range locks must be grabbed before calling dmu_tx_assign(),
 143  *      as they can span dmu_tx_assign() calls.
 144  *
 145  *  (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
 146  *      dmu_tx_assign().  This is critical because we don't want to block
 147  *      while holding locks.
 148  *
 149  *      If no ZPL locks are held (aside from zfs_enter()), use TXG_WAIT.  This
 150  *      reduces lock contention and CPU usage when we must wait (note that if
 151  *      throughput is constrained by the storage, nearly every transaction
 152  *      must wait).
 153  *
 154  *      Note, in particular, that if a lock is sometimes acquired before
 155  *      the tx assigns, and sometimes after (e.g. z_lock), then failing
 156  *      to use a non-blocking assign can deadlock the system.  The scenario:
 157  *
 158  *      Thread A has grabbed a lock before calling dmu_tx_assign().
 159  *      Thread B is in an already-assigned tx, and blocks for this lock.
 160  *      Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
 161  *      forever, because the previous txg can't quiesce until B's tx commits.
 162  *
 163  *      If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
 164  *      then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
 165  *      calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
 166  *      to indicate that this operation has already called dmu_tx_wait().
 167  *      This will ensure that we don't retry forever, waiting a short bit
 168  *      each time.
 169  *
 170  *  (5) If the operation succeeded, generate the intent log entry for it
 171  *      before dropping locks.  This ensures that the ordering of events
 172  *      in the intent log matches the order in which they actually occurred.
 173  *      During ZIL replay the zfs_log_* functions will update the sequence
 174  *      number to indicate the zil transaction has replayed.
 175  *
 176  *  (6) At the end of each vnode op, the DMU tx must always commit,
 177  *      regardless of whether there were any errors.
 178  *
 179  *  (7) After dropping all locks, invoke zil_commit(zilog, foid)
 180  *      to ensure that synchronous semantics are provided when necessary.
 181  *
 182  * In general, this is how things should be ordered in each vnode op:
 183  *
 184  *      zfs_enter(zfsvfs);              // exit if unmounted
 185  * top:
 186  *      zfs_dirent_lookup(&dl, ...)     // lock directory entry (may VN_HOLD())
 187  *      rw_enter(...);                  // grab any other locks you need
 188  *      tx = dmu_tx_create(...);        // get DMU tx
 189  *      dmu_tx_hold_*();                // hold each object you might modify
 190  *      error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 191  *      if (error) {
 192  *              rw_exit(...);           // drop locks
 193  *              zfs_dirent_unlock(dl);  // unlock directory entry
 194  *              VN_RELE(...);           // release held vnodes
 195  *              if (error == ERESTART) {
 196  *                      waited = B_TRUE;
 197  *                      dmu_tx_wait(tx);
 198  *                      dmu_tx_abort(tx);
 199  *                      goto top;
 200  *              }
 201  *              dmu_tx_abort(tx);       // abort DMU tx
 202  *              zfs_exit(zfsvfs);       // finished in zfs
 203  *              return (error);         // really out of space
 204  *      }
 205  *      error = do_real_work();         // do whatever this VOP does
 206  *      if (error == 0)
 207  *              zfs_log_*(...);         // on success, make ZIL entry
 208  *      dmu_tx_commit(tx);              // commit DMU tx -- error or not
 209  *      rw_exit(...);                   // drop locks
 210  *      zfs_dirent_unlock(dl);          // unlock directory entry
 211  *      VN_RELE(...);                   // release held vnodes
 212  *      zil_commit(zilog, foid);        // synchronous when necessary
 213  *      zfs_exit(zfsvfs);               // finished in zfs
 214  *      return (error);                 // done, report error
 215  */
 216 static int
 217 zfs_open(vnode_t **vpp, int flag, cred_t *cr)
 218 {
 219         (void) cr;
 220         znode_t *zp = VTOZ(*vpp);
 221         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 222         int error;
 223
 224         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 225                 return (error);
 226
 227         if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
 228             ((flag & FAPPEND) == 0)) {
 229                 zfs_exit(zfsvfs, FTAG);
 230                 return (SET_ERROR(EPERM));
 231         }
 232
 233         /*
 234          * Keep a count of the synchronous opens in the znode.  On first
 235          * synchronous open we must convert all previous async transactions
 236          * into sync to keep correct ordering.
 237          */
 238         if (flag & O_SYNC) {
 239                 if (atomic_inc_32_nv(&zp->z_sync_cnt) == 1)
 240                         zil_async_to_sync(zfsvfs->z_log, zp->z_id);
 241         }
 242
 243         zfs_exit(zfsvfs, FTAG);
 244         return (0);
 245 }
 246
 247 static int
 248 zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr)
 249 {
 250         (void) offset, (void) cr;
 251         znode_t *zp = VTOZ(vp);
 252         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 253         int error;
 254
 255         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 256                 return (error);
 257
 258         /* Decrement the synchronous opens in the znode */
 259         if ((flag & O_SYNC) && (count == 1))
 260                 atomic_dec_32(&zp->z_sync_cnt);
 261
 262         zfs_exit(zfsvfs, FTAG);
 263         return (0);
 264 }
 265
 266 static int
 267 zfs_ioctl(vnode_t *vp, ulong_t com, intptr_t data, int flag, cred_t *cred,
 268     int *rvalp)
 269 {
 270         (void) flag, (void) cred, (void) rvalp;
 271         loff_t off;
 272         int error;
 273
 274         switch (com) {
 275         case _FIOFFS:
 276         {
 277                 return (0);
 278
 279                 /*
 280                  * The following two ioctls are used by bfu.  Faking out,
 281                  * necessary to avoid bfu errors.
 282                  */
 283         }
 284         case _FIOGDIO:
 285         case _FIOSDIO:
 286         {
 287                 return (0);
 288         }
 289
 290         case F_SEEK_DATA:
 291         case F_SEEK_HOLE:
 292         {
 293                 off = *(offset_t *)data;
 294                 /* offset parameter is in/out */
 295                 error = zfs_holey(VTOZ(vp), com, &off);
 296                 if (error)
 297                         return (error);
 298                 *(offset_t *)data = off;
 299                 return (0);
 300         }
 301         }
 302         return (SET_ERROR(ENOTTY));
 303 }
 304
 305 static vm_page_t
 306 page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
 307 {
 308         vm_object_t obj;
 309         vm_page_t pp;
 310         int64_t end;
 311
 312         /*
 313          * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE
 314          * aligned boundaries, if the range is not aligned.  As a result a
 315          * DEV_BSIZE subrange with partially dirty data may get marked as clean.
 316          * It may happen that all DEV_BSIZE subranges are marked clean and thus
 317          * the whole page would be considered clean despite have some
 318          * dirty data.
 319          * For this reason we should shrink the range to DEV_BSIZE aligned
 320          * boundaries before calling vm_page_clear_dirty.
 321          */
 322         end = rounddown2(off + nbytes, DEV_BSIZE);
 323         off = roundup2(off, DEV_BSIZE);
 324         nbytes = end - off;
 325
 326         obj = vp->v_object;
 327         vm_page_grab_valid_unlocked(&pp, obj, OFF_TO_IDX(start),
 328             VM_ALLOC_NOCREAT | VM_ALLOC_SBUSY | VM_ALLOC_NORMAL |
 329             VM_ALLOC_IGN_SBUSY);
 330         if (pp != NULL) {
 331                 ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
 332                 vm_object_pip_add(obj, 1);
 333                 pmap_remove_write(pp);
 334                 if (nbytes != 0)
 335                         vm_page_clear_dirty(pp, off, nbytes);
 336         }
 337         return (pp);
 338 }
 339
 340 static void
 341 page_unbusy(vm_page_t pp)
 342 {
 343
 344         vm_page_sunbusy(pp);
 345         vm_object_pip_wakeup(pp->object);
 346 }
 347
 348 static vm_page_t
 349 page_hold(vnode_t *vp, int64_t start)
 350 {
 351         vm_object_t obj;
 352         vm_page_t m;
 353
 354         obj = vp->v_object;
 355         vm_page_grab_valid_unlocked(&m, obj, OFF_TO_IDX(start),
 356             VM_ALLOC_NOCREAT | VM_ALLOC_WIRED | VM_ALLOC_IGN_SBUSY |
 357             VM_ALLOC_NOBUSY);
 358         return (m);
 359 }
 360
 361 static void
 362 page_unhold(vm_page_t pp)
 363 {
 364         vm_page_unwire(pp, PQ_ACTIVE);
 365 }
 366
 367 /*
 368  * When a file is memory mapped, we must keep the IO data synchronized
 369  * between the DMU cache and the memory mapped pages.  What this means:
 370  *
 371  * On Write:    If we find a memory mapped page, we write to *both*
 372  *              the page and the dmu buffer.
 373  */
 374 void
 375 update_pages(znode_t *zp, int64_t start, int len, objset_t *os)
 376 {
 377         vm_object_t obj;
 378         struct sf_buf *sf;
 379         vnode_t *vp = ZTOV(zp);
 380         caddr_t va;
 381         int off;
 382
 383         ASSERT3P(vp->v_mount, !=, NULL);
 384         obj = vp->v_object;
 385         ASSERT3P(obj, !=, NULL);
 386
 387         off = start & PAGEOFFSET;
 388         vm_object_pip_add(obj, 1);
 389         for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
 390                 vm_page_t pp;
 391                 int nbytes = imin(PAGESIZE - off, len);
 392
 393                 if ((pp = page_busy(vp, start, off, nbytes)) != NULL) {
 394                         va = zfs_map_page(pp, &sf);
 395                         (void) dmu_read(os, zp->z_id, start + off, nbytes,
 396                             va + off, DMU_READ_PREFETCH);
 397                         zfs_unmap_page(sf);
 398                         page_unbusy(pp);
 399                 }
 400                 len -= nbytes;
 401                 off = 0;
 402         }
 403         vm_object_pip_wakeup(obj);
 404 }
 405
 406 /*
 407  * Read with UIO_NOCOPY flag means that sendfile(2) requests
 408  * ZFS to populate a range of page cache pages with data.
 409  *
 410  * NOTE: this function could be optimized to pre-allocate
 411  * all pages in advance, drain exclusive busy on all of them,
 412  * map them into contiguous KVA region and populate them
 413  * in one single dmu_read() call.
 414  */
 415 int
 416 mappedread_sf(znode_t *zp, int nbytes, zfs_uio_t *uio)
 417 {
 418         vnode_t *vp = ZTOV(zp);
 419         objset_t *os = zp->z_zfsvfs->z_os;
 420         struct sf_buf *sf;
 421         vm_object_t obj;
 422         vm_page_t pp;
 423         int64_t start;
 424         caddr_t va;
 425         int len = nbytes;
 426         int error = 0;
 427
 428         ASSERT3U(zfs_uio_segflg(uio), ==, UIO_NOCOPY);
 429         ASSERT3P(vp->v_mount, !=, NULL);
 430         obj = vp->v_object;
 431         ASSERT3P(obj, !=, NULL);
 432         ASSERT0(zfs_uio_offset(uio) & PAGEOFFSET);
 433
 434         for (start = zfs_uio_offset(uio); len > 0; start += PAGESIZE) {
 435                 int bytes = MIN(PAGESIZE, len);
 436
 437                 pp = vm_page_grab_unlocked(obj, OFF_TO_IDX(start),
 438                     VM_ALLOC_SBUSY | VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY);
 439                 if (vm_page_none_valid(pp)) {
 440                         va = zfs_map_page(pp, &sf);
 441                         error = dmu_read(os, zp->z_id, start, bytes, va,
 442                             DMU_READ_PREFETCH);
 443                         if (bytes != PAGESIZE && error == 0)
 444                                 memset(va + bytes, 0, PAGESIZE - bytes);
 445                         zfs_unmap_page(sf);
 446                         if (error == 0) {
 447                                 vm_page_valid(pp);
 448                                 vm_page_activate(pp);
 449                                 vm_page_sunbusy(pp);
 450                         } else {
 451                                 zfs_vmobject_wlock(obj);
 452                                 if (!vm_page_wired(pp) && pp->valid == 0 &&
 453                                     vm_page_busy_tryupgrade(pp))
 454                                         vm_page_free(pp);
 455                                 else
 456                                         vm_page_sunbusy(pp);
 457                                 zfs_vmobject_wunlock(obj);
 458                         }
 459                 } else {
 460                         ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
 461                         vm_page_sunbusy(pp);
 462                 }
 463                 if (error)
 464                         break;
 465                 zfs_uio_advance(uio, bytes);
 466                 len -= bytes;
 467         }
 468         return (error);
 469 }
 470
 471 /*
 472  * When a file is memory mapped, we must keep the IO data synchronized
 473  * between the DMU cache and the memory mapped pages.  What this means:
 474  *
 475  * On Read:     We "read" preferentially from memory mapped pages,
 476  *              else we default from the dmu buffer.
 477  *
 478  * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
 479  *       the file is memory mapped.
 480  */
 481 int
 482 mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
 483 {
 484         vnode_t *vp = ZTOV(zp);
 485         vm_object_t obj;
 486         int64_t start;
 487         int len = nbytes;
 488         int off;
 489         int error = 0;
 490
 491         ASSERT3P(vp->v_mount, !=, NULL);
 492         obj = vp->v_object;
 493         ASSERT3P(obj, !=, NULL);
 494
 495         start = zfs_uio_offset(uio);
 496         off = start & PAGEOFFSET;
 497         for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
 498                 vm_page_t pp;
 499                 uint64_t bytes = MIN(PAGESIZE - off, len);
 500
 501                 if ((pp = page_hold(vp, start))) {
 502                         struct sf_buf *sf;
 503                         caddr_t va;
 504
 505                         va = zfs_map_page(pp, &sf);
 506                         error = vn_io_fault_uiomove(va + off, bytes,
 507                             GET_UIO_STRUCT(uio));
 508                         zfs_unmap_page(sf);
 509                         page_unhold(pp);
 510                 } else {
 511                         error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
 512                             uio, bytes);
 513                 }
 514                 len -= bytes;
 515                 off = 0;
 516                 if (error)
 517                         break;
 518         }
 519         return (error);
 520 }
 521
 522 int
 523 zfs_write_simple(znode_t *zp, const void *data, size_t len,
 524     loff_t pos, size_t *presid)
 525 {
 526         int error = 0;
 527         ssize_t resid;
 528
 529         error = vn_rdwr(UIO_WRITE, ZTOV(zp), __DECONST(void *, data), len, pos,
 530             UIO_SYSSPACE, IO_SYNC, kcred, NOCRED, &resid, curthread);
 531
 532         if (error) {
 533                 return (SET_ERROR(error));
 534         } else if (presid == NULL) {
 535                 if (resid != 0) {
 536                         error = SET_ERROR(EIO);
 537                 }
 538         } else {
 539                 *presid = resid;
 540         }
 541         return (error);
 542 }
 543
 544 void
 545 zfs_zrele_async(znode_t *zp)
 546 {
 547         vnode_t *vp = ZTOV(zp);
 548         objset_t *os = ITOZSB(vp)->z_os;
 549
 550         VN_RELE_ASYNC(vp, dsl_pool_zrele_taskq(dmu_objset_pool(os)));
 551 }
 552
 553 static int
 554 zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp)
 555 {
 556         int error;
 557
 558         *vpp = arg;
 559         error = vn_lock(*vpp, lkflags);
 560         if (error != 0)
 561                 vrele(*vpp);
 562         return (error);
 563 }
 564
 565 static int
 566 zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags)
 567 {
 568         znode_t *zdp = VTOZ(dvp);
 569         zfsvfs_t *zfsvfs __unused = zdp->z_zfsvfs;
 570         int error;
 571         int ltype;
 572
 573         if (zfsvfs->z_replay == B_FALSE)
 574                 ASSERT_VOP_LOCKED(dvp, __func__);
 575
 576         if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
 577                 ASSERT3P(dvp, ==, vp);
 578                 vref(dvp);
 579                 ltype = lkflags & LK_TYPE_MASK;
 580                 if (ltype != VOP_ISLOCKED(dvp)) {
 581                         if (ltype == LK_EXCLUSIVE)
 582                                 vn_lock(dvp, LK_UPGRADE | LK_RETRY);
 583                         else /* if (ltype == LK_SHARED) */
 584                                 vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);
 585
 586                         /*
 587                          * Relock for the "." case could leave us with
 588                          * reclaimed vnode.
 589                          */
 590                         if (VN_IS_DOOMED(dvp)) {
 591                                 vrele(dvp);
 592                                 return (SET_ERROR(ENOENT));
 593                         }
 594                 }
 595                 return (0);
 596         } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
 597                 /*
 598                  * Note that in this case, dvp is the child vnode, and we
 599                  * are looking up the parent vnode - exactly reverse from
 600                  * normal operation.  Unlocking dvp requires some rather
 601                  * tricky unlock/relock dance to prevent mp from being freed;
 602                  * use vn_vget_ino_gen() which takes care of all that.
 603                  *
 604                  * XXX Note that there is a time window when both vnodes are
 605                  * unlocked.  It is possible, although highly unlikely, that
 606                  * during that window the parent-child relationship between
 607                  * the vnodes may change, for example, get reversed.
 608                  * In that case we would have a wrong lock order for the vnodes.
 609                  * All other filesystems seem to ignore this problem, so we
 610                  * do the same here.
 611                  * A potential solution could be implemented as follows:
 612                  * - using LK_NOWAIT when locking the second vnode and retrying
 613                  *   if necessary
 614                  * - checking that the parent-child relationship still holds
 615                  *   after locking both vnodes and retrying if it doesn't
 616                  */
 617                 error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp);
 618                 return (error);
 619         } else {
 620                 error = vn_lock(vp, lkflags);
 621                 if (error != 0)
 622                         vrele(vp);
 623                 return (error);
 624         }
 625 }
 626
 627 /*
 628  * Lookup an entry in a directory, or an extended attribute directory.
 629  * If it exists, return a held vnode reference for it.
 630  *
 631  *      IN:     dvp     - vnode of directory to search.
 632  *              nm      - name of entry to lookup.
 633  *              pnp     - full pathname to lookup [UNUSED].
 634  *              flags   - LOOKUP_XATTR set if looking for an attribute.
 635  *              rdir    - root directory vnode [UNUSED].
 636  *              cr      - credentials of caller.
 637  *              ct      - caller context
 638  *
 639  *      OUT:    vpp     - vnode of located entry, NULL if not found.
 640  *
 641  *      RETURN: 0 on success, error code on failure.
 642  *
 643  * Timestamps:
 644  *      NA
 645  */
 646 static int
 647 zfs_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp,
 648     struct componentname *cnp, int nameiop, cred_t *cr, int flags,
 649     boolean_t cached)
 650 {
 651         znode_t *zdp = VTOZ(dvp);
 652         znode_t *zp;
 653         zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
 654         seqc_t dvp_seqc;
 655         int     error = 0;
 656
 657         /*
 658          * Fast path lookup, however we must skip DNLC lookup
 659          * for case folding or normalizing lookups because the
 660          * DNLC code only stores the passed in name.  This means
 661          * creating 'a' and removing 'A' on a case insensitive
 662          * file system would work, but DNLC still thinks 'a'
 663          * exists and won't let you create it again on the next
 664          * pass through fast path.
 665          */
 666         if (!(flags & LOOKUP_XATTR)) {
 667                 if (dvp->v_type != VDIR) {
 668                         return (SET_ERROR(ENOTDIR));
 669                 } else if (zdp->z_sa_hdl == NULL) {
 670                         return (SET_ERROR(EIO));
 671                 }
 672         }
 673
 674         DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp,
 675             const char *, nm);
 676
 677         if ((error = zfs_enter_verify_zp(zfsvfs, zdp, FTAG)) != 0)
 678                 return (error);
 679
 680         dvp_seqc = vn_seqc_read_notmodify(dvp);
 681
 682         *vpp = NULL;
 683
 684         if (flags & LOOKUP_XATTR) {
 685                 /*
 686                  * If the xattr property is off, refuse the lookup request.
 687                  */
 688                 if (!(zfsvfs->z_flags & ZSB_XATTR)) {
 689                         zfs_exit(zfsvfs, FTAG);
 690                         return (SET_ERROR(EOPNOTSUPP));
 691                 }
 692
 693                 /*
 694                  * We don't allow recursive attributes..
 695                  * Maybe someday we will.
 696                  */
 697                 if (zdp->z_pflags & ZFS_XATTR) {
 698                         zfs_exit(zfsvfs, FTAG);
 699                         return (SET_ERROR(EINVAL));
 700                 }
 701
 702                 if ((error = zfs_get_xattrdir(VTOZ(dvp), &zp, cr, flags))) {
 703                         zfs_exit(zfsvfs, FTAG);
 704                         return (error);
 705                 }
 706                 *vpp = ZTOV(zp);
 707
 708                 /*
 709                  * Do we have permission to get into attribute directory?
 710                  */
 711                 error = zfs_zaccess(zp, ACE_EXECUTE, 0, B_FALSE, cr, NULL);
 712                 if (error) {
 713                         vrele(ZTOV(zp));
 714                 }
 715
 716                 zfs_exit(zfsvfs, FTAG);
 717                 return (error);
 718         }
 719
 720         /*
 721          * Check accessibility of directory if we're not coming in via
 722          * VOP_CACHEDLOOKUP.
 723          */
 724         if (!cached) {
 725 #ifdef NOEXECCHECK
 726                 if ((cnp->cn_flags & NOEXECCHECK) != 0) {
 727                         cnp->cn_flags &= ~NOEXECCHECK;
 728                 } else
 729 #endif
 730                 if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr,
 731                     NULL))) {
 732                         zfs_exit(zfsvfs, FTAG);
 733                         return (error);
 734                 }
 735         }
 736
 737         if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
 738             NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 739                 zfs_exit(zfsvfs, FTAG);
 740                 return (SET_ERROR(EILSEQ));
 741         }
 742
 743
 744         /*
 745          * First handle the special cases.
 746          */
 747         if ((cnp->cn_flags & ISDOTDOT) != 0) {
 748                 /*
 749                  * If we are a snapshot mounted under .zfs, return
 750                  * the vp for the snapshot directory.
 751                  */
 752                 if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) {
 753                         struct componentname cn;
 754                         vnode_t *zfsctl_vp;
 755                         int ltype;
 756
 757                         zfs_exit(zfsvfs, FTAG);
 758                         ltype = VOP_ISLOCKED(dvp);
 759                         VOP_UNLOCK(dvp);
 760                         error = zfsctl_root(zfsvfs->z_parent, LK_SHARED,
 761                             &zfsctl_vp);
 762                         if (error == 0) {
 763                                 cn.cn_nameptr = "snapshot";
 764                                 cn.cn_namelen = strlen(cn.cn_nameptr);
 765                                 cn.cn_nameiop = cnp->cn_nameiop;
 766                                 cn.cn_flags = cnp->cn_flags & ~ISDOTDOT;
 767                                 cn.cn_lkflags = cnp->cn_lkflags;
 768                                 error = VOP_LOOKUP(zfsctl_vp, vpp, &cn);
 769                                 vput(zfsctl_vp);
 770                         }
 771                         vn_lock(dvp, ltype | LK_RETRY);
 772                         return (error);
 773                 }
 774         }
 775         if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) {
 776                 zfs_exit(zfsvfs, FTAG);
 777                 if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP)
 778                         return (SET_ERROR(ENOTSUP));
 779                 error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp);
 780                 return (error);
 781         }
 782
 783         /*
 784          * The loop is retry the lookup if the parent-child relationship
 785          * changes during the dot-dot locking complexities.
 786          */
 787         for (;;) {
 788                 uint64_t parent;
 789
 790                 error = zfs_dirlook(zdp, nm, &zp);
 791                 if (error == 0)
 792                         *vpp = ZTOV(zp);
 793
 794                 zfs_exit(zfsvfs, FTAG);
 795                 if (error != 0)
 796                         break;
 797
 798                 error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags);
 799                 if (error != 0) {
 800                         /*
 801                          * If we've got a locking error, then the vnode
 802                          * got reclaimed because of a force unmount.
 803                          * We never enter doomed vnodes into the name cache.
 804                          */
 805                         *vpp = NULL;
 806                         return (error);
 807                 }
 808
 809                 if ((cnp->cn_flags & ISDOTDOT) == 0)
 810                         break;
 811
 812                 if ((error = zfs_enter(zfsvfs, FTAG)) != 0) {
 813                         vput(ZTOV(zp));
 814                         *vpp = NULL;
 815                         return (error);
 816                 }
 817                 if (zdp->z_sa_hdl == NULL) {
 818                         error = SET_ERROR(EIO);
 819                 } else {
 820                         error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
 821                             &parent, sizeof (parent));
 822                 }
 823                 if (error != 0) {
 824                         zfs_exit(zfsvfs, FTAG);
 825                         vput(ZTOV(zp));
 826                         break;
 827                 }
 828                 if (zp->z_id == parent) {
 829                         zfs_exit(zfsvfs, FTAG);
 830                         break;
 831                 }
 832                 vput(ZTOV(zp));
 833         }
 834
 835         if (error != 0)
 836                 *vpp = NULL;
 837
 838         /* Translate errors and add SAVENAME when needed. */
 839         if (cnp->cn_flags & ISLASTCN) {
 840                 switch (nameiop) {
 841                 case CREATE:
 842                 case RENAME:
 843                         if (error == ENOENT) {
 844                                 error = EJUSTRETURN;
 845 #if __FreeBSD_version < 1400068
 846                                 cnp->cn_flags |= SAVENAME;
 847 #endif
 848                                 break;
 849                         }
 850                         zfs_fallthrough;
 851                 case DELETE:
 852 #if __FreeBSD_version < 1400068
 853                         if (error == 0)
 854                                 cnp->cn_flags |= SAVENAME;
 855 #endif
 856                         break;
 857                 }
 858         }
 859
 860         if ((cnp->cn_flags & ISDOTDOT) != 0) {
 861                 /*
 862                  * FIXME: zfs_lookup_lock relocks vnodes and does nothing to
 863                  * handle races. In particular different callers may end up
 864                  * with different vnodes and will try to add conflicting
 865                  * entries to the namecache.
 866                  *
 867                  * While finding different result may be acceptable in face
 868                  * of concurrent modification, adding conflicting entries
 869                  * trips over an assert in the namecache.
 870                  *
 871                  * Ultimately let an entry through once everything settles.
 872                  */
 873                 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
 874                         cnp->cn_flags &= ~MAKEENTRY;
 875                 }
 876         }
 877
 878         /* Insert name into cache (as non-existent) if appropriate. */
 879         if (zfsvfs->z_use_namecache && !zfsvfs->z_replay &&
 880             error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0)
 881                 cache_enter(dvp, NULL, cnp);
 882
 883         /* Insert name into cache if appropriate. */
 884         if (zfsvfs->z_use_namecache && !zfsvfs->z_replay &&
 885             error == 0 && (cnp->cn_flags & MAKEENTRY)) {
 886                 if (!(cnp->cn_flags & ISLASTCN) ||
 887                     (nameiop != DELETE && nameiop != RENAME)) {
 888                         cache_enter(dvp, *vpp, cnp);
 889                 }
 890         }
 891
 892         return (error);
 893 }
 894
 895 /*
 896  * Attempt to create a new entry in a directory.  If the entry
 897  * already exists, truncate the file if permissible, else return
 898  * an error.  Return the vp of the created or trunc'd file.
 899  *
 900  *      IN:     dvp     - vnode of directory to put new file entry in.
 901  *              name    - name of new file entry.
 902  *              vap     - attributes of new file.
 903  *              excl    - flag indicating exclusive or non-exclusive mode.
 904  *              mode    - mode to open file with.
 905  *              cr      - credentials of caller.
 906  *              flag    - large file flag [UNUSED].
 907  *              ct      - caller context
 908  *              vsecp   - ACL to be set
 909  *              mnt_ns  - Unused on FreeBSD
 910  *
 911  *      OUT:    vpp     - vnode of created or trunc'd entry.
 912  *
 913  *      RETURN: 0 on success, error code on failure.
 914  *
 915  * Timestamps:
 916  *      dvp - ctime|mtime updated if new entry created
 917  *       vp - ctime|mtime always, atime if new
 918  */
 919 int
 920 zfs_create(znode_t *dzp, const char *name, vattr_t *vap, int excl, int mode,
 921     znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp, zidmap_t *mnt_ns)
 922 {
 923         (void) excl, (void) mode, (void) flag;
 924         znode_t         *zp;
 925         zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
 926         zilog_t         *zilog;
 927         objset_t        *os;
 928         dmu_tx_t        *tx;
 929         int             error;
 930         uid_t           uid = crgetuid(cr);
 931         gid_t           gid = crgetgid(cr);
 932         uint64_t        projid = ZFS_DEFAULT_PROJID;
 933         zfs_acl_ids_t   acl_ids;
 934         boolean_t       fuid_dirtied;
 935         uint64_t        txtype;
 936 #ifdef DEBUG_VFS_LOCKS
 937         vnode_t *dvp = ZTOV(dzp);
 938 #endif
 939
 940         /*
 941          * If we have an ephemeral id, ACL, or XVATTR then
 942          * make sure file system is at proper version
 943          */
 944         if (zfsvfs->z_use_fuids == B_FALSE &&
 945             (vsecp || (vap->va_mask & AT_XVATTR) ||
 946             IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
 947                 return (SET_ERROR(EINVAL));
 948
 949         if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
 950                 return (error);
 951         os = zfsvfs->z_os;
 952         zilog = zfsvfs->z_log;
 953
 954         if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
 955             NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 956                 zfs_exit(zfsvfs, FTAG);
 957                 return (SET_ERROR(EILSEQ));
 958         }
 959
 960         if (vap->va_mask & AT_XVATTR) {
 961                 if ((error = secpolicy_xvattr(ZTOV(dzp), (xvattr_t *)vap,
 962                     crgetuid(cr), cr, vap->va_type)) != 0) {
 963                         zfs_exit(zfsvfs, FTAG);
 964                         return (error);
 965                 }
 966         }
 967
 968         *zpp = NULL;
 969
 970         if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
 971                 vap->va_mode &= ~S_ISVTX;
 972
 973         error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
 974         if (error) {
 975                 zfs_exit(zfsvfs, FTAG);
 976                 return (error);
 977         }
 978         ASSERT3P(zp, ==, NULL);
 979
 980         /*
 981          * Create a new file object and update the directory
 982          * to reference it.
 983          */
 984         if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) {
 985                 goto out;
 986         }
 987
 988         /*
 989          * We only support the creation of regular files in
 990          * extended attribute directories.
 991          */
 992
 993         if ((dzp->z_pflags & ZFS_XATTR) &&
 994             (vap->va_type != VREG)) {
 995                 error = SET_ERROR(EINVAL);
 996                 goto out;
 997         }
 998
 999         if ((error = zfs_acl_ids_create(dzp, 0, vap,
1000             cr, vsecp, &acl_ids, NULL)) != 0)
1001                 goto out;
1002
1003         if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
1004                 projid = zfs_inherit_projid(dzp);
1005         if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
1006                 zfs_acl_ids_free(&acl_ids);
1007                 error = SET_ERROR(EDQUOT);
1008                 goto out;
1009         }
1010
1011         getnewvnode_reserve();
1012
1013         tx = dmu_tx_create(os);
1014
1015         dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1016             ZFS_SA_BASE_ATTR_SIZE);
1017
1018         fuid_dirtied = zfsvfs->z_fuid_dirty;
1019         if (fuid_dirtied)
1020                 zfs_fuid_txhold(zfsvfs, tx);
1021         dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
1022         dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
1023         if (!zfsvfs->z_use_sa &&
1024             acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1025                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1026                     0, acl_ids.z_aclp->z_acl_bytes);
1027         }
1028         error = dmu_tx_assign(tx, TXG_WAIT);
1029         if (error) {
1030                 zfs_acl_ids_free(&acl_ids);
1031                 dmu_tx_abort(tx);
1032                 getnewvnode_drop_reserve();
1033                 zfs_exit(zfsvfs, FTAG);
1034                 return (error);
1035         }
1036         zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1037
1038         error = zfs_link_create(dzp, name, zp, tx, ZNEW);
1039         if (error != 0) {
1040                 /*
1041                  * Since, we failed to add the directory entry for it,
1042                  * delete the newly created dnode.
1043                  */
1044                 zfs_znode_delete(zp, tx);
1045                 VOP_UNLOCK(ZTOV(zp));
1046                 zrele(zp);
1047                 zfs_acl_ids_free(&acl_ids);
1048                 dmu_tx_commit(tx);
1049                 getnewvnode_drop_reserve();
1050                 goto out;
1051         }
1052
1053         if (fuid_dirtied)
1054                 zfs_fuid_sync(zfsvfs, tx);
1055
1056         txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
1057         zfs_log_create(zilog, tx, txtype, dzp, zp, name,
1058             vsecp, acl_ids.z_fuidp, vap);
1059         zfs_acl_ids_free(&acl_ids);
1060         dmu_tx_commit(tx);
1061
1062         getnewvnode_drop_reserve();
1063
1064 out:
1065         VNCHECKREF(dvp);
1066         if (error == 0) {
1067                 *zpp = zp;
1068         }
1069
1070         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1071                 zil_commit(zilog, 0);
1072
1073         zfs_exit(zfsvfs, FTAG);
1074         return (error);
1075 }
1076
1077 /*
1078  * Remove an entry from a directory.
1079  *
1080  *      IN:     dvp     - vnode of directory to remove entry from.
1081  *              name    - name of entry to remove.
1082  *              cr      - credentials of caller.
1083  *              ct      - caller context
1084  *              flags   - case flags
1085  *
1086  *      RETURN: 0 on success, error code on failure.
1087  *
1088  * Timestamps:
1089  *      dvp - ctime|mtime
1090  *       vp - ctime (if nlink > 0)
1091  */
1092 static int
1093 zfs_remove_(vnode_t *dvp, vnode_t *vp, const char *name, cred_t *cr)
1094 {
1095         znode_t         *dzp = VTOZ(dvp);
1096         znode_t         *zp;
1097         znode_t         *xzp;
1098         zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
1099         zilog_t         *zilog;
1100         uint64_t        xattr_obj;
1101         uint64_t        obj = 0;
1102         dmu_tx_t        *tx;
1103         boolean_t       unlinked;
1104         uint64_t        txtype;
1105         int             error;
1106
1107
1108         if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
1109                 return (error);
1110         zp = VTOZ(vp);
1111         if ((error = zfs_verify_zp(zp)) != 0) {
1112                 zfs_exit(zfsvfs, FTAG);
1113                 return (error);
1114         }
1115         zilog = zfsvfs->z_log;
1116
1117         xattr_obj = 0;
1118         xzp = NULL;
1119
1120         if ((error = zfs_zaccess_delete(dzp, zp, cr, NULL))) {
1121                 goto out;
1122         }
1123
1124         /*
1125          * Need to use rmdir for removing directories.
1126          */
1127         if (vp->v_type == VDIR) {
1128                 error = SET_ERROR(EPERM);
1129                 goto out;
1130         }
1131
1132         vnevent_remove(vp, dvp, name, ct);
1133
1134         obj = zp->z_id;
1135
1136         /* are there any extended attributes? */
1137         error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1138             &xattr_obj, sizeof (xattr_obj));
1139         if (error == 0 && xattr_obj) {
1140                 error = zfs_zget(zfsvfs, xattr_obj, &xzp);
1141                 ASSERT0(error);
1142         }
1143
1144         /*
1145          * We may delete the znode now, or we may put it in the unlinked set;
1146          * it depends on whether we're the last link, and on whether there are
1147          * other holds on the vnode.  So we dmu_tx_hold() the right things to
1148          * allow for either case.
1149          */
1150         tx = dmu_tx_create(zfsvfs->z_os);
1151         dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1152         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1153         zfs_sa_upgrade_txholds(tx, zp);
1154         zfs_sa_upgrade_txholds(tx, dzp);
1155
1156         if (xzp) {
1157                 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
1158                 dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
1159         }
1160
1161         /* charge as an update -- would be nice not to charge at all */
1162         dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1163
1164         /*
1165          * Mark this transaction as typically resulting in a net free of space
1166          */
1167         dmu_tx_mark_netfree(tx);
1168
1169         error = dmu_tx_assign(tx, TXG_WAIT);
1170         if (error) {
1171                 dmu_tx_abort(tx);
1172                 zfs_exit(zfsvfs, FTAG);
1173                 return (error);
1174         }
1175
1176         /*
1177          * Remove the directory entry.
1178          */
1179         error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked);
1180
1181         if (error) {
1182                 dmu_tx_commit(tx);
1183                 goto out;
1184         }
1185
1186         if (unlinked) {
1187                 zfs_unlinked_add(zp, tx);
1188                 vp->v_vflag |= VV_NOSYNC;
1189         }
1190         /* XXX check changes to linux vnops */
1191         txtype = TX_REMOVE;
1192         zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked);
1193
1194         dmu_tx_commit(tx);
1195 out:
1196
1197         if (xzp)
1198                 vrele(ZTOV(xzp));
1199
1200         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1201                 zil_commit(zilog, 0);
1202
1203
1204         zfs_exit(zfsvfs, FTAG);
1205         return (error);
1206 }
1207
1208
1209 static int
1210 zfs_lookup_internal(znode_t *dzp, const char *name, vnode_t **vpp,
1211     struct componentname *cnp, int nameiop)
1212 {
1213         zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
1214         int error;
1215
1216         cnp->cn_nameptr = __DECONST(char *, name);
1217         cnp->cn_namelen = strlen(name);
1218         cnp->cn_nameiop = nameiop;
1219         cnp->cn_flags = ISLASTCN;
1220 #if __FreeBSD_version < 1400068
1221         cnp->cn_flags |= SAVENAME;
1222 #endif
1223         cnp->cn_lkflags = LK_EXCLUSIVE | LK_RETRY;
1224         cnp->cn_cred = kcred;
1225 #if __FreeBSD_version < 1400037
1226         cnp->cn_thread = curthread;
1227 #endif
1228
1229         if (zfsvfs->z_use_namecache && !zfsvfs->z_replay) {
1230                 struct vop_lookup_args a;
1231
1232                 a.a_gen.a_desc = &vop_lookup_desc;
1233                 a.a_dvp = ZTOV(dzp);
1234                 a.a_vpp = vpp;
1235                 a.a_cnp = cnp;
1236                 error = vfs_cache_lookup(&a);
1237         } else {
1238                 error = zfs_lookup(ZTOV(dzp), name, vpp, cnp, nameiop, kcred, 0,
1239                     B_FALSE);
1240         }
1241 #ifdef ZFS_DEBUG
1242         if (error) {
1243                 printf("got error %d on name %s on op %d\n", error, name,
1244                     nameiop);
1245                 kdb_backtrace();
1246         }
1247 #endif
1248         return (error);
1249 }
1250
1251 int
1252 zfs_remove(znode_t *dzp, const char *name, cred_t *cr, int flags)
1253 {
1254         vnode_t *vp;
1255         int error;
1256         struct componentname cn;
1257
1258         if ((error = zfs_lookup_internal(dzp, name, &vp, &cn, DELETE)))
1259                 return (error);
1260
1261         error = zfs_remove_(ZTOV(dzp), vp, name, cr);
1262         vput(vp);
1263         return (error);
1264 }
1265 /*
1266  * Create a new directory and insert it into dvp using the name
1267  * provided.  Return a pointer to the inserted directory.
1268  *
1269  *      IN:     dvp     - vnode of directory to add subdir to.
1270  *              dirname - name of new directory.
1271  *              vap     - attributes of new directory.
1272  *              cr      - credentials of caller.
1273  *              ct      - caller context
1274  *              flags   - case flags
1275  *              vsecp   - ACL to be set
1276  *              mnt_ns  - Unused on FreeBSD
1277  *
1278  *      OUT:    vpp     - vnode of created directory.
1279  *
1280  *      RETURN: 0 on success, error code on failure.
1281  *
1282  * Timestamps:
1283  *      dvp - ctime|mtime updated
1284  *       vp - ctime|mtime|atime updated
1285  */
1286 int
1287 zfs_mkdir(znode_t *dzp, const char *dirname, vattr_t *vap, znode_t **zpp,
1288     cred_t *cr, int flags, vsecattr_t *vsecp, zidmap_t *mnt_ns)
1289 {
1290         (void) flags, (void) vsecp;
1291         znode_t         *zp;
1292         zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
1293         zilog_t         *zilog;
1294         uint64_t        txtype;
1295         dmu_tx_t        *tx;
1296         int             error;
1297         uid_t           uid = crgetuid(cr);
1298         gid_t           gid = crgetgid(cr);
1299         zfs_acl_ids_t   acl_ids;
1300         boolean_t       fuid_dirtied;
1301
1302         ASSERT3U(vap->va_type, ==, VDIR);
1303
1304         /*
1305          * If we have an ephemeral id, ACL, or XVATTR then
1306          * make sure file system is at proper version
1307          */
1308         if (zfsvfs->z_use_fuids == B_FALSE &&
1309             ((vap->va_mask & AT_XVATTR) ||
1310             IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1311                 return (SET_ERROR(EINVAL));
1312
1313         if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
1314                 return (error);
1315         zilog = zfsvfs->z_log;
1316
1317         if (dzp->z_pflags & ZFS_XATTR) {
1318                 zfs_exit(zfsvfs, FTAG);
1319                 return (SET_ERROR(EINVAL));
1320         }
1321
1322         if (zfsvfs->z_utf8 && u8_validate(dirname,
1323             strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1324                 zfs_exit(zfsvfs, FTAG);
1325                 return (SET_ERROR(EILSEQ));
1326         }
1327
1328         if (vap->va_mask & AT_XVATTR) {
1329                 if ((error = secpolicy_xvattr(ZTOV(dzp), (xvattr_t *)vap,
1330                     crgetuid(cr), cr, vap->va_type)) != 0) {
1331                         zfs_exit(zfsvfs, FTAG);
1332                         return (error);
1333                 }
1334         }
1335
1336         if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
1337             NULL, &acl_ids, NULL)) != 0) {
1338                 zfs_exit(zfsvfs, FTAG);
1339                 return (error);
1340         }
1341
1342         /*
1343          * First make sure the new directory doesn't exist.
1344          *
1345          * Existence is checked first to make sure we don't return
1346          * EACCES instead of EEXIST which can cause some applications
1347          * to fail.
1348          */
1349         *zpp = NULL;
1350
1351         if ((error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW))) {
1352                 zfs_acl_ids_free(&acl_ids);
1353                 zfs_exit(zfsvfs, FTAG);
1354                 return (error);
1355         }
1356         ASSERT3P(zp, ==, NULL);
1357
1358         if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr,
1359             mnt_ns))) {
1360                 zfs_acl_ids_free(&acl_ids);
1361                 zfs_exit(zfsvfs, FTAG);
1362                 return (error);
1363         }
1364
1365         if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) {
1366                 zfs_acl_ids_free(&acl_ids);
1367                 zfs_exit(zfsvfs, FTAG);
1368                 return (SET_ERROR(EDQUOT));
1369         }
1370
1371         /*
1372          * Add a new entry to the directory.
1373          */
1374         getnewvnode_reserve();
1375         tx = dmu_tx_create(zfsvfs->z_os);
1376         dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
1377         dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
1378         fuid_dirtied = zfsvfs->z_fuid_dirty;
1379         if (fuid_dirtied)
1380                 zfs_fuid_txhold(zfsvfs, tx);
1381         if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1382                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
1383                     acl_ids.z_aclp->z_acl_bytes);
1384         }
1385
1386         dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1387             ZFS_SA_BASE_ATTR_SIZE);
1388
1389         error = dmu_tx_assign(tx, TXG_WAIT);
1390         if (error) {
1391                 zfs_acl_ids_free(&acl_ids);
1392                 dmu_tx_abort(tx);
1393                 getnewvnode_drop_reserve();
1394                 zfs_exit(zfsvfs, FTAG);
1395                 return (error);
1396         }
1397
1398         /*
1399          * Create new node.
1400          */
1401         zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1402
1403         /*
1404          * Now put new name in parent dir.
1405          */
1406         error = zfs_link_create(dzp, dirname, zp, tx, ZNEW);
1407         if (error != 0) {
1408                 zfs_znode_delete(zp, tx);
1409                 VOP_UNLOCK(ZTOV(zp));
1410                 zrele(zp);
1411                 goto out;
1412         }
1413
1414         if (fuid_dirtied)
1415                 zfs_fuid_sync(zfsvfs, tx);
1416
1417         *zpp = zp;
1418
1419         txtype = zfs_log_create_txtype(Z_DIR, NULL, vap);
1420         zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL,
1421             acl_ids.z_fuidp, vap);
1422
1423 out:
1424         zfs_acl_ids_free(&acl_ids);
1425
1426         dmu_tx_commit(tx);
1427
1428         getnewvnode_drop_reserve();
1429
1430         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1431                 zil_commit(zilog, 0);
1432
1433         zfs_exit(zfsvfs, FTAG);
1434         return (error);
1435 }
1436
1437 /*
1438  * Remove a directory subdir entry.  If the current working
1439  * directory is the same as the subdir to be removed, the
1440  * remove will fail.
1441  *
1442  *      IN:     dvp     - vnode of directory to remove from.
1443  *              name    - name of directory to be removed.
1444  *              cwd     - vnode of current working directory.
1445  *              cr      - credentials of caller.
1446  *              ct      - caller context
1447  *              flags   - case flags
1448  *
1449  *      RETURN: 0 on success, error code on failure.
1450  *
1451  * Timestamps:
1452  *      dvp - ctime|mtime updated
1453  */
1454 static int
1455 zfs_rmdir_(vnode_t *dvp, vnode_t *vp, const char *name, cred_t *cr)
1456 {
1457         znode_t         *dzp = VTOZ(dvp);
1458         znode_t         *zp = VTOZ(vp);
1459         zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
1460         zilog_t         *zilog;
1461         dmu_tx_t        *tx;
1462         int             error;
1463
1464         if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
1465                 return (error);
1466         if ((error = zfs_verify_zp(zp)) != 0) {
1467                 zfs_exit(zfsvfs, FTAG);
1468                 return (error);
1469         }
1470         zilog = zfsvfs->z_log;
1471
1472
1473         if ((error = zfs_zaccess_delete(dzp, zp, cr, NULL))) {
1474                 goto out;
1475         }
1476
1477         if (vp->v_type != VDIR) {
1478                 error = SET_ERROR(ENOTDIR);
1479                 goto out;
1480         }
1481
1482         vnevent_rmdir(vp, dvp, name, ct);
1483
1484         tx = dmu_tx_create(zfsvfs->z_os);
1485         dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1486         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1487         dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1488         zfs_sa_upgrade_txholds(tx, zp);
1489         zfs_sa_upgrade_txholds(tx, dzp);
1490         dmu_tx_mark_netfree(tx);
1491         error = dmu_tx_assign(tx, TXG_WAIT);
1492         if (error) {
1493                 dmu_tx_abort(tx);
1494                 zfs_exit(zfsvfs, FTAG);
1495                 return (error);
1496         }
1497
1498         error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL);
1499
1500         if (error == 0) {
1501                 uint64_t txtype = TX_RMDIR;
1502                 zfs_log_remove(zilog, tx, txtype, dzp, name,
1503                     ZFS_NO_OBJECT, B_FALSE);
1504         }
1505
1506         dmu_tx_commit(tx);
1507
1508         if (zfsvfs->z_use_namecache)
1509                 cache_vop_rmdir(dvp, vp);
1510 out:
1511         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1512                 zil_commit(zilog, 0);
1513
1514         zfs_exit(zfsvfs, FTAG);
1515         return (error);
1516 }
1517
1518 int
1519 zfs_rmdir(znode_t *dzp, const char *name, znode_t *cwd, cred_t *cr, int flags)
1520 {
1521         struct componentname cn;
1522         vnode_t *vp;
1523         int error;
1524
1525         if ((error = zfs_lookup_internal(dzp, name, &vp, &cn, DELETE)))
1526                 return (error);
1527
1528         error = zfs_rmdir_(ZTOV(dzp), vp, name, cr);
1529         vput(vp);
1530         return (error);
1531 }
1532
1533 /*
1534  * Read as many directory entries as will fit into the provided
1535  * buffer from the given directory cursor position (specified in
1536  * the uio structure).
1537  *
1538  *      IN:     vp      - vnode of directory to read.
1539  *              uio     - structure supplying read location, range info,
1540  *                        and return buffer.
1541  *              cr      - credentials of caller.
1542  *              ct      - caller context
1543  *
1544  *      OUT:    uio     - updated offset and range, buffer filled.
1545  *              eofp    - set to true if end-of-file detected.
1546  *              ncookies- number of entries in cookies
1547  *              cookies - offsets to directory entries
1548  *
1549  *      RETURN: 0 on success, error code on failure.
1550  *
1551  * Timestamps:
1552  *      vp - atime updated
1553  *
1554  * Note that the low 4 bits of the cookie returned by zap is always zero.
1555  * This allows us to use the low range for "special" directory entries:
1556  * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
1557  * we use the offset 2 for the '.zfs' directory.
1558  */
1559 static int
1560 zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp,
1561     int *ncookies, cookie_t **cookies)
1562 {
1563         znode_t         *zp = VTOZ(vp);
1564         iovec_t         *iovp;
1565         dirent64_t      *odp;
1566         zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
1567         objset_t        *os;
1568         caddr_t         outbuf;
1569         size_t          bufsize;
1570         zap_cursor_t    zc;
1571         zap_attribute_t zap;
1572         uint_t          bytes_wanted;
1573         uint64_t        offset; /* must be unsigned; checks for < 1 */
1574         uint64_t        parent;
1575         int             local_eof;
1576         int             outcount;
1577         int             error;
1578         uint8_t         prefetch;
1579         uint8_t         type;
1580         int             ncooks;
1581         cookie_t        *cooks = NULL;
1582
1583         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
1584                 return (error);
1585
1586         if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
1587             &parent, sizeof (parent))) != 0) {
1588                 zfs_exit(zfsvfs, FTAG);
1589                 return (error);
1590         }
1591
1592         /*
1593          * If we are not given an eof variable,
1594          * use a local one.
1595          */
1596         if (eofp == NULL)
1597                 eofp = &local_eof;
1598
1599         /*
1600          * Check for valid iov_len.
1601          */
1602         if (GET_UIO_STRUCT(uio)->uio_iov->iov_len <= 0) {
1603                 zfs_exit(zfsvfs, FTAG);
1604                 return (SET_ERROR(EINVAL));
1605         }
1606
1607         /*
1608          * Quit if directory has been removed (posix)
1609          */
1610         if ((*eofp = zp->z_unlinked) != 0) {
1611                 zfs_exit(zfsvfs, FTAG);
1612                 return (0);
1613         }
1614
1615         error = 0;
1616         os = zfsvfs->z_os;
1617         offset = zfs_uio_offset(uio);
1618         prefetch = zp->z_zn_prefetch;
1619
1620         /*
1621          * Initialize the iterator cursor.
1622          */
1623         if (offset <= 3) {
1624                 /*
1625                  * Start iteration from the beginning of the directory.
1626                  */
1627                 zap_cursor_init(&zc, os, zp->z_id);
1628         } else {
1629                 /*
1630                  * The offset is a serialized cursor.
1631                  */
1632                 zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
1633         }
1634
1635         /*
1636          * Get space to change directory entries into fs independent format.
1637          */
1638         iovp = GET_UIO_STRUCT(uio)->uio_iov;
1639         bytes_wanted = iovp->iov_len;
1640         if (zfs_uio_segflg(uio) != UIO_SYSSPACE || zfs_uio_iovcnt(uio) != 1) {
1641                 bufsize = bytes_wanted;
1642                 outbuf = kmem_alloc(bufsize, KM_SLEEP);
1643                 odp = (struct dirent64 *)outbuf;
1644         } else {
1645                 bufsize = bytes_wanted;
1646                 outbuf = NULL;
1647                 odp = (struct dirent64 *)iovp->iov_base;
1648         }
1649
1650         if (ncookies != NULL) {
1651                 /*
1652                  * Minimum entry size is dirent size and 1 byte for a file name.
1653                  */
1654                 ncooks = zfs_uio_resid(uio) / (sizeof (struct dirent) -
1655                     sizeof (((struct dirent *)NULL)->d_name) + 1);
1656                 cooks = malloc(ncooks * sizeof (*cooks), M_TEMP, M_WAITOK);
1657                 *cookies = cooks;
1658                 *ncookies = ncooks;
1659         }
1660
1661         /*
1662          * Transform to file-system independent format
1663          */
1664         outcount = 0;
1665         while (outcount < bytes_wanted) {
1666                 ino64_t objnum;
1667                 ushort_t reclen;
1668                 off64_t *next = NULL;
1669
1670                 /*
1671                  * Special case `.', `..', and `.zfs'.
1672                  */
1673                 if (offset == 0) {
1674                         (void) strcpy(zap.za_name, ".");
1675                         zap.za_normalization_conflict = 0;
1676                         objnum = zp->z_id;
1677                         type = DT_DIR;
1678                 } else if (offset == 1) {
1679                         (void) strcpy(zap.za_name, "..");
1680                         zap.za_normalization_conflict = 0;
1681                         objnum = parent;
1682                         type = DT_DIR;
1683                 } else if (offset == 2 && zfs_show_ctldir(zp)) {
1684                         (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
1685                         zap.za_normalization_conflict = 0;
1686                         objnum = ZFSCTL_INO_ROOT;
1687                         type = DT_DIR;
1688                 } else {
1689                         /*
1690                          * Grab next entry.
1691                          */
1692                         if ((error = zap_cursor_retrieve(&zc, &zap))) {
1693                                 if ((*eofp = (error == ENOENT)) != 0)
1694                                         break;
1695                                 else
1696                                         goto update;
1697                         }
1698
1699                         if (zap.za_integer_length != 8 ||
1700                             zap.za_num_integers != 1) {
1701                                 cmn_err(CE_WARN, "zap_readdir: bad directory "
1702                                     "entry, obj = %lld, offset = %lld\n",
1703                                     (u_longlong_t)zp->z_id,
1704                                     (u_longlong_t)offset);
1705                                 error = SET_ERROR(ENXIO);
1706                                 goto update;
1707                         }
1708
1709                         objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
1710                         /*
1711                          * MacOS X can extract the object type here such as:
1712                          * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
1713                          */
1714                         type = ZFS_DIRENT_TYPE(zap.za_first_integer);
1715                 }
1716
1717                 reclen = DIRENT64_RECLEN(strlen(zap.za_name));
1718
1719                 /*
1720                  * Will this entry fit in the buffer?
1721                  */
1722                 if (outcount + reclen > bufsize) {
1723                         /*
1724                          * Did we manage to fit anything in the buffer?
1725                          */
1726                         if (!outcount) {
1727                                 error = SET_ERROR(EINVAL);
1728                                 goto update;
1729                         }
1730                         break;
1731                 }
1732                 /*
1733                  * Add normal entry:
1734                  */
1735                 odp->d_ino = objnum;
1736                 odp->d_reclen = reclen;
1737                 odp->d_namlen = strlen(zap.za_name);
1738                 /* NOTE: d_off is the offset for the *next* entry. */
1739                 next = &odp->d_off;
1740                 strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
1741                 odp->d_type = type;
1742                 dirent_terminate(odp);
1743                 odp = (dirent64_t *)((intptr_t)odp + reclen);
1744
1745                 outcount += reclen;
1746
1747                 ASSERT3S(outcount, <=, bufsize);
1748
1749                 if (prefetch)
1750                         dmu_prefetch_dnode(os, objnum, ZIO_PRIORITY_SYNC_READ);
1751
1752                 /*
1753                  * Move to the next entry, fill in the previous offset.
1754                  */
1755                 if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
1756                         zap_cursor_advance(&zc);
1757                         offset = zap_cursor_serialize(&zc);
1758                 } else {
1759                         offset += 1;
1760                 }
1761
1762                 /* Fill the offset right after advancing the cursor. */
1763                 if (next != NULL)
1764                         *next = offset;
1765                 if (cooks != NULL) {
1766                         *cooks++ = offset;
1767                         ncooks--;
1768                         KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
1769                 }
1770         }
1771         zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
1772
1773         /* Subtract unused cookies */
1774         if (ncookies != NULL)
1775                 *ncookies -= ncooks;
1776
1777         if (zfs_uio_segflg(uio) == UIO_SYSSPACE && zfs_uio_iovcnt(uio) == 1) {
1778                 iovp->iov_base += outcount;
1779                 iovp->iov_len -= outcount;
1780                 zfs_uio_resid(uio) -= outcount;
1781         } else if ((error =
1782             zfs_uiomove(outbuf, (long)outcount, UIO_READ, uio))) {
1783                 /*
1784                  * Reset the pointer.
1785                  */
1786                 offset = zfs_uio_offset(uio);
1787         }
1788
1789 update:
1790         zap_cursor_fini(&zc);
1791         if (zfs_uio_segflg(uio) != UIO_SYSSPACE || zfs_uio_iovcnt(uio) != 1)
1792                 kmem_free(outbuf, bufsize);
1793
1794         if (error == ENOENT)
1795                 error = 0;
1796
1797         ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
1798
1799         zfs_uio_setoffset(uio, offset);
1800         zfs_exit(zfsvfs, FTAG);
1801         if (error != 0 && cookies != NULL) {
1802                 free(*cookies, M_TEMP);
1803                 *cookies = NULL;
1804                 *ncookies = 0;
1805         }
1806         return (error);
1807 }
1808
1809 /*
1810  * Get the requested file attributes and place them in the provided
1811  * vattr structure.  On success all basic attributes are filled in.
1812  *
1813  *      IN:     vp      - vnode of file.
1814  *              vap     - va_mask identifies requested attributes.
1815  *                        If AT_XVATTR set, then optional attrs are requested
1816  *              flags   - ATTR_NOACLCHECK (CIFS server context)
1817  *              cr      - credentials of caller.
1818  *
1819  *      OUT:    vap     - attribute values.
1820  *
1821  *      RETURN: 0 on success, error code on failure.
1822  */
1823 static int
1824 zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
1825 {
1826         znode_t *zp = VTOZ(vp);
1827         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1828         int     error = 0;
1829         uint32_t blksize;
1830         u_longlong_t nblocks;
1831         uint64_t mtime[2], ctime[2], crtime[2], rdev; /* bulk lookup targets */
1832         xvattr_t *xvap = (xvattr_t *)vap;       /* vap may be an xvattr_t * */
1833         xoptattr_t *xoap = NULL;
1834         boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
1835         sa_bulk_attr_t bulk[4]; /* MTIME, CTIME, CRTIME and optionally RDEV */
1836         int count = 0;
1837
1838         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
1839                 return (error); /* enter failed: no zfs_exit() on this path */
1840         /* Fill va_uid/va_gid first; va_uid feeds the owner test below. */
1841         zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
1842
1843         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
1844         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
1845         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
1846         if (vp->v_type == VBLK || vp->v_type == VCHR) /* rdev: device nodes only */
1847                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
1848                     &rdev, 8);
1849
1850         if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
1851                 zfs_exit(zfsvfs, FTAG);
1852                 return (error);
1853         }
1854
1855         /*
1856          * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
1857          * Also, if we are the owner don't bother, since owner should
1858          * always be allowed to read basic attributes of file.
1859          */
1860         if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
1861             (vap->va_uid != crgetuid(cr))) {
1862                 if ((error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
1863                     skipaclchk, cr, NULL))) {
1864                         zfs_exit(zfsvfs, FTAG);
1865                         return (error);
1866                 }
1867         }
1868
1869         /*
1870          * Return all attributes.  It's cheaper to provide the answer
1871          * than to determine whether we were asked the question.
1872          */
1873
1874         vap->va_type = IFTOVT(zp->z_mode);
1875         vap->va_mode = zp->z_mode & ~S_IFMT;
1876         vn_fsid(vp, vap);
1877         vap->va_nodeid = zp->z_id;
1878         vap->va_nlink = zp->z_links;
1879         if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp) &&
1880             zp->z_links < ZFS_LINK_MAX)
1881                 vap->va_nlink++; /* extra link when the ctldir is shown on the root */
1882         vap->va_size = zp->z_size;
1883         if (vp->v_type == VBLK || vp->v_type == VCHR)
1884                 vap->va_rdev = zfs_cmpldev(rdev);
1885         else
1886                 vap->va_rdev = 0;
1887         vap->va_gen = zp->z_gen;
1888         vap->va_flags = 0;      /* FreeBSD: Reset chflags(2) flags. */
1889         vap->va_filerev = zp->z_seq;
1890
1891         /*
1892          * Add in any requested optional attributes (only if z_use_fuids)
1893          * and set the corresponding bits in the returned attribute bitmap.
1894          */
1895         if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
1896                 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
1897                         xoap->xoa_archive =
1898                             ((zp->z_pflags & ZFS_ARCHIVE) != 0);
1899                         XVA_SET_RTN(xvap, XAT_ARCHIVE);
1900                 }
1901
1902                 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
1903                         xoap->xoa_readonly =
1904                             ((zp->z_pflags & ZFS_READONLY) != 0);
1905                         XVA_SET_RTN(xvap, XAT_READONLY);
1906                 }
1907
1908                 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
1909                         xoap->xoa_system =
1910                             ((zp->z_pflags & ZFS_SYSTEM) != 0);
1911                         XVA_SET_RTN(xvap, XAT_SYSTEM);
1912                 }
1913
1914                 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
1915                         xoap->xoa_hidden =
1916                             ((zp->z_pflags & ZFS_HIDDEN) != 0);
1917                         XVA_SET_RTN(xvap, XAT_HIDDEN);
1918                 }
1919
1920                 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
1921                         xoap->xoa_nounlink =
1922                             ((zp->z_pflags & ZFS_NOUNLINK) != 0);
1923                         XVA_SET_RTN(xvap, XAT_NOUNLINK);
1924                 }
1925
1926                 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
1927                         xoap->xoa_immutable =
1928                             ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
1929                         XVA_SET_RTN(xvap, XAT_IMMUTABLE);
1930                 }
1931
1932                 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
1933                         xoap->xoa_appendonly =
1934                             ((zp->z_pflags & ZFS_APPENDONLY) != 0);
1935                         XVA_SET_RTN(xvap, XAT_APPENDONLY);
1936                 }
1937
1938                 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
1939                         xoap->xoa_nodump =
1940                             ((zp->z_pflags & ZFS_NODUMP) != 0);
1941                         XVA_SET_RTN(xvap, XAT_NODUMP);
1942                 }
1943
1944                 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
1945                         xoap->xoa_opaque =
1946                             ((zp->z_pflags & ZFS_OPAQUE) != 0);
1947                         XVA_SET_RTN(xvap, XAT_OPAQUE);
1948                 }
1949
1950                 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
1951                         xoap->xoa_av_quarantined =
1952                             ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
1953                         XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
1954                 }
1955
1956                 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
1957                         xoap->xoa_av_modified =
1958                             ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
1959                         XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
1960                 }
1961
1962                 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
1963                     vp->v_type == VREG) {
1964                         zfs_sa_get_scanstamp(zp, xvap); /* no XVA_SET_RTN here; presumably the helper's job - confirm */
1965                 }
1966
1967                 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
1968                         xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
1969                         XVA_SET_RTN(xvap, XAT_REPARSE);
1970                 }
1971                 if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
1972                         xoap->xoa_generation = zp->z_gen;
1973                         XVA_SET_RTN(xvap, XAT_GEN);
1974                 }
1975
1976                 if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
1977                         xoap->xoa_offline =
1978                             ((zp->z_pflags & ZFS_OFFLINE) != 0);
1979                         XVA_SET_RTN(xvap, XAT_OFFLINE);
1980                 }
1981
1982                 if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
1983                         xoap->xoa_sparse =
1984                             ((zp->z_pflags & ZFS_SPARSE) != 0);
1985                         XVA_SET_RTN(xvap, XAT_SPARSE);
1986                 }
1987
1988                 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
1989                         xoap->xoa_projinherit =
1990                             ((zp->z_pflags & ZFS_PROJINHERIT) != 0);
1991                         XVA_SET_RTN(xvap, XAT_PROJINHERIT);
1992                 }
1993
1994                 if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
1995                         xoap->xoa_projid = zp->z_projid;
1996                         XVA_SET_RTN(xvap, XAT_PROJID);
1997                 }
1998         }
1999
2000         ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime); /* from the znode, not the SA lookup */
2001         ZFS_TIME_DECODE(&vap->va_mtime, mtime);
2002         ZFS_TIME_DECODE(&vap->va_ctime, ctime);
2003         ZFS_TIME_DECODE(&vap->va_birthtime, crtime); /* create time */
2004
2005         /* Block size and 512-byte block count as reported by sa_object_size(). */
2006         sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
2007         vap->va_blksize = blksize;
2008         vap->va_bytes = nblocks << 9;   /* nblocks * 512 */
2009
2010         if (zp->z_blksz == 0) {
2011                 /*
2012                  * Block size hasn't been set; suggest maximal I/O transfers.
2013                  */
2014                 vap->va_blksize = zfsvfs->z_max_blksz;
2015         }
2016
2017         zfs_exit(zfsvfs, FTAG);
2018         return (0);
2019 }
2020
2021 /*
2022  * Set the file attributes to the values contained in the
2023  * vattr structure.
2024  *
2025  *      IN:     zp      - znode of file to be modified.
2026  *              vap     - new attribute values.
2027  *                        If AT_XVATTR set, then optional attrs are being set
2028  *              flags   - ATTR_UTIME set if non-default time values provided.
2029  *                      - ATTR_NOACLCHECK (CIFS context only).
2030  *              cr      - credentials of caller.
2031  *              mnt_ns  - Unused on FreeBSD
2032  *
2033  *      RETURN: 0 on success, error code on failure.
2034  *
2035  * Timestamps:
2036  *      vp - ctime updated, mtime updated if size changed.
2037  */
2038 int
2039 zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns)
2040 {
2041         vnode_t         *vp = ZTOV(zp);
2042         zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
2043         objset_t        *os;
2044         zilog_t         *zilog;
2045         dmu_tx_t        *tx;
2046         vattr_t         oldva;
2047         xvattr_t        tmpxvattr;
2048         uint_t          mask = vap->va_mask;
2049         uint_t          saved_mask = 0;
2050         uint64_t        saved_mode;
2051         int             trim_mask = 0;
2052         uint64_t        new_mode;
2053         uint64_t        new_uid, new_gid;
2054         uint64_t        xattr_obj;
2055         uint64_t        mtime[2], ctime[2];
2056         uint64_t        projid = ZFS_INVALID_PROJID;
2057         znode_t         *attrzp;
2058         int             need_policy = FALSE;
2059         int             err, err2;
2060         zfs_fuid_info_t *fuidp = NULL;
2061         xvattr_t *xvap = (xvattr_t *)vap;       /* vap may be an xvattr_t * */
2062         xoptattr_t      *xoap;
2063         zfs_acl_t       *aclp;
2064         boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2065         boolean_t       fuid_dirtied = B_FALSE;
2066         sa_bulk_attr_t  bulk[7], xattr_bulk[7];
2067         int             count = 0, xattr_count = 0;
2068
2069         if (mask == 0)
2070                 return (0);
2071
2072         if (mask & AT_NOSET)
2073                 return (SET_ERROR(EINVAL));
2074
2075         if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
2076                 return (err);
2077
2078         os = zfsvfs->z_os;
2079         zilog = zfsvfs->z_log;
2080
2081         /*
2082          * Make sure that if we have ephemeral uid/gid or xvattr specified
2083          * that file system is at proper version level
2084          */
2085
2086         if (zfsvfs->z_use_fuids == B_FALSE &&
2087             (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
2088             ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
2089             (mask & AT_XVATTR))) {
2090                 zfs_exit(zfsvfs, FTAG);
2091                 return (SET_ERROR(EINVAL));
2092         }
2093
2094         if (mask & AT_SIZE && vp->v_type == VDIR) {
2095                 zfs_exit(zfsvfs, FTAG);
2096                 return (SET_ERROR(EISDIR));
2097         }
2098
2099         if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
2100                 zfs_exit(zfsvfs, FTAG);
2101                 return (SET_ERROR(EINVAL));
2102         }
2103
2104         /*
2105          * If this is an xvattr_t, then get a pointer to the structure of
2106          * optional attributes.  If this is NULL, then we have a vattr_t.
2107          */
2108         xoap = xva_getxoptattr(xvap);
2109
2110         xva_init(&tmpxvattr);
2111
2112         /*
2113          * Immutable files can only alter immutable bit and atime
2114          */
2115         if ((zp->z_pflags & ZFS_IMMUTABLE) &&
2116             ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
2117             ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
2118                 zfs_exit(zfsvfs, FTAG);
2119                 return (SET_ERROR(EPERM));
2120         }
2121
2122         /*
2123          * Note: ZFS_READONLY is handled in zfs_zaccess_common.
2124          */
2125
2126         /*
2127          * Verify timestamps doesn't overflow 32 bits.
2128          * ZFS can handle large timestamps, but 32bit syscalls can't
2129          * handle times greater than 2039.  This check should be removed
2130          * once large timestamps are fully supported.
2131          */
2132         if (mask & (AT_ATIME | AT_MTIME)) {
2133                 if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2134                     ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2135                         zfs_exit(zfsvfs, FTAG);
2136                         return (SET_ERROR(EOVERFLOW));
2137                 }
2138         }
2139         if (xoap != NULL && (mask & AT_XVATTR)) {
2140                 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME) &&
2141                     TIMESPEC_OVERFLOW(&vap->va_birthtime)) {
2142                         zfs_exit(zfsvfs, FTAG);
2143                         return (SET_ERROR(EOVERFLOW));
2144                 }
2145
2146                 if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
2147                         if (!dmu_objset_projectquota_enabled(os) ||
2148                             (!S_ISREG(zp->z_mode) && !S_ISDIR(zp->z_mode))) {
2149                                 zfs_exit(zfsvfs, FTAG);
2150                                 return (SET_ERROR(EOPNOTSUPP));
2151                         }
2152
2153                         projid = xoap->xoa_projid;
2154                         if (unlikely(projid == ZFS_INVALID_PROJID)) {
2155                                 zfs_exit(zfsvfs, FTAG);
2156                                 return (SET_ERROR(EINVAL));
2157                         }
2158
2159                         if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID)
2160                                 projid = ZFS_INVALID_PROJID;
2161                         else
2162                                 need_policy = TRUE;
2163                 }
2164
2165                 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) &&
2166                     (xoap->xoa_projinherit !=
2167                     ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) &&
2168                     (!dmu_objset_projectquota_enabled(os) ||
2169                     (!S_ISREG(zp->z_mode) && !S_ISDIR(zp->z_mode)))) {
2170                         zfs_exit(zfsvfs, FTAG);
2171                         return (SET_ERROR(EOPNOTSUPP));
2172                 }
2173         }
2174
2175         attrzp = NULL;
2176         aclp = NULL;
2177
2178         if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
2179                 zfs_exit(zfsvfs, FTAG);
2180                 return (SET_ERROR(EROFS));
2181         }
2182
2183         /*
2184          * First validate permissions
2185          */
2186
2187         if (mask & AT_SIZE) {
2188                 /*
2189                  * XXX - Note, we are not providing any open
2190                  * mode flags here (like FNDELAY), so we may
2191                  * block if there are locks present... this
2192                  * should be addressed in openat().
2193                  */
2194                 /* XXX - would it be OK to generate a log record here? */
2195                 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
2196                 if (err) {
2197                         zfs_exit(zfsvfs, FTAG);
2198                         return (err);
2199                 }
2200         }
2201
2202         if (mask & (AT_ATIME|AT_MTIME) ||
2203             ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
2204             XVA_ISSET_REQ(xvap, XAT_READONLY) ||
2205             XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
2206             XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
2207             XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
2208             XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
2209             XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
2210                 need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
2211                     skipaclchk, cr, mnt_ns);
2212         }
2213
2214         if (mask & (AT_UID|AT_GID)) {
2215                 int     idmask = (mask & (AT_UID|AT_GID));
2216                 int     take_owner;
2217                 int     take_group;
2218
2219                 /*
2220                  * NOTE: even if a new mode is being set,
2221                  * we may clear S_ISUID/S_ISGID bits.
2222                  */
2223
2224                 if (!(mask & AT_MODE))
2225                         vap->va_mode = zp->z_mode;
2226
2227                 /*
2228                  * Take ownership or chgrp to group we are a member of
2229                  */
2230
2231                 take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
2232                 take_group = (mask & AT_GID) &&
2233                     zfs_groupmember(zfsvfs, vap->va_gid, cr);
2234
2235                 /*
2236                  * If both AT_UID and AT_GID are set then take_owner and
2237                  * take_group must both be set in order to allow taking
2238                  * ownership.
2239                  *
2240                  * Otherwise, send the check through secpolicy_vnode_setattr()
2241                  *
2242                  */
2243
2244                 if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
2245                     ((idmask == AT_UID) && take_owner) ||
2246                     ((idmask == AT_GID) && take_group)) {
2247                         if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
2248                             skipaclchk, cr, mnt_ns) == 0) {
2249                                 /*
2250                                  * Remove setuid/setgid for non-privileged users
2251                                  */
2252                                 secpolicy_setid_clear(vap, vp, cr);
2253                                 trim_mask = (mask & (AT_UID|AT_GID));
2254                         } else {
2255                                 need_policy =  TRUE;
2256                         }
2257                 } else {
2258                         need_policy =  TRUE;
2259                 }
2260         }
2261
2262         oldva.va_mode = zp->z_mode;
2263         zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
2264         if (mask & AT_XVATTR) {
2265                 /*
2266                  * Update xvattr mask to include only those attributes
2267                  * that are actually changing.
2268                  *
2269                  * the bits will be restored prior to actually setting
2270                  * the attributes so the caller thinks they were set.
2271                  */
2272                 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2273                         if (xoap->xoa_appendonly !=
2274                             ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
2275                                 need_policy = TRUE;
2276                         } else {
2277                                 XVA_CLR_REQ(xvap, XAT_APPENDONLY);
2278                                 XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
2279                         }
2280                 }
2281
2282                 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
2283                         if (xoap->xoa_projinherit !=
2284                             ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) {
2285                                 need_policy = TRUE;
2286                         } else {
2287                                 XVA_CLR_REQ(xvap, XAT_PROJINHERIT);
2288                                 XVA_SET_REQ(&tmpxvattr, XAT_PROJINHERIT);
2289                         }
2290                 }
2291
2292                 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2293                         if (xoap->xoa_nounlink !=
2294                             ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
2295                                 need_policy = TRUE;
2296                         } else {
2297                                 XVA_CLR_REQ(xvap, XAT_NOUNLINK);
2298                                 XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
2299                         }
2300                 }
2301
2302                 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2303                         if (xoap->xoa_immutable !=
2304                             ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
2305                                 need_policy = TRUE;
2306                         } else {
2307                                 XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
2308                                 XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
2309                         }
2310                 }
2311
2312                 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2313                         if (xoap->xoa_nodump !=
2314                             ((zp->z_pflags & ZFS_NODUMP) != 0)) {
2315                                 need_policy = TRUE;
2316                         } else {
2317                                 XVA_CLR_REQ(xvap, XAT_NODUMP);
2318                                 XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
2319                         }
2320                 }
2321
2322                 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2323                         if (xoap->xoa_av_modified !=
2324                             ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
2325                                 need_policy = TRUE;
2326                         } else {
2327                                 XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
2328                                 XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
2329                         }
2330                 }
2331
2332                 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2333                         if ((vp->v_type != VREG &&
2334                             xoap->xoa_av_quarantined) ||
2335                             xoap->xoa_av_quarantined !=
2336                             ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
2337                                 need_policy = TRUE;
2338                         } else {
2339                                 XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
2340                                 XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
2341                         }
2342                 }
2343
2344                 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2345                         zfs_exit(zfsvfs, FTAG);
2346                         return (SET_ERROR(EPERM));
2347                 }
2348
2349                 if (need_policy == FALSE &&
2350                     (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
2351                     XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
2352                         need_policy = TRUE;
2353                 }
2354         }
2355
2356         if (mask & AT_MODE) {
2357                 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr,
2358                     mnt_ns) == 0) {
2359                         err = secpolicy_setid_setsticky_clear(vp, vap,
2360                             &oldva, cr);
2361                         if (err) {
2362                                 zfs_exit(zfsvfs, FTAG);
2363                                 return (err);
2364                         }
2365                         trim_mask |= AT_MODE;
2366                 } else {
2367                         need_policy = TRUE;
2368                 }
2369         }
2370
2371         if (need_policy) {
2372                 /*
2373                  * If trim_mask is set then take ownership
2374                  * has been granted or write_acl is present and user
2375                  * has the ability to modify mode.  In that case remove
2376                  * UID|GID and or MODE from mask so that
2377                  * secpolicy_vnode_setattr() doesn't revoke it.
2378                  */
2379
2380                 if (trim_mask) {
2381                         saved_mask = vap->va_mask;
2382                         vap->va_mask &= ~trim_mask;
2383                         if (trim_mask & AT_MODE) {
2384                                 /*
2385                                  * Save the mode, as secpolicy_vnode_setattr()
2386                                  * will overwrite it with ova.va_mode.
2387                                  */
2388                                 saved_mode = vap->va_mode;
2389                         }
2390                 }
2391                 err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
2392                     (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
2393                 if (err) {
2394                         zfs_exit(zfsvfs, FTAG);
2395                         return (err);
2396                 }
2397
2398                 if (trim_mask) {
2399                         vap->va_mask |= saved_mask;
2400                         if (trim_mask & AT_MODE) {
2401                                 /*
2402                                  * Recover the mode after
2403                                  * secpolicy_vnode_setattr().
2404                                  */
2405                                 vap->va_mode = saved_mode;
2406                         }
2407                 }
2408         }
2409
2410         /*
2411          * secpolicy_vnode_setattr, or take ownership may have
2412          * changed va_mask
2413          */
2414         mask = vap->va_mask;
2415
2416         if ((mask & (AT_UID | AT_GID)) || projid != ZFS_INVALID_PROJID) {
2417                 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
2418                     &xattr_obj, sizeof (xattr_obj));
2419
2420                 if (err == 0 && xattr_obj) {
2421                         err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
2422                         if (err == 0) {
2423                                 err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE);
2424                                 if (err != 0)
2425                                         vrele(ZTOV(attrzp));
2426                         }
2427                         if (err)
2428                                 goto out2;
2429                 }
2430                 if (mask & AT_UID) {
2431                         new_uid = zfs_fuid_create(zfsvfs,
2432                             (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
2433                         if (new_uid != zp->z_uid &&
2434                             zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT,
2435                             new_uid)) {
2436                                 if (attrzp)
2437                                         vput(ZTOV(attrzp));
2438                                 err = SET_ERROR(EDQUOT);
2439                                 goto out2;
2440                         }
2441                 }
2442
2443                 if (mask & AT_GID) {
2444                         new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
2445                             cr, ZFS_GROUP, &fuidp);
2446                         if (new_gid != zp->z_gid &&
2447                             zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT,
2448                             new_gid)) {
2449                                 if (attrzp)
2450                                         vput(ZTOV(attrzp));
2451                                 err = SET_ERROR(EDQUOT);
2452                                 goto out2;
2453                         }
2454                 }
2455
2456                 if (projid != ZFS_INVALID_PROJID &&
2457                     zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) {
2458                         if (attrzp)
2459                                 vput(ZTOV(attrzp));
2460                         err = SET_ERROR(EDQUOT);
2461                         goto out2;
2462                 }
2463         }
2464         tx = dmu_tx_create(os);
2465
2466         if (mask & AT_MODE) {
2467                 uint64_t pmode = zp->z_mode;
2468                 uint64_t acl_obj;
2469                 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
2470
2471                 if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
2472                     !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
2473                         err = SET_ERROR(EPERM);
2474                         goto out;
2475                 }
2476
2477                 if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)))
2478                         goto out;
2479
2480                 if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
2481                         /*
2482                          * Are we upgrading ACL from old V0 format
2483                          * to V1 format?
2484                          */
2485                         if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
2486                             zfs_znode_acl_version(zp) ==
2487                             ZFS_ACL_VERSION_INITIAL) {
2488                                 dmu_tx_hold_free(tx, acl_obj, 0,
2489                                     DMU_OBJECT_END);
2490                                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2491                                     0, aclp->z_acl_bytes);
2492                         } else {
2493                                 dmu_tx_hold_write(tx, acl_obj, 0,
2494                                     aclp->z_acl_bytes);
2495                         }
2496                 } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2497                         dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2498                             0, aclp->z_acl_bytes);
2499                 }
2500                 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
2501         } else {
2502                 if (((mask & AT_XVATTR) &&
2503                     XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ||
2504                     (projid != ZFS_INVALID_PROJID &&
2505                     !(zp->z_pflags & ZFS_PROJID)))
2506                         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
2507                 else
2508                         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2509         }
2510
2511         if (attrzp) {
2512                 dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
2513         }
2514
2515         fuid_dirtied = zfsvfs->z_fuid_dirty;
2516         if (fuid_dirtied)
2517                 zfs_fuid_txhold(zfsvfs, tx);
2518
2519         zfs_sa_upgrade_txholds(tx, zp);
2520
2521         err = dmu_tx_assign(tx, TXG_WAIT);
2522         if (err)
2523                 goto out;
2524
2525         count = 0;
2526         /*
2527          * Set each attribute requested.
2528          * We group settings according to the locks they need to acquire.
2529          *
2530          * Note: you cannot set ctime directly, although it will be
2531          * updated as a side-effect of calling this function.
2532          */
2533
2534         if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) {
                /*
                 * An existing object that was upgraded from an older system
                 * has no slot for the project ID attribute in its on-disk
                 * layout.  The quota accounting logic, however, accesses the
                 * related slots directly by offset, so we need to adjust the
                 * old object's layout to place the project ID at a unified,
                 * fixed offset.
                 */
2542                 if (attrzp)
2543                         err = sa_add_projid(attrzp->z_sa_hdl, tx, projid);
2544                 if (err == 0)
2545                         err = sa_add_projid(zp->z_sa_hdl, tx, projid);
2546
2547                 if (unlikely(err == EEXIST))
2548                         err = 0;
2549                 else if (err != 0)
2550                         goto out;
2551                 else
2552                         projid = ZFS_INVALID_PROJID;
2553         }
2554
2555         if (mask & (AT_UID|AT_GID|AT_MODE))
2556                 mutex_enter(&zp->z_acl_lock);
2557
2558         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
2559             &zp->z_pflags, sizeof (zp->z_pflags));
2560
2561         if (attrzp) {
2562                 if (mask & (AT_UID|AT_GID|AT_MODE))
2563                         mutex_enter(&attrzp->z_acl_lock);
2564                 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2565                     SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
2566                     sizeof (attrzp->z_pflags));
2567                 if (projid != ZFS_INVALID_PROJID) {
2568                         attrzp->z_projid = projid;
2569                         SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2570                             SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid,
2571                             sizeof (attrzp->z_projid));
2572                 }
2573         }
2574
2575         if (mask & (AT_UID|AT_GID)) {
2576
2577                 if (mask & AT_UID) {
2578                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
2579                             &new_uid, sizeof (new_uid));
2580                         zp->z_uid = new_uid;
2581                         if (attrzp) {
2582                                 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2583                                     SA_ZPL_UID(zfsvfs), NULL, &new_uid,
2584                                     sizeof (new_uid));
2585                                 attrzp->z_uid = new_uid;
2586                         }
2587                 }
2588
2589                 if (mask & AT_GID) {
2590                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
2591                             NULL, &new_gid, sizeof (new_gid));
2592                         zp->z_gid = new_gid;
2593                         if (attrzp) {
2594                                 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2595                                     SA_ZPL_GID(zfsvfs), NULL, &new_gid,
2596                                     sizeof (new_gid));
2597                                 attrzp->z_gid = new_gid;
2598                         }
2599                 }
2600                 if (!(mask & AT_MODE)) {
2601                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
2602                             NULL, &new_mode, sizeof (new_mode));
2603                         new_mode = zp->z_mode;
2604                 }
2605                 err = zfs_acl_chown_setattr(zp);
2606                 ASSERT0(err);
2607                 if (attrzp) {
2608                         vn_seqc_write_begin(ZTOV(attrzp));
2609                         err = zfs_acl_chown_setattr(attrzp);
2610                         vn_seqc_write_end(ZTOV(attrzp));
2611                         ASSERT0(err);
2612                 }
2613         }
2614
2615         if (mask & AT_MODE) {
2616                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
2617                     &new_mode, sizeof (new_mode));
2618                 zp->z_mode = new_mode;
2619                 ASSERT3P(aclp, !=, NULL);
2620                 err = zfs_aclset_common(zp, aclp, cr, tx);
2621                 ASSERT0(err);
2622                 if (zp->z_acl_cached)
2623                         zfs_acl_free(zp->z_acl_cached);
2624                 zp->z_acl_cached = aclp;
2625                 aclp = NULL;
2626         }
2627
2628
2629         if (mask & AT_ATIME) {
2630                 ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
2631                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
2632                     &zp->z_atime, sizeof (zp->z_atime));
2633         }
2634
2635         if (mask & AT_MTIME) {
2636                 ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
2637                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
2638                     mtime, sizeof (mtime));
2639         }
2640
2641         if (projid != ZFS_INVALID_PROJID) {
2642                 zp->z_projid = projid;
2643                 SA_ADD_BULK_ATTR(bulk, count,
2644                     SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid,
2645                     sizeof (zp->z_projid));
2646         }
2647
2648         /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
2649         if (mask & AT_SIZE && !(mask & AT_MTIME)) {
2650                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
2651                     NULL, mtime, sizeof (mtime));
2652                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
2653                     &ctime, sizeof (ctime));
2654                 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
2655         } else if (mask != 0) {
2656                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
2657                     &ctime, sizeof (ctime));
2658                 zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime);
2659                 if (attrzp) {
2660                         SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2661                             SA_ZPL_CTIME(zfsvfs), NULL,
2662                             &ctime, sizeof (ctime));
2663                         zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
2664                             mtime, ctime);
2665                 }
2666         }
2667
2668         /*
2669          * Do this after setting timestamps to prevent timestamp
2670          * update from toggling bit
2671          */
2672
2673         if (xoap && (mask & AT_XVATTR)) {
2674
2675                 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME))
2676                         xoap->xoa_createtime = vap->va_birthtime;
2677                 /*
2678                  * restore trimmed off masks
2679                  * so that return masks can be set for caller.
2680                  */
2681
2682                 if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
2683                         XVA_SET_REQ(xvap, XAT_APPENDONLY);
2684                 }
2685                 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
2686                         XVA_SET_REQ(xvap, XAT_NOUNLINK);
2687                 }
2688                 if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
2689                         XVA_SET_REQ(xvap, XAT_IMMUTABLE);
2690                 }
2691                 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
2692                         XVA_SET_REQ(xvap, XAT_NODUMP);
2693                 }
2694                 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
2695                         XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
2696                 }
2697                 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
2698                         XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
2699                 }
2700                 if (XVA_ISSET_REQ(&tmpxvattr, XAT_PROJINHERIT)) {
2701                         XVA_SET_REQ(xvap, XAT_PROJINHERIT);
2702                 }
2703
2704                 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
2705                         ASSERT3S(vp->v_type, ==, VREG);
2706
2707                 zfs_xvattr_set(zp, xvap, tx);
2708         }
2709
2710         if (fuid_dirtied)
2711                 zfs_fuid_sync(zfsvfs, tx);
2712
2713         if (mask != 0)
2714                 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
2715
2716         if (mask & (AT_UID|AT_GID|AT_MODE))
2717                 mutex_exit(&zp->z_acl_lock);
2718
2719         if (attrzp) {
2720                 if (mask & (AT_UID|AT_GID|AT_MODE))
2721                         mutex_exit(&attrzp->z_acl_lock);
2722         }
2723 out:
2724         if (err == 0 && attrzp) {
2725                 err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
2726                     xattr_count, tx);
2727                 ASSERT0(err2);
2728         }
2729
2730         if (attrzp)
2731                 vput(ZTOV(attrzp));
2732
2733         if (aclp)
2734                 zfs_acl_free(aclp);
2735
2736         if (fuidp) {
2737                 zfs_fuid_info_free(fuidp);
2738                 fuidp = NULL;
2739         }
2740
2741         if (err) {
2742                 dmu_tx_abort(tx);
2743         } else {
2744                 err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
2745                 dmu_tx_commit(tx);
2746         }
2747
2748 out2:
2749         if (os->os_sync == ZFS_SYNC_ALWAYS)
2750                 zil_commit(zilog, 0);
2751
2752         zfs_exit(zfsvfs, FTAG);
2753         return (err);
2754 }
2755
2756 /*
2757  * Look up the directory entries corresponding to the source and target
2758  * directory/name pairs.
2759  */
2760 static int
2761 zfs_rename_relock_lookup(znode_t *sdzp, const struct componentname *scnp,
2762     znode_t **szpp, znode_t *tdzp, const struct componentname *tcnp,
2763     znode_t **tzpp)
2764 {
2765         zfsvfs_t *zfsvfs;
2766         znode_t *szp, *tzp;
2767         int error;
2768
2769         /*
2770          * Before using sdzp and tdzp we must ensure that they are live.
2771          * As a porting legacy from illumos we have two things to worry
2772          * about.  One is typical for FreeBSD and it is that the vnode is
2773          * not reclaimed (doomed).  The other is that the znode is live.
2774          * The current code can invalidate the znode without acquiring the
2775          * corresponding vnode lock if the object represented by the znode
2776          * and vnode is no longer valid after a rollback or receive operation.
2777          * z_teardown_lock hidden behind zfs_enter and zfs_exit is the lock
2778          * that protects the znodes from the invalidation.
2779          */
2780         zfsvfs = sdzp->z_zfsvfs;
2781         ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs);
2782         if ((error = zfs_enter_verify_zp(zfsvfs, sdzp, FTAG)) != 0)
2783                 return (error);
2784         if ((error = zfs_verify_zp(tdzp)) != 0) {
2785                 zfs_exit(zfsvfs, FTAG);
2786                 return (error);
2787         }
2788
2789         /*
2790          * Re-resolve svp to be certain it still exists and fetch the
2791          * correct vnode.
2792          */
2793         error = zfs_dirent_lookup(sdzp, scnp->cn_nameptr, &szp, ZEXISTS);
2794         if (error != 0) {
2795                 /* Source entry invalid or not there. */
2796                 if ((scnp->cn_flags & ISDOTDOT) != 0 ||
2797                     (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.'))
2798                         error = SET_ERROR(EINVAL);
2799                 goto out;
2800         }
2801         *szpp = szp;
2802
2803         /*
2804          * Re-resolve tvp, if it disappeared we just carry on.
2805          */
2806         error = zfs_dirent_lookup(tdzp, tcnp->cn_nameptr, &tzp, 0);
2807         if (error != 0) {
2808                 vrele(ZTOV(szp));
2809                 if ((tcnp->cn_flags & ISDOTDOT) != 0)
2810                         error = SET_ERROR(EINVAL);
2811                 goto out;
2812         }
2813         *tzpp = tzp;
2814 out:
2815         zfs_exit(zfsvfs, FTAG);
2816         return (error);
2817 }
2818
/*
 * Re-acquire the vnode locks needed for a rename, re-resolving the source
 * and target entries while the directory locks are held.
 *
 * We acquire all but fdvp locks using non-blocking acquisitions.  If we
 * fail to acquire any lock in the path we will drop all held locks,
 * acquire the new lock in a blocking fashion, and then release it and
 * restart the rename.  This acquire/release step ensures that we do not
 * spin on a lock waiting for release.  On error release all vnode locks
 * and decrement references the way tmpfs_rename() would do.
 *
 * NOTE(review): "fdvp" above presumably refers to sdvp, which is the only
 * vnode this function locks with a blocking vn_lock() on the first attempt.
 *
 *      IN:     sdvp    - source directory vnode.
 *              tdvp    - target directory vnode; unlocked on entry by this
 *                        function before the lock sequence starts.
 *              scnp    - source entry name, used for the re-lookup.
 *              tcnp    - target entry name, used for the re-lookup.
 *      INOUT:  svpp    - source vnode; replaced by the vnode found by the
 *                        re-lookup, the previously stored reference being
 *                        released with vrele().
 *              tvpp    - target vnode; if non-NULL and distinct from tdvp
 *                        it is unlocked on entry.  Once the source is
 *                        locked the stored reference is released and the
 *                        slot is set to the re-looked-up target, or NULL
 *                        if there is none.
 *
 *      RETURN: 0 with sdvp, tdvp, *svpp and (if non-NULL) *tvpp locked
 *              exclusively; otherwise an error code with none of the locks
 *              taken by this function still held.
 */
static int
zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp,
    struct vnode *tdvp, struct vnode **tvpp,
    const struct componentname *scnp, const struct componentname *tcnp)
{
        struct vnode    *nvp, *svp, *tvp;
        znode_t         *sdzp, *tdzp, *szp, *tzp;
        int             error;

        /* Drop the locks held on the target side before starting over. */
        VOP_UNLOCK(tdvp);
        if (*tvpp != NULL && *tvpp != tdvp)
                VOP_UNLOCK(*tvpp);

relock:
        /* The source directory is the one lock we are willing to sleep on. */
        error = vn_lock(sdvp, LK_EXCLUSIVE);
        if (error)
                goto out;
        error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT);
        if (error != 0) {
                VOP_UNLOCK(sdvp);
                if (error != EBUSY)
                        goto out;
                /*
                 * tdvp was busy: with nothing else held, wait for it,
                 * release it straight away and retry from the top.
                 */
                error = vn_lock(tdvp, LK_EXCLUSIVE);
                if (error)
                        goto out;
                VOP_UNLOCK(tdvp);
                goto relock;
        }
        tdzp = VTOZ(tdvp);
        sdzp = VTOZ(sdvp);

        /* Both directories are locked; resolve the two names again. */
        error = zfs_rename_relock_lookup(sdzp, scnp, &szp, tdzp, tcnp, &tzp);
        if (error != 0) {
                VOP_UNLOCK(sdvp);
                VOP_UNLOCK(tdvp);
                goto out;
        }
        svp = ZTOV(szp);
        tvp = tzp != NULL ? ZTOV(tzp) : NULL;

        /*
         * Now try acquire locks on svp and tvp.
         */
        nvp = svp;
        error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
        if (error != 0) {
                VOP_UNLOCK(sdvp);
                VOP_UNLOCK(tdvp);
                /* The target will be looked up again on retry. */
                if (tvp != NULL)
                        vrele(tvp);
                if (error != EBUSY) {
                        vrele(nvp);
                        goto out;
                }
                /* Wait for the source vnode with no other locks held. */
                error = vn_lock(nvp, LK_EXCLUSIVE);
                if (error != 0) {
                        vrele(nvp);
                        goto out;
                }
                VOP_UNLOCK(nvp);
                /*
                 * Concurrent rename race.
                 * XXX ?
                 */
                if (nvp == tdvp) {
                        vrele(nvp);
                        error = SET_ERROR(EINVAL);
                        goto out;
                }
                /*
                 * Keep the reference from the lookup in the caller's slot,
                 * releasing the one stored there before, and start over.
                 */
                vrele(*svpp);
                *svpp = nvp;
                goto relock;
        }
        /* Source locked: publish it in place of the caller's old vnode. */
        vrele(*svpp);
        *svpp = nvp;

        /* The caller's old target reference is no longer needed. */
        if (*tvpp != NULL)
                vrele(*tvpp);
        *tvpp = NULL;
        if (tvp != NULL) {
                nvp = tvp;
                error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
                if (error != 0) {
                        VOP_UNLOCK(sdvp);
                        VOP_UNLOCK(tdvp);
                        VOP_UNLOCK(*svpp);
                        if (error != EBUSY) {
                                vrele(nvp);
                                goto out;
                        }
                        error = vn_lock(nvp, LK_EXCLUSIVE);
                        if (error != 0) {
                                vrele(nvp);
                                goto out;
                        }
                        /*
                         * Unlock and release the target in one step; the
                         * retry looks it up again.
                         */
                        vput(nvp);
                        goto relock;
                }
                *tvpp = nvp;
        }

        return (0);

out:
        return (error);
}
2933
2934 /*
2935  * Note that we must use VRELE_ASYNC in this function as it walks
2936  * up the directory tree and vrele may need to acquire an exclusive
2937  * lock if a last reference to a vnode is dropped.
2938  */
2939 static int
2940 zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp)
2941 {
2942         zfsvfs_t        *zfsvfs;
2943         znode_t         *zp, *zp1;
2944         uint64_t        parent;
2945         int             error;
2946
2947         zfsvfs = tdzp->z_zfsvfs;
2948         if (tdzp == szp)
2949                 return (SET_ERROR(EINVAL));
2950         if (tdzp == sdzp)
2951                 return (0);
2952         if (tdzp->z_id == zfsvfs->z_root)
2953                 return (0);
2954         zp = tdzp;
2955         for (;;) {
2956                 ASSERT(!zp->z_unlinked);
2957                 if ((error = sa_lookup(zp->z_sa_hdl,
2958                     SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
2959                         break;
2960
2961                 if (parent == szp->z_id) {
2962                         error = SET_ERROR(EINVAL);
2963                         break;
2964                 }
2965                 if (parent == zfsvfs->z_root)
2966                         break;
2967                 if (parent == sdzp->z_id)
2968                         break;
2969
2970                 error = zfs_zget(zfsvfs, parent, &zp1);
2971                 if (error != 0)
2972                         break;
2973
2974                 if (zp != tdzp)
2975                         VN_RELE_ASYNC(ZTOV(zp),
2976                             dsl_pool_zrele_taskq(
2977                             dmu_objset_pool(zfsvfs->z_os)));
2978                 zp = zp1;
2979         }
2980
2981         if (error == ENOTDIR)
2982                 panic("checkpath: .. not a directory\n");
2983         if (zp != tdzp)
2984                 VN_RELE_ASYNC(ZTOV(zp),
2985                     dsl_pool_zrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
2986         return (error);
2987 }
2988
/*
 * Forward declaration: zfs_do_rename() calls zfs_do_rename_impl() before
 * its definition appears further down in this file.
 */
static int
zfs_do_rename_impl(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
    vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp,
    cred_t *cr);
2993
2994 /*
2995  * Move an entry from the provided source directory to the target
2996  * directory.  Change the entry name as indicated.
2997  *
2998  *      IN:     sdvp    - Source directory containing the "old entry".
2999  *              scnp    - Old entry name.
3000  *              tdvp    - Target directory to contain the "new entry".
3001  *              tcnp    - New entry name.
3002  *              cr      - credentials of caller.
3003  *      INOUT:  svpp    - Source file
3004  *              tvpp    - Target file, may point to NULL initially
3005  *
3006  *      RETURN: 0 on success, error code on failure.
3007  *
3008  * Timestamps:
3009  *      sdvp,tdvp - ctime|mtime updated
3010  */
3011 static int
3012 zfs_do_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
3013     vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp,
3014     cred_t *cr)
3015 {
3016         int     error;
3017
3018         ASSERT_VOP_ELOCKED(tdvp, __func__);
3019         if (*tvpp != NULL)
3020                 ASSERT_VOP_ELOCKED(*tvpp, __func__);
3021
3022         /* Reject renames across filesystems. */
3023         if ((*svpp)->v_mount != tdvp->v_mount ||
3024             ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) {
3025                 error = SET_ERROR(EXDEV);
3026                 goto out;
3027         }
3028
3029         if (zfsctl_is_node(tdvp)) {
3030                 error = SET_ERROR(EXDEV);
3031                 goto out;
3032         }
3033
3034         /*
3035          * Lock all four vnodes to ensure safety and semantics of renaming.
3036          */
3037         error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp);
3038         if (error != 0) {
3039                 /* no vnodes are locked in the case of error here */
3040                 return (error);
3041         }
3042
3043         error = zfs_do_rename_impl(sdvp, svpp, scnp, tdvp, tvpp, tcnp, cr);
3044         VOP_UNLOCK(sdvp);
3045         VOP_UNLOCK(*svpp);
3046 out:
3047         if (*tvpp != NULL)
3048                 VOP_UNLOCK(*tvpp);
3049         if (tdvp != *tvpp)
3050                 VOP_UNLOCK(tdvp);
3051
3052         return (error);
3053 }
3054
3055 static int
3056 zfs_do_rename_impl(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
3057     vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp,
3058     cred_t *cr)
3059 {
3060         dmu_tx_t        *tx;
3061         zfsvfs_t        *zfsvfs;
3062         zilog_t         *zilog;
3063         znode_t         *tdzp, *sdzp, *tzp, *szp;
3064         const char      *snm = scnp->cn_nameptr;
3065         const char      *tnm = tcnp->cn_nameptr;
3066         int             error;
3067
3068         tdzp = VTOZ(tdvp);
3069         sdzp = VTOZ(sdvp);
3070         zfsvfs = tdzp->z_zfsvfs;
3071
3072         if ((error = zfs_enter_verify_zp(zfsvfs, tdzp, FTAG)) != 0)
3073                 return (error);
3074         if ((error = zfs_verify_zp(sdzp)) != 0) {
3075                 zfs_exit(zfsvfs, FTAG);
3076                 return (error);
3077         }
3078         zilog = zfsvfs->z_log;
3079
3080         if (zfsvfs->z_utf8 && u8_validate(tnm,
3081             strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3082                 error = SET_ERROR(EILSEQ);
3083                 goto out;
3084         }
3085
3086         /* If source and target are the same file, there is nothing to do. */
3087         if ((*svpp) == (*tvpp)) {
3088                 error = 0;
3089                 goto out;
3090         }
3091
3092         if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) ||
3093             ((*tvpp) != NULL && (*tvpp)->v_type == VDIR &&
3094             (*tvpp)->v_mountedhere != NULL)) {
3095                 error = SET_ERROR(EXDEV);
3096                 goto out;
3097         }
3098
3099         szp = VTOZ(*svpp);
3100         if ((error = zfs_verify_zp(szp)) != 0) {
3101                 zfs_exit(zfsvfs, FTAG);
3102                 return (error);
3103         }
3104         tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp);
3105         if (tzp != NULL) {
3106                 if ((error = zfs_verify_zp(tzp)) != 0) {
3107                         zfs_exit(zfsvfs, FTAG);
3108                         return (error);
3109                 }
3110         }
3111
3112         /*
3113          * This is to prevent the creation of links into attribute space
3114          * by renaming a linked file into/outof an attribute directory.
3115          * See the comment in zfs_link() for why this is considered bad.
3116          */
3117         if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
3118                 error = SET_ERROR(EINVAL);
3119                 goto out;
3120         }
3121
        /*
         * With project inheritance in use, i.e. when the directory has
         * ZFS_PROJINHERIT set, its descendant directories inherit not only
         * the project ID but also the ZFS_PROJINHERIT flag.  In that case
         * we only allow renames into our tree when the project IDs are
         * the same.
         */
3129         if (tdzp->z_pflags & ZFS_PROJINHERIT &&
3130             tdzp->z_projid != szp->z_projid) {
3131                 error = SET_ERROR(EXDEV);
3132                 goto out;
3133         }
3134
3135         /*
3136          * Must have write access at the source to remove the old entry
3137          * and write access at the target to create the new entry.
3138          * Note that if target and source are the same, this can be
3139          * done in a single check.
3140          */
3141         if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr, NULL)))
3142                 goto out;
3143
3144         if ((*svpp)->v_type == VDIR) {
3145                 /*
3146                  * Avoid ".", "..", and aliases of "." for obvious reasons.
3147                  */
3148                 if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') ||
3149                     sdzp == szp ||
3150                     (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) {
3151                         error = EINVAL;
3152                         goto out;
3153                 }
3154
3155                 /*
3156                  * Check to make sure rename is valid.
3157                  * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
3158                  */
3159                 if ((error = zfs_rename_check(szp, sdzp, tdzp)))
3160                         goto out;
3161         }
3162
3163         /*
3164          * Does target exist?
3165          */
3166         if (tzp) {
3167                 /*
3168                  * Source and target must be the same type.
3169                  */
3170                 if ((*svpp)->v_type == VDIR) {
3171                         if ((*tvpp)->v_type != VDIR) {
3172                                 error = SET_ERROR(ENOTDIR);
3173                                 goto out;
3174                         } else {
3175                                 cache_purge(tdvp);
3176                                 if (sdvp != tdvp)
3177                                         cache_purge(sdvp);
3178                         }
3179                 } else {
3180                         if ((*tvpp)->v_type == VDIR) {
3181                                 error = SET_ERROR(EISDIR);
3182                                 goto out;
3183                         }
3184                 }
3185         }
3186
3187         vn_seqc_write_begin(*svpp);
3188         vn_seqc_write_begin(sdvp);
3189         if (*tvpp != NULL)
3190                 vn_seqc_write_begin(*tvpp);
3191         if (tdvp != *tvpp)
3192                 vn_seqc_write_begin(tdvp);
3193
3194         vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct);
3195         if (tzp)
3196                 vnevent_rename_dest(*tvpp, tdvp, tnm, ct);
3197
3198         /*
3199          * notify the target directory if it is not the same
3200          * as source directory.
3201          */
3202         if (tdvp != sdvp) {
3203                 vnevent_rename_dest_dir(tdvp, ct);
3204         }
3205
3206         tx = dmu_tx_create(zfsvfs->z_os);
3207         dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
3208         dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
3209         dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
3210         dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
3211         if (sdzp != tdzp) {
3212                 dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
3213                 zfs_sa_upgrade_txholds(tx, tdzp);
3214         }
3215         if (tzp) {
3216                 dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
3217                 zfs_sa_upgrade_txholds(tx, tzp);
3218         }
3219
3220         zfs_sa_upgrade_txholds(tx, szp);
3221         dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
3222         error = dmu_tx_assign(tx, TXG_WAIT);
3223         if (error) {
3224                 dmu_tx_abort(tx);
3225                 goto out_seq;
3226         }
3227
3228         if (tzp)        /* Attempt to remove the existing target */
3229                 error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL);
3230
3231         if (error == 0) {
3232                 error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING);
3233                 if (error == 0) {
3234                         szp->z_pflags |= ZFS_AV_MODIFIED;
3235
3236                         error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
3237                             (void *)&szp->z_pflags, sizeof (uint64_t), tx);
3238                         ASSERT0(error);
3239
3240                         error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING,
3241                             NULL);
3242                         if (error == 0) {
3243                                 zfs_log_rename(zilog, tx, TX_RENAME, sdzp,
3244                                     snm, tdzp, tnm, szp);
3245                         } else {
3246                                 /*
3247                                  * At this point, we have successfully created
3248                                  * the target name, but have failed to remove
3249                                  * the source name.  Since the create was done
3250                                  * with the ZRENAMING flag, there are
3251                                  * complications; for one, the link count is
3252                                  * wrong.  The easiest way to deal with this
3253                                  * is to remove the newly created target, and
3254                                  * return the original error.  This must
3255                                  * succeed; fortunately, it is very unlikely to
3256                                  * fail, since we just created it.
3257                                  */
3258                                 VERIFY0(zfs_link_destroy(tdzp, tnm, szp, tx,
3259                                     ZRENAMING, NULL));
3260                         }
3261                 }
3262                 if (error == 0) {
3263                         cache_vop_rename(sdvp, *svpp, tdvp, *tvpp, scnp, tcnp);
3264                 }
3265         }
3266
3267         dmu_tx_commit(tx);
3268
3269 out_seq:
3270         vn_seqc_write_end(*svpp);
3271         vn_seqc_write_end(sdvp);
3272         if (*tvpp != NULL)
3273                 vn_seqc_write_end(*tvpp);
3274         if (tdvp != *tvpp)
3275                 vn_seqc_write_end(tdvp);
3276
3277 out:
3278         if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3279                 zil_commit(zilog, 0);
3280         zfs_exit(zfsvfs, FTAG);
3281
3282         return (error);
3283 }
3284
3285 int
3286 zfs_rename(znode_t *sdzp, const char *sname, znode_t *tdzp, const char *tname,
3287     cred_t *cr, int flags, uint64_t rflags, vattr_t *wo_vap, zidmap_t *mnt_ns)
3288 {
3289         struct componentname scn, tcn;
3290         vnode_t *sdvp, *tdvp;
3291         vnode_t *svp, *tvp;
3292         int error;
3293         svp = tvp = NULL;
3294
3295         if (rflags != 0 || wo_vap != NULL)
3296                 return (SET_ERROR(EINVAL));
3297
3298         sdvp = ZTOV(sdzp);
3299         tdvp = ZTOV(tdzp);
3300         error = zfs_lookup_internal(sdzp, sname, &svp, &scn, DELETE);
3301         if (sdzp->z_zfsvfs->z_replay == B_FALSE)
3302                 VOP_UNLOCK(sdvp);
3303         if (error != 0)
3304                 goto fail;
3305         VOP_UNLOCK(svp);
3306
3307         vn_lock(tdvp, LK_EXCLUSIVE | LK_RETRY);
3308         error = zfs_lookup_internal(tdzp, tname, &tvp, &tcn, RENAME);
3309         if (error == EJUSTRETURN)
3310                 tvp = NULL;
3311         else if (error != 0) {
3312                 VOP_UNLOCK(tdvp);
3313                 goto fail;
3314         }
3315
3316         error = zfs_do_rename(sdvp, &svp, &scn, tdvp, &tvp, &tcn, cr);
3317 fail:
3318         if (svp != NULL)
3319                 vrele(svp);
3320         if (tvp != NULL)
3321                 vrele(tvp);
3322
3323         return (error);
3324 }
3325
3326 /*
3327  * Insert the indicated symbolic reference entry into the directory.
3328  *
3329  *      IN:     dvp     - Directory to contain new symbolic link.
3330  *              link    - Name for new symlink entry.
3331  *              vap     - Attributes of new entry.
3332  *              cr      - credentials of caller.
3333  *              ct      - caller context
3334  *              flags   - case flags
3335  *              mnt_ns  - Unused on FreeBSD
3336  *
3337  *      RETURN: 0 on success, error code on failure.
3338  *
3339  * Timestamps:
3340  *      dvp - ctime|mtime updated
3341  */
3342 int
3343 zfs_symlink(znode_t *dzp, const char *name, vattr_t *vap,
3344     const char *link, znode_t **zpp, cred_t *cr, int flags, zidmap_t *mnt_ns)
3345 {
3346         (void) flags;
3347         znode_t         *zp;
3348         dmu_tx_t        *tx;
3349         zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
3350         zilog_t         *zilog;
3351         uint64_t        len = strlen(link);
3352         int             error;
3353         zfs_acl_ids_t   acl_ids;
3354         boolean_t       fuid_dirtied;
3355         uint64_t        txtype = TX_SYMLINK;
3356
3357         ASSERT3S(vap->va_type, ==, VLNK);
3358
3359         if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
3360                 return (error);
3361         zilog = zfsvfs->z_log;
3362
3363         if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
3364             NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3365                 zfs_exit(zfsvfs, FTAG);
3366                 return (SET_ERROR(EILSEQ));
3367         }
3368
3369         if (len > MAXPATHLEN) {
3370                 zfs_exit(zfsvfs, FTAG);
3371                 return (SET_ERROR(ENAMETOOLONG));
3372         }
3373
3374         if ((error = zfs_acl_ids_create(dzp, 0,
3375             vap, cr, NULL, &acl_ids, NULL)) != 0) {
3376                 zfs_exit(zfsvfs, FTAG);
3377                 return (error);
3378         }
3379
3380         /*
3381          * Attempt to lock directory; fail if entry already exists.
3382          */
3383         error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
3384         if (error) {
3385                 zfs_acl_ids_free(&acl_ids);
3386                 zfs_exit(zfsvfs, FTAG);
3387                 return (error);
3388         }
3389
3390         if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) {
3391                 zfs_acl_ids_free(&acl_ids);
3392                 zfs_exit(zfsvfs, FTAG);
3393                 return (error);
3394         }
3395
3396         if (zfs_acl_ids_overquota(zfsvfs, &acl_ids,
3397             0 /* projid */)) {
3398                 zfs_acl_ids_free(&acl_ids);
3399                 zfs_exit(zfsvfs, FTAG);
3400                 return (SET_ERROR(EDQUOT));
3401         }
3402
3403         getnewvnode_reserve();
3404         tx = dmu_tx_create(zfsvfs->z_os);
3405         fuid_dirtied = zfsvfs->z_fuid_dirty;
3406         dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
3407         dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
3408         dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
3409             ZFS_SA_BASE_ATTR_SIZE + len);
3410         dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
3411         if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3412                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
3413                     acl_ids.z_aclp->z_acl_bytes);
3414         }
3415         if (fuid_dirtied)
3416                 zfs_fuid_txhold(zfsvfs, tx);
3417         error = dmu_tx_assign(tx, TXG_WAIT);
3418         if (error) {
3419                 zfs_acl_ids_free(&acl_ids);
3420                 dmu_tx_abort(tx);
3421                 getnewvnode_drop_reserve();
3422                 zfs_exit(zfsvfs, FTAG);
3423                 return (error);
3424         }
3425
3426         /*
3427          * Create a new object for the symlink.
3428          * for version 4 ZPL datasets the symlink will be an SA attribute
3429          */
3430         zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
3431
3432         if (fuid_dirtied)
3433                 zfs_fuid_sync(zfsvfs, tx);
3434
3435         if (zp->z_is_sa)
3436                 error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
3437                     __DECONST(void *, link), len, tx);
3438         else
3439                 zfs_sa_symlink(zp, __DECONST(char *, link), len, tx);
3440
3441         zp->z_size = len;
3442         (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
3443             &zp->z_size, sizeof (zp->z_size), tx);
3444         /*
3445          * Insert the new object into the directory.
3446          */
3447         error = zfs_link_create(dzp, name, zp, tx, ZNEW);
3448         if (error != 0) {
3449                 zfs_znode_delete(zp, tx);
3450                 VOP_UNLOCK(ZTOV(zp));
3451                 zrele(zp);
3452         } else {
3453                 zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
3454         }
3455
3456         zfs_acl_ids_free(&acl_ids);
3457
3458         dmu_tx_commit(tx);
3459
3460         getnewvnode_drop_reserve();
3461
3462         if (error == 0) {
3463                 *zpp = zp;
3464
3465                 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3466                         zil_commit(zilog, 0);
3467         }
3468
3469         zfs_exit(zfsvfs, FTAG);
3470         return (error);
3471 }
3472
3473 /*
3474  * Return, in the buffer contained in the provided uio structure,
3475  * the symbolic path referred to by vp.
3476  *
3477  *      IN:     vp      - vnode of symbolic link.
3478  *              uio     - structure to contain the link path.
3479  *              cr      - credentials of caller.
3480  *              ct      - caller context
3481  *
3482  *      OUT:    uio     - structure containing the link path.
3483  *
3484  *      RETURN: 0 on success, error code on failure.
3485  *
3486  * Timestamps:
3487  *      vp - atime updated
3488  */
3489 static int
3490 zfs_readlink(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, caller_context_t *ct)
3491 {
3492         (void) cr, (void) ct;
3493         znode_t         *zp = VTOZ(vp);
3494         zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
3495         int             error;
3496
3497         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
3498                 return (error);
3499
3500         if (zp->z_is_sa)
3501                 error = sa_lookup_uio(zp->z_sa_hdl,
3502                     SA_ZPL_SYMLINK(zfsvfs), uio);
3503         else
3504                 error = zfs_sa_readlink(zp, uio);
3505
3506         ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
3507
3508         zfs_exit(zfsvfs, FTAG);
3509         return (error);
3510 }
3511
3512 /*
3513  * Insert a new entry into directory tdvp referencing svp.
3514  *
3515  *      IN:     tdvp    - Directory to contain new entry.
3516  *              svp     - vnode of new entry.
3517  *              name    - name of new entry.
3518  *              cr      - credentials of caller.
3519  *
3520  *      RETURN: 0 on success, error code on failure.
3521  *
3522  * Timestamps:
3523  *      tdvp - ctime|mtime updated
3524  *       svp - ctime updated
3525  */
3526 int
3527 zfs_link(znode_t *tdzp, znode_t *szp, const char *name, cred_t *cr,
3528     int flags)
3529 {
3530         (void) flags;
3531         znode_t         *tzp;
3532         zfsvfs_t        *zfsvfs = tdzp->z_zfsvfs;
3533         zilog_t         *zilog;
3534         dmu_tx_t        *tx;
3535         int             error;
3536         uint64_t        parent;
3537         uid_t           owner;
3538
3539         ASSERT3S(ZTOV(tdzp)->v_type, ==, VDIR);
3540
3541         if ((error = zfs_enter_verify_zp(zfsvfs, tdzp, FTAG)) != 0)
3542                 return (error);
3543         zilog = zfsvfs->z_log;
3544
3545         /*
3546          * POSIX dictates that we return EPERM here.
3547          * Better choices include ENOTSUP or EISDIR.
3548          */
3549         if (ZTOV(szp)->v_type == VDIR) {
3550                 zfs_exit(zfsvfs, FTAG);
3551                 return (SET_ERROR(EPERM));
3552         }
3553
3554         if ((error = zfs_verify_zp(szp)) != 0) {
3555                 zfs_exit(zfsvfs, FTAG);
3556                 return (error);
3557         }
3558
3559         /*
3560          * If we are using project inheritance, means if the directory has
3561          * ZFS_PROJINHERIT set, then its descendant directories will inherit
3562          * not only the project ID, but also the ZFS_PROJINHERIT flag. Under
3563          * such case, we only allow hard link creation in our tree when the
3564          * project IDs are the same.
3565          */
3566         if (tdzp->z_pflags & ZFS_PROJINHERIT &&
3567             tdzp->z_projid != szp->z_projid) {
3568                 zfs_exit(zfsvfs, FTAG);
3569                 return (SET_ERROR(EXDEV));
3570         }
3571
3572         if (szp->z_pflags & (ZFS_APPENDONLY |
3573             ZFS_IMMUTABLE | ZFS_READONLY)) {
3574                 zfs_exit(zfsvfs, FTAG);
3575                 return (SET_ERROR(EPERM));
3576         }
3577
3578         /* Prevent links to .zfs/shares files */
3579
3580         if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
3581             &parent, sizeof (uint64_t))) != 0) {
3582                 zfs_exit(zfsvfs, FTAG);
3583                 return (error);
3584         }
3585         if (parent == zfsvfs->z_shares_dir) {
3586                 zfs_exit(zfsvfs, FTAG);
3587                 return (SET_ERROR(EPERM));
3588         }
3589
3590         if (zfsvfs->z_utf8 && u8_validate(name,
3591             strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3592                 zfs_exit(zfsvfs, FTAG);
3593                 return (SET_ERROR(EILSEQ));
3594         }
3595
3596         /*
3597          * We do not support links between attributes and non-attributes
3598          * because of the potential security risk of creating links
3599          * into "normal" file space in order to circumvent restrictions
3600          * imposed in attribute space.
3601          */
3602         if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) {
3603                 zfs_exit(zfsvfs, FTAG);
3604                 return (SET_ERROR(EINVAL));
3605         }
3606
3607
3608         owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
3609         if (owner != crgetuid(cr) && secpolicy_basic_link(ZTOV(szp), cr) != 0) {
3610                 zfs_exit(zfsvfs, FTAG);
3611                 return (SET_ERROR(EPERM));
3612         }
3613
3614         if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr, NULL))) {
3615                 zfs_exit(zfsvfs, FTAG);
3616                 return (error);
3617         }
3618
3619         /*
3620          * Attempt to lock directory; fail if entry already exists.
3621          */
3622         error = zfs_dirent_lookup(tdzp, name, &tzp, ZNEW);
3623         if (error) {
3624                 zfs_exit(zfsvfs, FTAG);
3625                 return (error);
3626         }
3627
3628         tx = dmu_tx_create(zfsvfs->z_os);
3629         dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
3630         dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name);
3631         zfs_sa_upgrade_txholds(tx, szp);
3632         zfs_sa_upgrade_txholds(tx, tdzp);
3633         error = dmu_tx_assign(tx, TXG_WAIT);
3634         if (error) {
3635                 dmu_tx_abort(tx);
3636                 zfs_exit(zfsvfs, FTAG);
3637                 return (error);
3638         }
3639
3640         error = zfs_link_create(tdzp, name, szp, tx, 0);
3641
3642         if (error == 0) {
3643                 uint64_t txtype = TX_LINK;
3644                 zfs_log_link(zilog, tx, txtype, tdzp, szp, name);
3645         }
3646
3647         dmu_tx_commit(tx);
3648
3649         if (error == 0) {
3650                 vnevent_link(ZTOV(szp), ct);
3651         }
3652
3653         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3654                 zil_commit(zilog, 0);
3655
3656         zfs_exit(zfsvfs, FTAG);
3657         return (error);
3658 }
3659
3660 /*
3661  * Free or allocate space in a file.  Currently, this function only
3662  * supports the `F_FREESP' command.  However, this command is somewhat
3663  * misnamed, as its functionality includes the ability to allocate as
3664  * well as free space.
3665  *
3666  *      IN:     ip      - inode of file to free data in.
3667  *              cmd     - action to take (only F_FREESP supported).
3668  *              bfp     - section of file to free/alloc.
3669  *              flag    - current file open mode flags.
3670  *              offset  - current file offset.
3671  *              cr      - credentials of caller.
3672  *
3673  *      RETURN: 0 on success, error code on failure.
3674  *
3675  * Timestamps:
3676  *      ip - ctime|mtime updated
3677  */
3678 int
3679 zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag,
3680     offset_t offset, cred_t *cr)
3681 {
3682         (void) offset;
3683         zfsvfs_t        *zfsvfs = ZTOZSB(zp);
3684         uint64_t        off, len;
3685         int             error;
3686
3687         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
3688                 return (error);
3689
3690         if (cmd != F_FREESP) {
3691                 zfs_exit(zfsvfs, FTAG);
3692                 return (SET_ERROR(EINVAL));
3693         }
3694
3695         /*
3696          * Callers might not be able to detect properly that we are read-only,
3697          * so check it explicitly here.
3698          */
3699         if (zfs_is_readonly(zfsvfs)) {
3700                 zfs_exit(zfsvfs, FTAG);
3701                 return (SET_ERROR(EROFS));
3702         }
3703
3704         if (bfp->l_len < 0) {
3705                 zfs_exit(zfsvfs, FTAG);
3706                 return (SET_ERROR(EINVAL));
3707         }
3708
3709         /*
3710          * Permissions aren't checked on Solaris because on this OS
3711          * zfs_space() can only be called with an opened file handle.
3712          * On Linux we can get here through truncate_range() which
3713          * operates directly on inodes, so we need to check access rights.
3714          */
3715         if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr, NULL))) {
3716                 zfs_exit(zfsvfs, FTAG);
3717                 return (error);
3718         }
3719
3720         off = bfp->l_start;
3721         len = bfp->l_len; /* 0 means from off to end of file */
3722
3723         error = zfs_freesp(zp, off, len, flag, TRUE);
3724
3725         zfs_exit(zfsvfs, FTAG);
3726         return (error);
3727 }
3728
3729 static void
3730 zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
3731 {
3732         (void) cr, (void) ct;
3733         znode_t *zp = VTOZ(vp);
3734         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
3735         int error;
3736
3737         ZFS_TEARDOWN_INACTIVE_ENTER_READ(zfsvfs);
3738         if (zp->z_sa_hdl == NULL) {
3739                 /*
3740                  * The fs has been unmounted, or we did a
3741                  * suspend/resume and this file no longer exists.
3742                  */
3743                 ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs);
3744                 vrecycle(vp);
3745                 return;
3746         }
3747
3748         if (zp->z_unlinked) {
3749                 /*
3750                  * Fast path to recycle a vnode of a removed file.
3751                  */
3752                 ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs);
3753                 vrecycle(vp);
3754                 return;
3755         }
3756
3757         if (zp->z_atime_dirty && zp->z_unlinked == 0) {
3758                 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
3759
3760                 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3761                 zfs_sa_upgrade_txholds(tx, zp);
3762                 error = dmu_tx_assign(tx, TXG_WAIT);
3763                 if (error) {
3764                         dmu_tx_abort(tx);
3765                 } else {
3766                         (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
3767                             (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
3768                         zp->z_atime_dirty = 0;
3769                         dmu_tx_commit(tx);
3770                 }
3771         }
3772         ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs);
3773 }
3774
3775
3776 _Static_assert(sizeof (struct zfid_short) <= sizeof (struct fid),
3777         "struct zfid_short bigger than struct fid");
3778 _Static_assert(sizeof (struct zfid_long) <= sizeof (struct fid),
3779         "struct zfid_long bigger than struct fid");
3780
3781 static int
3782 zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
3783 {
3784         (void) ct;
3785         znode_t         *zp = VTOZ(vp);
3786         zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
3787         uint32_t        gen;
3788         uint64_t        gen64;
3789         uint64_t        object = zp->z_id;
3790         zfid_short_t    *zfid;
3791         int             size, i, error;
3792
3793         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
3794                 return (error);
3795
3796         if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
3797             &gen64, sizeof (uint64_t))) != 0) {
3798                 zfs_exit(zfsvfs, FTAG);
3799                 return (error);
3800         }
3801
3802         gen = (uint32_t)gen64;
3803
3804         size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
3805         fidp->fid_len = size;
3806
3807         zfid = (zfid_short_t *)fidp;
3808
3809         zfid->zf_len = size;
3810
3811         for (i = 0; i < sizeof (zfid->zf_object); i++)
3812                 zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
3813
3814         /* Must have a non-zero generation number to distinguish from .zfs */
3815         if (gen == 0)
3816                 gen = 1;
3817         for (i = 0; i < sizeof (zfid->zf_gen); i++)
3818                 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
3819
3820         if (size == LONG_FID_LEN) {
3821                 uint64_t        objsetid = dmu_objset_id(zfsvfs->z_os);
3822                 zfid_long_t     *zlfid;
3823
3824                 zlfid = (zfid_long_t *)fidp;
3825
3826                 for (i = 0; i < sizeof (zlfid->zf_setid); i++)
3827                         zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
3828
3829                 /* XXX - this should be the generation number for the objset */
3830                 for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
3831                         zlfid->zf_setgen[i] = 0;
3832         }
3833
3834         zfs_exit(zfsvfs, FTAG);
3835         return (0);
3836 }
3837
3838 static int
3839 zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
3840     caller_context_t *ct)
3841 {
3842         znode_t *zp;
3843         zfsvfs_t *zfsvfs;
3844         int error;
3845
3846         switch (cmd) {
3847         case _PC_LINK_MAX:
3848                 *valp = MIN(LONG_MAX, ZFS_LINK_MAX);
3849                 return (0);
3850
3851         case _PC_FILESIZEBITS:
3852                 *valp = 64;
3853                 return (0);
3854         case _PC_MIN_HOLE_SIZE:
3855                 *valp = (int)SPA_MINBLOCKSIZE;
3856                 return (0);
3857         case _PC_ACL_EXTENDED:
3858 #if 0           /* POSIX ACLs are not implemented for ZFS on FreeBSD yet. */
3859                 zp = VTOZ(vp);
3860                 zfsvfs = zp->z_zfsvfs;
3861                 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
3862                         return (error);
3863                 *valp = zfsvfs->z_acl_type == ZFSACLTYPE_POSIX ? 1 : 0;
3864                 zfs_exit(zfsvfs, FTAG);
3865 #else
3866                 *valp = 0;
3867 #endif
3868                 return (0);
3869
3870         case _PC_ACL_NFS4:
3871                 zp = VTOZ(vp);
3872                 zfsvfs = zp->z_zfsvfs;
3873                 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
3874                         return (error);
3875                 *valp = zfsvfs->z_acl_type == ZFS_ACLTYPE_NFSV4 ? 1 : 0;
3876                 zfs_exit(zfsvfs, FTAG);
3877                 return (0);
3878
3879         case _PC_ACL_PATH_MAX:
3880                 *valp = ACL_MAX_ENTRIES;
3881                 return (0);
3882
3883         default:
3884                 return (EOPNOTSUPP);
3885         }
3886 }
3887
3888 static int
3889 zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind,
3890     int *rahead)
3891 {
3892         znode_t *zp = VTOZ(vp);
3893         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
3894         zfs_locked_range_t *lr;
3895         vm_object_t object;
3896         off_t start, end, obj_size;
3897         uint_t blksz;
3898         int pgsin_b, pgsin_a;
3899         int error;
3900
3901         if (zfs_enter_verify_zp(zfsvfs, zp, FTAG) != 0)
3902                 return (zfs_vm_pagerret_error);
3903
3904         start = IDX_TO_OFF(ma[0]->pindex);
3905         end = IDX_TO_OFF(ma[count - 1]->pindex + 1);
3906
3907         /*
3908          * Lock a range covering all required and optional pages.
3909          * Note that we need to handle the case of the block size growing.
3910          */
3911         for (;;) {
3912                 blksz = zp->z_blksz;
3913                 lr = zfs_rangelock_tryenter(&zp->z_rangelock,
3914                     rounddown(start, blksz),
3915                     roundup(end, blksz) - rounddown(start, blksz), RL_READER);
3916                 if (lr == NULL) {
3917                         if (rahead != NULL) {
3918                                 *rahead = 0;
3919                                 rahead = NULL;
3920                         }
3921                         if (rbehind != NULL) {
3922                                 *rbehind = 0;
3923                                 rbehind = NULL;
3924                         }
3925                         break;
3926                 }
3927                 if (blksz == zp->z_blksz)
3928                         break;
3929                 zfs_rangelock_exit(lr);
3930         }
3931
3932         object = ma[0]->object;
3933         zfs_vmobject_wlock(object);
3934         obj_size = object->un_pager.vnp.vnp_size;
3935         zfs_vmobject_wunlock(object);
3936         if (IDX_TO_OFF(ma[count - 1]->pindex) >= obj_size) {
3937                 if (lr != NULL)
3938                         zfs_rangelock_exit(lr);
3939                 zfs_exit(zfsvfs, FTAG);
3940                 return (zfs_vm_pagerret_bad);
3941         }
3942
3943         pgsin_b = 0;
3944         if (rbehind != NULL) {
3945                 pgsin_b = OFF_TO_IDX(start - rounddown(start, blksz));
3946                 pgsin_b = MIN(*rbehind, pgsin_b);
3947         }
3948
3949         pgsin_a = 0;
3950         if (rahead != NULL) {
3951                 pgsin_a = OFF_TO_IDX(roundup(end, blksz) - end);
3952                 if (end + IDX_TO_OFF(pgsin_a) >= obj_size)
3953                         pgsin_a = OFF_TO_IDX(round_page(obj_size) - end);
3954                 pgsin_a = MIN(*rahead, pgsin_a);
3955         }
3956
3957         /*
3958          * NB: we need to pass the exact byte size of the data that we expect
3959          * to read after accounting for the file size.  This is required because
3960          * ZFS will panic if we request DMU to read beyond the end of the last
3961          * allocated block.
3962          */
3963         error = dmu_read_pages(zfsvfs->z_os, zp->z_id, ma, count, &pgsin_b,
3964             &pgsin_a, MIN(end, obj_size) - (end - PAGE_SIZE));
3965
3966         if (lr != NULL)
3967                 zfs_rangelock_exit(lr);
3968         ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
3969
3970         dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, count*PAGE_SIZE);
3971
3972         zfs_exit(zfsvfs, FTAG);
3973
3974         if (error != 0)
3975                 return (zfs_vm_pagerret_error);
3976
3977         VM_CNT_INC(v_vnodein);
3978         VM_CNT_ADD(v_vnodepgsin, count + pgsin_b + pgsin_a);
3979         if (rbehind != NULL)
3980                 *rbehind = pgsin_b;
3981         if (rahead != NULL)
3982                 *rahead = pgsin_a;
3983         return (zfs_vm_pagerret_ok);
3984 }
3985
3986 #ifndef _SYS_SYSPROTO_H_
3987 struct vop_getpages_args {
3988         struct vnode *a_vp;
3989         vm_page_t *a_m;
3990         int a_count;
3991         int *a_rbehind;
3992         int *a_rahead;
3993 };
3994 #endif
3995
3996 static int
3997 zfs_freebsd_getpages(struct vop_getpages_args *ap)
3998 {
3999
4000         return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind,
4001             ap->a_rahead));
4002 }
4003
4004 static int
4005 zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
4006     int *rtvals)
4007 {
4008         znode_t         *zp = VTOZ(vp);
4009         zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
4010         zfs_locked_range_t              *lr;
4011         dmu_tx_t        *tx;
4012         struct sf_buf   *sf;
4013         vm_object_t     object;
4014         vm_page_t       m;
4015         caddr_t         va;
4016         size_t          tocopy;
4017         size_t          lo_len;
4018         vm_ooffset_t    lo_off;
4019         vm_ooffset_t    off;
4020         uint_t          blksz;
4021         int             ncount;
4022         int             pcount;
4023         int             err;
4024         int             i;
4025
4026         object = vp->v_object;
4027         KASSERT(ma[0]->object == object, ("mismatching object"));
4028         KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length"));
4029
4030         pcount = btoc(len);
4031         ncount = pcount;
4032         for (i = 0; i < pcount; i++)
4033                 rtvals[i] = zfs_vm_pagerret_error;
4034
4035         if (zfs_enter_verify_zp(zfsvfs, zp, FTAG) != 0)
4036                 return (zfs_vm_pagerret_error);
4037
4038         off = IDX_TO_OFF(ma[0]->pindex);
4039         blksz = zp->z_blksz;
4040         lo_off = rounddown(off, blksz);
4041         lo_len = roundup(len + (off - lo_off), blksz);
4042         lr = zfs_rangelock_enter(&zp->z_rangelock, lo_off, lo_len, RL_WRITER);
4043
4044         zfs_vmobject_wlock(object);
4045         if (len + off > object->un_pager.vnp.vnp_size) {
4046                 if (object->un_pager.vnp.vnp_size > off) {
4047                         int pgoff;
4048
4049                         len = object->un_pager.vnp.vnp_size - off;
4050                         ncount = btoc(len);
4051                         if ((pgoff = (int)len & PAGE_MASK) != 0) {
4052                                 /*
4053                                  * If the object is locked and the following
4054                                  * conditions hold, then the page's dirty
4055                                  * field cannot be concurrently changed by a
4056                                  * pmap operation.
4057                                  */
4058                                 m = ma[ncount - 1];
4059                                 vm_page_assert_sbusied(m);
4060                                 KASSERT(!pmap_page_is_write_mapped(m),
4061                                     ("zfs_putpages: page %p is not read-only",
4062                                     m));
4063                                 vm_page_clear_dirty(m, pgoff, PAGE_SIZE -
4064                                     pgoff);
4065                         }
4066                 } else {
4067                         len = 0;
4068                         ncount = 0;
4069                 }
4070                 if (ncount < pcount) {
4071                         for (i = ncount; i < pcount; i++) {
4072                                 rtvals[i] = zfs_vm_pagerret_bad;
4073                         }
4074                 }
4075         }
4076         zfs_vmobject_wunlock(object);
4077
4078         boolean_t commit = (flags & (zfs_vm_pagerput_sync |
4079             zfs_vm_pagerput_inval)) != 0 ||
4080             zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS;
4081
4082         if (ncount == 0)
4083                 goto out;
4084
4085         if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, zp->z_uid) ||
4086             zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, zp->z_gid) ||
4087             (zp->z_projid != ZFS_DEFAULT_PROJID &&
4088             zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
4089             zp->z_projid))) {
4090                 goto out;
4091         }
4092
4093         tx = dmu_tx_create(zfsvfs->z_os);
4094         dmu_tx_hold_write(tx, zp->z_id, off, len);
4095
4096         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4097         zfs_sa_upgrade_txholds(tx, zp);
4098         err = dmu_tx_assign(tx, TXG_WAIT);
4099         if (err != 0) {
4100                 dmu_tx_abort(tx);
4101                 goto out;
4102         }
4103
4104         if (zp->z_blksz < PAGE_SIZE) {
4105                 for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) {
4106                         tocopy = len > PAGE_SIZE ? PAGE_SIZE : len;
4107                         va = zfs_map_page(ma[i], &sf);
4108                         dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx);
4109                         zfs_unmap_page(sf);
4110                 }
4111         } else {
4112                 err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx);
4113         }
4114
4115         if (err == 0) {
4116                 uint64_t mtime[2], ctime[2];
4117                 sa_bulk_attr_t bulk[3];
4118                 int count = 0;
4119
4120                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
4121                     &mtime, 16);
4122                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
4123                     &ctime, 16);
4124                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
4125                     &zp->z_pflags, 8);
4126                 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
4127                 err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
4128                 ASSERT0(err);
4129                 /*
4130                  * XXX we should be passing a callback to undirty
4131                  * but that would make the locking messier
4132                  */
4133                 zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off,
4134                     len, commit, NULL, NULL);
4135
4136                 zfs_vmobject_wlock(object);
4137                 for (i = 0; i < ncount; i++) {
4138                         rtvals[i] = zfs_vm_pagerret_ok;
4139                         vm_page_undirty(ma[i]);
4140                 }
4141                 zfs_vmobject_wunlock(object);
4142                 VM_CNT_INC(v_vnodeout);
4143                 VM_CNT_ADD(v_vnodepgsout, ncount);
4144         }
4145         dmu_tx_commit(tx);
4146
4147 out:
4148         zfs_rangelock_exit(lr);
4149         if (commit)
4150                 zil_commit(zfsvfs->z_log, zp->z_id);
4151
4152         dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, len);
4153
4154         zfs_exit(zfsvfs, FTAG);
4155         return (rtvals[0]);
4156 }
4157
4158 #ifndef _SYS_SYSPROTO_H_
4159 struct vop_putpages_args {
4160         struct vnode *a_vp;
4161         vm_page_t *a_m;
4162         int a_count;
4163         int a_sync;
4164         int *a_rtvals;
4165 };
4166 #endif
4167
4168 static int
4169 zfs_freebsd_putpages(struct vop_putpages_args *ap)
4170 {
4171
4172         return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync,
4173             ap->a_rtvals));
4174 }
4175
4176 #ifndef _SYS_SYSPROTO_H_
4177 struct vop_bmap_args {
4178         struct vnode *a_vp;
4179         daddr_t  a_bn;
4180         struct bufobj **a_bop;
4181         daddr_t *a_bnp;
4182         int *a_runp;
4183         int *a_runb;
4184 };
4185 #endif
4186
4187 static int
4188 zfs_freebsd_bmap(struct vop_bmap_args *ap)
4189 {
4190
4191         if (ap->a_bop != NULL)
4192                 *ap->a_bop = &ap->a_vp->v_bufobj;
4193         if (ap->a_bnp != NULL)
4194                 *ap->a_bnp = ap->a_bn;
4195         if (ap->a_runp != NULL)
4196                 *ap->a_runp = 0;
4197         if (ap->a_runb != NULL)
4198                 *ap->a_runb = 0;
4199
4200         return (0);
4201 }
4202
4203 #ifndef _SYS_SYSPROTO_H_
4204 struct vop_open_args {
4205         struct vnode *a_vp;
4206         int a_mode;
4207         struct ucred *a_cred;
4208         struct thread *a_td;
4209 };
4210 #endif
4211
4212 static int
4213 zfs_freebsd_open(struct vop_open_args *ap)
4214 {
4215         vnode_t *vp = ap->a_vp;
4216         znode_t *zp = VTOZ(vp);
4217         int error;
4218
4219         error = zfs_open(&vp, ap->a_mode, ap->a_cred);
4220         if (error == 0)
4221                 vnode_create_vobject(vp, zp->z_size, ap->a_td);
4222         return (error);
4223 }
4224
#ifndef _SYS_SYSPROTO_H_
struct vop_close_args {
	struct vnode *a_vp;
	int  a_fflag;
	struct ucred *a_cred;
	struct thread *a_td;
};
#endif

/*
 * VOP_CLOSE entry point: forward to zfs_close() with a count argument of 1
 * and an offset argument of 0.
 */
static int
zfs_freebsd_close(struct vop_close_args *ap)
{
	struct vnode *vp = ap->a_vp;
	int fflag = ap->a_fflag;
	struct ucred *cr = ap->a_cred;

	return (zfs_close(vp, fflag, 1, 0, cr));
}
4240
4241 #ifndef _SYS_SYSPROTO_H_
4242 struct vop_ioctl_args {
4243         struct vnode *a_vp;
4244         ulong_t a_command;
4245         caddr_t a_data;
4246         int a_fflag;
4247         struct ucred *cred;
4248         struct thread *td;
4249 };
4250 #endif
4251
4252 static int
4253 zfs_freebsd_ioctl(struct vop_ioctl_args *ap)
4254 {
4255
4256         return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
4257             ap->a_fflag, ap->a_cred, NULL));
4258 }
4259
4260 static int
4261 ioflags(int ioflags)
4262 {
4263         int flags = 0;
4264
4265         if (ioflags & IO_APPEND)
4266                 flags |= O_APPEND;
4267         if (ioflags & IO_NDELAY)
4268                 flags |= O_NONBLOCK;
4269         if (ioflags & IO_SYNC)
4270                 flags |= O_SYNC;
4271
4272         return (flags);
4273 }
4274
4275 #ifndef _SYS_SYSPROTO_H_
4276 struct vop_read_args {
4277         struct vnode *a_vp;
4278         struct uio *a_uio;
4279         int a_ioflag;
4280         struct ucred *a_cred;
4281 };
4282 #endif
4283
4284 static int
4285 zfs_freebsd_read(struct vop_read_args *ap)
4286 {
4287         zfs_uio_t uio;
4288         zfs_uio_init(&uio, ap->a_uio);
4289         return (zfs_read(VTOZ(ap->a_vp), &uio, ioflags(ap->a_ioflag),
4290             ap->a_cred));
4291 }
4292
4293 #ifndef _SYS_SYSPROTO_H_
4294 struct vop_write_args {
4295         struct vnode *a_vp;
4296         struct uio *a_uio;
4297         int a_ioflag;
4298         struct ucred *a_cred;
4299 };
4300 #endif
4301
4302 static int
4303 zfs_freebsd_write(struct vop_write_args *ap)
4304 {
4305         zfs_uio_t uio;
4306         zfs_uio_init(&uio, ap->a_uio);
4307         return (zfs_write(VTOZ(ap->a_vp), &uio, ioflags(ap->a_ioflag),
4308             ap->a_cred));
4309 }
4310
4311 /*
4312  * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see
4313  * the comment above cache_fplookup for details.
4314  */
4315 static int
4316 zfs_freebsd_fplookup_vexec(struct vop_fplookup_vexec_args *v)
4317 {
4318         vnode_t *vp;
4319         znode_t *zp;
4320         uint64_t pflags;
4321
4322         vp = v->a_vp;
4323         zp = VTOZ_SMR(vp);
4324         if (__predict_false(zp == NULL))
4325                 return (EAGAIN);
4326         pflags = atomic_load_64(&zp->z_pflags);
4327         if (pflags & ZFS_AV_QUARANTINED)
4328                 return (EAGAIN);
4329         if (pflags & ZFS_XATTR)
4330                 return (EAGAIN);
4331         if ((pflags & ZFS_NO_EXECS_DENIED) == 0)
4332                 return (EAGAIN);
4333         return (0);
4334 }
4335
4336 static int
4337 zfs_freebsd_fplookup_symlink(struct vop_fplookup_symlink_args *v)
4338 {
4339         vnode_t *vp;
4340         znode_t *zp;
4341         char *target;
4342
4343         vp = v->a_vp;
4344         zp = VTOZ_SMR(vp);
4345         if (__predict_false(zp == NULL)) {
4346                 return (EAGAIN);
4347         }
4348
4349         target = atomic_load_consume_ptr(&zp->z_cached_symlink);
4350         if (target == NULL) {
4351                 return (EAGAIN);
4352         }
4353         return (cache_symlink_resolve(v->a_fpl, target, strlen(target)));
4354 }
4355
4356 #ifndef _SYS_SYSPROTO_H_
4357 struct vop_access_args {
4358         struct vnode *a_vp;
4359         accmode_t a_accmode;
4360         struct ucred *a_cred;
4361         struct thread *a_td;
4362 };
4363 #endif
4364
4365 static int
4366 zfs_freebsd_access(struct vop_access_args *ap)
4367 {
4368         vnode_t *vp = ap->a_vp;
4369         znode_t *zp = VTOZ(vp);
4370         accmode_t accmode;
4371         int error = 0;
4372
4373
4374         if (ap->a_accmode == VEXEC) {
4375                 if (zfs_fastaccesschk_execute(zp, ap->a_cred) == 0)
4376                         return (0);
4377         }
4378
4379         /*
4380          * ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND,
4381          */
4382         accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
4383         if (accmode != 0)
4384                 error = zfs_access(zp, accmode, 0, ap->a_cred);
4385
4386         /*
4387          * VADMIN has to be handled by vaccess().
4388          */
4389         if (error == 0) {
4390                 accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND);
4391                 if (accmode != 0) {
4392                         error = vaccess(vp->v_type, zp->z_mode, zp->z_uid,
4393                             zp->z_gid, accmode, ap->a_cred);
4394                 }
4395         }
4396
4397         /*
4398          * For VEXEC, ensure that at least one execute bit is set for
4399          * non-directories.
4400          */
4401         if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR &&
4402             (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) {
4403                 error = EACCES;
4404         }
4405
4406         return (error);
4407 }
4408
4409 #ifndef _SYS_SYSPROTO_H_
4410 struct vop_lookup_args {
4411         struct vnode *a_dvp;
4412         struct vnode **a_vpp;
4413         struct componentname *a_cnp;
4414 };
4415 #endif
4416
4417 static int
4418 zfs_freebsd_lookup(struct vop_lookup_args *ap, boolean_t cached)
4419 {
4420         struct componentname *cnp = ap->a_cnp;
4421         char nm[NAME_MAX + 1];
4422
4423         ASSERT3U(cnp->cn_namelen, <, sizeof (nm));
4424         strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof (nm)));
4425
4426         return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
4427             cnp->cn_cred, 0, cached));
4428 }
4429
4430 static int
4431 zfs_freebsd_cachedlookup(struct vop_cachedlookup_args *ap)
4432 {
4433
4434         return (zfs_freebsd_lookup((struct vop_lookup_args *)ap, B_TRUE));
4435 }
4436
4437 #ifndef _SYS_SYSPROTO_H_
4438 struct vop_lookup_args {
4439         struct vnode *a_dvp;
4440         struct vnode **a_vpp;
4441         struct componentname *a_cnp;
4442 };
4443 #endif
4444
4445 static int
4446 zfs_cache_lookup(struct vop_lookup_args *ap)
4447 {
4448         zfsvfs_t *zfsvfs;
4449
4450         zfsvfs = ap->a_dvp->v_mount->mnt_data;
4451         if (zfsvfs->z_use_namecache)
4452                 return (vfs_cache_lookup(ap));
4453         else
4454                 return (zfs_freebsd_lookup(ap, B_FALSE));
4455 }
4456
4457 #ifndef _SYS_SYSPROTO_H_
4458 struct vop_create_args {
4459         struct vnode *a_dvp;
4460         struct vnode **a_vpp;
4461         struct componentname *a_cnp;
4462         struct vattr *a_vap;
4463 };
4464 #endif
4465
4466 static int
4467 zfs_freebsd_create(struct vop_create_args *ap)
4468 {
4469         zfsvfs_t *zfsvfs;
4470         struct componentname *cnp = ap->a_cnp;
4471         vattr_t *vap = ap->a_vap;
4472         znode_t *zp = NULL;
4473         int rc, mode;
4474
4475 #if __FreeBSD_version < 1400068
4476         ASSERT(cnp->cn_flags & SAVENAME);
4477 #endif
4478
4479         vattr_init_mask(vap);
4480         mode = vap->va_mode & ALLPERMS;
4481         zfsvfs = ap->a_dvp->v_mount->mnt_data;
4482         *ap->a_vpp = NULL;
4483
4484         rc = zfs_create(VTOZ(ap->a_dvp), cnp->cn_nameptr, vap, 0, mode,
4485             &zp, cnp->cn_cred, 0 /* flag */, NULL /* vsecattr */, NULL);
4486         if (rc == 0)
4487                 *ap->a_vpp = ZTOV(zp);
4488         if (zfsvfs->z_use_namecache &&
4489             rc == 0 && (cnp->cn_flags & MAKEENTRY) != 0)
4490                 cache_enter(ap->a_dvp, *ap->a_vpp, cnp);
4491
4492         return (rc);
4493 }
4494
4495 #ifndef _SYS_SYSPROTO_H_
4496 struct vop_remove_args {
4497         struct vnode *a_dvp;
4498         struct vnode *a_vp;
4499         struct componentname *a_cnp;
4500 };
4501 #endif
4502
4503 static int
4504 zfs_freebsd_remove(struct vop_remove_args *ap)
4505 {
4506
4507 #if __FreeBSD_version < 1400068
4508         ASSERT(ap->a_cnp->cn_flags & SAVENAME);
4509 #endif
4510
4511         return (zfs_remove_(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr,
4512             ap->a_cnp->cn_cred));
4513 }
4514
4515 #ifndef _SYS_SYSPROTO_H_
4516 struct vop_mkdir_args {
4517         struct vnode *a_dvp;
4518         struct vnode **a_vpp;
4519         struct componentname *a_cnp;
4520         struct vattr *a_vap;
4521 };
4522 #endif
4523
4524 static int
4525 zfs_freebsd_mkdir(struct vop_mkdir_args *ap)
4526 {
4527         vattr_t *vap = ap->a_vap;
4528         znode_t *zp = NULL;
4529         int rc;
4530
4531 #if __FreeBSD_version < 1400068
4532         ASSERT(ap->a_cnp->cn_flags & SAVENAME);
4533 #endif
4534
4535         vattr_init_mask(vap);
4536         *ap->a_vpp = NULL;
4537
4538         rc = zfs_mkdir(VTOZ(ap->a_dvp), ap->a_cnp->cn_nameptr, vap, &zp,
4539             ap->a_cnp->cn_cred, 0, NULL, NULL);
4540
4541         if (rc == 0)
4542                 *ap->a_vpp = ZTOV(zp);
4543         return (rc);
4544 }
4545
4546 #ifndef _SYS_SYSPROTO_H_
4547 struct vop_rmdir_args {
4548         struct vnode *a_dvp;
4549         struct vnode *a_vp;
4550         struct componentname *a_cnp;
4551 };
4552 #endif
4553
4554 static int
4555 zfs_freebsd_rmdir(struct vop_rmdir_args *ap)
4556 {
4557         struct componentname *cnp = ap->a_cnp;
4558
4559 #if __FreeBSD_version < 1400068
4560         ASSERT(cnp->cn_flags & SAVENAME);
4561 #endif
4562
4563         return (zfs_rmdir_(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred));
4564 }
4565
4566 #ifndef _SYS_SYSPROTO_H_
4567 struct vop_readdir_args {
4568         struct vnode *a_vp;
4569         struct uio *a_uio;
4570         struct ucred *a_cred;
4571         int *a_eofflag;
4572         int *a_ncookies;
4573         cookie_t **a_cookies;
4574 };
4575 #endif
4576
4577 static int
4578 zfs_freebsd_readdir(struct vop_readdir_args *ap)
4579 {
4580         zfs_uio_t uio;
4581         zfs_uio_init(&uio, ap->a_uio);
4582         return (zfs_readdir(ap->a_vp, &uio, ap->a_cred, ap->a_eofflag,
4583             ap->a_ncookies, ap->a_cookies));
4584 }
4585
4586 #ifndef _SYS_SYSPROTO_H_
4587 struct vop_fsync_args {
4588         struct vnode *a_vp;
4589         int a_waitfor;
4590         struct thread *a_td;
4591 };
4592 #endif
4593
4594 static int
4595 zfs_freebsd_fsync(struct vop_fsync_args *ap)
4596 {
4597
4598         return (zfs_fsync(VTOZ(ap->a_vp), 0, ap->a_td->td_ucred));
4599 }
4600
4601 #ifndef _SYS_SYSPROTO_H_
4602 struct vop_getattr_args {
4603         struct vnode *a_vp;
4604         struct vattr *a_vap;
4605         struct ucred *a_cred;
4606 };
4607 #endif
4608
4609 static int
4610 zfs_freebsd_getattr(struct vop_getattr_args *ap)
4611 {
4612         vattr_t *vap = ap->a_vap;
4613         xvattr_t xvap;
4614         ulong_t fflags = 0;
4615         int error;
4616
4617         xva_init(&xvap);
4618         xvap.xva_vattr = *vap;
4619         xvap.xva_vattr.va_mask |= AT_XVATTR;
4620
4621         /* Convert chflags into ZFS-type flags. */
4622         /* XXX: what about SF_SETTABLE?. */
4623         XVA_SET_REQ(&xvap, XAT_IMMUTABLE);
4624         XVA_SET_REQ(&xvap, XAT_APPENDONLY);
4625         XVA_SET_REQ(&xvap, XAT_NOUNLINK);
4626         XVA_SET_REQ(&xvap, XAT_NODUMP);
4627         XVA_SET_REQ(&xvap, XAT_READONLY);
4628         XVA_SET_REQ(&xvap, XAT_ARCHIVE);
4629         XVA_SET_REQ(&xvap, XAT_SYSTEM);
4630         XVA_SET_REQ(&xvap, XAT_HIDDEN);
4631         XVA_SET_REQ(&xvap, XAT_REPARSE);
4632         XVA_SET_REQ(&xvap, XAT_OFFLINE);
4633         XVA_SET_REQ(&xvap, XAT_SPARSE);
4634
4635         error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred);
4636         if (error != 0)
4637                 return (error);
4638
4639         /* Convert ZFS xattr into chflags. */
4640 #define FLAG_CHECK(fflag, xflag, xfield)        do {                    \
4641         if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0)             \
4642                 fflags |= (fflag);                                      \
4643 } while (0)
4644         FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE,
4645             xvap.xva_xoptattrs.xoa_immutable);
4646         FLAG_CHECK(SF_APPEND, XAT_APPENDONLY,
4647             xvap.xva_xoptattrs.xoa_appendonly);
4648         FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK,
4649             xvap.xva_xoptattrs.xoa_nounlink);
4650         FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE,
4651             xvap.xva_xoptattrs.xoa_archive);
4652         FLAG_CHECK(UF_NODUMP, XAT_NODUMP,
4653             xvap.xva_xoptattrs.xoa_nodump);
4654         FLAG_CHECK(UF_READONLY, XAT_READONLY,
4655             xvap.xva_xoptattrs.xoa_readonly);
4656         FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM,
4657             xvap.xva_xoptattrs.xoa_system);
4658         FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN,
4659             xvap.xva_xoptattrs.xoa_hidden);
4660         FLAG_CHECK(UF_REPARSE, XAT_REPARSE,
4661             xvap.xva_xoptattrs.xoa_reparse);
4662         FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE,
4663             xvap.xva_xoptattrs.xoa_offline);
4664         FLAG_CHECK(UF_SPARSE, XAT_SPARSE,
4665             xvap.xva_xoptattrs.xoa_sparse);
4666
4667 #undef  FLAG_CHECK
4668         *vap = xvap.xva_vattr;
4669         vap->va_flags = fflags;
4670         return (0);
4671 }
4672
4673 #ifndef _SYS_SYSPROTO_H_
4674 struct vop_setattr_args {
4675         struct vnode *a_vp;
4676         struct vattr *a_vap;
4677         struct ucred *a_cred;
4678 };
4679 #endif
4680
4681 static int
4682 zfs_freebsd_setattr(struct vop_setattr_args *ap)
4683 {
4684         vnode_t *vp = ap->a_vp;
4685         vattr_t *vap = ap->a_vap;
4686         cred_t *cred = ap->a_cred;
4687         xvattr_t xvap;
4688         ulong_t fflags;
4689         uint64_t zflags;
4690
4691         vattr_init_mask(vap);
4692         vap->va_mask &= ~AT_NOSET;
4693
4694         xva_init(&xvap);
4695         xvap.xva_vattr = *vap;
4696
4697         zflags = VTOZ(vp)->z_pflags;
4698
4699         if (vap->va_flags != VNOVAL) {
4700                 zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs;
4701                 int error;
4702
4703                 if (zfsvfs->z_use_fuids == B_FALSE)
4704                         return (EOPNOTSUPP);
4705
4706                 fflags = vap->va_flags;
4707                 /*
4708                  * XXX KDM
4709                  * We need to figure out whether it makes sense to allow
4710                  * UF_REPARSE through, since we don't really have other
4711                  * facilities to handle reparse points and zfs_setattr()
4712                  * doesn't currently allow setting that attribute anyway.
4713                  */
4714                 if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE|
4715                     UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE|
4716                     UF_OFFLINE|UF_SPARSE)) != 0)
4717                         return (EOPNOTSUPP);
4718                 /*
4719                  * Unprivileged processes are not permitted to unset system
4720                  * flags, or modify flags if any system flags are set.
4721                  * Privileged non-jail processes may not modify system flags
4722                  * if securelevel > 0 and any existing system flags are set.
4723                  * Privileged jail processes behave like privileged non-jail
4724                  * processes if the PR_ALLOW_CHFLAGS permission bit is set;
4725                  * otherwise, they behave like unprivileged processes.
4726                  */
4727                 if (secpolicy_fs_owner(vp->v_mount, cred) == 0 ||
4728                     priv_check_cred(cred, PRIV_VFS_SYSFLAGS) == 0) {
4729                         if (zflags &
4730                             (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
4731                                 error = securelevel_gt(cred, 0);
4732                                 if (error != 0)
4733                                         return (error);
4734                         }
4735                 } else {
4736                         /*
4737                          * Callers may only modify the file flags on
4738                          * objects they have VADMIN rights for.
4739                          */
4740                         if ((error = VOP_ACCESS(vp, VADMIN, cred,
4741                             curthread)) != 0)
4742                                 return (error);
4743                         if (zflags &
4744                             (ZFS_IMMUTABLE | ZFS_APPENDONLY |
4745                             ZFS_NOUNLINK)) {
4746                                 return (EPERM);
4747                         }
4748                         if (fflags &
4749                             (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) {
4750                                 return (EPERM);
4751                         }
4752                 }
4753
4754 #define FLAG_CHANGE(fflag, zflag, xflag, xfield)        do {            \
4755         if (((fflags & (fflag)) && !(zflags & (zflag))) ||              \
4756             ((zflags & (zflag)) && !(fflags & (fflag)))) {              \
4757                 XVA_SET_REQ(&xvap, (xflag));                            \
4758                 (xfield) = ((fflags & (fflag)) != 0);                   \
4759         }                                                               \
4760 } while (0)
4761                 /* Convert chflags into ZFS-type flags. */
4762                 /* XXX: what about SF_SETTABLE?. */
4763                 FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
4764                     xvap.xva_xoptattrs.xoa_immutable);
4765                 FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
4766                     xvap.xva_xoptattrs.xoa_appendonly);
4767                 FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
4768                     xvap.xva_xoptattrs.xoa_nounlink);
4769                 FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE,
4770                     xvap.xva_xoptattrs.xoa_archive);
4771                 FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
4772                     xvap.xva_xoptattrs.xoa_nodump);
4773                 FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY,
4774                     xvap.xva_xoptattrs.xoa_readonly);
4775                 FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM,
4776                     xvap.xva_xoptattrs.xoa_system);
4777                 FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN,
4778                     xvap.xva_xoptattrs.xoa_hidden);
4779                 FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE,
4780                     xvap.xva_xoptattrs.xoa_reparse);
4781                 FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE,
4782                     xvap.xva_xoptattrs.xoa_offline);
4783                 FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE,
4784                     xvap.xva_xoptattrs.xoa_sparse);
4785 #undef  FLAG_CHANGE
4786         }
4787         if (vap->va_birthtime.tv_sec != VNOVAL) {
4788                 xvap.xva_vattr.va_mask |= AT_XVATTR;
4789                 XVA_SET_REQ(&xvap, XAT_CREATETIME);
4790         }
4791         return (zfs_setattr(VTOZ(vp), (vattr_t *)&xvap, 0, cred, NULL));
4792 }
4793
4794 #ifndef _SYS_SYSPROTO_H_
4795 struct vop_rename_args {
4796         struct vnode *a_fdvp;
4797         struct vnode *a_fvp;
4798         struct componentname *a_fcnp;
4799         struct vnode *a_tdvp;
4800         struct vnode *a_tvp;
4801         struct componentname *a_tcnp;
4802 };
4803 #endif
4804
4805 static int
4806 zfs_freebsd_rename(struct vop_rename_args *ap)
4807 {
4808         vnode_t *fdvp = ap->a_fdvp;
4809         vnode_t *fvp = ap->a_fvp;
4810         vnode_t *tdvp = ap->a_tdvp;
4811         vnode_t *tvp = ap->a_tvp;
4812         int error;
4813
4814 #if __FreeBSD_version < 1400068
4815         ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
4816         ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));
4817 #endif
4818
4819         error = zfs_do_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp,
4820             ap->a_tcnp, ap->a_fcnp->cn_cred);
4821
4822         vrele(fdvp);
4823         vrele(fvp);
4824         vrele(tdvp);
4825         if (tvp != NULL)
4826                 vrele(tvp);
4827
4828         return (error);
4829 }
4830
4831 #ifndef _SYS_SYSPROTO_H_
4832 struct vop_symlink_args {
4833         struct vnode *a_dvp;
4834         struct vnode **a_vpp;
4835         struct componentname *a_cnp;
4836         struct vattr *a_vap;
4837         char *a_target;
4838 };
4839 #endif
4840
4841 static int
4842 zfs_freebsd_symlink(struct vop_symlink_args *ap)
4843 {
4844         struct componentname *cnp = ap->a_cnp;
4845         vattr_t *vap = ap->a_vap;
4846         znode_t *zp = NULL;
4847         char *symlink;
4848         size_t symlink_len;
4849         int rc;
4850
4851 #if __FreeBSD_version < 1400068
4852         ASSERT(cnp->cn_flags & SAVENAME);
4853 #endif
4854
4855         vap->va_type = VLNK;    /* FreeBSD: Syscall only sets va_mode. */
4856         vattr_init_mask(vap);
4857         *ap->a_vpp = NULL;
4858
4859         rc = zfs_symlink(VTOZ(ap->a_dvp), cnp->cn_nameptr, vap,
4860             ap->a_target, &zp, cnp->cn_cred, 0 /* flags */, NULL);
4861         if (rc == 0) {
4862                 *ap->a_vpp = ZTOV(zp);
4863                 ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
4864                 MPASS(zp->z_cached_symlink == NULL);
4865                 symlink_len = strlen(ap->a_target);
4866                 symlink = cache_symlink_alloc(symlink_len + 1, M_WAITOK);
4867                 if (symlink != NULL) {
4868                         memcpy(symlink, ap->a_target, symlink_len);
4869                         symlink[symlink_len] = '\0';
4870                         atomic_store_rel_ptr((uintptr_t *)&zp->z_cached_symlink,
4871                             (uintptr_t)symlink);
4872                 }
4873         }
4874         return (rc);
4875 }
4876
4877 #ifndef _SYS_SYSPROTO_H_
4878 struct vop_readlink_args {
4879         struct vnode *a_vp;
4880         struct uio *a_uio;
4881         struct ucred *a_cred;
4882 };
4883 #endif
4884
4885 static int
4886 zfs_freebsd_readlink(struct vop_readlink_args *ap)
4887 {
4888         zfs_uio_t uio;
4889         int error;
4890         znode_t *zp = VTOZ(ap->a_vp);
4891         char *symlink, *base;
4892         size_t symlink_len;
4893         bool trycache;
4894
4895         zfs_uio_init(&uio, ap->a_uio);
4896         trycache = false;
4897         if (zfs_uio_segflg(&uio) == UIO_SYSSPACE &&
4898             zfs_uio_iovcnt(&uio) == 1) {
4899                 base = zfs_uio_iovbase(&uio, 0);
4900                 symlink_len = zfs_uio_iovlen(&uio, 0);
4901                 trycache = true;
4902         }
4903         error = zfs_readlink(ap->a_vp, &uio, ap->a_cred, NULL);
4904         if (atomic_load_ptr(&zp->z_cached_symlink) != NULL ||
4905             error != 0 || !trycache) {
4906                 return (error);
4907         }
4908         symlink_len -= zfs_uio_resid(&uio);
4909         symlink = cache_symlink_alloc(symlink_len + 1, M_WAITOK);
4910         if (symlink != NULL) {
4911                 memcpy(symlink, base, symlink_len);
4912                 symlink[symlink_len] = '\0';
4913                 if (!atomic_cmpset_rel_ptr((uintptr_t *)&zp->z_cached_symlink,
4914                     (uintptr_t)NULL, (uintptr_t)symlink)) {
4915                         cache_symlink_free(symlink, symlink_len + 1);
4916                 }
4917         }
4918         return (error);
4919 }
4920
4921 #ifndef _SYS_SYSPROTO_H_
4922 struct vop_link_args {
4923         struct vnode *a_tdvp;
4924         struct vnode *a_vp;
4925         struct componentname *a_cnp;
4926 };
4927 #endif
4928
4929 static int
4930 zfs_freebsd_link(struct vop_link_args *ap)
4931 {
4932         struct componentname *cnp = ap->a_cnp;
4933         vnode_t *vp = ap->a_vp;
4934         vnode_t *tdvp = ap->a_tdvp;
4935
4936         if (tdvp->v_mount != vp->v_mount)
4937                 return (EXDEV);
4938
4939 #if __FreeBSD_version < 1400068
4940         ASSERT(cnp->cn_flags & SAVENAME);
4941 #endif
4942
4943         return (zfs_link(VTOZ(tdvp), VTOZ(vp),
4944             cnp->cn_nameptr, cnp->cn_cred, 0));
4945 }
4946
4947 #ifndef _SYS_SYSPROTO_H_
4948 struct vop_inactive_args {
4949         struct vnode *a_vp;
4950         struct thread *a_td;
4951 };
4952 #endif
4953
4954 static int
4955 zfs_freebsd_inactive(struct vop_inactive_args *ap)
4956 {
4957         vnode_t *vp = ap->a_vp;
4958
4959         zfs_inactive(vp, curthread->td_ucred, NULL);
4960         return (0);
4961 }
4962
4963 #ifndef _SYS_SYSPROTO_H_
4964 struct vop_need_inactive_args {
4965         struct vnode *a_vp;
4966         struct thread *a_td;
4967 };
4968 #endif
4969
4970 static int
4971 zfs_freebsd_need_inactive(struct vop_need_inactive_args *ap)
4972 {
4973         vnode_t *vp = ap->a_vp;
4974         znode_t *zp = VTOZ(vp);
4975         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4976         int need;
4977
4978         if (vn_need_pageq_flush(vp))
4979                 return (1);
4980
4981         if (!ZFS_TEARDOWN_INACTIVE_TRY_ENTER_READ(zfsvfs))
4982                 return (1);
4983         need = (zp->z_sa_hdl == NULL || zp->z_unlinked || zp->z_atime_dirty);
4984         ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs);
4985
4986         return (need);
4987 }
4988
/*
 * Reference copy of the argument structure taken by zfs_freebsd_reclaim();
 * compiled only when _SYS_SYSPROTO_H_ is not defined.
 */
#ifndef _SYS_SYSPROTO_H_
struct vop_reclaim_args {
        struct vnode *a_vp;     /* vnode being reclaimed */
        struct thread *a_td;    /* not read by zfs_freebsd_reclaim() */
};
#endif
4995
4996 static int
4997 zfs_freebsd_reclaim(struct vop_reclaim_args *ap)
4998 {
4999         vnode_t *vp = ap->a_vp;
5000         znode_t *zp = VTOZ(vp);
5001         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5002
5003         ASSERT3P(zp, !=, NULL);
5004
5005         /*
5006          * z_teardown_inactive_lock protects from a race with
5007          * zfs_znode_dmu_fini in zfsvfs_teardown during
5008          * force unmount.
5009          */
5010         ZFS_TEARDOWN_INACTIVE_ENTER_READ(zfsvfs);
5011         if (zp->z_sa_hdl == NULL)
5012                 zfs_znode_free(zp);
5013         else
5014                 zfs_zinactive(zp);
5015         ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs);
5016
5017         vp->v_data = NULL;
5018         return (0);
5019 }
5020
/*
 * Reference copy of the argument structure taken by zfs_freebsd_fid();
 * compiled only when _SYS_SYSPROTO_H_ is not defined.
 */
#ifndef _SYS_SYSPROTO_H_
struct vop_fid_args {
        struct vnode *a_vp;     /* vnode handed to zfs_fid() */
        struct fid *a_fid;      /* fid buffer handed to zfs_fid() */
};
#endif
5027
5028 static int
5029 zfs_freebsd_fid(struct vop_fid_args *ap)
5030 {
5031
5032         return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL));
5033 }
5034
5035
/*
 * Reference copy of the argument structure taken by zfs_freebsd_pathconf();
 * compiled only when _SYS_SYSPROTO_H_ is not defined.
 *
 * NOTE(review): unlike the neighbouring reference structures, this one
 * also declares a file-scope pointer named "ap"; that looks unintended.
 */
#ifndef _SYS_SYSPROTO_H_
struct vop_pathconf_args {
        struct vnode *a_vp;     /* vnode being queried */
        int a_name;             /* _PC_* selector */
        register_t *a_retval;   /* receives the answer on success */
} *ap;
#endif
5043
5044 static int
5045 zfs_freebsd_pathconf(struct vop_pathconf_args *ap)
5046 {
5047         ulong_t val;
5048         int error;
5049
5050         error = zfs_pathconf(ap->a_vp, ap->a_name, &val,
5051             curthread->td_ucred, NULL);
5052         if (error == 0) {
5053                 *ap->a_retval = val;
5054                 return (error);
5055         }
5056         if (error != EOPNOTSUPP)
5057                 return (error);
5058
5059         switch (ap->a_name) {
5060         case _PC_NAME_MAX:
5061                 *ap->a_retval = NAME_MAX;
5062                 return (0);
5063 #if __FreeBSD_version >= 1400032
5064         case _PC_DEALLOC_PRESENT:
5065                 *ap->a_retval = 1;
5066                 return (0);
5067 #endif
5068         case _PC_PIPE_BUF:
5069                 if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO) {
5070                         *ap->a_retval = PIPE_BUF;
5071                         return (0);
5072                 }
5073                 return (EINVAL);
5074         default:
5075                 return (vop_stdpathconf(ap));
5076         }
5077 }
5078
/*
 * Selects which user-namespace name encoding the extended attribute
 * operations in this file use first: it is passed as the "compat" argument
 * of zfs_create_attrname(), where non-zero means no prefix and zero means
 * the "user." prefix.
 */
static int zfs_xattr_compat = 1;
5080
/*
 * Validate a caller-supplied attribute name.
 *
 * Returns EINVAL when the name contains a '/' character or starts with a
 * namespace prefix (ZFS_XA_NS_PREFIX_FORBIDDEN), and 0 otherwise.
 */
static int
zfs_check_attrname(const char *name)
{
        if (strchr(name, '/') != NULL || ZFS_XA_NS_PREFIX_FORBIDDEN(name))
                return (SET_ERROR(EINVAL));

        return (0);
}
5092
5093 /*
5094  * FreeBSD's extended attributes namespace defines file name prefix for ZFS'
5095  * extended attribute name:
5096  *
5097  *      NAMESPACE       XATTR_COMPAT    PREFIX
5098  *      system          *               freebsd:system:
5099  *      user            1               (none, can be used to access ZFS
5100  *                                      fsattr(5) attributes created on Solaris)
5101  *      user            0               user.
5102  */
5103 static int
5104 zfs_create_attrname(int attrnamespace, const char *name, char *attrname,
5105     size_t size, boolean_t compat)
5106 {
5107         const char *namespace, *prefix, *suffix;
5108
5109         memset(attrname, 0, size);
5110
5111         switch (attrnamespace) {
5112         case EXTATTR_NAMESPACE_USER:
5113                 if (compat) {
5114                         /*
5115                          * This is the default namespace by which we can access
5116                          * all attributes created on Solaris.
5117                          */
5118                         prefix = namespace = suffix = "";
5119                 } else {
5120                         /*
5121                          * This is compatible with the user namespace encoding
5122                          * on Linux prior to xattr_compat, but nothing
5123                          * else.
5124                          */
5125                         prefix = "";
5126                         namespace = "user";
5127                         suffix = ".";
5128                 }
5129                 break;
5130         case EXTATTR_NAMESPACE_SYSTEM:
5131                 prefix = "freebsd:";
5132                 namespace = EXTATTR_NAMESPACE_SYSTEM_STRING;
5133                 suffix = ":";
5134                 break;
5135         case EXTATTR_NAMESPACE_EMPTY:
5136         default:
5137                 return (SET_ERROR(EINVAL));
5138         }
5139         if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix,
5140             name) >= size) {
5141                 return (SET_ERROR(ENAMETOOLONG));
5142         }
5143         return (0);
5144 }
5145
/*
 * Make sure zp->z_xattr_cached is populated, loading it with
 * zfs_sa_get_xattr() when it is still NULL.
 *
 * The caller must hold zp->z_xattr_lock.  When the lock is held as writer
 * it stays held as writer.  Otherwise it is upgraded to writer for the
 * load (dropped and re-taken if the upgrade attempt fails) and downgraded
 * again before returning.
 *
 * Returns 0 when the cache was already present or was loaded, otherwise
 * the error from zfs_sa_get_xattr().
 */
static int
zfs_ensure_xattr_cached(znode_t *zp)
{
        int error = 0;

        ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock));

        /* Fast path: nothing to load. */
        if (zp->z_xattr_cached != NULL)
                return (0);

        /* Already the writer: load directly, lock state is left alone. */
        if (rw_write_held(&zp->z_xattr_lock))
                return (zfs_sa_get_xattr(zp));

        if (!rw_tryupgrade(&zp->z_xattr_lock)) {
                /* The upgrade failed, so give the lock up and re-take it. */
                rw_exit(&zp->z_xattr_lock);
                rw_enter(&zp->z_xattr_lock, RW_WRITER);
        }
        /*
         * Check again: the lock may have been released above, so the cache
         * can have been filled in the meantime.
         */
        if (zp->z_xattr_cached == NULL)
                error = zfs_sa_get_xattr(zp);
        rw_downgrade(&zp->z_xattr_lock);
        return (error);
}
5168
/*
 * Reference copy of the argument structure taken by zfs_getextattr() and
 * its helpers; compiled only when _SYS_SYSPROTO_H_ is not defined.  The
 * trailing comments give the direction of each argument.
 */
#ifndef _SYS_SYSPROTO_H_
struct vop_getextattr_args {
        struct vnode *a_vp;             /* IN */
        int a_attrnamespace;            /* IN */
        const char *a_name;             /* IN */
        struct uio *a_uio;              /* INOUT */
        size_t *a_size;                 /* OUT */
        struct ucred *a_cred;           /* IN */
        struct thread *a_td;            /* IN */
};
#endif
5180
5181 static int
5182 zfs_getextattr_dir(struct vop_getextattr_args *ap, const char *attrname)
5183 {
5184         struct thread *td = ap->a_td;
5185         struct nameidata nd;
5186         struct vattr va;
5187         vnode_t *xvp = NULL, *vp;
5188         int error, flags;
5189
5190         error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred,
5191             LOOKUP_XATTR, B_FALSE);
5192         if (error != 0)
5193                 return (error);
5194
5195         flags = FREAD;
5196 #if __FreeBSD_version < 1400043
5197         NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
5198             xvp, td);
5199 #else
5200         NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, xvp);
5201 #endif
5202         error = vn_open_cred(&nd, &flags, 0, VN_OPEN_INVFS, ap->a_cred, NULL);
5203         if (error != 0)
5204                 return (SET_ERROR(error));
5205         vp = nd.ni_vp;
5206         NDFREE_PNBUF(&nd);
5207
5208         if (ap->a_size != NULL) {
5209                 error = VOP_GETATTR(vp, &va, ap->a_cred);
5210                 if (error == 0)
5211                         *ap->a_size = (size_t)va.va_size;
5212         } else if (ap->a_uio != NULL)
5213                 error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred);
5214
5215         VOP_UNLOCK(vp);
5216         vn_close(vp, flags, ap->a_cred, td);
5217         return (error);
5218 }
5219
5220 static int
5221 zfs_getextattr_sa(struct vop_getextattr_args *ap, const char *attrname)
5222 {
5223         znode_t *zp = VTOZ(ap->a_vp);
5224         uchar_t *nv_value;
5225         uint_t nv_size;
5226         int error;
5227
5228         error = zfs_ensure_xattr_cached(zp);
5229         if (error != 0)
5230                 return (error);
5231
5232         ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock));
5233         ASSERT3P(zp->z_xattr_cached, !=, NULL);
5234
5235         error = nvlist_lookup_byte_array(zp->z_xattr_cached, attrname,
5236             &nv_value, &nv_size);
5237         if (error != 0)
5238                 return (SET_ERROR(error));
5239
5240         if (ap->a_size != NULL)
5241                 *ap->a_size = nv_size;
5242         else if (ap->a_uio != NULL)
5243                 error = uiomove(nv_value, nv_size, ap->a_uio);
5244         if (error != 0)
5245                 return (SET_ERROR(error));
5246
5247         return (0);
5248 }
5249
5250 static int
5251 zfs_getextattr_impl(struct vop_getextattr_args *ap, boolean_t compat)
5252 {
5253         znode_t *zp = VTOZ(ap->a_vp);
5254         zfsvfs_t *zfsvfs = ZTOZSB(zp);
5255         char attrname[EXTATTR_MAXNAMELEN+1];
5256         int error;
5257
5258         error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
5259             sizeof (attrname), compat);
5260         if (error != 0)
5261                 return (error);
5262
5263         error = ENOENT;
5264         if (zfsvfs->z_use_sa && zp->z_is_sa)
5265                 error = zfs_getextattr_sa(ap, attrname);
5266         if (error == ENOENT)
5267                 error = zfs_getextattr_dir(ap, attrname);
5268         return (error);
5269 }
5270
5271 /*
5272  * Vnode operation to retrieve a named extended attribute.
5273  */
5274 static int
5275 zfs_getextattr(struct vop_getextattr_args *ap)
5276 {
5277         znode_t *zp = VTOZ(ap->a_vp);
5278         zfsvfs_t *zfsvfs = ZTOZSB(zp);
5279         int error;
5280
5281         /*
5282          * If the xattr property is off, refuse the request.
5283          */
5284         if (!(zfsvfs->z_flags & ZSB_XATTR))
5285                 return (SET_ERROR(EOPNOTSUPP));
5286
5287         error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5288             ap->a_cred, ap->a_td, VREAD);
5289         if (error != 0)
5290                 return (SET_ERROR(error));
5291
5292         error = zfs_check_attrname(ap->a_name);
5293         if (error != 0)
5294                 return (error);
5295
5296         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
5297                 return (error);
5298         error = ENOENT;
5299         rw_enter(&zp->z_xattr_lock, RW_READER);
5300
5301         error = zfs_getextattr_impl(ap, zfs_xattr_compat);
5302         if ((error == ENOENT || error == ENOATTR) &&
5303             ap->a_attrnamespace == EXTATTR_NAMESPACE_USER) {
5304                 /*
5305                  * Fall back to the alternate namespace format if we failed to
5306                  * find a user xattr.
5307                  */
5308                 error = zfs_getextattr_impl(ap, !zfs_xattr_compat);
5309         }
5310
5311         rw_exit(&zp->z_xattr_lock);
5312         zfs_exit(zfsvfs, FTAG);
5313         if (error == ENOENT)
5314                 error = SET_ERROR(ENOATTR);
5315         return (error);
5316 }
5317
/*
 * Reference copy of the argument structure taken by zfs_deleteextattr()
 * and its helpers; compiled only when _SYS_SYSPROTO_H_ is not defined.
 * The trailing comments give the direction of each argument.
 */
#ifndef _SYS_SYSPROTO_H_
struct vop_deleteextattr_args {
        struct vnode *a_vp;             /* IN */
        int a_attrnamespace;            /* IN */
        const char *a_name;             /* IN */
        struct ucred *a_cred;           /* IN */
        struct thread *a_td;            /* IN */
};
#endif
5327
5328 static int
5329 zfs_deleteextattr_dir(struct vop_deleteextattr_args *ap, const char *attrname)
5330 {
5331         struct nameidata nd;
5332         vnode_t *xvp = NULL, *vp;
5333         int error;
5334
5335         error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred,
5336             LOOKUP_XATTR, B_FALSE);
5337         if (error != 0)
5338                 return (error);
5339
5340 #if __FreeBSD_version < 1400043
5341         NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF,
5342             UIO_SYSSPACE, attrname, xvp, ap->a_td);
5343 #else
5344         NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF,
5345             UIO_SYSSPACE, attrname, xvp);
5346 #endif
5347         error = namei(&nd);
5348         if (error != 0)
5349                 return (SET_ERROR(error));
5350
5351         vp = nd.ni_vp;
5352         error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
5353         NDFREE_PNBUF(&nd);
5354
5355         vput(nd.ni_dvp);
5356         if (vp == nd.ni_dvp)
5357                 vrele(vp);
5358         else
5359                 vput(vp);
5360
5361         return (error);
5362 }
5363
5364 static int
5365 zfs_deleteextattr_sa(struct vop_deleteextattr_args *ap, const char *attrname)
5366 {
5367         znode_t *zp = VTOZ(ap->a_vp);
5368         nvlist_t *nvl;
5369         int error;
5370
5371         error = zfs_ensure_xattr_cached(zp);
5372         if (error != 0)
5373                 return (error);
5374
5375         ASSERT(RW_WRITE_HELD(&zp->z_xattr_lock));
5376         ASSERT3P(zp->z_xattr_cached, !=, NULL);
5377
5378         nvl = zp->z_xattr_cached;
5379         error = nvlist_remove(nvl, attrname, DATA_TYPE_BYTE_ARRAY);
5380         if (error != 0)
5381                 error = SET_ERROR(error);
5382         else
5383                 error = zfs_sa_set_xattr(zp, attrname, NULL, 0);
5384         if (error != 0) {
5385                 zp->z_xattr_cached = NULL;
5386                 nvlist_free(nvl);
5387         }
5388         return (error);
5389 }
5390
5391 static int
5392 zfs_deleteextattr_impl(struct vop_deleteextattr_args *ap, boolean_t compat)
5393 {
5394         znode_t *zp = VTOZ(ap->a_vp);
5395         zfsvfs_t *zfsvfs = ZTOZSB(zp);
5396         char attrname[EXTATTR_MAXNAMELEN+1];
5397         int error;
5398
5399         error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
5400             sizeof (attrname), compat);
5401         if (error != 0)
5402                 return (error);
5403
5404         error = ENOENT;
5405         if (zfsvfs->z_use_sa && zp->z_is_sa)
5406                 error = zfs_deleteextattr_sa(ap, attrname);
5407         if (error == ENOENT)
5408                 error = zfs_deleteextattr_dir(ap, attrname);
5409         return (error);
5410 }
5411
5412 /*
5413  * Vnode operation to remove a named attribute.
5414  */
5415 static int
5416 zfs_deleteextattr(struct vop_deleteextattr_args *ap)
5417 {
5418         znode_t *zp = VTOZ(ap->a_vp);
5419         zfsvfs_t *zfsvfs = ZTOZSB(zp);
5420         int error;
5421
5422         /*
5423          * If the xattr property is off, refuse the request.
5424          */
5425         if (!(zfsvfs->z_flags & ZSB_XATTR))
5426                 return (SET_ERROR(EOPNOTSUPP));
5427
5428         error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5429             ap->a_cred, ap->a_td, VWRITE);
5430         if (error != 0)
5431                 return (SET_ERROR(error));
5432
5433         error = zfs_check_attrname(ap->a_name);
5434         if (error != 0)
5435                 return (error);
5436
5437         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
5438                 return (error);
5439         rw_enter(&zp->z_xattr_lock, RW_WRITER);
5440
5441         error = zfs_deleteextattr_impl(ap, zfs_xattr_compat);
5442         if ((error == ENOENT || error == ENOATTR) &&
5443             ap->a_attrnamespace == EXTATTR_NAMESPACE_USER) {
5444                 /*
5445                  * Fall back to the alternate namespace format if we failed to
5446                  * find a user xattr.
5447                  */
5448                 error = zfs_deleteextattr_impl(ap, !zfs_xattr_compat);
5449         }
5450
5451         rw_exit(&zp->z_xattr_lock);
5452         zfs_exit(zfsvfs, FTAG);
5453         if (error == ENOENT)
5454                 error = SET_ERROR(ENOATTR);
5455         return (error);
5456 }
5457
/*
 * Reference copy of the argument structure taken by zfs_setextattr() and
 * its helpers; compiled only when _SYS_SYSPROTO_H_ is not defined.  The
 * trailing comments give the direction of each argument.
 */
#ifndef _SYS_SYSPROTO_H_
struct vop_setextattr_args {
        struct vnode *a_vp;             /* IN */
        int a_attrnamespace;            /* IN */
        const char *a_name;             /* IN */
        struct uio *a_uio;              /* INOUT */
        struct ucred *a_cred;           /* IN */
        struct thread *a_td;            /* IN */
};
#endif
5468
5469 static int
5470 zfs_setextattr_dir(struct vop_setextattr_args *ap, const char *attrname)
5471 {
5472         struct thread *td = ap->a_td;
5473         struct nameidata nd;
5474         struct vattr va;
5475         vnode_t *xvp = NULL, *vp;
5476         int error, flags;
5477
5478         error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred,
5479             LOOKUP_XATTR | CREATE_XATTR_DIR, B_FALSE);
5480         if (error != 0)
5481                 return (error);
5482
5483         flags = FFLAGS(O_WRONLY | O_CREAT);
5484 #if __FreeBSD_version < 1400043
5485         NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, xvp, td);
5486 #else
5487         NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, xvp);
5488 #endif
5489         error = vn_open_cred(&nd, &flags, 0600, VN_OPEN_INVFS, ap->a_cred,
5490             NULL);
5491         if (error != 0)
5492                 return (SET_ERROR(error));
5493         vp = nd.ni_vp;
5494         NDFREE_PNBUF(&nd);
5495
5496         VATTR_NULL(&va);
5497         va.va_size = 0;
5498         error = VOP_SETATTR(vp, &va, ap->a_cred);
5499         if (error == 0)
5500                 VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred);
5501
5502         VOP_UNLOCK(vp);
5503         vn_close(vp, flags, ap->a_cred, td);
5504         return (error);
5505 }
5506
5507 static int
5508 zfs_setextattr_sa(struct vop_setextattr_args *ap, const char *attrname)
5509 {
5510         znode_t *zp = VTOZ(ap->a_vp);
5511         nvlist_t *nvl;
5512         size_t sa_size;
5513         int error;
5514
5515         error = zfs_ensure_xattr_cached(zp);
5516         if (error != 0)
5517                 return (error);
5518
5519         ASSERT(RW_WRITE_HELD(&zp->z_xattr_lock));
5520         ASSERT3P(zp->z_xattr_cached, !=, NULL);
5521
5522         nvl = zp->z_xattr_cached;
5523         size_t entry_size = ap->a_uio->uio_resid;
5524         if (entry_size > DXATTR_MAX_ENTRY_SIZE)
5525                 return (SET_ERROR(EFBIG));
5526         error = nvlist_size(nvl, &sa_size, NV_ENCODE_XDR);
5527         if (error != 0)
5528                 return (SET_ERROR(error));
5529         if (sa_size > DXATTR_MAX_SA_SIZE)
5530                 return (SET_ERROR(EFBIG));
5531         uchar_t *buf = kmem_alloc(entry_size, KM_SLEEP);
5532         error = uiomove(buf, entry_size, ap->a_uio);
5533         if (error != 0) {
5534                 error = SET_ERROR(error);
5535         } else {
5536                 error = nvlist_add_byte_array(nvl, attrname, buf, entry_size);
5537                 if (error != 0)
5538                         error = SET_ERROR(error);
5539         }
5540         if (error == 0)
5541                 error = zfs_sa_set_xattr(zp, attrname, buf, entry_size);
5542         kmem_free(buf, entry_size);
5543         if (error != 0) {
5544                 zp->z_xattr_cached = NULL;
5545                 nvlist_free(nvl);
5546         }
5547         return (error);
5548 }
5549
5550 static int
5551 zfs_setextattr_impl(struct vop_setextattr_args *ap, boolean_t compat)
5552 {
5553         znode_t *zp = VTOZ(ap->a_vp);
5554         zfsvfs_t *zfsvfs = ZTOZSB(zp);
5555         char attrname[EXTATTR_MAXNAMELEN+1];
5556         int error;
5557
5558         error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
5559             sizeof (attrname), compat);
5560         if (error != 0)
5561                 return (error);
5562
5563         struct vop_deleteextattr_args vda = {
5564                 .a_vp = ap->a_vp,
5565                 .a_attrnamespace = ap->a_attrnamespace,
5566                 .a_name = ap->a_name,
5567                 .a_cred = ap->a_cred,
5568                 .a_td = ap->a_td,
5569         };
5570         error = ENOENT;
5571         if (zfsvfs->z_use_sa && zp->z_is_sa && zfsvfs->z_xattr_sa) {
5572                 error = zfs_setextattr_sa(ap, attrname);
5573                 if (error == 0) {
5574                         /*
5575                          * Successfully put into SA, we need to clear the one
5576                          * in dir if present.
5577                          */
5578                         zfs_deleteextattr_dir(&vda, attrname);
5579                 }
5580         }
5581         if (error != 0) {
5582                 error = zfs_setextattr_dir(ap, attrname);
5583                 if (error == 0 && zp->z_is_sa) {
5584                         /*
5585                          * Successfully put into dir, we need to clear the one
5586                          * in SA if present.
5587                          */
5588                         zfs_deleteextattr_sa(&vda, attrname);
5589                 }
5590         }
5591         if (error == 0 && ap->a_attrnamespace == EXTATTR_NAMESPACE_USER) {
5592                 /*
5593                  * Also clear all versions of the alternate compat name.
5594                  */
5595                 zfs_deleteextattr_impl(&vda, !compat);
5596         }
5597         return (error);
5598 }
5599
5600 /*
5601  * Vnode operation to set a named attribute.
5602  */
5603 static int
5604 zfs_setextattr(struct vop_setextattr_args *ap)
5605 {
5606         znode_t *zp = VTOZ(ap->a_vp);
5607         zfsvfs_t *zfsvfs = ZTOZSB(zp);
5608         int error;
5609
5610         /*
5611          * If the xattr property is off, refuse the request.
5612          */
5613         if (!(zfsvfs->z_flags & ZSB_XATTR))
5614                 return (SET_ERROR(EOPNOTSUPP));
5615
5616         error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5617             ap->a_cred, ap->a_td, VWRITE);
5618         if (error != 0)
5619                 return (SET_ERROR(error));
5620
5621         error = zfs_check_attrname(ap->a_name);
5622         if (error != 0)
5623                 return (error);
5624
5625         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
5626                 return (error);
5627         rw_enter(&zp->z_xattr_lock, RW_WRITER);
5628
5629         error = zfs_setextattr_impl(ap, zfs_xattr_compat);
5630
5631         rw_exit(&zp->z_xattr_lock);
5632         zfs_exit(zfsvfs, FTAG);
5633         return (error);
5634 }
5635
/*
 * Reference copy of the argument structure taken by zfs_listextattr() and
 * its helpers; compiled only when _SYS_SYSPROTO_H_ is not defined.  The
 * trailing comments give the direction of each argument.
 */
#ifndef _SYS_SYSPROTO_H_
struct vop_listextattr_args {
        struct vnode *a_vp;             /* IN */
        int a_attrnamespace;            /* IN */
        struct uio *a_uio;              /* INOUT */
        size_t *a_size;                 /* OUT */
        struct ucred *a_cred;           /* IN */
        struct thread *a_td;            /* IN */
};
#endif
5646
/*
 * List the extended attributes stored as files in the vnode's xattr
 * directory whose names begin with attrprefix.
 *
 * The directory is read with VOP_READDIR() one dirbuf at a time.  For each
 * matching entry either the space it needs (one length byte plus the name
 * with the prefix removed) is added to *ap->a_size, or, when a_size is
 * NULL and a_uio is not, that length byte and name are copied out through
 * ap->a_uio.
 *
 * Returns 0 on success, including when zfs_lookup() reports ENOATTR for
 * the xattr directory; otherwise the error from zfs_lookup(), namei(),
 * VOP_READDIR() or uiomove().
 */
static int
zfs_listextattr_dir(struct vop_listextattr_args *ap, const char *attrprefix)
{
        struct thread *td = ap->a_td;
        struct nameidata nd;
        uint8_t dirbuf[sizeof (struct dirent)];
        struct iovec aiov;
        struct uio auio;
        vnode_t *xvp = NULL, *vp;
        int error, eof;

        error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred,
            LOOKUP_XATTR, B_FALSE);
        if (error != 0) {
                /*
                 * ENOATTR means that the EA directory does not yet exist,
                 * i.e. there are no extended attributes there.
                 */
                if (error == ENOATTR)
                        error = 0;
                return (error);
        }

        /* Resolve "." relative to the xattr directory, share-locked. */
#if __FreeBSD_version < 1400043
        NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED,
            UIO_SYSSPACE, ".", xvp, td);
#else
        NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED,
            UIO_SYSSPACE, ".", xvp);
#endif
        error = namei(&nd);
        if (error != 0)
                return (SET_ERROR(error));
        vp = nd.ni_vp;
        NDFREE_PNBUF(&nd);

        /* Kernel-space uio describing dirbuf; the offset starts at 0. */
        auio.uio_iov = &aiov;
        auio.uio_iovcnt = 1;
        auio.uio_segflg = UIO_SYSSPACE;
        auio.uio_td = td;
        auio.uio_rw = UIO_READ;
        auio.uio_offset = 0;

        size_t plen = strlen(attrprefix);

        do {
                /* Point the uio back at the start of dirbuf for each read. */
                aiov.iov_base = (void *)dirbuf;
                aiov.iov_len = sizeof (dirbuf);
                auio.uio_resid = sizeof (dirbuf);
                error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL);
                if (error != 0)
                        break;
                /* Number of bytes VOP_READDIR() placed in dirbuf. */
                int done = sizeof (dirbuf) - auio.uio_resid;
                for (int pos = 0; pos < done; ) {
                        struct dirent *dp = (struct dirent *)(dirbuf + pos);
                        /* Advance first so "continue" moves to the next one. */
                        pos += dp->d_reclen;
                        /*
                         * XXX: Temporarily we also accept DT_UNKNOWN, as this
                         * is what we get when attribute was created on Solaris.
                         */
                        if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN)
                                continue;
                        else if (plen == 0 &&
                            ZFS_XA_NS_PREFIX_FORBIDDEN(dp->d_name))
                                continue;
                        else if (strncmp(dp->d_name, attrprefix, plen) != 0)
                                continue;
                        /* Length of the name once the prefix is stripped. */
                        uint8_t nlen = dp->d_namlen - plen;
                        if (ap->a_size != NULL) {
                                *ap->a_size += 1 + nlen;
                        } else if (ap->a_uio != NULL) {
                                /*
                                 * Format of extattr name entry is one byte for
                                 * length and the rest for name.
                                 */
                                error = uiomove(&nlen, 1, ap->a_uio);
                                if (error == 0) {
                                        char *namep = dp->d_name + plen;
                                        error = uiomove(namep, nlen, ap->a_uio);
                                }
                                if (error != 0) {
                                        error = SET_ERROR(error);
                                        break;
                                }
                        }
                }
        } while (!eof && error == 0);

        vput(vp);
        return (error);
}
5738
5739 static int
5740 zfs_listextattr_sa(struct vop_listextattr_args *ap, const char *attrprefix)
5741 {
5742         znode_t *zp = VTOZ(ap->a_vp);
5743         int error;
5744
5745         error = zfs_ensure_xattr_cached(zp);
5746         if (error != 0)
5747                 return (error);
5748
5749         ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock));
5750         ASSERT3P(zp->z_xattr_cached, !=, NULL);
5751
5752         size_t plen = strlen(attrprefix);
5753         nvpair_t *nvp = NULL;
5754         while ((nvp = nvlist_next_nvpair(zp->z_xattr_cached, nvp)) != NULL) {
5755                 ASSERT3U(nvpair_type(nvp), ==, DATA_TYPE_BYTE_ARRAY);
5756
5757                 const char *name = nvpair_name(nvp);
5758                 if (plen == 0 && ZFS_XA_NS_PREFIX_FORBIDDEN(name))
5759                         continue;
5760                 else if (strncmp(name, attrprefix, plen) != 0)
5761                         continue;
5762                 uint8_t nlen = strlen(name) - plen;
5763                 if (ap->a_size != NULL) {
5764                         *ap->a_size += 1 + nlen;
5765                 } else if (ap->a_uio != NULL) {
5766                         /*
5767                          * Format of extattr name entry is one byte for
5768                          * length and the rest for name.
5769                          */
5770                         error = uiomove(&nlen, 1, ap->a_uio);
5771                         if (error == 0) {
5772                                 char *namep = __DECONST(char *, name) + plen;
5773                                 error = uiomove(namep, nlen, ap->a_uio);
5774                         }
5775                         if (error != 0) {
5776                                 error = SET_ERROR(error);
5777                                 break;
5778                         }
5779                 }
5780         }
5781
5782         return (error);
5783 }
5784
5785 static int
5786 zfs_listextattr_impl(struct vop_listextattr_args *ap, boolean_t compat)
5787 {
5788         znode_t *zp = VTOZ(ap->a_vp);
5789         zfsvfs_t *zfsvfs = ZTOZSB(zp);
5790         char attrprefix[16];
5791         int error;
5792
5793         error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix,
5794             sizeof (attrprefix), compat);
5795         if (error != 0)
5796                 return (error);
5797
5798         if (zfsvfs->z_use_sa && zp->z_is_sa)
5799                 error = zfs_listextattr_sa(ap, attrprefix);
5800         if (error == 0)
5801                 error = zfs_listextattr_dir(ap, attrprefix);
5802         return (error);
5803 }
5804
5805 /*
5806  * Vnode operation to retrieve extended attributes on a vnode.
5807  */
5808 static int
5809 zfs_listextattr(struct vop_listextattr_args *ap)
5810 {
5811         znode_t *zp = VTOZ(ap->a_vp);
5812         zfsvfs_t *zfsvfs = ZTOZSB(zp);
5813         int error;
5814
5815         if (ap->a_size != NULL)
5816                 *ap->a_size = 0;
5817
5818         /*
5819          * If the xattr property is off, refuse the request.
5820          */
5821         if (!(zfsvfs->z_flags & ZSB_XATTR))
5822                 return (SET_ERROR(EOPNOTSUPP));
5823
5824         error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5825             ap->a_cred, ap->a_td, VREAD);
5826         if (error != 0)
5827                 return (SET_ERROR(error));
5828
5829         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
5830                 return (error);
5831         rw_enter(&zp->z_xattr_lock, RW_READER);
5832
5833         error = zfs_listextattr_impl(ap, zfs_xattr_compat);
5834         if (error == 0 && ap->a_attrnamespace == EXTATTR_NAMESPACE_USER) {
5835                 /* Also list user xattrs with the alternate format. */
5836                 error = zfs_listextattr_impl(ap, !zfs_xattr_compat);
5837         }
5838
5839         rw_exit(&zp->z_xattr_lock);
5840         zfs_exit(zfsvfs, FTAG);
5841         return (error);
5842 }
5843
5844 #ifndef _SYS_SYSPROTO_H_
5845 struct vop_getacl_args {
5846         struct vnode *vp;
5847         acl_type_t type;
5848         struct acl *aclp;
5849         struct ucred *cred;
5850         struct thread *td;
5851 };
5852 #endif
5853
5854 static int
5855 zfs_freebsd_getacl(struct vop_getacl_args *ap)
5856 {
5857         int             error;
5858         vsecattr_t      vsecattr;
5859
5860         if (ap->a_type != ACL_TYPE_NFS4)
5861                 return (EINVAL);
5862
5863         vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT;
5864         if ((error = zfs_getsecattr(VTOZ(ap->a_vp),
5865             &vsecattr, 0, ap->a_cred)))
5866                 return (error);
5867
5868         error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp,
5869             vsecattr.vsa_aclcnt);
5870         if (vsecattr.vsa_aclentp != NULL)
5871                 kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz);
5872
5873         return (error);
5874 }
5875
5876 #ifndef _SYS_SYSPROTO_H_
5877 struct vop_setacl_args {
5878         struct vnode *vp;
5879         acl_type_t type;
5880         struct acl *aclp;
5881         struct ucred *cred;
5882         struct thread *td;
5883 };
5884 #endif
5885
5886 static int
5887 zfs_freebsd_setacl(struct vop_setacl_args *ap)
5888 {
5889         int             error;
5890         vsecattr_t vsecattr;
5891         int             aclbsize;       /* size of acl list in bytes */
5892         aclent_t        *aaclp;
5893
5894         if (ap->a_type != ACL_TYPE_NFS4)
5895                 return (EINVAL);
5896
5897         if (ap->a_aclp == NULL)
5898                 return (EINVAL);
5899
5900         if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES)
5901                 return (EINVAL);
5902
5903         /*
5904          * With NFSv4 ACLs, chmod(2) may need to add additional entries,
5905          * splitting every entry into two and appending "canonical six"
5906          * entries at the end.  Don't allow for setting an ACL that would
5907          * cause chmod(2) to run out of ACL entries.
5908          */
5909         if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES)
5910                 return (ENOSPC);
5911
5912         error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR);
5913         if (error != 0)
5914                 return (error);
5915
5916         vsecattr.vsa_mask = VSA_ACE;
5917         aclbsize = ap->a_aclp->acl_cnt * sizeof (ace_t);
5918         vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP);
5919         aaclp = vsecattr.vsa_aclentp;
5920         vsecattr.vsa_aclentsz = aclbsize;
5921
5922         aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp);
5923         error = zfs_setsecattr(VTOZ(ap->a_vp), &vsecattr, 0, ap->a_cred);
5924         kmem_free(aaclp, aclbsize);
5925
5926         return (error);
5927 }
5928
5929 #ifndef _SYS_SYSPROTO_H_
5930 struct vop_aclcheck_args {
5931         struct vnode *vp;
5932         acl_type_t type;
5933         struct acl *aclp;
5934         struct ucred *cred;
5935         struct thread *td;
5936 };
5937 #endif
5938
/*
 * VOP_ACLCHECK handler.
 *
 * The arguments are not examined: every request is answered with
 * EOPNOTSUPP.
 */
static int
zfs_freebsd_aclcheck(struct vop_aclcheck_args *ap)
{
	(void) ap;

	return (EOPNOTSUPP);
}
5945
5946 static int
5947 zfs_vptocnp(struct vop_vptocnp_args *ap)
5948 {
5949         vnode_t *covered_vp;
5950         vnode_t *vp = ap->a_vp;
5951         zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
5952         znode_t *zp = VTOZ(vp);
5953         int ltype;
5954         int error;
5955
5956         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
5957                 return (error);
5958
5959         /*
5960          * If we are a snapshot mounted under .zfs, run the operation
5961          * on the covered vnode.
5962          */
5963         if (zp->z_id != zfsvfs->z_root || zfsvfs->z_parent == zfsvfs) {
5964                 char name[MAXNAMLEN + 1];
5965                 znode_t *dzp;
5966                 size_t len;
5967
5968                 error = zfs_znode_parent_and_name(zp, &dzp, name);
5969                 if (error == 0) {
5970                         len = strlen(name);
5971                         if (*ap->a_buflen < len)
5972                                 error = SET_ERROR(ENOMEM);
5973                 }
5974                 if (error == 0) {
5975                         *ap->a_buflen -= len;
5976                         memcpy(ap->a_buf + *ap->a_buflen, name, len);
5977                         *ap->a_vpp = ZTOV(dzp);
5978                 }
5979                 zfs_exit(zfsvfs, FTAG);
5980                 return (error);
5981         }
5982         zfs_exit(zfsvfs, FTAG);
5983
5984         covered_vp = vp->v_mount->mnt_vnodecovered;
5985         enum vgetstate vs = vget_prep(covered_vp);
5986         ltype = VOP_ISLOCKED(vp);
5987         VOP_UNLOCK(vp);
5988         error = vget_finish(covered_vp, LK_SHARED, vs);
5989         if (error == 0) {
5990                 error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_buf,
5991                     ap->a_buflen);
5992                 vput(covered_vp);
5993         }
5994         vn_lock(vp, ltype | LK_RETRY);
5995         if (VN_IS_DOOMED(vp))
5996                 error = SET_ERROR(ENOENT);
5997         return (error);
5998 }
5999
#if __FreeBSD_version >= 1400032
/*
 * VOP_DEALLOCATE handler: free the byte range [*a_offset, *a_offset + *a_len)
 * of the file, clamped to the current file size.
 *
 * On success *a_offset is advanced past the freed range and *a_len is set
 * to 0.  A request that lies entirely at or beyond end of file succeeds
 * without touching the file and only zeroes *a_len.
 *
 * Returns 0 on success, EROFS on a read-only filesystem, or the error from
 * zfs_enter_verify_zp() or zfs_freesp().
 */
static int
zfs_deallocate(struct vop_deallocate_args *ap)
{
	znode_t *zp = VTOZ(ap->a_vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	zilog_t *zilog;
	off_t start, nbytes, eof;
	int rc;

	rc = zfs_enter_verify_zp(zfsvfs, zp, FTAG);
	if (rc != 0)
		return (rc);

	/*
	 * Callers might not be able to detect properly that we are read-only,
	 * so check it explicitly here.
	 */
	if (zfs_is_readonly(zfsvfs)) {
		rc = SET_ERROR(EROFS);
		goto out;
	}

	zilog = zfsvfs->z_log;
	start = *ap->a_offset;
	nbytes = *ap->a_len;
	eof = zp->z_size;

	/* Never free past the current end of file. */
	if (start + nbytes > eof)
		nbytes = eof - start;

	/* Fast path for out-of-range request. */
	if (nbytes <= 0) {
		*ap->a_len = 0;
		rc = 0;
		goto out;
	}

	rc = zfs_freesp(zp, start, nbytes, O_RDWR, TRUE);
	if (rc != 0)
		goto out;

	/* Commit the ZIL when the dataset or the caller asks for sync I/O. */
	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS ||
	    (ap->a_ioflag & IO_SYNC) != 0)
		zil_commit(zilog, zp->z_id);

	*ap->a_offset = start + nbytes;
	*ap->a_len = 0;

out:
	zfs_exit(zfsvfs, FTAG);
	return (rc);
}
#endif
6048
#ifndef _SYS_SYSPROTO_H_
/*
 * Reference layout of the argument block received by
 * zfs_freebsd_copy_file_range().  It is only compiled when
 * _SYS_SYSPROTO_H_ is not defined.
 */
struct vop_copy_file_range_args {
	struct vnode *a_invp;
	off_t *a_inoffp;
	struct vnode *a_outvp;
	off_t *a_outoffp;
	size_t *a_lenp;
	unsigned int a_flags;
	struct ucred *a_incred;
	struct ucred *a_outcred;
	struct thread *a_fsizetd;
};
#endif
/*
 * TODO: FreeBSD will only call the file system-specific copy_file_range() if
 * both files reside under the same mountpoint. In the case of ZFS we want to
 * be called even if the files are in different datasets (they must be in the
 * same pool, but we need to check that ourselves).
 */
6068 static int
6069 zfs_freebsd_copy_file_range(struct vop_copy_file_range_args *ap)
6070 {
6071         zfsvfs_t *outzfsvfs;
6072         struct vnode *invp = ap->a_invp;
6073         struct vnode *outvp = ap->a_outvp;
6074         struct mount *mp;
6075         int error;
6076         uint64_t len = *ap->a_lenp;
6077
6078         if (!zfs_bclone_enabled) {
6079                 mp = NULL;
6080                 goto bad_write_fallback;
6081         }
6082
6083         /*
6084          * TODO: If offset/length is not aligned to recordsize, use
6085          * vn_generic_copy_file_range() on this fragment.
6086          * It would be better to do this after we lock the vnodes, but then we
6087          * need something else than vn_generic_copy_file_range().
6088          */
6089
6090         vn_start_write(outvp, &mp, V_WAIT);
6091         if (__predict_true(mp == outvp->v_mount)) {
6092                 outzfsvfs = (zfsvfs_t *)mp->mnt_data;
6093                 if (!spa_feature_is_enabled(dmu_objset_spa(outzfsvfs->z_os),
6094                     SPA_FEATURE_BLOCK_CLONING)) {
6095                         goto bad_write_fallback;
6096                 }
6097         }
6098         if (invp == outvp) {
6099                 if (vn_lock(outvp, LK_EXCLUSIVE) != 0) {
6100                         goto bad_write_fallback;
6101                 }
6102         } else {
6103 #if (__FreeBSD_version >= 1302506 && __FreeBSD_version < 1400000) || \
6104         __FreeBSD_version >= 1400086
6105                 vn_lock_pair(invp, false, LK_EXCLUSIVE, outvp, false,
6106                     LK_EXCLUSIVE);
6107 #else
6108                 vn_lock_pair(invp, false, outvp, false);
6109 #endif
6110                 if (VN_IS_DOOMED(invp) || VN_IS_DOOMED(outvp)) {
6111                         goto bad_locked_fallback;
6112                 }
6113         }
6114
6115 #ifdef MAC
6116         error = mac_vnode_check_write(curthread->td_ucred, ap->a_outcred,
6117             outvp);
6118         if (error != 0)
6119                 goto out_locked;
6120 #endif
6121
6122         error = zfs_clone_range(VTOZ(invp), ap->a_inoffp, VTOZ(outvp),
6123             ap->a_outoffp, &len, ap->a_outcred);
6124         if (error == EXDEV || error == EAGAIN || error == EINVAL ||
6125             error == EOPNOTSUPP)
6126                 goto bad_locked_fallback;
6127         *ap->a_lenp = (size_t)len;
6128 out_locked:
6129         if (invp != outvp)
6130                 VOP_UNLOCK(invp);
6131         VOP_UNLOCK(outvp);
6132         if (mp != NULL)
6133                 vn_finished_write(mp);
6134         return (error);
6135
6136 bad_locked_fallback:
6137         if (invp != outvp)
6138                 VOP_UNLOCK(invp);
6139         VOP_UNLOCK(outvp);
6140 bad_write_fallback:
6141         if (mp != NULL)
6142                 vn_finished_write(mp);
6143         error = vn_generic_copy_file_range(ap->a_invp, ap->a_inoffp,
6144             ap->a_outvp, ap->a_outoffp, ap->a_lenp, ap->a_flags,
6145             ap->a_incred, ap->a_outcred, ap->a_fsizetd);
6146         return (error);
6147 }
6148
6149 struct vop_vector zfs_vnodeops;
6150 struct vop_vector zfs_fifoops;
6151 struct vop_vector zfs_shareops;
6152
6153 struct vop_vector zfs_vnodeops = {
6154         .vop_default =          &default_vnodeops,
6155         .vop_inactive =         zfs_freebsd_inactive,
6156         .vop_need_inactive =    zfs_freebsd_need_inactive,
6157         .vop_reclaim =          zfs_freebsd_reclaim,
6158         .vop_fplookup_vexec = zfs_freebsd_fplookup_vexec,
6159         .vop_fplookup_symlink = zfs_freebsd_fplookup_symlink,
6160         .vop_access =           zfs_freebsd_access,
6161         .vop_allocate =         VOP_EINVAL,
6162 #if __FreeBSD_version >= 1400032
6163         .vop_deallocate =       zfs_deallocate,
6164 #endif
6165         .vop_lookup =           zfs_cache_lookup,
6166         .vop_cachedlookup =     zfs_freebsd_cachedlookup,
6167         .vop_getattr =          zfs_freebsd_getattr,
6168         .vop_setattr =          zfs_freebsd_setattr,
6169         .vop_create =           zfs_freebsd_create,
6170         .vop_mknod =            (vop_mknod_t *)zfs_freebsd_create,
6171         .vop_mkdir =            zfs_freebsd_mkdir,
6172         .vop_readdir =          zfs_freebsd_readdir,
6173         .vop_fsync =            zfs_freebsd_fsync,
6174         .vop_open =             zfs_freebsd_open,
6175         .vop_close =            zfs_freebsd_close,
6176         .vop_rmdir =            zfs_freebsd_rmdir,
6177         .vop_ioctl =            zfs_freebsd_ioctl,
6178         .vop_link =             zfs_freebsd_link,
6179         .vop_symlink =          zfs_freebsd_symlink,
6180         .vop_readlink =         zfs_freebsd_readlink,
6181         .vop_read =             zfs_freebsd_read,
6182         .vop_write =            zfs_freebsd_write,
6183         .vop_remove =           zfs_freebsd_remove,
6184         .vop_rename =           zfs_freebsd_rename,
6185         .vop_pathconf =         zfs_freebsd_pathconf,
6186         .vop_bmap =             zfs_freebsd_bmap,
6187         .vop_fid =              zfs_freebsd_fid,
6188         .vop_getextattr =       zfs_getextattr,
6189         .vop_deleteextattr =    zfs_deleteextattr,
6190         .vop_setextattr =       zfs_setextattr,
6191         .vop_listextattr =      zfs_listextattr,
6192         .vop_getacl =           zfs_freebsd_getacl,
6193         .vop_setacl =           zfs_freebsd_setacl,
6194         .vop_aclcheck =         zfs_freebsd_aclcheck,
6195         .vop_getpages =         zfs_freebsd_getpages,
6196         .vop_putpages =         zfs_freebsd_putpages,
6197         .vop_vptocnp =          zfs_vptocnp,
6198         .vop_lock1 =            vop_lock,
6199         .vop_unlock =           vop_unlock,
6200         .vop_islocked =         vop_islocked,
6201 #if __FreeBSD_version >= 1400043
6202         .vop_add_writecount =   vop_stdadd_writecount_nomsync,
6203 #endif
6204         .vop_copy_file_range =  zfs_freebsd_copy_file_range,
6205 };
6206 VFS_VOP_VECTOR_REGISTER(zfs_vnodeops);
6207
6208 struct vop_vector zfs_fifoops = {
6209         .vop_default =          &fifo_specops,
6210         .vop_fsync =            zfs_freebsd_fsync,
6211         .vop_fplookup_vexec =   zfs_freebsd_fplookup_vexec,
6212         .vop_fplookup_symlink = zfs_freebsd_fplookup_symlink,
6213         .vop_access =           zfs_freebsd_access,
6214         .vop_getattr =          zfs_freebsd_getattr,
6215         .vop_inactive =         zfs_freebsd_inactive,
6216         .vop_read =             VOP_PANIC,
6217         .vop_reclaim =          zfs_freebsd_reclaim,
6218         .vop_setattr =          zfs_freebsd_setattr,
6219         .vop_write =            VOP_PANIC,
6220         .vop_pathconf =         zfs_freebsd_pathconf,
6221         .vop_fid =              zfs_freebsd_fid,
6222         .vop_getacl =           zfs_freebsd_getacl,
6223         .vop_setacl =           zfs_freebsd_setacl,
6224         .vop_aclcheck =         zfs_freebsd_aclcheck,
6225 #if __FreeBSD_version >= 1400043
6226         .vop_add_writecount =   vop_stdadd_writecount_nomsync,
6227 #endif
6228 };
6229 VFS_VOP_VECTOR_REGISTER(zfs_fifoops);
6230
6231 /*
6232  * special share hidden files vnode operations template
6233  */
6234 struct vop_vector zfs_shareops = {
6235         .vop_default =          &default_vnodeops,
6236         .vop_fplookup_vexec =   VOP_EAGAIN,
6237         .vop_fplookup_symlink = VOP_EAGAIN,
6238         .vop_access =           zfs_freebsd_access,
6239         .vop_inactive =         zfs_freebsd_inactive,
6240         .vop_reclaim =          zfs_freebsd_reclaim,
6241         .vop_fid =              zfs_freebsd_fid,
6242         .vop_pathconf =         zfs_freebsd_pathconf,
6243 #if __FreeBSD_version >= 1400043
6244         .vop_add_writecount =   vop_stdadd_writecount_nomsync,
6245 #endif
6246 };
6247 VFS_VOP_VECTOR_REGISTER(zfs_shareops);
6248
6249 ZFS_MODULE_PARAM(zfs, zfs_, xattr_compat, INT, ZMOD_RW,
6250         "Use legacy ZFS xattr naming for writing new user namespace xattrs");