sys/kern/vfs_syscalls.c

   1 /*      $NetBSD: vfs_syscalls.c,v 1.401 2009/12/23 01:09:24 pooka Exp $ */
   2
   3 /*-
   4  * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
   5  * All rights reserved.
   6  *
   7  * This code is derived from software contributed to The NetBSD Foundation
   8  * by Andrew Doran.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  *
  19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  29  * POSSIBILITY OF SUCH DAMAGE.
  30  */
  31
  32 /*
  33  * Copyright (c) 1989, 1993
  34  *      The Regents of the University of California.  All rights reserved.
  35  * (c) UNIX System Laboratories, Inc.
  36  * All or some portions of this file are derived from material licensed
  37  * to the University of California by American Telephone and Telegraph
  38  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  39  * the permission of UNIX System Laboratories, Inc.
  40  *
  41  * Redistribution and use in source and binary forms, with or without
  42  * modification, are permitted provided that the following conditions
  43  * are met:
  44  * 1. Redistributions of source code must retain the above copyright
  45  *    notice, this list of conditions and the following disclaimer.
  46  * 2. Redistributions in binary form must reproduce the above copyright
  47  *    notice, this list of conditions and the following disclaimer in the
  48  *    documentation and/or other materials provided with the distribution.
  49  * 3. Neither the name of the University nor the names of its contributors
  50  *    may be used to endorse or promote products derived from this software
  51  *    without specific prior written permission.
  52  *
  53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  63  * SUCH DAMAGE.
  64  *
  65  *      @(#)vfs_syscalls.c      8.42 (Berkeley) 7/31/95
  66  */
  67
  68 #include <sys/cdefs.h>
  69 __KERNEL_RCSID(0, "$NetBSD: vfs_syscalls.c,v 1.401 2009/12/23 01:09:24 pooka Exp $");
  70
  71 #ifdef _KERNEL_OPT
  72 #include "opt_fileassoc.h"
  73 #include "veriexec.h"
  74 #endif
  75
  76 #include <sys/param.h>
  77 #include <sys/systm.h>
  78 #include <sys/namei.h>
  79 #include <sys/filedesc.h>
  80 #include <sys/kernel.h>
  81 #include <sys/file.h>
  82 #include <sys/stat.h>
  83 #include <sys/vnode.h>
  84 #include <sys/mount.h>
  85 #include <sys/proc.h>
  86 #include <sys/uio.h>
  87 #include <sys/kmem.h>
  88 #include <sys/dirent.h>
  89 #include <sys/sysctl.h>
  90 #include <sys/syscallargs.h>
  91 #include <sys/vfs_syscalls.h>
  92 #include <sys/ktrace.h>
  93 #ifdef FILEASSOC
  94 #include <sys/fileassoc.h>
  95 #endif /* FILEASSOC */
  96 #include <sys/verified_exec.h>
  97 #include <sys/kauth.h>
  98 #include <sys/atomic.h>
  99 #include <sys/module.h>
 100 #include <sys/buf.h>
 101
 102 #include <miscfs/genfs/genfs.h>
 103 #include <miscfs/syncfs/syncfs.h>
 104 #include <miscfs/specfs/specdev.h>
 105
 106 #include <nfs/rpcv2.h>
 107 #include <nfs/nfsproto.h>
 108 #include <nfs/nfs.h>
 109 #include <nfs/nfs_var.h>
 110
 111 MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount struct");
 112
 113 static int change_flags(struct vnode *, u_long, struct lwp *);
 114 static int change_mode(struct vnode *, int, struct lwp *l);
 115 static int change_owner(struct vnode *, uid_t, gid_t, struct lwp *, int);
 116
 117 void checkdirs(struct vnode *);
 118
 119 /*
 120  * Virtual File System System Calls
 121  */
 122
 123 /*
 124  * Mount a file system.
 125  */
 126
 127 /*
 128  * This table is used to maintain compatibility with 4.3BSD
 129  * and NetBSD 0.9 mount syscalls - and possibly other systems.
 130  * Note, the order is important!
 131  *
 132  * Do not modify this table. It should only contain filesystems
 133  * supported by NetBSD 0.9 and 4.3BSD.
 134  */
 135 const char * const mountcompatnames[] = {
 136         NULL,           /* 0 = MOUNT_NONE */
 137         MOUNT_FFS,      /* 1 = MOUNT_UFS */
 138         MOUNT_NFS,      /* 2 */
 139         MOUNT_MFS,      /* 3 */
 140         MOUNT_MSDOS,    /* 4 */
 141         MOUNT_CD9660,   /* 5 = MOUNT_ISOFS */
 142         MOUNT_FDESC,    /* 6 */
 143         MOUNT_KERNFS,   /* 7 */
 144         NULL,           /* 8 = MOUNT_DEVFS */
 145         MOUNT_AFS,      /* 9 */
 146 };
 147 const int nmountcompatnames = sizeof(mountcompatnames) /
 148     sizeof(mountcompatnames[0]);
 149
 150 static int
 151 mount_update(struct lwp *l, struct vnode *vp, const char *path, int flags,
 152     void *data, size_t *data_len)
 153 {
 154         struct mount *mp;
 155         int error = 0, saved_flags;
 156
 157         mp = vp->v_mount;
 158         saved_flags = mp->mnt_flag;
 159
 160         /* We can operate only on VV_ROOT nodes. */
 161         if ((vp->v_vflag & VV_ROOT) == 0) {
 162                 error = EINVAL;
 163                 goto out;
 164         }
 165
 166         /*
 167          * We only allow the filesystem to be reloaded if it
 168          * is currently mounted read-only.  Additionally, we
 169          * prevent read-write to read-only downgrades.
 170          */
 171         if ((flags & (MNT_RELOAD | MNT_RDONLY)) != 0 &&
 172             (mp->mnt_flag & MNT_RDONLY) == 0) {
 173                 error = EOPNOTSUPP;     /* Needs translation */
 174                 goto out;
 175         }
 176
 177         error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
 178             KAUTH_REQ_SYSTEM_MOUNT_UPDATE, mp, KAUTH_ARG(flags), data);
 179         if (error)
 180                 goto out;
 181
 182         if (vfs_busy(mp, NULL)) {
 183                 error = EPERM;
 184                 goto out;
 185         }
 186
 187         mutex_enter(&mp->mnt_updating);
 188
 189         mp->mnt_flag &= ~MNT_OP_FLAGS;
 190         mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
 191
 192         /*
 193          * Set the mount level flags.
 194          */
 195         if (flags & MNT_RDONLY)
 196                 mp->mnt_flag |= MNT_RDONLY;
 197         else if (mp->mnt_flag & MNT_RDONLY)
 198                 mp->mnt_iflag |= IMNT_WANTRDWR;
 199         mp->mnt_flag &=
 200           ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
 201             MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_NOCOREDUMP |
 202             MNT_NOATIME | MNT_NODEVMTIME | MNT_SYMPERM | MNT_SOFTDEP |
 203             MNT_LOG);
 204         mp->mnt_flag |= flags &
 205            (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
 206             MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_NOCOREDUMP |
 207             MNT_NOATIME | MNT_NODEVMTIME | MNT_SYMPERM | MNT_SOFTDEP |
 208             MNT_LOG | MNT_IGNORE);
 209
 210         error = VFS_MOUNT(mp, path, data, data_len);
 211
 212         if (error && data != NULL) {
 213                 int error2;
 214
 215                 /*
 216                  * Update failed; let's try and see if it was an
 217                  * export request.  For compat with 3.0 and earlier.
 218                  */
 219                 error2 = vfs_hooks_reexport(mp, path, data);
 220
 221                 /*
 222                  * Only update error code if the export request was
 223                  * understood but some problem occurred while
 224                  * processing it.
 225                  */
 226                 if (error2 != EJUSTRETURN)
 227                         error = error2;
 228         }
 229
 230         if (mp->mnt_iflag & IMNT_WANTRDWR)
 231                 mp->mnt_flag &= ~MNT_RDONLY;
 232         if (error)
 233                 mp->mnt_flag = saved_flags;
 234         mp->mnt_flag &= ~MNT_OP_FLAGS;
 235         mp->mnt_iflag &= ~IMNT_WANTRDWR;
 236         if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0) {
 237                 if (mp->mnt_syncer == NULL)
 238                         error = vfs_allocate_syncvnode(mp);
 239         } else {
 240                 if (mp->mnt_syncer != NULL)
 241                         vfs_deallocate_syncvnode(mp);
 242         }
 243         mutex_exit(&mp->mnt_updating);
 244         vfs_unbusy(mp, false, NULL);
 245
 246  out:
 247         return (error);
 248 }
 249
 250 static int
 251 mount_get_vfsops(const char *fstype, struct vfsops **vfsops)
 252 {
 253         char fstypename[sizeof(((struct statvfs *)NULL)->f_fstypename)];
 254         int error;
 255
 256         /* Copy file-system type from userspace.  */
 257         error = copyinstr(fstype, fstypename, sizeof(fstypename), NULL);
 258         if (error) {
 259                 /*
 260                  * Historically, filesystem types were identified by numbers.
 261                  * If we get an integer for the filesystem type instead of a
 262                  * string, we check to see if it matches one of the historic
 263                  * filesystem types.
 264                  */
 265                 u_long fsindex = (u_long)fstype;
 266                 if (fsindex >= nmountcompatnames ||
 267                     mountcompatnames[fsindex] == NULL)
 268                         return ENODEV;
 269                 strlcpy(fstypename, mountcompatnames[fsindex],
 270                     sizeof(fstypename));
 271         }
 272
 273         /* Accept `ufs' as an alias for `ffs', for compatibility. */
 274         if (strcmp(fstypename, "ufs") == 0)
 275                 fstypename[0] = 'f';
 276
 277         if ((*vfsops = vfs_getopsbyname(fstypename)) != NULL)
 278                 return 0;
 279
 280         /* If we can autoload a vfs module, try again */
 281         mutex_enter(&module_lock);
 282         (void)module_autoload(fstypename, MODULE_CLASS_VFS);
 283         mutex_exit(&module_lock);
 284
 285         if ((*vfsops = vfs_getopsbyname(fstypename)) != NULL)
 286                 return 0;
 287
 288         return ENODEV;
 289 }
 290
 291 static int
 292 mount_domount(struct lwp *l, struct vnode **vpp, struct vfsops *vfsops,
 293     const char *path, int flags, void *data, size_t *data_len, u_int recurse)
 294 {
 295         struct mount *mp;
 296         struct vnode *vp = *vpp;
 297         struct vattr va;
 298         int error;
 299
 300         error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
 301             KAUTH_REQ_SYSTEM_MOUNT_NEW, vp, KAUTH_ARG(flags), data);
 302         if (error)
 303                 return error;
 304
 305         /* Can't make a non-dir a mount-point (from here anyway). */
 306         if (vp->v_type != VDIR)
 307                 return ENOTDIR;
 308
 309         /*
 310          * If the user is not root, ensure that they own the directory
 311          * onto which we are attempting to mount.
 312          */
 313         if ((error = VOP_GETATTR(vp, &va, l->l_cred)) != 0 ||
 314             (va.va_uid != kauth_cred_geteuid(l->l_cred) &&
 315             (error = kauth_authorize_generic(l->l_cred,
 316             KAUTH_GENERIC_ISSUSER, NULL)) != 0)) {
 317                 return error;
 318         }
 319
 320         if (flags & MNT_EXPORTED)
 321                 return EINVAL;
 322
 323         if ((error = vinvalbuf(vp, V_SAVE, l->l_cred, l, 0, 0)) != 0)
 324                 return error;
 325
 326         /*
 327          * Check if a file-system is not already mounted on this vnode.
 328          */
 329         if (vp->v_mountedhere != NULL)
 330                 return EBUSY;
 331
 332         if ((mp = vfs_mountalloc(vfsops, vp)) == NULL)
 333                 return ENOMEM;
 334
 335         mp->mnt_stat.f_owner = kauth_cred_geteuid(l->l_cred);
 336
 337         /*
 338          * The underlying file system may refuse the mount for
 339          * various reasons.  Allow the user to force it to happen.
 340          *
 341          * Set the mount level flags.
 342          */
 343         mp->mnt_flag = flags &
 344            (MNT_FORCE | MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
 345             MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_NOCOREDUMP |
 346             MNT_NOATIME | MNT_NODEVMTIME | MNT_SYMPERM | MNT_SOFTDEP |
 347             MNT_LOG | MNT_IGNORE | MNT_RDONLY);
 348
 349         mutex_enter(&mp->mnt_updating);
 350         error = VFS_MOUNT(mp, path, data, data_len);
 351         mp->mnt_flag &= ~MNT_OP_FLAGS;
 352
 353         /*
 354          * Put the new filesystem on the mount list after root.
 355          */
 356         cache_purge(vp);
 357         if (error != 0) {
 358                 vp->v_mountedhere = NULL;
 359                 mutex_exit(&mp->mnt_updating);
 360                 vfs_unbusy(mp, false, NULL);
 361                 vfs_destroy(mp);
 362                 return error;
 363         }
 364
 365         mp->mnt_iflag &= ~IMNT_WANTRDWR;
 366         mutex_enter(&mountlist_lock);
 367         vp->v_mountedhere = mp;
 368         CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list);
 369         mutex_exit(&mountlist_lock);
 370         vn_restorerecurse(vp, recurse);
 371         VOP_UNLOCK(vp, 0);
 372         checkdirs(vp);
 373         if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0)
 374                 error = vfs_allocate_syncvnode(mp);
 375         /* Hold an additional reference to the mount across VFS_START(). */
 376         mutex_exit(&mp->mnt_updating);
 377         vfs_unbusy(mp, true, NULL);
 378         (void) VFS_STATVFS(mp, &mp->mnt_stat);
 379         error = VFS_START(mp, 0);
 380         if (error)
 381                 vrele(vp);
 382         /* Drop reference held for VFS_START(). */
 383         vfs_destroy(mp);
 384         *vpp = NULL;
 385         return error;
 386 }
 387
 388 static int
 389 mount_getargs(struct lwp *l, struct vnode *vp, const char *path, int flags,
 390     void *data, size_t *data_len)
 391 {
 392         struct mount *mp;
 393         int error;
 394
 395         /* If MNT_GETARGS is specified, it should be the only flag. */
 396         if (flags & ~MNT_GETARGS)
 397                 return EINVAL;
 398
 399         mp = vp->v_mount;
 400
 401         /* XXX: probably some notion of "can see" here if we want isolation. */
 402         error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
 403             KAUTH_REQ_SYSTEM_MOUNT_GET, mp, data, NULL);
 404         if (error)
 405                 return error;
 406
 407         if ((vp->v_vflag & VV_ROOT) == 0)
 408                 return EINVAL;
 409
 410         if (vfs_busy(mp, NULL))
 411                 return EPERM;
 412
 413         mutex_enter(&mp->mnt_updating);
 414         mp->mnt_flag &= ~MNT_OP_FLAGS;
 415         mp->mnt_flag |= MNT_GETARGS;
 416         error = VFS_MOUNT(mp, path, data, data_len);
 417         mp->mnt_flag &= ~MNT_OP_FLAGS;
 418         mutex_exit(&mp->mnt_updating);
 419
 420         vfs_unbusy(mp, false, NULL);
 421         return (error);
 422 }
 423
 424 int
 425 sys___mount50(struct lwp *l, const struct sys___mount50_args *uap, register_t *retval)
 426 {
 427         /* {
 428                 syscallarg(const char *) type;
 429                 syscallarg(const char *) path;
 430                 syscallarg(int) flags;
 431                 syscallarg(void *) data;
 432                 syscallarg(size_t) data_len;
 433         } */
 434
 435         return do_sys_mount(l, NULL, SCARG(uap, type), SCARG(uap, path),
 436             SCARG(uap, flags), SCARG(uap, data), UIO_USERSPACE,
 437             SCARG(uap, data_len), retval);
 438 }
 439
 440 int
 441 do_sys_mount(struct lwp *l, struct vfsops *vfsops, const char *type,
 442     const char *path, int flags, void *data, enum uio_seg data_seg,
 443     size_t data_len, register_t *retval)
 444 {
 445         struct vnode *vp;
 446         void *data_buf = data;
 447         u_int recurse;
 448         int error;
 449
 450         /*
 451          * Get vnode to be covered
 452          */
 453         error = namei_simple_user(path, NSM_FOLLOW_TRYEMULROOT, &vp);
 454         if (error != 0)
 455                 return (error);
 456
 457         /*
 458          * A lookup in VFS_MOUNT might result in an attempt to
 459          * lock this vnode again, so make the lock recursive.
 460          */
 461         if (vfsops == NULL) {
 462                 if (flags & (MNT_GETARGS | MNT_UPDATE)) {
 463                         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 464                         recurse = vn_setrecurse(vp);
 465                         vfsops = vp->v_mount->mnt_op;
 466                 } else {
 467                         /* 'type' is userspace */
 468                         error = mount_get_vfsops(type, &vfsops);
 469                         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 470                         recurse = vn_setrecurse(vp);
 471                         if (error != 0)
 472                                 goto done;
 473                 }
 474         } else {
 475                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 476                 recurse = vn_setrecurse(vp);
 477         }
 478
 479         if (data != NULL && data_seg == UIO_USERSPACE) {
 480                 if (data_len == 0) {
 481                         /* No length supplied, use default for filesystem */
 482                         data_len = vfsops->vfs_min_mount_data;
 483                         if (data_len > VFS_MAX_MOUNT_DATA) {
 484                                 error = EINVAL;
 485                                 goto done;
 486                         }
 487                         /*
 488                          * Hopefully a longer buffer won't make copyin() fail.
 489                          * For compatibility with 3.0 and earlier.
 490                          */
 491                         if (flags & MNT_UPDATE
 492                             && data_len < sizeof (struct mnt_export_args30))
 493                                 data_len = sizeof (struct mnt_export_args30);
 494                 }
 495                 data_buf = kmem_alloc(data_len, KM_SLEEP);
 496
 497                 /* NFS needs the buffer even for mnt_getargs .... */
 498                 error = copyin(data, data_buf, data_len);
 499                 if (error != 0)
 500                         goto done;
 501         }
 502
 503         if (flags & MNT_GETARGS) {
 504                 if (data_len == 0) {
 505                         error = EINVAL;
 506                         goto done;
 507                 }
 508                 error = mount_getargs(l, vp, path, flags, data_buf, &data_len);
 509                 if (error != 0)
 510                         goto done;
 511                 if (data_seg == UIO_USERSPACE)
 512                         error = copyout(data_buf, data, data_len);
 513                 *retval = data_len;
 514         } else if (flags & MNT_UPDATE) {
 515                 error = mount_update(l, vp, path, flags, data_buf, &data_len);
 516         } else {
 517                 /* Locking is handled internally in mount_domount(). */
 518                 error = mount_domount(l, &vp, vfsops, path, flags, data_buf,
 519                     &data_len, recurse);
 520         }
 521
 522     done:
 523         if (vp != NULL) {
 524                 vn_restorerecurse(vp, recurse);
 525                 vput(vp);
 526         }
 527         if (data_buf != data)
 528                 kmem_free(data_buf, data_len);
 529         return (error);
 530 }
 531
 532 /*
 533  * Scan all active processes to see if any of them have a current
 534  * or root directory onto which the new filesystem has just been
 535  * mounted. If so, replace them with the new mount point.
 536  */
 537 void
 538 checkdirs(struct vnode *olddp)
 539 {
 540         struct cwdinfo *cwdi;
 541         struct vnode *newdp, *rele1, *rele2;
 542         struct proc *p;
 543         bool retry;
 544
 545         if (olddp->v_usecount == 1)
 546                 return;
 547         if (VFS_ROOT(olddp->v_mountedhere, &newdp))
 548                 panic("mount: lost mount");
 549
 550         do {
 551                 retry = false;
 552                 mutex_enter(proc_lock);
 553                 PROCLIST_FOREACH(p, &allproc) {
 554                         if ((p->p_flag & PK_MARKER) != 0)
 555                                 continue;
 556                         if ((cwdi = p->p_cwdi) == NULL)
 557                                 continue;
 558                         /*
 559                          * Can't change to the old directory any more,
 560                          * so even if we see a stale value it's not a
 561                          * problem.
 562                          */
 563                         if (cwdi->cwdi_cdir != olddp &&
 564                             cwdi->cwdi_rdir != olddp)
 565                                 continue;
 566                         retry = true;
 567                         rele1 = NULL;
 568                         rele2 = NULL;
 569                         atomic_inc_uint(&cwdi->cwdi_refcnt);
 570                         mutex_exit(proc_lock);
 571                         rw_enter(&cwdi->cwdi_lock, RW_WRITER);
 572                         if (cwdi->cwdi_cdir == olddp) {
 573                                 rele1 = cwdi->cwdi_cdir;
 574                                 vref(newdp);
 575                                 cwdi->cwdi_cdir = newdp;
 576                         }
 577                         if (cwdi->cwdi_rdir == olddp) {
 578                                 rele2 = cwdi->cwdi_rdir;
 579                                 vref(newdp);
 580                                 cwdi->cwdi_rdir = newdp;
 581                         }
 582                         rw_exit(&cwdi->cwdi_lock);
 583                         cwdfree(cwdi);
 584                         if (rele1 != NULL)
 585                                 vrele(rele1);
 586                         if (rele2 != NULL)
 587                                 vrele(rele2);
 588                         mutex_enter(proc_lock);
 589                         break;
 590                 }
 591                 mutex_exit(proc_lock);
 592         } while (retry);
 593
 594         if (rootvnode == olddp) {
 595                 vrele(rootvnode);
 596                 vref(newdp);
 597                 rootvnode = newdp;
 598         }
 599         vput(newdp);
 600 }
 601
 602 /*
 603  * Unmount a file system.
 604  *
 605  * Note: unmount takes a path to the vnode mounted on as argument,
 606  * not special file (as before).
 607  */
 608 /* ARGSUSED */
 609 int
 610 sys_unmount(struct lwp *l, const struct sys_unmount_args *uap, register_t *retval)
 611 {
 612         /* {
 613                 syscallarg(const char *) path;
 614                 syscallarg(int) flags;
 615         } */
 616         struct vnode *vp;
 617         struct mount *mp;
 618         int error;
 619         struct nameidata nd;
 620
 621         NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, UIO_USERSPACE,
 622             SCARG(uap, path));
 623         if ((error = namei(&nd)) != 0)
 624                 return (error);
 625         vp = nd.ni_vp;
 626         mp = vp->v_mount;
 627         atomic_inc_uint(&mp->mnt_refcnt);
 628         VOP_UNLOCK(vp, 0);
 629
 630         error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
 631             KAUTH_REQ_SYSTEM_MOUNT_UNMOUNT, mp, NULL, NULL);
 632         if (error) {
 633                 vrele(vp);
 634                 vfs_destroy(mp);
 635                 return (error);
 636         }
 637
 638         /*
 639          * Don't allow unmounting the root file system.
 640          */
 641         if (mp->mnt_flag & MNT_ROOTFS) {
 642                 vrele(vp);
 643                 vfs_destroy(mp);
 644                 return (EINVAL);
 645         }
 646
 647         /*
 648          * Must be the root of the filesystem
 649          */
 650         if ((vp->v_vflag & VV_ROOT) == 0) {
 651                 vrele(vp);
 652                 vfs_destroy(mp);
 653                 return (EINVAL);
 654         }
 655
 656         vrele(vp);
 657         error = dounmount(mp, SCARG(uap, flags), l);
 658         vfs_destroy(mp);
 659         return error;
 660 }
 661
 662 /*
 663  * Do the actual file system unmount.  File system is assumed to have
 664  * been locked by the caller.
 665  *
 666  * => Caller hold reference to the mount, explicitly for dounmount().
 667  */
 668 int
 669 dounmount(struct mount *mp, int flags, struct lwp *l)
 670 {
 671         struct vnode *coveredvp;
 672         int error;
 673         int async;
 674         int used_syncer;
 675
 676 #if NVERIEXEC > 0
 677         error = veriexec_unmountchk(mp);
 678         if (error)
 679                 return (error);
 680 #endif /* NVERIEXEC > 0 */
 681
 682         /*
 683          * XXX Freeze syncer.  Must do this before locking the
 684          * mount point.  See dounmount() for details.
 685          */
 686         mutex_enter(&syncer_mutex);
 687         rw_enter(&mp->mnt_unmounting, RW_WRITER);
 688         if ((mp->mnt_iflag & IMNT_GONE) != 0) {
 689                 rw_exit(&mp->mnt_unmounting);
 690                 mutex_exit(&syncer_mutex);
 691                 return ENOENT;
 692         }
 693
 694         used_syncer = (mp->mnt_syncer != NULL);
 695
 696         /*
 697          * XXX Syncer must be frozen when we get here.  This should really
 698          * be done on a per-mountpoint basis, but the syncer doesn't work
 699          * like that.
 700          *
 701          * The caller of dounmount() must acquire syncer_mutex because
 702          * the syncer itself acquires locks in syncer_mutex -> vfs_busy
 703          * order, and we must preserve that order to avoid deadlock.
 704          *
 705          * So, if the file system did not use the syncer, now is
 706          * the time to release the syncer_mutex.
 707          */
 708         if (used_syncer == 0)
 709                 mutex_exit(&syncer_mutex);
 710
 711         mp->mnt_iflag |= IMNT_UNMOUNT;
 712         async = mp->mnt_flag & MNT_ASYNC;
 713         mp->mnt_flag &= ~MNT_ASYNC;
 714         cache_purgevfs(mp);     /* remove cache entries for this file sys */
 715         if (mp->mnt_syncer != NULL)
 716                 vfs_deallocate_syncvnode(mp);
 717         error = 0;
 718         if ((mp->mnt_flag & MNT_RDONLY) == 0) {
 719                 error = VFS_SYNC(mp, MNT_WAIT, l->l_cred);
 720         }
 721         vfs_scrubvnlist(mp);
 722         if (error == 0 || (flags & MNT_FORCE))
 723                 error = VFS_UNMOUNT(mp, flags);
 724         if (error) {
 725                 if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0)
 726                         (void) vfs_allocate_syncvnode(mp);
 727                 mp->mnt_iflag &= ~IMNT_UNMOUNT;
 728                 mp->mnt_flag |= async;
 729                 rw_exit(&mp->mnt_unmounting);
 730                 if (used_syncer)
 731                         mutex_exit(&syncer_mutex);
 732                 return (error);
 733         }
 734         vfs_scrubvnlist(mp);
 735         mutex_enter(&mountlist_lock);
 736         if ((coveredvp = mp->mnt_vnodecovered) != NULLVP)
 737                 coveredvp->v_mountedhere = NULL;
 738         CIRCLEQ_REMOVE(&mountlist, mp, mnt_list);
 739         mp->mnt_iflag |= IMNT_GONE;
 740         mutex_exit(&mountlist_lock);
 741         if (TAILQ_FIRST(&mp->mnt_vnodelist) != NULL)
 742                 panic("unmount: dangling vnode");
 743         if (used_syncer)
 744                 mutex_exit(&syncer_mutex);
 745         vfs_hooks_unmount(mp);
 746         rw_exit(&mp->mnt_unmounting);
 747         vfs_destroy(mp);        /* reference from mount() */
 748         if (coveredvp != NULLVP)
 749                 vrele(coveredvp);
 750         return (0);
 751 }
 752
 753 /*
 754  * Sync each mounted filesystem.
 755  */
 756 #ifdef DEBUG
 757 int syncprt = 0;
 758 struct ctldebug debug0 = { "syncprt", &syncprt };
 759 #endif
 760
 761 /* ARGSUSED */
 762 int
 763 sys_sync(struct lwp *l, const void *v, register_t *retval)
 764 {
 765         struct mount *mp, *nmp;
 766         int asyncflag;
 767
 768         if (l == NULL)
 769                 l = &lwp0;
 770
 771         mutex_enter(&mountlist_lock);
 772         for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
 773              mp = nmp) {
 774                 if (vfs_busy(mp, &nmp)) {
 775                         continue;
 776                 }
 777                 mutex_enter(&mp->mnt_updating);
 778                 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
 779                         asyncflag = mp->mnt_flag & MNT_ASYNC;
 780                         mp->mnt_flag &= ~MNT_ASYNC;
 781                         VFS_SYNC(mp, MNT_NOWAIT, l->l_cred);
 782                         if (asyncflag)
 783                                  mp->mnt_flag |= MNT_ASYNC;
 784                 }
 785                 mutex_exit(&mp->mnt_updating);
 786                 vfs_unbusy(mp, false, &nmp);
 787         }
 788         mutex_exit(&mountlist_lock);
 789 #ifdef DEBUG
 790         if (syncprt)
 791                 vfs_bufstats();
 792 #endif /* DEBUG */
 793         return (0);
 794 }
 795
 796 /*
 797  * Change filesystem quotas.
 798  */
 799 /* ARGSUSED */
 800 int
 801 sys_quotactl(struct lwp *l, const struct sys_quotactl_args *uap, register_t *retval)
 802 {
 803         /* {
 804                 syscallarg(const char *) path;
 805                 syscallarg(int) cmd;
 806                 syscallarg(int) uid;
 807                 syscallarg(void *) arg;
 808         } */
 809         struct mount *mp;
 810         int error;
 811         struct vnode *vp;
 812
 813         error = namei_simple_user(SCARG(uap, path),
 814                                 NSM_FOLLOW_TRYEMULROOT, &vp);
 815         if (error != 0)
 816                 return (error);
 817         mp = vp->v_mount;
 818         error = VFS_QUOTACTL(mp, SCARG(uap, cmd), SCARG(uap, uid),
 819             SCARG(uap, arg));
 820         vrele(vp);
 821         return (error);
 822 }
 823
 824 int
 825 dostatvfs(struct mount *mp, struct statvfs *sp, struct lwp *l, int flags,
 826     int root)
 827 {
 828         struct cwdinfo *cwdi = l->l_proc->p_cwdi;
 829         int error = 0;
 830
 831         /*
 832          * If MNT_NOWAIT or MNT_LAZY is specified, do not
 833          * refresh the fsstat cache. MNT_WAIT or MNT_LAZY
 834          * overrides MNT_NOWAIT.
 835          */
 836         if (flags == MNT_NOWAIT || flags == MNT_LAZY ||
 837             (flags != MNT_WAIT && flags != 0)) {
 838                 memcpy(sp, &mp->mnt_stat, sizeof(*sp));
 839                 goto done;
 840         }
 841
 842         /* Get the filesystem stats now */
 843         memset(sp, 0, sizeof(*sp));
 844         if ((error = VFS_STATVFS(mp, sp)) != 0) {
 845                 return error;
 846         }
 847
 848         if (cwdi->cwdi_rdir == NULL)
 849                 (void)memcpy(&mp->mnt_stat, sp, sizeof(mp->mnt_stat));
 850 done:
 851         if (cwdi->cwdi_rdir != NULL) {
 852                 size_t len;
 853                 char *bp;
 854                 char c;
 855                 char *path = PNBUF_GET();
 856
 857                 bp = path + MAXPATHLEN;
 858                 *--bp = '\0';
 859                 rw_enter(&cwdi->cwdi_lock, RW_READER);
 860                 error = getcwd_common(cwdi->cwdi_rdir, rootvnode, &bp, path,
 861                     MAXPATHLEN / 2, 0, l);
 862                 rw_exit(&cwdi->cwdi_lock);
 863                 if (error) {
 864                         PNBUF_PUT(path);
 865                         return error;
 866                 }
 867                 len = strlen(bp);
 868                 if (len != 1) {
 869                         /*
 870                          * for mount points that are below our root, we can see
 871                          * them, so we fix up the pathname and return them. The
 872                          * rest we cannot see, so we don't allow viewing the
 873                          * data.
 874                          */
 875                         if (strncmp(bp, sp->f_mntonname, len) == 0 &&
 876                             ((c = sp->f_mntonname[len]) == '/' || c == '\0')) {
 877                                 (void)strlcpy(sp->f_mntonname,
 878                                     c == '\0' ? "/" : &sp->f_mntonname[len],
 879                                     sizeof(sp->f_mntonname));
 880                         } else {
 881                                 if (root)
 882                                         (void)strlcpy(sp->f_mntonname, "/",
 883                                             sizeof(sp->f_mntonname));
 884                                 else
 885                                         error = EPERM;
 886                         }
 887                 }
 888                 PNBUF_PUT(path);
 889         }
 890         sp->f_flag = mp->mnt_flag & MNT_VISFLAGMASK;
 891         return error;
 892 }
 893
 894 /*
 895  * Get filesystem statistics by path.
 896  */
 897 int
 898 do_sys_pstatvfs(struct lwp *l, const char *path, int flags, struct statvfs *sb)
 899 {
 900         struct mount *mp;
 901         int error;
 902         struct vnode *vp;
 903
 904         error = namei_simple_user(path, NSM_FOLLOW_TRYEMULROOT, &vp);
 905         if (error != 0)
 906                 return error;
 907         mp = vp->v_mount;
 908         error = dostatvfs(mp, sb, l, flags, 1);
 909         vrele(vp);
 910         return error;
 911 }
 912
 913 /* ARGSUSED */
 914 int
 915 sys_statvfs1(struct lwp *l, const struct sys_statvfs1_args *uap, register_t *retval)
 916 {
 917         /* {
 918                 syscallarg(const char *) path;
 919                 syscallarg(struct statvfs *) buf;
 920                 syscallarg(int) flags;
 921         } */
 922         struct statvfs *sb;
 923         int error;
 924
 925         sb = STATVFSBUF_GET();
 926         error = do_sys_pstatvfs(l, SCARG(uap, path), SCARG(uap, flags), sb);
 927         if (error == 0)
 928                 error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
 929         STATVFSBUF_PUT(sb);
 930         return error;
 931 }
 932
 933 /*
 934  * Get filesystem statistics by fd.
 935  */
 936 int
 937 do_sys_fstatvfs(struct lwp *l, int fd, int flags, struct statvfs *sb)
 938 {
 939         file_t *fp;
 940         struct mount *mp;
 941         int error;
 942
 943         /* fd_getvnode() will use the descriptor for us */
 944         if ((error = fd_getvnode(fd, &fp)) != 0)
 945                 return (error);
 946         mp = ((struct vnode *)fp->f_data)->v_mount;
 947         error = dostatvfs(mp, sb, curlwp, flags, 1);
 948         fd_putfile(fd);
 949         return error;
 950 }
 951
 952 /* ARGSUSED */
 953 int
 954 sys_fstatvfs1(struct lwp *l, const struct sys_fstatvfs1_args *uap, register_t *retval)
 955 {
 956         /* {
 957                 syscallarg(int) fd;
 958                 syscallarg(struct statvfs *) buf;
 959                 syscallarg(int) flags;
 960         } */
 961         struct statvfs *sb;
 962         int error;
 963
 964         sb = STATVFSBUF_GET();
 965         error = do_sys_fstatvfs(l, SCARG(uap, fd), SCARG(uap, flags), sb);
 966         if (error == 0)
 967                 error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
 968         STATVFSBUF_PUT(sb);
 969         return error;
 970 }
 971
 972
 973 /*
 974  * Get statistics on all filesystems.
 975  */
 976 int
 977 do_sys_getvfsstat(struct lwp *l, void *sfsp, size_t bufsize, int flags,
 978     int (*copyfn)(const void *, void *, size_t), size_t entry_sz,
 979     register_t *retval)
 980 {
 981         int root = 0;
 982         struct proc *p = l->l_proc;
 983         struct mount *mp, *nmp;
 984         struct statvfs *sb;
 985         size_t count, maxcount;
 986         int error = 0;
 987
 988         sb = STATVFSBUF_GET();
 989         maxcount = bufsize / entry_sz;
 990         mutex_enter(&mountlist_lock);
 991         count = 0;
 992         for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
 993              mp = nmp) {
 994                 if (vfs_busy(mp, &nmp)) {
 995                         continue;
 996                 }
 997                 if (sfsp && count < maxcount) {
 998                         error = dostatvfs(mp, sb, l, flags, 0);
 999                         if (error) {
1000                                 vfs_unbusy(mp, false, &nmp);
1001                                 error = 0;
1002                                 continue;
1003                         }
1004                         error = copyfn(sb, sfsp, entry_sz);
1005                         if (error) {
1006                                 vfs_unbusy(mp, false, NULL);
1007                                 goto out;
1008                         }
1009                         sfsp = (char *)sfsp + entry_sz;
1010                         root |= strcmp(sb->f_mntonname, "/") == 0;
1011                 }
1012                 count++;
1013                 vfs_unbusy(mp, false, &nmp);
1014         }
1015         mutex_exit(&mountlist_lock);
1016
1017         if (root == 0 && p->p_cwdi->cwdi_rdir) {
1018                 /*
1019                  * fake a root entry
1020                  */
1021                 error = dostatvfs(p->p_cwdi->cwdi_rdir->v_mount,
1022                     sb, l, flags, 1);
1023                 if (error != 0)
1024                         goto out;
1025                 if (sfsp) {
1026                         error = copyfn(sb, sfsp, entry_sz);
1027                         if (error != 0)
1028                                 goto out;
1029                 }
1030                 count++;
1031         }
1032         if (sfsp && count > maxcount)
1033                 *retval = maxcount;
1034         else
1035                 *retval = count;
1036 out:
1037         STATVFSBUF_PUT(sb);
1038         return error;
1039 }
1040
1041 int
1042 sys_getvfsstat(struct lwp *l, const struct sys_getvfsstat_args *uap, register_t *retval)
1043 {
1044         /* {
1045                 syscallarg(struct statvfs *) buf;
1046                 syscallarg(size_t) bufsize;
1047                 syscallarg(int) flags;
1048         } */
1049
1050         return do_sys_getvfsstat(l, SCARG(uap, buf), SCARG(uap, bufsize),
1051             SCARG(uap, flags), copyout, sizeof (struct statvfs), retval);
1052 }
1053
1054 /*
1055  * Change current working directory to a given file descriptor.
1056  */
1057 /* ARGSUSED */
1058 int
1059 sys_fchdir(struct lwp *l, const struct sys_fchdir_args *uap, register_t *retval)
1060 {
1061         /* {
1062                 syscallarg(int) fd;
1063         } */
1064         struct proc *p = l->l_proc;
1065         struct cwdinfo *cwdi;
1066         struct vnode *vp, *tdp;
1067         struct mount *mp;
1068         file_t *fp;
1069         int error, fd;
1070
1071         /* fd_getvnode() will use the descriptor for us */
1072         fd = SCARG(uap, fd);
1073         if ((error = fd_getvnode(fd, &fp)) != 0)
1074                 return (error);
1075         vp = fp->f_data;
1076
1077         vref(vp);
1078         vn_lock(vp,  LK_EXCLUSIVE | LK_RETRY);
1079         if (vp->v_type != VDIR)
1080                 error = ENOTDIR;
1081         else
1082                 error = VOP_ACCESS(vp, VEXEC, l->l_cred);
1083         if (error) {
1084                 vput(vp);
1085                 goto out;
1086         }
1087         while ((mp = vp->v_mountedhere) != NULL) {
1088                 error = vfs_busy(mp, NULL);
1089                 vput(vp);
1090                 if (error != 0)
1091                         goto out;
1092                 error = VFS_ROOT(mp, &tdp);
1093                 vfs_unbusy(mp, false, NULL);
1094                 if (error)
1095                         goto out;
1096                 vp = tdp;
1097         }
1098         VOP_UNLOCK(vp, 0);
1099
1100         /*
1101          * Disallow changing to a directory not under the process's
1102          * current root directory (if there is one).
1103          */
1104         cwdi = p->p_cwdi;
1105         rw_enter(&cwdi->cwdi_lock, RW_WRITER);
1106         if (cwdi->cwdi_rdir && !vn_isunder(vp, NULL, l)) {
1107                 vrele(vp);
1108                 error = EPERM;  /* operation not permitted */
1109         } else {
1110                 vrele(cwdi->cwdi_cdir);
1111                 cwdi->cwdi_cdir = vp;
1112         }
1113         rw_exit(&cwdi->cwdi_lock);
1114
1115  out:
1116         fd_putfile(fd);
1117         return (error);
1118 }
1119
1120 /*
1121  * Change this process's notion of the root directory to a given file
1122  * descriptor.
1123  */
1124 int
1125 sys_fchroot(struct lwp *l, const struct sys_fchroot_args *uap, register_t *retval)
1126 {
1127         struct proc *p = l->l_proc;
1128         struct vnode    *vp;
1129         file_t  *fp;
1130         int              error, fd = SCARG(uap, fd);
1131
1132         if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_CHROOT,
1133             KAUTH_REQ_SYSTEM_CHROOT_FCHROOT, NULL, NULL, NULL)) != 0)
1134                 return error;
1135         /* fd_getvnode() will use the descriptor for us */
1136         if ((error = fd_getvnode(fd, &fp)) != 0)
1137                 return error;
1138         vp = fp->f_data;
1139         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1140         if (vp->v_type != VDIR)
1141                 error = ENOTDIR;
1142         else
1143                 error = VOP_ACCESS(vp, VEXEC, l->l_cred);
1144         VOP_UNLOCK(vp, 0);
1145         if (error)
1146                 goto out;
1147         vref(vp);
1148
1149         change_root(p->p_cwdi, vp, l);
1150
1151  out:
1152         fd_putfile(fd);
1153         return (error);
1154 }
1155
1156 /*
1157  * Change current working directory (``.'').
1158  */
1159 /* ARGSUSED */
1160 int
1161 sys_chdir(struct lwp *l, const struct sys_chdir_args *uap, register_t *retval)
1162 {
1163         /* {
1164                 syscallarg(const char *) path;
1165         } */
1166         struct proc *p = l->l_proc;
1167         struct cwdinfo *cwdi;
1168         int error;
1169         struct vnode *vp;
1170
1171         if ((error = chdir_lookup(SCARG(uap, path), UIO_USERSPACE,
1172                                   &vp, l)) != 0)
1173                 return (error);
1174         cwdi = p->p_cwdi;
1175         rw_enter(&cwdi->cwdi_lock, RW_WRITER);
1176         vrele(cwdi->cwdi_cdir);
1177         cwdi->cwdi_cdir = vp;
1178         rw_exit(&cwdi->cwdi_lock);
1179         return (0);
1180 }
1181
1182 /*
1183  * Change notion of root (``/'') directory.
1184  */
1185 /* ARGSUSED */
1186 int
1187 sys_chroot(struct lwp *l, const struct sys_chroot_args *uap, register_t *retval)
1188 {
1189         /* {
1190                 syscallarg(const char *) path;
1191         } */
1192         struct proc *p = l->l_proc;
1193         int error;
1194         struct vnode *vp;
1195
1196         if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_CHROOT,
1197             KAUTH_REQ_SYSTEM_CHROOT_CHROOT, NULL, NULL, NULL)) != 0)
1198                 return (error);
1199         if ((error = chdir_lookup(SCARG(uap, path), UIO_USERSPACE,
1200                                   &vp, l)) != 0)
1201                 return (error);
1202
1203         change_root(p->p_cwdi, vp, l);
1204
1205         return (0);
1206 }
1207
1208 /*
1209  * Common routine for chroot and fchroot.
1210  * NB: callers need to properly authorize the change root operation.
1211  */
1212 void
1213 change_root(struct cwdinfo *cwdi, struct vnode *vp, struct lwp *l)
1214 {
1215
1216         rw_enter(&cwdi->cwdi_lock, RW_WRITER);
1217         if (cwdi->cwdi_rdir != NULL)
1218                 vrele(cwdi->cwdi_rdir);
1219         cwdi->cwdi_rdir = vp;
1220
1221         /*
1222          * Prevent escaping from chroot by putting the root under
1223          * the working directory.  Silently chdir to / if we aren't
1224          * already there.
1225          */
1226         if (!vn_isunder(cwdi->cwdi_cdir, vp, l)) {
1227                 /*
1228                  * XXX would be more failsafe to change directory to a
1229                  * deadfs node here instead
1230                  */
1231                 vrele(cwdi->cwdi_cdir);
1232                 vref(vp);
1233                 cwdi->cwdi_cdir = vp;
1234         }
1235         rw_exit(&cwdi->cwdi_lock);
1236 }
1237
1238 /*
1239  * Common routine for chroot and chdir.
1240  */
1241 int
1242 chdir_lookup(const char *path, int where, struct vnode **vpp, struct lwp *l)
1243 {
1244         struct nameidata nd;
1245         int error;
1246
1247         NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, where,
1248             path);
1249         if ((error = namei(&nd)) != 0)
1250                 return (error);
1251         *vpp = nd.ni_vp;
1252         if ((*vpp)->v_type != VDIR)
1253                 error = ENOTDIR;
1254         else
1255                 error = VOP_ACCESS(*vpp, VEXEC, l->l_cred);
1256
1257         if (error)
1258                 vput(*vpp);
1259         else
1260                 VOP_UNLOCK(*vpp, 0);
1261         return (error);
1262 }
1263
1264 /*
1265  * Check permissions, allocate an open file structure,
1266  * and call the device open routine if any.
1267  */
1268 int
1269 sys_open(struct lwp *l, const struct sys_open_args *uap, register_t *retval)
1270 {
1271         /* {
1272                 syscallarg(const char *) path;
1273                 syscallarg(int) flags;
1274                 syscallarg(int) mode;
1275         } */
1276         struct proc *p = l->l_proc;
1277         struct cwdinfo *cwdi = p->p_cwdi;
1278         file_t *fp;
1279         struct vnode *vp;
1280         int flags, cmode;
1281         int type, indx, error;
1282         struct flock lf;
1283         struct nameidata nd;
1284
1285         flags = FFLAGS(SCARG(uap, flags));
1286         if ((flags & (FREAD | FWRITE)) == 0)
1287                 return (EINVAL);
1288         if ((error = fd_allocfile(&fp, &indx)) != 0)
1289                 return (error);
1290         /* We're going to read cwdi->cwdi_cmask unlocked here. */
1291         cmode = ((SCARG(uap, mode) &~ cwdi->cwdi_cmask) & ALLPERMS) &~ S_ISTXT;
1292         NDINIT(&nd, LOOKUP, FOLLOW | TRYEMULROOT, UIO_USERSPACE,
1293             SCARG(uap, path));
1294         l->l_dupfd = -indx - 1;                 /* XXX check for fdopen */
1295         if ((error = vn_open(&nd, flags, cmode)) != 0) {
1296                 fd_abort(p, fp, indx);
1297                 if ((error == EDUPFD || error == EMOVEFD) &&
1298                     l->l_dupfd >= 0 &&                  /* XXX from fdopen */
1299                     (error =
1300                         fd_dupopen(l->l_dupfd, &indx, flags, error)) == 0) {
1301                         *retval = indx;
1302                         return (0);
1303                 }
1304                 if (error == ERESTART)
1305                         error = EINTR;
1306                 return (error);
1307         }
1308
1309         l->l_dupfd = 0;
1310         vp = nd.ni_vp;
1311         fp->f_flag = flags & FMASK;
1312         fp->f_type = DTYPE_VNODE;
1313         fp->f_ops = &vnops;
1314         fp->f_data = vp;
1315         if (flags & (O_EXLOCK | O_SHLOCK)) {
1316                 lf.l_whence = SEEK_SET;
1317                 lf.l_start = 0;
1318                 lf.l_len = 0;
1319                 if (flags & O_EXLOCK)
1320                         lf.l_type = F_WRLCK;
1321                 else
1322                         lf.l_type = F_RDLCK;
1323                 type = F_FLOCK;
1324                 if ((flags & FNONBLOCK) == 0)
1325                         type |= F_WAIT;
1326                 VOP_UNLOCK(vp, 0);
1327                 error = VOP_ADVLOCK(vp, fp, F_SETLK, &lf, type);
1328                 if (error) {
1329                         (void) vn_close(vp, fp->f_flag, fp->f_cred);
1330                         fd_abort(p, fp, indx);
1331                         return (error);
1332                 }
1333                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1334                 atomic_or_uint(&fp->f_flag, FHASLOCK);
1335         }
1336         VOP_UNLOCK(vp, 0);
1337         *retval = indx;
1338         fd_affix(p, fp, indx);
1339         return (0);
1340 }
1341
1342 static void
1343 vfs__fhfree(fhandle_t *fhp)
1344 {
1345         size_t fhsize;
1346
1347         if (fhp == NULL) {
1348                 return;
1349         }
1350         fhsize = FHANDLE_SIZE(fhp);
1351         kmem_free(fhp, fhsize);
1352 }
1353
1354 /*
1355  * vfs_composefh: compose a filehandle.
1356  */
1357
1358 int
1359 vfs_composefh(struct vnode *vp, fhandle_t *fhp, size_t *fh_size)
1360 {
1361         struct mount *mp;
1362         struct fid *fidp;
1363         int error;
1364         size_t needfhsize;
1365         size_t fidsize;
1366
1367         mp = vp->v_mount;
1368         fidp = NULL;
1369         if (*fh_size < FHANDLE_SIZE_MIN) {
1370                 fidsize = 0;
1371         } else {
1372                 fidsize = *fh_size - offsetof(fhandle_t, fh_fid);
1373                 if (fhp != NULL) {
1374                         memset(fhp, 0, *fh_size);
1375                         fhp->fh_fsid = mp->mnt_stat.f_fsidx;
1376                         fidp = &fhp->fh_fid;
1377                 }
1378         }
1379         error = VFS_VPTOFH(vp, fidp, &fidsize);
1380         needfhsize = FHANDLE_SIZE_FROM_FILEID_SIZE(fidsize);
1381         if (error == 0 && *fh_size < needfhsize) {
1382                 error = E2BIG;
1383         }
1384         *fh_size = needfhsize;
1385         return error;
1386 }
1387
1388 int
1389 vfs_composefh_alloc(struct vnode *vp, fhandle_t **fhpp)
1390 {
1391         struct mount *mp;
1392         fhandle_t *fhp;
1393         size_t fhsize;
1394         size_t fidsize;
1395         int error;
1396
1397         *fhpp = NULL;
1398         mp = vp->v_mount;
1399         fidsize = 0;
1400         error = VFS_VPTOFH(vp, NULL, &fidsize);
1401         KASSERT(error != 0);
1402         if (error != E2BIG) {
1403                 goto out;
1404         }
1405         fhsize = FHANDLE_SIZE_FROM_FILEID_SIZE(fidsize);
1406         fhp = kmem_zalloc(fhsize, KM_SLEEP);
1407         if (fhp == NULL) {
1408                 error = ENOMEM;
1409                 goto out;
1410         }
1411         fhp->fh_fsid = mp->mnt_stat.f_fsidx;
1412         error = VFS_VPTOFH(vp, &fhp->fh_fid, &fidsize);
1413         if (error == 0) {
1414                 KASSERT((FHANDLE_SIZE(fhp) == fhsize &&
1415                     FHANDLE_FILEID(fhp)->fid_len == fidsize));
1416                 *fhpp = fhp;
1417         } else {
1418                 kmem_free(fhp, fhsize);
1419         }
1420 out:
1421         return error;
1422 }
1423
1424 void
1425 vfs_composefh_free(fhandle_t *fhp)
1426 {
1427
1428         vfs__fhfree(fhp);
1429 }
1430
1431 /*
1432  * vfs_fhtovp: lookup a vnode by a filehandle.
1433  */
1434
1435 int
1436 vfs_fhtovp(fhandle_t *fhp, struct vnode **vpp)
1437 {
1438         struct mount *mp;
1439         int error;
1440
1441         *vpp = NULL;
1442         mp = vfs_getvfs(FHANDLE_FSID(fhp));
1443         if (mp == NULL) {
1444                 error = ESTALE;
1445                 goto out;
1446         }
1447         if (mp->mnt_op->vfs_fhtovp == NULL) {
1448                 error = EOPNOTSUPP;
1449                 goto out;
1450         }
1451         error = VFS_FHTOVP(mp, FHANDLE_FILEID(fhp), vpp);
1452 out:
1453         return error;
1454 }
1455
1456 /*
1457  * vfs_copyinfh_alloc: allocate and copyin a filehandle, given
1458  * the needed size.
1459  */
1460
1461 int
1462 vfs_copyinfh_alloc(const void *ufhp, size_t fhsize, fhandle_t **fhpp)
1463 {
1464         fhandle_t *fhp;
1465         int error;
1466
1467         *fhpp = NULL;
1468         if (fhsize > FHANDLE_SIZE_MAX) {
1469                 return EINVAL;
1470         }
1471         if (fhsize < FHANDLE_SIZE_MIN) {
1472                 return EINVAL;
1473         }
1474 again:
1475         fhp = kmem_alloc(fhsize, KM_SLEEP);
1476         if (fhp == NULL) {
1477                 return ENOMEM;
1478         }
1479         error = copyin(ufhp, fhp, fhsize);
1480         if (error == 0) {
1481                 /* XXX this check shouldn't be here */
1482                 if (FHANDLE_SIZE(fhp) == fhsize) {
1483                         *fhpp = fhp;
1484                         return 0;
1485                 } else if (fhsize == NFSX_V2FH && FHANDLE_SIZE(fhp) < fhsize) {
1486                         /*
1487                          * a kludge for nfsv2 padded handles.
1488                          */
1489                         size_t sz;
1490
1491                         sz = FHANDLE_SIZE(fhp);
1492                         kmem_free(fhp, fhsize);
1493                         fhsize = sz;
1494                         goto again;
1495                 } else {
1496                         /*
1497                          * userland told us wrong size.
1498                          */
1499                         error = EINVAL;
1500                 }
1501         }
1502         kmem_free(fhp, fhsize);
1503         return error;
1504 }
1505
1506 void
1507 vfs_copyinfh_free(fhandle_t *fhp)
1508 {
1509
1510         vfs__fhfree(fhp);
1511 }
1512
1513 /*
1514  * Get file handle system call
1515  */
1516 int
1517 sys___getfh30(struct lwp *l, const struct sys___getfh30_args *uap, register_t *retval)
1518 {
1519         /* {
1520                 syscallarg(char *) fname;
1521                 syscallarg(fhandle_t *) fhp;
1522                 syscallarg(size_t *) fh_size;
1523         } */
1524         struct vnode *vp;
1525         fhandle_t *fh;
1526         int error;
1527         struct nameidata nd;
1528         size_t sz;
1529         size_t usz;
1530
1531         /*
1532          * Must be super user
1533          */
1534         error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
1535             0, NULL, NULL, NULL);
1536         if (error)
1537                 return (error);
1538         NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, UIO_USERSPACE,
1539             SCARG(uap, fname));
1540         error = namei(&nd);
1541         if (error)
1542                 return (error);
1543         vp = nd.ni_vp;
1544         error = vfs_composefh_alloc(vp, &fh);
1545         vput(vp);
1546         if (error != 0) {
1547                 goto out;
1548         }
1549         error = copyin(SCARG(uap, fh_size), &usz, sizeof(size_t));
1550         if (error != 0) {
1551                 goto out;
1552         }
1553         sz = FHANDLE_SIZE(fh);
1554         error = copyout(&sz, SCARG(uap, fh_size), sizeof(size_t));
1555         if (error != 0) {
1556                 goto out;
1557         }
1558         if (usz >= sz) {
1559                 error = copyout(fh, SCARG(uap, fhp), sz);
1560         } else {
1561                 error = E2BIG;
1562         }
1563 out:
1564         vfs_composefh_free(fh);
1565         return (error);
1566 }
1567
1568 /*
1569  * Open a file given a file handle.
1570  *
1571  * Check permissions, allocate an open file structure,
1572  * and call the device open routine if any.
1573  */
1574
1575 int
1576 dofhopen(struct lwp *l, const void *ufhp, size_t fhsize, int oflags,
1577     register_t *retval)
1578 {
1579         file_t *fp;
1580         struct vnode *vp = NULL;
1581         kauth_cred_t cred = l->l_cred;
1582         file_t *nfp;
1583         int type, indx, error=0;
1584         struct flock lf;
1585         struct vattr va;
1586         fhandle_t *fh;
1587         int flags;
1588         proc_t *p;
1589
1590         p = curproc;
1591
1592         /*
1593          * Must be super user
1594          */
1595         if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
1596             0, NULL, NULL, NULL)))
1597                 return (error);
1598
1599         flags = FFLAGS(oflags);
1600         if ((flags & (FREAD | FWRITE)) == 0)
1601                 return (EINVAL);
1602         if ((flags & O_CREAT))
1603                 return (EINVAL);
1604         if ((error = fd_allocfile(&nfp, &indx)) != 0)
1605                 return (error);
1606         fp = nfp;
1607         error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
1608         if (error != 0) {
1609                 goto bad;
1610         }
1611         error = vfs_fhtovp(fh, &vp);
1612         if (error != 0) {
1613                 goto bad;
1614         }
1615
1616         /* Now do an effective vn_open */
1617
1618         if (vp->v_type == VSOCK) {
1619                 error = EOPNOTSUPP;
1620                 goto bad;
1621         }
1622         error = vn_openchk(vp, cred, flags);
1623         if (error != 0)
1624                 goto bad;
1625         if (flags & O_TRUNC) {
1626                 VOP_UNLOCK(vp, 0);                      /* XXX */
1627                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);   /* XXX */
1628                 vattr_null(&va);
1629                 va.va_size = 0;
1630                 error = VOP_SETATTR(vp, &va, cred);
1631                 if (error)
1632                         goto bad;
1633         }
1634         if ((error = VOP_OPEN(vp, flags, cred)) != 0)
1635                 goto bad;
1636         if (flags & FWRITE) {
1637                 mutex_enter(&vp->v_interlock);
1638                 vp->v_writecount++;
1639                 mutex_exit(&vp->v_interlock);
1640         }
1641
1642         /* done with modified vn_open, now finish what sys_open does. */
1643
1644         fp->f_flag = flags & FMASK;
1645         fp->f_type = DTYPE_VNODE;
1646         fp->f_ops = &vnops;
1647         fp->f_data = vp;
1648         if (flags & (O_EXLOCK | O_SHLOCK)) {
1649                 lf.l_whence = SEEK_SET;
1650                 lf.l_start = 0;
1651                 lf.l_len = 0;
1652                 if (flags & O_EXLOCK)
1653                         lf.l_type = F_WRLCK;
1654                 else
1655                         lf.l_type = F_RDLCK;
1656                 type = F_FLOCK;
1657                 if ((flags & FNONBLOCK) == 0)
1658                         type |= F_WAIT;
1659                 VOP_UNLOCK(vp, 0);
1660                 error = VOP_ADVLOCK(vp, fp, F_SETLK, &lf, type);
1661                 if (error) {
1662                         (void) vn_close(vp, fp->f_flag, fp->f_cred);
1663                         fd_abort(p, fp, indx);
1664                         return (error);
1665                 }
1666                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1667                 atomic_or_uint(&fp->f_flag, FHASLOCK);
1668         }
1669         VOP_UNLOCK(vp, 0);
1670         *retval = indx;
1671         fd_affix(p, fp, indx);
1672         vfs_copyinfh_free(fh);
1673         return (0);
1674
1675 bad:
1676         fd_abort(p, fp, indx);
1677         if (vp != NULL)
1678                 vput(vp);
1679         vfs_copyinfh_free(fh);
1680         return (error);
1681 }
1682
1683 int
1684 sys___fhopen40(struct lwp *l, const struct sys___fhopen40_args *uap, register_t *retval)
1685 {
1686         /* {
1687                 syscallarg(const void *) fhp;
1688                 syscallarg(size_t) fh_size;
1689                 syscallarg(int) flags;
1690         } */
1691
1692         return dofhopen(l, SCARG(uap, fhp), SCARG(uap, fh_size),
1693             SCARG(uap, flags), retval);
1694 }
1695
1696 int
1697 do_fhstat(struct lwp *l, const void *ufhp, size_t fhsize, struct stat *sb)
1698 {
1699         int error;
1700         fhandle_t *fh;
1701         struct vnode *vp;
1702
1703         /*
1704          * Must be super user
1705          */
1706         if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
1707             0, NULL, NULL, NULL)))
1708                 return (error);
1709
1710         error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
1711         if (error != 0)
1712                 return error;
1713
1714         error = vfs_fhtovp(fh, &vp);
1715         vfs_copyinfh_free(fh);
1716         if (error != 0)
1717                 return error;
1718
1719         error = vn_stat(vp, sb);
1720         vput(vp);
1721         return error;
1722 }
1723
1724
1725 /* ARGSUSED */
1726 int
1727 sys___fhstat50(struct lwp *l, const struct sys___fhstat50_args *uap, register_t *retval)
1728 {
1729         /* {
1730                 syscallarg(const void *) fhp;
1731                 syscallarg(size_t) fh_size;
1732                 syscallarg(struct stat *) sb;
1733         } */
1734         struct stat sb;
1735         int error;
1736
1737         error = do_fhstat(l, SCARG(uap, fhp), SCARG(uap, fh_size), &sb);
1738         if (error)
1739                 return error;
1740         return copyout(&sb, SCARG(uap, sb), sizeof(sb));
1741 }
1742
1743 int
1744 do_fhstatvfs(struct lwp *l, const void *ufhp, size_t fhsize, struct statvfs *sb,
1745     int flags)
1746 {
1747         fhandle_t *fh;
1748         struct mount *mp;
1749         struct vnode *vp;
1750         int error;
1751
1752         /*
1753          * Must be super user
1754          */
1755         if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
1756             0, NULL, NULL, NULL)))
1757                 return error;
1758
1759         error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
1760         if (error != 0)
1761                 return error;
1762
1763         error = vfs_fhtovp(fh, &vp);
1764         vfs_copyinfh_free(fh);
1765         if (error != 0)
1766                 return error;
1767
1768         mp = vp->v_mount;
1769         error = dostatvfs(mp, sb, l, flags, 1);
1770         vput(vp);
1771         return error;
1772 }
1773
1774 /* ARGSUSED */
1775 int
1776 sys___fhstatvfs140(struct lwp *l, const struct sys___fhstatvfs140_args *uap, register_t *retval)
1777 {
1778         /* {
1779                 syscallarg(const void *) fhp;
1780                 syscallarg(size_t) fh_size;
1781                 syscallarg(struct statvfs *) buf;
1782                 syscallarg(int) flags;
1783         } */
1784         struct statvfs *sb = STATVFSBUF_GET();
1785         int error;
1786
1787         error = do_fhstatvfs(l, SCARG(uap, fhp), SCARG(uap, fh_size), sb,
1788             SCARG(uap, flags));
1789         if (error == 0)
1790                 error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
1791         STATVFSBUF_PUT(sb);
1792         return error;
1793 }
1794
1795 /*
1796  * Create a special file.
1797  */
1798 /* ARGSUSED */
1799 int
1800 sys___mknod50(struct lwp *l, const struct sys___mknod50_args *uap,
1801     register_t *retval)
1802 {
1803         /* {
1804                 syscallarg(const char *) path;
1805                 syscallarg(mode_t) mode;
1806                 syscallarg(dev_t) dev;
1807         } */
1808         return do_sys_mknod(l, SCARG(uap, path), SCARG(uap, mode),
1809             SCARG(uap, dev), retval, UIO_USERSPACE);
1810 }
1811
1812 int
1813 do_sys_mknod(struct lwp *l, const char *pathname, mode_t mode, dev_t dev,
1814     register_t *retval, enum uio_seg seg)
1815 {
1816         struct proc *p = l->l_proc;
1817         struct vnode *vp;
1818         struct vattr vattr;
1819         int error, optype;
1820         struct nameidata nd;
1821         char *path;
1822         const char *cpath;
1823
1824         if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MKNOD,
1825             0, NULL, NULL, NULL)) != 0)
1826                 return (error);
1827
1828         optype = VOP_MKNOD_DESCOFFSET;
1829
1830         VERIEXEC_PATH_GET(pathname, seg, cpath, path);
1831         NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, seg, cpath);
1832
1833         if ((error = namei(&nd)) != 0)
1834                 goto out;
1835         vp = nd.ni_vp;
1836         if (vp != NULL)
1837                 error = EEXIST;
1838         else {
1839                 vattr_null(&vattr);
1840                 /* We will read cwdi->cwdi_cmask unlocked. */
1841                 vattr.va_mode = (mode & ALLPERMS) &~ p->p_cwdi->cwdi_cmask;
1842                 vattr.va_rdev = dev;
1843
1844                 switch (mode & S_IFMT) {
1845                 case S_IFMT:    /* used by badsect to flag bad sectors */
1846                         vattr.va_type = VBAD;
1847                         break;
1848                 case S_IFCHR:
1849                         vattr.va_type = VCHR;
1850                         break;
1851                 case S_IFBLK:
1852                         vattr.va_type = VBLK;
1853                         break;
1854                 case S_IFWHT:
1855                         optype = VOP_WHITEOUT_DESCOFFSET;
1856                         break;
1857                 case S_IFREG:
1858 #if NVERIEXEC > 0
1859                         error = veriexec_openchk(l, nd.ni_vp, nd.ni_dirp,
1860                             O_CREAT);
1861 #endif /* NVERIEXEC > 0 */
1862                         vattr.va_type = VREG;
1863                         vattr.va_rdev = VNOVAL;
1864                         optype = VOP_CREATE_DESCOFFSET;
1865                         break;
1866                 default:
1867                         error = EINVAL;
1868                         break;
1869                 }
1870         }
1871         if (!error) {
1872                 switch (optype) {
1873                 case VOP_WHITEOUT_DESCOFFSET:
1874                         error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
1875                         if (error)
1876                                 VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
1877                         vput(nd.ni_dvp);
1878                         break;
1879
1880                 case VOP_MKNOD_DESCOFFSET:
1881                         error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
1882                                                 &nd.ni_cnd, &vattr);
1883                         if (error == 0)
1884                                 vput(nd.ni_vp);
1885                         break;
1886
1887                 case VOP_CREATE_DESCOFFSET:
1888                         error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp,
1889                                                 &nd.ni_cnd, &vattr);
1890                         if (error == 0)
1891                                 vput(nd.ni_vp);
1892                         break;
1893                 }
1894         } else {
1895                 VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
1896                 if (nd.ni_dvp == vp)
1897                         vrele(nd.ni_dvp);
1898                 else
1899                         vput(nd.ni_dvp);
1900                 if (vp)
1901                         vrele(vp);
1902         }
1903 out:
1904         VERIEXEC_PATH_PUT(path);
1905         return (error);
1906 }
1907
1908 /*
1909  * Create a named pipe.
1910  */
1911 /* ARGSUSED */
1912 int
1913 sys_mkfifo(struct lwp *l, const struct sys_mkfifo_args *uap, register_t *retval)
1914 {
1915         /* {
1916                 syscallarg(const char *) path;
1917                 syscallarg(int) mode;
1918         } */
1919         struct proc *p = l->l_proc;
1920         struct vattr vattr;
1921         int error;
1922         struct nameidata nd;
1923
1924         NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, UIO_USERSPACE,
1925             SCARG(uap, path));
1926         if ((error = namei(&nd)) != 0)
1927                 return (error);
1928         if (nd.ni_vp != NULL) {
1929                 VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
1930                 if (nd.ni_dvp == nd.ni_vp)
1931                         vrele(nd.ni_dvp);
1932                 else
1933                         vput(nd.ni_dvp);
1934                 vrele(nd.ni_vp);
1935                 return (EEXIST);
1936         }
1937         vattr_null(&vattr);
1938         vattr.va_type = VFIFO;
1939         /* We will read cwdi->cwdi_cmask unlocked. */
1940         vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ p->p_cwdi->cwdi_cmask;
1941         error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
1942         if (error == 0)
1943                 vput(nd.ni_vp);
1944         return (error);
1945 }
1946
1947 /*
1948  * Make a hard file link.
1949  */
1950 /* ARGSUSED */
1951 int
1952 sys_link(struct lwp *l, const struct sys_link_args *uap, register_t *retval)
1953 {
1954         /* {
1955                 syscallarg(const char *) path;
1956                 syscallarg(const char *) link;
1957         } */
1958         struct vnode *vp;
1959         struct nameidata nd;
1960         int error;
1961
1962         error = namei_simple_user(SCARG(uap, path),
1963                                 NSM_FOLLOW_TRYEMULROOT, &vp);
1964         if (error != 0)
1965                 return (error);
1966         NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, UIO_USERSPACE,
1967             SCARG(uap, link));
1968         if ((error = namei(&nd)) != 0)
1969                 goto out;
1970         if (nd.ni_vp) {
1971                 VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
1972                 if (nd.ni_dvp == nd.ni_vp)
1973                         vrele(nd.ni_dvp);
1974                 else
1975                         vput(nd.ni_dvp);
1976                 vrele(nd.ni_vp);
1977                 error = EEXIST;
1978                 goto out;
1979         }
1980         error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
1981 out:
1982         vrele(vp);
1983         return (error);
1984 }
1985
1986 /*
1987  * Make a symbolic link.
1988  */
1989 /* ARGSUSED */
1990 int
1991 sys_symlink(struct lwp *l, const struct sys_symlink_args *uap, register_t *retval)
1992 {
1993         /* {
1994                 syscallarg(const char *) path;
1995                 syscallarg(const char *) link;
1996         } */
1997         struct proc *p = l->l_proc;
1998         struct vattr vattr;
1999         char *path;
2000         int error;
2001         struct nameidata nd;
2002
2003         path = PNBUF_GET();
2004         error = copyinstr(SCARG(uap, path), path, MAXPATHLEN, NULL);
2005         if (error)
2006                 goto out;
2007         NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, UIO_USERSPACE,
2008             SCARG(uap, link));
2009         if ((error = namei(&nd)) != 0)
2010                 goto out;
2011         if (nd.ni_vp) {
2012                 VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2013                 if (nd.ni_dvp == nd.ni_vp)
2014                         vrele(nd.ni_dvp);
2015                 else
2016                         vput(nd.ni_dvp);
2017                 vrele(nd.ni_vp);
2018                 error = EEXIST;
2019                 goto out;
2020         }
2021         vattr_null(&vattr);
2022         vattr.va_type = VLNK;
2023         /* We will read cwdi->cwdi_cmask unlocked. */
2024         vattr.va_mode = ACCESSPERMS &~ p->p_cwdi->cwdi_cmask;
2025         error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, path);
2026         if (error == 0)
2027                 vput(nd.ni_vp);
2028 out:
2029         PNBUF_PUT(path);
2030         return (error);
2031 }
2032
2033 /*
2034  * Delete a whiteout from the filesystem.
2035  */
2036 /* ARGSUSED */
2037 int
2038 sys_undelete(struct lwp *l, const struct sys_undelete_args *uap, register_t *retval)
2039 {
2040         /* {
2041                 syscallarg(const char *) path;
2042         } */
2043         int error;
2044         struct nameidata nd;
2045
2046         NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | TRYEMULROOT,
2047             UIO_USERSPACE, SCARG(uap, path));
2048         error = namei(&nd);
2049         if (error)
2050                 return (error);
2051
2052         if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
2053                 VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2054                 if (nd.ni_dvp == nd.ni_vp)
2055                         vrele(nd.ni_dvp);
2056                 else
2057                         vput(nd.ni_dvp);
2058                 if (nd.ni_vp)
2059                         vrele(nd.ni_vp);
2060                 return (EEXIST);
2061         }
2062         if ((error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE)) != 0)
2063                 VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2064         vput(nd.ni_dvp);
2065         return (error);
2066 }
2067
2068 /*
2069  * Delete a name from the filesystem.
2070  */
2071 /* ARGSUSED */
2072 int
2073 sys_unlink(struct lwp *l, const struct sys_unlink_args *uap, register_t *retval)
2074 {
2075         /* {
2076                 syscallarg(const char *) path;
2077         } */
2078
2079         return do_sys_unlink(SCARG(uap, path), UIO_USERSPACE);
2080 }
2081
2082 int
2083 do_sys_unlink(const char *arg, enum uio_seg seg)
2084 {
2085         struct vnode *vp;
2086         int error;
2087         struct nameidata nd;
2088         char *path;
2089         const char *cpath;
2090
2091         VERIEXEC_PATH_GET(arg, seg, cpath, path);
2092         NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF | TRYEMULROOT, seg, cpath);
2093
2094         if ((error = namei(&nd)) != 0)
2095                 goto out;
2096         vp = nd.ni_vp;
2097
2098         /*
2099          * The root of a mounted filesystem cannot be deleted.
2100          */
2101         if (vp->v_vflag & VV_ROOT) {
2102                 VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2103                 if (nd.ni_dvp == vp)
2104                         vrele(nd.ni_dvp);
2105                 else
2106                         vput(nd.ni_dvp);
2107                 vput(vp);
2108                 error = EBUSY;
2109                 goto out;
2110         }
2111
2112 #if NVERIEXEC > 0
2113         /* Handle remove requests for veriexec entries. */
2114         if ((error = veriexec_removechk(curlwp, nd.ni_vp, nd.ni_dirp)) != 0) {
2115                 VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2116                 if (nd.ni_dvp == vp)
2117                         vrele(nd.ni_dvp);
2118                 else
2119                         vput(nd.ni_dvp);
2120                 vput(vp);
2121                 goto out;
2122         }
2123 #endif /* NVERIEXEC > 0 */
2124
2125 #ifdef FILEASSOC
2126         (void)fileassoc_file_delete(vp);
2127 #endif /* FILEASSOC */
2128         error = VOP_REMOVE(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
2129 out:
2130         VERIEXEC_PATH_PUT(path);
2131         return (error);
2132 }
2133
2134 /*
2135  * Reposition read/write file offset.
2136  */
2137 int
2138 sys_lseek(struct lwp *l, const struct sys_lseek_args *uap, register_t *retval)
2139 {
2140         /* {
2141                 syscallarg(int) fd;
2142                 syscallarg(int) pad;
2143                 syscallarg(off_t) offset;
2144                 syscallarg(int) whence;
2145         } */
2146         kauth_cred_t cred = l->l_cred;
2147         file_t *fp;
2148         struct vnode *vp;
2149         struct vattr vattr;
2150         off_t newoff;
2151         int error, fd;
2152
2153         fd = SCARG(uap, fd);
2154
2155         if ((fp = fd_getfile(fd)) == NULL)
2156                 return (EBADF);
2157
2158         vp = fp->f_data;
2159         if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
2160                 error = ESPIPE;
2161                 goto out;
2162         }
2163
2164         switch (SCARG(uap, whence)) {
2165         case SEEK_CUR:
2166                 newoff = fp->f_offset + SCARG(uap, offset);
2167                 break;
2168         case SEEK_END:
2169                 error = VOP_GETATTR(vp, &vattr, cred);
2170                 if (error) {
2171                         goto out;
2172                 }
2173                 newoff = SCARG(uap, offset) + vattr.va_size;
2174                 break;
2175         case SEEK_SET:
2176                 newoff = SCARG(uap, offset);
2177                 break;
2178         default:
2179                 error = EINVAL;
2180                 goto out;
2181         }
2182         if ((error = VOP_SEEK(vp, fp->f_offset, newoff, cred)) == 0) {
2183                 *(off_t *)retval = fp->f_offset = newoff;
2184         }
2185  out:
2186         fd_putfile(fd);
2187         return (error);
2188 }
2189
2190 /*
2191  * Positional read system call.
2192  */
2193 int
2194 sys_pread(struct lwp *l, const struct sys_pread_args *uap, register_t *retval)
2195 {
2196         /* {
2197                 syscallarg(int) fd;
2198                 syscallarg(void *) buf;
2199                 syscallarg(size_t) nbyte;
2200                 syscallarg(off_t) offset;
2201         } */
2202         file_t *fp;
2203         struct vnode *vp;
2204         off_t offset;
2205         int error, fd = SCARG(uap, fd);
2206
2207         if ((fp = fd_getfile(fd)) == NULL)
2208                 return (EBADF);
2209
2210         if ((fp->f_flag & FREAD) == 0) {
2211                 fd_putfile(fd);
2212                 return (EBADF);
2213         }
2214
2215         vp = fp->f_data;
2216         if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
2217                 error = ESPIPE;
2218                 goto out;
2219         }
2220
2221         offset = SCARG(uap, offset);
2222
2223         /*
2224          * XXX This works because no file systems actually
2225          * XXX take any action on the seek operation.
2226          */
2227         if ((error = VOP_SEEK(vp, fp->f_offset, offset, fp->f_cred)) != 0)
2228                 goto out;
2229
2230         /* dofileread() will unuse the descriptor for us */
2231         return (dofileread(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
2232             &offset, 0, retval));
2233
2234  out:
2235         fd_putfile(fd);
2236         return (error);
2237 }
2238
2239 /*
2240  * Positional scatter read system call.
2241  */
2242 int
2243 sys_preadv(struct lwp *l, const struct sys_preadv_args *uap, register_t *retval)
2244 {
2245         /* {
2246                 syscallarg(int) fd;
2247                 syscallarg(const struct iovec *) iovp;
2248                 syscallarg(int) iovcnt;
2249                 syscallarg(off_t) offset;
2250         } */
2251         off_t offset = SCARG(uap, offset);
2252
2253         return do_filereadv(SCARG(uap, fd), SCARG(uap, iovp),
2254             SCARG(uap, iovcnt), &offset, 0, retval);
2255 }
2256
2257 /*
2258  * Positional write system call.
2259  */
2260 int
2261 sys_pwrite(struct lwp *l, const struct sys_pwrite_args *uap, register_t *retval)
2262 {
2263         /* {
2264                 syscallarg(int) fd;
2265                 syscallarg(const void *) buf;
2266                 syscallarg(size_t) nbyte;
2267                 syscallarg(off_t) offset;
2268         } */
2269         file_t *fp;
2270         struct vnode *vp;
2271         off_t offset;
2272         int error, fd = SCARG(uap, fd);
2273
2274         if ((fp = fd_getfile(fd)) == NULL)
2275                 return (EBADF);
2276
2277         if ((fp->f_flag & FWRITE) == 0) {
2278                 fd_putfile(fd);
2279                 return (EBADF);
2280         }
2281
2282         vp = fp->f_data;
2283         if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
2284                 error = ESPIPE;
2285                 goto out;
2286         }
2287
2288         offset = SCARG(uap, offset);
2289
2290         /*
2291          * XXX This works because no file systems actually
2292          * XXX take any action on the seek operation.
2293          */
2294         if ((error = VOP_SEEK(vp, fp->f_offset, offset, fp->f_cred)) != 0)
2295                 goto out;
2296
2297         /* dofilewrite() will unuse the descriptor for us */
2298         return (dofilewrite(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
2299             &offset, 0, retval));
2300
2301  out:
2302         fd_putfile(fd);
2303         return (error);
2304 }
2305
2306 /*
2307  * Positional gather write system call.
2308  */
2309 int
2310 sys_pwritev(struct lwp *l, const struct sys_pwritev_args *uap, register_t *retval)
2311 {
2312         /* {
2313                 syscallarg(int) fd;
2314                 syscallarg(const struct iovec *) iovp;
2315                 syscallarg(int) iovcnt;
2316                 syscallarg(off_t) offset;
2317         } */
2318         off_t offset = SCARG(uap, offset);
2319
2320         return do_filewritev(SCARG(uap, fd), SCARG(uap, iovp),
2321             SCARG(uap, iovcnt), &offset, 0, retval);
2322 }
2323
2324 /*
2325  * Check access permissions.
2326  */
2327 int
2328 sys_access(struct lwp *l, const struct sys_access_args *uap, register_t *retval)
2329 {
2330         /* {
2331                 syscallarg(const char *) path;
2332                 syscallarg(int) flags;
2333         } */
2334         kauth_cred_t cred;
2335         struct vnode *vp;
2336         int error, flags;
2337         struct nameidata nd;
2338
2339         cred = kauth_cred_dup(l->l_cred);
2340         kauth_cred_seteuid(cred, kauth_cred_getuid(l->l_cred));
2341         kauth_cred_setegid(cred, kauth_cred_getgid(l->l_cred));
2342         NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, UIO_USERSPACE,
2343             SCARG(uap, path));
2344         /* Override default credentials */
2345         nd.ni_cnd.cn_cred = cred;
2346         if ((error = namei(&nd)) != 0)
2347                 goto out;
2348         vp = nd.ni_vp;
2349
2350         /* Flags == 0 means only check for existence. */
2351         if (SCARG(uap, flags)) {
2352                 flags = 0;
2353                 if (SCARG(uap, flags) & R_OK)
2354                         flags |= VREAD;
2355                 if (SCARG(uap, flags) & W_OK)
2356                         flags |= VWRITE;
2357                 if (SCARG(uap, flags) & X_OK)
2358                         flags |= VEXEC;
2359
2360                 error = VOP_ACCESS(vp, flags, cred);
2361                 if (!error && (flags & VWRITE))
2362                         error = vn_writechk(vp);
2363         }
2364         vput(vp);
2365 out:
2366         kauth_cred_free(cred);
2367         return (error);
2368 }
2369
2370 /*
2371  * Common code for all sys_stat functions, including compat versions.
2372  */
2373 int
2374 do_sys_stat(const char *path, unsigned int nd_flags, struct stat *sb)
2375 {
2376         int error;
2377         struct nameidata nd;
2378
2379         NDINIT(&nd, LOOKUP, nd_flags | LOCKLEAF | TRYEMULROOT,
2380             UIO_USERSPACE, path);
2381         error = namei(&nd);
2382         if (error != 0)
2383                 return error;
2384         error = vn_stat(nd.ni_vp, sb);
2385         vput(nd.ni_vp);
2386         return error;
2387 }
2388
2389 /*
2390  * Get file status; this version follows links.
2391  */
2392 /* ARGSUSED */
2393 int
2394 sys___stat50(struct lwp *l, const struct sys___stat50_args *uap, register_t *retval)
2395 {
2396         /* {
2397                 syscallarg(const char *) path;
2398                 syscallarg(struct stat *) ub;
2399         } */
2400         struct stat sb;
2401         int error;
2402
2403         error = do_sys_stat(SCARG(uap, path), FOLLOW, &sb);
2404         if (error)
2405                 return error;
2406         return copyout(&sb, SCARG(uap, ub), sizeof(sb));
2407 }
2408
2409 /*
2410  * Get file status; this version does not follow links.
2411  */
2412 /* ARGSUSED */
2413 int
2414 sys___lstat50(struct lwp *l, const struct sys___lstat50_args *uap, register_t *retval)
2415 {
2416         /* {
2417                 syscallarg(const char *) path;
2418                 syscallarg(struct stat *) ub;
2419         } */
2420         struct stat sb;
2421         int error;
2422
2423         error = do_sys_stat(SCARG(uap, path), NOFOLLOW, &sb);
2424         if (error)
2425                 return error;
2426         return copyout(&sb, SCARG(uap, ub), sizeof(sb));
2427 }
2428
2429 /*
2430  * Get configurable pathname variables.
2431  */
2432 /* ARGSUSED */
2433 int
2434 sys_pathconf(struct lwp *l, const struct sys_pathconf_args *uap, register_t *retval)
2435 {
2436         /* {
2437                 syscallarg(const char *) path;
2438                 syscallarg(int) name;
2439         } */
2440         int error;
2441         struct nameidata nd;
2442
2443         NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, UIO_USERSPACE,
2444             SCARG(uap, path));
2445         if ((error = namei(&nd)) != 0)
2446                 return (error);
2447         error = VOP_PATHCONF(nd.ni_vp, SCARG(uap, name), retval);
2448         vput(nd.ni_vp);
2449         return (error);
2450 }
2451
2452 /*
2453  * Return target name of a symbolic link.
2454  */
2455 /* ARGSUSED */
2456 int
2457 sys_readlink(struct lwp *l, const struct sys_readlink_args *uap, register_t *retval)
2458 {
2459         /* {
2460                 syscallarg(const char *) path;
2461                 syscallarg(char *) buf;
2462                 syscallarg(size_t) count;
2463         } */
2464         struct vnode *vp;
2465         struct iovec aiov;
2466         struct uio auio;
2467         int error;
2468         struct nameidata nd;
2469
2470         NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | TRYEMULROOT, UIO_USERSPACE,
2471             SCARG(uap, path));
2472         if ((error = namei(&nd)) != 0)
2473                 return (error);
2474         vp = nd.ni_vp;
2475         if (vp->v_type != VLNK)
2476                 error = EINVAL;
2477         else if (!(vp->v_mount->mnt_flag & MNT_SYMPERM) ||
2478             (error = VOP_ACCESS(vp, VREAD, l->l_cred)) == 0) {
2479                 aiov.iov_base = SCARG(uap, buf);
2480                 aiov.iov_len = SCARG(uap, count);
2481                 auio.uio_iov = &aiov;
2482                 auio.uio_iovcnt = 1;
2483                 auio.uio_offset = 0;
2484                 auio.uio_rw = UIO_READ;
2485                 KASSERT(l == curlwp);
2486                 auio.uio_vmspace = l->l_proc->p_vmspace;
2487                 auio.uio_resid = SCARG(uap, count);
2488                 error = VOP_READLINK(vp, &auio, l->l_cred);
2489         }
2490         vput(vp);
2491         *retval = SCARG(uap, count) - auio.uio_resid;
2492         return (error);
2493 }
2494
2495 /*
2496  * Change flags of a file given a path name.
2497  */
2498 /* ARGSUSED */
2499 int
2500 sys_chflags(struct lwp *l, const struct sys_chflags_args *uap, register_t *retval)
2501 {
2502         /* {
2503                 syscallarg(const char *) path;
2504                 syscallarg(u_long) flags;
2505         } */
2506         struct vnode *vp;
2507         int error;
2508
2509         error = namei_simple_user(SCARG(uap, path),
2510                                 NSM_FOLLOW_TRYEMULROOT, &vp);
2511         if (error != 0)
2512                 return (error);
2513         error = change_flags(vp, SCARG(uap, flags), l);
2514         vput(vp);
2515         return (error);
2516 }
2517
2518 /*
2519  * Change flags of a file given a file descriptor.
2520  */
2521 /* ARGSUSED */
2522 int
2523 sys_fchflags(struct lwp *l, const struct sys_fchflags_args *uap, register_t *retval)
2524 {
2525         /* {
2526                 syscallarg(int) fd;
2527                 syscallarg(u_long) flags;
2528         } */
2529         struct vnode *vp;
2530         file_t *fp;
2531         int error;
2532
2533         /* fd_getvnode() will use the descriptor for us */
2534         if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
2535                 return (error);
2536         vp = fp->f_data;
2537         error = change_flags(vp, SCARG(uap, flags), l);
2538         VOP_UNLOCK(vp, 0);
2539         fd_putfile(SCARG(uap, fd));
2540         return (error);
2541 }
2542
2543 /*
2544  * Change flags of a file given a path name; this version does
2545  * not follow links.
2546  */
2547 int
2548 sys_lchflags(struct lwp *l, const struct sys_lchflags_args *uap, register_t *retval)
2549 {
2550         /* {
2551                 syscallarg(const char *) path;
2552                 syscallarg(u_long) flags;
2553         } */
2554         struct vnode *vp;
2555         int error;
2556
2557         error = namei_simple_user(SCARG(uap, path),
2558                                 NSM_NOFOLLOW_TRYEMULROOT, &vp);
2559         if (error != 0)
2560                 return (error);
2561         error = change_flags(vp, SCARG(uap, flags), l);
2562         vput(vp);
2563         return (error);
2564 }
2565
2566 /*
2567  * Common routine to change flags of a file.
2568  */
2569 int
2570 change_flags(struct vnode *vp, u_long flags, struct lwp *l)
2571 {
2572         struct vattr vattr;
2573         int error;
2574
2575         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2576         /*
2577          * Non-superusers cannot change the flags on devices, even if they
2578          * own them.
2579          */
2580         if (kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER, NULL)) {
2581                 if ((error = VOP_GETATTR(vp, &vattr, l->l_cred)) != 0)
2582                         goto out;
2583                 if (vattr.va_type == VCHR || vattr.va_type == VBLK) {
2584                         error = EINVAL;
2585                         goto out;
2586                 }
2587         }
2588         vattr_null(&vattr);
2589         vattr.va_flags = flags;
2590         error = VOP_SETATTR(vp, &vattr, l->l_cred);
2591 out:
2592         return (error);
2593 }
2594
2595 /*
2596  * Change mode of a file given path name; this version follows links.
2597  */
2598 /* ARGSUSED */
2599 int
2600 sys_chmod(struct lwp *l, const struct sys_chmod_args *uap, register_t *retval)
2601 {
2602         /* {
2603                 syscallarg(const char *) path;
2604                 syscallarg(int) mode;
2605         } */
2606         int error;
2607         struct vnode *vp;
2608
2609         error = namei_simple_user(SCARG(uap, path),
2610                                 NSM_FOLLOW_TRYEMULROOT, &vp);
2611         if (error != 0)
2612                 return (error);
2613
2614         error = change_mode(vp, SCARG(uap, mode), l);
2615
2616         vrele(vp);
2617         return (error);
2618 }
2619
2620 /*
2621  * Change mode of a file given a file descriptor.
2622  */
2623 /* ARGSUSED */
2624 int
2625 sys_fchmod(struct lwp *l, const struct sys_fchmod_args *uap, register_t *retval)
2626 {
2627         /* {
2628                 syscallarg(int) fd;
2629                 syscallarg(int) mode;
2630         } */
2631         file_t *fp;
2632         int error;
2633
2634         /* fd_getvnode() will use the descriptor for us */
2635         if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
2636                 return (error);
2637         error = change_mode(fp->f_data, SCARG(uap, mode), l);
2638         fd_putfile(SCARG(uap, fd));
2639         return (error);
2640 }
2641
2642 /*
2643  * Change mode of a file given path name; this version does not follow links.
2644  */
2645 /* ARGSUSED */
2646 int
2647 sys_lchmod(struct lwp *l, const struct sys_lchmod_args *uap, register_t *retval)
2648 {
2649         /* {
2650                 syscallarg(const char *) path;
2651                 syscallarg(int) mode;
2652         } */
2653         int error;
2654         struct vnode *vp;
2655
2656         error = namei_simple_user(SCARG(uap, path),
2657                                 NSM_NOFOLLOW_TRYEMULROOT, &vp);
2658         if (error != 0)
2659                 return (error);
2660
2661         error = change_mode(vp, SCARG(uap, mode), l);
2662
2663         vrele(vp);
2664         return (error);
2665 }
2666
2667 /*
2668  * Common routine to set mode given a vnode.
2669  */
2670 static int
2671 change_mode(struct vnode *vp, int mode, struct lwp *l)
2672 {
2673         struct vattr vattr;
2674         int error;
2675
2676         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2677         vattr_null(&vattr);
2678         vattr.va_mode = mode & ALLPERMS;
2679         error = VOP_SETATTR(vp, &vattr, l->l_cred);
2680         VOP_UNLOCK(vp, 0);
2681         return (error);
2682 }
2683
2684 /*
2685  * Set ownership given a path name; this version follows links.
2686  */
2687 /* ARGSUSED */
2688 int
2689 sys_chown(struct lwp *l, const struct sys_chown_args *uap, register_t *retval)
2690 {
2691         /* {
2692                 syscallarg(const char *) path;
2693                 syscallarg(uid_t) uid;
2694                 syscallarg(gid_t) gid;
2695         } */
2696         int error;
2697         struct vnode *vp;
2698
2699         error = namei_simple_user(SCARG(uap, path),
2700                                 NSM_FOLLOW_TRYEMULROOT, &vp);
2701         if (error != 0)
2702                 return (error);
2703
2704         error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 0);
2705
2706         vrele(vp);
2707         return (error);
2708 }
2709
2710 /*
2711  * Set ownership given a path name; this version follows links.
2712  * Provides POSIX semantics.
2713  */
2714 /* ARGSUSED */
2715 int
2716 sys___posix_chown(struct lwp *l, const struct sys___posix_chown_args *uap, register_t *retval)
2717 {
2718         /* {
2719                 syscallarg(const char *) path;
2720                 syscallarg(uid_t) uid;
2721                 syscallarg(gid_t) gid;
2722         } */
2723         int error;
2724         struct vnode *vp;
2725
2726         error = namei_simple_user(SCARG(uap, path),
2727                                 NSM_FOLLOW_TRYEMULROOT, &vp);
2728         if (error != 0)
2729                 return (error);
2730
2731         error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 1);
2732
2733         vrele(vp);
2734         return (error);
2735 }
2736
2737 /*
2738  * Set ownership given a file descriptor.
2739  */
2740 /* ARGSUSED */
2741 int
2742 sys_fchown(struct lwp *l, const struct sys_fchown_args *uap, register_t *retval)
2743 {
2744         /* {
2745                 syscallarg(int) fd;
2746                 syscallarg(uid_t) uid;
2747                 syscallarg(gid_t) gid;
2748         } */
2749         int error;
2750         file_t *fp;
2751
2752         /* fd_getvnode() will use the descriptor for us */
2753         if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
2754                 return (error);
2755         error = change_owner(fp->f_data, SCARG(uap, uid), SCARG(uap, gid),
2756             l, 0);
2757         fd_putfile(SCARG(uap, fd));
2758         return (error);
2759 }
2760
2761 /*
2762  * Set ownership given a file descriptor, providing POSIX/XPG semantics.
2763  */
2764 /* ARGSUSED */
2765 int
2766 sys___posix_fchown(struct lwp *l, const struct sys___posix_fchown_args *uap, register_t *retval)
2767 {
2768         /* {
2769                 syscallarg(int) fd;
2770                 syscallarg(uid_t) uid;
2771                 syscallarg(gid_t) gid;
2772         } */
2773         int error;
2774         file_t *fp;
2775
2776         /* fd_getvnode() will use the descriptor for us */
2777         if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
2778                 return (error);
2779         error = change_owner(fp->f_data, SCARG(uap, uid), SCARG(uap, gid),
2780             l, 1);
2781         fd_putfile(SCARG(uap, fd));
2782         return (error);
2783 }
2784
2785 /*
2786  * Set ownership given a path name; this version does not follow links.
2787  */
2788 /* ARGSUSED */
2789 int
2790 sys_lchown(struct lwp *l, const struct sys_lchown_args *uap, register_t *retval)
2791 {
2792         /* {
2793                 syscallarg(const char *) path;
2794                 syscallarg(uid_t) uid;
2795                 syscallarg(gid_t) gid;
2796         } */
2797         int error;
2798         struct vnode *vp;
2799
2800         error = namei_simple_user(SCARG(uap, path),
2801                                 NSM_NOFOLLOW_TRYEMULROOT, &vp);
2802         if (error != 0)
2803                 return (error);
2804
2805         error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 0);
2806
2807         vrele(vp);
2808         return (error);
2809 }
2810
2811 /*
2812  * Set ownership given a path name; this version does not follow links.
2813  * Provides POSIX/XPG semantics.
2814  */
2815 /* ARGSUSED */
2816 int
2817 sys___posix_lchown(struct lwp *l, const struct sys___posix_lchown_args *uap, register_t *retval)
2818 {
2819         /* {
2820                 syscallarg(const char *) path;
2821                 syscallarg(uid_t) uid;
2822                 syscallarg(gid_t) gid;
2823         } */
2824         int error;
2825         struct vnode *vp;
2826
2827         error = namei_simple_user(SCARG(uap, path),
2828                                 NSM_NOFOLLOW_TRYEMULROOT, &vp);
2829         if (error != 0)
2830                 return (error);
2831
2832         error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 1);
2833
2834         vrele(vp);
2835         return (error);
2836 }
2837
2838 /*
2839  * Common routine to set ownership given a vnode.
2840  */
2841 static int
2842 change_owner(struct vnode *vp, uid_t uid, gid_t gid, struct lwp *l,
2843     int posix_semantics)
2844 {
2845         struct vattr vattr;
2846         mode_t newmode;
2847         int error;
2848
2849         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2850         if ((error = VOP_GETATTR(vp, &vattr, l->l_cred)) != 0)
2851                 goto out;
2852
2853 #define CHANGED(x) ((int)(x) != -1)
2854         newmode = vattr.va_mode;
2855         if (posix_semantics) {
2856                 /*
2857                  * POSIX/XPG semantics: if the caller is not the super-user,
2858                  * clear set-user-id and set-group-id bits.  Both POSIX and
2859                  * the XPG consider the behaviour for calls by the super-user
2860                  * implementation-defined; we leave the set-user-id and set-
2861                  * group-id settings intact in that case.
2862                  */
2863                 if (kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
2864                                       NULL) != 0)
2865                         newmode &= ~(S_ISUID | S_ISGID);
2866         } else {
2867                 /*
2868                  * NetBSD semantics: when changing owner and/or group,
2869                  * clear the respective bit(s).
2870                  */
2871                 if (CHANGED(uid))
2872                         newmode &= ~S_ISUID;
2873                 if (CHANGED(gid))
2874                         newmode &= ~S_ISGID;
2875         }
2876         /* Update va_mode iff altered. */
2877         if (vattr.va_mode == newmode)
2878                 newmode = VNOVAL;
2879
2880         vattr_null(&vattr);
2881         vattr.va_uid = CHANGED(uid) ? uid : (uid_t)VNOVAL;
2882         vattr.va_gid = CHANGED(gid) ? gid : (gid_t)VNOVAL;
2883         vattr.va_mode = newmode;
2884         error = VOP_SETATTR(vp, &vattr, l->l_cred);
2885 #undef CHANGED
2886
2887 out:
2888         VOP_UNLOCK(vp, 0);
2889         return (error);
2890 }
2891
2892 /*
2893  * Set the access and modification times given a path name; this
2894  * version follows links.
2895  */
2896 /* ARGSUSED */
2897 int
2898 sys___utimes50(struct lwp *l, const struct sys___utimes50_args *uap,
2899     register_t *retval)
2900 {
2901         /* {
2902                 syscallarg(const char *) path;
2903                 syscallarg(const struct timeval *) tptr;
2904         } */
2905
2906         return do_sys_utimes(l, NULL, SCARG(uap, path), FOLLOW,
2907             SCARG(uap, tptr), UIO_USERSPACE);
2908 }
2909
2910 /*
2911  * Set the access and modification times given a file descriptor.
2912  */
2913 /* ARGSUSED */
2914 int
2915 sys___futimes50(struct lwp *l, const struct sys___futimes50_args *uap,
2916     register_t *retval)
2917 {
2918         /* {
2919                 syscallarg(int) fd;
2920                 syscallarg(const struct timeval *) tptr;
2921         } */
2922         int error;
2923         file_t *fp;
2924
2925         /* fd_getvnode() will use the descriptor for us */
2926         if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
2927                 return (error);
2928         error = do_sys_utimes(l, fp->f_data, NULL, 0, SCARG(uap, tptr),
2929             UIO_USERSPACE);
2930         fd_putfile(SCARG(uap, fd));
2931         return (error);
2932 }
2933
2934 /*
2935  * Set the access and modification times given a path name; this
2936  * version does not follow links.
2937  */
2938 int
2939 sys___lutimes50(struct lwp *l, const struct sys___lutimes50_args *uap,
2940     register_t *retval)
2941 {
2942         /* {
2943                 syscallarg(const char *) path;
2944                 syscallarg(const struct timeval *) tptr;
2945         } */
2946
2947         return do_sys_utimes(l, NULL, SCARG(uap, path), NOFOLLOW,
2948             SCARG(uap, tptr), UIO_USERSPACE);
2949 }
2950
2951 /*
2952  * Common routine to set access and modification times given a vnode.
2953  */
2954 int
2955 do_sys_utimes(struct lwp *l, struct vnode *vp, const char *path, int flag,
2956     const struct timeval *tptr, enum uio_seg seg)
2957 {
2958         struct vattr vattr;
2959         int error, dorele = 0;
2960         namei_simple_flags_t sflags;
2961
2962         bool vanull, setbirthtime;
2963         struct timespec ts[2];
2964
2965         /*
2966          * I have checked all callers and they pass either FOLLOW,
2967          * NOFOLLOW, or 0 (when they don't pass a path), and NOFOLLOW
2968          * is 0. More to the point, they don't pass anything else.
2969          * Let's keep it that way at least until the namei interfaces
2970          * are fully sanitized.
2971          */
2972         KASSERT(flag == NOFOLLOW || flag == FOLLOW);
2973         sflags = (flag == FOLLOW) ?
2974                 NSM_FOLLOW_TRYEMULROOT : NSM_NOFOLLOW_TRYEMULROOT;
2975
2976         if (tptr == NULL) {
2977                 vanull = true;
2978                 nanotime(&ts[0]);
2979                 ts[1] = ts[0];
2980         } else {
2981                 struct timeval tv[2];
2982
2983                 vanull = false;
2984                 if (seg != UIO_SYSSPACE) {
2985                         error = copyin(tptr, tv, sizeof (tv));
2986                         if (error != 0)
2987                                 return error;
2988                         tptr = tv;
2989                 }
2990                 TIMEVAL_TO_TIMESPEC(&tptr[0], &ts[0]);
2991                 TIMEVAL_TO_TIMESPEC(&tptr[1], &ts[1]);
2992         }
2993
2994         if (vp == NULL) {
2995                 /* note: SEG describes TPTR, not PATH; PATH is always user */
2996                 error = namei_simple_user(path, sflags, &vp);
2997                 if (error != 0)
2998                         return error;
2999                 dorele = 1;
3000         }
3001
3002         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3003         setbirthtime = (VOP_GETATTR(vp, &vattr, l->l_cred) == 0 &&
3004             timespeccmp(&ts[1], &vattr.va_birthtime, <));
3005         vattr_null(&vattr);
3006         vattr.va_atime = ts[0];
3007         vattr.va_mtime = ts[1];
3008         if (setbirthtime)
3009                 vattr.va_birthtime = ts[1];
3010         if (vanull)
3011                 vattr.va_vaflags |= VA_UTIMES_NULL;
3012         error = VOP_SETATTR(vp, &vattr, l->l_cred);
3013         VOP_UNLOCK(vp, 0);
3014
3015         if (dorele != 0)
3016                 vrele(vp);
3017
3018         return error;
3019 }
3020
3021 /*
3022  * Truncate a file given its path name.
3023  */
3024 /* ARGSUSED */
3025 int
3026 sys_truncate(struct lwp *l, const struct sys_truncate_args *uap, register_t *retval)
3027 {
3028         /* {
3029                 syscallarg(const char *) path;
3030                 syscallarg(int) pad;
3031                 syscallarg(off_t) length;
3032         } */
3033         struct vnode *vp;
3034         struct vattr vattr;
3035         int error;
3036
3037         error = namei_simple_user(SCARG(uap, path),
3038                                 NSM_FOLLOW_TRYEMULROOT, &vp);
3039         if (error != 0)
3040                 return (error);
3041         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3042         if (vp->v_type == VDIR)
3043                 error = EISDIR;
3044         else if ((error = vn_writechk(vp)) == 0 &&
3045             (error = VOP_ACCESS(vp, VWRITE, l->l_cred)) == 0) {
3046                 vattr_null(&vattr);
3047                 vattr.va_size = SCARG(uap, length);
3048                 error = VOP_SETATTR(vp, &vattr, l->l_cred);
3049         }
3050         vput(vp);
3051         return (error);
3052 }
3053
3054 /*
3055  * Truncate a file given a file descriptor.
3056  */
3057 /* ARGSUSED */
3058 int
3059 sys_ftruncate(struct lwp *l, const struct sys_ftruncate_args *uap, register_t *retval)
3060 {
3061         /* {
3062                 syscallarg(int) fd;
3063                 syscallarg(int) pad;
3064                 syscallarg(off_t) length;
3065         } */
3066         struct vattr vattr;
3067         struct vnode *vp;
3068         file_t *fp;
3069         int error;
3070
3071         /* fd_getvnode() will use the descriptor for us */
3072         if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3073                 return (error);
3074         if ((fp->f_flag & FWRITE) == 0) {
3075                 error = EINVAL;
3076                 goto out;
3077         }
3078         vp = fp->f_data;
3079         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3080         if (vp->v_type == VDIR)
3081                 error = EISDIR;
3082         else if ((error = vn_writechk(vp)) == 0) {
3083                 vattr_null(&vattr);
3084                 vattr.va_size = SCARG(uap, length);
3085                 error = VOP_SETATTR(vp, &vattr, fp->f_cred);
3086         }
3087         VOP_UNLOCK(vp, 0);
3088  out:
3089         fd_putfile(SCARG(uap, fd));
3090         return (error);
3091 }
3092
3093 /*
3094  * Sync an open file.
3095  */
3096 /* ARGSUSED */
3097 int
3098 sys_fsync(struct lwp *l, const struct sys_fsync_args *uap, register_t *retval)
3099 {
3100         /* {
3101                 syscallarg(int) fd;
3102         } */
3103         struct vnode *vp;
3104         file_t *fp;
3105         int error;
3106
3107         /* fd_getvnode() will use the descriptor for us */
3108         if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3109                 return (error);
3110         vp = fp->f_data;
3111         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3112         error = VOP_FSYNC(vp, fp->f_cred, FSYNC_WAIT, 0, 0);
3113         VOP_UNLOCK(vp, 0);
3114         fd_putfile(SCARG(uap, fd));
3115         return (error);
3116 }
3117
3118 /*
3119  * Sync a range of file data.  API modeled after that found in AIX.
3120  *
3121  * FDATASYNC indicates that we need only save enough metadata to be able
3122  * to re-read the written data.  Note we duplicate AIX's requirement that
3123  * the file be open for writing.
3124  */
3125 /* ARGSUSED */
3126 int
3127 sys_fsync_range(struct lwp *l, const struct sys_fsync_range_args *uap, register_t *retval)
3128 {
3129         /* {
3130                 syscallarg(int) fd;
3131                 syscallarg(int) flags;
3132                 syscallarg(off_t) start;
3133                 syscallarg(off_t) length;
3134         } */
3135         struct vnode *vp;
3136         file_t *fp;
3137         int flags, nflags;
3138         off_t s, e, len;
3139         int error;
3140
3141         /* fd_getvnode() will use the descriptor for us */
3142         if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3143                 return (error);
3144
3145         if ((fp->f_flag & FWRITE) == 0) {
3146                 error = EBADF;
3147                 goto out;
3148         }
3149
3150         flags = SCARG(uap, flags);
3151         if (((flags & (FDATASYNC | FFILESYNC)) == 0) ||
3152             ((~flags & (FDATASYNC | FFILESYNC)) == 0)) {
3153                 error = EINVAL;
3154                 goto out;
3155         }
3156         /* Now set up the flags for value(s) to pass to VOP_FSYNC() */
3157         if (flags & FDATASYNC)
3158                 nflags = FSYNC_DATAONLY | FSYNC_WAIT;
3159         else
3160                 nflags = FSYNC_WAIT;
3161         if (flags & FDISKSYNC)
3162                 nflags |= FSYNC_CACHE;
3163
3164         len = SCARG(uap, length);
3165         /* If length == 0, we do the whole file, and s = l = 0 will do that */
3166         if (len) {
3167                 s = SCARG(uap, start);
3168                 e = s + len;
3169                 if (e < s) {
3170                         error = EINVAL;
3171                         goto out;
3172                 }
3173         } else {
3174                 e = 0;
3175                 s = 0;
3176         }
3177
3178         vp = fp->f_data;
3179         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3180         error = VOP_FSYNC(vp, fp->f_cred, nflags, s, e);
3181         VOP_UNLOCK(vp, 0);
3182 out:
3183         fd_putfile(SCARG(uap, fd));
3184         return (error);
3185 }
3186
3187 /*
3188  * Sync the data of an open file.
3189  */
3190 /* ARGSUSED */
3191 int
3192 sys_fdatasync(struct lwp *l, const struct sys_fdatasync_args *uap, register_t *retval)
3193 {
3194         /* {
3195                 syscallarg(int) fd;
3196         } */
3197         struct vnode *vp;
3198         file_t *fp;
3199         int error;
3200
3201         /* fd_getvnode() will use the descriptor for us */
3202         if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3203                 return (error);
3204         if ((fp->f_flag & FWRITE) == 0) {
3205                 fd_putfile(SCARG(uap, fd));
3206                 return (EBADF);
3207         }
3208         vp = fp->f_data;
3209         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3210         error = VOP_FSYNC(vp, fp->f_cred, FSYNC_WAIT|FSYNC_DATAONLY, 0, 0);
3211         VOP_UNLOCK(vp, 0);
3212         fd_putfile(SCARG(uap, fd));
3213         return (error);
3214 }
3215
3216 /*
3217  * Rename files, (standard) BSD semantics frontend.
3218  */
3219 /* ARGSUSED */
3220 int
3221 sys_rename(struct lwp *l, const struct sys_rename_args *uap, register_t *retval)
3222 {
3223         /* {
3224                 syscallarg(const char *) from;
3225                 syscallarg(const char *) to;
3226         } */
3227
3228         return (do_sys_rename(SCARG(uap, from), SCARG(uap, to), UIO_USERSPACE, 0));
3229 }
3230
3231 /*
3232  * Rename files, POSIX semantics frontend.
3233  */
3234 /* ARGSUSED */
3235 int
3236 sys___posix_rename(struct lwp *l, const struct sys___posix_rename_args *uap, register_t *retval)
3237 {
3238         /* {
3239                 syscallarg(const char *) from;
3240                 syscallarg(const char *) to;
3241         } */
3242
3243         return (do_sys_rename(SCARG(uap, from), SCARG(uap, to), UIO_USERSPACE, 1));
3244 }
3245
3246 /*
3247  * Rename files.  Source and destination must either both be directories,
3248  * or both not be directories.  If target is a directory, it must be empty.
3249  * If `from' and `to' refer to the same object, the value of the `retain'
3250  * argument is used to determine whether `from' will be
3251  *
3252  * (retain == 0)        deleted unless `from' and `to' refer to the same
3253  *                      object in the file system's name space (BSD).
3254  * (retain == 1)        always retained (POSIX).
3255  */
3256 int
3257 do_sys_rename(const char *from, const char *to, enum uio_seg seg, int retain)
3258 {
3259         struct vnode *tvp, *fvp, *tdvp;
3260         struct nameidata fromnd, tond;
3261         struct mount *fs;
3262         struct lwp *l = curlwp;
3263         struct proc *p;
3264         uint32_t saveflag;
3265         int error;
3266
3267         NDINIT(&fromnd, DELETE, LOCKPARENT | SAVESTART | TRYEMULROOT | INRENAME,
3268             seg, from);
3269         if ((error = namei(&fromnd)) != 0)
3270                 return (error);
3271         if (fromnd.ni_dvp != fromnd.ni_vp)
3272                 VOP_UNLOCK(fromnd.ni_dvp, 0);
3273         fvp = fromnd.ni_vp;
3274
3275         fs = fvp->v_mount;
3276         error = VFS_RENAMELOCK_ENTER(fs);
3277         if (error) {
3278                 VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
3279                 vrele(fromnd.ni_dvp);
3280                 vrele(fvp);
3281                 goto out1;
3282         }
3283
3284         /*
3285          * close, partially, yet another race - ideally we should only
3286          * go as far as getting fromnd.ni_dvp before getting the per-fs
3287          * lock, and then continue to get fromnd.ni_vp, but we can't do
3288          * that with namei as it stands.
3289          *
3290          * This still won't prevent rmdir from nuking fromnd.ni_vp
3291          * under us. The real fix is to get the locks in the right
3292          * order and do the lookups in the right places, but that's a
3293          * major rototill.
3294          *
3295          * Preserve the SAVESTART in cn_flags, because who knows what
3296          * might happen if we don't.
3297          *
3298          * Note: this logic (as well as this whole function) is cloned
3299          * in nfs_serv.c. Proceed accordingly.
3300          */
3301         vrele(fvp);
3302         if ((fromnd.ni_cnd.cn_namelen == 1 &&
3303              fromnd.ni_cnd.cn_nameptr[0] == '.') ||
3304             (fromnd.ni_cnd.cn_namelen == 2 &&
3305              fromnd.ni_cnd.cn_nameptr[0] == '.' &&
3306              fromnd.ni_cnd.cn_nameptr[1] == '.')) {
3307                 error = EINVAL;
3308                 VFS_RENAMELOCK_EXIT(fs);
3309                 VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
3310                 vrele(fromnd.ni_dvp);
3311                 goto out1;
3312         }
3313         saveflag = fromnd.ni_cnd.cn_flags & SAVESTART;
3314         fromnd.ni_cnd.cn_flags &= ~SAVESTART;
3315         vn_lock(fromnd.ni_dvp, LK_EXCLUSIVE | LK_RETRY);
3316         error = relookup(fromnd.ni_dvp, &fromnd.ni_vp, &fromnd.ni_cnd);
3317         fromnd.ni_cnd.cn_flags |= saveflag;
3318         if (error) {
3319                 VOP_UNLOCK(fromnd.ni_dvp, 0);
3320                 VFS_RENAMELOCK_EXIT(fs);
3321                 VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
3322                 vrele(fromnd.ni_dvp);
3323                 goto out1;
3324         }
3325         VOP_UNLOCK(fromnd.ni_vp, 0);
3326         if (fromnd.ni_dvp != fromnd.ni_vp)
3327                 VOP_UNLOCK(fromnd.ni_dvp, 0);
3328         fvp = fromnd.ni_vp;
3329
3330         NDINIT(&tond, RENAME,
3331             LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | TRYEMULROOT
3332               | INRENAME | (fvp->v_type == VDIR ? CREATEDIR : 0),
3333             seg, to);
3334         if ((error = namei(&tond)) != 0) {
3335                 VFS_RENAMELOCK_EXIT(fs);
3336                 VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
3337                 vrele(fromnd.ni_dvp);
3338                 vrele(fvp);
3339                 goto out1;
3340         }
3341         tdvp = tond.ni_dvp;
3342         tvp = tond.ni_vp;
3343
3344         if (tvp != NULL) {
3345                 if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
3346                         error = ENOTDIR;
3347                         goto out;
3348                 } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
3349                         error = EISDIR;
3350                         goto out;
3351                 }
3352         }
3353
3354         if (fvp == tdvp)
3355                 error = EINVAL;
3356
3357         /*
3358          * Source and destination refer to the same object.
3359          */
3360         if (fvp == tvp) {
3361                 if (retain)
3362                         error = -1;
3363                 else if (fromnd.ni_dvp == tdvp &&
3364                     fromnd.ni_cnd.cn_namelen == tond.ni_cnd.cn_namelen &&
3365                     !memcmp(fromnd.ni_cnd.cn_nameptr,
3366                           tond.ni_cnd.cn_nameptr,
3367                           fromnd.ni_cnd.cn_namelen))
3368                 error = -1;
3369         }
3370
3371 #if NVERIEXEC > 0
3372         if (!error) {
3373                 char *f1, *f2;
3374                 size_t f1_len;
3375                 size_t f2_len;
3376
3377                 f1_len = fromnd.ni_cnd.cn_namelen + 1;
3378                 f1 = kmem_alloc(f1_len, KM_SLEEP);
3379                 strlcpy(f1, fromnd.ni_cnd.cn_nameptr, f1_len);
3380
3381                 f2_len = tond.ni_cnd.cn_namelen + 1;
3382                 f2 = kmem_alloc(f2_len, KM_SLEEP);
3383                 strlcpy(f2, tond.ni_cnd.cn_nameptr, f2_len);
3384
3385                 error = veriexec_renamechk(l, fvp, f1, tvp, f2);
3386
3387                 kmem_free(f1, f1_len);
3388                 kmem_free(f2, f2_len);
3389         }
3390 #endif /* NVERIEXEC > 0 */
3391
3392 out:
3393         p = l->l_proc;
3394         if (!error) {
3395                 error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd,
3396                                    tond.ni_dvp, tond.ni_vp, &tond.ni_cnd);
3397                 VFS_RENAMELOCK_EXIT(fs);
3398         } else {
3399                 VOP_ABORTOP(tond.ni_dvp, &tond.ni_cnd);
3400                 if (tdvp == tvp)
3401                         vrele(tdvp);
3402                 else
3403                         vput(tdvp);
3404                 if (tvp)
3405                         vput(tvp);
3406                 VFS_RENAMELOCK_EXIT(fs);
3407                 VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
3408                 vrele(fromnd.ni_dvp);
3409                 vrele(fvp);
3410         }
3411         vrele(tond.ni_startdir);
3412         PNBUF_PUT(tond.ni_cnd.cn_pnbuf);
3413 out1:
3414         if (fromnd.ni_startdir)
3415                 vrele(fromnd.ni_startdir);
3416         PNBUF_PUT(fromnd.ni_cnd.cn_pnbuf);
3417         return (error == -1 ? 0 : error);
3418 }
3419
3420 /*
3421  * Make a directory file.
3422  */
3423 /* ARGSUSED */
3424 int
3425 sys_mkdir(struct lwp *l, const struct sys_mkdir_args *uap, register_t *retval)
3426 {
3427         /* {
3428                 syscallarg(const char *) path;
3429                 syscallarg(int) mode;
3430         } */
3431
3432         return do_sys_mkdir(SCARG(uap, path), SCARG(uap, mode), UIO_USERSPACE);
3433 }
3434
3435 int
3436 do_sys_mkdir(const char *path, mode_t mode, enum uio_seg seg)
3437 {
3438         struct proc *p = curlwp->l_proc;
3439         struct vnode *vp;
3440         struct vattr vattr;
3441         int error;
3442         struct nameidata nd;
3443
3444         NDINIT(&nd, CREATE, LOCKPARENT | CREATEDIR | TRYEMULROOT,
3445             seg, path);
3446         if ((error = namei(&nd)) != 0)
3447                 return (error);
3448         vp = nd.ni_vp;
3449         if (vp != NULL) {
3450                 VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
3451                 if (nd.ni_dvp == vp)
3452                         vrele(nd.ni_dvp);
3453                 else
3454                         vput(nd.ni_dvp);
3455                 vrele(vp);
3456                 return (EEXIST);
3457         }
3458         vattr_null(&vattr);
3459         vattr.va_type = VDIR;
3460         /* We will read cwdi->cwdi_cmask unlocked. */
3461         vattr.va_mode = (mode & ACCESSPERMS) &~ p->p_cwdi->cwdi_cmask;
3462         error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
3463         if (!error)
3464                 vput(nd.ni_vp);
3465         return (error);
3466 }
3467
3468 /*
3469  * Remove a directory file.
3470  */
3471 /* ARGSUSED */
3472 int
3473 sys_rmdir(struct lwp *l, const struct sys_rmdir_args *uap, register_t *retval)
3474 {
3475         /* {
3476                 syscallarg(const char *) path;
3477         } */
3478         struct vnode *vp;
3479         int error;
3480         struct nameidata nd;
3481
3482         NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF | TRYEMULROOT, UIO_USERSPACE,
3483             SCARG(uap, path));
3484         if ((error = namei(&nd)) != 0)
3485                 return (error);
3486         vp = nd.ni_vp;
3487         if (vp->v_type != VDIR) {
3488                 error = ENOTDIR;
3489                 goto out;
3490         }
3491         /*
3492          * No rmdir "." please.
3493          */
3494         if (nd.ni_dvp == vp) {
3495                 error = EINVAL;
3496                 goto out;
3497         }
3498         /*
3499          * The root of a mounted filesystem cannot be deleted.
3500          */
3501         if ((vp->v_vflag & VV_ROOT) != 0 || vp->v_mountedhere != NULL) {
3502                 error = EBUSY;
3503                 goto out;
3504         }
3505         error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
3506         return (error);
3507
3508 out:
3509         VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
3510         if (nd.ni_dvp == vp)
3511                 vrele(nd.ni_dvp);
3512         else
3513                 vput(nd.ni_dvp);
3514         vput(vp);
3515         return (error);
3516 }
3517
3518 /*
3519  * Read a block of directory entries in a file system independent format.
3520  */
3521 int
3522 sys___getdents30(struct lwp *l, const struct sys___getdents30_args *uap, register_t *retval)
3523 {
3524         /* {
3525                 syscallarg(int) fd;
3526                 syscallarg(char *) buf;
3527                 syscallarg(size_t) count;
3528         } */
3529         file_t *fp;
3530         int error, done;
3531
3532         /* fd_getvnode() will use the descriptor for us */
3533         if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3534                 return (error);
3535         if ((fp->f_flag & FREAD) == 0) {
3536                 error = EBADF;
3537                 goto out;
3538         }
3539         error = vn_readdir(fp, SCARG(uap, buf), UIO_USERSPACE,
3540                         SCARG(uap, count), &done, l, 0, 0);
3541         ktrgenio(SCARG(uap, fd), UIO_READ, SCARG(uap, buf), done, error);
3542         *retval = done;
3543  out:
3544         fd_putfile(SCARG(uap, fd));
3545         return (error);
3546 }
3547
3548 /*
3549  * Set the mode mask for creation of filesystem nodes.
3550  */
3551 int
3552 sys_umask(struct lwp *l, const struct sys_umask_args *uap, register_t *retval)
3553 {
3554         /* {
3555                 syscallarg(mode_t) newmask;
3556         } */
3557         struct proc *p = l->l_proc;
3558         struct cwdinfo *cwdi;
3559
3560         /*
3561          * cwdi->cwdi_cmask will be read unlocked elsewhere.  What's
3562          * important is that we serialize changes to the mask.  The
3563          * rw_exit() will issue a write memory barrier on our behalf,
3564          * and force the changes out to other CPUs (as it must use an
3565          * atomic operation, draining the local CPU's store buffers).
3566          */
3567         cwdi = p->p_cwdi;
3568         rw_enter(&cwdi->cwdi_lock, RW_WRITER);
3569         *retval = cwdi->cwdi_cmask;
3570         cwdi->cwdi_cmask = SCARG(uap, newmask) & ALLPERMS;
3571         rw_exit(&cwdi->cwdi_lock);
3572
3573         return (0);
3574 }
3575
3576 int
3577 dorevoke(struct vnode *vp, kauth_cred_t cred)
3578 {
3579         struct vattr vattr;
3580         int error;
3581
3582         if ((error = VOP_GETATTR(vp, &vattr, cred)) != 0)
3583                 return error;
3584         if (kauth_cred_geteuid(cred) == vattr.va_uid ||
3585             (error = kauth_authorize_generic(cred,
3586             KAUTH_GENERIC_ISSUSER, NULL)) == 0)
3587                 VOP_REVOKE(vp, REVOKEALL);
3588         return (error);
3589 }
3590
3591 /*
3592  * Void all references to file by ripping underlying filesystem
3593  * away from vnode.
3594  */
3595 /* ARGSUSED */
3596 int
3597 sys_revoke(struct lwp *l, const struct sys_revoke_args *uap, register_t *retval)
3598 {
3599         /* {
3600                 syscallarg(const char *) path;
3601         } */
3602         struct vnode *vp;
3603         int error;
3604
3605         error = namei_simple_user(SCARG(uap, path),
3606                                 NSM_FOLLOW_TRYEMULROOT, &vp);
3607         if (error != 0)
3608                 return (error);
3609         error = dorevoke(vp, l->l_cred);
3610         vrele(vp);
3611         return (error);
3612 }