usr/src/uts/common/fs/lofs/lofs_vnops.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  * Copyright 2015 Joyent, Inc.
  25  */
  26
  27 #include <sys/param.h>
  28 #include <sys/systm.h>
  29 #include <sys/errno.h>
  30 #include <sys/vnode.h>
  31 #include <sys/vfs.h>
  32 #include <sys/vfs_opreg.h>
  33 #include <sys/uio.h>
  34 #include <sys/cred.h>
  35 #include <sys/pathname.h>
  36 #include <sys/debug.h>
  37 #include <sys/fs/lofs_node.h>
  38 #include <sys/fs/lofs_info.h>
  39 #include <fs/fs_subr.h>
  40 #include <vm/as.h>
  41 #include <vm/seg.h>
  42
/*
 * These are the vnode ops routines which implement the vnode interface to
 * the looped-back file system.  These routines just take their parameters
 * and then call the appropriate real vnode routine(s) to do the work.
 */
  48
  49 static int
  50 lo_open(vnode_t **vpp, int flag, struct cred *cr, caller_context_t *ct)
  51 {
  52         vnode_t *vp = *vpp;
  53         vnode_t *rvp;
  54         vnode_t *oldvp;
  55         int error;
  56
  57 #ifdef LODEBUG
  58         lo_dprint(4, "lo_open vp %p cnt=%d realvp %p cnt=%d\n",
  59             vp, vp->v_count, realvp(vp), realvp(vp)->v_count);
  60 #endif
  61
  62         oldvp = vp;
  63         vp = rvp = realvp(vp);
  64         /*
  65          * Need to hold new reference to vp since VOP_OPEN() may
  66          * decide to release it.
  67          */
  68         VN_HOLD(vp);
  69         error = VOP_OPEN(&rvp, flag, cr, ct);
  70
  71         if (!error && rvp != vp) {
  72                 /*
  73                  * the FS which we called should have released the
  74                  * new reference on vp
  75                  */
  76                 *vpp = makelonode(rvp, vtoli(oldvp->v_vfsp), 0);
  77                 if ((*vpp)->v_type == VDIR) {
  78                         /*
  79                          * Copy over any looping flags to the new lnode.
  80                          */
  81                         (vtol(*vpp))->lo_looping |= (vtol(oldvp))->lo_looping;
  82                 }
  83                 if (IS_DEVVP(*vpp)) {
  84                         vnode_t *svp;
  85
  86                         svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
  87                         VN_RELE(*vpp);
  88                         if (svp == NULL)
  89                                 error = ENOSYS;
  90                         else
  91                                 *vpp = svp;
  92                 }
  93                 VN_RELE(oldvp);
  94         } else {
  95                 ASSERT(rvp->v_count > 1);
  96                 VN_RELE(rvp);
  97         }
  98
  99         return (error);
 100 }
 101
 102 static int
 103 lo_close(
 104         vnode_t *vp,
 105         int flag,
 106         int count,
 107         offset_t offset,
 108         struct cred *cr,
 109         caller_context_t *ct)
 110 {
 111 #ifdef LODEBUG
 112         lo_dprint(4, "lo_close vp %p realvp %p\n", vp, realvp(vp));
 113 #endif
 114         vp = realvp(vp);
 115         return (VOP_CLOSE(vp, flag, count, offset, cr, ct));
 116 }
 117
 118 static int
 119 lo_read(vnode_t *vp, struct uio *uiop, int ioflag, struct cred *cr,
 120         caller_context_t *ct)
 121 {
 122 #ifdef LODEBUG
 123         lo_dprint(4, "lo_read vp %p realvp %p\n", vp, realvp(vp));
 124 #endif
 125         vp = realvp(vp);
 126         return (VOP_READ(vp, uiop, ioflag, cr, ct));
 127 }
 128
 129 static int
 130 lo_write(vnode_t *vp, struct uio *uiop, int ioflag, struct cred *cr,
 131         caller_context_t *ct)
 132 {
 133 #ifdef LODEBUG
 134         lo_dprint(4, "lo_write vp %p realvp %p\n", vp, realvp(vp));
 135 #endif
 136         vp = realvp(vp);
 137         return (VOP_WRITE(vp, uiop, ioflag, cr, ct));
 138 }
 139
 140 static int
 141 lo_ioctl(
 142         vnode_t *vp,
 143         int cmd,
 144         intptr_t arg,
 145         int flag,
 146         struct cred *cr,
 147         int *rvalp,
 148         caller_context_t *ct)
 149 {
 150 #ifdef LODEBUG
 151         lo_dprint(4, "lo_ioctl vp %p realvp %p\n", vp, realvp(vp));
 152 #endif
 153         vp = realvp(vp);
 154         return (VOP_IOCTL(vp, cmd, arg, flag, cr, rvalp, ct));
 155 }
 156
 157 static int
 158 lo_setfl(vnode_t *vp, int oflags, int nflags, cred_t *cr, caller_context_t *ct)
 159 {
 160         vp = realvp(vp);
 161         return (VOP_SETFL(vp, oflags, nflags, cr, ct));
 162 }
 163
 164 static int
 165 lo_getattr(
 166         vnode_t *vp,
 167         struct vattr *vap,
 168         int flags,
 169         struct cred *cr,
 170         caller_context_t *ct)
 171 {
 172         int error;
 173
 174 #ifdef LODEBUG
 175         lo_dprint(4, "lo_getattr vp %p realvp %p\n", vp, realvp(vp));
 176 #endif
 177         if (error = VOP_GETATTR(realvp(vp), vap, flags, cr, ct))
 178                 return (error);
 179
 180         return (0);
 181 }
 182
 183 static int
 184 lo_setattr(
 185         vnode_t *vp,
 186         struct vattr *vap,
 187         int flags,
 188         struct cred *cr,
 189         caller_context_t *ct)
 190 {
 191 #ifdef LODEBUG
 192         lo_dprint(4, "lo_setattr vp %p realvp %p\n", vp, realvp(vp));
 193 #endif
 194         vp = realvp(vp);
 195         return (VOP_SETATTR(vp, vap, flags, cr, ct));
 196 }
 197
 198 static int
 199 lo_access(
 200         vnode_t *vp,
 201         int mode,
 202         int flags,
 203         struct cred *cr,
 204         caller_context_t *ct)
 205 {
 206 #ifdef LODEBUG
 207         lo_dprint(4, "lo_access vp %p realvp %p\n", vp, realvp(vp));
 208 #endif
 209         if (mode & VWRITE) {
 210                 if (vp->v_type == VREG && vn_is_readonly(vp))
 211                         return (EROFS);
 212         }
 213         vp = realvp(vp);
 214         return (VOP_ACCESS(vp, mode, flags, cr, ct));
 215 }
 216
 217 static int
 218 lo_fsync(vnode_t *vp, int syncflag, struct cred *cr, caller_context_t *ct)
 219 {
 220 #ifdef LODEBUG
 221         lo_dprint(4, "lo_fsync vp %p realvp %p\n", vp, realvp(vp));
 222 #endif
 223         vp = realvp(vp);
 224         return (VOP_FSYNC(vp, syncflag, cr, ct));
 225 }
 226
 227 /*ARGSUSED*/
 228 static void
 229 lo_inactive(vnode_t *vp, struct cred *cr, caller_context_t *ct)
 230 {
 231 #ifdef LODEBUG
 232         lo_dprint(4, "lo_inactive %p, realvp %p\n", vp, realvp(vp));
 233 #endif
 234         freelonode(vtol(vp));
 235 }
 236
 237 /* ARGSUSED */
 238 static int
 239 lo_fid(vnode_t *vp, struct fid *fidp, caller_context_t *ct)
 240 {
 241 #ifdef LODEBUG
 242         lo_dprint(4, "lo_fid %p, realvp %p\n", vp, realvp(vp));
 243 #endif
 244         vp = realvp(vp);
 245         return (VOP_FID(vp, fidp, ct));
 246 }
 247
 248 /*
 249  * Given a vnode of lofs type, lookup nm name and
 250  * return a shadow vnode (of lofs type) of the
 251  * real vnode found.
 252  *
 253  * Due to the nature of lofs, there is a potential
 254  * looping in path traversal.
 255  *
 256  * starting from the mount point of an lofs;
 257  * a loop is defined to be a traversal path
 258  * where the mount point or the real vnode of
 259  * the root of this lofs is encountered twice.
 260  * Once at the start of traversal and second
 261  * when the looping is found.
 262  *
 263  * When a loop is encountered, a shadow of the
 264  * covered vnode is returned to stop the looping.
 265  *
 266  * This normally works, but with the advent of
 267  * the new automounter, returning the shadow of the
 268  * covered vnode (autonode, in this case) does not
 269  * stop the loop.  Because further lookup on this
 270  * lonode will cause the autonode to call lo_lookup()
 271  * on the lonode covering it.
 272  *
 273  * example "/net/jurassic/net/jurassic" is a loop.
 274  * returning the shadow of the autonode corresponding to
 275  * "/net/jurassic/net/jurassic" will not terminate the
 276  * loop.   To solve this problem we allow the loop to go
 277  * through one more level component lookup.  Whichever
 278  * directory is then looked up in "/net/jurassic/net/jurassic"
 279  * the vnode returned is the vnode covered by the autonode
 280  * "net" and this will terminate the loop.
 281  *
 282  * Lookup for dot dot has to be dealt with separately.
 283  * It will be nice to have a "one size fits all" kind
 284  * of solution, so that we don't have so many ifs statement
 285  * in the lo_lookup() to handle dotdot.  But, since
 286  * there are so many special cases to handle different
 287  * kinds looping above, we need special codes to handle
 288  * dotdot lookup as well.
 289  */
 290 static int
 291 lo_lookup(
 292         vnode_t *dvp,
 293         char *nm,
 294         vnode_t **vpp,
 295         struct pathname *pnp,
 296         int flags,
 297         vnode_t *rdir,
 298         struct cred *cr,
 299         caller_context_t *ct,
 300         int *direntflags,
 301         pathname_t *realpnp)
 302 {
 303         vnode_t *vp = NULL, *tvp = NULL, *nonlovp;
 304         int error, is_indirectloop;
 305         vnode_t *realdvp = realvp(dvp);
 306         struct loinfo *li = vtoli(dvp->v_vfsp);
 307         int looping = 0;
 308         int autoloop = 0;
 309         int doingdotdot = 0;
 310         int nosub = 0;
 311         int mkflag = 0;
 312
 313         /*
 314          * If name is empty and no XATTR flags are set, then return
 315          * dvp (empty name == lookup ".").  If an XATTR flag is set
 316          * then we need to call VOP_LOOKUP to get the xattr dir.
 317          */
 318         if (nm[0] == '\0' && ! (flags & (CREATE_XATTR_DIR|LOOKUP_XATTR))) {
 319                 VN_HOLD(dvp);
 320                 *vpp = dvp;
 321                 return (0);
 322         }
 323
 324         if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') {
 325                 doingdotdot++;
 326                 /*
 327                  * Handle ".." out of mounted filesystem
 328                  */
 329                 while ((realdvp->v_flag & VROOT) && realdvp != rootdir) {
 330                         realdvp = realdvp->v_vfsp->vfs_vnodecovered;
 331                         ASSERT(realdvp != NULL);
 332                 }
 333         }
 334
 335         *vpp = NULL;    /* default(error) case */
 336
 337         /*
 338          * Do the normal lookup
 339          */
 340         if (error = VOP_LOOKUP(realdvp, nm, &vp, pnp, flags, rdir, cr,
 341             ct, direntflags, realpnp)) {
 342                 vp = NULL;
 343                 goto out;
 344         }
 345
 346         /*
 347          * We do this check here to avoid returning a stale file handle to the
 348          * caller.
 349          */
 350         if (nm[0] == '.' && nm[1] == '\0') {
 351                 ASSERT(vp == realdvp);
 352                 VN_HOLD(dvp);
 353                 VN_RELE(vp);
 354                 *vpp = dvp;
 355                 return (0);
 356         }
 357
 358         if (doingdotdot) {
 359                 if ((vtol(dvp))->lo_looping & LO_LOOPING) {
 360                         vfs_t *vfsp;
 361
 362                         error = vn_vfsrlock_wait(realdvp);
 363                         if (error)
 364                                 goto out;
 365                         vfsp = vn_mountedvfs(realdvp);
 366                         /*
 367                          * In the standard case if the looping flag is set and
 368                          * performing dotdot we would be returning from a
 369                          * covered vnode, implying vfsp could not be null. The
 370                          * exceptions being if we have looping and overlay
 371                          * mounts or looping and covered file systems.
 372                          */
 373                         if (vfsp == NULL) {
 374                                 /*
 375                                  * Overlay mount or covered file system,
 376                                  * so just make the shadow node.
 377                                  */
 378                                 vn_vfsunlock(realdvp);
 379                                 *vpp = makelonode(vp, li, 0);
 380                                 (vtol(*vpp))->lo_looping |= LO_LOOPING;
 381                                 return (0);
 382                         }
 383                         /*
 384                          * When looping get the actual found vnode
 385                          * instead of the vnode covered.
 386                          * Here we have to hold the lock for realdvp
 387                          * since an unmount during the traversal to the
 388                          * root vnode would turn *vfsp into garbage
 389                          * which would be fatal.
 390                          */
 391                         error = VFS_ROOT(vfsp, &tvp);
 392                         vn_vfsunlock(realdvp);
 393
 394                         if (error)
 395                                 goto out;
 396
 397                         if ((tvp == li->li_rootvp) && (vp == realvp(tvp))) {
 398                                 /*
 399                                  * we're back at the real vnode
 400                                  * of the rootvp
 401                                  *
 402                                  * return the rootvp
 403                                  * Ex: /mnt/mnt/..
 404                                  * where / has been lofs-mounted
 405                                  * onto /mnt.  Return the lofs
 406                                  * node mounted at /mnt.
 407                                  */
 408                                 *vpp = tvp;
 409                                 VN_RELE(vp);
 410                                 return (0);
 411                         } else {
 412                                 /*
 413                                  * We are returning from a covered
 414                                  * node whose vfs_mountedhere is
 415                                  * not pointing to vfs of the current
 416                                  * root vnode.
 417                                  * This is a condn where in we
 418                                  * returned a covered node say Zc
 419                                  * but Zc is not the cover of current
 420                                  * root.
 421                                  * i.e.., if X is the root vnode
 422                                  * lookup(Zc,"..") is taking us to
 423                                  * X.
 424                                  * Ex: /net/X/net/X/Y
 425                                  *
 426                                  * If LO_AUTOLOOP (autofs/lofs looping detected)
 427                                  * has been set then we are encountering the
 428                                  * cover of Y (Y being any directory vnode
 429                                  * under /net/X/net/X/).
 430                                  * When performing a dotdot set the
 431                                  * returned vp to the vnode covered
 432                                  * by the mounted lofs, ie /net/X/net/X
 433                                  */
 434                                 VN_RELE(tvp);
 435                                 if ((vtol(dvp))->lo_looping & LO_AUTOLOOP) {
 436                                         VN_RELE(vp);
 437                                         vp = li->li_rootvp;
 438                                         vp = vp->v_vfsp->vfs_vnodecovered;
 439                                         VN_HOLD(vp);
 440                                         *vpp = makelonode(vp, li, 0);
 441                                         (vtol(*vpp))->lo_looping |= LO_LOOPING;
 442                                         return (0);
 443                                 }
 444                         }
 445                 } else {
 446                         /*
 447                          * No frills just make the shadow node.
 448                          */
 449                         *vpp = makelonode(vp, li, 0);
 450                         return (0);
 451                 }
 452         }
 453
 454         nosub = (vtoli(dvp->v_vfsp)->li_flag & LO_NOSUB);
 455
 456         /*
 457          * If this vnode is mounted on, then we
 458          * traverse to the vnode which is the root of
 459          * the mounted file system.
 460          */
 461         if (!nosub && (error = traverse(&vp)))
 462                 goto out;
 463
 464         /*
 465          * Make a lnode for the real vnode.
 466          */
 467         if (vp->v_type != VDIR || nosub) {
 468                 *vpp = makelonode(vp, li, 0);
 469                 if (IS_DEVVP(*vpp)) {
 470                         vnode_t *svp;
 471
 472                         svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
 473                         VN_RELE(*vpp);
 474                         if (svp == NULL)
 475                                 error = ENOSYS;
 476                         else
 477                                 *vpp = svp;
 478                 }
 479                 return (error);
 480         }
 481
 482         /*
 483          * if the found vnode (vp) is not of type lofs
 484          * then we're just going to make a shadow of that
 485          * vp and get out.
 486          *
 487          * If the found vnode (vp) is of lofs type, and
 488          * we're not doing dotdot, check if we are
 489          * looping.
 490          */
 491         if (!doingdotdot && vfs_matchops(vp->v_vfsp, lo_vfsops)) {
 492                 /*
 493                  * Check if we're looping, i.e.
 494                  * vp equals the root vp of the lofs, directly
 495                  * or indirectly, return the covered node.
 496                  */
 497
 498                 if (!((vtol(dvp))->lo_looping & LO_LOOPING)) {
 499                         if (vp == li->li_rootvp) {
 500                                 /*
 501                                  * Direct looping condn.
 502                                  * Ex:- X is / mounted directory so lookup of
 503                                  * /X/X is a direct looping condn.
 504                                  */
 505                                 tvp = vp;
 506                                 vp = vp->v_vfsp->vfs_vnodecovered;
 507                                 VN_HOLD(vp);
 508                                 VN_RELE(tvp);
 509                                 looping++;
 510                         } else {
 511                                 /*
 512                                  * Indirect looping can be defined as
 513                                  * real lookup returning rootvp of the current
 514                                  * tree in any level of recursion.
 515                                  *
 516                                  * This check is useful if there are multiple
 517                                  * levels of lofs indirections. Suppose vnode X
 518                                  * in the current lookup has as its real vnode
 519                                  * another lofs node. Y = realvp(X) Y should be
 520                                  * a lofs node for the check to continue or Y
 521                                  * is not the rootvp of X.
 522                                  * Ex:- say X and Y are two vnodes
 523                                  * say real(Y) is X and real(X) is Z
 524                                  * parent vnode for X and Y is Z
 525                                  * lookup(Y,"path") say we are looking for Y
 526                                  * again under Y and we have to return Yc.
 527                                  * but the lookup of Y under Y doesnot return
 528                                  * Y the root vnode again here is why.
 529                                  * 1. lookup(Y,"path of Y") will go to
 530                                  * 2. lookup(real(Y),"path of Y") and then to
 531                                  * 3. lookup(real(X),"path of Y").
 532                                  * and now what lookup level 1 sees is the
 533                                  * outcome of 2 but the vnode Y is due to
 534                                  * lookup(Z,"path of Y") so we have to skip
 535                                  * intermediate levels to find if in any level
 536                                  * there is a looping.
 537                                  */
 538                                 is_indirectloop = 0;
 539                                 nonlovp = vp;
 540                                 while (
 541                                     vfs_matchops(nonlovp->v_vfsp, lo_vfsops) &&
 542                                     !(is_indirectloop)) {
 543                                         if (li->li_rootvp  == nonlovp) {
 544                                                 is_indirectloop++;
 545                                                 break;
 546                                         }
 547                                         nonlovp = realvp(nonlovp);
 548                                 }
 549
 550                                 if (is_indirectloop) {
 551                                         VN_RELE(vp);
 552                                         vp = nonlovp;
 553                                         vp = vp->v_vfsp->vfs_vnodecovered;
 554                                         VN_HOLD(vp);
 555                                         looping++;
 556                                 }
 557                         }
 558                 } else {
 559                         /*
 560                          * come here only because of the interaction between
 561                          * the autofs and lofs.
 562                          *
 563                          * Lookup of "/net/X/net/X" will return a shadow of
 564                          * an autonode X_a which we call X_l.
 565                          *
 566                          * Lookup of anything under X_l, will trigger a call to
 567                          * auto_lookup(X_a,nm) which will eventually call
 568                          * lo_lookup(X_lr,nm) where X_lr is the root vnode of
 569                          * the current lofs.
 570                          *
 571                          * We come here only when we are called with X_l as dvp
 572                          * and look for something underneath.
 573                          *
 574                          * Now that an autofs/lofs looping condition has been
 575                          * identified any directory vnode contained within
 576                          * dvp will be set to the vnode covered by the
 577                          * mounted autofs. Thus all directories within dvp
 578                          * will appear empty hence teminating the looping.
 579                          * The LO_AUTOLOOP flag is set on the returned lonode
 580                          * to indicate the termination of the autofs/lofs
 581                          * looping. This is required for the correct behaviour
 582                          * when performing a dotdot.
 583                          */
 584                         realdvp = realvp(dvp);
 585                         while (vfs_matchops(realdvp->v_vfsp, lo_vfsops)) {
 586                                 realdvp = realvp(realdvp);
 587                         }
 588
 589                         error = VFS_ROOT(realdvp->v_vfsp, &tvp);
 590                         if (error)
 591                                 goto out;
 592                         /*
 593                          * tvp now contains the rootvp of the vfs of the
 594                          * real vnode of dvp. The directory vnode vp is set
 595                          * to the covered vnode to terminate looping. No
 596                          * distinction is made between any vp as all directory
 597                          * vnodes contained in dvp are returned as the covered
 598                          * vnode.
 599                          */
 600                         VN_RELE(vp);
 601                         vp = tvp;       /* possibly is an autonode */
 602
 603                         /*
 604                          * Need to find the covered vnode
 605                          */
 606                         if (vp->v_vfsp->vfs_vnodecovered == NULL) {
 607                                 /*
 608                                  * We don't have a covered vnode so this isn't
 609                                  * an autonode. To find the autonode simply
 610                                  * find the vnode covered by the lofs rootvp.
 611                                  */
 612                                 vp = li->li_rootvp;
 613                                 vp = vp->v_vfsp->vfs_vnodecovered;
 614                                 VN_RELE(tvp);
 615                                 error = VFS_ROOT(vp->v_vfsp, &tvp);
 616                                 if (error)
 617                                         goto out;
 618                                 vp = tvp;       /* now this is an autonode */
 619                                 if (vp->v_vfsp->vfs_vnodecovered == NULL) {
 620                                         /*
 621                                          * Still can't find a covered vnode.
 622                                          * Fail the lookup, or we'd loop.
 623                                          */
 624                                         error = ENOENT;
 625                                         goto out;
 626                                 }
 627                         }
 628                         vp = vp->v_vfsp->vfs_vnodecovered;
 629                         VN_HOLD(vp);
 630                         VN_RELE(tvp);
 631                         /*
 632                          * Force the creation of a new lnode even if the hash
 633                          * table contains a lnode that references this vnode.
 634                          */
 635                         mkflag = LOF_FORCE;
 636                         autoloop++;
 637                 }
 638         }
 639         *vpp = makelonode(vp, li, mkflag);
 640
 641         if ((looping) ||
 642             (((vtol(dvp))->lo_looping & LO_LOOPING) && !doingdotdot)) {
 643                 (vtol(*vpp))->lo_looping |= LO_LOOPING;
 644         }
 645
 646         if (autoloop) {
 647                 (vtol(*vpp))->lo_looping |= LO_AUTOLOOP;
 648         }
 649
 650 out:
 651         if (error != 0 && vp != NULL)
 652                 VN_RELE(vp);
 653 #ifdef LODEBUG
 654         lo_dprint(4,
 655         "lo_lookup dvp %x realdvp %x nm '%s' newvp %x real vp %x error %d\n",
 656             dvp, realvp(dvp), nm, *vpp, vp, error);
 657 #endif
 658         return (error);
 659 }
 660
 661 /*ARGSUSED*/
 662 static int
 663 lo_create(
 664         vnode_t *dvp,
 665         char *nm,
 666         struct vattr *va,
 667         enum vcexcl exclusive,
 668         int mode,
 669         vnode_t **vpp,
 670         struct cred *cr,
 671         int flag,
 672         caller_context_t *ct,
 673         vsecattr_t *vsecp)
 674 {
 675         int error;
 676         vnode_t *vp = NULL;
 677
 678 #ifdef LODEBUG
 679         lo_dprint(4, "lo_create vp %p realvp %p\n", dvp, realvp(dvp));
 680 #endif
 681         if (*nm == '\0') {
 682                 ASSERT(vpp && dvp == *vpp);
 683                 vp = realvp(*vpp);
 684         }
 685
 686         error = VOP_CREATE(realvp(dvp), nm, va, exclusive, mode, &vp, cr, flag,
 687             ct, vsecp);
 688         if (!error) {
 689                 *vpp = makelonode(vp, vtoli(dvp->v_vfsp), 0);
 690                 if (IS_DEVVP(*vpp)) {
 691                         vnode_t *svp;
 692
 693                         svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
 694                         VN_RELE(*vpp);
 695                         if (svp == NULL)
 696                                 error = ENOSYS;
 697                         else
 698                                 *vpp = svp;
 699                 }
 700         } else if (error == ENOSYS && exclusive == NONEXCL &&
 701             dvp == vtoli(dvp->v_vfsp)->li_rootvp &&
 702             realvp(dvp)->v_type == VREG) {
 703                 /*
 704                  * We have a single regular file lofs mounted, thus the file is
 705                  * the root vnode (the directory vp is the file vp). Some
 706                  * underlying file systems (e.g. tmpfs or ufs) properly handle
 707                  * this style of create but at least zfs won't support create
 708                  * this way (see zfs_fvnodeops_template which has fs_nosys for
 709                  * the vop_create entry because zfs_create doesn't work
 710                  * properly for this case).
 711                  */
 712                 if ((error = VOP_ACCESS(dvp, mode, 0, cr, NULL)) == 0) {
 713                         /*
 714                          * Since we already know the vnode for the existing
 715                          * file we can handle create as a no-op, as expected,
 716                          * truncating the file if necessary.
 717                          */
 718                         struct vattr vattr;
 719
 720                         vattr.va_size = 0;
 721                         vattr.va_mask = AT_SIZE;
 722
 723                         if ((va->va_mask & AT_SIZE) != 0 && va->va_size == 0 &&
 724                             VOP_SETATTR(dvp, &vattr, 0, CRED(), NULL) != 0)
 725                                 return (error);
 726
 727                         /*
 728                          * vn_createat will do a vn_rele on the file if it is
 729                          * pre-existing, which it is in the case of a single
 730                          * file mounted as the root. Thus, when we eventually
 731                          * close the file the count will already be 1 so the
 732                          * vnode would be freed. To prevent that, we add an
 733                          * extra hold here.
 734                          */
 735                         VN_HOLD(dvp);
 736                         *vpp = dvp;
 737                         error = 0;
 738                 }
 739         }
 740
 741         return (error);
 742 }
 743
 744 static int
 745 lo_remove(
 746         vnode_t *dvp,
 747         char *nm,
 748         struct cred *cr,
 749         caller_context_t *ct,
 750         int flags)
 751 {
 752 #ifdef LODEBUG
 753         lo_dprint(4, "lo_remove vp %p realvp %p\n", dvp, realvp(dvp));
 754 #endif
 755         dvp = realvp(dvp);
 756         return (VOP_REMOVE(dvp, nm, cr, ct, flags));
 757 }
 758
 759 static int
 760 lo_link(
 761         vnode_t *tdvp,
 762         vnode_t *vp,
 763         char *tnm,
 764         struct cred *cr,
 765         caller_context_t *ct,
 766         int flags)
 767 {
 768         vnode_t *realvp;
 769
 770 #ifdef LODEBUG
 771         lo_dprint(4, "lo_link vp %p realvp %p\n", vp, realvp(vp));
 772 #endif
 773
 774         /*
 775          * The source and destination vnodes may be in different lofs
 776          * filesystems sharing the same underlying filesystem, so we need to
 777          * make sure that the filesystem containing the source vnode is not
 778          * mounted read-only (vn_link() has already checked the target vnode).
 779          *
 780          * In a situation such as:
 781          *
 782          * /data        - regular filesystem
 783          * /foo         - lofs mount of /data/foo
 784          * /bar         - read-only lofs mount of /data/bar
 785          *
 786          * This disallows a link from /bar/somefile to /foo/somefile,
 787          * which would otherwise allow changes to somefile on the read-only
 788          * mounted /bar.
 789          */
 790
 791         if (vn_is_readonly(vp)) {
 792                 return (EROFS);
 793         }
 794         while (vn_matchops(vp, lo_vnodeops)) {
 795                 vp = realvp(vp);
 796         }
 797
 798         /*
 799          * In the case where the source vnode is on another stacking
 800          * filesystem (such as specfs), the loop above will
 801          * terminate before finding the true underlying vnode.
 802          *
 803          * We use VOP_REALVP here to continue the search.
 804          */
 805         if (VOP_REALVP(vp, &realvp, ct) == 0)
 806                 vp = realvp;
 807
 808         while (vn_matchops(tdvp, lo_vnodeops)) {
 809                 tdvp = realvp(tdvp);
 810         }
 811         if (vp->v_vfsp != tdvp->v_vfsp)
 812                 return (EXDEV);
 813         return (VOP_LINK(tdvp, vp, tnm, cr, ct, flags));
 814 }
 815
 816 static int
 817 lo_rename(
 818         vnode_t *odvp,
 819         char *onm,
 820         vnode_t *ndvp,
 821         char *nnm,
 822         struct cred *cr,
 823         caller_context_t *ct,
 824         int flags)
 825 {
 826         vnode_t *tnvp;
 827
 828 #ifdef LODEBUG
 829         lo_dprint(4, "lo_rename vp %p realvp %p\n", odvp, realvp(odvp));
 830 #endif
 831         /*
 832          * If we are coming from a loop back mounted fs, that has been
 833          * mounted in the same filesystem as where we want to move to,
 834          * and that filesystem is read/write, but the lofs filesystem is
 835          * read only, we don't want to allow a rename of the file. The
 836          * vn_rename code checks to be sure the target is read/write already
 837          * so that is not necessary here. However, consider the following
 838          * example:
 839          *              / - regular root fs
 840          *              /foo - directory in root
 841          *              /foo/bar - file in foo directory(in root fs)
 842          *              /baz - directory in root
 843          *              mount -F lofs -o ro /foo /baz - all still in root
 844          *                      directory
 845          * The fact that we mounted /foo on /baz read only should stop us
 846          * from renaming the file /foo/bar /bar, but it doesn't since
 847          * / is read/write. We are still renaming here since we are still
 848          * in the same filesystem, it is just that we do not check to see
 849          * if the filesystem we are coming from in this case is read only.
 850          */
 851         if (odvp->v_vfsp->vfs_flag & VFS_RDONLY)
 852                 return (EROFS);
 853         /*
 854          * We need to make sure we're not trying to remove a mount point for a
 855          * filesystem mounted on top of lofs, which only we know about.
 856          */
 857         if (vn_matchops(ndvp, lo_vnodeops))     /* Not our problem. */
 858                 goto rename;
 859
 860         /*
 861          * XXXci - Once case-insensitive behavior is implemented, it should
 862          * be added here.
 863          */
 864         if (VOP_LOOKUP(ndvp, nnm, &tnvp, NULL, 0, NULL, cr,
 865             ct, NULL, NULL) != 0)
 866                 goto rename;
 867         if (tnvp->v_type != VDIR) {
 868                 VN_RELE(tnvp);
 869                 goto rename;
 870         }
 871         if (vn_mountedvfs(tnvp)) {
 872                 VN_RELE(tnvp);
 873                 return (EBUSY);
 874         }
 875         VN_RELE(tnvp);
 876 rename:
 877         /*
 878          * Since the case we're dealing with above can happen at any layer in
 879          * the stack of lofs filesystems, we need to recurse down the stack,
 880          * checking to see if there are any instances of a filesystem mounted on
 881          * top of lofs. In order to keep on using the lofs version of
 882          * VOP_RENAME(), we make sure that while the target directory is of type
 883          * lofs, the source directory (the one used for getting the fs-specific
 884          * version of VOP_RENAME()) is also of type lofs.
 885          */
 886         if (vn_matchops(ndvp, lo_vnodeops)) {
 887                 ndvp = realvp(ndvp);    /* Check the next layer */
 888         } else {
 889                 /*
 890                  * We can go fast here
 891                  */
 892                 while (vn_matchops(odvp, lo_vnodeops)) {
 893                         odvp = realvp(odvp);
 894                 }
 895                 if (odvp->v_vfsp != ndvp->v_vfsp)
 896                         return (EXDEV);
 897         }
 898         return (VOP_RENAME(odvp, onm, ndvp, nnm, cr, ct, flags));
 899 }
 900
 901 static int
 902 lo_mkdir(
 903         vnode_t *dvp,
 904         char *nm,
 905         struct vattr *va,
 906         vnode_t **vpp,
 907         struct cred *cr,
 908         caller_context_t *ct,
 909         int flags,
 910         vsecattr_t *vsecp)
 911 {
 912         int error;
 913
 914 #ifdef LODEBUG
 915         lo_dprint(4, "lo_mkdir vp %p realvp %p\n", dvp, realvp(dvp));
 916 #endif
 917         error = VOP_MKDIR(realvp(dvp), nm, va, vpp, cr, ct, flags, vsecp);
 918         if (!error)
 919                 *vpp = makelonode(*vpp, vtoli(dvp->v_vfsp), 0);
 920         return (error);
 921 }
 922
 923 static int
 924 lo_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct)
 925 {
 926 #ifdef LODEBUG
 927         lo_dprint(4, "lo_realvp %p\n", vp);
 928 #endif
 929         while (vn_matchops(vp, lo_vnodeops))
 930                 vp = realvp(vp);
 931
 932         if (VOP_REALVP(vp, vpp, ct) != 0)
 933                 *vpp = vp;
 934         return (0);
 935 }
 936
 937 static int
 938 lo_rmdir(
 939         vnode_t *dvp,
 940         char *nm,
 941         vnode_t *cdir,
 942         struct cred *cr,
 943         caller_context_t *ct,
 944         int flags)
 945 {
 946         vnode_t *rvp = cdir;
 947
 948 #ifdef LODEBUG
 949         lo_dprint(4, "lo_rmdir vp %p realvp %p\n", dvp, realvp(dvp));
 950 #endif
 951         /* if cdir is lofs vnode ptr get its real vnode ptr */
 952         if (vn_matchops(dvp, vn_getops(rvp)))
 953                 (void) lo_realvp(cdir, &rvp, ct);
 954         dvp = realvp(dvp);
 955         return (VOP_RMDIR(dvp, nm, rvp, cr, ct, flags));
 956 }
 957
 958 static int
 959 lo_symlink(
 960         vnode_t *dvp,
 961         char *lnm,
 962         struct vattr *tva,
 963         char *tnm,
 964         struct cred *cr,
 965         caller_context_t *ct,
 966         int flags)
 967 {
 968 #ifdef LODEBUG
 969         lo_dprint(4, "lo_symlink vp %p realvp %p\n", dvp, realvp(dvp));
 970 #endif
 971         dvp = realvp(dvp);
 972         return (VOP_SYMLINK(dvp, lnm, tva, tnm, cr, ct, flags));
 973 }
 974
 975 static int
 976 lo_readlink(
 977         vnode_t *vp,
 978         struct uio *uiop,
 979         struct cred *cr,
 980         caller_context_t *ct)
 981 {
 982         vp = realvp(vp);
 983         return (VOP_READLINK(vp, uiop, cr, ct));
 984 }
 985
 986 static int
 987 lo_readdir(
 988         vnode_t *vp,
 989         struct uio *uiop,
 990         struct cred *cr,
 991         int *eofp,
 992         caller_context_t *ct,
 993         int flags)
 994 {
 995 #ifdef LODEBUG
 996         lo_dprint(4, "lo_readdir vp %p realvp %p\n", vp, realvp(vp));
 997 #endif
 998         vp = realvp(vp);
 999         return (VOP_READDIR(vp, uiop, cr, eofp, ct, flags));
1000 }
1001
1002 static int
1003 lo_rwlock(vnode_t *vp, int write_lock, caller_context_t *ct)
1004 {
1005         vp = realvp(vp);
1006         return (VOP_RWLOCK(vp, write_lock, ct));
1007 }
1008
1009 static void
1010 lo_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ct)
1011 {
1012         vp = realvp(vp);
1013         VOP_RWUNLOCK(vp, write_lock, ct);
1014 }
1015
1016 static int
1017 lo_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
1018 {
1019         vp = realvp(vp);
1020         return (VOP_SEEK(vp, ooff, noffp, ct));
1021 }
1022
1023 static int
1024 lo_cmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct)
1025 {
1026         while (vn_matchops(vp1, lo_vnodeops))
1027                 vp1 = realvp(vp1);
1028         while (vn_matchops(vp2, lo_vnodeops))
1029                 vp2 = realvp(vp2);
1030         return (VOP_CMP(vp1, vp2, ct));
1031 }
1032
1033 static int
1034 lo_frlock(
1035         vnode_t *vp,
1036         int cmd,
1037         struct flock64 *bfp,
1038         int flag,
1039         offset_t offset,
1040         struct flk_callback *flk_cbp,
1041         cred_t *cr,
1042         caller_context_t *ct)
1043 {
1044         vp = realvp(vp);
1045         return (VOP_FRLOCK(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
1046 }
1047
1048 static int
1049 lo_space(
1050         vnode_t *vp,
1051         int cmd,
1052         struct flock64 *bfp,
1053         int flag,
1054         offset_t offset,
1055         struct cred *cr,
1056         caller_context_t *ct)
1057 {
1058         vp = realvp(vp);
1059         return (VOP_SPACE(vp, cmd, bfp, flag, offset, cr, ct));
1060 }
1061
1062 static int
1063 lo_getpage(
1064         vnode_t *vp,
1065         offset_t off,
1066         size_t len,
1067         uint_t *prot,
1068         struct page *parr[],
1069         size_t psz,
1070         struct seg *seg,
1071         caddr_t addr,
1072         enum seg_rw rw,
1073         struct cred *cr,
1074         caller_context_t *ct)
1075 {
1076         vp = realvp(vp);
1077         return (VOP_GETPAGE(vp, off, len, prot, parr, psz, seg, addr, rw, cr,
1078             ct));
1079 }
1080
1081 static int
1082 lo_putpage(
1083         vnode_t *vp,
1084         offset_t off,
1085         size_t len,
1086         int flags,
1087         struct cred *cr,
1088         caller_context_t *ct)
1089 {
1090         vp = realvp(vp);
1091         return (VOP_PUTPAGE(vp, off, len, flags, cr, ct));
1092 }
1093
1094 static int
1095 lo_map(
1096         vnode_t *vp,
1097         offset_t off,
1098         struct as *as,
1099         caddr_t *addrp,
1100         size_t len,
1101         uchar_t prot,
1102         uchar_t maxprot,
1103         uint_t flags,
1104         struct cred *cr,
1105         caller_context_t *ct)
1106 {
1107         vp = realvp(vp);
1108         return (VOP_MAP(vp, off, as, addrp, len, prot, maxprot, flags, cr, ct));
1109 }
1110
1111 static int
1112 lo_addmap(
1113         vnode_t *vp,
1114         offset_t off,
1115         struct as *as,
1116         caddr_t addr,
1117         size_t len,
1118         uchar_t prot,
1119         uchar_t maxprot,
1120         uint_t flags,
1121         struct cred *cr,
1122         caller_context_t *ct)
1123 {
1124         vp = realvp(vp);
1125         return (VOP_ADDMAP(vp, off, as, addr, len, prot, maxprot, flags, cr,
1126             ct));
1127 }
1128
1129 static int
1130 lo_delmap(
1131         vnode_t *vp,
1132         offset_t off,
1133         struct as *as,
1134         caddr_t addr,
1135         size_t len,
1136         uint_t prot,
1137         uint_t maxprot,
1138         uint_t flags,
1139         struct cred *cr,
1140         caller_context_t *ct)
1141 {
1142         vp = realvp(vp);
1143         return (VOP_DELMAP(vp, off, as, addr, len, prot, maxprot, flags, cr,
1144             ct));
1145 }
1146
1147 static int
1148 lo_poll(
1149         vnode_t *vp,
1150         short events,
1151         int anyyet,
1152         short *reventsp,
1153         struct pollhead **phpp,
1154         caller_context_t *ct)
1155 {
1156         vp = realvp(vp);
1157         return (VOP_POLL(vp, events, anyyet, reventsp, phpp, ct));
1158 }
1159
1160 static int
1161 lo_dump(vnode_t *vp, caddr_t addr, offset_t bn, offset_t count,
1162     caller_context_t *ct)
1163 {
1164         vp = realvp(vp);
1165         return (VOP_DUMP(vp, addr, bn, count, ct));
1166 }
1167
1168 static int
1169 lo_pathconf(
1170         vnode_t *vp,
1171         int cmd,
1172         ulong_t *valp,
1173         struct cred *cr,
1174         caller_context_t *ct)
1175 {
1176         vp = realvp(vp);
1177         return (VOP_PATHCONF(vp, cmd, valp, cr, ct));
1178 }
1179
1180 static int
1181 lo_pageio(
1182         vnode_t *vp,
1183         struct page *pp,
1184         u_offset_t io_off,
1185         size_t io_len,
1186         int flags,
1187         cred_t *cr,
1188         caller_context_t *ct)
1189 {
1190         vp = realvp(vp);
1191         return (VOP_PAGEIO(vp, pp, io_off, io_len, flags, cr, ct));
1192 }
1193
1194 static void
1195 lo_dispose(
1196         vnode_t *vp,
1197         page_t *pp,
1198         int fl,
1199         int dn,
1200         cred_t *cr,
1201         caller_context_t *ct)
1202 {
1203         vp = realvp(vp);
1204         if (vp != NULL && !VN_ISKAS(vp))
1205                 VOP_DISPOSE(vp, pp, fl, dn, cr, ct);
1206 }
1207
1208 static int
1209 lo_setsecattr(
1210         vnode_t *vp,
1211         vsecattr_t *secattr,
1212         int flags,
1213         struct cred *cr,
1214         caller_context_t *ct)
1215 {
1216         if (vn_is_readonly(vp))
1217                 return (EROFS);
1218         vp = realvp(vp);
1219         return (VOP_SETSECATTR(vp, secattr, flags, cr, ct));
1220 }
1221
1222 static int
1223 lo_getsecattr(
1224         vnode_t *vp,
1225         vsecattr_t *secattr,
1226         int flags,
1227         struct cred *cr,
1228         caller_context_t *ct)
1229 {
1230         vp = realvp(vp);
1231         return (VOP_GETSECATTR(vp, secattr, flags, cr, ct));
1232 }
1233
1234 static int
1235 lo_shrlock(
1236         vnode_t *vp,
1237         int cmd,
1238         struct shrlock *shr,
1239         int flag,
1240         cred_t *cr,
1241         caller_context_t *ct)
1242 {
1243         vp = realvp(vp);
1244         return (VOP_SHRLOCK(vp, cmd, shr, flag, cr, ct));
1245 }
1246
1247 /*
1248  * Loopback vnode operations vector.
1249  */
1250
1251 struct vnodeops *lo_vnodeops;
1252
1253 const fs_operation_def_t lo_vnodeops_template[] = {
1254         VOPNAME_OPEN,           { .vop_open = lo_open },
1255         VOPNAME_CLOSE,          { .vop_close = lo_close },
1256         VOPNAME_READ,           { .vop_read = lo_read },
1257         VOPNAME_WRITE,          { .vop_write = lo_write },
1258         VOPNAME_IOCTL,          { .vop_ioctl = lo_ioctl },
1259         VOPNAME_SETFL,          { .vop_setfl = lo_setfl },
1260         VOPNAME_GETATTR,        { .vop_getattr = lo_getattr },
1261         VOPNAME_SETATTR,        { .vop_setattr = lo_setattr },
1262         VOPNAME_ACCESS,         { .vop_access = lo_access },
1263         VOPNAME_LOOKUP,         { .vop_lookup = lo_lookup },
1264         VOPNAME_CREATE,         { .vop_create = lo_create },
1265         VOPNAME_REMOVE,         { .vop_remove = lo_remove },
1266         VOPNAME_LINK,           { .vop_link = lo_link },
1267         VOPNAME_RENAME,         { .vop_rename = lo_rename },
1268         VOPNAME_MKDIR,          { .vop_mkdir = lo_mkdir },
1269         VOPNAME_RMDIR,          { .vop_rmdir = lo_rmdir },
1270         VOPNAME_READDIR,        { .vop_readdir = lo_readdir },
1271         VOPNAME_SYMLINK,        { .vop_symlink = lo_symlink },
1272         VOPNAME_READLINK,       { .vop_readlink = lo_readlink },
1273         VOPNAME_FSYNC,          { .vop_fsync = lo_fsync },
1274         VOPNAME_INACTIVE,       { .vop_inactive = lo_inactive },
1275         VOPNAME_FID,            { .vop_fid = lo_fid },
1276         VOPNAME_RWLOCK,         { .vop_rwlock = lo_rwlock },
1277         VOPNAME_RWUNLOCK,       { .vop_rwunlock = lo_rwunlock },
1278         VOPNAME_SEEK,           { .vop_seek = lo_seek },
1279         VOPNAME_CMP,            { .vop_cmp = lo_cmp },
1280         VOPNAME_FRLOCK,         { .vop_frlock = lo_frlock },
1281         VOPNAME_SPACE,          { .vop_space = lo_space },
1282         VOPNAME_REALVP,         { .vop_realvp = lo_realvp },
1283         VOPNAME_GETPAGE,        { .vop_getpage = lo_getpage },
1284         VOPNAME_PUTPAGE,        { .vop_putpage = lo_putpage },
1285         VOPNAME_MAP,            { .vop_map = lo_map },
1286         VOPNAME_ADDMAP,         { .vop_addmap = lo_addmap },
1287         VOPNAME_DELMAP,         { .vop_delmap = lo_delmap },
1288         VOPNAME_POLL,           { .vop_poll = lo_poll },
1289         VOPNAME_DUMP,           { .vop_dump = lo_dump },
1290         VOPNAME_DUMPCTL,        { .error = fs_error },  /* XXX - why? */
1291         VOPNAME_PATHCONF,       { .vop_pathconf = lo_pathconf },
1292         VOPNAME_PAGEIO,         { .vop_pageio = lo_pageio },
1293         VOPNAME_DISPOSE,        { .vop_dispose = lo_dispose },
1294         VOPNAME_SETSECATTR,     { .vop_setsecattr = lo_setsecattr },
1295         VOPNAME_GETSECATTR,     { .vop_getsecattr = lo_getsecattr },
1296         VOPNAME_SHRLOCK,        { .vop_shrlock = lo_shrlock },
1297         NULL,                   NULL
1298 };