/*
 * Copyright 2000, International Business Machines Corporation and others.
 *
 * This software has been released under the terms of the IBM Public
 * License.  For details, see the LICENSE file in the top-level source
 * directory or online at http://www.openafs.org/dl/license10.html
 */

/* This is a placeholder for routines unique to the port of AFS to hp-ux */

#include <afsconfig.h>
#include "afs/param.h"

#include "afs/sysincludes.h"	/* Standard vendor system headers */
#include "afsincludes.h"	/* Afs-based standard headers */
#include "afs/afs_stats.h"	/* statistics stuff */

#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/pathname.h>
extern struct vfsops Afs_vfsops;
extern int afs_hp_strategy();
extern int afs_bmap(), afs_badop(), afs_noop(), afs_lockf();
extern int afs_pagein();
extern int afs_pageout();
extern int afs_ioctl();
extern int afs_prealloc();
extern int afs_mapdbd();
extern int afs_mmap();
extern int afs_cachelimit();
extern int afs_vm_checkpage();
extern int afs_vm_fscontiguous();
extern int afs_vm_stopio();
extern int afs_read_ahead();
extern int afs_unmap();
extern int afs_release();
extern int afs_swapfs_len();
extern int afs_readdir2();
extern int afs_readdir();
extern int afs_readdir3();
extern int afs_pathconf();
extern int afs_close();

#define vtoblksz(vp) ((vp)->v_vfsp->vfs_bsize)
#if defined(AFS_HPUX110_ENV)
/* We no longer need to lock on the VM Empire,
 * or at least that is what is claimed,
 * so we turn the vmemp_ routines into no-ops.
 * This needs to be looked at more closely.
 */
#define vmemp_returnx(a) return(a)
#define vmemp_unlockx()
#if !defined(AFS_HPUX110_ENV)
/*
 * Copy an mbuf to the contiguous area pointed to by cp.
 * Skip <off> bytes and copy <len> bytes.
 * Returns the number of bytes not transferred.
 * The mbuf is NOT changed.
 */
m_cpytoc(m, off, len, cp)
    if (m == NULL || off < 0 || len < 0 || cp == NULL)
	osi_Panic("m_cpytoc");

    if (m->m_len <= off) {
	ml = MIN(len, m->m_len - off);
	memcpy(cp, mtod(m, caddr_t) + off, (u_int) ml);
	memcpy(cp, mtod(m, caddr_t), (u_int) ml);
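/*
 * Hedged sketch of the copy loop m_cpytoc() implements (it assumes the
 * usual BSD mbuf fields m_next/m_len and the mtod() accessor; this is an
 * illustration, not code taken verbatim from this file):
 */
#if 0
int remaining = len;
while (m != NULL && remaining > 0) {
    if (off >= m->m_len) {
	off -= m->m_len;	/* skip this whole mbuf */
    } else {
	int chunk = MIN(remaining, m->m_len - off);
	memcpy(cp, mtod(m, caddr_t) + off, (u_int) chunk);
	cp += chunk;
	remaining -= chunk;
	off = 0;
    }
    m = m->m_next;
}
return remaining;		/* bytes not transferred */
#endif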
/*
 * Note that the standard Sun vnode interface doesn't have a vop_lockf(), so this code is
 * totally new.  This came about because HP-UX has lockf() implemented as
 * a system call while Sun has it implemented as a library routine (apparently).
 * To handle this, we have to translate the lockf() request into an
 * fcntl()-looking request, and then translate the results back if necessary.
 * We call afs_lockctl() directly.
 */
afs_lockf(vp, flag, len, cred, fp, LB, UB)
    /* for now, just pretend it works */
    struct k_flock flock;

    /*
     * Create a flock structure and translate the lockf request
     * into an appropriate looking fcntl() type request for afs_lockctl().
     */
    flock.l_start = fp->f_offset;
    /* convert negative lengths to positive */
    if (flock.l_len < 0) {
	flock.l_start += flock.l_len;
	flock.l_len = -(flock.l_len);

    /*
     * Adjust values to look like fcntl() requests.
     * All locks are write locks, only F_LOCK requests
     * are blocking.  F_TEST has to be translated into
     * a get lock and then back again.
     */
	flock.l_type = F_WRLCK;

	flock.l_type = F_UNLCK;

    u.u_error = mp_afs_lockctl(vp, &flock, cmd, fp->f_cred);
	return (u.u_error);	/* some other error code */

    /*
     * If the request is F_TEST, and GETLK changed
     * the lock type to F_UNLCK, then return 0; else
     * set errno to EACCES and return.
     */
    if (flag == F_TEST && flock.l_type != F_UNLCK) {
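/*
 * Hedged sketch of the lockf -> fcntl-style translation described above.
 * The F_LOCK/F_TLOCK/F_ULOCK/F_TEST request names and the cmd selection
 * are assumptions about the callers; only flock.l_* and afs_lockctl()
 * appear in this file.
 */
#if 0
struct k_flock flock;

flock.l_whence = 0;
flock.l_start = fp->f_offset;	/* lock region begins at the file offset */
flock.l_len = len;
if (flock.l_len < 0) {		/* a negative length locks backwards */
    flock.l_start += flock.l_len;
    flock.l_len = -(flock.l_len);
}
flock.l_type = (flag == F_ULOCK) ? F_UNLCK : F_WRLCK;
cmd = (flag == F_LOCK) ? F_SETLKW	/* only F_LOCK blocks */
    : (flag == F_TEST) ? F_GETLK	/* probe, then translate back */
    : F_SETLK;
#endif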
#if defined(AFS_HPUX1122_ENV)
#include "machine/vm/vmparam.h"
#include "../machine/vmparam.h"	/* For KERNELSPACE */
#if !defined(AFS_HPUX1123_ENV)
/* 11.23 uses 64 bits in many cases */
#define kern_daddr_t daddr_t
#include "ufs/inode.h"
#if defined(AFS_HPUX1123_ENV)
#endif /* AFS_HPUX1123_ENV */
#include "h/region.h"
#include "h/pregion.h"
#include "h/vmmeter.h"
#include "h/sysinfo.h"
#if !defined(AFS_HPUX1123_ENV)
#include "h/tuneable.h"
#include "netinet/in.h"
/* a freelist of one */
struct buf *afs_bread_freebp = 0;

/*
 * Only rfs_read calls this, and it only looks at bp->b_un.b_addr.
 * Thus we can use fake bufs (i.e. not from the real buffer pool).
 */
afs_bread(vp, lbn, bpp)
    int offset, fsbsize, error;

    memset(&uio, 0, sizeof(uio));
    memset(&iov, 0, sizeof(iov));

    AFS_STATCNT(afs_bread);
    fsbsize = vp->v_vfsp->vfs_bsize;
    offset = lbn * fsbsize;
    if (afs_bread_freebp) {
	bp = afs_bread_freebp;
	afs_bread_freebp = 0;
	bp = (struct buf *)AFS_KALLOC(sizeof(*bp));
	bp->b_un.b_addr = (caddr_t) AFS_KALLOC(fsbsize);

    iov.iov_base = bp->b_un.b_addr;
    iov.iov_len = fsbsize;
    uio.afsio_iov = &iov;
    uio.afsio_iovcnt = 1;
    uio.afsio_seg = AFS_UIOSYS;
    uio.afsio_offset = offset;
    uio.afsio_resid = fsbsize;
    error = afs_read(VTOAFS(vp), &uio, p_cred(u.u_procp), 0);
	afs_bread_freebp = bp;

	afs_bread_freebp = bp;
    *(struct buf **)&bp->b_vp = bp;	/* mark as fake */

    AFS_STATCNT(afs_brelse);
    if ((struct buf *)bp->b_vp != bp) {	/* not fake */
	ufs_brelse(bp->b_vp, bp);
    } else if (afs_bread_freebp) {
	AFS_KFREE(bp->b_un.b_addr, vp->v_vfsp->vfs_bsize);
	AFS_KFREE(bp, sizeof(*bp));
	afs_bread_freebp = bp;
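/*
 * Illustrative sketch (an assumption about intent, not taken verbatim from
 * this file): the "freelist of one" caches a single fake buf between
 * afs_bread()/afs_brelse() calls so the common rfs_read path avoids a
 * fresh kernel allocation per read.  A fake buf is recognized because its
 * b_vp field points back at the buf itself.
 */
#if 0
struct buf *bp;
if (afs_bread_freebp) {		/* reuse the cached fake buf */
    bp = afs_bread_freebp;
    afs_bread_freebp = 0;
} else {			/* otherwise allocate a fresh one */
    bp = (struct buf *)AFS_KALLOC(sizeof(*bp));
    bp->b_un.b_addr = (caddr_t) AFS_KALLOC(fsbsize);
}
*(struct buf **)&bp->b_vp = bp;	/* mark as fake: b_vp points at itself */
#endif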
afs_bmap(avc, abn, anvp, anbn)
     kern_daddr_t abn, *anbn;
     struct vcache **anvp;
    AFS_STATCNT(afs_bmap);
    *anbn = abn * (8192 / DEV_BSIZE);	/* in 512 byte units */
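/*
 * Worked example for the conversion above (numbers assumed): with an
 * 8192-byte AFS block and DEV_BSIZE == 512, logical block 3 maps to
 * device block 3 * (8192 / 512) == 48, i.e. each AFS block spans 16
 * 512-byte device units.
 */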
afs_inactive(avc, acred)
    struct vnode *vp = AFSTOV(avc);

    if (afs_shuttingdown != AFS_RUNNING)

    /*
     * In Solaris and HPUX s800 and HP-UX 10.0 they actually call us with
     * v_count 1 on the last reference!
     */
    MP_H_SPINLOCK_USAV(vn_h_sl_pool, vp, &sv_lock, &context);
    if (avc->vrefCount < 1)
	osi_Panic("afs_inactive : v_count < 1\n");

    /*
     * If more than 1, don't unmap the vnode but do decrement the ref count.
     */
    if (vp->v_count > 0) {
	MP_SPINUNLOCK_USAV(sv_lock, context);

    MP_SPINUNLOCK_USAV(sv_lock, context);
    afs_InactiveVCache(avc, acred);
mp_afs_open(struct vnode **avcp, int aflags, afs_ucred_t *acred)
    code = afs_open(avcp, aflags, acred);

mp_afs_close(struct vnode *avcp, int aflags, afs_ucred_t *acred)
    code = afs_close(avcp, aflags, acred);

mp_afs_rdwr(struct vnode *avcp, struct uio *uio, enum uio_rw arw,
	    int aio, afs_ucred_t *acred)
    save_resid = uio->uio_resid;
    code = afs_rdwr(avcp, uio, arw, aio, acred);
    if (arw == UIO_WRITE && code == ENOSPC) {
	/* HP clears code if any data written. */
	uio->uio_resid = save_resid;

mp_afs_getattr(struct vnode *avcp, struct vattr *attrs,
	       afs_ucred_t *acred, enum vsync unused1)
    code = afs_getattr(avcp, attrs, acred);

mp_afs_setattr(struct vnode *avcp, struct vattr *attrs,
	       afs_ucred_t *acred, int unused1)
    code = afs_setattr(avcp, attrs, acred);

mp_afs_access(struct vnode *avcp, int mode, afs_ucred_t *acred)
    code = afs_access(avcp, mode, acred);

mp_afs_lookup(struct vnode *adp, char *aname,
	      struct vnode **avcp, afs_ucred_t *acred,
	      struct vnode *unused1)
    code = afs_lookup(adp, aname, avcp, acred);

mp_afs_create(struct vnode *adp, char *aname, struct vattr *attrs,
	      enum vcexcl aexcl, int amode, struct vnode **avcp,
    code = afs_create(adp, aname, attrs, aexcl, amode, avcp, acred);

mp_afs_remove(struct vnode *adp, char *aname,
    code = afs_remove(adp, aname, acred);

mp_afs_link(struct vnode *avc, struct vnode *adp,
	    char *aname, afs_ucred_t *acred)
    code = afs_link(avc, adp, aname, acred);

mp_afs_rename(struct vnode *aodp, char *aname1,
	      struct vnode *andp, char *aname2,
    code = afs_rename(aodp, aname1, andp, aname2, acred);

mp_afs_mkdir(struct vnode *adp, char *aname, struct vattr *attrs,
	     struct vnode **avcp, afs_ucred_t *acred)
    code = afs_mkdir(adp, aname, attrs, avcp, acred);

mp_afs_rmdir(struct vnode *adp, char *aname, afs_ucred_t *acred)
    code = afs_rmdir(adp, aname, acred);

mp_afs_readdir(struct vnode *avc, struct uio *auio,
    code = afs_readdir(avc, auio, acred);

mp_afs_symlink(struct vnode *adp, char *aname, struct vattr *attrs,
	       char *atargetName, afs_ucred_t *acred)
    code = afs_symlink(adp, aname, attrs, atargetName, NULL, acred);

mp_afs_readlink(struct vnode *avc, struct uio *auio,
    code = afs_readlink(avc, auio, acred);

mp_afs_fsync(struct vnode *avc, afs_ucred_t *acred, int unused1)
    code = afs_fsync(avc, acred);

mp_afs_bread(struct vnode *avc, kern_daddr_t lbn, struct buf **bpp,
	     struct vattr *unused1, struct ucred *unused2)
    code = afs_bread(avc, lbn, bpp);

mp_afs_brelse(struct vnode *avc, struct buf *bp)
    code = afs_brelse(avc, bp);

mp_afs_inactive(struct vnode *avc, afs_ucred_t *acred)
    code = afs_inactive(avc, acred);

mp_afs_lockctl(struct vnode *avc, struct flock *af, int cmd,
	       afs_ucred_t *acred, struct file *unused1, off_t unused2,
    code = afs_lockctl(avc, af, cmd, acred);

mp_afs_fid(struct vnode *avc, struct fid **fidpp)
    code = afs_fid(avc, fidpp);

mp_afs_readdir2(struct vnode *avc, struct uio *auio,
    code = afs_readdir2(avc, auio, acred);
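/*
 * Hedged sketch of the wrapper pattern used above.  The lock calls are an
 * assumption based on the usual OpenAFS glue (they are not shown in the
 * fragments here): each mp_afs_* entry point takes the AFS global lock,
 * calls the corresponding afs_* routine, and releases the lock again.
 */
#if 0
mp_afs_open(struct vnode **avcp, int aflags, afs_ucred_t *acred)
{
    int code;

    AFS_GLOCK();		/* serialize with the rest of the cache manager */
    code = afs_open(avcp, aflags, acred);
    AFS_GUNLOCK();
    return code;
}
#endif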
struct vnodeops Afs_vnodeops = {
#if !defined(AFS_NONFSTRANS)
    /* On HPUX 10.2 the NFS translator calls afs_bread but does
     * not call afs_brelse, so we see a memory leak.  If the
     * VOP_BREAD() call fails, then NFS does VOP_RDWR() to get
     * the same data: this is the path we follow now. */
    afs_badop,			/* pathsend */
    afs_noop,			/* setacl */
    afs_noop,			/* getacl */
    afs_lockf,			/* lockf */

struct vnodeops *afs_ops = &Afs_vnodeops;

/* vnode file operations, and our own */
extern int vno_ioctl();
extern int vno_select();
extern int afs_closex();
extern int vno_close();
struct fileops afs_fileops = {

#define vtoblksz(vp) ((vp)->v_vfsp->vfs_bsize)
/********************************************************************
 **** afspgin_setup_io_ranges()
 **** similar to: nfspgin_setup_io_ranges()
 ********************************************************************/
afspgin_setup_io_ranges(vfspage_t * vm_info, pgcnt_t bpages, k_off_t isize,
    pgcnt_t file_offset = VM_FILE_OFFSET(vm_info);
    pgcnt_t minpage;		/* first page to bring in */
    pgcnt_t maxpage;		/* one past last page to bring in */
    pgcnt_t multio_maxpage;
    kern_daddr_t start_blk;
    expnd_flags_t up_reason, down_reason;

    VM_GET_IO_INFO(vm_info, maxpagein, max_num_io);

    /*
     * We do not go past the end of the current pregion nor past the end
     * of the current file.
     */
    maxpage = startindex + (bpages - (startindex + file_offset) % bpages);
    maxpage = vm_reset_maxpage(vm_info, maxpage);
    maxpage = MIN(maxpage, (pgcnt_t) btorp(isize) - file_offset);
    maxpage = MIN(maxpage, startindex + maxpagein);
    multio_maxpage = maxpage = vm_maxpage(vm_info, maxpage);

    VASSERT(maxpage >= startindex);

    /*
     * Expanding the fault will create calls to FINDENTRY() for new
     * pages, which will obsolete "dbd", so copy what it points to
     * and clear it to prevent using stale data.
     */
    prp = VM_PRP(vm_info);
    dbdtype = DBD_TYPE(vm_info);
    start_blk = DBD_DATA(vm_info);
    VASSERT(dbdtype != DBD_NONE);

    if (max_num_io == 1) {
	/*
	 * We need to set up one I/O: first we attempt to expand the
	 * I/O forward, then we expand the I/O backwards.
	 */
	expand_faultin_up(vm_info, dbdtype, (int)bpages, maxpage, count,
			  startindex, start_blk, &up_reason);
	maxpage = startindex + count;
	VASSERT(maxpage <= startindex + maxpagein);
	minpage = startindex - (startindex + file_offset) % bpages;
	minpage = MAX(minpage, maxpage - maxpagein);
	VASSERT(startindex >= VM_BASE_OFFSET(vm_info));
	minpage = vm_minpage(vm_info, minpage);
	VASSERT(minpage <= startindex);
	expand_faultin_down(vm_info, dbdtype, (int)bpages, minpage, count,
			    &startindex, &start_blk, &down_reason);
	VM_SET_IO_STARTINDX(vm_info, 0, startindex);
	VM_SET_IO_STARTBLK(vm_info, 0, start_blk);
	VM_SET_IO_COUNT(vm_info, 0, count);
	VM_SET_NUM_IO(vm_info, 1);

    if (max_num_io > 1) {
	/*
	 * We need to set up multiple I/O information; beginning
	 * with the startindex, we will expand upwards.  The expansion
	 * could stop for one of 2 reasons; we take the appropriate
	 * action in each of these cases:
	 *	o VM reasons: abort setting up the multiple I/O
	 *	  information and return to our caller indicating
	 *	  that "retry" is required.
	 *	o pagelimit: set up the next I/O info [we may have
	 *	  reached multio_maxpage at this point].
	 * Note that expansion involves no more than a block at a time;
	 * hence it could never stop due to a "discontiguous block".
	 */
	startindex = minpage = vm_minpage(vm_info, 0);
	for (indx = 0; (indx < max_num_io) && (startindex < multio_maxpage);
	     indx++, startindex += count) {
	    dbd = FINDDBD(prp->p_reg, startindex);
	    start_blk = dbd->dbd_data;
		startindex + (bpages - (startindex + file_offset) % bpages);
	    maxpage = min(maxpage, multio_maxpage);
	    expand_faultin_up(vm_info, dbdtype, bpages, maxpage,
			      startindex, start_blk, &up_reason);
	    VM_SET_IO_STARTINDX(vm_info, indx, startindex);
	    VM_SET_IO_STARTBLK(vm_info, indx, start_blk);
	    VM_SET_IO_COUNT(vm_info, indx, count);
	    if (up_reason & VM_REASONS)
	    VASSERT(!(up_reason & NONCONTIGUOUS_BLOCK));
	    VASSERT(up_reason & PAGELIMIT);

	if (startindex < multio_maxpage) {
	    VM_MULT_IO_FAILURE(vm_info);
	    VM_REINIT_FAULT_DBDVFD(vm_info);
	    return (0);		/* retry */
	VM_SET_NUM_IO(vm_info, indx);

    /*
     * Tell VM where the I/O intends to start.  This may be different
     * from the faulting point.
     */
    VM_SET_STARTINDX(vm_info, VM_GET_IO_STARTINDX(vm_info, 0));
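/*
 * Illustrative numbers for the clamping above (assumed, not from this
 * file): with bpages == 2 (8K blocks, 4K pages), file_offset == 1 and
 * startindex == 5, the first assignment rounds the I/O up to the next
 * block boundary, 5 + (2 - (5 + 1) % 2) == 7; the MIN() calls then clamp
 * that against the end of the file (btorp(isize) - file_offset) and
 * against the per-fault page budget (startindex + maxpagein).
 */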
/********************************************************************
 **** afspgin_blkflsh()
 **** similar to: nfspgin_blkflsh()
 ********************************************************************/
afspgin_blkflsh(vfspage_t * vm_info, struct vnode * devvp, pgcnt_t * num_4k)
    pgcnt_t count = *num_4k;
    int num_io = VM_GET_NUM_IO(vm_info);

    /*
     * On this blkflush() we don't want to purge the buffer cache and we do
     * want to wait, so the flags are '0'.
     */
    for (indx = 0; indx < num_io; indx++) {
	    blkflush(devvp, (kern_daddr_t) VM_GET_IO_STARTBLK(vm_info, indx),
		     ptob(VM_GET_IO_COUNT(vm_info, indx)), 0,

    if (vm_page_now_valid(vm_info, &page_count)) {
	vm_release_memory(vm_info);
	vm_release_structs(vm_info);
	*num_4k = page_count;
	return (VM_PAGE_PRESENT);
/********************************************************************
 **** afspgin_io()
 **** similar to: nfspgin_io()
 ********************************************************************/
afspgin_io(vfspage_t * vm_info, struct vnode *devvp, pgcnt_t bpages,
	   pgcnt_t maxpagein, pgcnt_t count)
    caddr_t vaddr = VM_ADDR(vm_info);
    caddr_t virt_addr = VM_MAPPED_ADDR(vm_info);
    pagein_info_t *io = VM_PAGEIN_INFO(vm_info);
    preg_t *prp = VM_PRP(vm_info);
    int wrt = VM_WRT(vm_info);
    space_t space = VM_SPACE(vm_info);
    int num_io = VM_GET_NUM_IO(vm_info);

#ifdef notdef			/* Not used in AFS */
    /*
     * With the VM_READ_AHEAD_ALLOWED() macro, check if read-ahead should
     * be used in this case.
     *
     * Unlike UFS, NFS does not start the faulting page I/O
     * asynchronously.  Why?  Asynchronous requests are handled by the
     * biod's.  It doesn't make sense to queue up the faulting request
     * behind other asynchronous requests.  This is not true for UFS
     * where the asynchronous request is immediately handled.
     */
    if ((VM_READ_AHEAD_ALLOWED(vm_info)) && (nfs_read_ahead_on)
	&& (NFS_DO_READ_AHEAD) && (should_do_read_ahead(prp, vaddr))) {

	pgcnt_t max_rhead_io;
	pgcnt_t total_rheads_allowed;

	/*
	 * Determine the maximum amount of read-ahead I/O.
	 */
	total_rheads_allowed = maxpagein - count;

	/*
	 * If the count is less than a block, raise it to one.
	 */
	if (total_rheads_allowed < bpages)
	    total_rheads_allowed = bpages;

	max_rhead_io = total_rheads_allowed;
	rhead_vaddr = VM_MAPPED_ADDR(vm_info) + (count * NBPG);
	nfs_read_ahead(vm_info->vp, prp, wrt, space, rhead_vaddr,

	/*
	 * Set the next fault location.  If read_ahead launches any
	 * I/O it will adjust it accordingly.
	 */
	vm_info->prp->p_nextfault = vm_info->startindex + count;

    /*
     * Now perform the faulting I/O synchronously.
     */
	syncpageio((swblk_t) VM_GET_IO_STARTBLK(vm_info, 0),
		   VM_MAPPED_SPACE(vm_info), VM_MAPPED_ADDR(vm_info),
		   (int)ptob(count), B_READ, devvp,
		   B_vfs_pagein | B_pagebf, VM_REGION(vm_info));

	virt_addr = VM_MAPPED_ADDR(vm_info);
	for (i = 0; i < num_io; i++) {
	    /*
	     * REVISIT -- investigate doing asyncpageio().
	     */
	    error |= (io[i].error =
		      syncpageio((swblk_t) VM_GET_IO_STARTBLK(vm_info, i),
				 VM_MAPPED_SPACE(vm_info), virt_addr,
				 (int)ptob(VM_GET_IO_COUNT(vm_info, i)),
				 B_READ, devvp, B_vfs_pagein | B_pagebf,
				 VM_REGION(vm_info)));
	    virt_addr += ptob(VM_GET_IO_COUNT(vm_info, i));

    /*
     * Set the next fault location.  If read_ahead launches any
     * I/O it will adjust it accordingly.
     */
    vm_info->prp->p_nextfault = vm_info->startindex + count;
/********************************************************************
 **** afspgin_update_dbd()
 **** similar to: nfspgin_update_dbd()
 ********************************************************************/
afspgin_update_dbd(vfspage_t * vm_info, int bsize)
    pgcnt_t count = bsize / NBPG;
    int num_io = VM_GET_NUM_IO(vm_info);

    for (i = 0; i < num_io; i++) {
	pgindx = VM_GET_IO_STARTINDX(vm_info, i);
	off = vnodindx(VM_REGION(vm_info), pgindx);
	blkno = VM_GET_IO_STARTBLK(vm_info, i);

	VASSERT(bsize % NBPG == 0);
	VASSERT(rem % NBPG == 0);

	pgindx -= (pgcnt_t) btop(rem);
	blkno -= (kern_daddr_t) btodb(rem);

	/*
	 * This region could start in mid-block.  If so, pgindx
	 * could be less than 0, so we adjust pgindx and blkno back
	 * up so that pgindx is 0.
	 */
	    blkno += btodb(ptob(prem));

	for (m = 0; m < count && pgindx < VM_REGION_SIZE(vm_info);
	     m++, pgindx++, blkno += btodb(NBPG)) {
	    /*
	     * Note: since this only changes one block, it
	     * assumes only one block was faulted in.  Currently
	     * this is always true for remote files, and we only
	     * get here for remote files, so everything is ok.
	     */
	    vm_mark_dbd(vm_info, pgindx, blkno);
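/*
 * Worked example for the mid-block adjustment above (values assumed):
 * with 4K pages, an 8K block and DEV_BSIZE == 1024, a page that sits on
 * the second page of its block has rem == 4096, so pgindx is pulled back
 * by btop(4096) == 1 page and blkno by btodb(4096) == 4 device blocks;
 * if that would push pgindx below 0, both are advanced again by "prem"
 * so the walk starts at pgindx 0 with the matching block number.
 */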
afs_pagein(vp, prp, wrt, space, vaddr, ret_startindex)
    pgcnt_t *ret_startindex;

    pgcnt_t pgindx = *ret_startindex;
    struct vnode *devvp;
    kern_daddr_t start_blk = 0;
    int shared;			/* writable memory mapped file */
    retval_t retval = 0;
    pgcnt_t ok_dbd_limit = 0;	/* last dbd that we can trust */
    pgcnt_t bpages;		/* number of pages per block */
    vfspage_t *vm_info = NULL;
    int change_to_fstore = 0;	/* need to change dbds to DBD_FSTORE */
    int flush_start_blk = 0;
    int flush_end_blk = 0;

    AFS_STATCNT(afs_pagein);
    vmemp_lockx();		/* lock down VM empire */

    /* Initialize the VM info structure */
    vm_pagein_init(&vm_info, prp, pgindx, space, vaddr, wrt, 0,

    /* Check to see if we slept and the page was faulted in. */
	vm_release_structs(vm_info);

    vp = VM_GET_PAGEIN_VNODE(vm_info);
    VASSERT(vp != NULL);
    shared = VM_SHARED_OBJECT(vm_info);
    VASSERT(DBD_TYPE(vm_info) != DBD_NONE);

    /*
     * Get the devvp and block size for this vnode type.
     */
    bsize = vp->v_vfsp->vfs_bsize;
    if (bsize <= 0 || (bsize & (DEV_BSIZE - 1)))
	osi_Panic("afs_pagein: bsize is zero or not a multiple of DEV_BSIZE");

    bpages = (pgcnt_t) btop(bsize);
    VASSERT(bpages > 0);
    VM_SET_FS_MAX_PAGES(vm_info, bpages);

    /* This trace cannot be here because the afs_global lock might not be
     * held at this point.  We hold the vm global lock throughout
     * this procedure (and not the AFS global lock).
     * afs_Trace4(afs_iclSetp, CM_TRACE_HPPAGEIN, ICL_TYPE_POINTER, (afs_int32) vp,
     *            ICL_TYPE_LONG, DBD_TYPE(vm_info), ICL_TYPE_LONG, bpages,
     *            ICL_TYPE_LONG, shared);
     */

    /* Come here if we have to release the region lock before
     * locking pages.  This can happen in memreserve() and
     */

    /*
     * For remote files like ours, we want to check to see if the file has shrunk.
     * If so, we should invalidate any pages past the end.  In the name
     * of efficiency, we only do this if the page we want to fault is
     * past the end of the file.
     */
	if (VOP_GETATTR(vp, &va, kt_cred(u.u_kthreadp), VIFSYNC) != 0) {
	    VM_ZOMBIE_OBJECT(vm_info);
	    vm_release_memory(vm_info);
	    vm_release_structs(vm_info);

	if (vnodindx(VM_REGION(vm_info), pgindx) >= isize) {
	    /*
	     * The file has shrunk and someone is trying to access a
	     * page past the end of the object.  Shrink the object back
	     * to its current size, send a SIGBUS to the faulting
	     * process and return.
	     *
	     * We must release the region lock before calling mtrunc(),
	     * since mtrunc() locks all the regions that are using this
	     */
	    vm_release_memory(vm_info);
	    vm_truncate_region(vm_info, isize);
	    vm_release_structs(vm_info);
	    vmemp_returnx(-SIGBUS);

    maxpagein = vm_pick_maxpagein(vm_info);
    if (vm_wait_for_memory(vm_info, maxpagein, 1)) {
	/* Check to see if we should continue faulting. */
	if (vm_page_now_valid(vm_info, &page_count)) {
	    vm_release_memory(vm_info);
	    vm_release_structs(vm_info);
	    vmemp_returnx(page_count);

    if (count = vm_no_io_required(vm_info)) {
	/* Release any excess memory. */
	vm_release_memory(vm_info);
	vm_release_structs(vm_info);
	vmemp_returnx(count);

    /*
     * We should never have DBD_HOLE pages in a non-MMF region.
     */
	VASSERT(dbd->dbd_type != DBD_HOLE);

    VASSERT(DBD_TYPE(vm_info) != DBD_NONE);

    startindex = *ret_startindex;

    /*
     * If the page we want is in memory already, take it.
     */
    if (VM_MEMORY_RESERVED(vm_info) < maxpagein) {
	/* pick up the rest of memory now. */
	if (vm_wait_for_memory(vm_info, maxpagein, 0)) {
	    if (vm_page_now_valid(vm_info, &page_count)) {
		vm_release_memory(vm_info);
		vm_release_structs(vm_info);
		vmemp_returnx(page_count);

	  afspgin_setup_io_ranges(vm_info, bpages, isize, startindex))) {

    startindex = VM_GET_STARTINDX(vm_info);

    VASSERT(maxpagein >= count);

    /*
     * Release the memory we won't need.
     */
    if (count < maxpagein) {
	vm_release_excess_memory(vm_info,
				 (VM_MEMORY_RESERVED(vm_info) - count));

    retval = afspgin_blkflsh(vm_info, devvp, &count);

    if (retval == VM_RETRY) {

    if (retval == VM_PAGE_PRESENT)

    /*
     * The definition of krusage_cntr_t is in h/kmetric.h, which
     * is not shipped.  Since it's just statistics, we punt and do
     * not update it.  If it's a problem we'll need to get HP to export
     * an interface that we can use to increment the counter.
     */

    /* It's a real fault, not a reclaim */
	krusage_cntr_t *temp;
	temp = kt_cntrp(u.u_kthreadp);

    /*
     * Tell VM where the I/O intends to start.  This may be different
     * from the faulting point.
     */

    /*
     * vm_prepare_io will fill the region with pages and release the
     */
    vm_prepare_io(vm_info, &count);

    /*
     * Count may have been adjusted, check to make sure it's non-zero.
     */
	if (vm_retry(vm_info)) {

	/*
	 * Release resources and retry the fault.  Release any excess
	 */
	vm_release_memory(vm_info);
	vm_release_structs(vm_info);

    error = afspgin_io(vm_info, devvp, bpages, maxpagein, count);

    if ((VM_IS_ZOMBIE(vm_info)) || (error)) {
	VM_ZOMBIE_OBJECT(vm_info);

    /*
     * For a writable memory mapped file that is remote we must
     * detect potential holes in the file and force allocation of
     * disk space on the remote system.  Unfortunately, there is
     * no easy way to do this, so this gets a little ugly.
     */
    if (shared && wrt) {
	/*
	 * See if the user wants to write to this page.  Write some
	 * minimal amount of data back to the remote file to
	 * force allocation of file space.  We only need to
	 * write a small amount, since holes are always at
	 * least one filesystem block in size.
	 */
	error = vm_alloc_hole(vm_info);

	/*
	 * If some sort of I/O error occurred we generate a
	 * SIGBUS for the process that caused the write,
	 * undo our page locks, etc and return.
	 */
	if ((VM_IS_ZOMBIE(vm_info)) || (error)) {
	    VM_ZOMBIE_OBJECT(vm_info);

	/*
	 * Change these dbds to DBD_FSTORE.  We cannot do it here,
	 * since the region must be locked, and it is not locked
	 * at the moment.  We cannot lock the region yet, as we
	 * first have to release the page locks.
	 */
	change_to_fstore = 1;

    vm_finish_io(vm_info, count);

    /*
     * Acquire the lock before we play around with changing the vfd's.
     */
    if (change_to_fstore)
	afspgin_update_dbd(vm_info, bsize);

#if defined(AFS_HPUX110_ENV)
    getppdp()->cnt.v_exfod += count;
    mpproc_info[getprocindex()].cnt.v_exfod += count;

    vmemp_unlockx();		/* free up VM empire */
    *ret_startindex = startindex;

    /*
     * In case we have any excess memory...
     */
    if (VM_MEMORY_RESERVED(vm_info))
	vm_release_memory(vm_info);
    vm_release_structs(vm_info);

    vm_finish_io_failed(vm_info, count);
    vm_undo_validation(vm_info, count);

    /*
     * In case we have any excess memory...
     */
    if (VM_MEMORY_RESERVED(vm_info))
	vm_release_memory(vm_info);
    vm_release_structs(vm_info);

    vmemp_unlockx();		/* free up VM empire */
afs_pageout(vp, prp, start, end, flags)
    struct vnode *vp;		/* not used */

    struct vnode *filevp;
    struct vnode *devvp;

    int *piocnt;		/* wakeup counter used if PAGEOUT_WAIT */
    struct ucred *old_cred;

    int inode_changed = 0;

    AFS_STATCNT(afs_pageout);

    steal = (flags & PAGEOUT_FREE);
    vhand = (flags & PAGEOUT_VHAND);
    hard = (flags & PAGEOUT_HARD);

    /* Initialize the VM info structure. */
    vm_pageout_init(&vm_info, prp, start, end, 0, 0, 0, flags);

    /*
     * If the region is marked "don't swap", then don't steal any pages
     * from it.  We can, however, write dirty pages out to disk (only if
     * PAGEOUT_FREE is not set).
     */
    if (vm_no_pageout(&vm_info)) {

    /*
     * If the caller wants to wait until the I/O is complete.
     */
    vm_setup_wait_for_io(&vm_info);

    filevp = VM_GET_PAGEOUT_VNODE(&vm_info);	/* always page out to back store */
    VASSERT(filevp != NULL);

    memset((caddr_t) & args, 0, sizeof(fsdata_t));
    args.remote_down = 0;	/* assume remote file servers are up */
    args.remote = 1;		/* we are remote */
    args.bsize = 0;		/* filled in later by afs_vm_checkpage() */

    if (filevp->v_fstype == VUFS) {
	devvp = ip->i_devvp;

	/*
	 * If we are vhand(), and this is an NFS file, we need to
	 * see if the NFS server is "down".  If so, we decide
	 * if we will try to talk to it again, or defer pageouts
	 * of dirty NFS pages until a future time.
	 */
	if (vhand && filevp->v_fstype == VNFS && vtomi(filevp)->mi_down
	    && vtomi(filevp)->mi_hard) {
	    extern afs_int32 vhand_nfs_retry;
	    /*
	     * If there is still time left on our timer, we will
	     * not talk to this server right now.
	     */
	    if (vhand_nfs_retry > 0)
		args.remote_down = 1;

    /*
     * Initialize args.  We set bsize to 0 to tell vfs_vfdcheck() that
     * it must get the file size and other attributes if it comes across
     */
    vm_info.fs_data = (caddr_t) & args;

    /* This trace cannot be here because the afs_global lock might not be
     * held at this point.  We hold the vm global lock throughout
     * this procedure (and not the AFS global lock).
     * afs_Trace4(afs_iclSetp, CM_TRACE_HPPAGEOUT, ICL_TYPE_POINTER, (afs_int32) filevp,
     *            ICL_TYPE_LONG, start, ICL_TYPE_LONG, end, ICL_TYPE_LONG, flags);
     */

	extern int pageiodone();

	/*
	 * Ask the VM system to find the next run of pages.
	 */
	vm_find_next_range(&vm_info, i, end);

	/*
	 * It's possible that the remote file shrunk in size.  Check the flags
	 * to see if the request was beyond the end of the file.  If it was,
	 * truncate the region to the file size and continue.  We could be on a
	 * run, so after truncation continue; there may be some I/O to write.
	 */
	if (VM_FS_FLAGS(&vm_info) & PAGEOUT_TRUNCATE) {
	    pgcnt_t pglen = (pgcnt_t) btorp(args.isize);

	    /*
	     * This page is past the end of the file.  Unlock this page
	     * (region_trunc will throw it away) and then call
	     * region_trunc() to invalidate all pages past the new end of
	     */
	    region_trunc(VM_REGION(&vm_info), pglen, pglen + 1);

	    /*
	     * Remove the truncation flag.
	     */
	    VM_UNSETFS_FLAGS(&vm_info, PAGEOUT_TRUNCATE);

	if (VM_NO_PAGEOUT_RUN(&vm_info))

	/*
	 * We have a run of dirty pages [args.start...args.end].
	 */
	VASSERT(filevp->v_fstype != VCDFS);
	VASSERT((filevp->v_vfsp->vfs_flag & VFS_RDONLY) == 0);
	VASSERT(VM_GET_NUM_IO(&vm_info) == 1);

	/*
	 * We will be doing an I/O on the region; let the VM system know.
	 */
	(void)vm_up_physio_count(&vm_info);

	/*
	 * Okay, get set to perform the I/O.
	 */
	    (VM_END_PAGEOUT_INDX(&vm_info) + 1) -
	    VM_START_PAGEOUT_INDX(&vm_info);

	/*
	 * Allocate and initialize an I/O buffer.
	 */
	vm_init_bp(&vm_info, bp);	/* Let the VM system initialize */

	/* Identify this buffer for KI */
	bp->b_bptype = B_vfs_pageout | B_pagebf;

	    bp->b_flags = B_CALL | B_BUSY | B_PAGEOUT;	/* steal pages */
	    bp->b_flags = B_CALL | B_BUSY;	/* keep pages */

	/*
	 * If we are vhand paging over NFS, we will wait for the I/O
	 */
	if (vhand && filevp->v_fstype == VNFS) {
	    bp->b_flags &= ~B_CALL;
	    bp->b_iodone = (int (*)())pageiodone;

	/*
	 * Make sure we do not write past the end of the file.
	 */
	nbytes = ptob(npages);
	start = vnodindx(VM_REGION(&vm_info), vm_info.start);
	if (start + nbytes > args.isize) {
	    /*
	     * The amount we are off had better not be bigger than a
	     */
	    if (start + nbytes - args.isize >= args.bsize) {
		osi_Panic("afs_pageout: remainder too large");

	    /*
	     * Reset the size of the I/O as necessary.  For remote
	     * files, we set the size to the exact number of bytes to
	     * the end of the file.  For local files, we round this up
	     * to the nearest DEV_BSIZE chunk since disk I/O must always
	     * be in multiples of DEV_BSIZE.  In this case, we do not
	     * bother to zero out the data past the "real" end of the
	     * file; this is done when the data is read (either through
	     * mmap() or by normal file system access).
	     */
		nbytes = args.isize - start;
		nbytes = roundup(args.isize - start, DEV_BSIZE);
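	/*
	 * Worked example of the size reset above (numbers assumed): if the
	 * file ends 100 bytes into the final page, a remote file writes
	 * exactly args.isize - start == 100 bytes, while a local file
	 * would round that up with roundup(100, DEV_BSIZE) == 1024 (for
	 * DEV_BSIZE == 1024), since disk I/O must be a multiple of
	 * DEV_BSIZE.
	 */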
	/*
	 * Now get ready to perform the I/O.
	 */
	if (!vm_protect_pageout(&vm_info, npages)) {
	    vm_undo_invalidation(&vm_info, vm_info.start, vm_info.end);
	    vm_finish_io_failed(&vm_info, npages);

	/*
	 * If this is an NFS write by vhand(), we will not be calling
	 * pageiodone().  asyncpageio() increments parolemem for us
	 * if bp->b_iodone is pageiodone, so we must do it manually
	 * if pageiodone() will not be called automatically.
	 */
	if (!(bp->b_flags & B_CALL) && steal) {
	    SPINLOCK_USAV(pfdat_lock, context);
	    parolemem += btorp(nbytes);
	    SPINUNLOCK_USAV(pfdat_lock, context);

	blkflush(devvp, VM_START_PAGEOUT_BLK(&vm_info), (long)nbytes,
		 (BX_NOBUFWAIT | BX_PURGE), VM_REGION(&vm_info));

	/*
	 * If vhand is the one paging things out, and this is an NFS
	 * file, we need to temporarily become a different user so
	 * that we are not trying to page over NFS as root.  We use
	 * the user credentials associated with the writable file
	 * pointer that is in the pseudo-vas for this MMF.
	 *
	 * NOTE: we are currently using "va_rss" to store the ucred
	 * value in the vas (this should be fixed in 10.0).
	 */
	old_cred = kt_cred(u.u_kthreadp);
#if defined(AFS_HPUX1123_ENV)
	    /*
	     * DEE - 11.23 does not have vas.h, and it looks like
	     * we should never be called with an NFS type file anyway,
	     * so where did this come from?  Was it copied from NFS?
	     * I assume it was, so we will add an assert for now
	     * and see if the code runs at all.
	     */
	    VASSERT(filevp->v_fstype != VNFS);

	    set_kt_cred(u.u_kthreadp, filevp->v_vas->va_cred);

	    /*
	     * If root was the one who opened the mmf for write,
	     * va_cred will be NULL.  So reset kt_cred(u.u_kthreadp) to what it
	     * was.  We will page out as root, but that is the
	     * correct thing to do in this case anyway.
	     */
	    if (kt_cred(u.u_kthreadp) == NULL)
		set_kt_cred(u.u_kthreadp, old_cred);

	/*
	 * Really do the I/O.
	 */
	    asyncpageio(bp, VM_START_PAGEOUT_BLK(&vm_info),
			VM_MAPPED_SPACE(&vm_info), VM_MAPPED_ADDR(&vm_info),
			(int)nbytes, B_WRITE, devvp);

	VASSERT(error == 0);

	/*
	 * If we are vhand paging over NFS we want to wait for the
	 * I/O to complete and take the appropriate actions if an
	 * error is encountered.
	 */
	    if (waitforpageio(bp) && nfs_mi_harddown(filevp)) {
		/*
		 * The server is down; ignore this failure, and
		 * try again later.  (rfscall() has set our retry
		 */
		fsdata.remote_down = 1;
		pageiocleanup(bp, 0);

		/*
		 * vm_vfdcheck() has cleared the valid bit on the
		 * vfds for these pages.  We must go back and set the
		 * valid bit, as the pages are really not gone.
		 *
		 * NOTE: we can do this because we still hold (and have
		 * not released) the region lock.
		 */
		vm_undo_invalidation(&vm_info, vm_info.start,

		/*
		 * The I/O succeeded, or we had an error that we do
		 * not want to defer until later.  Call pageiodone()
		 */

	/*
	 * And restore our credentials to what they were.
	 */
	set_kt_cred(u.u_kthreadp, old_cred);

	/*
	 * If we reserved memory in vfs_vfdcheck() (only for NFS), we
	 * can now unreserve it.
	 */
	if (vm_info.vm_flags & PAGEOUT_RESERVED) {
	    vm_info.vm_flags &= ~PAGEOUT_RESERVED;
	    vm_release_malloc_memory();

	if (flags & PF_DEACT) {
#if defined(AFS_HPUX110_ENV)
	    getppdp()->cnt.v_pswpout += npages;
	    mpproc_info[getprocindex()].cnt.v_pswpout += npages;
	    /* sar_bswapout += ptod(npages); */

#if defined(AFS_HPUX110_ENV)
	    getppdp()->cnt.v_pgout++;
	    getppdp()->cnt.v_pgpgout += npages;
	    mpproc_info[getprocindex()].cnt.v_pgout++;
	    mpproc_info[getprocindex()].cnt.v_pgpgout += npages;

	/*
	 * If time and patience have delivered enough
	 * pages, then quit now while we are ahead.
	 */
	if (VM_STOP_PAGING(&vm_info))

	i = VM_END_PAGEOUT_INDX(&vm_info) - VM_BASE_OFFSET(&vm_info) + 1;

    vm_finish_pageout(&vm_info);	/* update vhand's stealscan */

    /*
     * If we wanted to wait for the I/O to complete, sleep on piocnt.
     * We must decrement it by one first, and then make sure that it
     * is non-zero before going to sleep.
     */
    vm_wait_for_io(&vm_info);

    if (inode_changed && !file_is_remote) {
	imark(ip, IUPD | ICHG);
afs_mapdbd(filevp, offset, bn, flags, hole, startidx, endidx)
    struct vnode *filevp;
    kern_daddr_t *bn;		/* Block number. */
    int flags;			/* B_READ or B_WRITE */
    int *hole;			/* To be used for read-ahead. */
    pgcnt_t *startidx;		/* To be used for read-ahead. */
    pgcnt_t *endidx;		/* To be used for read-ahead. */

    kern_daddr_t lbn, local_bn;

    long bsize = vtoblksz(filevp) & ~(DEV_BSIZE - 1);

	*startidx = (pgcnt_t) (offset / NBPG);
	*endidx = (pgcnt_t) (offset / NBPG);
	*hole = 0;		/* Can't have holes. */
	osi_Panic("afs_mapdbd: zero size");

    lbn = (kern_daddr_t) (offset / bsize);
    on = offset % bsize;

    err = VOP_BMAP(filevp, lbn, NULL, &local_bn, flags);

    /*
     * We can never get a bn less than zero on remote files.
     */
    VASSERT(local_bn >= 0);

    local_bn = local_bn + btodb(on);

/*
 * 1: The blocks are contiguous.
 * 0: The blocks are not contiguous.
 */
afs_vm_fscontiguous(vp, args, cur_data)
    if (cur_data == (VM_END_PAGEOUT_BLK(args) + btodb(NBPG))) {

/*
 * 1: Stop, this page is the last in the block.
 *
 * Terminate requests at filesystem block boundaries.
 */
afs_vm_stopio(vp, args)
    fsdata_t *fsdata = (fsdata_t *) args->fs_data;

#if defined(AFS_HPUX1123_ENV)
    tmpdb = VM_END_PAGEOUT_BLK(args);

    if ((dbtob(tmpdb) + NBPG) % (fsdata->bsize) == 0)
    if ((dbtob(VM_END_PAGEOUT_BLK(args)) + NBPG) % (fsdata->bsize) == 0)
#endif /* AFS_HPUX1123_ENV */
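/*
 * Illustrative arithmetic for the boundary test above (values assumed):
 * with DEV_BSIZE == 1024 (so dbtob(n) == n * 1024), NBPG == 4096 and an
 * 8192-byte filesystem block, a run whose last device block is 7 ends at
 * byte 7*1024 + 4096 == 11264, not a multiple of 8192, so the run keeps
 * going; if the last device block were 4, 4*1024 + 4096 == 8192 and the
 * run stops exactly on the block boundary.
 */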
/*
 * afs_vm_checkpage is called by the VM while collecting a run of
 * pages on a pageout.  afs_vm_checkpage() is called for each page
 * VM wants to write to disk.
 */
afs_vm_checkpage(vp, args, pgindx, cur_data)
    fsdata_t *fsdata = (fsdata_t *) args->fs_data;

    if (fsdata->remote_down) {	/* never happens for AFS */
	/*
	 * The remote system is down.
	 */
	VASSERT(args->run == 0);

    /*
     * A dirty page.  If we have not yet determined the file size and
     * other attributes that we need to write out pages (the block
     * size and ok_dbd_limit), get that information now.
     */
    if (fsdata->bsize == 0) {
	struct vnode *filevp;
	/*
	 * Get the various attributes about the file.  Store them
	 * in args for the next time around.
	 */
	bsize = vtoblksz(filevp);
	args->maxpgs = (pgcnt_t) btop(bsize);

	if (VOP_GETATTR(filevp, &va, kt_cred(u.u_kthreadp), VIFSYNC) != 0) {
	    /*
	     * The VOP_GETATTR() failed.  If we are vhand, and this is
	     * a hard mount, we will skip dirty pages for a while and
	     * try again later.
	     */
	    if (args->vm_flags & PAGEOUT_VHAND) {
		VASSERT(args->run == 0);

	    /*
	     * This is a "soft" mount, or some other error was
	     * returned from the server.  Mark this region
	     * as a zombie, and free this dirty page.
	     */
	    VM_ZOMBIE_OBJECT(args);

	    /*
	     * The caller will see r_zomb and remove the page.
	     */

	fsdata->isize = isize;
	fsdata->bsize = bsize;

    /*
     * See if the file has shrunk (this could have happened
     * asynchronously because of NFS or DUX).  If so, invalidate
     * all of the pages past the end of the file.  This is only
     * needed for remote files, as local files are truncated
     */
    if (vnodindx(VM_REGION(args), pgindx) > fsdata->isize) {
	/*
	 * This page is past the end of the file.  Unlock this page
	 * (region_trunc will throw it away) and then call region_trunc()
	 * to invalidate all pages past the new end of the file.
	 */
	VM_SETFS_FLAGS(args, PAGEOUT_TRUNCATE);

    if ((args->vm_flags & PAGEOUT_VHAND)
	&& (!(args->vm_flags & PAGEOUT_RESERVED))
	&& (!(VM_IS_ZOMBIE(args)))) {
	VASSERT(args->run == 0);
	if (vm_reserve_malloc_memory(NFS_PAGEOUT_MEM)) {
	    /*
	     * Got enough memory to pageout.  Mark the fact that we did
	     * a sysprocmemreserve(), so that we can sysprocmemunreserve() it
	     * later (in remote_pageout()).
	     */
	    args->vm_flags |= PAGEOUT_RESERVED;

	    /*
	     * We do not have enough memory to do this pageout.  By
	     * definition, we do not yet have a run, so we just unlock
	     * this page and tell foreach_valid() to continue scanning.
	     * If we come across another dirty page, we will try to
	     * reserve memory again.  That is okay, in fact some memory
	     * may have freed up (as earlier pageouts complete under
	     */

    fs_bsize = vtoblksz(bp->b_vp);
    /*
     * Check to see if we are starting mid block.  If so, then
     * we must return the remainder of the block or less depending
     */
    bnrem = bp->b_offset % fs_bsize;
	max_size = fs_bsize - bnrem;
	max_size = fs_bsize;

    if (bp->b_bcount > max_size) {
	return (bp->b_bcount);

afs_mmap(vp, off, size_bytes, access)
#if defined(AFS_HPUX1111_ENV)
    long bsize = vtoblksz(vp);

    if (bsize % NBPG != 0) {

afs_cachelimit(vp, len, location)
    /*
     * Disk addresses are logical, not physical, so fragments are
     */
    *location = btorp(len) + 1;

afs_unmap(vp, off, size_bytes, access)
#if defined(AFS_HPUX1111_ENV)

afs_read_ahead(vp, prp, wrt, space, vaddr, rhead_cnt)
    printf("afs_read_ahead returning 0 \n");

afs_prealloc(vp, size, ignore_minfree, reserved)
    /* DEE on 11.22 the following is off_t */
    printf("afs_prealloc returning ENOSPC\n");

afs_ioctl(vp, com, data, flag, cred)
    struct afs_ioctl afsioctl, *ai;

    AFS_STATCNT(afs_ioctl);

    /* The call must be a VICEIOCTL call */
    if (((com >> 8) & 0xff) == 'V') {
	/* AFS_COPYIN returns error 14.  Copy the data in instead */
	AFS_COPYIN(data, (caddr_t) & afsioctl, sizeof(afsioctl), error);
	ai = (struct afs_ioctl *)data;
	afsioctl.in = ai->in;
	afsioctl.out = ai->out;
	afsioctl.in_size = ai->in_size;
	afsioctl.out_size = ai->out_size;
	error = HandleIoctl(VTOAFS(vp), com, &afsioctl);

#if defined(AFS_HPUX1111_ENV)
/* It looks like even if the application is 32 bit, we need to round to 8 bytes */
/* This had no effect; it must not be being used */

#define roundtoint(x) (((x) + (sizeof(long) - 1)) & ~(sizeof(long) - 1))
#define reclen(dp) roundtoint(((dp)->d_namlen + 1 + (sizeof(u_long)) +\
			sizeof(u_int) + 2 * sizeof(u_short)))

#define roundtoint(x) (((x) + (sizeof(int) - 1)) & ~(sizeof(int) - 1))
#define reclen(dp) roundtoint(((dp)->d_namlen + 1 + (sizeof(u_long)) +\
			2 * sizeof(u_short)))
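/*
 * Worked example for reclen()/roundtoint() above (sizes assumed for a
 * 32-bit build: sizeof(u_long) == 4, sizeof(u_short) == 2, sizeof(int) == 4):
 * a 5-character name gives 5 + 1 + 4 + 2*2 == 14 raw bytes, which
 * roundtoint() pads up to the next 4-byte boundary, i.e. a 16-byte record.
 */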
afs_readdir(vp, uiop, cred)
    caddr_t ibuf, obuf, ibufend, obufend;
    struct __dirent32 *idp;
    int count, outcount;
    uint64_t tmp_offset;

    memset(&auio, 0, sizeof(auio));
    memset(&aiov, 0, sizeof(aiov));

    count = uiop->uio_resid;
    /* Allocate temporary space for format conversion */
    ibuf = kmem_alloc(2 * count);	/* overkill - fix later */
    obuf = kmem_alloc(count + sizeof(struct dirent));
    aiov.iov_base = ibuf;
    aiov.iov_len = count;
    auio.uio_iov = &aiov;
    auio.uio_iovcnt = 1;
    offset = auio.uio_offset = uiop->uio_offset;
    auio.uio_seg = UIOSEG_KERNEL;
    auio.uio_resid = count;
    auio.uio_fpflags = 0;

    u.u_error = mp_afs_readdir2(vp, &auio, cred);

    /* Convert entries from __dirent32 to dirent format */

    for (idp = (struct __dirent32 *)ibuf, odp =
	 (struct dirent *)obuf, ibufend =
	 ibuf + (count - auio.uio_resid), obufend = obuf + count;
	 (caddr_t) idp < ibufend;
	 idp = (struct __dirent32 *)((caddr_t) idp + idp->__d_reclen), odp =
	 (struct dirent *)((caddr_t) odp + odp->d_reclen)) {
	odp->d_ino = idp->__d_ino;
	odp->d_namlen = idp->__d_namlen;
	(void)strcpy(odp->d_name, idp->__d_name);
	odp->d_reclen = reclen(odp);
	if ((caddr_t) odp + odp->d_reclen > obufend)
	/* record offset *after* we're sure to use this entry */
	memcpy((char *)&tmp_offset, (char *)&idp->__d_off, sizeof tmp_offset);
	offset = tmp_offset;

    outcount = (caddr_t) odp - obuf;
    AFS_UIOMOVE(obuf, outcount, UIO_READ, uiop, u.u_error);
    uiop->uio_offset = offset;

    kmem_free(ibuf, count);
    kmem_free(obuf, count + sizeof(struct dirent));
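/*
 * Hedged sketch of the conversion step above for a single entry (field
 * names as used in this file; buffer-overflow and offset bookkeeping
 * omitted):
 */
#if 0
static void
copy_dirent32_to_dirent(struct __dirent32 *idp, struct dirent *odp)
{
    odp->d_ino = idp->__d_ino;		/* inode number carries over */
    odp->d_namlen = idp->__d_namlen;	/* name length carries over */
    (void)strcpy(odp->d_name, idp->__d_name);
    odp->d_reclen = reclen(odp);	/* recompute the record length for
					 * the output format; it may differ
					 * from idp->__d_reclen */
}
#endif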
#define roundtolong(x) (((x) + (sizeof(long) - 1)) & ~(sizeof(long) - 1))
#define reclen_dirent64(dp) roundtolong(((dp)->__d_namlen + 1 + (2*sizeof(u_long)) +\
			2 * sizeof(u_short)))

afs_readdir3(vp, uiop, cred)
    caddr_t ibuf, obuf, ibufend, obufend;
    struct __dirent32 *idp;
    struct __dirent64 *odp;
    int count, outcount;

    memset(&auio, 0, sizeof(auio));
    memset(&aiov, 0, sizeof(aiov));

    count = uiop->uio_resid;
    /* Allocate temporary space for format conversion */
    ibuf = kmem_alloc(2 * count);	/* overkill - fix later */
    obuf = kmem_alloc(count + sizeof(struct __dirent64));
    aiov.iov_base = ibuf;
    aiov.iov_len = count;
    auio.uio_iov = &aiov;
    auio.uio_iovcnt = 1;
    offset = auio.uio_offset = uiop->uio_offset;
    auio.uio_seg = UIOSEG_KERNEL;
    auio.uio_resid = count;
    auio.uio_fpflags = 0;

    u.u_error = mp_afs_readdir2(vp, &auio, cred);

    /* Convert entries from __dirent32 to __dirent64 format */

    for (idp = (struct __dirent32 *)ibuf, odp =
	 (struct __dirent64 *)obuf, ibufend =
	 ibuf + (count - auio.uio_resid), obufend = obuf + count;
	 (caddr_t) idp < ibufend;
	 idp = (struct __dirent32 *)((caddr_t) idp + idp->__d_reclen), odp =
	 (struct __dirent64 *)((caddr_t) odp + odp->__d_reclen)) {
	memcpy((char *)&odp->__d_off, (char *)&idp->__d_off,
	       sizeof odp->__d_off);
	odp->__d_ino = idp->__d_ino;
	odp->__d_namlen = idp->__d_namlen;
	(void)strcpy(odp->__d_name, idp->__d_name);
	odp->__d_reclen = reclen_dirent64(odp);
	if ((caddr_t) odp + odp->__d_reclen > obufend)
	/* record offset *after* we're sure to use this entry */
	offset = odp->__d_off;

    outcount = (caddr_t) odp - obuf;
    AFS_UIOMOVE(obuf, outcount, UIO_READ, uiop, u.u_error);
    uiop->uio_offset = offset;

    kmem_free(ibuf, count);
    kmem_free(obuf, count + sizeof(struct __dirent64));
#define AFS_SV_SEMA_HASH 1
#define AFS_SV_SEMA_HASH_DEBUG 0

#if AFS_SV_SEMA_HASH
/* This portion of the code was originally used to implement
 * thread specific storage for the semaphore save area.  However,
 * there were some spare fields in the proc structure; those are
 * now being used for the saved semaphores.  Hence, this portion of
 * the code is no longer used.
 */

/* This portion of the code implements thread specific information.
 * The thread id is passed in as the key.  The semaphore saved area
 * is hashed on this key.
 */

/* Why is this hash table required?
 * The AFS code is written in such a way that a GLOCK() is done in
 * one function and the GUNLOCK() is done in another function further
 * down the call chain.  The GLOCK() call has to save the current
 * semaphore status before acquiring afs_global_sema.  The GUNLOCK
 * has to release afs_global_sema and reacquire the semaphore status
 * that existed before the corresponding GLOCK.  If GLOCK() and
 * GUNLOCK() were called in the same function, the GLOCK call could
 * have stored the saved semaphore status in a local variable and the
 * corresponding GUNLOCK() call could have restored the original
 * status from this local variable.  But this is not the case with
 * AFS code.  Hence, we have to implement a thread specific semaphore
 * save area.  This is implemented as a hash table.  The key is the
 */

/* In order for multithreaded processes to work, the sv_sema structures
 * must be saved on a per-thread basis, not a per-process basis.  There
 * is no per-thread storage available to hijack in the OS per-thread
 * data structures (e.g. struct user) so we revive this code.
 * I removed the upper limit on the memory consumption since we don't
 * know how many threads there will be.  Now the code first checks the
 * freeList.  If that fails it then tries garbage collecting.  If that
 * doesn't free up anything then it allocs what it needs.
 */

#define ELEMENT sv_sema_t
#define Hash(xx) ( (xx) % sizeOfHashTable )
#define hashLockInit(xx) initsema(&xx,1, FILESYS_SEMA_PRI, FILESYS_SEMA_ORDER)
#define hashLock(xx) MP_PSEMA(&xx)
#define hashUnlock(xx) MP_VSEMA(&xx)
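/*
 * Hedged sketch of how the per-thread save area is meant to be used.  The
 * afsHashInsertFind/afsHashFind/afsHashRelease names come from this file;
 * the AFS_GLOCK/AFS_GUNLOCK framing and the use of the kernel thread
 * pointer as the key are assumptions about the callers.
 */
#if 0
void
example_glock(void)
{
    sv_sema_t *savep = afsHashInsertFind((KEY) u.u_kthreadp);
    /* save the current semaphore state in *savep, then take afs_global_sema */
}

void
example_gunlock(void)
{
    sv_sema_t *savep = afsHashFind((KEY) u.u_kthreadp);
    /* release afs_global_sema and restore the state saved in *savep */
    afsHashRelease((KEY) u.u_kthreadp);	/* drop the per-thread reference */
}
#endif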
typedef struct elem {

typedef struct bucket {

static int sizeOfHashTable;
static Bucket *hashTable;

static int currentSize = 0;
static Element *freeList;	/* free list */

static sema_t afsHashLock = { 0 };	/* global lock for hash table */

static void afsHashGarbageCollect();

/*
 ** The global lock protects the global data structures,
 ** e.g. freeList and currentSize.
 ** The bucket lock protects the link list hanging off that bucket.
 ** The lock hierarchy: one can obtain the bucket lock while holding
 ** the global lock, but not vice versa.
 */
afsHash(int nbuckets)
{				/* allocate the hash table */
#if AFS_SV_SEMA_HASH_DEBUG
    printf("afsHash: enter\n");

    sizeOfHashTable = nbuckets;
    currentSize = nbuckets * sizeof(Bucket);

	osi_Panic("afs: SEMA Hashtable already created\n");

    hashTable = (Bucket *) AFS_KALLOC(sizeOfHashTable * sizeof(Bucket));
	osi_Panic("afs: cannot create SEMA Hashtable\n");

    /* initialize the hash table and associated locks */
    memset(hashTable, 0, sizeOfHashTable * sizeof(Bucket));
    for (i = 0; i < sizeOfHashTable; i++)
	hashLockInit(hashTable[i].lock);
    hashLockInit(afsHashLock);

#if AFS_SV_SEMA_HASH_DEBUG
    printf("afsHash: exit\n");

afsHashInsertFind(KEY key)
#if AFS_SV_SEMA_HASH_DEBUG
    printf("afsHashInsertFind: %d\n", key);
	osi_Panic("afs: afsHashInsertFind: no hashTable\n");

    index = Hash(key);		/* get bucket number */
    hashLock(hashTable[index].lock);	/* lock this bucket */
    ptr = hashTable[index].element;

    /* if it is already there */
	if (ptr->key == key) {
	    ptr->refCnt++;	/* hold it */
	    hashUnlock(hashTable[index].lock);
#if AFS_SV_SEMA_HASH_DEBUG
	    printf("afsHashInsertFind: %d FOUND\n", key);
	    return &(ptr->element);

    hashUnlock(hashTable[index].lock);

    /* if something exists in the freeList, take it from there */
    hashLock(afsHashLock);

	ptr = freeList;		/* reuse entry */
	freeList = freeList->next;
	afsHashGarbageCollect();	/* afsHashLock locked */
	    ptr = freeList;	/* reuse entry */
	    freeList = freeList->next;
	    ptr = (Element *) AFS_KALLOC(sizeof(Element));

    currentSize += sizeof(Element);	/* update memory used */
    hashUnlock(afsHashLock);

	osi_Panic("afs: SEMA Hashtable cannot create new entry\n");
    /* create new entry */
    memset(&ptr->element, 0, sizeof(ptr->element));
    ptr->refCnt = 1;		/* this guy */

    /* insert new entry in bucket */
    hashLock(hashTable[index].lock);	/* lock this bucket */
    ptr->next = hashTable[index].element;
    hashTable[index].element = ptr;
    hashUnlock(hashTable[index].lock);

#if AFS_SV_SEMA_HASH_DEBUG
    printf("afsHashInsertFind: %d MADE\n", key);

    return &(ptr->element);

afsHashFind(KEY key)
#if AFS_SV_SEMA_HASH_DEBUG
    printf("afsHashFind: %d\n", key);
	osi_Panic("afs: afsHashFind: no hashTable\n");

    index = Hash(key);		/* get bucket number */
    hashLock(hashTable[index].lock);	/* lock this bucket */
    ptr = hashTable[index].element;

    /* it should be in the hash table */
	if (ptr->key == key) {
	    if (ptr->refCnt <= 0)
		osi_Panic("afs: SEMA HashTable entry already released\n");
	    hashUnlock(hashTable[index].lock);
#if AFS_SV_SEMA_HASH_DEBUG
	    printf("afsHashFind: %d FOUND\n", key);
	    return &(ptr->element);

    hashUnlock(hashTable[index].lock);
    /* it better be in the hash table */
    osi_Panic("afs: SEMA HashTable wants non-existent entry \n");

afsHashRelease(KEY key)
#if AFS_SV_SEMA_HASH_DEBUG
    printf("afsHashRelease: %d\n", key);
	osi_Panic("afs: afsHashRelease: no hashTable\n");

    index = Hash(key);		/* get bucket number */
    hashLock(hashTable[index].lock);	/* lock this bucket */
    ptr = hashTable[index].element;

    /* it should be in the hash table */
	if (ptr->key == key) {
	    if (ptr->refCnt <= 0)
		osi_Panic("afs: SEMA HashTable entry already released\n");
	    ptr->refCnt--;	/* release this guy */
	    hashUnlock(hashTable[index].lock);
#if AFS_SV_SEMA_HASH_DEBUG
	    printf("afsHashRelease: %d FOUND\n", key);

    hashUnlock(hashTable[index].lock);
    /* it better be in the hash table */
    osi_Panic("afs: SEMA HashTable deleting non-existent entry \n");

/* this should be called with afsHashLock WRITE locked */
afsHashGarbageCollect()
	osi_Panic("afs: afsHashGarbageCollect: no hashTable\n");

    for (index = 0; index < sizeOfHashTable; index++) {
	hashLock(hashTable[index].lock);
	ptr = hashTable[index].element;	/* pick up bucket */

	while (ptr && !ptr->refCnt) {
	    /* insert this element into the free list */
	    ptr->next = freeList;
	    foundFlag = 1;	/* found at least one */
	    currentSize -= sizeof(Element);

	hashTable[index].element = ptr;

	/* scan thru the remaining list */
	    if (ptr->next->refCnt == 0) {
		/* collect this element */
		ptr->next = ptr->next->next;
		temp->next = freeList;
		currentSize -= sizeof(Element);

	hashUnlock(hashTable[index].lock);

	osi_Panic("afs: SEMA HashTable full\n");

#endif /* AFS_SV_SEMA_HASH */
    struct iovec tiovec[1];
    extern caddr_t hdl_kmap_bp();
    struct kthread *t = u.u_kthreadp;

    memset(&tuio, 0, sizeof(tuio));
    memset(&tiovec, 0, sizeof(tiovec));

    AFS_STATCNT(afs_hp_strategy);
    /*
     * hdl_kmap_bp() saves "b_bcount" and restores it in hdl_remap_bp() after
     * the I/O.  We must save and restore the count because pageiodone()
     * uses b_bcount to determine how many pages to unlock.
     *
     * Remap the entire range.
     */

    afs_Trace4(afs_iclSetp, CM_TRACE_HPSTRAT, ICL_TYPE_POINTER, bp->b_vp,
	       ICL_TYPE_LONG, (int)bp->b_blkno * DEV_BSIZE, ICL_TYPE_LONG,
	       bp->b_bcount, ICL_TYPE_LONG, 0);

    /* Set up the uio structure */
    tuio.afsio_iov = tiovec;
    tuio.afsio_iovcnt = 1;
    tuio.afsio_offset = DEV_BSIZE * bp->b_blkno;
    tuio.afsio_seg = AFS_UIOSYS;
    tuio.afsio_resid = bp->b_bcount;
    tuio.uio_fpflags = 0;
    tiovec[0].iov_base = bp->b_un.b_addr;
    tiovec[0].iov_len = bp->b_bcount;

    if ((bp->b_flags & B_READ) == B_READ) {
	/* Read b_bcount bytes into kernel address b_un.b_addr
	 * starting at byte DEV_BSIZE * b_blkno.  Bzero anything
	 * we can't read, and finally call iodone(bp).  File is
	 * in bp->b_vp.  Credentials are from u area?? */
	code = afs_rdwr(VTOAFS(bp->b_vp), &tuio, UIO_READ, 0, kt_cred(t));
	if (tuio.afsio_resid > 0) {
	    privlbzero(bvtospace(bp, bp->b_un.b_addr),
		       bp->b_un.b_addr + bp->b_bcount - tuio.afsio_resid,
		       (size_t) tuio.afsio_resid);

	code = afs_rdwr(VTOAFS(bp->b_vp), &tuio, UIO_WRITE, 0, kt_cred(t));

    /* Remap back to the user's space */
afs_pathconf(vp, name, resultp, cred)
    struct ucred *cred;		/* unused */

    case _PC_LINK_MAX:		/* Maximum number of links to a file */
	*resultp = 255;		/* an unsigned short on the fileserver */
	break;			/* an unsigned char in the client.... */

    case _PC_NAME_MAX:		/* Max length of file name */

    case _PC_PATH_MAX:		/* Maximum length of Path Name */

    case _PC_PIPE_BUF:		/* Max atomic write to pipe.  See fifo_vnops */
    case _PC_CHOWN_RESTRICTED:	/* Anybody can chown? */
    case _PC_NO_TRUNC:		/* No file name truncation on overflow? */
	u.u_error = EOPNOTSUPP;
	return (EOPNOTSUPP);

    case _PC_MAX_CANON:		/* TTY buffer size for canonical input */
	/* need more work here for pty, ite buffer size, if they differ */
	if (vp->v_type != VCHR) {
	*resultp = CANBSIZ;	/* for tty */

	/* need more work here for pty, ite buffer size, if they differ */
	if (vp->v_type != VCHR) {	/* TTY buffer size */
	*resultp = TTYHOG;	/* for tty */

	/* Terminal special characters can be disabled? */
	if (vp->v_type != VCHR) {

	if ((vp->v_type != VREG) && (vp->v_type != VBLK)) {
	*resultp = 1;		/* Synchronized IO supported for this file */

    case _PC_FILESIZEBITS:
	if (vp->v_type != VDIR)
	*resultp = MAX_SMALL_FILE_BITS;