kernel/vm/seg_map.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25
  26 /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T     */
  27 /*        All Rights Reserved   */
  28
  29 /*
  30  * Portions of this source code were derived from Berkeley 4.3 BSD
  31  * under license from the Regents of the University of California.
  32  */
  33
  34 /*
  35  * VM - generic vnode mapping segment.
  36  *
  37  * The segmap driver is used only by the kernel to get faster (than seg_vn)
  38  * mappings [lower routine overhead; more persistent cache] to random
  39  * vnode/offsets.  Note than the kernel may (and does) use seg_vn as well.
  40  */
  41
  42 #include <sys/types.h>
  43 #include <sys/t_lock.h>
  44 #include <sys/param.h>
  45 #include <sys/sysmacros.h>
  46 #include <sys/buf.h>
  47 #include <sys/systm.h>
  48 #include <sys/vnode.h>
  49 #include <sys/mman.h>
  50 #include <sys/errno.h>
  51 #include <sys/cred.h>
  52 #include <sys/kmem.h>
  53 #include <sys/vtrace.h>
  54 #include <sys/cmn_err.h>
  55 #include <sys/debug.h>
  56 #include <sys/thread.h>
  57 #include <sys/dumphdr.h>
  58 #include <sys/bitmap.h>
  59 #include <sys/lgrp.h>
  60
  61 #include <vm/seg_kmem.h>
  62 #include <vm/hat.h>
  63 #include <vm/as.h>
  64 #include <vm/seg.h>
  65 #include <vm/seg_kpm.h>
  66 #include <vm/seg_map.h>
  67 #include <vm/page.h>
  68 #include <vm/pvn.h>
  69 #include <vm/rm.h>
  70
  71 /*
  72  * Private seg op routines.
  73  */
  74 static void     segmap_free(struct seg *seg);
  75 faultcode_t segmap_fault(struct hat *hat, struct seg *seg, caddr_t addr,
  76                         size_t len, enum fault_type type, enum seg_rw rw);
  77 static faultcode_t segmap_faulta(struct seg *seg, caddr_t addr);
  78 static int      segmap_checkprot(struct seg *seg, caddr_t addr, size_t len,
  79                         uint_t prot);
  80 static int      segmap_kluster(struct seg *seg, caddr_t addr, ssize_t);
  81 static int      segmap_getprot(struct seg *seg, caddr_t addr, size_t len,
  82                         uint_t *protv);
  83 static uoff_t   segmap_getoffset(struct seg *seg, caddr_t addr);
  84 static int      segmap_gettype(struct seg *seg, caddr_t addr);
  85 static int      segmap_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp);
  86 static void     segmap_dump(struct seg *seg);
  87 static int      segmap_pagelock(struct seg *seg, caddr_t addr, size_t len,
  88                         struct page ***ppp, enum lock_type type,
  89                         enum seg_rw rw);
  90 static void     segmap_badop(void);
  91 static int      segmap_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp);
  92
  93 /* segkpm support */
  94 static caddr_t  segmap_pagecreate_kpm(struct seg *, vnode_t *, uoff_t,
  95                         struct smap *, enum seg_rw);
  96 struct smap     *get_smap_kpm(caddr_t, page_t **);
  97
  98 #define SEGMAP_BADOP(t) (t(*)())segmap_badop
  99
 100 static const struct seg_ops segmap_ops = {
 101         .dup            = SEGMAP_BADOP(int),
 102         .unmap          = SEGMAP_BADOP(int),
 103         .free           = segmap_free,
 104         .fault          = segmap_fault,
 105         .faulta         = segmap_faulta,
 106         .setprot        = SEGMAP_BADOP(int),
 107         .checkprot      = segmap_checkprot,
 108         .kluster        = segmap_kluster,
 109         .sync           = SEGMAP_BADOP(int),
 110         .incore         = SEGMAP_BADOP(size_t),
 111         .lockop         = SEGMAP_BADOP(int),
 112         .getprot        = segmap_getprot,
 113         .getoffset      = segmap_getoffset,
 114         .gettype        = segmap_gettype,
 115         .getvp          = segmap_getvp,
 116         .advise         = SEGMAP_BADOP(int),
 117         .dump           = segmap_dump,
 118         .pagelock       = segmap_pagelock,
 119         .setpagesize    = SEGMAP_BADOP(int),
 120         .getmemid       = segmap_getmemid,
 121 };
 122
 123 /*
 124  * Private segmap routines.
 125  */
 126 static void     segmap_unlock(struct hat *hat, struct seg *seg, caddr_t addr,
 127                         size_t len, enum seg_rw rw, struct smap *smp);
 128 static void     segmap_smapadd(struct smap *smp);
 129 static struct smap *segmap_hashin(struct smap *smp, struct vnode *vp,
 130                         uoff_t off, int hashid);
 131 static void     segmap_hashout(struct smap *smp);
 132
 133
 134 /*
 135  * Statistics for segmap operations.
 136  *
 137  * No explicit locking to protect these stats.
 138  */
 139 struct segmapcnt segmapcnt = {
 140         { "fault",              KSTAT_DATA_ULONG },
 141         { "faulta",             KSTAT_DATA_ULONG },
 142         { "getmap",             KSTAT_DATA_ULONG },
 143         { "get_use",            KSTAT_DATA_ULONG },
 144         { "get_reclaim",        KSTAT_DATA_ULONG },
 145         { "get_reuse",          KSTAT_DATA_ULONG },
 146         { "get_unused",         KSTAT_DATA_ULONG },
 147         { "get_nofree",         KSTAT_DATA_ULONG },
 148         { "rel_async",          KSTAT_DATA_ULONG },
 149         { "rel_write",          KSTAT_DATA_ULONG },
 150         { "rel_free",           KSTAT_DATA_ULONG },
 151         { "rel_abort",          KSTAT_DATA_ULONG },
 152         { "rel_dontneed",       KSTAT_DATA_ULONG },
 153         { "release",            KSTAT_DATA_ULONG },
 154         { "pagecreate",         KSTAT_DATA_ULONG },
 155         { "free_notfree",       KSTAT_DATA_ULONG },
 156         { "free_dirty",         KSTAT_DATA_ULONG },
 157         { "free",               KSTAT_DATA_ULONG },
 158         { "stolen",             KSTAT_DATA_ULONG },
 159         { "get_nomtx",          KSTAT_DATA_ULONG }
 160 };
 161
 162 kstat_named_t *segmapcnt_ptr = (kstat_named_t *)&segmapcnt;
 163 uint_t segmapcnt_ndata = sizeof (segmapcnt) / sizeof (kstat_named_t);
 164
 165 /*
 166  * Return number of map pages in segment.
 167  */
 168 #define MAP_PAGES(seg)          ((seg)->s_size >> MAXBSHIFT)
 169
 170 /*
 171  * Translate addr into smap number within segment.
 172  */
 173 #define MAP_PAGE(seg, addr)  (((addr) - (seg)->s_base) >> MAXBSHIFT)
 174
 175 /*
 176  * Translate addr in seg into struct smap pointer.
 177  */
 178 #define GET_SMAP(seg, addr)     \
 179         &(((struct segmap_data *)((seg)->s_data))->smd_sm[MAP_PAGE(seg, addr)])
 180
 181 /*
 182  * Bit in map (16 bit bitmap).
 183  */
 184 #define SMAP_BIT_MASK(bitindex) (1 << ((bitindex) & 0xf))
 185
 186 static int smd_colormsk = 0;
 187 static int smd_ncolor = 0;
 188 static int smd_nfree = 0;
 189 static int smd_freemsk = 0;
 190 #ifdef DEBUG
 191 static int *colors_used;
 192 #endif
 193 static struct smap *smd_smap;
 194 static struct smaphash *smd_hash;
 195 #ifdef SEGMAP_HASHSTATS
 196 static unsigned int *smd_hash_len;
 197 #endif
 198 static struct smfree *smd_free;
 199 static ulong_t smd_hashmsk = 0;
 200
 201 #define SEGMAP_MAXCOLOR         2
 202 #define SEGMAP_CACHE_PAD        64
 203
 204 union segmap_cpu {
 205         struct {
 206                 uint32_t        scpu_free_ndx[SEGMAP_MAXCOLOR];
 207                 struct smap     *scpu_last_smap;
 208                 ulong_t         scpu_getmap;
 209                 ulong_t         scpu_release;
 210                 ulong_t         scpu_get_reclaim;
 211                 ulong_t         scpu_fault;
 212                 ulong_t         scpu_pagecreate;
 213                 ulong_t         scpu_get_reuse;
 214         } scpu;
 215         char    scpu_pad[SEGMAP_CACHE_PAD];
 216 };
 217 static union segmap_cpu *smd_cpu;
 218
 219 /*
 220  * There are three locks in seg_map:
 221  *      - per freelist mutexes
 222  *      - per hashchain mutexes
 223  *      - per smap mutexes
 224  *
 225  * The lock ordering is to get the smap mutex to lock down the slot
 226  * first then the hash lock (for hash in/out (vp, off) list) or the
 227  * freelist lock to put the slot back on the free list.
 228  *
 229  * The hash search is done by only holding the hashchain lock, when a wanted
 230  * slot is found, we drop the hashchain lock then lock the slot so there
 231  * is no overlapping of hashchain and smap locks. After the slot is
 232  * locked, we verify again if the slot is still what we are looking
 233  * for.
 234  *
 235  * Allocation of a free slot is done by holding the freelist lock,
 236  * then locking the smap slot at the head of the freelist. This is
 237  * in reversed lock order so mutex_tryenter() is used.
 238  *
 239  * The smap lock protects all fields in smap structure except for
 240  * the link fields for hash/free lists which are protected by
 241  * hashchain and freelist locks.
 242  */
 243
 244 #define SHASHMTX(hashid)        (&smd_hash[hashid].sh_mtx)
 245
 246 #define SMP2SMF(smp)            (&smd_free[(smp - smd_smap) & smd_freemsk])
 247 #define SMP2SMF_NDX(smp)        (ushort_t)((smp - smd_smap) & smd_freemsk)
 248
 249 #define SMAPMTX(smp) (&smp->sm_mtx)
 250
 251 #define SMAP_HASHFUNC(vp, off, hashid) \
 252         { \
 253         hashid = ((((uintptr_t)(vp) >> 6) + ((uintptr_t)(vp) >> 3) + \
 254                 ((off) >> MAXBSHIFT)) & smd_hashmsk); \
 255         }
 256
 257 /*
 258  * The most frequently updated kstat counters are kept in the
 259  * per cpu array to avoid hot cache blocks. The update function
 260  * sums the cpu local counters to update the global counters.
 261  */
 262
 263 /* ARGSUSED */
 264 int
 265 segmap_kstat_update(kstat_t *ksp, int rw)
 266 {
 267         int i;
 268         ulong_t getmap, release, get_reclaim;
 269         ulong_t fault, pagecreate, get_reuse;
 270
 271         if (rw == KSTAT_WRITE)
 272                 return (EACCES);
 273         getmap = release = get_reclaim = (ulong_t)0;
 274         fault = pagecreate = get_reuse = (ulong_t)0;
 275         for (i = 0; i < max_ncpus; i++) {
 276                 getmap += smd_cpu[i].scpu.scpu_getmap;
 277                 release  += smd_cpu[i].scpu.scpu_release;
 278                 get_reclaim += smd_cpu[i].scpu.scpu_get_reclaim;
 279                 fault  += smd_cpu[i].scpu.scpu_fault;
 280                 pagecreate  += smd_cpu[i].scpu.scpu_pagecreate;
 281                 get_reuse += smd_cpu[i].scpu.scpu_get_reuse;
 282         }
 283         segmapcnt.smp_getmap.value.ul = getmap;
 284         segmapcnt.smp_release.value.ul = release;
 285         segmapcnt.smp_get_reclaim.value.ul = get_reclaim;
 286         segmapcnt.smp_fault.value.ul = fault;
 287         segmapcnt.smp_pagecreate.value.ul = pagecreate;
 288         segmapcnt.smp_get_reuse.value.ul = get_reuse;
 289         return (0);
 290 }
 291
 292 int
 293 segmap_create(struct seg *seg, void *argsp)
 294 {
 295         struct segmap_data *smd;
 296         struct smap *smp;
 297         struct smfree *sm;
 298         struct segmap_crargs *a = (struct segmap_crargs *)argsp;
 299         struct smaphash *shashp;
 300         union segmap_cpu *scpu;
 301         long i, npages;
 302         size_t hashsz;
 303         uint_t nfreelist;
 304         extern void prefetch_smap_w(void *);
 305         extern int max_ncpus;
 306
 307         ASSERT(seg->s_as && RW_WRITE_HELD(&seg->s_as->a_lock));
 308
 309         if (((uintptr_t)seg->s_base | seg->s_size) & MAXBOFFSET) {
 310                 panic("segkmap not MAXBSIZE aligned");
 311                 /*NOTREACHED*/
 312         }
 313
 314         smd = kmem_zalloc(sizeof (struct segmap_data), KM_SLEEP);
 315
 316         seg->s_data = (void *)smd;
 317         seg->s_ops = &segmap_ops;
 318         smd->smd_prot = a->prot;
 319
 320         /*
 321          * Scale the number of smap freelists to be
 322          * proportional to max_ncpus * number of virtual colors.
 323          * The caller can over-ride this scaling by providing
 324          * a non-zero a->nfreelist argument.
 325          */
 326         nfreelist = a->nfreelist;
 327         if (nfreelist == 0)
 328                 nfreelist = max_ncpus;
 329         else if (nfreelist < 0 || nfreelist > 4 * max_ncpus) {
 330                 cmn_err(CE_WARN, "segmap_create: nfreelist out of range "
 331                 "%d, using %d", nfreelist, max_ncpus);
 332                 nfreelist = max_ncpus;
 333         }
 334         if (!ISP2(nfreelist)) {
 335                 /* round up nfreelist to the next power of two. */
 336                 nfreelist = 1 << (highbit(nfreelist));
 337         }
 338
 339         /*
 340          * Get the number of virtual colors - must be a power of 2.
 341          */
 342         if (a->shmsize)
 343                 smd_ncolor = a->shmsize >> MAXBSHIFT;
 344         else
 345                 smd_ncolor = 1;
 346         ASSERT((smd_ncolor & (smd_ncolor - 1)) == 0);
 347         ASSERT(smd_ncolor <= SEGMAP_MAXCOLOR);
 348         smd_colormsk = smd_ncolor - 1;
 349         smd->smd_nfree = smd_nfree = smd_ncolor * nfreelist;
 350         smd_freemsk = smd_nfree - 1;
 351
 352         /*
 353          * Allocate and initialize the freelist headers.
 354          * Note that sm_freeq[1] starts out as the release queue. This
 355          * is known when the smap structures are initialized below.
 356          */
 357         smd_free = smd->smd_free =
 358             kmem_zalloc(smd_nfree * sizeof (struct smfree), KM_SLEEP);
 359         for (i = 0; i < smd_nfree; i++) {
 360                 sm = &smd->smd_free[i];
 361                 mutex_init(&sm->sm_freeq[0].smq_mtx, NULL, MUTEX_DEFAULT, NULL);
 362                 mutex_init(&sm->sm_freeq[1].smq_mtx, NULL, MUTEX_DEFAULT, NULL);
 363                 sm->sm_allocq = &sm->sm_freeq[0];
 364                 sm->sm_releq = &sm->sm_freeq[1];
 365         }
 366
 367         /*
 368          * Allocate and initialize the smap hash chain headers.
 369          * Compute hash size rounding down to the next power of two.
 370          */
 371         npages = MAP_PAGES(seg);
 372         smd->smd_npages = npages;
 373         hashsz = npages / SMAP_HASHAVELEN;
 374         hashsz = 1 << (highbit(hashsz)-1);
 375         smd_hashmsk = hashsz - 1;
 376         smd_hash = smd->smd_hash =
 377             kmem_alloc(hashsz * sizeof (struct smaphash), KM_SLEEP);
 378 #ifdef SEGMAP_HASHSTATS
 379         smd_hash_len =
 380             kmem_zalloc(hashsz * sizeof (unsigned int), KM_SLEEP);
 381 #endif
 382         for (i = 0, shashp = smd_hash; i < hashsz; i++, shashp++) {
 383                 shashp->sh_hash_list = NULL;
 384                 mutex_init(&shashp->sh_mtx, NULL, MUTEX_DEFAULT, NULL);
 385         }
 386
 387         /*
 388          * Allocate and initialize the smap structures.
 389          * Link all slots onto the appropriate freelist.
 390          * The smap array is large enough to affect boot time
 391          * on large systems, so use memory prefetching and only
 392          * go through the array 1 time. Inline a optimized version
 393          * of segmap_smapadd to add structures to freelists with
 394          * knowledge that no locks are needed here.
 395          */
 396         smd_smap = smd->smd_sm =
 397             kmem_alloc(sizeof (struct smap) * npages, KM_SLEEP);
 398
 399         for (smp = &smd->smd_sm[MAP_PAGES(seg) - 1];
 400             smp >= smd->smd_sm; smp--) {
 401                 struct smap *smpfreelist;
 402                 struct sm_freeq *releq;
 403
 404                 prefetch_smap_w((char *)smp);
 405
 406                 smp->sm_vp = NULL;
 407                 smp->sm_hash = NULL;
 408                 smp->sm_off = 0;
 409                 smp->sm_bitmap = 0;
 410                 smp->sm_refcnt = 0;
 411                 mutex_init(&smp->sm_mtx, NULL, MUTEX_DEFAULT, NULL);
 412                 smp->sm_free_ndx = SMP2SMF_NDX(smp);
 413
 414                 sm = SMP2SMF(smp);
 415                 releq = sm->sm_releq;
 416
 417                 smpfreelist = releq->smq_free;
 418                 if (smpfreelist == 0) {
 419                         releq->smq_free = smp->sm_next = smp->sm_prev = smp;
 420                 } else {
 421                         smp->sm_next = smpfreelist;
 422                         smp->sm_prev = smpfreelist->sm_prev;
 423                         smpfreelist->sm_prev = smp;
 424                         smp->sm_prev->sm_next = smp;
 425                         releq->smq_free = smp->sm_next;
 426                 }
 427
 428                 /*
 429                  * sm_flag = 0 (no SM_QNDX_ZERO) implies smap on sm_freeq[1]
 430                  */
 431                 smp->sm_flags = 0;
 432
 433 #ifdef  SEGKPM_SUPPORT
 434                 /*
 435                  * Due to the fragile prefetch loop no
 436                  * separate function is used here.
 437                  */
 438                 smp->sm_kpme_next = NULL;
 439                 smp->sm_kpme_prev = NULL;
 440                 smp->sm_kpme_page = NULL;
 441 #endif
 442         }
 443
 444         /*
 445          * Allocate the per color indices that distribute allocation
 446          * requests over the free lists. Each cpu will have a private
 447          * rotor index to spread the allocations even across the available
 448          * smap freelists. Init the scpu_last_smap field to the first
 449          * smap element so there is no need to check for NULL.
 450          */
 451         smd_cpu =
 452             kmem_zalloc(sizeof (union segmap_cpu) * max_ncpus, KM_SLEEP);
 453         for (i = 0, scpu = smd_cpu; i < max_ncpus; i++, scpu++) {
 454                 int j;
 455                 for (j = 0; j < smd_ncolor; j++)
 456                         scpu->scpu.scpu_free_ndx[j] = j;
 457                 scpu->scpu.scpu_last_smap = smd_smap;
 458         }
 459
 460         vpm_init();
 461
 462 #ifdef DEBUG
 463         /*
 464          * Keep track of which colors are used more often.
 465          */
 466         colors_used = kmem_zalloc(smd_nfree * sizeof (int), KM_SLEEP);
 467 #endif /* DEBUG */
 468
 469         return (0);
 470 }
 471
 472 static void
 473 segmap_free(seg)
 474         struct seg *seg;
 475 {
 476         ASSERT(seg->s_as && RW_WRITE_HELD(&seg->s_as->a_lock));
 477 }
 478
 479 /*
 480  * Do a F_SOFTUNLOCK call over the range requested.
 481  * The range must have already been F_SOFTLOCK'ed.
 482  */
 483 static void
 484 segmap_unlock(
 485         struct hat *hat,
 486         struct seg *seg,
 487         caddr_t addr,
 488         size_t len,
 489         enum seg_rw rw,
 490         struct smap *smp)
 491 {
 492         page_t *pp;
 493         caddr_t adr;
 494         uoff_t off;
 495         struct vnode *vp;
 496         kmutex_t *smtx;
 497
 498         ASSERT(smp->sm_refcnt > 0);
 499
 500
 501         if (segmap_kpm && IS_KPM_ADDR(addr)) {
 502
 503                 /*
 504                  * We're called only from segmap_fault and this was a
 505                  * NOP in case of a kpm based smap, so dangerous things
 506                  * must have happened in the meantime. Pages are prefaulted
 507                  * and locked in segmap_getmapflt and they will not be
 508                  * unlocked until segmap_release.
 509                  */
 510                 panic("segmap_unlock: called with kpm addr %p", (void *)addr);
 511                 /*NOTREACHED*/
 512         }
 513
 514         vp = smp->sm_vp;
 515         off = smp->sm_off + (uoff_t)((uintptr_t)addr & MAXBOFFSET);
 516
 517         hat_unlock(hat, addr, P2ROUNDUP(len, PAGESIZE));
 518         for (adr = addr; adr < addr + len; adr += PAGESIZE, off += PAGESIZE) {
 519                 ushort_t bitmask;
 520
 521                 /*
 522                  * Use page_find() instead of page_lookup() to
 523                  * find the page since we know that it has
 524                  * "shared" lock.
 525                  */
 526                 pp = page_find(&vp->v_object, off);
 527                 if (pp == NULL) {
 528                         panic("segmap_unlock: page not found");
 529                         /*NOTREACHED*/
 530                 }
 531
 532                 if (rw == S_WRITE) {
 533                         hat_setrefmod(pp);
 534                 } else if (rw != S_OTHER) {
 535                         hat_setref(pp);
 536                 }
 537
 538                 /*
 539                  * Clear bitmap, if the bit corresponding to "off" is set,
 540                  * since the page and translation are being unlocked.
 541                  */
 542                 bitmask = SMAP_BIT_MASK((off - smp->sm_off) >> PAGESHIFT);
 543
 544                 /*
 545                  * Large Files: Following assertion is to verify
 546                  * the correctness of the cast to (int) above.
 547                  */
 548                 ASSERT((uoff_t)(off - smp->sm_off) <= INT_MAX);
 549                 smtx = SMAPMTX(smp);
 550                 mutex_enter(smtx);
 551                 if (smp->sm_bitmap & bitmask) {
 552                         smp->sm_bitmap &= ~bitmask;
 553                 }
 554                 mutex_exit(smtx);
 555
 556                 page_unlock(pp);
 557         }
 558 }
 559
 560 #define MAXPPB  (MAXBSIZE/4096) /* assumes minimum page size of 4k */
 561
 562 /*
 563  * This routine is called via a machine specific fault handling
 564  * routine.  It is also called by software routines wishing to
 565  * lock or unlock a range of addresses.
 566  *
 567  * Note that this routine expects a page-aligned "addr".
 568  */
 569 faultcode_t
 570 segmap_fault(
 571         struct hat *hat,
 572         struct seg *seg,
 573         caddr_t addr,
 574         size_t len,
 575         enum fault_type type,
 576         enum seg_rw rw)
 577 {
 578         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
 579         struct smap *smp;
 580         page_t *pp, **ppp;
 581         struct vnode *vp;
 582         uoff_t off;
 583         page_t *pl[MAXPPB + 1];
 584         uint_t prot;
 585         uoff_t addroff;
 586         caddr_t adr;
 587         int err;
 588         uoff_t sm_off;
 589         int hat_flag;
 590
 591         if (segmap_kpm && IS_KPM_ADDR(addr)) {
 592                 int newpage;
 593                 kmutex_t *smtx;
 594
 595                 /*
 596                  * Pages are successfully prefaulted and locked in
 597                  * segmap_getmapflt and can't be unlocked until
 598                  * segmap_release. No hat mappings have to be locked
 599                  * and they also can't be unlocked as long as the
 600                  * caller owns an active kpm addr.
 601                  */
 602 #ifndef DEBUG
 603                 if (type != F_SOFTUNLOCK)
 604                         return (0);
 605 #endif
 606
 607                 if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
 608                         panic("segmap_fault: smap not found "
 609                             "for addr %p", (void *)addr);
 610                         /*NOTREACHED*/
 611                 }
 612
 613                 smtx = SMAPMTX(smp);
 614 #ifdef  DEBUG
 615                 newpage = smp->sm_flags & SM_KPM_NEWPAGE;
 616                 if (newpage) {
 617                         cmn_err(CE_WARN, "segmap_fault: newpage? smp %p",
 618                             (void *)smp);
 619                 }
 620
 621                 if (type != F_SOFTUNLOCK) {
 622                         mutex_exit(smtx);
 623                         return (0);
 624                 }
 625 #endif
 626                 mutex_exit(smtx);
 627                 vp = smp->sm_vp;
 628                 sm_off = smp->sm_off;
 629
 630                 if (vp == NULL)
 631                         return (FC_MAKE_ERR(EIO));
 632
 633                 ASSERT(smp->sm_refcnt > 0);
 634
 635                 addroff = (uoff_t)((uintptr_t)addr & MAXBOFFSET);
 636                 if (addroff + len > MAXBSIZE)
 637                         panic("segmap_fault: endaddr %p exceeds MAXBSIZE chunk",
 638                             (void *)(addr + len));
 639
 640                 off = sm_off + addroff;
 641
 642                 pp = page_find(&vp->v_object, off);
 643
 644                 if (pp == NULL)
 645                         panic("segmap_fault: softunlock page not found");
 646
 647                 /*
 648                  * Set ref bit also here in case of S_OTHER to avoid the
 649                  * overhead of supporting other cases than F_SOFTUNLOCK
 650                  * with segkpm. We can do this because the underlying
 651                  * pages are locked anyway.
 652                  */
 653                 if (rw == S_WRITE) {
 654                         hat_setrefmod(pp);
 655                 } else {
 656                         hat_setref(pp);
 657                 }
 658
 659                 return (0);
 660         }
 661
 662         smd_cpu[CPU->cpu_seqid].scpu.scpu_fault++;
 663         smp = GET_SMAP(seg, addr);
 664         vp = smp->sm_vp;
 665         sm_off = smp->sm_off;
 666
 667         if (vp == NULL)
 668                 return (FC_MAKE_ERR(EIO));
 669
 670         ASSERT(smp->sm_refcnt > 0);
 671
 672         addroff = (uoff_t)((uintptr_t)addr & MAXBOFFSET);
 673         if (addroff + len > MAXBSIZE) {
 674                 panic("segmap_fault: endaddr %p "
 675                     "exceeds MAXBSIZE chunk", (void *)(addr + len));
 676                 /*NOTREACHED*/
 677         }
 678         off = sm_off + addroff;
 679
 680         /*
 681          * First handle the easy stuff
 682          */
 683         if (type == F_SOFTUNLOCK) {
 684                 segmap_unlock(hat, seg, addr, len, rw, smp);
 685                 return (0);
 686         }
 687
 688         err = fop_getpage(vp, (offset_t)off, len, &prot, pl, MAXBSIZE,
 689             seg, addr, rw, CRED(), NULL);
 690
 691         if (err)
 692                 return (FC_MAKE_ERR(err));
 693
 694         prot &= smd->smd_prot;
 695
 696         /*
 697          * Handle all pages returned in the pl[] array.
 698          * This loop is coded on the assumption that if
 699          * there was no error from the fop_getpage routine,
 700          * that the page list returned will contain all the
 701          * needed pages for the vp from [off..off + len].
 702          */
 703         ppp = pl;
 704         while ((pp = *ppp++) != NULL) {
 705                 uoff_t poff;
 706                 VERIFY(pp->p_object == &vp->v_object);
 707                 ASSERT(pp->p_vnode == vp);
 708                 hat_flag = HAT_LOAD;
 709
 710                 /*
 711                  * Verify that the pages returned are within the range
 712                  * of this segmap region.  Note that it is theoretically
 713                  * possible for pages outside this range to be returned,
 714                  * but it is not very likely.  If we cannot use the
 715                  * page here, just release it and go on to the next one.
 716                  */
 717                 if (pp->p_offset < sm_off ||
 718                     pp->p_offset >= sm_off + MAXBSIZE) {
 719                         (void) page_release(pp, 1);
 720                         continue;
 721                 }
 722
 723                 ASSERT(hat == kas.a_hat);
 724                 poff = pp->p_offset;
 725                 adr = addr + (poff - off);
 726                 if (adr >= addr && adr < addr + len) {
 727                         hat_setref(pp);
 728                         if (type == F_SOFTLOCK)
 729                                 hat_flag = HAT_LOAD_LOCK;
 730                 }
 731
 732                 /*
 733                  * Deal with VMODSORT pages here. If we know this is a write
 734                  * do the setmod now and allow write protection.
 735                  * As long as it's modified or not S_OTHER, remove write
 736                  * protection. With S_OTHER it's up to the FS to deal with this.
 737                  */
 738                 if (IS_VMODSORT(vp)) {
 739                         if (rw == S_WRITE)
 740                                 hat_setmod(pp);
 741                         else if (rw != S_OTHER && !hat_ismod(pp))
 742                                 prot &= ~PROT_WRITE;
 743                 }
 744
 745                 hat_memload(hat, adr, pp, prot, hat_flag);
 746                 if (hat_flag != HAT_LOAD_LOCK)
 747                         page_unlock(pp);
 748         }
 749         return (0);
 750 }
 751
 752 /*
 753  * This routine is used to start I/O on pages asynchronously.
 754  */
 755 static faultcode_t
 756 segmap_faulta(struct seg *seg, caddr_t addr)
 757 {
 758         struct smap *smp;
 759         struct vnode *vp;
 760         uoff_t off;
 761         int err;
 762
 763         if (segmap_kpm && IS_KPM_ADDR(addr)) {
 764                 int     newpage;
 765                 kmutex_t *smtx;
 766
 767                 /*
 768                  * Pages are successfully prefaulted and locked in
 769                  * segmap_getmapflt and can't be unlocked until
 770                  * segmap_release. No hat mappings have to be locked
 771                  * and they also can't be unlocked as long as the
 772                  * caller owns an active kpm addr.
 773                  */
 774 #ifdef  DEBUG
 775                 if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
 776                         panic("segmap_faulta: smap not found "
 777                             "for addr %p", (void *)addr);
 778                         /*NOTREACHED*/
 779                 }
 780
 781                 smtx = SMAPMTX(smp);
 782                 newpage = smp->sm_flags & SM_KPM_NEWPAGE;
 783                 mutex_exit(smtx);
 784                 if (newpage)
 785                         cmn_err(CE_WARN, "segmap_faulta: newpage? smp %p",
 786                             (void *)smp);
 787 #endif
 788                 return (0);
 789         }
 790
 791         segmapcnt.smp_faulta.value.ul++;
 792         smp = GET_SMAP(seg, addr);
 793
 794         ASSERT(smp->sm_refcnt > 0);
 795
 796         vp = smp->sm_vp;
 797         off = smp->sm_off;
 798
 799         if (vp == NULL) {
 800                 cmn_err(CE_WARN, "segmap_faulta - no vp");
 801                 return (FC_MAKE_ERR(EIO));
 802         }
 803
 804         err = fop_getpage(vp, (offset_t)(off + ((offset_t)((uintptr_t)addr
 805             & MAXBOFFSET))), PAGESIZE, (uint_t *)NULL, (page_t **)NULL, 0,
 806             seg, addr, S_READ, CRED(), NULL);
 807
 808         if (err)
 809                 return (FC_MAKE_ERR(err));
 810         return (0);
 811 }
 812
 813 /*ARGSUSED*/
 814 static int
 815 segmap_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
 816 {
 817         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
 818
 819         ASSERT(seg->s_as && RW_LOCK_HELD(&seg->s_as->a_lock));
 820
 821         /*
 822          * Need not acquire the segment lock since
 823          * "smd_prot" is a read-only field.
 824          */
 825         return (((smd->smd_prot & prot) != prot) ? EACCES : 0);
 826 }
 827
 828 static int
 829 segmap_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv)
 830 {
 831         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
 832         size_t pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1;
 833
 834         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
 835
 836         if (pgno != 0) {
 837                 do {
 838                         protv[--pgno] = smd->smd_prot;
 839                 } while (pgno != 0);
 840         }
 841         return (0);
 842 }
 843
 844 static uoff_t
 845 segmap_getoffset(struct seg *seg, caddr_t addr)
 846 {
 847         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
 848
 849         ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock));
 850
 851         return ((uoff_t)smd->smd_sm->sm_off + (addr - seg->s_base));
 852 }
 853
 854 /*ARGSUSED*/
 855 static int
 856 segmap_gettype(struct seg *seg, caddr_t addr)
 857 {
 858         ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock));
 859
 860         return (MAP_SHARED);
 861 }
 862
 863 /*ARGSUSED*/
 864 static int
 865 segmap_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
 866 {
 867         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
 868
 869         ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock));
 870
 871         /* XXX - This doesn't make any sense */
 872         *vpp = smd->smd_sm->sm_vp;
 873         return (0);
 874 }
 875
 876 /*
 877  * Check to see if it makes sense to do kluster/read ahead to
 878  * addr + delta relative to the mapping at addr.  We assume here
 879  * that delta is a signed PAGESIZE'd multiple (which can be negative).
 880  *
 881  * For segmap we always "approve" of this action from our standpoint.
 882  */
 883 /*ARGSUSED*/
 884 static int
 885 segmap_kluster(struct seg *seg, caddr_t addr, ssize_t delta)
 886 {
 887         return (0);
 888 }
 889
 890 static void
 891 segmap_badop()
 892 {
 893         panic("segmap_badop");
 894         /*NOTREACHED*/
 895 }
 896
 897 /*
 898  * Special private segmap operations
 899  */
 900
 901 /*
 902  * Add smap to the appropriate free list.
 903  */
 904 static void
 905 segmap_smapadd(struct smap *smp)
 906 {
 907         struct smfree *sm;
 908         struct smap *smpfreelist;
 909         struct sm_freeq *releq;
 910
 911         ASSERT(MUTEX_HELD(SMAPMTX(smp)));
 912
 913         if (smp->sm_refcnt != 0) {
 914                 panic("segmap_smapadd");
 915                 /*NOTREACHED*/
 916         }
 917
 918         sm = &smd_free[smp->sm_free_ndx];
 919         /*
 920          * Add to the tail of the release queue
 921          * Note that sm_releq and sm_allocq could toggle
 922          * before we get the lock. This does not affect
 923          * correctness as the 2 queues are only maintained
 924          * to reduce lock pressure.
 925          */
 926         releq = sm->sm_releq;
 927         if (releq == &sm->sm_freeq[0])
 928                 smp->sm_flags |= SM_QNDX_ZERO;
 929         else
 930                 smp->sm_flags &= ~SM_QNDX_ZERO;
 931         mutex_enter(&releq->smq_mtx);
 932         smpfreelist = releq->smq_free;
 933         if (smpfreelist == 0) {
 934                 int want;
 935
 936                 releq->smq_free = smp->sm_next = smp->sm_prev = smp;
 937                 /*
 938                  * Both queue mutexes held to set sm_want;
 939                  * snapshot the value before dropping releq mutex.
 940                  * If sm_want appears after the releq mutex is dropped,
 941                  * then the smap just freed is already gone.
 942                  */
 943                 want = sm->sm_want;
 944                 mutex_exit(&releq->smq_mtx);
 945                 /*
 946                  * See if there was a waiter before dropping the releq mutex
 947                  * then recheck after obtaining sm_freeq[0] mutex as
 948                  * the another thread may have already signaled.
 949                  */
 950                 if (want) {
 951                         mutex_enter(&sm->sm_freeq[0].smq_mtx);
 952                         if (sm->sm_want)
 953                                 cv_signal(&sm->sm_free_cv);
 954                         mutex_exit(&sm->sm_freeq[0].smq_mtx);
 955                 }
 956         } else {
 957                 smp->sm_next = smpfreelist;
 958                 smp->sm_prev = smpfreelist->sm_prev;
 959                 smpfreelist->sm_prev = smp;
 960                 smp->sm_prev->sm_next = smp;
 961                 mutex_exit(&releq->smq_mtx);
 962         }
 963 }
 964
 965
 966 static struct smap *
 967 segmap_hashin(struct smap *smp, struct vnode *vp, uoff_t off, int hashid)
 968 {
 969         struct smap **hpp;
 970         struct smap *tmp;
 971         kmutex_t *hmtx;
 972
 973         ASSERT(MUTEX_HELD(SMAPMTX(smp)));
 974         ASSERT(smp->sm_vp == NULL);
 975         ASSERT(smp->sm_hash == NULL);
 976         ASSERT(smp->sm_prev == NULL);
 977         ASSERT(smp->sm_next == NULL);
 978         ASSERT(hashid >= 0 && hashid <= smd_hashmsk);
 979
 980         hmtx = SHASHMTX(hashid);
 981
 982         mutex_enter(hmtx);
 983         /*
 984          * First we need to verify that no one has created a smp
 985          * with (vp,off) as its tag before we us.
 986          */
 987         for (tmp = smd_hash[hashid].sh_hash_list;
 988             tmp != NULL; tmp = tmp->sm_hash)
 989                 if (tmp->sm_vp == vp && tmp->sm_off == off)
 990                         break;
 991
 992         if (tmp == NULL) {
 993                 /*
 994                  * No one created one yet.
 995                  *
 996                  * Funniness here - we don't increment the ref count on the
 997                  * vnode * even though we have another pointer to it here.
 998                  * The reason for this is that we don't want the fact that
 999                  * a seg_map entry somewhere refers to a vnode to prevent the
1000                  * vnode * itself from going away.  This is because this
1001                  * reference to the vnode is a "soft one".  In the case where
1002                  * a mapping is being used by a rdwr [or directory routine?]
1003                  * there already has to be a non-zero ref count on the vnode.
1004                  * In the case where the vp has been freed and the the smap
1005                  * structure is on the free list, there are no pages in memory
1006                  * that can refer to the vnode.  Thus even if we reuse the same
1007                  * vnode/smap structure for a vnode which has the same
1008                  * address but represents a different object, we are ok.
1009                  */
1010                 smp->sm_vp = vp;
1011                 smp->sm_off = off;
1012
1013                 hpp = &smd_hash[hashid].sh_hash_list;
1014                 smp->sm_hash = *hpp;
1015                 *hpp = smp;
1016 #ifdef SEGMAP_HASHSTATS
1017                 smd_hash_len[hashid]++;
1018 #endif
1019         }
1020         mutex_exit(hmtx);
1021
1022         return (tmp);
1023 }
1024
1025 static void
1026 segmap_hashout(struct smap *smp)
1027 {
1028         struct smap **hpp, *hp;
1029         struct vnode *vp;
1030         kmutex_t *mtx;
1031         int hashid;
1032         uoff_t off;
1033
1034         ASSERT(MUTEX_HELD(SMAPMTX(smp)));
1035
1036         vp = smp->sm_vp;
1037         off = smp->sm_off;
1038
1039         SMAP_HASHFUNC(vp, off, hashid); /* macro assigns hashid */
1040         mtx = SHASHMTX(hashid);
1041         mutex_enter(mtx);
1042
1043         hpp = &smd_hash[hashid].sh_hash_list;
1044         for (;;) {
1045                 hp = *hpp;
1046                 if (hp == NULL) {
1047                         panic("segmap_hashout");
1048                         /*NOTREACHED*/
1049                 }
1050                 if (hp == smp)
1051                         break;
1052                 hpp = &hp->sm_hash;
1053         }
1054
1055         *hpp = smp->sm_hash;
1056         smp->sm_hash = NULL;
1057 #ifdef SEGMAP_HASHSTATS
1058         smd_hash_len[hashid]--;
1059 #endif
1060         mutex_exit(mtx);
1061
1062         smp->sm_vp = NULL;
1063         smp->sm_off = 0;
1064
1065 }
1066
1067 /*
1068  * Attempt to free unmodified, unmapped, and non locked segmap
1069  * pages.
1070  */
1071 void
1072 segmap_pagefree(struct vnode *vp, uoff_t off)
1073 {
1074         uoff_t pgoff;
1075         page_t  *pp;
1076
1077         for (pgoff = off; pgoff < off + MAXBSIZE; pgoff += PAGESIZE) {
1078
1079                 if ((pp = page_lookup_nowait(&vp->v_object, pgoff, SE_EXCL)) == NULL)
1080                         continue;
1081
1082                 switch (page_release(pp, 1)) {
1083                 case PGREL_NOTREL:
1084                         segmapcnt.smp_free_notfree.value.ul++;
1085                         break;
1086                 case PGREL_MOD:
1087                         segmapcnt.smp_free_dirty.value.ul++;
1088                         break;
1089                 case PGREL_CLEAN:
1090                         segmapcnt.smp_free.value.ul++;
1091                         break;
1092                 }
1093         }
1094 }
1095
1096 /*
1097  * Locks held on entry: smap lock
1098  * Locks held on exit : smap lock.
1099  */
1100
1101 static void
1102 grab_smp(struct smap *smp, page_t *pp)
1103 {
1104         ASSERT(MUTEX_HELD(SMAPMTX(smp)));
1105         ASSERT(smp->sm_refcnt == 0);
1106
1107         if (smp->sm_vp != NULL) {
1108                 struct vnode    *vp = smp->sm_vp;
1109                 uoff_t  off = smp->sm_off;
1110                 /*
1111                  * Destroy old vnode association and
1112                  * unload any hardware translations to
1113                  * the old object.
1114                  */
1115                 smd_cpu[CPU->cpu_seqid].scpu.scpu_get_reuse++;
1116                 segmap_hashout(smp);
1117
1118                 /*
1119                  * This node is off freelist and hashlist,
1120                  * so there is no reason to drop/reacquire sm_mtx
1121                  * across calls to hat_unload.
1122                  */
1123                 if (segmap_kpm) {
1124                         caddr_t vaddr;
1125                         int hat_unload_needed = 0;
1126
1127                         /*
1128                          * unload kpm mapping
1129                          */
1130                         if (pp != NULL) {
1131                                 vaddr = hat_kpm_page2va(pp, 1);
1132                                 hat_kpm_mapout(pp, GET_KPME(smp), vaddr);
1133                                 page_unlock(pp);
1134                         }
1135
1136                         /*
1137                          * Check if we have (also) the rare case of a
1138                          * non kpm mapping.
1139                          */
1140                         if (smp->sm_flags & SM_NOTKPM_RELEASED) {
1141                                 hat_unload_needed = 1;
1142                                 smp->sm_flags &= ~SM_NOTKPM_RELEASED;
1143                         }
1144
1145                         if (hat_unload_needed) {
1146                                 hat_unload(kas.a_hat, segkmap->s_base +
1147                                     ((smp - smd_smap) * MAXBSIZE),
1148                                     MAXBSIZE, HAT_UNLOAD);
1149                         }
1150
1151                 } else {
1152                         ASSERT(smp->sm_flags & SM_NOTKPM_RELEASED);
1153                         smp->sm_flags &= ~SM_NOTKPM_RELEASED;
1154                         hat_unload(kas.a_hat, segkmap->s_base +
1155                             ((smp - smd_smap) * MAXBSIZE),
1156                             MAXBSIZE, HAT_UNLOAD);
1157                 }
1158                 segmap_pagefree(vp, off);
1159         }
1160 }
1161
1162 static struct smap *
1163 get_free_smp(int free_ndx)
1164 {
1165         struct smfree *sm;
1166         kmutex_t *smtx;
1167         struct smap *smp, *first;
1168         struct sm_freeq *allocq, *releq;
1169         struct kpme *kpme;
1170         page_t *pp = NULL;
1171         int end_ndx, page_locked = 0;
1172
1173         end_ndx = free_ndx;
1174         sm = &smd_free[free_ndx];
1175
1176 retry_queue:
1177         allocq = sm->sm_allocq;
1178         mutex_enter(&allocq->smq_mtx);
1179
1180         if ((smp = allocq->smq_free) == NULL) {
1181
1182 skip_queue:
1183                 /*
1184                  * The alloc list is empty or this queue is being skipped;
1185                  * first see if the allocq toggled.
1186                  */
1187                 if (sm->sm_allocq != allocq) {
1188                         /* queue changed */
1189                         mutex_exit(&allocq->smq_mtx);
1190                         goto retry_queue;
1191                 }
1192                 releq = sm->sm_releq;
1193                 if (!mutex_tryenter(&releq->smq_mtx)) {
1194                         /* cannot get releq; a free smp may be there now */
1195                         mutex_exit(&allocq->smq_mtx);
1196
1197                         /*
1198                          * This loop could spin forever if this thread has
1199                          * higher priority than the thread that is holding
1200                          * releq->smq_mtx. In order to force the other thread
1201                          * to run, we'll lock/unlock the mutex which is safe
1202                          * since we just unlocked the allocq mutex.
1203                          */
1204                         mutex_enter(&releq->smq_mtx);
1205                         mutex_exit(&releq->smq_mtx);
1206                         goto retry_queue;
1207                 }
1208                 if (releq->smq_free == NULL) {
1209                         /*
1210                          * This freelist is empty.
1211                          * This should not happen unless clients
1212                          * are failing to release the segmap
1213                          * window after accessing the data.
1214                          * Before resorting to sleeping, try
1215                          * the next list of the same color.
1216                          */
1217                         free_ndx = (free_ndx + smd_ncolor) & smd_freemsk;
1218                         if (free_ndx != end_ndx) {
1219                                 mutex_exit(&releq->smq_mtx);
1220                                 mutex_exit(&allocq->smq_mtx);
1221                                 sm = &smd_free[free_ndx];
1222                                 goto retry_queue;
1223                         }
1224                         /*
1225                          * Tried all freelists of the same color once,
1226                          * wait on this list and hope something gets freed.
1227                          */
1228                         segmapcnt.smp_get_nofree.value.ul++;
1229                         sm->sm_want++;
1230                         mutex_exit(&sm->sm_freeq[1].smq_mtx);
1231                         cv_wait(&sm->sm_free_cv,
1232                             &sm->sm_freeq[0].smq_mtx);
1233                         sm->sm_want--;
1234                         mutex_exit(&sm->sm_freeq[0].smq_mtx);
1235                         sm = &smd_free[free_ndx];
1236                         goto retry_queue;
1237                 } else {
1238                         /*
1239                          * Something on the rele queue; flip the alloc
1240                          * and rele queues and retry.
1241                          */
1242                         sm->sm_allocq = releq;
1243                         sm->sm_releq = allocq;
1244                         mutex_exit(&allocq->smq_mtx);
1245                         mutex_exit(&releq->smq_mtx);
1246                         if (page_locked) {
1247                                 ddi_msleep(250);
1248                                 page_locked = 0;
1249                         }
1250                         goto retry_queue;
1251                 }
1252         } else {
1253                 /*
1254                  * Fastpath the case we get the smap mutex
1255                  * on the first try.
1256                  */
1257                 first = smp;
1258 next_smap:
1259                 smtx = SMAPMTX(smp);
1260                 if (!mutex_tryenter(smtx)) {
1261                         /*
1262                          * Another thread is trying to reclaim this slot.
1263                          * Skip to the next queue or smap.
1264                          */
1265                         if ((smp = smp->sm_next) == first) {
1266                                 goto skip_queue;
1267                         } else {
1268                                 goto next_smap;
1269                         }
1270                 } else {
1271                         /*
1272                          * if kpme exists, get shared lock on the page
1273                          */
1274                         if (segmap_kpm && smp->sm_vp != NULL) {
1275
1276                                 kpme = GET_KPME(smp);
1277                                 pp = kpme->kpe_page;
1278
1279                                 if (pp != NULL) {
1280                                         if (!page_trylock(pp, SE_SHARED)) {
1281                                                 smp = smp->sm_next;
1282                                                 mutex_exit(smtx);
1283                                                 page_locked = 1;
1284
1285                                                 pp = NULL;
1286
1287                                                 if (smp == first) {
1288                                                         goto skip_queue;
1289                                                 } else {
1290                                                         goto next_smap;
1291                                                 }
1292                                         } else {
1293                                                 if (kpme->kpe_page == NULL) {
1294                                                         page_unlock(pp);
1295                                                         pp = NULL;
1296                                                 }
1297                                         }
1298                                 }
1299                         }
1300
1301                         /*
1302                          * At this point, we've selected smp.  Remove smp
1303                          * from its freelist.  If smp is the first one in
1304                          * the freelist, update the head of the freelist.
1305                          */
1306                         if (first == smp) {
1307                                 ASSERT(first == allocq->smq_free);
1308                                 allocq->smq_free = smp->sm_next;
1309                         }
1310
1311                         /*
1312                          * if the head of the freelist still points to smp,
1313                          * then there are no more free smaps in that list.
1314                          */
1315                         if (allocq->smq_free == smp)
1316                                 /*
1317                                  * Took the last one
1318                                  */
1319                                 allocq->smq_free = NULL;
1320                         else {
1321                                 smp->sm_prev->sm_next = smp->sm_next;
1322                                 smp->sm_next->sm_prev = smp->sm_prev;
1323                         }
1324                         mutex_exit(&allocq->smq_mtx);
1325                         smp->sm_prev = smp->sm_next = NULL;
1326
1327                         /*
1328                          * if pp != NULL, pp must have been locked;
1329                          * grab_smp() unlocks pp.
1330                          */
1331                         ASSERT((pp == NULL) || PAGE_LOCKED(pp));
1332                         grab_smp(smp, pp);
1333                         /* return smp locked. */
1334                         ASSERT(SMAPMTX(smp) == smtx);
1335                         ASSERT(MUTEX_HELD(smtx));
1336                         return (smp);
1337                 }
1338         }
1339 }
1340
1341 /*
1342  * Special public segmap operations
1343  */
1344
1345 /*
1346  * Create pages (without using fop_getpage) and load up translations to them.
1347  * If softlock is TRUE, then set things up so that it looks like a call
1348  * to segmap_fault with F_SOFTLOCK.
1349  *
1350  * Returns 1, if a page is created by calling page_create_va(), or 0 otherwise.
1351  *
1352  * All fields in the generic segment (struct seg) are considered to be
1353  * read-only for "segmap" even though the kernel address space (kas) may
1354  * not be locked, hence no lock is needed to access them.
1355  */
1356 int
1357 segmap_pagecreate(struct seg *seg, caddr_t addr, size_t len, int softlock)
1358 {
1359         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
1360         page_t *pp;
1361         uoff_t off;
1362         struct smap *smp;
1363         struct vnode *vp;
1364         caddr_t eaddr;
1365         int newpage = 0;
1366         uint_t prot;
1367         kmutex_t *smtx;
1368         int hat_flag;
1369
1370         ASSERT(seg->s_as == &kas);
1371
1372         if (segmap_kpm && IS_KPM_ADDR(addr)) {
1373                 /*
1374                  * Pages are successfully prefaulted and locked in
1375                  * segmap_getmapflt and can't be unlocked until
1376                  * segmap_release. The SM_KPM_NEWPAGE flag is set
1377                  * in segmap_pagecreate_kpm when new pages are created.
1378                  * and it is returned as "newpage" indication here.
1379                  */
1380                 if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
1381                         panic("segmap_pagecreate: smap not found "
1382                             "for addr %p", (void *)addr);
1383                         /*NOTREACHED*/
1384                 }
1385
1386                 smtx = SMAPMTX(smp);
1387                 newpage = smp->sm_flags & SM_KPM_NEWPAGE;
1388                 smp->sm_flags &= ~SM_KPM_NEWPAGE;
1389                 mutex_exit(smtx);
1390
1391                 return (newpage);
1392         }
1393
1394         smd_cpu[CPU->cpu_seqid].scpu.scpu_pagecreate++;
1395
1396         eaddr = addr + len;
1397         addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1398
1399         smp = GET_SMAP(seg, addr);
1400
1401         /*
1402          * We don't grab smp mutex here since we assume the smp
1403          * has a refcnt set already which prevents the slot from
1404          * changing its id.
1405          */
1406         ASSERT(smp->sm_refcnt > 0);
1407
1408         vp = smp->sm_vp;
1409         off = smp->sm_off + ((uoff_t)((uintptr_t)addr & MAXBOFFSET));
1410         prot = smd->smd_prot;
1411
1412         for (; addr < eaddr; addr += PAGESIZE, off += PAGESIZE) {
1413                 hat_flag = HAT_LOAD;
1414                 pp = page_lookup(&vp->v_object, off, SE_SHARED);
1415                 if (pp == NULL) {
1416                         ushort_t bitindex;
1417
1418                         if ((pp = page_create_va(&vp->v_object, off,
1419                             PAGESIZE, PG_WAIT, seg, addr)) == NULL) {
1420                                 panic("segmap_pagecreate: page_create failed");
1421                                 /*NOTREACHED*/
1422                         }
1423                         newpage = 1;
1424                         page_io_unlock(pp);
1425
1426                         /*
1427                          * Since pages created here do not contain valid
1428                          * data until the caller writes into them, the
1429                          * "exclusive" lock will not be dropped to prevent
1430                          * other users from accessing the page.  We also
1431                          * have to lock the translation to prevent a fault
1432                          * from occurring when the virtual address mapped by
1433                          * this page is written into.  This is necessary to
1434                          * avoid a deadlock since we haven't dropped the
1435                          * "exclusive" lock.
1436                          */
1437                         bitindex = (ushort_t)((off - smp->sm_off) >> PAGESHIFT);
1438
1439                         /*
1440                          * Large Files: The following assertion is to
1441                          * verify the cast above.
1442                          */
1443                         ASSERT((uoff_t)(off - smp->sm_off) <= INT_MAX);
1444                         smtx = SMAPMTX(smp);
1445                         mutex_enter(smtx);
1446                         smp->sm_bitmap |= SMAP_BIT_MASK(bitindex);
1447                         mutex_exit(smtx);
1448
1449                         hat_flag = HAT_LOAD_LOCK;
1450                 } else if (softlock) {
1451                         hat_flag = HAT_LOAD_LOCK;
1452                 }
1453
1454                 if (IS_VMODSORT(pp->p_vnode) && (prot & PROT_WRITE))
1455                         hat_setmod(pp);
1456
1457                 hat_memload(kas.a_hat, addr, pp, prot, hat_flag);
1458
1459                 if (hat_flag != HAT_LOAD_LOCK)
1460                         page_unlock(pp);
1461         }
1462
1463         return (newpage);
1464 }
1465
1466 void
1467 segmap_pageunlock(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw)
1468 {
1469         struct smap     *smp;
1470         ushort_t        bitmask;
1471         page_t          *pp;
1472         struct  vnode   *vp;
1473         uoff_t  off;
1474         caddr_t         eaddr;
1475         kmutex_t        *smtx;
1476
1477         ASSERT(seg->s_as == &kas);
1478
1479         eaddr = addr + len;
1480         addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1481
1482         if (segmap_kpm && IS_KPM_ADDR(addr)) {
1483                 /*
1484                  * Pages are successfully prefaulted and locked in
1485                  * segmap_getmapflt and can't be unlocked until
1486                  * segmap_release, so no pages or hat mappings have
1487                  * to be unlocked at this point.
1488                  */
1489 #ifdef DEBUG
1490                 if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
1491                         panic("segmap_pageunlock: smap not found "
1492                             "for addr %p", (void *)addr);
1493                         /*NOTREACHED*/
1494                 }
1495
1496                 ASSERT(smp->sm_refcnt > 0);
1497                 mutex_exit(SMAPMTX(smp));
1498 #endif
1499                 return;
1500         }
1501
1502         smp = GET_SMAP(seg, addr);
1503         smtx = SMAPMTX(smp);
1504
1505         ASSERT(smp->sm_refcnt > 0);
1506
1507         vp = smp->sm_vp;
1508         off = smp->sm_off + ((uoff_t)((uintptr_t)addr & MAXBOFFSET));
1509
1510         for (; addr < eaddr; addr += PAGESIZE, off += PAGESIZE) {
1511                 bitmask = SMAP_BIT_MASK((int)(off - smp->sm_off) >> PAGESHIFT);
1512
1513                 /*
1514                  * Large Files: Following assertion is to verify
1515                  * the correctness of the cast to (int) above.
1516                  */
1517                 ASSERT((uoff_t)(off - smp->sm_off) <= INT_MAX);
1518
1519                 /*
1520                  * If the bit corresponding to "off" is set,
1521                  * clear this bit in the bitmap, unlock translations,
1522                  * and release the "exclusive" lock on the page.
1523                  */
1524                 if (smp->sm_bitmap & bitmask) {
1525                         mutex_enter(smtx);
1526                         smp->sm_bitmap &= ~bitmask;
1527                         mutex_exit(smtx);
1528
1529                         hat_unlock(kas.a_hat, addr, PAGESIZE);
1530
1531                         /*
1532                          * Use page_find() instead of page_lookup() to
1533                          * find the page since we know that it has
1534                          * "exclusive" lock.
1535                          */
1536                         pp = page_find(&vp->v_object, off);
1537                         if (pp == NULL) {
1538                                 panic("segmap_pageunlock: page not found");
1539                                 /*NOTREACHED*/
1540                         }
1541                         if (rw == S_WRITE) {
1542                                 hat_setrefmod(pp);
1543                         } else if (rw != S_OTHER) {
1544                                 hat_setref(pp);
1545                         }
1546
1547                         page_unlock(pp);
1548                 }
1549         }
1550 }
1551
1552 caddr_t
1553 segmap_getmap(struct seg *seg, struct vnode *vp, uoff_t off)
1554 {
1555         return (segmap_getmapflt(seg, vp, off, MAXBSIZE, 0, S_OTHER));
1556 }
1557
1558 /*
1559  * This is the magic virtual address that offset 0 of an ELF
1560  * file gets mapped to in user space. This is used to pick
1561  * the vac color on the freelist.
1562  */
1563 #define ELF_OFFZERO_VA  (0x10000)
1564 /*
1565  * segmap_getmap allocates a MAXBSIZE big slot to map the vnode vp
1566  * in the range <off, off + len). off doesn't need to be MAXBSIZE aligned.
1567  * The return address is  always MAXBSIZE aligned.
1568  *
1569  * If forcefault is nonzero and the MMU translations haven't yet been created,
1570  * segmap_getmap will call segmap_fault(..., F_INVAL, rw) to create them.
1571  */
1572 caddr_t
1573 segmap_getmapflt(
1574         struct seg *seg,
1575         struct vnode *vp,
1576         uoff_t off,
1577         size_t len,
1578         int forcefault,
1579         enum seg_rw rw)
1580 {
1581         struct smap *smp, *nsmp;
1582         extern struct vnode *common_specvp();
1583         caddr_t baseaddr;                       /* MAXBSIZE aligned */
1584         uoff_t baseoff;
1585         int newslot;
1586         caddr_t vaddr;
1587         int color, hashid;
1588         kmutex_t *hashmtx, *smapmtx;
1589         struct smfree *sm;
1590         page_t  *pp;
1591         struct kpme *kpme;
1592         uint_t  prot;
1593         caddr_t base;
1594         page_t  *pl[MAXPPB + 1];
1595         int     error;
1596         int     is_kpm = 1;
1597
1598         ASSERT(seg->s_as == &kas);
1599         ASSERT(seg == segkmap);
1600
1601         baseoff = off & (offset_t)MAXBMASK;
1602         if (off + len > baseoff + MAXBSIZE) {
1603                 panic("segmap_getmap bad len");
1604                 /*NOTREACHED*/
1605         }
1606
1607         /*
1608          * If this is a block device we have to be sure to use the
1609          * "common" block device vnode for the mapping.
1610          */
1611         if (vp->v_type == VBLK)
1612                 vp = common_specvp(vp);
1613
1614         smd_cpu[CPU->cpu_seqid].scpu.scpu_getmap++;
1615
1616         if (segmap_kpm == 0 ||
1617             (forcefault == SM_PAGECREATE && rw != S_WRITE)) {
1618                 is_kpm = 0;
1619         }
1620
1621         SMAP_HASHFUNC(vp, off, hashid); /* macro assigns hashid */
1622         hashmtx = SHASHMTX(hashid);
1623
1624 retry_hash:
1625         mutex_enter(hashmtx);
1626         for (smp = smd_hash[hashid].sh_hash_list;
1627             smp != NULL; smp = smp->sm_hash)
1628                 if (smp->sm_vp == vp && smp->sm_off == baseoff)
1629                         break;
1630         mutex_exit(hashmtx);
1631
1632 vrfy_smp:
1633         if (smp != NULL) {
1634
1635                 ASSERT(vp->v_count != 0);
1636
1637                 /*
1638                  * Get smap lock and recheck its tag. The hash lock
1639                  * is dropped since the hash is based on (vp, off)
1640                  * and (vp, off) won't change when we have smap mtx.
1641                  */
1642                 smapmtx = SMAPMTX(smp);
1643                 mutex_enter(smapmtx);
1644                 if (smp->sm_vp != vp || smp->sm_off != baseoff) {
1645                         mutex_exit(smapmtx);
1646                         goto retry_hash;
1647                 }
1648
1649                 if (smp->sm_refcnt == 0) {
1650
1651                         smd_cpu[CPU->cpu_seqid].scpu.scpu_get_reclaim++;
1652
1653                         /*
1654                          * Could still be on the free list. However, this
1655                          * could also be an smp that is transitioning from
1656                          * the free list when we have too much contention
1657                          * for the smapmtx's. In this case, we have an
1658                          * unlocked smp that is not on the free list any
1659                          * longer, but still has a 0 refcnt.  The only way
1660                          * to be sure is to check the freelist pointers.
1661                          * Since we now have the smapmtx, we are guaranteed
1662                          * that the (vp, off) won't change, so we are safe
1663                          * to reclaim it.  get_free_smp() knows that this
1664                          * can happen, and it will check the refcnt.
1665                          */
1666
1667                         if ((smp->sm_next != NULL)) {
1668                                 struct sm_freeq *freeq;
1669
1670                                 ASSERT(smp->sm_prev != NULL);
1671                                 sm = &smd_free[smp->sm_free_ndx];
1672
1673                                 if (smp->sm_flags & SM_QNDX_ZERO)
1674                                         freeq = &sm->sm_freeq[0];
1675                                 else
1676                                         freeq = &sm->sm_freeq[1];
1677
1678                                 mutex_enter(&freeq->smq_mtx);
1679                                 if (freeq->smq_free != smp) {
1680                                         /*
1681                                          * fastpath normal case
1682                                          */
1683                                         smp->sm_prev->sm_next = smp->sm_next;
1684                                         smp->sm_next->sm_prev = smp->sm_prev;
1685                                 } else if (smp == smp->sm_next) {
1686                                         /*
1687                                          * Taking the last smap on freelist
1688                                          */
1689                                         freeq->smq_free = NULL;
1690                                 } else {
1691                                         /*
1692                                          * Reclaiming 1st smap on list
1693                                          */
1694                                         freeq->smq_free = smp->sm_next;
1695                                         smp->sm_prev->sm_next = smp->sm_next;
1696                                         smp->sm_next->sm_prev = smp->sm_prev;
1697                                 }
1698                                 mutex_exit(&freeq->smq_mtx);
1699                                 smp->sm_prev = smp->sm_next = NULL;
1700                         } else {
1701                                 ASSERT(smp->sm_prev == NULL);
1702                                 segmapcnt.smp_stolen.value.ul++;
1703                         }
1704
1705                 } else {
1706                         segmapcnt.smp_get_use.value.ul++;
1707                 }
1708                 smp->sm_refcnt++;               /* another user */
1709
1710                 /*
1711                  * We don't invoke segmap_fault via TLB miss, so we set ref
1712                  * and mod bits in advance. For S_OTHER  we set them in
1713                  * segmap_fault F_SOFTUNLOCK.
1714                  */
1715                 if (is_kpm) {
1716                         if (rw == S_WRITE) {
1717                                 smp->sm_flags |= SM_WRITE_DATA;
1718                         } else if (rw == S_READ) {
1719                                 smp->sm_flags |= SM_READ_DATA;
1720                         }
1721                 }
1722                 mutex_exit(smapmtx);
1723
1724                 newslot = 0;
1725         } else {
1726
1727                 uint32_t free_ndx, *free_ndxp;
1728                 union segmap_cpu *scpu;
1729
1730                 /*
1731                  * On a PAC machine or a machine with anti-alias
1732                  * hardware, smd_colormsk will be zero.
1733                  *
1734                  * On a VAC machine- pick color by offset in the file
1735                  * so we won't get VAC conflicts on elf files.
1736                  * On data files, color does not matter but we
1737                  * don't know what kind of file it is so we always
1738                  * pick color by offset. This causes color
1739                  * corresponding to file offset zero to be used more
1740                  * heavily.
1741                  */
1742                 color = (baseoff >> MAXBSHIFT) & smd_colormsk;
1743                 scpu = smd_cpu+CPU->cpu_seqid;
1744                 free_ndxp = &scpu->scpu.scpu_free_ndx[color];
1745                 free_ndx = (*free_ndxp += smd_ncolor) & smd_freemsk;
1746 #ifdef DEBUG
1747                 colors_used[free_ndx]++;
1748 #endif /* DEBUG */
1749
1750                 /*
1751                  * Get a locked smp slot from the free list.
1752                  */
1753                 smp = get_free_smp(free_ndx);
1754                 smapmtx = SMAPMTX(smp);
1755
1756                 ASSERT(smp->sm_vp == NULL);
1757
1758                 if ((nsmp = segmap_hashin(smp, vp, baseoff, hashid)) != NULL) {
1759                         /*
1760                          * Failed to hashin, there exists one now.
1761                          * Return the smp we just allocated.
1762                          */
1763                         segmap_smapadd(smp);
1764                         mutex_exit(smapmtx);
1765
1766                         smp = nsmp;
1767                         goto vrfy_smp;
1768                 }
1769                 smp->sm_refcnt++;               /* another user */
1770
1771                 /*
1772                  * We don't invoke segmap_fault via TLB miss, so we set ref
1773                  * and mod bits in advance. For S_OTHER  we set them in
1774                  * segmap_fault F_SOFTUNLOCK.
1775                  */
1776                 if (is_kpm) {
1777                         if (rw == S_WRITE) {
1778                                 smp->sm_flags |= SM_WRITE_DATA;
1779                         } else if (rw == S_READ) {
1780                                 smp->sm_flags |= SM_READ_DATA;
1781                         }
1782                 }
1783                 mutex_exit(smapmtx);
1784
1785                 newslot = 1;
1786         }
1787
1788         if (!is_kpm)
1789                 goto use_segmap_range;
1790
1791         /*
1792          * Use segkpm
1793          */
1794         /* Lint directive required until 6746211 is fixed */
1795         /*CONSTCOND*/
1796         ASSERT(PAGESIZE == MAXBSIZE);
1797
1798         /*
1799          * remember the last smp faulted on this cpu.
1800          */
1801         (smd_cpu+CPU->cpu_seqid)->scpu.scpu_last_smap = smp;
1802
1803         if (forcefault == SM_PAGECREATE) {
1804                 baseaddr = segmap_pagecreate_kpm(seg, vp, baseoff, smp, rw);
1805                 return (baseaddr);
1806         }
1807
1808         if (newslot == 0 &&
1809             (pp = GET_KPME(smp)->kpe_page) != NULL) {
1810
1811                 /* fastpath */
1812                 switch (rw) {
1813                 case S_READ:
1814                 case S_WRITE:
1815                         if (page_trylock(pp, SE_SHARED)) {
1816                                 if (PP_ISFREE(pp) ||
1817                                     !(pp->p_vnode == vp &&
1818                                     pp->p_offset == baseoff)) {
1819                                         page_unlock(pp);
1820                                         pp = page_lookup(&vp->v_object,
1821                                                          baseoff, SE_SHARED);
1822                                 }
1823                         } else {
1824                                 pp = page_lookup(&vp->v_object, baseoff,
1825                                                  SE_SHARED);
1826                         }
1827
1828                         if (pp == NULL) {
1829                                 ASSERT(GET_KPME(smp)->kpe_page == NULL);
1830                                 break;
1831                         }
1832
1833                         if (rw == S_WRITE &&
1834                             hat_page_getattr(pp, P_MOD | P_REF) !=
1835                             (P_MOD | P_REF)) {
1836                                 page_unlock(pp);
1837                                 break;
1838                         }
1839
1840                         /*
1841                          * We have the p_selock as reader, grab_smp
1842                          * can't hit us, we have bumped the smap
1843                          * refcnt and hat_pageunload needs the
1844                          * p_selock exclusive.
1845                          */
1846                         kpme = GET_KPME(smp);
1847                         if (kpme->kpe_page == pp) {
1848                                 baseaddr = hat_kpm_page2va(pp, 0);
1849                         } else if (kpme->kpe_page == NULL) {
1850                                 baseaddr = hat_kpm_mapin(pp, kpme);
1851                         } else {
1852                                 panic("segmap_getmapflt: stale "
1853                                     "kpme page, kpme %p", (void *)kpme);
1854                                 /*NOTREACHED*/
1855                         }
1856
1857                         /*
1858                          * We don't invoke segmap_fault via TLB miss,
1859                          * so we set ref and mod bits in advance.
1860                          * For S_OTHER and we set them in segmap_fault
1861                          * F_SOFTUNLOCK.
1862                          */
1863                         if (rw == S_READ && !hat_isref(pp))
1864                                 hat_setref(pp);
1865
1866                         return (baseaddr);
1867                 default:
1868                         break;
1869                 }
1870         }
1871
1872         base = segkpm_create_va(baseoff);
1873         error = fop_getpage(vp, (offset_t)baseoff, len, &prot, pl, MAXBSIZE,
1874             seg, base, rw, CRED(), NULL);
1875
1876         pp = pl[0];
1877         if (error || pp == NULL) {
1878                 /*
1879                  * Use segmap address slot and let segmap_fault deal
1880                  * with the error cases. There is no error return
1881                  * possible here.
1882                  */
1883                 goto use_segmap_range;
1884         }
1885
1886         ASSERT(pl[1] == NULL);
1887
1888         /*
1889          * When prot is not returned w/ PROT_ALL the returned pages
1890          * are not backed by fs blocks. For most of the segmap users
1891          * this is no problem, they don't write to the pages in the
1892          * same request and therefore don't rely on a following
1893          * trap driven segmap_fault. With SM_LOCKPROTO users it
1894          * is more secure to use segkmap adresses to allow
1895          * protection segmap_fault's.
1896          */
1897         if (prot != PROT_ALL && forcefault == SM_LOCKPROTO) {
1898                 /*
1899                  * Use segmap address slot and let segmap_fault
1900                  * do the error return.
1901                  */
1902                 ASSERT(rw != S_WRITE);
1903                 ASSERT(PAGE_LOCKED(pp));
1904                 page_unlock(pp);
1905                 forcefault = 0;
1906                 goto use_segmap_range;
1907         }
1908
1909         /*
1910          * We have the p_selock as reader, grab_smp can't hit us, we
1911          * have bumped the smap refcnt and hat_pageunload needs the
1912          * p_selock exclusive.
1913          */
1914         kpme = GET_KPME(smp);
1915         if (kpme->kpe_page == pp) {
1916                 baseaddr = hat_kpm_page2va(pp, 0);
1917         } else if (kpme->kpe_page == NULL) {
1918                 baseaddr = hat_kpm_mapin(pp, kpme);
1919         } else {
1920                 panic("segmap_getmapflt: stale kpme page after "
1921                     "fop_getpage, kpme %p", (void *)kpme);
1922                 /*NOTREACHED*/
1923         }
1924
1925         smd_cpu[CPU->cpu_seqid].scpu.scpu_fault++;
1926
1927         return (baseaddr);
1928
1929
1930 use_segmap_range:
1931         baseaddr = seg->s_base + ((smp - smd_smap) * MAXBSIZE);
1932
1933         /*
1934          * Prefault the translations
1935          */
1936         vaddr = baseaddr + (off - baseoff);
1937         if (forcefault && (newslot || !hat_probe(kas.a_hat, vaddr))) {
1938
1939                 caddr_t pgaddr = (caddr_t)((uintptr_t)vaddr &
1940                     (uintptr_t)PAGEMASK);
1941
1942                 (void) segmap_fault(kas.a_hat, seg, pgaddr,
1943                     (vaddr + len - pgaddr + PAGESIZE - 1) & (uintptr_t)PAGEMASK,
1944                     F_INVAL, rw);
1945         }
1946
1947         return (baseaddr);
1948 }
1949
1950 int
1951 segmap_release(struct seg *seg, caddr_t addr, uint_t flags)
1952 {
1953         struct smap     *smp;
1954         int             error;
1955         int             bflags = 0;
1956         struct vnode    *vp;
1957         uoff_t  offset;
1958         kmutex_t        *smtx;
1959         int             is_kpm = 0;
1960         page_t          *pp;
1961
1962         if (segmap_kpm && IS_KPM_ADDR(addr)) {
1963
1964                 if (((uintptr_t)addr & MAXBOFFSET) != 0) {
1965                         panic("segmap_release: addr %p not "
1966                             "MAXBSIZE aligned", (void *)addr);
1967                         /*NOTREACHED*/
1968                 }
1969
1970                 if ((smp = get_smap_kpm(addr, &pp)) == NULL) {
1971                         panic("segmap_release: smap not found "
1972                             "for addr %p", (void *)addr);
1973                         /*NOTREACHED*/
1974                 }
1975
1976                 smtx = SMAPMTX(smp);
1977
1978                 /*
1979                  * For compatibility reasons segmap_pagecreate_kpm sets this
1980                  * flag to allow a following segmap_pagecreate to return
1981                  * this as "newpage" flag. When segmap_pagecreate is not
1982                  * called at all we clear it now.
1983                  */
1984                 smp->sm_flags &= ~SM_KPM_NEWPAGE;
1985                 is_kpm = 1;
1986                 if (smp->sm_flags & SM_WRITE_DATA) {
1987                         hat_setrefmod(pp);
1988                 } else if (smp->sm_flags & SM_READ_DATA) {
1989                         hat_setref(pp);
1990                 }
1991         } else {
1992                 if (addr < seg->s_base || addr >= seg->s_base + seg->s_size ||
1993                     ((uintptr_t)addr & MAXBOFFSET) != 0) {
1994                         panic("segmap_release: bad addr %p", (void *)addr);
1995                         /*NOTREACHED*/
1996                 }
1997                 smp = GET_SMAP(seg, addr);
1998
1999                 smtx = SMAPMTX(smp);
2000                 mutex_enter(smtx);
2001                 smp->sm_flags |= SM_NOTKPM_RELEASED;
2002         }
2003
2004         ASSERT(smp->sm_refcnt > 0);
2005
2006         /*
2007          * Need to call fop_putpage() if any flags (except SM_DONTNEED)
2008          * are set.
2009          */
2010         if ((flags & ~SM_DONTNEED) != 0) {
2011                 if (flags & SM_WRITE)
2012                         segmapcnt.smp_rel_write.value.ul++;
2013                 if (flags & SM_ASYNC) {
2014                         bflags |= B_ASYNC;
2015                         segmapcnt.smp_rel_async.value.ul++;
2016                 }
2017                 if (flags & SM_INVAL) {
2018                         bflags |= B_INVAL;
2019                         segmapcnt.smp_rel_abort.value.ul++;
2020                 }
2021                 if (flags & SM_DESTROY) {
2022                         bflags |= (B_INVAL|B_TRUNC);
2023                         segmapcnt.smp_rel_abort.value.ul++;
2024                 }
2025                 if (smp->sm_refcnt == 1) {
2026                         /*
2027                          * We only bother doing the FREE and DONTNEED flags
2028                          * if no one else is still referencing this mapping.
2029                          */
2030                         if (flags & SM_FREE) {
2031                                 bflags |= B_FREE;
2032                                 segmapcnt.smp_rel_free.value.ul++;
2033                         }
2034                         if (flags & SM_DONTNEED) {
2035                                 bflags |= B_DONTNEED;
2036                                 segmapcnt.smp_rel_dontneed.value.ul++;
2037                         }
2038                 }
2039         } else {
2040                 smd_cpu[CPU->cpu_seqid].scpu.scpu_release++;
2041         }
2042
2043         vp = smp->sm_vp;
2044         offset = smp->sm_off;
2045
2046         if (--smp->sm_refcnt == 0) {
2047
2048                 smp->sm_flags &= ~(SM_WRITE_DATA | SM_READ_DATA);
2049
2050                 if (flags & (SM_INVAL|SM_DESTROY)) {
2051                         segmap_hashout(smp);    /* remove map info */
2052                         if (is_kpm) {
2053                                 hat_kpm_mapout(pp, GET_KPME(smp), addr);
2054                                 if (smp->sm_flags & SM_NOTKPM_RELEASED) {
2055                                         smp->sm_flags &= ~SM_NOTKPM_RELEASED;
2056                                         hat_unload(kas.a_hat, segkmap->s_base +
2057                                             ((smp - smd_smap) * MAXBSIZE),
2058                                             MAXBSIZE, HAT_UNLOAD);
2059                                 }
2060
2061                         } else {
2062                                 if (segmap_kpm)
2063                                         segkpm_mapout_validkpme(GET_KPME(smp));
2064
2065                                 smp->sm_flags &= ~SM_NOTKPM_RELEASED;
2066                                 hat_unload(kas.a_hat, addr, MAXBSIZE,
2067                                     HAT_UNLOAD);
2068                         }
2069                 }
2070                 segmap_smapadd(smp);    /* add to free list */
2071         }
2072
2073         mutex_exit(smtx);
2074
2075         if (is_kpm)
2076                 page_unlock(pp);
2077         /*
2078          * Now invoke fop_putpage() if any flags (except SM_DONTNEED)
2079          * are set.
2080          */
2081         if ((flags & ~SM_DONTNEED) != 0) {
2082                 error = fop_putpage(vp, offset, MAXBSIZE,
2083                     bflags, CRED(), NULL);
2084         } else {
2085                 error = 0;
2086         }
2087
2088         return (error);
2089 }
2090
2091 /*
2092  * Dump the pages belonging to this segmap segment.
2093  */
2094 static void
2095 segmap_dump(struct seg *seg)
2096 {
2097         struct segmap_data *smd;
2098         struct smap *smp, *smp_end;
2099         page_t *pp;
2100         pfn_t pfn;
2101         uoff_t off;
2102         caddr_t addr;
2103
2104         smd = (struct segmap_data *)seg->s_data;
2105         addr = seg->s_base;
2106         for (smp = smd->smd_sm, smp_end = smp + smd->smd_npages;
2107             smp < smp_end; smp++) {
2108
2109                 if (smp->sm_refcnt) {
2110                         for (off = 0; off < MAXBSIZE; off += PAGESIZE) {
2111                                 int we_own_it = 0;
2112
2113                                 /*
2114                                  * If pp == NULL, the page either does
2115                                  * not exist or is exclusively locked.
2116                                  * So determine if it exists before
2117                                  * searching for it.
2118                                  */
2119                                 if ((pp = page_lookup_nowait(&smp->sm_vp->v_object,
2120                                                              smp->sm_off + off,
2121                                                              SE_SHARED)))
2122                                         we_own_it = 1;
2123                                 else
2124                                         pp = page_exists(&smp->sm_vp->v_object,
2125                                                          smp->sm_off + off);
2126
2127                                 if (pp) {
2128                                         pfn = page_pptonum(pp);
2129                                         dump_addpage(seg->s_as,
2130                                             addr + off, pfn);
2131                                         if (we_own_it)
2132                                                 page_unlock(pp);
2133                                 }
2134                                 dump_timeleft = dump_timeout;
2135                         }
2136                 }
2137                 addr += MAXBSIZE;
2138         }
2139 }
2140
2141 /*ARGSUSED*/
2142 static int
2143 segmap_pagelock(struct seg *seg, caddr_t addr, size_t len,
2144     struct page ***ppp, enum lock_type type, enum seg_rw rw)
2145 {
2146         return (ENOTSUP);
2147 }
2148
2149 static int
2150 segmap_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
2151 {
2152         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
2153
2154         memidp->val[0] = (uintptr_t)smd->smd_sm->sm_vp;
2155         memidp->val[1] = smd->smd_sm->sm_off + (uintptr_t)(addr - seg->s_base);
2156         return (0);
2157 }
2158
2159
2160 #ifdef  SEGKPM_SUPPORT
2161
2162 /*
2163  * segkpm support routines
2164  */
2165
2166 static caddr_t
2167 segmap_pagecreate_kpm(struct seg *seg, vnode_t *vp, uoff_t off,
2168         struct smap *smp, enum seg_rw rw)
2169 {
2170         caddr_t base;
2171         page_t  *pp;
2172         int     newpage = 0;
2173         struct kpme     *kpme;
2174
2175         ASSERT(smp->sm_refcnt > 0);
2176
2177         if ((pp = page_lookup(&vp->v_object, off, SE_SHARED)) == NULL) {
2178                 kmutex_t *smtx;
2179
2180                 base = segkpm_create_va(off);
2181
2182                 if ((pp = page_create_va(&vp->v_object, off, PAGESIZE, PG_WAIT,
2183                     seg, base)) == NULL) {
2184                         panic("segmap_pagecreate_kpm: "
2185                             "page_create failed");
2186                         /*NOTREACHED*/
2187                 }
2188
2189                 newpage = 1;
2190                 page_io_unlock(pp);
2191                 ASSERT((uoff_t)(off - smp->sm_off) <= INT_MAX);
2192
2193                 /*
2194                  * Mark this here until the following segmap_pagecreate
2195                  * or segmap_release.
2196                  */
2197                 smtx = SMAPMTX(smp);
2198                 mutex_enter(smtx);
2199                 smp->sm_flags |= SM_KPM_NEWPAGE;
2200                 mutex_exit(smtx);
2201         }
2202
2203         kpme = GET_KPME(smp);
2204         if (!newpage && kpme->kpe_page == pp)
2205                 base = hat_kpm_page2va(pp, 0);
2206         else
2207                 base = hat_kpm_mapin(pp, kpme);
2208
2209         /*
2210          * FS code may decide not to call segmap_pagecreate and we
2211          * don't invoke segmap_fault via TLB miss, so we have to set
2212          * ref and mod bits in advance.
2213          */
2214         if (rw == S_WRITE) {
2215                 hat_setrefmod(pp);
2216         } else {
2217                 ASSERT(rw == S_READ);
2218                 hat_setref(pp);
2219         }
2220
2221         smd_cpu[CPU->cpu_seqid].scpu.scpu_pagecreate++;
2222
2223         return (base);
2224 }
2225
2226 /*
2227  * Find the smap structure corresponding to the
2228  * KPM addr and return it locked.
2229  */
2230 struct smap *
2231 get_smap_kpm(caddr_t addr, page_t **ppp)
2232 {
2233         struct smap     *smp;
2234         struct vnode    *vp;
2235         uoff_t  offset;
2236         caddr_t         baseaddr = (caddr_t)((uintptr_t)addr & MAXBMASK);
2237         int             hashid;
2238         kmutex_t        *hashmtx;
2239         page_t          *pp;
2240         union segmap_cpu *scpu;
2241
2242         pp = hat_kpm_vaddr2page(baseaddr);
2243
2244         ASSERT(pp && !PP_ISFREE(pp));
2245         ASSERT(PAGE_LOCKED(pp));
2246         ASSERT(((uintptr_t)pp->p_offset & MAXBOFFSET) == 0);
2247
2248         vp = pp->p_vnode;
2249         offset = pp->p_offset;
2250         ASSERT(vp != NULL);
2251
2252         /*
2253          * Assume the last smap used on this cpu is the one needed.
2254          */
2255         scpu = smd_cpu+CPU->cpu_seqid;
2256         smp = scpu->scpu.scpu_last_smap;
2257         mutex_enter(&smp->sm_mtx);
2258         if (smp->sm_vp == vp && smp->sm_off == offset) {
2259                 ASSERT(smp->sm_refcnt > 0);
2260         } else {
2261                 /*
2262                  * Assumption wrong, find the smap on the hash chain.
2263                  */
2264                 mutex_exit(&smp->sm_mtx);
2265                 SMAP_HASHFUNC(vp, offset, hashid); /* macro assigns hashid */
2266                 hashmtx = SHASHMTX(hashid);
2267
2268                 mutex_enter(hashmtx);
2269                 smp = smd_hash[hashid].sh_hash_list;
2270                 for (; smp != NULL; smp = smp->sm_hash) {
2271                         if (smp->sm_vp == vp && smp->sm_off == offset)
2272                                 break;
2273                 }
2274                 mutex_exit(hashmtx);
2275                 if (smp) {
2276                         mutex_enter(&smp->sm_mtx);
2277                         ASSERT(smp->sm_vp == vp && smp->sm_off == offset);
2278                 }
2279         }
2280
2281         if (ppp)
2282                 *ppp = smp ? pp : NULL;
2283
2284         return (smp);
2285 }
2286
2287 #else   /* SEGKPM_SUPPORT */
2288
2289 /* segkpm stubs */
2290
2291 /*ARGSUSED*/
2292 static caddr_t
2293 segmap_pagecreate_kpm(struct seg *seg, vnode_t *vp, uoff_t off,
2294         struct smap *smp, enum seg_rw rw)
2295 {
2296         return (NULL);
2297 }
2298
2299 /*ARGSUSED*/
2300 struct smap *
2301 get_smap_kpm(caddr_t addr, page_t **ppp)
2302 {
2303         return (NULL);
2304 }
2305
2306 #endif  /* SEGKPM_SUPPORT */