kernel/vm/seg_kmem.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2016 Joyent, Inc.
  24  */
  25
  26 #include <sys/types.h>
  27 #include <sys/t_lock.h>
  28 #include <sys/param.h>
  29 #include <sys/sysmacros.h>
  30 #include <sys/tuneable.h>
  31 #include <sys/systm.h>
  32 #include <sys/vm.h>
  33 #include <sys/kmem.h>
  34 #include <sys/vmem.h>
  35 #include <sys/mman.h>
  36 #include <sys/cmn_err.h>
  37 #include <sys/debug.h>
  38 #include <sys/dumphdr.h>
  39 #include <sys/bootconf.h>
  40 #include <sys/lgrp.h>
  41 #include <vm/seg_kmem.h>
  42 #include <vm/hat.h>
  43 #include <vm/page.h>
  44 #include <vm/vm_dep.h>
  45 #include <vm/faultcode.h>
  46 #include <sys/promif.h>
  47 #include <vm/seg_kp.h>
  48 #include <sys/bitmap.h>
  49 #include <sys/mem_cage.h>
  50
  51
  52 /*
  53  * seg_kmem is the primary kernel memory segment driver.  It
  54  * maps the kernel heap [kernelheap, ekernelheap), module text,
  55  * and all memory which was allocated before the VM was initialized
  56  * into kas.
  57  *
  58  * Pages which belong to seg_kmem are hashed into &kvp vnode at
  59  * an offset equal to (uoff_t)virt_addr, and have p_lckcnt >= 1.
  60  * They must never be paged out since segkmem_fault() is a no-op to
  61  * prevent recursive faults.
  62  *
  63  * Currently, seg_kmem pages are sharelocked (p_sharelock == 1) on
  64  * __x86 and are unlocked (p_sharelock == 0) on __sparc.  Once __x86
  65  * supports relocation the #ifdef kludges can be removed.
  66  *
  67  * seg_kmem pages may be subject to relocation by page_relocate(),
  68  * provided that the HAT supports it; if this is so, segkmem_reloc
  69  * will be set to a nonzero value. All boot time allocated memory as
  70  * well as static memory is considered off limits to relocation.
  71  * Pages are "relocatable" if p_state does not have P_NORELOC set, so
  72  * we request P_NORELOC pages for memory that isn't safe to relocate.
  73  *
  74  * The kernel heap is logically divided up into four pieces:
  75  *
  76  *   heap32_arena is for allocations that require 32-bit absolute
  77  *   virtual addresses (e.g. code that uses 32-bit pointers/offsets).
  78  *
  79  *   heap_core is for allocations that require 2GB *relative*
  80  *   offsets; in other words all memory from heap_core is within
  81  *   2GB of all other memory from the same arena. This is a requirement
  82  *   of the addressing modes of some processors in supervisor code.
  83  *
  84  *   heap_arena is the general heap arena.
  85  *
  86  *   static_arena is the static memory arena.  Allocations from it
  87  *   are not subject to relocation so it is safe to use the memory
  88  *   physical address as well as the virtual address (e.g. the VA to
  89  *   PA translations are static).  Caches may import from static_arena;
  90  *   all other static memory allocations should use static_alloc_arena.
  91  *
  92  * On some platforms which have limited virtual address space, seg_kmem
  93  * may share [kernelheap, ekernelheap) with seg_kp; if this is so,
  94  * segkp_bitmap is non-NULL, and each bit represents a page of virtual
  95  * address space which is actually seg_kp mapped.
  96  */
  97
  98 extern ulong_t *segkp_bitmap;   /* Is set if segkp is from the kernel heap */
  99
 100 char *kernelheap;               /* start of primary kernel heap */
 101 char *ekernelheap;              /* end of primary kernel heap */
 102 struct seg kvseg;               /* primary kernel heap segment */
 103 struct seg kvseg_core;          /* "core" kernel heap segment */
 104 struct seg kzioseg;             /* Segment for zio mappings */
 105 vmem_t *heap_arena;             /* primary kernel heap arena */
 106 vmem_t *heap_core_arena;        /* core kernel heap arena */
 107 char *heap_core_base;           /* start of core kernel heap arena */
 108 char *heap_lp_base;             /* start of kernel large page heap arena */
 109 char *heap_lp_end;              /* end of kernel large page heap arena */
 110 vmem_t *hat_memload_arena;      /* HAT translation data */
 111 struct seg kvseg32;             /* 32-bit kernel heap segment */
 112 vmem_t *heap32_arena;           /* 32-bit kernel heap arena */
 113 vmem_t *heaptext_arena;         /* heaptext arena */
 114 struct as kas;                  /* kernel address space */
 115 int segkmem_reloc;              /* enable/disable relocatable segkmem pages */
 116 vmem_t *static_arena;           /* arena for caches to import static memory */
 117 vmem_t *static_alloc_arena;     /* arena for allocating static memory */
 118 vmem_t *zio_arena = NULL;       /* arena for allocating zio memory */
 119 vmem_t *zio_alloc_arena = NULL; /* arena for allocating zio memory */
 120
 121 /*
 122  * seg_kmem driver can map part of the kernel heap with large pages.
 123  * Currently this functionality is implemented for sparc platforms only.
 124  *
 125  * The large page size "segkmem_lpsize" for kernel heap is selected in the
 126  * platform specific code. It can also be modified via /etc/system file.
 127  * Setting segkmem_lpsize to PAGESIZE in /etc/system disables usage of large
 128  * pages for kernel heap. "segkmem_lpshift" is adjusted appropriately to
 129  * match segkmem_lpsize.
 130  *
 131  * At boot time we carve from kernel heap arena a range of virtual addresses
 132  * that will be used for large page mappings. This range [heap_lp_base,
 133  * heap_lp_end) is set up as a separate vmem arena - "heap_lp_arena". We also
 134  * create "kmem_lp_arena" that caches memory already backed up by large
 135  * pages. kmem_lp_arena imports virtual segments from heap_lp_arena.
 136  */
 137
 138 size_t  segkmem_lpsize;
 139 static  uint_t  segkmem_lpshift = PAGESHIFT;
 140 int     segkmem_lpszc = 0;
 141
 142 size_t  segkmem_kmemlp_quantum = 0x400000;      /* 4MB */
 143 size_t  segkmem_heaplp_quantum;
 144 vmem_t *heap_lp_arena;
 145 static  vmem_t *kmem_lp_arena;
 146 static  vmem_t *segkmem_ppa_arena;
 147 static  segkmem_lpcb_t segkmem_lpcb;
 148
 149 /*
 150  * We use "segkmem_kmemlp_max" to limit the total amount of physical memory
 151  * consumed by the large page heap. By default this parameter is set to 1/8 of
 152  * physmem but can be adjusted through /etc/system either directly or
 153  * indirectly by setting "segkmem_kmemlp_pcnt" to the percent of physmem
 154  * we allow for large page heap.
 155  */
 156 size_t  segkmem_kmemlp_max;
 157 static  uint_t  segkmem_kmemlp_pcnt;
 158
 159 /*
 160  * Getting large pages for kernel heap could be problematic due to
 161  * physical memory fragmentation. That's why we allow to preallocate
 162  * "segkmem_kmemlp_min" bytes at boot time.
 163  */
 164 static  size_t  segkmem_kmemlp_min;
 165
 166 /*
 167  * Throttling is used to avoid expensive tries to allocate large pages
 168  * for kernel heap when a lot of succesive attempts to do so fail.
 169  */
 170 static  ulong_t segkmem_lpthrottle_max = 0x400000;
 171 static  ulong_t segkmem_lpthrottle_start = 0x40;
 172 static  ulong_t segkmem_use_lpthrottle = 1;
 173
 174 /*
 175  * Freed pages accumulate on a garbage list until segkmem is ready,
 176  * at which point we call segkmem_gc() to free it all.
 177  */
 178 typedef struct segkmem_gc_list {
 179         struct segkmem_gc_list  *gc_next;
 180         vmem_t                  *gc_arena;
 181         size_t                  gc_size;
 182 } segkmem_gc_list_t;
 183
 184 static segkmem_gc_list_t *segkmem_gc_list;
 185
 186 /*
 187  * Allocations from the hat_memload arena add VM_MEMLOAD to their
 188  * vmflags so that segkmem_xalloc() can inform the hat layer that it needs
 189  * to take steps to prevent infinite recursion.  HAT allocations also
 190  * must be non-relocatable to prevent recursive page faults.
 191  */
 192 static void *
 193 hat_memload_alloc(vmem_t *vmp, size_t size, int flags)
 194 {
 195         flags |= (VM_MEMLOAD | VM_NORELOC);
 196         return (segkmem_alloc(vmp, size, flags));
 197 }
 198
 199 /*
 200  * Allocations from static_arena arena (or any other arena that uses
 201  * segkmem_alloc_permanent()) require non-relocatable (permanently
 202  * wired) memory pages, since these pages are referenced by physical
 203  * as well as virtual address.
 204  */
 205 void *
 206 segkmem_alloc_permanent(vmem_t *vmp, size_t size, int flags)
 207 {
 208         return (segkmem_alloc(vmp, size, flags | VM_NORELOC));
 209 }
 210
 211 /*
 212  * Initialize kernel heap boundaries.
 213  */
 214 void
 215 kernelheap_init(
 216         void *heap_start,
 217         void *heap_end,
 218         char *first_avail,
 219         void *core_start,
 220         void *core_end)
 221 {
 222         uintptr_t textbase;
 223         size_t core_size;
 224         size_t heap_size;
 225         vmem_t *heaptext_parent;
 226         size_t  heap_lp_size = 0;
 227
 228         kernelheap = heap_start;
 229         ekernelheap = heap_end;
 230
 231
 232         /*
 233          * If this platform has a 'core' heap area, then the space for
 234          * overflow module text should be carved out of the end of that
 235          * heap.  Otherwise, it gets carved out of the general purpose
 236          * heap.
 237          */
 238         core_size = (uintptr_t)core_end - (uintptr_t)core_start;
 239         if (core_size > 0) {
 240                 ASSERT(core_size >= HEAPTEXT_SIZE);
 241                 textbase = (uintptr_t)core_end - HEAPTEXT_SIZE;
 242                 core_size -= HEAPTEXT_SIZE;
 243         }
 244         else {
 245                 ekernelheap -= HEAPTEXT_SIZE;
 246                 textbase = (uintptr_t)ekernelheap;
 247         }
 248
 249         heap_size = (uintptr_t)ekernelheap - (uintptr_t)kernelheap;
 250         heap_arena = vmem_init("heap", kernelheap, heap_size, PAGESIZE,
 251             segkmem_alloc, segkmem_free);
 252
 253         if (core_size > 0) {
 254                 heap_core_arena = vmem_create("heap_core", core_start,
 255                     core_size, PAGESIZE, NULL, NULL, NULL, 0, VM_SLEEP);
 256                 heap_core_base = core_start;
 257         } else {
 258                 heap_core_arena = heap_arena;
 259                 heap_core_base = kernelheap;
 260         }
 261
 262         /*
 263          * reserve space for the large page heap. If large pages for kernel
 264          * heap is enabled large page heap arean will be created later in the
 265          * boot sequence in segkmem_heap_lp_init(). Otherwise the allocated
 266          * range will be returned back to the heap_arena.
 267          */
 268         if (heap_lp_size) {
 269                 (void) vmem_xalloc(heap_arena, heap_lp_size, PAGESIZE, 0, 0,
 270                     heap_lp_base, heap_lp_end,
 271                     VM_NOSLEEP | VM_BESTFIT | VM_PANIC);
 272         }
 273
 274         /*
 275          * Remove the already-spoken-for memory range [kernelheap, first_avail).
 276          */
 277         (void) vmem_xalloc(heap_arena, first_avail - kernelheap, PAGESIZE,
 278             0, 0, kernelheap, first_avail, VM_NOSLEEP | VM_BESTFIT | VM_PANIC);
 279
 280         heap32_arena = heap_core_arena;
 281         heaptext_parent = heap_core_arena;
 282
 283         heaptext_arena = vmem_create("heaptext", (void *)textbase,
 284             HEAPTEXT_SIZE, PAGESIZE, NULL, NULL, heaptext_parent, 0, VM_SLEEP);
 285
 286         /*
 287          * Create a set of arenas for memory with static translations
 288          * (e.g. VA -> PA translations cannot change).  Since using
 289          * kernel pages by physical address implies it isn't safe to
 290          * walk across page boundaries, the static_arena quantum must
 291          * be PAGESIZE.  Any kmem caches that require static memory
 292          * should source from static_arena, while direct allocations
 293          * should only use static_alloc_arena.
 294          */
 295         static_arena = vmem_create("static", NULL, 0, PAGESIZE,
 296             segkmem_alloc_permanent, segkmem_free, heap_arena, 0, VM_SLEEP);
 297         static_alloc_arena = vmem_create("static_alloc", NULL, 0,
 298             sizeof (uint64_t), vmem_alloc, vmem_free, static_arena,
 299             0, VM_SLEEP);
 300
 301         /*
 302          * Create an arena for translation data (ptes, hmes, or hblks).
 303          * We need an arena for this because hat_memload() is essential
 304          * to vmem_populate() (see comments in kernel/os/vmem.c).
 305          *
 306          * Note: any kmem cache that allocates from hat_memload_arena
 307          * must be created as a KMC_NOHASH cache (i.e. no external slab
 308          * and bufctl structures to allocate) so that slab creation doesn't
 309          * require anything more than a single vmem_alloc().
 310          */
 311         hat_memload_arena = vmem_create("hat_memload", NULL, 0, PAGESIZE,
 312             hat_memload_alloc, segkmem_free, heap_arena, 0,
 313             VM_SLEEP | VMC_POPULATOR | VMC_DUMPSAFE);
 314 }
 315
 316 void
 317 boot_mapin(caddr_t addr, size_t size)
 318 {
 319         caddr_t  eaddr;
 320         page_t  *pp;
 321         pfn_t    pfnum;
 322
 323         if (page_resv(btop(size), KM_NOSLEEP) == 0)
 324                 panic("boot_mapin: page_resv failed");
 325
 326         for (eaddr = addr + size; addr < eaddr; addr += PAGESIZE) {
 327                 pfnum = va_to_pfn(addr);
 328                 if (pfnum == PFN_INVALID)
 329                         continue;
 330                 if ((pp = page_numtopp_nolock(pfnum)) == NULL)
 331                         panic("boot_mapin(): No pp for pfnum = %lx", pfnum);
 332
 333                 /*
 334                  * must break up any large pages that may have constituent
 335                  * pages being utilized for BOP_ALLOC()'s before calling
 336                  * page_numtopp().The locking code (ie. page_reclaim())
 337                  * can't handle them
 338                  */
 339                 if (pp->p_szc != 0)
 340                         page_boot_demote(pp);
 341
 342                 pp = page_numtopp(pfnum, SE_EXCL);
 343                 if (pp == NULL || PP_ISFREE(pp))
 344                         panic("boot_alloc: pp is NULL or free");
 345
 346                 /*
 347                  * If the cage is on but doesn't yet contain this page,
 348                  * mark it as non-relocatable.
 349                  */
 350                 if (kcage_on && !PP_ISNORELOC(pp)) {
 351                         PP_SETNORELOC(pp);
 352                         PLCNT_XFER_NORELOC(pp);
 353                 }
 354
 355                 (void) page_hashin(pp, &kvp.v_object, (uoff_t)(uintptr_t)addr,
 356                                    false);
 357                 pp->p_lckcnt = 1;
 358 #if defined(__x86)
 359                 page_downgrade(pp);
 360 #else
 361                 page_unlock(pp);
 362 #endif
 363         }
 364 }
 365
 366 /*
 367  * Get pages from boot and hash them into the kernel's vp.
 368  * Used after page structs have been allocated, but before segkmem is ready.
 369  */
 370 void *
 371 boot_alloc(void *inaddr, size_t size, uint_t align)
 372 {
 373         caddr_t addr = inaddr;
 374
 375         if (bootops == NULL)
 376                 prom_panic("boot_alloc: attempt to allocate memory after "
 377                     "BOP_GONE");
 378
 379         size = ptob(btopr(size));
 380         if (BOP_ALLOC(bootops, addr, size, align) != addr)
 381                 panic("boot_alloc: BOP_ALLOC failed");
 382         boot_mapin((caddr_t)addr, size);
 383         return (addr);
 384 }
 385
 386 static void
 387 segkmem_badop()
 388 {
 389         panic("segkmem_badop");
 390 }
 391
 392 #define SEGKMEM_BADOP(t)        (t(*)())segkmem_badop
 393
 394 /*ARGSUSED*/
 395 static faultcode_t
 396 segkmem_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t size,
 397         enum fault_type type, enum seg_rw rw)
 398 {
 399         pgcnt_t npages;
 400         spgcnt_t pg;
 401         page_t *pp;
 402         struct vnode *vp = seg->s_data;
 403
 404         ASSERT(RW_READ_HELD(&seg->s_as->a_lock));
 405
 406         if (seg->s_as != &kas || size > seg->s_size ||
 407             addr < seg->s_base || addr + size > seg->s_base + seg->s_size)
 408                 panic("segkmem_fault: bad args");
 409
 410         /*
 411          * If it is one of segkp pages, call segkp_fault.
 412          */
 413         if (segkp_bitmap && seg == &kvseg &&
 414             BT_TEST(segkp_bitmap, btop((uintptr_t)(addr - seg->s_base))))
 415                 return (segop_fault(hat, segkp, addr, size, type, rw));
 416
 417         if (rw != S_READ && rw != S_WRITE && rw != S_OTHER)
 418                 return (FC_NOSUPPORT);
 419
 420         npages = btopr(size);
 421
 422         switch (type) {
 423         case F_SOFTLOCK:        /* lock down already-loaded translations */
 424                 for (pg = 0; pg < npages; pg++) {
 425                         pp = page_lookup(&vp->v_object, (uoff_t)(uintptr_t)addr,
 426                             SE_SHARED);
 427                         if (pp == NULL) {
 428                                 /*
 429                                  * Hmm, no page. Does a kernel mapping
 430                                  * exist for it?
 431                                  */
 432                                 if (!hat_probe(kas.a_hat, addr)) {
 433                                         addr -= PAGESIZE;
 434                                         while (--pg >= 0) {
 435                                                 pp = page_find(&vp->v_object,
 436                                                     (uoff_t)(uintptr_t)addr);
 437                                                 if (pp)
 438                                                         page_unlock(pp);
 439                                                 addr -= PAGESIZE;
 440                                         }
 441                                         return (FC_NOMAP);
 442                                 }
 443                         }
 444                         addr += PAGESIZE;
 445                 }
 446                 if (rw == S_OTHER)
 447                         hat_reserve(seg->s_as, addr, size);
 448                 return (0);
 449         case F_SOFTUNLOCK:
 450                 while (npages--) {
 451                         pp = page_find(&vp->v_object, (uoff_t)(uintptr_t)addr);
 452                         if (pp)
 453                                 page_unlock(pp);
 454                         addr += PAGESIZE;
 455                 }
 456                 return (0);
 457         default:
 458                 return (FC_NOSUPPORT);
 459         }
 460         /*NOTREACHED*/
 461 }
 462
 463 static int
 464 segkmem_setprot(struct seg *seg, caddr_t addr, size_t size, uint_t prot)
 465 {
 466         ASSERT(RW_LOCK_HELD(&seg->s_as->a_lock));
 467
 468         if (seg->s_as != &kas || size > seg->s_size ||
 469             addr < seg->s_base || addr + size > seg->s_base + seg->s_size)
 470                 panic("segkmem_setprot: bad args");
 471
 472         /*
 473          * If it is one of segkp pages, call segkp.
 474          */
 475         if (segkp_bitmap && seg == &kvseg &&
 476             BT_TEST(segkp_bitmap, btop((uintptr_t)(addr - seg->s_base))))
 477                 return (segop_setprot(segkp, addr, size, prot));
 478
 479         if (prot == 0)
 480                 hat_unload(kas.a_hat, addr, size, HAT_UNLOAD);
 481         else
 482                 hat_chgprot(kas.a_hat, addr, size, prot);
 483         return (0);
 484 }
 485
 486 /*
 487  * This is a dummy segkmem function overloaded to call segkp
 488  * when segkp is under the heap.
 489  */
 490 /* ARGSUSED */
 491 static int
 492 segkmem_checkprot(struct seg *seg, caddr_t addr, size_t size, uint_t prot)
 493 {
 494         ASSERT(RW_LOCK_HELD(&seg->s_as->a_lock));
 495
 496         if (seg->s_as != &kas)
 497                 segkmem_badop();
 498
 499         /*
 500          * If it is one of segkp pages, call into segkp.
 501          */
 502         if (segkp_bitmap && seg == &kvseg &&
 503             BT_TEST(segkp_bitmap, btop((uintptr_t)(addr - seg->s_base))))
 504                 return (segop_checkprot(segkp, addr, size, prot));
 505
 506         segkmem_badop();
 507         return (0);
 508 }
 509
 510 /*
 511  * This is a dummy segkmem function overloaded to call segkp
 512  * when segkp is under the heap.
 513  */
 514 /* ARGSUSED */
 515 static int
 516 segkmem_kluster(struct seg *seg, caddr_t addr, ssize_t delta)
 517 {
 518         ASSERT(RW_LOCK_HELD(&seg->s_as->a_lock));
 519
 520         if (seg->s_as != &kas)
 521                 segkmem_badop();
 522
 523         /*
 524          * If it is one of segkp pages, call into segkp.
 525          */
 526         if (segkp_bitmap && seg == &kvseg &&
 527             BT_TEST(segkp_bitmap, btop((uintptr_t)(addr - seg->s_base))))
 528                 return (segop_kluster(segkp, addr, delta));
 529
 530         segkmem_badop();
 531         return (0);
 532 }
 533
 534 static void
 535 segkmem_xdump_range(void *arg, void *start, size_t size)
 536 {
 537         struct as *as = arg;
 538         caddr_t addr = start;
 539         caddr_t addr_end = addr + size;
 540
 541         while (addr < addr_end) {
 542                 pfn_t pfn = hat_getpfnum(kas.a_hat, addr);
 543                 if (pfn != PFN_INVALID && pfn <= physmax && pf_is_memory(pfn))
 544                         dump_addpage(as, addr, pfn);
 545                 addr += PAGESIZE;
 546                 dump_timeleft = dump_timeout;
 547         }
 548 }
 549
 550 static void
 551 segkmem_dump_range(void *arg, void *start, size_t size)
 552 {
 553         caddr_t addr = start;
 554         caddr_t addr_end = addr + size;
 555
 556         /*
 557          * If we are about to start dumping the range of addresses we
 558          * carved out of the kernel heap for the large page heap walk
 559          * heap_lp_arena to find what segments are actually populated
 560          */
 561         if (SEGKMEM_USE_LARGEPAGES &&
 562             addr == heap_lp_base && addr_end == heap_lp_end &&
 563             vmem_size(heap_lp_arena, VMEM_ALLOC) < size) {
 564                 vmem_walk(heap_lp_arena, VMEM_ALLOC | VMEM_REENTRANT,
 565                     segkmem_xdump_range, arg);
 566         } else {
 567                 segkmem_xdump_range(arg, start, size);
 568         }
 569 }
 570
 571 static void
 572 segkmem_dump(struct seg *seg)
 573 {
 574         /*
 575          * The kernel's heap_arena (represented by kvseg) is a very large
 576          * VA space, most of which is typically unused.  To speed up dumping
 577          * we use vmem_walk() to quickly find the pieces of heap_arena that
 578          * are actually in use.  We do the same for heap32_arena and
 579          * heap_core.
 580          *
 581          * We specify VMEM_REENTRANT to vmem_walk() because dump_addpage()
 582          * may ultimately need to allocate memory.  Reentrant walks are
 583          * necessarily imperfect snapshots.  The kernel heap continues
 584          * to change during a live crash dump, for example.  For a normal
 585          * crash dump, however, we know that there won't be any other threads
 586          * messing with the heap.  Therefore, at worst, we may fail to dump
 587          * the pages that get allocated by the act of dumping; but we will
 588          * always dump every page that was allocated when the walk began.
 589          *
 590          * The other segkmem segments are dense (fully populated), so there's
 591          * no need to use this technique when dumping them.
 592          *
 593          * Note: when adding special dump handling for any new sparsely-
 594          * populated segments, be sure to add similar handling to the ::kgrep
 595          * code in mdb.
 596          */
 597         if (seg == &kvseg) {
 598                 vmem_walk(heap_arena, VMEM_ALLOC | VMEM_REENTRANT,
 599                     segkmem_dump_range, seg->s_as);
 600                 vmem_walk(heaptext_arena, VMEM_ALLOC | VMEM_REENTRANT,
 601                     segkmem_dump_range, seg->s_as);
 602         } else if (seg == &kvseg_core) {
 603                 vmem_walk(heap_core_arena, VMEM_ALLOC | VMEM_REENTRANT,
 604                     segkmem_dump_range, seg->s_as);
 605         } else if (seg == &kvseg32) {
 606                 vmem_walk(heap32_arena, VMEM_ALLOC | VMEM_REENTRANT,
 607                     segkmem_dump_range, seg->s_as);
 608                 vmem_walk(heaptext_arena, VMEM_ALLOC | VMEM_REENTRANT,
 609                     segkmem_dump_range, seg->s_as);
 610         } else if (seg == &kzioseg) {
 611                 /*
 612                  * We don't want to dump pages attached to kzioseg since they
 613                  * contain file data from ZFS.  If this page's segment is
 614                  * kzioseg return instead of writing it to the dump device.
 615                  */
 616                 return;
 617         } else {
 618                 segkmem_dump_range(seg->s_as, seg->s_base, seg->s_size);
 619         }
 620 }
 621
 622 /*
 623  * lock/unlock kmem pages over a given range [addr, addr+len).
 624  * Returns a shadow list of pages in ppp. If there are holes
 625  * in the range (e.g. some of the kernel mappings do not have
 626  * underlying page_ts) returns ENOTSUP so that as_pagelock()
 627  * will handle the range via as_fault(F_SOFTLOCK).
 628  */
 629 /*ARGSUSED*/
 630 static int
 631 segkmem_pagelock(struct seg *seg, caddr_t addr, size_t len,
 632         page_t ***ppp, enum lock_type type, enum seg_rw rw)
 633 {
 634         page_t **pplist, *pp;
 635         pgcnt_t npages;
 636         spgcnt_t pg;
 637         size_t nb;
 638         struct vnode *vp = seg->s_data;
 639
 640         ASSERT(ppp != NULL);
 641
 642         /*
 643          * If it is one of segkp pages, call into segkp.
 644          */
 645         if (segkp_bitmap && seg == &kvseg &&
 646             BT_TEST(segkp_bitmap, btop((uintptr_t)(addr - seg->s_base))))
 647                 return (segop_pagelock(segkp, addr, len, ppp, type, rw));
 648
 649         npages = btopr(len);
 650         nb = sizeof (page_t *) * npages;
 651
 652         if (type == L_PAGEUNLOCK) {
 653                 pplist = *ppp;
 654                 ASSERT(pplist != NULL);
 655
 656                 for (pg = 0; pg < npages; pg++) {
 657                         pp = pplist[pg];
 658                         page_unlock(pp);
 659                 }
 660                 kmem_free(pplist, nb);
 661                 return (0);
 662         }
 663
 664         ASSERT(type == L_PAGELOCK);
 665
 666         pplist = kmem_alloc(nb, KM_NOSLEEP);
 667         if (pplist == NULL) {
 668                 *ppp = NULL;
 669                 return (ENOTSUP);       /* take the slow path */
 670         }
 671
 672         for (pg = 0; pg < npages; pg++) {
 673                 pp = page_lookup(&vp->v_object, (uoff_t)(uintptr_t)addr,
 674                                  SE_SHARED);
 675                 if (pp == NULL) {
 676                         while (--pg >= 0)
 677                                 page_unlock(pplist[pg]);
 678                         kmem_free(pplist, nb);
 679                         *ppp = NULL;
 680                         return (ENOTSUP);
 681                 }
 682                 pplist[pg] = pp;
 683                 addr += PAGESIZE;
 684         }
 685
 686         *ppp = pplist;
 687         return (0);
 688 }
 689
 690 /*
 691  * This is a dummy segkmem function overloaded to call segkp
 692  * when segkp is under the heap.
 693  */
 694 /* ARGSUSED */
 695 static int
 696 segkmem_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
 697 {
 698         ASSERT(RW_LOCK_HELD(&seg->s_as->a_lock));
 699
 700         if (seg->s_as != &kas)
 701                 segkmem_badop();
 702
 703         /*
 704          * If it is one of segkp pages, call into segkp.
 705          */
 706         if (segkp_bitmap && seg == &kvseg &&
 707             BT_TEST(segkp_bitmap, btop((uintptr_t)(addr - seg->s_base))))
 708                 return (segop_getmemid(segkp, addr, memidp));
 709
 710         segkmem_badop();
 711         return (0);
 712 }
 713
 714 /*ARGSUSED*/
 715 static int
 716 segkmem_capable(struct seg *seg, segcapability_t capability)
 717 {
 718         if (capability == S_CAPABILITY_NOMINFLT)
 719                 return (1);
 720         return (0);
 721 }
 722
 723 const struct seg_ops segkmem_ops = {
 724         .dup            = SEGKMEM_BADOP(int),
 725         .unmap          = SEGKMEM_BADOP(int),
 726         .free           = SEGKMEM_BADOP(void),
 727         .fault          = segkmem_fault,
 728         .faulta         = SEGKMEM_BADOP(faultcode_t),
 729         .setprot        = segkmem_setprot,
 730         .checkprot      = segkmem_checkprot,
 731         .kluster        = segkmem_kluster,
 732         .sync           = SEGKMEM_BADOP(int),
 733         .incore         = SEGKMEM_BADOP(size_t),
 734         .lockop         = SEGKMEM_BADOP(int),
 735         .getprot        = SEGKMEM_BADOP(int),
 736         .getoffset      = SEGKMEM_BADOP(uoff_t),
 737         .gettype        = SEGKMEM_BADOP(int),
 738         .getvp          = SEGKMEM_BADOP(int),
 739         .advise         = SEGKMEM_BADOP(int),
 740         .dump           = segkmem_dump,
 741         .pagelock       = segkmem_pagelock,
 742         .setpagesize    = SEGKMEM_BADOP(int),
 743         .getmemid       = segkmem_getmemid,
 744         .capable        = segkmem_capable,
 745 };
 746
 747 int
 748 segkmem_zio_create(struct seg *seg)
 749 {
 750         ASSERT(seg->s_as == &kas && RW_WRITE_HELD(&kas.a_lock));
 751         seg->s_ops = &segkmem_ops;
 752         seg->s_data = &zvp;
 753         kas.a_size += seg->s_size;
 754         return (0);
 755 }
 756
 757 int
 758 segkmem_create(struct seg *seg)
 759 {
 760         ASSERT(seg->s_as == &kas && RW_WRITE_HELD(&kas.a_lock));
 761         seg->s_ops = &segkmem_ops;
 762         seg->s_data = &kvp;
 763         kas.a_size += seg->s_size;
 764         return (0);
 765 }
 766
 767 /*ARGSUSED*/
 768 page_t *
 769 segkmem_page_create(void *addr, size_t size, int vmflag, void *arg)
 770 {
 771         struct seg kseg;
 772         int pgflags;
 773         struct vnode *vp = arg;
 774
 775         if (vp == NULL)
 776                 vp = &kvp;
 777
 778         kseg.s_as = &kas;
 779         pgflags = PG_EXCL;
 780
 781         if (segkmem_reloc == 0 || (vmflag & VM_NORELOC))
 782                 pgflags |= PG_NORELOC;
 783         if ((vmflag & VM_NOSLEEP) == 0)
 784                 pgflags |= PG_WAIT;
 785         if (vmflag & VM_PANIC)
 786                 pgflags |= PG_PANIC;
 787         if (vmflag & VM_PUSHPAGE)
 788                 pgflags |= PG_PUSHPAGE;
 789         if (vmflag & VM_NORMALPRI) {
 790                 ASSERT(vmflag & VM_NOSLEEP);
 791                 pgflags |= PG_NORMALPRI;
 792         }
 793
 794         return (page_create_va(&vp->v_object, (uoff_t)(uintptr_t)addr, size,
 795             pgflags, &kseg, addr));
 796 }
 797
 798 /*
 799  * Allocate pages to back the virtual address range [addr, addr + size).
 800  * If addr is NULL, allocate the virtual address space as well.
 801  */
 802 void *
 803 segkmem_xalloc(vmem_t *vmp, void *inaddr, size_t size, int vmflag, uint_t attr,
 804         page_t *(*page_create_func)(void *, size_t, int, void *), void *pcarg)
 805 {
 806         page_t *ppl;
 807         caddr_t addr = inaddr;
 808         pgcnt_t npages = btopr(size);
 809         int allocflag;
 810
 811         if (inaddr == NULL && (addr = vmem_alloc(vmp, size, vmflag)) == NULL)
 812                 return (NULL);
 813
 814         ASSERT(((uintptr_t)addr & PAGEOFFSET) == 0);
 815
 816         if (page_resv(npages, vmflag & VM_KMFLAGS) == 0) {
 817                 if (inaddr == NULL)
 818                         vmem_free(vmp, addr, size);
 819                 return (NULL);
 820         }
 821
 822         ppl = page_create_func(addr, size, vmflag, pcarg);
 823         if (ppl == NULL) {
 824                 if (inaddr == NULL)
 825                         vmem_free(vmp, addr, size);
 826                 page_unresv(npages);
 827                 return (NULL);
 828         }
 829
 830         /*
 831          * Under certain conditions, we need to let the HAT layer know
 832          * that it cannot safely allocate memory.  Allocations from
 833          * the hat_memload vmem arena always need this, to prevent
 834          * infinite recursion.
 835          *
 836          * In addition, the x86 hat cannot safely do memory
 837          * allocations while in vmem_populate(), because there
 838          * is no simple bound on its usage.
 839          */
 840         if (vmflag & VM_MEMLOAD)
 841                 allocflag = HAT_NO_KALLOC;
 842 #if defined(__x86)
 843         else if (vmem_is_populator())
 844                 allocflag = HAT_NO_KALLOC;
 845 #endif
 846         else
 847                 allocflag = 0;
 848
 849         while (ppl != NULL) {
 850                 page_t *pp = ppl;
 851                 page_sub(&ppl, pp);
 852                 ASSERT(page_iolock_assert(pp));
 853                 ASSERT(PAGE_EXCL(pp));
 854                 page_io_unlock(pp);
 855                 hat_memload(kas.a_hat, (caddr_t)(uintptr_t)pp->p_offset, pp,
 856                     (PROT_ALL & ~PROT_USER) | HAT_NOSYNC | attr,
 857                     HAT_LOAD_LOCK | allocflag);
 858                 pp->p_lckcnt = 1;
 859 #if defined(__x86)
 860                 page_downgrade(pp);
 861 #else
 862                 if (vmflag & SEGKMEM_SHARELOCKED)
 863                         page_downgrade(pp);
 864                 else
 865                         page_unlock(pp);
 866 #endif
 867         }
 868
 869         return (addr);
 870 }
 871
 872 static void *
 873 segkmem_alloc_vn(vmem_t *vmp, size_t size, int vmflag, struct vnode *vp)
 874 {
 875         void *addr;
 876         segkmem_gc_list_t *gcp, **prev_gcpp;
 877
 878         ASSERT(vp != NULL);
 879
 880         if (kvseg.s_base == NULL) {
 881                 if (bootops->bsys_alloc == NULL)
 882                         halt("Memory allocation between bop_alloc() and "
 883                             "kmem_alloc().\n");
 884
 885                 /*
 886                  * There's not a lot of memory to go around during boot,
 887                  * so recycle it if we can.
 888                  */
 889                 for (prev_gcpp = &segkmem_gc_list; (gcp = *prev_gcpp) != NULL;
 890                     prev_gcpp = &gcp->gc_next) {
 891                         if (gcp->gc_arena == vmp && gcp->gc_size == size) {
 892                                 *prev_gcpp = gcp->gc_next;
 893                                 return (gcp);
 894                         }
 895                 }
 896
 897                 addr = vmem_alloc(vmp, size, vmflag | VM_PANIC);
 898                 if (boot_alloc(addr, size, BO_NO_ALIGN) != addr)
 899                         panic("segkmem_alloc: boot_alloc failed");
 900                 return (addr);
 901         }
 902         return (segkmem_xalloc(vmp, NULL, size, vmflag, 0,
 903             segkmem_page_create, vp));
 904 }
 905
 906 void *
 907 segkmem_alloc(vmem_t *vmp, size_t size, int vmflag)
 908 {
 909         return (segkmem_alloc_vn(vmp, size, vmflag, &kvp));
 910 }
 911
 912 void *
 913 segkmem_zio_alloc(vmem_t *vmp, size_t size, int vmflag)
 914 {
 915         return (segkmem_alloc_vn(vmp, size, vmflag, &zvp));
 916 }
 917
 918 /*
 919  * Any changes to this routine must also be carried over to
 920  * devmap_free_pages() in the seg_dev driver. This is because
 921  * we currently don't have a special kernel segment for non-paged
 922  * kernel memory that is exported by drivers to user space.
 923  */
 924 static void
 925 segkmem_free_vn(vmem_t *vmp, void *inaddr, size_t size, struct vnode *vp,
 926     void (*func)(page_t *))
 927 {
 928         page_t *pp;
 929         caddr_t addr = inaddr;
 930         caddr_t eaddr;
 931         pgcnt_t npages = btopr(size);
 932
 933         ASSERT(((uintptr_t)addr & PAGEOFFSET) == 0);
 934         ASSERT(vp != NULL);
 935
 936         if (kvseg.s_base == NULL) {
 937                 segkmem_gc_list_t *gc = inaddr;
 938                 gc->gc_arena = vmp;
 939                 gc->gc_size = size;
 940                 gc->gc_next = segkmem_gc_list;
 941                 segkmem_gc_list = gc;
 942                 return;
 943         }
 944
 945         hat_unload(kas.a_hat, addr, size, HAT_UNLOAD_UNLOCK);
 946
 947         for (eaddr = addr + size; addr < eaddr; addr += PAGESIZE) {
 948 #if defined(__x86)
 949                 pp = page_find(&vp->v_object, (uoff_t)(uintptr_t)addr);
 950                 if (pp == NULL)
 951                         panic("segkmem_free: page not found");
 952                 if (!page_tryupgrade(pp)) {
 953                         /*
 954                          * Some other thread has a sharelock. Wait for
 955                          * it to drop the lock so we can free this page.
 956                          */
 957                         page_unlock(pp);
 958                         pp = page_lookup(&vp->v_object, (uoff_t)(uintptr_t)addr,
 959                             SE_EXCL);
 960                 }
 961 #else
 962                 pp = page_lookup(&vp->v_object, (uoff_t)(uintptr_t)addr,
 963                                  SE_EXCL);
 964 #endif
 965                 if (pp == NULL)
 966                         panic("segkmem_free: page not found");
 967                 /* Clear p_lckcnt so page_destroy() doesn't update availrmem */
 968                 pp->p_lckcnt = 0;
 969                 if (func)
 970                         func(pp);
 971                 else
 972                         page_destroy(pp, 0);
 973         }
 974         if (func == NULL)
 975                 page_unresv(npages);
 976
 977         if (vmp != NULL)
 978                 vmem_free(vmp, inaddr, size);
 979
 980 }
 981
 982 void
 983 segkmem_xfree(vmem_t *vmp, void *inaddr, size_t size, void (*func)(page_t *))
 984 {
 985         segkmem_free_vn(vmp, inaddr, size, &kvp, func);
 986 }
 987
 988 void
 989 segkmem_free(vmem_t *vmp, void *inaddr, size_t size)
 990 {
 991         segkmem_free_vn(vmp, inaddr, size, &kvp, NULL);
 992 }
 993
 994 void
 995 segkmem_zio_free(vmem_t *vmp, void *inaddr, size_t size)
 996 {
 997         segkmem_free_vn(vmp, inaddr, size, &zvp, NULL);
 998 }
 999
1000 void
1001 segkmem_gc(void)
1002 {
1003         ASSERT(kvseg.s_base != NULL);
1004         while (segkmem_gc_list != NULL) {
1005                 segkmem_gc_list_t *gc = segkmem_gc_list;
1006                 segkmem_gc_list = gc->gc_next;
1007                 segkmem_free(gc->gc_arena, gc, gc->gc_size);
1008         }
1009 }
1010
1011 /*
1012  * Legacy entry points from here to end of file.
1013  */
1014 void
1015 segkmem_mapin(struct seg *seg, void *addr, size_t size, uint_t vprot,
1016     pfn_t pfn, uint_t flags)
1017 {
1018         hat_unload(seg->s_as->a_hat, addr, size, HAT_UNLOAD_UNLOCK);
1019         hat_devload(seg->s_as->a_hat, addr, size, pfn, vprot,
1020             flags | HAT_LOAD_LOCK);
1021 }
1022
1023 void
1024 segkmem_mapout(struct seg *seg, void *addr, size_t size)
1025 {
1026         hat_unload(seg->s_as->a_hat, addr, size, HAT_UNLOAD_UNLOCK);
1027 }
1028
1029 void *
1030 kmem_getpages(pgcnt_t npages, int kmflag)
1031 {
1032         return (kmem_alloc(ptob(npages), kmflag));
1033 }
1034
1035 void
1036 kmem_freepages(void *addr, pgcnt_t npages)
1037 {
1038         kmem_free(addr, ptob(npages));
1039 }
1040
1041 /*
1042  * segkmem_page_create_large() allocates a large page to be used for the kmem
1043  * caches. If kpr is enabled we ask for a relocatable page unless requested
1044  * otherwise. If kpr is disabled we have to ask for a non-reloc page
1045  */
1046 static page_t *
1047 segkmem_page_create_large(void *addr, size_t size, int vmflag, void *arg)
1048 {
1049         int pgflags;
1050
1051         pgflags = PG_EXCL;
1052
1053         if (segkmem_reloc == 0 || (vmflag & VM_NORELOC))
1054                 pgflags |= PG_NORELOC;
1055         if (!(vmflag & VM_NOSLEEP))
1056                 pgflags |= PG_WAIT;
1057         if (vmflag & VM_PUSHPAGE)
1058                 pgflags |= PG_PUSHPAGE;
1059         if (vmflag & VM_NORMALPRI)
1060                 pgflags |= PG_NORMALPRI;
1061
1062         return (page_create_va_large(&kvp.v_object, (uoff_t)(uintptr_t)addr,
1063                                      size, pgflags, &kvseg, addr, arg));
1064 }
1065
1066 /*
1067  * Allocate a large page to back the virtual address range
1068  * [addr, addr + size).  If addr is NULL, allocate the virtual address
1069  * space as well.
1070  */
1071 static void *
1072 segkmem_xalloc_lp(vmem_t *vmp, void *inaddr, size_t size, int vmflag,
1073     uint_t attr, page_t *(*page_create_func)(void *, size_t, int, void *),
1074     void *pcarg)
1075 {
1076         caddr_t addr = inaddr, pa;
1077         size_t  lpsize = segkmem_lpsize;
1078         pgcnt_t npages = btopr(size);
1079         pgcnt_t nbpages = btop(lpsize);
1080         pgcnt_t nlpages = size >> segkmem_lpshift;
1081         size_t  ppasize = nbpages * sizeof (page_t *);
1082         page_t *pp, *rootpp, **ppa, *pplist = NULL;
1083         int i;
1084
1085         vmflag |= VM_NOSLEEP;
1086
1087         if (page_resv(npages, vmflag & VM_KMFLAGS) == 0) {
1088                 return (NULL);
1089         }
1090
1091         /*
1092          * allocate an array we need for hat_memload_array.
1093          * we use a separate arena to avoid recursion.
1094          * we will not need this array when hat_memload_array learns pp++
1095          */
1096         if ((ppa = vmem_alloc(segkmem_ppa_arena, ppasize, vmflag)) == NULL) {
1097                 goto fail_array_alloc;
1098         }
1099
1100         if (inaddr == NULL && (addr = vmem_alloc(vmp, size, vmflag)) == NULL)
1101                 goto fail_vmem_alloc;
1102
1103         ASSERT(((uintptr_t)addr & (lpsize - 1)) == 0);
1104
1105         /* create all the pages */
1106         for (pa = addr, i = 0; i < nlpages; i++, pa += lpsize) {
1107                 if ((pp = page_create_func(pa, lpsize, vmflag, pcarg)) == NULL)
1108                         goto fail_page_create;
1109                 page_list_concat(&pplist, &pp);
1110         }
1111
1112         /* at this point we have all the resource to complete the request */
1113         while ((rootpp = pplist) != NULL) {
1114                 for (i = 0; i < nbpages; i++) {
1115                         ASSERT(pplist != NULL);
1116                         pp = pplist;
1117                         page_sub(&pplist, pp);
1118                         ASSERT(page_iolock_assert(pp));
1119                         page_io_unlock(pp);
1120                         ppa[i] = pp;
1121                 }
1122                 /*
1123                  * Load the locked entry. It's OK to preload the entry into the
1124                  * TSB since we now support large mappings in the kernel TSB.
1125                  */
1126                 hat_memload_array(kas.a_hat,
1127                     (caddr_t)(uintptr_t)rootpp->p_offset, lpsize,
1128                     ppa, (PROT_ALL & ~PROT_USER) | HAT_NOSYNC | attr,
1129                     HAT_LOAD_LOCK);
1130
1131                 for (--i; i >= 0; --i) {
1132                         ppa[i]->p_lckcnt = 1;
1133                         page_unlock(ppa[i]);
1134                 }
1135         }
1136
1137         vmem_free(segkmem_ppa_arena, ppa, ppasize);
1138         return (addr);
1139
1140 fail_page_create:
1141         while ((rootpp = pplist) != NULL) {
1142                 for (i = 0, pp = pplist; i < nbpages; i++, pp = pplist) {
1143                         ASSERT(pp != NULL);
1144                         page_sub(&pplist, pp);
1145                         ASSERT(page_iolock_assert(pp));
1146                         page_io_unlock(pp);
1147                 }
1148                 page_destroy_pages(rootpp);
1149         }
1150
1151         if (inaddr == NULL)
1152                 vmem_free(vmp, addr, size);
1153
1154 fail_vmem_alloc:
1155         vmem_free(segkmem_ppa_arena, ppa, ppasize);
1156
1157 fail_array_alloc:
1158         page_unresv(npages);
1159
1160         return (NULL);
1161 }
1162
1163 static void
1164 segkmem_free_one_lp(caddr_t addr, size_t size)
1165 {
1166         page_t          *pp, *rootpp = NULL;
1167         pgcnt_t         pgs_left = btopr(size);
1168
1169         ASSERT(size == segkmem_lpsize);
1170
1171         hat_unload(kas.a_hat, addr, size, HAT_UNLOAD_UNLOCK);
1172
1173         for (; pgs_left > 0; addr += PAGESIZE, pgs_left--) {
1174                 pp = page_lookup(&kvp.v_object, (uoff_t)(uintptr_t)addr, SE_EXCL);
1175                 if (pp == NULL)
1176                         panic("segkmem_free_one_lp: page not found");
1177                 ASSERT(PAGE_EXCL(pp));
1178                 pp->p_lckcnt = 0;
1179                 if (rootpp == NULL)
1180                         rootpp = pp;
1181         }
1182         ASSERT(rootpp != NULL);
1183         page_destroy_pages(rootpp);
1184
1185         /* page_unresv() is done by the caller */
1186 }
1187
1188 /*
1189  * This function is called to import new spans into the vmem arenas like
1190  * kmem_default_arena and kmem_oversize_arena. It first tries to import
1191  * spans from large page arena - kmem_lp_arena. In order to do this it might
1192  * have to "upgrade the requested size" to kmem_lp_arena quantum. If
1193  * it was not able to satisfy the upgraded request it then calls regular
1194  * segkmem_alloc() that satisfies the request by importing from "*vmp" arena
1195  */
1196 /*ARGSUSED*/
1197 void *
1198 segkmem_alloc_lp(vmem_t *vmp, size_t *sizep, size_t align, int vmflag)
1199 {
1200         size_t size;
1201         kthread_t *t = curthread;
1202         segkmem_lpcb_t *lpcb = &segkmem_lpcb;
1203
1204         ASSERT(sizep != NULL);
1205
1206         size = *sizep;
1207
1208         if (lpcb->lp_uselp && !(t->t_flag & T_PANIC) &&
1209             !(vmflag & SEGKMEM_SHARELOCKED)) {
1210
1211                 size_t kmemlp_qnt = segkmem_kmemlp_quantum;
1212                 size_t asize = P2ROUNDUP(size, kmemlp_qnt);
1213                 void  *addr = NULL;
1214                 ulong_t *lpthrtp = &lpcb->lp_throttle;
1215                 ulong_t lpthrt = *lpthrtp;
1216                 int     dowakeup = 0;
1217                 int     doalloc = 1;
1218
1219                 ASSERT(kmem_lp_arena != NULL);
1220                 ASSERT(asize >= size);
1221
1222                 if (lpthrt != 0) {
1223                         /* try to update the throttle value */
1224                         lpthrt = atomic_inc_ulong_nv(lpthrtp);
1225                         if (lpthrt >= segkmem_lpthrottle_max) {
1226                                 lpthrt = atomic_cas_ulong(lpthrtp, lpthrt,
1227                                     segkmem_lpthrottle_max / 4);
1228                         }
1229
1230                         /*
1231                          * when we get above throttle start do an exponential
1232                          * backoff at trying large pages and reaping
1233                          */
1234                         if (lpthrt > segkmem_lpthrottle_start &&
1235                             !ISP2(lpthrt)) {
1236                                 lpcb->allocs_throttled++;
1237                                 lpthrt--;
1238                                 if (ISP2(lpthrt))
1239                                         kmem_reap();
1240                                 return (segkmem_alloc(vmp, size, vmflag));
1241                         }
1242                 }
1243
1244                 if (!(vmflag & VM_NOSLEEP) &&
1245                     segkmem_heaplp_quantum >= (8 * kmemlp_qnt) &&
1246                     vmem_size(kmem_lp_arena, VMEM_FREE) <= kmemlp_qnt &&
1247                     asize < (segkmem_heaplp_quantum - kmemlp_qnt)) {
1248
1249                         /*
1250                          * we are low on free memory in kmem_lp_arena
1251                          * we let only one guy to allocate heap_lp
1252                          * quantum size chunk that everybody is going to
1253                          * share
1254                          */
1255                         mutex_enter(&lpcb->lp_lock);
1256
1257                         if (lpcb->lp_wait) {
1258
1259                                 /* we are not the first one - wait */
1260                                 cv_wait(&lpcb->lp_cv, &lpcb->lp_lock);
1261                                 if (vmem_size(kmem_lp_arena, VMEM_FREE) <
1262                                     kmemlp_qnt)  {
1263                                         doalloc = 0;
1264                                 }
1265                         } else if (vmem_size(kmem_lp_arena, VMEM_FREE) <=
1266                             kmemlp_qnt) {
1267
1268                                 /*
1269                                  * we are the first one, make sure we import
1270                                  * a large page
1271                                  */
1272                                 if (asize == kmemlp_qnt)
1273                                         asize += kmemlp_qnt;
1274                                 dowakeup = 1;
1275                                 lpcb->lp_wait = 1;
1276                         }
1277
1278                         mutex_exit(&lpcb->lp_lock);
1279                 }
1280
1281                 /*
1282                  * VM_ABORT flag prevents sleeps in vmem_xalloc when
1283                  * large pages are not available. In that case this allocation
1284                  * attempt will fail and we will retry allocation with small
1285                  * pages. We also do not want to panic if this allocation fails
1286                  * because we are going to retry.
1287                  */
1288                 if (doalloc) {
1289                         addr = vmem_alloc(kmem_lp_arena, asize,
1290                             (vmflag | VM_ABORT) & ~VM_PANIC);
1291
1292                         if (dowakeup) {
1293                                 mutex_enter(&lpcb->lp_lock);
1294                                 ASSERT(lpcb->lp_wait != 0);
1295                                 lpcb->lp_wait = 0;
1296                                 cv_broadcast(&lpcb->lp_cv);
1297                                 mutex_exit(&lpcb->lp_lock);
1298                         }
1299                 }
1300
1301                 if (addr != NULL) {
1302                         *sizep = asize;
1303                         *lpthrtp = 0;
1304                         return (addr);
1305                 }
1306
1307                 if (vmflag & VM_NOSLEEP)
1308                         lpcb->nosleep_allocs_failed++;
1309                 else
1310                         lpcb->sleep_allocs_failed++;
1311                 lpcb->alloc_bytes_failed += size;
1312
1313                 /* if large page throttling is not started yet do it */
1314                 if (segkmem_use_lpthrottle && lpthrt == 0) {
1315                         lpthrt = atomic_cas_ulong(lpthrtp, lpthrt, 1);
1316                 }
1317         }
1318         return (segkmem_alloc(vmp, size, vmflag));
1319 }
1320
1321 void
1322 segkmem_free_lp(vmem_t *vmp, void *inaddr, size_t size)
1323 {
1324         if (kmem_lp_arena == NULL || !IS_KMEM_VA_LARGEPAGE((caddr_t)inaddr)) {
1325                 segkmem_free(vmp, inaddr, size);
1326         } else {
1327                 vmem_free(kmem_lp_arena, inaddr, size);
1328         }
1329 }
1330
1331 /*
1332  * segkmem_alloc_lpi() imports virtual memory from large page heap arena
1333  * into kmem_lp arena. In the process it maps the imported segment with
1334  * large pages
1335  */
1336 static void *
1337 segkmem_alloc_lpi(vmem_t *vmp, size_t size, int vmflag)
1338 {
1339         segkmem_lpcb_t *lpcb = &segkmem_lpcb;
1340         void  *addr;
1341
1342         ASSERT(size != 0);
1343         ASSERT(vmp == heap_lp_arena);
1344
1345         /* do not allow large page heap grow beyound limits */
1346         if (vmem_size(vmp, VMEM_ALLOC) >= segkmem_kmemlp_max) {
1347                 lpcb->allocs_limited++;
1348                 return (NULL);
1349         }
1350
1351         addr = segkmem_xalloc_lp(vmp, NULL, size, vmflag, 0,
1352             segkmem_page_create_large, NULL);
1353         return (addr);
1354 }
1355
1356 /*
1357  * segkmem_free_lpi() returns virtual memory back into large page heap arena
1358  * from kmem_lp arena. Beore doing this it unmaps the segment and frees
1359  * large pages used to map it.
1360  */
1361 static void
1362 segkmem_free_lpi(vmem_t *vmp, void *inaddr, size_t size)
1363 {
1364         pgcnt_t         nlpages = size >> segkmem_lpshift;
1365         size_t          lpsize = segkmem_lpsize;
1366         caddr_t         addr = inaddr;
1367         pgcnt_t         npages = btopr(size);
1368         int             i;
1369
1370         ASSERT(vmp == heap_lp_arena);
1371         ASSERT(IS_KMEM_VA_LARGEPAGE(addr));
1372         ASSERT(((uintptr_t)inaddr & (lpsize - 1)) == 0);
1373
1374         for (i = 0; i < nlpages; i++) {
1375                 segkmem_free_one_lp(addr, lpsize);
1376                 addr += lpsize;
1377         }
1378
1379         page_unresv(npages);
1380
1381         vmem_free(vmp, inaddr, size);
1382 }
1383
1384 /*
1385  * This function is called at system boot time by kmem_init right after
1386  * /etc/system file has been read. It checks based on hardware configuration
1387  * and /etc/system settings if system is going to use large pages. The
1388  * initialiazation necessary to actually start using large pages
1389  * happens later in the process after segkmem_heap_lp_init() is called.
1390  */
1391 int
1392 segkmem_lpsetup()
1393 {
1394         int use_large_pages = 0;
1395
1396         return (use_large_pages);
1397 }
1398
1399 void
1400 segkmem_zio_init(void *zio_mem_base, size_t zio_mem_size)
1401 {
1402         ASSERT(zio_mem_base != NULL);
1403         ASSERT(zio_mem_size != 0);
1404
1405         /*
1406          * To reduce VA space fragmentation, we set up quantum caches for the
1407          * smaller sizes;  we chose 32k because that translates to 128k VA
1408          * slabs, which matches nicely with the common 128k zio_data bufs.
1409          */
1410         zio_arena = vmem_create("zfs_file_data", zio_mem_base, zio_mem_size,
1411             PAGESIZE, NULL, NULL, NULL, 32 * 1024, VM_SLEEP);
1412
1413         zio_alloc_arena = vmem_create("zfs_file_data_buf", NULL, 0, PAGESIZE,
1414             segkmem_zio_alloc, segkmem_zio_free, zio_arena, 0, VM_SLEEP);
1415
1416         ASSERT(zio_arena != NULL);
1417         ASSERT(zio_alloc_arena != NULL);
1418 }
1419