[unleashed/tickless.git] kernel/vm/vm_pagelist.c
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
26 * Copyright 2012 Joyent, Inc. All rights reserved.
29 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
30 /* All Rights Reserved */
33 * Portions of this source code were derived from Berkeley 4.3 BSD
34 * under license from the Regents of the University of California.
39 * This file contains common functions to access and manage the page lists.
40 * Many of these routines originated from platform dependent modules
41 * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and were modified to function in
42 * a platform independent manner.
44 * vm/vm_dep.h provides for platform specific support.
47 #include <sys/types.h>
48 #include <sys/debug.h>
49 #include <sys/cmn_err.h>
50 #include <sys/systm.h>
51 #include <sys/atomic.h>
52 #include <sys/sysmacros.h>
53 #include <vm/as.h>
54 #include <vm/page.h>
55 #include <vm/seg_kmem.h>
56 #include <vm/seg_vn.h>
57 #include <sys/vmsystm.h>
58 #include <sys/memnode.h>
59 #include <vm/vm_dep.h>
60 #include <sys/lgrp.h>
61 #include <sys/mem_config.h>
62 #include <sys/callb.h>
63 #include <sys/mem_cage.h>
64 #include <sys/sdt.h>
65 #include <sys/dumphdr.h>
66 #include <sys/swap.h>
68 extern uint_t vac_colors;
70 #define MAX_PRAGMA_ALIGN 128
72 /* vm_cpu_data0 for the boot cpu before kmem is initialized */
74 #if L2CACHE_ALIGN_MAX <= MAX_PRAGMA_ALIGN
75 #pragma align L2CACHE_ALIGN_MAX(vm_cpu_data0)
76 #else
77 #pragma align MAX_PRAGMA_ALIGN(vm_cpu_data0)
78 #endif
79 char vm_cpu_data0[VM_CPU_DATA_PADSIZE];
82 * number of page colors equivalent to the requested color in page_get routines.
83 * If set, keeps large pages intact longer and keeps MPO allocation
84 * from the local mnode in favor of acquiring the 'correct' page color from
85 * a demoted large page or from a remote mnode.
87 uint_t colorequiv;
90 * color equivalency mask for each page size.
91 * Mask is computed based on cpu L2$ way sizes and colorequiv global.
92 * High 4 bits determine the number of high order bits of the color to ignore.
93 * Low 4 bits determine the number of low order bits of color to ignore (it's only
94 * relevant for hashed index based page coloring).
96 uchar_t colorequivszc[MMU_PAGE_SIZES];
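/*
 * For illustration only: a hypothetical colorequivszc[szc] value of 0x21
 * would mean "ignore the 2 high order bits and the 1 low order bit of the
 * color for this page size", i.e. 2^3 = 8 colors are treated as equivalent
 * when searching the freelists.
 */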
99 * if set, specifies the percentage of large pages that are free from within
100 * a large page region before attempting to lock those pages for
101 * page_get_contig_pages processing.
103 * Should be turned on when kpr is available, since page_trylock_contig_pages
104 * can then be more selective.
107 int ptcpthreshold;
110 * Limit page get contig page search based on failure cnts in pgcpfailcnt[].
111 * Enabled by default via pgcplimitsearch.
113 * pgcpfailcnt[] is bounded by PGCPFAILMAX (>= 1/2 of installed
114 * memory). When reached, pgcpfailcnt[] is reset to 1/2 of this upper
115 * bound. This upper bound range guarantees:
116 * - all large page 'slots' will be searched over time
117 * - at least one large page candidate is considered on each pgcp call
118 * - count doesn't wrap around to 0
120 pgcnt_t pgcpfailcnt[MMU_PAGE_SIZES];
121 int pgcplimitsearch = 1;
123 #define PGCPFAILMAX (1 << (highbit(physinstalled) - 1))
124 #define SETPGCPFAILCNT(szc) \
125 if (++pgcpfailcnt[szc] >= PGCPFAILMAX) \
126 pgcpfailcnt[szc] = PGCPFAILMAX / 2;
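/*
 * Worked example with a hypothetical configuration: if physinstalled is
 * 0x180000 base pages, highbit(physinstalled) is 21 and PGCPFAILMAX is
 * 1 << 20 = 0x100000, which is at least half of installed memory. Once
 * pgcpfailcnt[szc] reaches that bound, SETPGCPFAILCNT() resets it to
 * 0x80000 instead of letting it grow without bound.
 */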
128 #ifdef VM_STATS
129 struct vmm_vmstats_str vmm_vmstats;
131 #endif /* VM_STATS */
133 /* enable page_get_contig_pages */
134 #define LPGCREATE 1
136 int pg_contig_disable;
137 int pg_lpgcreate_nocage = LPGCREATE;
140 * page_freelist_split pfn flag to signify no lo or hi pfn requirement.
142 #define PFNNULL 0
144 /* Flags involved in promotion and demotion routines */
145 #define PC_FREE 0x1 /* put page on freelist */
146 #define PC_ALLOC 0x2 /* return page for allocation */
149 * Flag for page_demote to be used with PC_FREE to denote that we don't care
150 * what the color is as the color parameter to the function is ignored.
152 #define PC_NO_COLOR (-1)
154 /* mtype value for page_promote to use when mtype does not matter */
155 #define PC_MTYPE_ANY (-1)
158 * page counters candidates info
159 * See page_ctrs_cands comment below for more details.
160 * fields are as follows:
161 * pcc_pages_free: # pages which freelist coalesce can create
162 * pcc_color_free: pointer to page free counts per color
164 typedef struct pcc_info {
165 pgcnt_t pcc_pages_free;
166 pgcnt_t *pcc_color_free;
167 uint_t pad[12];
168 } pcc_info_t;
171 * On big machines it can take a long time to check page_counters
172 * arrays. page_ctrs_cands is a summary array whose elements are a dynamically
173 * updated sum of all elements of the corresponding page_counters arrays.
174 * page_freelist_coalesce() searches page_counters only if an appropriate
175 * element of page_ctrs_cands array is greater than 0.
177 * page_ctrs_cands is indexed by mutex (i), region (r), mnode (m), mrange (g)
179 pcc_info_t **page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES];
182 * Return in val the total number of free pages which can be created
183 * for the given mnode (m), mrange (g), and region size (r)
185 #define PGCTRS_CANDS_GETVALUE(m, g, r, val) { \
186 int i; \
187 val = 0; \
188 for (i = 0; i < NPC_MUTEX; i++) { \
189 val += page_ctrs_cands[i][(r)][(m)][(g)].pcc_pages_free; \
194 * Return in val the total number of free pages which can be created
195 * for the given mnode (m), mrange (g), region size (r), and color (c)
197 #define PGCTRS_CANDS_GETVALUECOLOR(m, g, r, c, val) { \
198 int i; \
199 val = 0; \
200 ASSERT((c) < PAGE_GET_PAGECOLORS(r)); \
201 for (i = 0; i < NPC_MUTEX; i++) { \
202 val += \
203 page_ctrs_cands[i][(r)][(m)][(g)].pcc_color_free[(c)]; \
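/*
 * Minimal usage sketch (illustrative only; see page_freelist_coalesce()
 * for how the candidate counts are actually consulted):
 *
 *	pgcnt_t cands;
 *
 *	PGCTRS_CANDS_GETVALUECOLOR(mnode, mrange, szc, color, cands);
 *	if (cands == 0)
 *		return (NULL);
 *
 * i.e. if no free pages of the requested size and color can be assembled,
 * the expensive page_counters scan is skipped entirely.
 */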
208 * We can only allow a single thread to update a counter within the physical
209 * range of the largest supported page size. That is the finest granularity
210 * possible since the counter values are dependent on each other
211 * as you move across region sizes. PP_CTR_LOCK_INDX is used to determine the
212 * ctr_mutex lock index for a particular physical range.
214 static kmutex_t *ctr_mutex[NPC_MUTEX];
216 #define PP_CTR_LOCK_INDX(pp) \
217 (((pp)->p_pagenum >> \
218 (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1))
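/*
 * For example, assuming NPC_MUTEX is 16 and the largest page size spans
 * 512 base pages (PAGE_BSZS_SHIFT(mmu_page_sizes - 1) == 9), a page with
 * p_pagenum 0x12345 hashes to index (0x12345 >> 9) & 0xf == 0x1, and every
 * page inside the same largest-size region hashes to the same ctr_mutex
 * entry.
 */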
220 #define INVALID_COLOR 0xffffffff
221 #define INVALID_MASK 0xffffffff
224 * Local functions prototypes.
227 void page_ctr_add(int, int, page_t *, int);
228 void page_ctr_add_internal(int, int, page_t *, int);
229 void page_ctr_sub(int, int, page_t *, int);
230 void page_ctr_sub_internal(int, int, page_t *, int);
231 void page_freelist_lock(int);
232 void page_freelist_unlock(int);
233 page_t *page_promote(int, pfn_t, uchar_t, int, int);
234 page_t *page_demote(int, pfn_t, pfn_t, uchar_t, uchar_t, int, int);
235 page_t *page_freelist_split(uchar_t,
236 uint_t, int, int, pfn_t, pfn_t, page_list_walker_t *);
237 page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);
238 static int page_trylock_cons(page_t *pp, se_t se);
241 * The page_counters array below is used to keep track of free contiguous
242 * physical memory. A hw_page_map_t will be allocated per mnode per szc.
243 * This contains an array of counters, the size of the array, a shift value
244 * used to convert a pagenum into a counter array index or vice versa, as
245 * well as a cache of the last successful index to be promoted to a larger
246 * page size. As an optimization, we keep track of the last successful index
247 * to be promoted per page color for the given size region, and this is
248 * allocated dynamically based upon the number of colors for a given
249 * region size.
251 * Conceptually, the page counters are represented as:
253 * page_counters[region_size][mnode]
255 * region_size: size code of a candidate larger page made up
256 * of contiguous free smaller pages.
258 * page_counters[region_size][mnode].hpm_counters[index]:
259 * represents how many (region_size - 1) pages either
260 * exist or can be created within the given index range.
262 * Let's look at a sparc example:
263 * If we want to create a free 512k page, we look at region_size 2
264 * for the mnode we want. We calculate the index and look at a specific
265 * hpm_counters location. If we see 8 (FULL_REGION_CNT on sparc) at
266 * this location, it means that 8 64k pages either exist or can be created
267 * from 8K pages in order to make a single free 512k page at the given
268 * index. Note that when a region is full, it will contribute to the
269 * counts in the region above it. Thus we will not know what page
270 * size the free pages that can be promoted into this new free page
271 * will be unless we look at all regions below the current region.
275 * Note: hpmctr_t is defined in platform vm_dep.h
276 * hw_page_map_t contains all the information needed for the page_counters
277 * logic. The fields are as follows:
279 * hpm_counters: dynamically allocated array to hold counter data
280 * hpm_entries: entries in hpm_counters
281 * hpm_shift: shift for pnum/array index conv
282 * hpm_base: PFN mapped to counter index 0
283 * hpm_color_current: last index in counter array for this color at
284 * which we successfully created a large page
286 typedef struct hw_page_map {
287 hpmctr_t *hpm_counters;
288 size_t hpm_entries;
289 int hpm_shift;
290 pfn_t hpm_base;
291 size_t *hpm_color_current[MAX_MNODE_MRANGES];
292 } hw_page_map_t;
295 * Element zero is not used, but is allocated for convenience.
297 static hw_page_map_t *page_counters[MMU_PAGE_SIZES];
300 * Cached value of MNODE_RANGE_CNT(mnode).
301 * This is a function call in x86.
303 static int mnode_nranges[MAX_MEM_NODES];
304 static int mnode_maxmrange[MAX_MEM_NODES];
307 * The following macros are convenient ways to get access to the individual
308 * elements of the page_counters arrays. They can be used on both
309 * the left side and right side of equations.
311 #define PAGE_COUNTERS(mnode, rg_szc, idx) \
312 (page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)])
314 #define PAGE_COUNTERS_COUNTERS(mnode, rg_szc) \
315 (page_counters[(rg_szc)][(mnode)].hpm_counters)
317 #define PAGE_COUNTERS_SHIFT(mnode, rg_szc) \
318 (page_counters[(rg_szc)][(mnode)].hpm_shift)
320 #define PAGE_COUNTERS_ENTRIES(mnode, rg_szc) \
321 (page_counters[(rg_szc)][(mnode)].hpm_entries)
323 #define PAGE_COUNTERS_BASE(mnode, rg_szc) \
324 (page_counters[(rg_szc)][(mnode)].hpm_base)
326 #define PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc, g) \
327 (page_counters[(rg_szc)][(mnode)].hpm_color_current[(g)])
329 #define PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color, mrange) \
330 (page_counters[(rg_szc)][(mnode)]. \
331 hpm_color_current[(mrange)][(color)])
333 #define PNUM_TO_IDX(mnode, rg_szc, pnum) \
334 (((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >> \
335 PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))
337 #define IDX_TO_PNUM(mnode, rg_szc, index) \
338 (PAGE_COUNTERS_BASE((mnode), (rg_szc)) + \
339 ((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc))))
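/*
 * Worked example (hypothetical numbers): if for region size 1 hpm_base is
 * 0x10000 and hpm_shift is 3 (8 base pages per region, as in the sparc
 * example above), then for pnum 0x10025:
 *
 *	PNUM_TO_IDX(mnode, 1, 0x10025) == (0x10025 - 0x10000) >> 3 == 0x4
 *	IDX_TO_PNUM(mnode, 1, 0x4)     == 0x10000 + (0x4 << 3)    == 0x10020
 *
 * i.e. converting back yields the first pfn of the region, which is the
 * identity property the ASSERTs in page_ctrs_alloc() check at the region
 * aligned base.
 */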
342 * Protects the hpm_counters and hpm_color_current memory from changing while
343 * looking at page counters information.
344 * Grab the write lock to modify what these fields point at.
345 * Grab the read lock to prevent any pointers from changing.
346 * The write lock can not be held during memory allocation due to a possible
347 * recursion deadlock with trying to grab the read lock while the
348 * write lock is already held.
350 krwlock_t page_ctrs_rwlock[MAX_MEM_NODES];
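/*
 * Intended usage pattern, roughly (see page_ctrs_adjust() and
 * page_freelist_coalesce() below): readers wrap any walk of hpm_counters
 * or hpm_color_current in rw_enter(..., RW_READER)/rw_exit(), while a
 * writer preallocates its replacement arrays with KM_NOSLEEP first, then
 * takes PAGE_CTRS_WRITE_LOCK(mnode), swaps the pointers, drops the lock,
 * and only then frees the old arrays.
 */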
354 * initialize cpu_vm_data to point at cache aligned vm_cpu_data_t.
356 void
357 cpu_vm_data_init(struct cpu *cp)
359 if (cp == CPU0) {
360 cp->cpu_vm_data = (void *)&vm_cpu_data0;
361 } else {
362 void *kmptr;
363 int align;
364 size_t sz;
366 align = (L2CACHE_ALIGN) ? L2CACHE_ALIGN : L2CACHE_ALIGN_MAX;
367 sz = P2ROUNDUP(sizeof (vm_cpu_data_t), align) + align;
368 kmptr = kmem_zalloc(sz, KM_SLEEP);
369 cp->cpu_vm_data = (void *) P2ROUNDUP((uintptr_t)kmptr, align);
370 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr = kmptr;
371 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize = sz;
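/*
 * Alignment arithmetic, with hypothetical numbers: for an L2 line size of
 * 64 bytes and sizeof (vm_cpu_data_t) == 200, sz is
 * P2ROUNDUP(200, 64) + 64 == 256 + 64 == 320. If kmem_zalloc() returns
 * 0x30000028, cpu_vm_data is rounded up to 0x30000040; the extra "align"
 * bytes of slop guarantee the aligned pointer still has room for the
 * whole structure within the allocation.
 */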
376 * free cpu_vm_data
378 void
379 cpu_vm_data_destroy(struct cpu *cp)
381 if (cp->cpu_seqid && cp->cpu_vm_data) {
382 ASSERT(cp != CPU0);
383 kmem_free(((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr,
384 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize);
386 cp->cpu_vm_data = NULL;
391 * page size to page size code
394 page_szc(size_t pagesize)
396 int i = 0;
398 while (hw_page_array[i].hp_size) {
399 if (pagesize == hw_page_array[i].hp_size)
400 return (i);
401 i++;
403 return (-1);
407 * page size to page size code with the restriction that it be a supported
408 * user page size. If it's not a supported user page size, -1 will be returned.
411 page_szc_user_filtered(size_t pagesize)
413 int szc = page_szc(pagesize);
414 if ((szc != -1) && (SZC_2_USERSZC(szc) != -1)) {
415 return (szc);
417 return (-1);
421 * Return how many page sizes are available for the user to use. This is
422 * what the hardware supports and not based upon how the OS implements the
423 * support of different page sizes.
425 * If legacy is non-zero, return the number of pagesizes available to legacy
426 * applications. The number of legacy page sizes might be less than the
427 * exported user page sizes. This is to prevent legacy applications that
428 * use the largest page size returned from getpagesizes(3c) from inadvertently
429 * using the 'new' large pagesizes.
431 uint_t
432 page_num_user_pagesizes(int legacy)
434 if (legacy)
435 return (mmu_legacy_page_sizes);
436 return (mmu_exported_page_sizes);
439 uint_t
440 page_num_pagesizes(void)
442 return (mmu_page_sizes);
446 * returns the number of base pagesize pages associated with szc
448 pgcnt_t
449 page_get_pagecnt(uint_t szc)
451 if (szc >= mmu_page_sizes)
452 panic("page_get_pagecnt: out of range %d", szc);
453 return (hw_page_array[szc].hp_pgcnt);
456 size_t
457 page_get_pagesize(uint_t szc)
459 if (szc >= mmu_page_sizes)
460 panic("page_get_pagesize: out of range %d", szc);
461 return (hw_page_array[szc].hp_size);
465 * Return the size of a page based upon the index passed in. An index of
466 * zero refers to the smallest page size in the system, and as index increases
467 * it refers to the next larger supported page size in the system.
468 * Note that szc and userszc may not be the same due to unsupported szc's on
469 * some systems.
471 size_t
472 page_get_user_pagesize(uint_t userszc)
474 uint_t szc = USERSZC_2_SZC(userszc);
476 if (szc >= mmu_page_sizes)
477 panic("page_get_user_pagesize: out of range %d", szc);
478 return (hw_page_array[szc].hp_size);
481 uint_t
482 page_get_shift(uint_t szc)
484 if (szc >= mmu_page_sizes)
485 panic("page_get_shift: out of range %d", szc);
486 return (PAGE_GET_SHIFT(szc));
489 uint_t
490 page_get_pagecolors(uint_t szc)
492 if (szc >= mmu_page_sizes)
493 panic("page_get_pagecolors: out of range %d", szc);
494 return (PAGE_GET_PAGECOLORS(szc));
498 * this assigns the desired equivalent color after a split
500 uint_t
501 page_correct_color(uchar_t szc, uchar_t nszc, uint_t color,
502 uint_t ncolor, uint_t ceq_mask)
504 ASSERT(nszc > szc);
505 ASSERT(szc < mmu_page_sizes);
506 ASSERT(color < PAGE_GET_PAGECOLORS(szc));
507 ASSERT(ncolor < PAGE_GET_PAGECOLORS(nszc));
509 color &= ceq_mask;
510 ncolor = PAGE_CONVERT_COLOR(ncolor, szc, nszc);
511 return (color | (ncolor & ~ceq_mask));
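/*
 * For example (hypothetical values): with ceq_mask 0x07, a requested color
 * of 0x2b and a parent color that PAGE_CONVERT_COLOR() maps to 0x58, the
 * result is (0x2b & 0x07) | (0x58 & ~0x07) == 0x03 | 0x58 == 0x5b, i.e.
 * the equivalent low order bits come from the requested color and the
 * remaining bits are inherited from the large page being split.
 */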
515 * The interleaved_mnodes flag is set when mnodes overlap in
516 * the physbase..physmax range, but have disjoint slices.
517 * In this case hpm_counters is shared by all mnodes.
518 * This flag is set dynamically by the platform.
520 int interleaved_mnodes = 0;
523 * Called by startup().
524 * Size up the per page size free list counters based on physmax
525 * of each node and max_mem_nodes.
527 * If interleaved_mnodes is set we need to find the first mnode that
528 * exists. hpm_counters for the first mnode will then be shared by
529 * all other mnodes. If interleaved_mnodes is not set, just set
530 * first=mnode each time. That means there will be no sharing.
532 size_t
533 page_ctrs_sz(void)
535 int r; /* region size */
536 int mnode;
537 int firstmn; /* first mnode that exists */
538 int nranges;
539 pfn_t physbase;
540 pfn_t physmax;
541 uint_t ctrs_sz = 0;
542 int i;
543 pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
546 * We need to determine how many page colors there are for each
547 * page size in order to allocate memory for any color specific
548 * arrays.
550 for (i = 0; i < mmu_page_sizes; i++) {
551 colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
554 for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {
556 pgcnt_t r_pgcnt;
557 pfn_t r_base;
558 pgcnt_t r_align;
560 if (mem_node_config[mnode].exists == 0)
561 continue;
563 HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);
564 nranges = MNODE_RANGE_CNT(mnode);
565 mnode_nranges[mnode] = nranges;
566 mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
569 * determine size needed for page counter arrays with
570 * base aligned to large page size.
572 for (r = 1; r < mmu_page_sizes; r++) {
573 /* add in space for hpm_color_current */
574 ctrs_sz += sizeof (size_t) *
575 colors_per_szc[r] * nranges;
577 if (firstmn != mnode)
578 continue;
580 /* add in space for hpm_counters */
581 r_align = page_get_pagecnt(r);
582 r_base = physbase;
583 r_base &= ~(r_align - 1);
584 r_pgcnt = howmany(physmax - r_base + 1, r_align);
587 * Round up to always allocate on pointer sized
588 * boundaries.
590 ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)),
591 sizeof (hpmctr_t *));
595 for (r = 1; r < mmu_page_sizes; r++) {
596 ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t));
599 /* add in space for page_ctrs_cands and pcc_color_free */
600 ctrs_sz += sizeof (pcc_info_t *) * max_mem_nodes *
601 mmu_page_sizes * NPC_MUTEX;
603 for (mnode = 0; mnode < max_mem_nodes; mnode++) {
605 if (mem_node_config[mnode].exists == 0)
606 continue;
608 nranges = mnode_nranges[mnode];
609 ctrs_sz += sizeof (pcc_info_t) * nranges *
610 mmu_page_sizes * NPC_MUTEX;
611 for (r = 1; r < mmu_page_sizes; r++) {
612 ctrs_sz += sizeof (pgcnt_t) * nranges *
613 colors_per_szc[r] * NPC_MUTEX;
617 /* ctr_mutex */
618 ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t));
620 /* size for page list counts */
621 PLCNT_SZ(ctrs_sz);
624 * add some slop for roundups. page_ctrs_alloc will round up the start
625 * address of the counters to ecache_alignsize boundary for every
626 * memory node.
628 return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN);
631 caddr_t
632 page_ctrs_alloc(caddr_t alloc_base)
634 int mnode;
635 int mrange, nranges;
636 int r; /* region size */
637 int i;
638 int firstmn; /* first mnode that exists */
639 pfn_t physbase;
640 pfn_t physmax;
641 pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
644 * We need to determine how many page colors there are for each
645 * page size in order to allocate memory for any color specific
646 * arrays.
648 for (i = 0; i < mmu_page_sizes; i++) {
649 colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
652 for (r = 1; r < mmu_page_sizes; r++) {
653 page_counters[r] = (hw_page_map_t *)alloc_base;
654 alloc_base += (max_mem_nodes * sizeof (hw_page_map_t));
657 /* page_ctrs_cands and pcc_color_free array */
658 for (i = 0; i < NPC_MUTEX; i++) {
659 for (r = 1; r < mmu_page_sizes; r++) {
661 page_ctrs_cands[i][r] = (pcc_info_t **)alloc_base;
662 alloc_base += sizeof (pcc_info_t *) * max_mem_nodes;
664 for (mnode = 0; mnode < max_mem_nodes; mnode++) {
665 pcc_info_t *pi;
667 if (mem_node_config[mnode].exists == 0)
668 continue;
670 nranges = mnode_nranges[mnode];
672 pi = (pcc_info_t *)alloc_base;
673 alloc_base += sizeof (pcc_info_t) * nranges;
674 page_ctrs_cands[i][r][mnode] = pi;
676 for (mrange = 0; mrange < nranges; mrange++) {
677 pi->pcc_color_free =
678 (pgcnt_t *)alloc_base;
679 alloc_base += sizeof (pgcnt_t) *
680 colors_per_szc[r];
681 pi++;
687 /* ctr_mutex */
688 for (i = 0; i < NPC_MUTEX; i++) {
689 ctr_mutex[i] = (kmutex_t *)alloc_base;
690 alloc_base += (max_mem_nodes * sizeof (kmutex_t));
693 /* initialize page list counts */
694 PLCNT_INIT(alloc_base);
696 for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {
698 pgcnt_t r_pgcnt;
699 pfn_t r_base;
700 pgcnt_t r_align;
701 int r_shift;
702 int nranges = mnode_nranges[mnode];
704 if (mem_node_config[mnode].exists == 0)
705 continue;
707 HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);
709 for (r = 1; r < mmu_page_sizes; r++) {
711 * the page_counters base has to be aligned to the
712 * page count of page size code r otherwise the counts
713 * will cross large page boundaries.
715 r_align = page_get_pagecnt(r);
716 r_base = physbase;
717 /* base needs to be aligned - lower to aligned value */
718 r_base &= ~(r_align - 1);
719 r_pgcnt = howmany(physmax - r_base + 1, r_align);
720 r_shift = PAGE_BSZS_SHIFT(r);
722 PAGE_COUNTERS_SHIFT(mnode, r) = r_shift;
723 PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt;
724 PAGE_COUNTERS_BASE(mnode, r) = r_base;
725 for (mrange = 0; mrange < nranges; mrange++) {
726 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
727 r, mrange) = (size_t *)alloc_base;
728 alloc_base += sizeof (size_t) *
729 colors_per_szc[r];
731 for (i = 0; i < colors_per_szc[r]; i++) {
732 uint_t color_mask = colors_per_szc[r] - 1;
733 pfn_t pfnum = r_base;
734 size_t idx;
735 int mrange;
736 MEM_NODE_ITERATOR_DECL(it);
738 MEM_NODE_ITERATOR_INIT(pfnum, mnode, r, &it);
739 if (pfnum == (pfn_t)-1) {
740 idx = 0;
741 } else {
742 PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
743 color_mask, color_mask, &it);
744 idx = PNUM_TO_IDX(mnode, r, pfnum);
745 idx = (idx >= r_pgcnt) ? 0 : idx;
747 for (mrange = 0; mrange < nranges; mrange++) {
748 PAGE_COUNTERS_CURRENT_COLOR(mnode,
749 r, i, mrange) = idx;
753 /* hpm_counters may be shared by all mnodes */
754 if (firstmn == mnode) {
755 PAGE_COUNTERS_COUNTERS(mnode, r) =
756 (hpmctr_t *)alloc_base;
757 alloc_base +=
758 P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt),
759 sizeof (hpmctr_t *));
760 } else {
761 PAGE_COUNTERS_COUNTERS(mnode, r) =
762 PAGE_COUNTERS_COUNTERS(firstmn, r);
766 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
767 * satisfy the identity requirement.
768 * We should be able to go from one to the other
769 * and get consistent values.
771 ASSERT(PNUM_TO_IDX(mnode, r,
772 (IDX_TO_PNUM(mnode, r, 0))) == 0);
773 ASSERT(IDX_TO_PNUM(mnode, r,
774 (PNUM_TO_IDX(mnode, r, r_base))) == r_base);
777 * Round up the start address of the page_counters to
778 * cache aligned boundary for every memory node.
779 * page_ctrs_sz() has added some slop for these roundups.
781 alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base,
782 L2CACHE_ALIGN);
785 /* Initialize other page counter specific data structures. */
786 for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) {
787 rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL);
790 return (alloc_base);
794 * Functions to adjust region counters for each size free list.
795 * The caller is responsible for acquiring the ctr_mutex lock if necessary,
796 * and thus these functions can be called during startup without locks.
798 /* ARGSUSED */
799 void
800 page_ctr_add_internal(int mnode, int mtype, page_t *pp, int flags)
802 ssize_t r; /* region size */
803 ssize_t idx;
804 pfn_t pfnum;
805 int lckidx;
807 ASSERT(mnode == PP_2_MEM_NODE(pp));
808 ASSERT(mtype == PP_2_MTYPE(pp));
810 ASSERT(pp->p_szc < mmu_page_sizes);
812 PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
814 /* no counter update needed for largest page size */
815 if (pp->p_szc >= mmu_page_sizes - 1) {
816 return;
819 r = pp->p_szc + 1;
820 pfnum = pp->p_pagenum;
821 lckidx = PP_CTR_LOCK_INDX(pp);
824 * Increment the count of free pages for the current
825 * region. Continue looping up in region size, incrementing the
826 * count if the preceding region is full.
828 while (r < mmu_page_sizes) {
829 idx = PNUM_TO_IDX(mnode, r, pfnum);
831 ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
832 ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r));
834 if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r)) {
835 break;
836 } else {
837 int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
838 pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
839 [MTYPE_2_MRANGE(mnode, root_mtype)];
841 cand->pcc_pages_free++;
842 cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]++;
844 r++;
848 void
849 page_ctr_add(int mnode, int mtype, page_t *pp, int flags)
851 int lckidx = PP_CTR_LOCK_INDX(pp);
852 kmutex_t *lock = &ctr_mutex[lckidx][mnode];
854 mutex_enter(lock);
855 page_ctr_add_internal(mnode, mtype, pp, flags);
856 mutex_exit(lock);
859 void
860 page_ctr_sub_internal(int mnode, int mtype, page_t *pp, int flags)
862 int lckidx;
863 ssize_t r; /* region size */
864 ssize_t idx;
865 pfn_t pfnum;
867 ASSERT(mnode == PP_2_MEM_NODE(pp));
868 ASSERT(mtype == PP_2_MTYPE(pp));
870 ASSERT(pp->p_szc < mmu_page_sizes);
872 PLCNT_DECR(pp, mnode, mtype, pp->p_szc, flags);
874 /* no counter update needed for largest page size */
875 if (pp->p_szc >= mmu_page_sizes - 1) {
876 return;
879 r = pp->p_szc + 1;
880 pfnum = pp->p_pagenum;
881 lckidx = PP_CTR_LOCK_INDX(pp);
884 * Decrement the count of free pages for the current
885 * region. Continue looping up in region size, decrementing the
886 * count if the preceding region was full.
888 while (r < mmu_page_sizes) {
889 idx = PNUM_TO_IDX(mnode, r, pfnum);
891 ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
892 ASSERT(PAGE_COUNTERS(mnode, r, idx) > 0);
894 if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1) {
895 break;
896 } else {
897 int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
898 pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
899 [MTYPE_2_MRANGE(mnode, root_mtype)];
901 ASSERT(cand->pcc_pages_free != 0);
902 ASSERT(cand->pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0);
904 cand->pcc_pages_free--;
905 cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]--;
907 r++;
911 void
912 page_ctr_sub(int mnode, int mtype, page_t *pp, int flags)
914 int lckidx = PP_CTR_LOCK_INDX(pp);
915 kmutex_t *lock = &ctr_mutex[lckidx][mnode];
917 mutex_enter(lock);
918 page_ctr_sub_internal(mnode, mtype, pp, flags);
919 mutex_exit(lock);
923 * Adjust page counters following a memory attach, since typically the
924 * size of the array needs to change, and the PFN to counter index
925 * mapping needs to change.
927 * It is possible this mnode did not exist at startup. In that case
928 * allocate pcc_info_t and pcc_color_free arrays. Also, allow for nranges
929 * to change (a theoretical possibility on x86), which means pcc_color_free
930 * arrays must be extended.
932 uint_t
933 page_ctrs_adjust(int mnode)
935 pgcnt_t npgs;
936 int r; /* region size */
937 int i;
938 size_t pcsz, old_csz;
939 hpmctr_t *new_ctr, *old_ctr;
940 pfn_t oldbase, newbase;
941 pfn_t physbase, physmax;
942 size_t old_npgs;
943 hpmctr_t *ctr_cache[MMU_PAGE_SIZES];
944 size_t size_cache[MMU_PAGE_SIZES];
945 size_t *color_cache[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
946 size_t *old_color_array[MAX_MNODE_MRANGES];
947 pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
948 pcc_info_t **cands_cache;
949 pcc_info_t *old_pi, *pi;
950 pgcnt_t *pgcntp;
951 int nr, old_nranges, mrange, nranges = MNODE_RANGE_CNT(mnode);
952 int cands_cache_nranges;
953 int old_maxmrange, new_maxmrange;
954 int rc = 0;
955 int oldmnode;
957 cands_cache = kmem_zalloc(sizeof (pcc_info_t *) * NPC_MUTEX *
958 MMU_PAGE_SIZES, KM_NOSLEEP);
959 if (cands_cache == NULL)
960 return (ENOMEM);
962 i = -1;
963 HPM_COUNTERS_LIMITS(mnode, physbase, physmax, i);
965 newbase = physbase & ~PC_BASE_ALIGN_MASK;
966 npgs = roundup(physmax, PC_BASE_ALIGN) - newbase;
968 /* prepare to free non-null pointers on the way out */
969 cands_cache_nranges = nranges;
970 bzero(ctr_cache, sizeof (ctr_cache));
971 bzero(color_cache, sizeof (color_cache));
974 * We need to determine how many page colors there are for each
975 * page size in order to allocate memory for any color specific
976 * arrays.
978 for (r = 0; r < mmu_page_sizes; r++) {
979 colors_per_szc[r] = PAGE_GET_PAGECOLORS(r);
983 * Preallocate all of the new hpm_counters arrays as we can't
984 * hold the page_ctrs_rwlock as a writer and allocate memory.
985 * If we can't allocate all of the arrays, undo our work so far
986 * and return failure.
988 for (r = 1; r < mmu_page_sizes; r++) {
989 pcsz = npgs >> PAGE_BSZS_SHIFT(r);
990 size_cache[r] = pcsz;
991 ctr_cache[r] = kmem_zalloc(pcsz *
992 sizeof (hpmctr_t), KM_NOSLEEP);
993 if (ctr_cache[r] == NULL) {
994 rc = ENOMEM;
995 goto cleanup;
1000 * Preallocate all of the new color current arrays as we can't
1001 * hold the page_ctrs_rwlock as a writer and allocate memory.
1002 * If we can't allocate all of the arrays, undo our work so far
1003 * and return failure.
1005 for (r = 1; r < mmu_page_sizes; r++) {
1006 for (mrange = 0; mrange < nranges; mrange++) {
1007 color_cache[r][mrange] = kmem_zalloc(sizeof (size_t) *
1008 colors_per_szc[r], KM_NOSLEEP);
1009 if (color_cache[r][mrange] == NULL) {
1010 rc = ENOMEM;
1011 goto cleanup;
1017 * Preallocate all of the new pcc_info_t arrays as we can't
1018 * hold the page_ctrs_rwlock as a writer and allocate memory.
1019 * If we can't allocate all of the arrays, undo our work so far
1020 * and return failure.
1022 for (r = 1; r < mmu_page_sizes; r++) {
1023 for (i = 0; i < NPC_MUTEX; i++) {
1024 pi = kmem_zalloc(nranges * sizeof (pcc_info_t),
1025 KM_NOSLEEP);
1026 if (pi == NULL) {
1027 rc = ENOMEM;
1028 goto cleanup;
1030 cands_cache[i * MMU_PAGE_SIZES + r] = pi;
1032 for (mrange = 0; mrange < nranges; mrange++, pi++) {
1033 pgcntp = kmem_zalloc(colors_per_szc[r] *
1034 sizeof (pgcnt_t), KM_NOSLEEP);
1035 if (pgcntp == NULL) {
1036 rc = ENOMEM;
1037 goto cleanup;
1039 pi->pcc_color_free = pgcntp;
1045 * Grab the write lock to prevent others from walking these arrays
1046 * while we are modifying them.
1048 PAGE_CTRS_WRITE_LOCK(mnode);
1051 * For interleaved mnodes, find the first mnode
1052 * with valid page counters since the current
1053 * mnode may have just been added and not have
1054 * valid page counters.
1056 if (interleaved_mnodes) {
1057 for (i = 0; i < max_mem_nodes; i++)
1058 if (PAGE_COUNTERS_COUNTERS(i, 1) != NULL)
1059 break;
1060 ASSERT(i < max_mem_nodes);
1061 oldmnode = i;
1062 } else
1063 oldmnode = mnode;
1065 old_nranges = mnode_nranges[mnode];
1066 cands_cache_nranges = old_nranges;
1067 mnode_nranges[mnode] = nranges;
1068 old_maxmrange = mnode_maxmrange[mnode];
1069 mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
1070 new_maxmrange = mnode_maxmrange[mnode];
1072 for (r = 1; r < mmu_page_sizes; r++) {
1073 PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r);
1074 old_ctr = PAGE_COUNTERS_COUNTERS(oldmnode, r);
1075 old_csz = PAGE_COUNTERS_ENTRIES(oldmnode, r);
1076 oldbase = PAGE_COUNTERS_BASE(oldmnode, r);
1077 old_npgs = old_csz << PAGE_COUNTERS_SHIFT(oldmnode, r);
1078 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1079 old_color_array[mrange] =
1080 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
1081 r, mrange);
1084 pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r);
1085 new_ctr = ctr_cache[r];
1086 ctr_cache[r] = NULL;
1087 if (old_ctr != NULL &&
1088 (oldbase + old_npgs > newbase) &&
1089 (newbase + npgs > oldbase)) {
1091 * Map the intersection of the old and new
1092 * counters into the new array.
1094 size_t offset;
1095 if (newbase > oldbase) {
1096 offset = (newbase - oldbase) >>
1097 PAGE_COUNTERS_SHIFT(mnode, r);
1098 bcopy(old_ctr + offset, new_ctr,
1099 MIN(pcsz, (old_csz - offset)) *
1100 sizeof (hpmctr_t));
1101 } else {
1102 offset = (oldbase - newbase) >>
1103 PAGE_COUNTERS_SHIFT(mnode, r);
1104 bcopy(old_ctr, new_ctr + offset,
1105 MIN(pcsz - offset, old_csz) *
1106 sizeof (hpmctr_t));
1110 PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr;
1111 PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz;
1112 PAGE_COUNTERS_BASE(mnode, r) = newbase;
1114 /* update shared hpm_counters in other mnodes */
1115 if (interleaved_mnodes) {
1116 for (i = 0; i < max_mem_nodes; i++) {
1117 if ((i == mnode) ||
1118 (mem_node_config[i].exists == 0))
1119 continue;
1120 ASSERT(
1121 PAGE_COUNTERS_COUNTERS(i, r) == old_ctr ||
1122 PAGE_COUNTERS_COUNTERS(i, r) == NULL);
1123 PAGE_COUNTERS_COUNTERS(i, r) = new_ctr;
1124 PAGE_COUNTERS_ENTRIES(i, r) = pcsz;
1125 PAGE_COUNTERS_BASE(i, r) = newbase;
1129 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1130 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r, mrange) =
1131 color_cache[r][mrange];
1132 color_cache[r][mrange] = NULL;
1135 * for now, just reset on these events as it's probably
1136 * not worthwhile to try and optimize this.
1138 for (i = 0; i < colors_per_szc[r]; i++) {
1139 uint_t color_mask = colors_per_szc[r] - 1;
1140 int mlo = interleaved_mnodes ? 0 : mnode;
1141 int mhi = interleaved_mnodes ? max_mem_nodes :
1142 (mnode + 1);
1143 int m;
1144 pfn_t pfnum;
1145 size_t idx;
1146 MEM_NODE_ITERATOR_DECL(it);
1148 for (m = mlo; m < mhi; m++) {
1149 if (mem_node_config[m].exists == 0)
1150 continue;
1151 pfnum = newbase;
1152 MEM_NODE_ITERATOR_INIT(pfnum, m, r, &it);
1153 if (pfnum == (pfn_t)-1) {
1154 idx = 0;
1155 } else {
1156 PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
1157 color_mask, color_mask, &it);
1158 idx = PNUM_TO_IDX(m, r, pfnum);
1159 idx = (idx < pcsz) ? idx : 0;
1161 for (mrange = 0; mrange < nranges; mrange++) {
1162 if (PAGE_COUNTERS_CURRENT_COLOR_ARRAY(m,
1163 r, mrange) != NULL)
1164 PAGE_COUNTERS_CURRENT_COLOR(m,
1165 r, i, mrange) = idx;
1170 /* cache info for freeing out of the critical path */
1171 if ((caddr_t)old_ctr >= kernelheap &&
1172 (caddr_t)old_ctr < ekernelheap) {
1173 ctr_cache[r] = old_ctr;
1174 size_cache[r] = old_csz;
1176 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1177 size_t *tmp = old_color_array[mrange];
1178 if ((caddr_t)tmp >= kernelheap &&
1179 (caddr_t)tmp < ekernelheap) {
1180 color_cache[r][mrange] = tmp;
1184 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
1185 * satisfy the identity requirement.
1186 * We should be able to go from one to the other
1187 * and get consistent values.
1189 ASSERT(PNUM_TO_IDX(mnode, r,
1190 (IDX_TO_PNUM(mnode, r, 0))) == 0);
1191 ASSERT(IDX_TO_PNUM(mnode, r,
1192 (PNUM_TO_IDX(mnode, r, newbase))) == newbase);
1194 /* pcc_info_t and pcc_color_free */
1195 for (i = 0; i < NPC_MUTEX; i++) {
1196 pcc_info_t *epi;
1197 pcc_info_t *eold_pi;
1199 pi = cands_cache[i * MMU_PAGE_SIZES + r];
1200 old_pi = page_ctrs_cands[i][r][mnode];
1201 page_ctrs_cands[i][r][mnode] = pi;
1202 cands_cache[i * MMU_PAGE_SIZES + r] = old_pi;
1204 /* preserve old pcc_color_free values, if any */
1205 if (old_pi == NULL)
1206 continue;
1209 * when/if x86 does DR, must account for
1210 * possible change in range index when
1211 * preserving pcc_info
1213 epi = &pi[nranges];
1214 eold_pi = &old_pi[old_nranges];
1215 if (new_maxmrange > old_maxmrange) {
1216 pi += new_maxmrange - old_maxmrange;
1217 } else if (new_maxmrange < old_maxmrange) {
1218 old_pi += old_maxmrange - new_maxmrange;
1220 for (; pi < epi && old_pi < eold_pi; pi++, old_pi++) {
1221 pcc_info_t tmp = *pi;
1222 *pi = *old_pi;
1223 *old_pi = tmp;
1227 PAGE_CTRS_WRITE_UNLOCK(mnode);
1230 * Now that we have dropped the write lock, it is safe to free all
1231 * of the memory we have cached above.
1232 * We come through here to free memory when pre-alloc fails, and also to
1233 * free old pointers which were recorded while locked.
1235 cleanup:
1236 for (r = 1; r < mmu_page_sizes; r++) {
1237 if (ctr_cache[r] != NULL) {
1238 kmem_free(ctr_cache[r],
1239 size_cache[r] * sizeof (hpmctr_t));
1241 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1242 if (color_cache[r][mrange] != NULL) {
1243 kmem_free(color_cache[r][mrange],
1244 colors_per_szc[r] * sizeof (size_t));
1247 for (i = 0; i < NPC_MUTEX; i++) {
1248 pi = cands_cache[i * MMU_PAGE_SIZES + r];
1249 if (pi == NULL)
1250 continue;
1251 nr = cands_cache_nranges;
1252 for (mrange = 0; mrange < nr; mrange++, pi++) {
1253 pgcntp = pi->pcc_color_free;
1254 if (pgcntp == NULL)
1255 continue;
1256 if ((caddr_t)pgcntp >= kernelheap &&
1257 (caddr_t)pgcntp < ekernelheap) {
1258 kmem_free(pgcntp,
1259 colors_per_szc[r] *
1260 sizeof (pgcnt_t));
1263 pi = cands_cache[i * MMU_PAGE_SIZES + r];
1264 if ((caddr_t)pi >= kernelheap &&
1265 (caddr_t)pi < ekernelheap) {
1266 kmem_free(pi, nr * sizeof (pcc_info_t));
1271 kmem_free(cands_cache,
1272 sizeof (pcc_info_t *) * NPC_MUTEX * MMU_PAGE_SIZES);
1273 return (rc);
1277 * Clean up the hpm_counters field in the page counters
1278 * array.
1280 void
1281 page_ctrs_cleanup(void)
1283 int r; /* region size */
1284 int i; /* mnode index */
1287 * Get the page counters write lock while we are
1288 * setting the page hpm_counters field to NULL
1289 * for non-existent mnodes.
1291 for (i = 0; i < max_mem_nodes; i++) {
1292 PAGE_CTRS_WRITE_LOCK(i);
1293 if (mem_node_config[i].exists) {
1294 PAGE_CTRS_WRITE_UNLOCK(i);
1295 continue;
1297 for (r = 1; r < mmu_page_sizes; r++) {
1298 PAGE_COUNTERS_COUNTERS(i, r) = NULL;
1300 PAGE_CTRS_WRITE_UNLOCK(i);
1304 #ifdef DEBUG
1307 * confirm pp is a large page corresponding to szc
1309 void
1310 chk_lpg(page_t *pp, uchar_t szc)
1312 spgcnt_t npgs = page_get_pagecnt(pp->p_szc);
1313 uint_t noreloc;
1315 if (npgs == 1) {
1316 ASSERT(pp->p_szc == 0);
1317 ASSERT(pp->p_next == pp);
1318 ASSERT(pp->p_prev == pp);
1319 return;
1322 ASSERT(pp->p_list.largepg.next == pp || pp->p_list.largepg.next == NULL);
1323 ASSERT(pp->p_list.largepg.prev == pp || pp->p_list.largepg.prev == NULL);
1325 ASSERT(IS_P2ALIGNED(pp->p_pagenum, npgs));
1326 ASSERT(pp->p_pagenum == (pp->p_next->p_pagenum - 1));
1327 ASSERT(pp->p_prev->p_pagenum == (pp->p_pagenum + (npgs - 1)));
1328 ASSERT(pp->p_prev == (pp + (npgs - 1)));
1331 * Check list of pages.
1333 noreloc = PP_ISNORELOC(pp);
1334 while (npgs--) {
1335 if (npgs != 0) {
1336 ASSERT(pp->p_pagenum == pp->p_next->p_pagenum - 1);
1337 ASSERT(pp->p_next == (pp + 1));
1339 ASSERT(pp->p_szc == szc);
1340 ASSERT(PP_ISFREE(pp));
1341 ASSERT(PP_ISAGED(pp));
1342 ASSERT(pp->p_list.largepg.next == pp || pp->p_list.largepg.next == NULL);
1343 ASSERT(pp->p_list.largepg.prev == pp || pp->p_list.largepg.prev == NULL);
1344 VERIFY(pp->p_object == NULL);
1345 ASSERT(pp->p_vnode == NULL);
1346 ASSERT(PP_ISNORELOC(pp) == noreloc);
1348 pp = pp->p_next;
1351 #endif /* DEBUG */
1353 void
1354 page_freelist_lock(int mnode)
1356 int i;
1357 for (i = 0; i < NPC_MUTEX; i++) {
1358 mutex_enter(FPC_MUTEX(mnode, i));
1359 mutex_enter(CPC_MUTEX(mnode, i));
1363 void
1364 page_freelist_unlock(int mnode)
1366 int i;
1367 for (i = 0; i < NPC_MUTEX; i++) {
1368 mutex_exit(FPC_MUTEX(mnode, i));
1369 mutex_exit(CPC_MUTEX(mnode, i));
1374 * add pp to the specified page list. Defaults to head of the page list
1375 * unless PG_LIST_TAIL is specified.
1377 void
1378 page_list_add(page_t *pp, int flags)
1380 page_t **ppp;
1381 kmutex_t *pcm;
1382 uint_t bin, mtype;
1383 int mnode;
1385 ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
1386 ASSERT(PP_ISFREE(pp));
1387 ASSERT(!hat_page_is_mapped(pp));
1388 ASSERT(hat_page_getshare(pp) == 0);
1391 * Large pages should be freed via page_list_add_pages().
1393 ASSERT(pp->p_szc == 0);
1396 * Don't need to lock the freelist first here
1397 * because the page isn't on the freelist yet.
1398 * This means p_szc can't change on us.
1401 bin = PP_2_BIN(pp);
1402 mnode = PP_2_MEM_NODE(pp);
1403 mtype = PP_2_MTYPE(pp);
1405 if (flags & PG_LIST_ISINIT) {
1407 * PG_LIST_ISINIT is set during system startup (i.e. single
1408 * threaded), so add the page to the free list and to the
1409 * free region counters w/o any locking
1411 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1413 /* inline version of page_add() */
1414 if (*ppp != NULL) {
1415 pp->p_next = *ppp;
1416 pp->p_prev = (*ppp)->p_prev;
1417 (*ppp)->p_prev = pp;
1418 pp->p_prev->p_next = pp;
1419 } else
1420 *ppp = pp;
1422 page_ctr_add_internal(mnode, mtype, pp, flags);
1423 VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
1424 } else {
1425 pcm = PC_BIN_MUTEX(mnode, bin, flags);
1427 if (flags & PG_FREE_LIST) {
1428 VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
1429 ASSERT(PP_ISAGED(pp));
1430 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1432 } else {
1433 VM_STAT_ADD(vmm_vmstats.pladd_cache);
1434 VERIFY(pp->p_object);
1435 ASSERT(pp->p_vnode);
1436 ASSERT((pp->p_offset & PAGEOFFSET) == 0);
1437 ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1439 mutex_enter(pcm);
1440 page_add(ppp, pp);
1442 if (flags & PG_LIST_TAIL)
1443 *ppp = (*ppp)->p_next;
1445 * Add counters before releasing pcm mutex to avoid a race with
1446 * page_freelist_coalesce and page_freelist_split.
1448 page_ctr_add(mnode, mtype, pp, flags);
1449 mutex_exit(pcm);
1454 * It is up to the caller to unlock the page!
1456 ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
1461 /* ARGSUSED */
1462 void
1463 page_list_noreloc_startup(page_t *pp)
1465 panic("page_list_noreloc_startup: should be here only for sparc");
1468 void
1469 page_list_add_pages(page_t *pp, int flags)
1471 kmutex_t *pcm;
1472 pgcnt_t pgcnt;
1473 uint_t bin, mtype, i;
1474 int mnode;
1476 /* default to freelist/head */
1477 ASSERT((flags & (PG_CACHE_LIST | PG_LIST_TAIL)) == 0);
1479 CHK_LPG(pp, pp->p_szc);
1480 VM_STAT_ADD(vmm_vmstats.pladd_free[pp->p_szc]);
1482 bin = PP_2_BIN(pp);
1483 mnode = PP_2_MEM_NODE(pp);
1484 mtype = PP_2_MTYPE(pp);
1486 if (flags & PG_LIST_ISINIT) {
1487 ASSERT(pp->p_szc == mmu_page_sizes - 1);
1488 page_lpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1489 ASSERT(!PP_ISNORELOC(pp));
1490 PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
1491 } else {
1493 ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
1495 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
1497 mutex_enter(pcm);
1498 page_lpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1499 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
1500 mutex_exit(pcm);
1502 pgcnt = page_get_pagecnt(pp->p_szc);
1503 for (i = 0; i < pgcnt; i++, pp++)
1504 page_unlock_nocapture(pp);
1509 * During boot, need to demote a large page to base
1510 * pagesize pages for seg_kmem for use in boot_alloc()
1512 void
1513 page_boot_demote(page_t *pp)
1515 ASSERT(pp->p_szc != 0);
1516 ASSERT(PP_ISFREE(pp));
1517 ASSERT(PP_ISAGED(pp));
1519 (void) page_demote(PP_2_MEM_NODE(pp),
1520 PFN_BASE(pp->p_pagenum, pp->p_szc), 0, pp->p_szc, 0, PC_NO_COLOR,
1521 PC_FREE);
1523 ASSERT(PP_ISFREE(pp));
1524 ASSERT(PP_ISAGED(pp));
1525 ASSERT(pp->p_szc == 0);
1529 * Take a particular page off of whatever freelist the page
1530 * is claimed to be on.
1532 * NOTE: Only used for PAGESIZE pages.
1534 void
1535 page_list_sub(page_t *pp, int flags)
1537 int bin;
1538 uint_t mtype;
1539 int mnode;
1540 kmutex_t *pcm;
1541 page_t **ppp;
1543 ASSERT(PAGE_EXCL(pp));
1544 ASSERT(PP_ISFREE(pp));
1547 * The p_szc field can only be changed by page_promote()
1548 * and page_demote(). Only free pages can be promoted and
1549 * demoted and the free list MUST be locked during these
1550 * operations. So to prevent a race in page_list_sub()
1551 * between computing which bin of the freelist lock to
1552 * grab and actually grabbing the lock, we check again that
1553 * the bin we locked is still the correct one. Notice that
1554 * the p_szc field could have actually changed on us but
1555 * if the bin happens to still be the same we are safe.
1557 try_again:
1558 bin = PP_2_BIN(pp);
1559 mnode = PP_2_MEM_NODE(pp);
1560 pcm = PC_BIN_MUTEX(mnode, bin, flags);
1561 mutex_enter(pcm);
1562 if (PP_2_BIN(pp) != bin) {
1563 mutex_exit(pcm);
1564 goto try_again;
1566 mtype = PP_2_MTYPE(pp);
1568 if (flags & PG_FREE_LIST) {
1569 VM_STAT_ADD(vmm_vmstats.plsub_free[0]);
1570 ASSERT(PP_ISAGED(pp));
1571 ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
1572 } else {
1573 VM_STAT_ADD(vmm_vmstats.plsub_cache);
1574 ASSERT(!PP_ISAGED(pp));
1575 ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1579 * Common PAGESIZE case.
1581 * Note that we locked the freelist. This prevents
1582 * any page promotion/demotion operations. Therefore
1583 * the p_szc will not change until we drop pcm mutex.
1585 if (pp->p_szc == 0) {
1586 page_sub(ppp, pp);
1588 * Subtract counters before releasing pcm mutex
1589 * to avoid race with page_freelist_coalesce.
1591 page_ctr_sub(mnode, mtype, pp, flags);
1592 mutex_exit(pcm);
1594 return;
1598 * Large pages on the cache list are not supported.
1600 if (flags & PG_CACHE_LIST)
1601 panic("page_list_sub: large page on cachelist");
1604 * Slow but rare.
1606 * Somebody wants this particular page which is part
1607 * of a large page. In this case we just demote the page
1608 * if it's on the freelist.
1610 * We have to drop pcm before locking the entire freelist.
1611 * Once we have re-locked the freelist check to make sure
1612 * the page hasn't already been demoted or completely
1613 * freed.
1615 mutex_exit(pcm);
1616 page_freelist_lock(mnode);
1617 if (pp->p_szc != 0) {
1619 * Large page is on freelist.
1621 (void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc),
1622 0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
1624 ASSERT(PP_ISFREE(pp));
1625 ASSERT(PP_ISAGED(pp));
1626 ASSERT(pp->p_szc == 0);
1629 * Subtract counters before releasing pcm mutex
1630 * to avoid race with page_freelist_coalesce.
1632 bin = PP_2_BIN(pp);
1633 mtype = PP_2_MTYPE(pp);
1634 ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
1636 page_sub(ppp, pp);
1637 page_ctr_sub(mnode, mtype, pp, flags);
1638 page_freelist_unlock(mnode);
1642 void
1643 page_list_sub_pages(page_t *pp, uint_t szc)
1645 kmutex_t *pcm;
1646 uint_t bin, mtype;
1647 int mnode;
1649 ASSERT(PAGE_EXCL(pp));
1650 ASSERT(PP_ISFREE(pp));
1651 ASSERT(PP_ISAGED(pp));
1654 * See comment in page_list_sub().
1656 try_again:
1657 bin = PP_2_BIN(pp);
1658 mnode = PP_2_MEM_NODE(pp);
1659 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
1660 mutex_enter(pcm);
1661 if (PP_2_BIN(pp) != bin) {
1662 mutex_exit(pcm);
1663 goto try_again;
1667 * If we're called with a page larger than szc or it got
1668 * promoted above szc before we locked the freelist then
1669 * drop pcm and re-lock the entire freelist. If the page is still larger
1670 * than szc then demote it.
1672 if (pp->p_szc > szc) {
1673 mutex_exit(pcm);
1674 pcm = NULL;
1675 page_freelist_lock(mnode);
1676 if (pp->p_szc > szc) {
1677 VM_STAT_ADD(vmm_vmstats.plsubpages_szcbig);
1678 (void) page_demote(mnode,
1679 PFN_BASE(pp->p_pagenum, pp->p_szc), 0,
1680 pp->p_szc, szc, PC_NO_COLOR, PC_FREE);
1682 bin = PP_2_BIN(pp);
1684 ASSERT(PP_ISFREE(pp));
1685 ASSERT(PP_ISAGED(pp));
1686 ASSERT(pp->p_szc <= szc);
1687 ASSERT(pp == PP_PAGEROOT(pp));
1689 VM_STAT_ADD(vmm_vmstats.plsub_free[pp->p_szc]);
1691 mtype = PP_2_MTYPE(pp);
1692 if (pp->p_szc != 0) {
1693 page_lpsub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1694 CHK_LPG(pp, pp->p_szc);
1695 } else {
1696 VM_STAT_ADD(vmm_vmstats.plsubpages_szc0);
1697 page_sub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1699 page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
1701 if (pcm != NULL) {
1702 mutex_exit(pcm);
1703 } else {
1704 page_freelist_unlock(mnode);
1710 * Add the page to the front of a linked list of pages
1711 * using the p_next & p_prev pointers for the list.
1712 * The caller is responsible for protecting the list pointers.
1714 void
1715 mach_page_add(page_t **ppp, page_t *pp)
1717 if (*ppp == NULL) {
1718 pp->p_next = pp->p_prev = pp;
1719 } else {
1720 pp->p_next = *ppp;
1721 pp->p_prev = (*ppp)->p_prev;
1722 (*ppp)->p_prev = pp;
1723 pp->p_prev->p_next = pp;
1725 *ppp = pp;
1729 * Remove this page from a linked list of pages
1730 * using the p_next & p_prev pointers for the list.
1732 * The caller is responsible for protecting the list pointers.
1734 void
1735 mach_page_sub(page_t **ppp, page_t *pp)
1737 ASSERT(PP_ISFREE(pp));
1739 if (*ppp == NULL || pp == NULL)
1740 panic("mach_page_sub");
1742 if (*ppp == pp)
1743 *ppp = pp->p_next; /* go to next page */
1745 if (*ppp == pp)
1746 *ppp = NULL; /* page list is gone */
1747 else {
1748 pp->p_prev->p_next = pp->p_next;
1749 pp->p_next->p_prev = pp->p_prev;
1751 pp->p_prev = pp->p_next = pp; /* make pp a list of one */
1755 * Routine fsflush uses to gradually coalesce the free list into larger pages.
1757 void
1758 page_promote_size(page_t *pp, uint_t cur_szc)
1760 pfn_t pfn;
1761 int mnode;
1762 int idx;
1763 int new_szc = cur_szc + 1;
1764 int full = FULL_REGION_CNT(new_szc);
1766 pfn = page_pptonum(pp);
1767 mnode = PFN_2_MEM_NODE(pfn);
1769 page_freelist_lock(mnode);
1771 idx = PNUM_TO_IDX(mnode, new_szc, pfn);
1772 if (PAGE_COUNTERS(mnode, new_szc, idx) == full)
1773 (void) page_promote(mnode, pfn, new_szc, PC_FREE, PC_MTYPE_ANY);
1775 page_freelist_unlock(mnode);
1778 static uint_t page_promote_err;
1779 static uint_t page_promote_noreloc_err;
1782 * Create a single larger page (of szc new_szc) from smaller contiguous pages
1783 * for the given mnode starting at pfnum. Pages involved are on the freelist
1784 * before the call and may be returned to the caller if requested, otherwise
1785 * they will be placed back on the freelist.
1786 * If flags is PC_ALLOC, then the large page will be returned to the user in
1787 * a state which is consistent with a page being taken off the freelist. If
1788 * we failed to lock the new large page, then we will return NULL to the
1789 * caller and put the large page on the freelist instead.
1790 * If flags is PC_FREE, then the large page will be placed on the freelist,
1791 * and NULL will be returned.
1792 * The caller is responsible for locking the freelist as well as any other
1793 * accounting which needs to be done for a returned page.
1795 * RFE: For performance pass in pp instead of pfnum so
1796 * we can avoid excessive calls to page_numtopp_nolock().
1797 * This would depend on an assumption that all contiguous
1798 * pages are in the same memseg so we can just add/dec
1799 * our pp.
1801 * Lock ordering:
1803 * There is a potential but rare deadlock situation
1804 * for page promotion and demotion operations. The problem
1805 * is there are two paths into the freelist manager and
1806 * they have different lock orders:
1808 * page_create()
1809 * lock freelist
1810 * page_lock(EXCL)
1811 * unlock freelist
1812 * return
1813 * caller drops page_lock
1815 * page_free() and page_reclaim()
1816 * caller grabs page_lock(EXCL)
1818 * lock freelist
1819 * unlock freelist
1820 * drop page_lock
1822 * What prevents a thread in page_create() from deadlocking
1823 * with a thread freeing or reclaiming the same page is the
1824 * page_trylock() in page_get_freelist(). If the trylock fails
1825 * it skips the page.
1827 * The lock ordering for promotion and demotion is the same as
1828 * for page_create(). Since the same deadlock could occur during
1829 * page promotion and freeing or reclaiming of a page on the
1830 * cache list we might have to fail the operation and undo what we
1831 * have done so far. Again this is rare.
1833 page_t *
1834 page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags, int mtype)
1836 page_t *pp, *pplist, *tpp, *start_pp;
1837 pgcnt_t new_npgs, npgs;
1838 uint_t bin;
1839 pgcnt_t tmpnpgs, pages_left;
1840 uint_t noreloc;
1841 int which_list;
1842 ulong_t index;
1843 kmutex_t *phm;
1846 * General algorithm:
1847 * Find the starting page
1848 * Walk each page struct removing it from the freelist,
1849 * and linking it to all the other pages removed.
1850 * Once all pages are off the freelist,
851 * walk the list, modifying p_szc to new_szc and doing whatever
852 * other work needs to be done to create a large free page.
1853 * According to the flags, either return the page or put it
1854 * on the freelist.
1857 start_pp = page_numtopp_nolock(pfnum);
1858 ASSERT(start_pp && (start_pp->p_pagenum == pfnum));
1859 new_npgs = page_get_pagecnt(new_szc);
1860 ASSERT(IS_P2ALIGNED(pfnum, new_npgs));
1862 /* don't return page of the wrong mtype */
1863 if (mtype != PC_MTYPE_ANY && mtype != PP_2_MTYPE(start_pp))
1864 return (NULL);
1867 * Loop through smaller pages to confirm that all pages
1868 * give the same result for PP_ISNORELOC().
1869 * We can check this reliably here as the protocol for setting
1870 * P_NORELOC requires pages to be taken off the free list first.
1872 noreloc = PP_ISNORELOC(start_pp);
1873 for (pp = start_pp + new_npgs; --pp > start_pp; ) {
1874 if (noreloc != PP_ISNORELOC(pp)) {
1875 page_promote_noreloc_err++;
1876 page_promote_err++;
1877 return (NULL);
1881 pages_left = new_npgs;
1882 pplist = NULL;
1883 pp = start_pp;
1885 /* Loop around coalescing the smaller pages into a big page. */
1886 while (pages_left) {
1888 * Remove from the freelist.
1890 ASSERT(PP_ISFREE(pp));
1891 bin = PP_2_BIN(pp);
1892 ASSERT(mnode == PP_2_MEM_NODE(pp));
1893 mtype = PP_2_MTYPE(pp);
1894 if (PP_ISAGED(pp)) {
1897 * PG_FREE_LIST
1899 if (pp->p_szc) {
1900 page_lpsub(&PAGE_FREELISTS(mnode,
1901 pp->p_szc, bin, mtype), pp);
1902 } else {
1903 mach_page_sub(&PAGE_FREELISTS(mnode, 0,
1904 bin, mtype), pp);
1906 which_list = PG_FREE_LIST;
1907 } else {
1908 struct vmobject *obj;
1910 ASSERT(pp->p_szc == 0);
1913 * PG_CACHE_LIST
1915 * Since this page comes from the
1916 * cachelist, we must destroy the
1917 * vnode association.
1919 if (!page_trylock(pp, SE_EXCL)) {
1920 goto fail_promote;
1923 obj = &pp->p_vnode->v_object;
1926 * We need to be careful not to deadlock
1927 * with another thread in page_lookup().
1928 * The page_lookup() thread could be holding
1929 * the same phm that we need if the two
1930 * pages happen to hash to the same phm lock.
1931 * At this point we have locked the entire
1932 * freelist and page_lookup() could be trying
1933 * to grab a freelist lock.
1935 if (!vmobject_trylock(obj)) {
1936 page_unlock_nocapture(pp);
1937 goto fail_promote;
1940 mach_page_sub(&PAGE_CACHELISTS(mnode, bin, mtype), pp);
1941 page_hashout(pp, true);
1942 vmobject_unlock(obj);
1943 PP_SETAGED(pp);
1944 page_unlock_nocapture(pp);
1945 which_list = PG_CACHE_LIST;
1947 page_ctr_sub(mnode, mtype, pp, which_list);
1950 * Concatenate the smaller page(s) onto
1951 * the large page list.
1953 tmpnpgs = npgs = page_get_pagecnt(pp->p_szc);
1954 pages_left -= npgs;
1955 tpp = pp;
1956 while (npgs--) {
1957 tpp->p_szc = new_szc;
1958 tpp = tpp->p_next;
1960 page_list_concat(&pplist, &pp);
1961 pp += tmpnpgs;
1963 CHK_LPG(pplist, new_szc);
1966 * return the page to the user if requested
1967 * in the properly locked state.
1969 if (flags == PC_ALLOC && (page_trylock_cons(pplist, SE_EXCL))) {
1970 return (pplist);
1974 * Otherwise place the new large page on the freelist
1976 bin = PP_2_BIN(pplist);
1977 mnode = PP_2_MEM_NODE(pplist);
1978 mtype = PP_2_MTYPE(pplist);
1979 page_lpadd(&PAGE_FREELISTS(mnode, new_szc, bin, mtype), pplist);
1981 page_ctr_add(mnode, mtype, pplist, PG_FREE_LIST);
1982 return (NULL);
1984 fail_promote:
1986 * A thread must have still been freeing or
1987 * reclaiming the page on the cachelist.
1988 * To prevent a deadlock, undo what we have
1989 * done so far and return failure. This
1990 * situation can only happen while promoting
1991 * PAGESIZE pages.
1993 page_promote_err++;
1994 while (pplist) {
1995 pp = pplist;
1996 mach_page_sub(&pplist, pp);
1997 pp->p_szc = 0;
1998 bin = PP_2_BIN(pp);
1999 mtype = PP_2_MTYPE(pp);
2000 mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, mtype), pp);
2001 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
2003 return (NULL);
2008 * Break up a large page into smaller size pages.
2009 * Pages involved are on the freelist before the call and may
2010 * be returned to the caller if requested, otherwise they will
2011 * be placed back on the freelist.
2012 * The caller is responsible for locking the freelist as well as any other
2013 * accounting which needs to be done for a returned page.
2014 * If flags is not PC_ALLOC, the color argument is ignored, and thus
2015 * technically, any value may be passed in but PC_NO_COLOR is the standard
2016 * which should be followed for clarity's sake.
2017 * Returns a page whose pfn is < pfnmax
2019 page_t *
2020 page_demote(int mnode, pfn_t pfnum, pfn_t pfnmax, uchar_t cur_szc,
2021 uchar_t new_szc, int color, int flags)
2023 page_t *pp, *pplist, *npplist;
2024 pgcnt_t npgs, n;
2025 uint_t bin;
2026 uint_t mtype;
2027 page_t *ret_pp = NULL;
2029 ASSERT(cur_szc != 0);
2030 ASSERT(new_szc < cur_szc);
2032 pplist = page_numtopp_nolock(pfnum);
2033 ASSERT(pplist != NULL);
2035 ASSERT(pplist->p_szc == cur_szc);
2037 bin = PP_2_BIN(pplist);
2038 ASSERT(mnode == PP_2_MEM_NODE(pplist));
2039 mtype = PP_2_MTYPE(pplist);
2040 page_lpsub(&PAGE_FREELISTS(mnode, cur_szc, bin, mtype), pplist);
2042 CHK_LPG(pplist, cur_szc);
2043 page_ctr_sub(mnode, mtype, pplist, PG_FREE_LIST);
2046 * Number of PAGESIZE pages for smaller new_szc
2047 * page.
2049 npgs = page_get_pagecnt(new_szc);
2051 while (pplist) {
2052 pp = pplist;
2054 ASSERT(pp->p_szc == cur_szc);
2057 * We either break it up into PAGESIZE pages or larger.
2059 if (npgs == 1) { /* PAGESIZE case */
2060 mach_page_sub(&pplist, pp);
2061 ASSERT(pp->p_szc == cur_szc);
2062 ASSERT(new_szc == 0);
2063 ASSERT(mnode == PP_2_MEM_NODE(pp));
2064 pp->p_szc = new_szc;
2065 bin = PP_2_BIN(pp);
2066 if ((bin == color) && (flags == PC_ALLOC) &&
2067 (ret_pp == NULL) && (pfnmax == 0 ||
2068 pp->p_pagenum < pfnmax) &&
2069 page_trylock_cons(pp, SE_EXCL)) {
2070 ret_pp = pp;
2071 } else {
2072 mtype = PP_2_MTYPE(pp);
2073 mach_page_add(&PAGE_FREELISTS(mnode, 0, bin,
2074 mtype), pp);
2075 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
2077 } else {
2078 page_t *try_to_return_this_page = NULL;
2079 int count = 0;
2082 * Break down into smaller lists of pages.
2084 page_list_break(&pplist, &npplist, npgs);
2086 pp = pplist;
2087 n = npgs;
2088 while (n--) {
2089 ASSERT(pp->p_szc == cur_szc);
2091 * Check whether all the pages in this list
2092 * fit the request criteria.
2094 if (pfnmax == 0 || pp->p_pagenum < pfnmax) {
2095 count++;
2097 pp->p_szc = new_szc;
2098 pp = pp->p_next;
2101 if (count == npgs &&
2102 (pfnmax == 0 || pp->p_pagenum < pfnmax)) {
2103 try_to_return_this_page = pp;
2106 CHK_LPG(pplist, new_szc);
2108 bin = PP_2_BIN(pplist);
2109 if (try_to_return_this_page)
2110 ASSERT(mnode ==
2111 PP_2_MEM_NODE(try_to_return_this_page));
2112 if ((bin == color) && (flags == PC_ALLOC) &&
2113 (ret_pp == NULL) && try_to_return_this_page &&
2114 page_trylock_cons(try_to_return_this_page,
2115 SE_EXCL)) {
2116 ret_pp = try_to_return_this_page;
2117 } else {
2118 mtype = PP_2_MTYPE(pp);
2119 page_lpadd(&PAGE_FREELISTS(mnode, new_szc,
2120 bin, mtype), pplist);
2122 page_ctr_add(mnode, mtype, pplist,
2123 PG_FREE_LIST);
2125 pplist = npplist;
2128 return (ret_pp);
2131 int mpss_coalesce_disable = 0;
2134 * Coalesce free pages into a page of the given szc and color if possible.
2135 * Return the pointer to the page created, otherwise, return NULL.
2137 * If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
2139 page_t *
2140 page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask,
2141 int mtype, pfn_t pfnhi)
2143 int r = szc; /* region size */
2144 int mrange;
2145 uint_t full, bin, color_mask, wrap = 0;
2146 pfn_t pfnum, lo, hi;
2147 size_t len, idx, idx0;
2148 pgcnt_t cands = 0, szcpgcnt = page_get_pagecnt(szc);
2149 page_t *ret_pp;
2150 MEM_NODE_ITERATOR_DECL(it);
2152 if (mpss_coalesce_disable) {
2153 ASSERT(szc < MMU_PAGE_SIZES);
2154 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[szc][0]);
2155 return (NULL);
2158 ASSERT(szc < mmu_page_sizes);
2159 color_mask = PAGE_GET_PAGECOLORS(szc) - 1;
2160 ASSERT(ceq_mask <= color_mask);
2161 ASSERT(color <= color_mask);
2162 color &= ceq_mask;
2164 /* Prevent page_counters dynamic memory from being freed */
2165 rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
2167 mrange = MTYPE_2_MRANGE(mnode, mtype);
2168 ASSERT(mrange < mnode_nranges[mnode]);
2169 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[r][mrange]);
2171 /* get pfn range for mtype */
2172 len = PAGE_COUNTERS_ENTRIES(mnode, r);
2173 MNODETYPE_2_PFN(mnode, mtype, lo, hi);
2174 hi++;
2176 /* cap hi at pfnhi when that upper limit is given */
2177 if (pfnhi != PFNNULL && pfnhi < hi)
2178 hi = pfnhi;
2180 /* round to szcpgcnt boundaries */
2181 lo = P2ROUNDUP(lo, szcpgcnt);
2182 MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
2183 if (lo == (pfn_t)-1) {
2184 rw_exit(&page_ctrs_rwlock[mnode]);
2185 return (NULL);
2187 hi = hi & ~(szcpgcnt - 1);
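/*
 * Illustrative arithmetic (example numbers only): with szcpgcnt == 8,
 * lo == 21 rounds up to 24 and hi == 35 rounds down to 32, leaving the
 * single fully aligned region [24, 32) to be scanned.
 */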
2189 /* set lo to the closest pfn of the right color */
2190 if (((PFN_2_COLOR(lo, szc, &it) ^ color) & ceq_mask) ||
2191 (interleaved_mnodes && PFN_2_MEM_NODE(lo) != mnode)) {
2192 PAGE_NEXT_PFN_FOR_COLOR(lo, szc, color, ceq_mask, color_mask,
2193 &it);
2196 if (hi <= lo) {
2197 rw_exit(&page_ctrs_rwlock[mnode]);
2198 return (NULL);
2201 full = FULL_REGION_CNT(r);
2203 /* calculate the number of page candidates and initial search index */
2204 bin = color;
2205 idx0 = (size_t)(-1);
2206 do {
2207 pgcnt_t acand;
2209 PGCTRS_CANDS_GETVALUECOLOR(mnode, mrange, r, bin, acand);
2210 if (acand) {
2211 idx = PAGE_COUNTERS_CURRENT_COLOR(mnode,
2212 r, bin, mrange);
2213 idx0 = MIN(idx0, idx);
2214 cands += acand;
2216 bin = ADD_MASKED(bin, 1, ceq_mask, color_mask);
2217 } while (bin != color);
2219 if (cands == 0) {
2220 VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip[r][mrange]);
2221 rw_exit(&page_ctrs_rwlock[mnode]);
2222 return (NULL);
2225 pfnum = IDX_TO_PNUM(mnode, r, idx0);
2226 if (pfnum < lo || pfnum >= hi) {
2227 pfnum = lo;
2228 } else {
2229 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2230 if (pfnum == (pfn_t)-1) {
2231 pfnum = lo;
2232 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2233 ASSERT(pfnum != (pfn_t)-1);
2234 } else if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) & ceq_mask ||
2235 (interleaved_mnodes && PFN_2_MEM_NODE(pfnum) != mnode)) {
2236 /* invalid color, get the closest correct pfn */
2237 PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
2238 color_mask, &it);
2239 if (pfnum >= hi) {
2240 pfnum = lo;
2241 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2246 /* set starting index */
2247 idx0 = PNUM_TO_IDX(mnode, r, pfnum);
2248 ASSERT(idx0 < len);
2251 for (idx = idx0; wrap == 0 || (idx < idx0 && wrap < 2); ) {
2254 if (PAGE_COUNTERS(mnode, r, idx) != full)
2255 goto next;
2258 * RFE: For performance maybe we can do something less
2259 * brutal than locking the entire freelist. So far
2260 * this doesn't seem to be a performance problem?
2262 page_freelist_lock(mnode);
2263 if (PAGE_COUNTERS(mnode, r, idx) == full) {
2264 ret_pp =
2265 page_promote(mnode, pfnum, r, PC_ALLOC, mtype);
2266 if (ret_pp != NULL) {
2267 VM_STAT_ADD(vmm_vmstats.pfc_coalok[r][mrange]);
2268 PAGE_COUNTERS_CURRENT_COLOR(mnode, r,
2269 PFN_2_COLOR(pfnum, szc, &it), mrange) = idx;
2270 page_freelist_unlock(mnode);
2271 rw_exit(&page_ctrs_rwlock[mnode]);
2272 return (ret_pp);
2274 } else {
2275 VM_STAT_ADD(vmm_vmstats.page_ctrs_changed[r][mrange]);
2278 page_freelist_unlock(mnode);
2280 * No point looking for another page if we've
2281 * already tried all of the ones that
2282 * page_ctr_cands indicated. Stash off where we left
2283 * off.
2284 * Note: this is not exact since we don't hold the
2285 * page_freelist_locks before we initially get the
2286 * value of cands for performance reasons, but should
2287 * be a decent approximation.
2289 if (--cands == 0) {
2290 PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color, mrange) =
2291 idx;
2292 break;
2294 next:
2295 PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
2296 color_mask, &it);
2297 idx = PNUM_TO_IDX(mnode, r, pfnum);
2298 if (idx >= len || pfnum >= hi) {
2299 wrapit:
2300 pfnum = lo;
2301 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2302 idx = PNUM_TO_IDX(mnode, r, pfnum);
2303 wrap++;
2307 rw_exit(&page_ctrs_rwlock[mnode]);
2308 VM_STAT_ADD(vmm_vmstats.page_ctrs_failed[r][mrange]);
2309 return (NULL);
2313 * For the given mnode, promote as many small pages to large pages as possible.
2314 * mnode can be -1, which means do them all
2316 void
2317 page_freelist_coalesce_all(int mnode)
2319 int r; /* region size */
2320 int idx, full;
2321 size_t len;
2322 int doall = interleaved_mnodes || mnode < 0;
2323 int mlo = doall ? 0 : mnode;
2324 int mhi = doall ? max_mem_nodes : (mnode + 1);
2326 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all);
2328 if (mpss_coalesce_disable) {
2329 return;
2333 * Lock the entire freelist and coalesce what we can.
2335 * Always promote to the largest page possible
2336 * first to reduce the number of page promotions.
2338 for (mnode = mlo; mnode < mhi; mnode++) {
2339 rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
2340 page_freelist_lock(mnode);
2342 for (r = mmu_page_sizes - 1; r > 0; r--) {
2343 for (mnode = mlo; mnode < mhi; mnode++) {
2344 pgcnt_t cands = 0;
2345 int mrange, nranges = mnode_nranges[mnode];
2347 for (mrange = 0; mrange < nranges; mrange++) {
2348 PGCTRS_CANDS_GETVALUE(mnode, mrange, r, cands);
2349 if (cands != 0)
2350 break;
2352 if (cands == 0) {
2353 VM_STAT_ADD(vmm_vmstats.
2354 page_ctrs_cands_skip_all);
2355 continue;
2358 full = FULL_REGION_CNT(r);
2359 len = PAGE_COUNTERS_ENTRIES(mnode, r);
2361 for (idx = 0; idx < len; idx++) {
2362 if (PAGE_COUNTERS(mnode, r, idx) == full) {
2363 pfn_t pfnum =
2364 IDX_TO_PNUM(mnode, r, idx);
2365 int tmnode = interleaved_mnodes ?
2366 PFN_2_MEM_NODE(pfnum) : mnode;
2368 ASSERT(pfnum >=
2369 mem_node_config[tmnode].physbase &&
2370 pfnum <
2371 mem_node_config[tmnode].physmax);
2373 (void) page_promote(tmnode,
2374 pfnum, r, PC_FREE, PC_MTYPE_ANY);
2377 /* shared hpm_counters covers all mnodes, so we quit */
2378 if (interleaved_mnodes)
2379 break;
2382 for (mnode = mlo; mnode < mhi; mnode++) {
2383 page_freelist_unlock(mnode);
2384 rw_exit(&page_ctrs_rwlock[mnode]);
2389 * This is where all policies for moving pages around
2390 * to different page size free lists are implemented.
2391 * Returns a page on success, NULL on failure.
2393 * So far these are the priorities for this algorithm in descending
2394 * order:
2396 * 1) When servicing a request try to do so with a free page
2397 * from next size up. Helps defer fragmentation as long
2398 * as possible.
2400 * 2) Page coalesce on demand. Only when a freelist
2401 * larger than PAGESIZE is empty and step 1
2402 * will not work since all larger size lists are
2403 * also empty.
2405 * If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
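/*
 * For illustration: when page_get_mnode_freelist() below finds every
 * equivalent-color bin of the requested szc empty, it first calls
 * page_freelist_split() to demote a page from the next size up
 * (priority 1); only for szc > 0, and only if splitting fails, does it
 * fall back to page_freelist_coalesce() (priority 2).
 */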
2408 page_t *
2409 page_freelist_split(uchar_t szc, uint_t color, int mnode, int mtype,
2410 pfn_t pfnlo, pfn_t pfnhi, page_list_walker_t *plw)
2412 uchar_t nszc = szc + 1;
2413 uint_t bin, sbin, bin_prev;
2414 page_t *pp, *firstpp;
2415 page_t *ret_pp = NULL;
2416 uint_t color_mask;
2418 if (nszc == mmu_page_sizes)
2419 return (NULL);
2421 ASSERT(nszc < mmu_page_sizes);
2422 color_mask = PAGE_GET_PAGECOLORS(nszc) - 1;
2423 bin = sbin = PAGE_GET_NSZ_COLOR(szc, color);
2424 bin_prev = (plw->plw_bin_split_prev == color) ? INVALID_COLOR :
2425 PAGE_GET_NSZ_COLOR(szc, plw->plw_bin_split_prev);
2427 VM_STAT_ADD(vmm_vmstats.pfs_req[szc]);
2429 * First try to break up a larger page to fill current size freelist.
2431 while (plw->plw_bins[nszc] != 0) {
2433 ASSERT(nszc < mmu_page_sizes);
2436 * If page found then demote it.
2438 if (PAGE_FREELISTS(mnode, nszc, bin, mtype)) {
2439 page_freelist_lock(mnode);
2440 firstpp = pp = PAGE_FREELISTS(mnode, nszc, bin, mtype);
2443 * If pfnhi is not PFNNULL, look for large page below
2444 * pfnhi. PFNNULL signifies no pfn requirement.
2446 if (pp &&
2447 ((pfnhi != PFNNULL && pp->p_pagenum >= pfnhi) ||
2448 (pfnlo != PFNNULL && pp->p_pagenum < pfnlo))) {
2449 do {
2450 pp = pp->p_list.largepg.next;
2451 if (pp == firstpp) {
2452 pp = NULL;
2453 break;
2455 } while ((pfnhi != PFNNULL &&
2456 pp->p_pagenum >= pfnhi) ||
2457 (pfnlo != PFNNULL &&
2458 pp->p_pagenum < pfnlo));
2460 if (pfnhi != PFNNULL && pp != NULL)
2461 ASSERT(pp->p_pagenum < pfnhi);
2463 if (pfnlo != PFNNULL && pp != NULL)
2464 ASSERT(pp->p_pagenum >= pfnlo);
2466 if (pp) {
2467 uint_t ccolor = page_correct_color(szc, nszc,
2468 color, bin, plw->plw_ceq_mask[szc]);
2470 ASSERT(pp->p_szc == nszc);
2471 VM_STAT_ADD(vmm_vmstats.pfs_demote[nszc]);
2472 ret_pp = page_demote(mnode, pp->p_pagenum,
2473 pfnhi, pp->p_szc, szc, ccolor, PC_ALLOC);
2474 if (ret_pp) {
2475 page_freelist_unlock(mnode);
2476 return (ret_pp);
2479 page_freelist_unlock(mnode);
2482 /* loop through next size bins */
2483 bin = ADD_MASKED(bin, 1, plw->plw_ceq_mask[nszc], color_mask);
2484 plw->plw_bins[nszc]--;
2486 if (bin == sbin) {
2487 uchar_t nnszc = nszc + 1;
2489 /* we are done with this page size - check next */
2490 if (plw->plw_bins[nnszc] == 0)
2491 /* we have already checked next size bins */
2492 break;
2494 bin = sbin = PAGE_GET_NSZ_COLOR(nszc, bin);
2495 if (bin_prev != INVALID_COLOR) {
2496 bin_prev = PAGE_GET_NSZ_COLOR(nszc, bin_prev);
2497 if (!((bin ^ bin_prev) &
2498 plw->plw_ceq_mask[nnszc]))
2499 break;
2501 ASSERT(nnszc < mmu_page_sizes);
2502 color_mask = PAGE_GET_PAGECOLORS(nnszc) - 1;
2503 nszc = nnszc;
2504 ASSERT(nszc < mmu_page_sizes);
2508 return (ret_pp);
2512 * Helper routine used only by the freelist code to lock
2513 * a page. If the page is a large page then it succeeds in
2514 * locking all the constituent pages or none at all.
2515 * Returns 1 on success, 0 on failure.
2517 static int
2518 page_trylock_cons(page_t *pp, se_t se)
2520 page_t *tpp, *first_pp = pp;
2523 * Fail if can't lock first or only page.
2525 if (!page_trylock(pp, se)) {
2526 return (0);
2530 * PAGESIZE: common case.
2532 if (pp->p_szc == 0) {
2533 return (1);
2537 * Large page case.
2539 tpp = pp->p_next;
2540 while (tpp != pp) {
2541 if (!page_trylock(tpp, se)) {
2543 * On failure unlock what we have locked so far.
2544 * We want to avoid attempting to capture these
2545 * pages as the pcm mutex may be held which could
2546 * lead to a recursive mutex panic.
2548 while (first_pp != tpp) {
2549 page_unlock_nocapture(first_pp);
2550 first_pp = first_pp->p_next;
2552 return (0);
2554 tpp = tpp->p_next;
2556 return (1);
2560 * init context for walking page lists
2561 * Called when a page of the given szc is unavailable. Sets markers
2562 * for the beginning of the search to detect when search has
2563 * completed a full cycle. Sets flags for splitting larger pages
2564 * and coalescing smaller pages. Page walking proceeds until a page
2565 * of the desired equivalent color is found.
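/*
 * Typical walker usage, as in page_get_mnode_freelist() and
 * page_get_mnode_cachelist() below (sketch only):
 *
 *	page_list_walk_init(szc, flags, bin, can_split, use_ceq, &plw);
 *	do {
 *		... try the list for 'bin' ...
 *		bin = ADD_MASKED(bin, plw.plw_bin_step,
 *		    plw.plw_ceq_mask[szc], plw.plw_color_mask);
 *	} while (sbin != bin);
 *	if (plw.plw_ceq_dif > 1)
 *		bin = page_list_walk_next_bin(szc, bin, &plw);
 */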
2567 void
2568 page_list_walk_init(uchar_t szc, uint_t flags, uint_t bin, int can_split,
2569 int use_ceq, page_list_walker_t *plw)
2571 uint_t nszc, ceq_mask, colors;
2572 uchar_t ceq = use_ceq ? colorequivszc[szc] : 0;
2574 ASSERT(szc < mmu_page_sizes);
2575 colors = PAGE_GET_PAGECOLORS(szc);
2577 plw->plw_colors = colors;
2578 plw->plw_color_mask = colors - 1;
2579 plw->plw_bin_marker = plw->plw_bin0 = bin;
2580 plw->plw_bin_split_prev = bin;
2581 plw->plw_bin_step = (szc == 0) ? vac_colors : 1;
2584 * if vac aliasing is possible make sure lower order color
2585 * bits are never ignored
2587 if (vac_colors > 1)
2588 ceq &= 0xf0;
2591 * calculate the number of non-equivalent colors and
2592 * color equivalency mask
2594 plw->plw_ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));
2595 ASSERT(szc > 0 || plw->plw_ceq_dif >= vac_colors);
2596 ASSERT(plw->plw_ceq_dif > 0);
2597 plw->plw_ceq_mask[szc] = (plw->plw_ceq_dif - 1) << (ceq & 0xf);
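/*
 * Worked example (illustrative values, assuming vac_colors == 1): with
 * colors == 64 and ceq == 0x21, plw_ceq_dif == 64 >> (2 + 1) == 8 and
 * plw_ceq_mask[szc] == 7 << 1 == 0x0e, i.e. the two high-order and one
 * low-order color bits are ignored when matching colors.
 */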
2599 if (flags & PG_MATCH_COLOR) {
2600 if (cpu_page_colors < 0) {
2602 * this is a heterogeneous machine with different CPUs
2603 * having different size e$ (not supported for ni2/rock)
2605 uint_t cpucolors = CPUSETSIZE() >> PAGE_GET_SHIFT(szc);
2606 cpucolors = MAX(cpucolors, 1);
2607 ceq_mask = plw->plw_color_mask & (cpucolors - 1);
2608 plw->plw_ceq_mask[szc] =
2609 MIN(ceq_mask, plw->plw_ceq_mask[szc]);
2611 plw->plw_ceq_dif = 1;
2614 /* we can split pages in the freelist, but not the cachelist */
2615 if (can_split) {
2616 plw->plw_do_split = (szc + 1 < mmu_page_sizes) ? 1 : 0;
2618 /* set next szc color masks and number of free list bins */
2619 for (nszc = szc + 1; nszc < mmu_page_sizes; nszc++, szc++) {
2620 plw->plw_ceq_mask[nszc] = PAGE_GET_NSZ_MASK(szc,
2621 plw->plw_ceq_mask[szc]);
2622 plw->plw_bins[nszc] = PAGE_GET_PAGECOLORS(nszc);
2624 plw->plw_ceq_mask[nszc] = INVALID_MASK;
2625 plw->plw_bins[nszc] = 0;
2627 } else {
2628 ASSERT(szc == 0);
2629 plw->plw_do_split = 0;
2630 plw->plw_bins[1] = 0;
2631 plw->plw_ceq_mask[1] = INVALID_MASK;
2636 * set mark to flag where next split should occur
2638 #define PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw) { \
2639 uint_t bin_nsz = PAGE_GET_NSZ_COLOR(szc, bin); \
2640 uint_t bin0_nsz = PAGE_GET_NSZ_COLOR(szc, plw->plw_bin0); \
2641 uint_t neq_mask = ~plw->plw_ceq_mask[nszc] & plw->plw_color_mask; \
2642 plw->plw_split_next = \
2643 INC_MASKED(bin_nsz, neq_mask, plw->plw_color_mask); \
2644 if (!((plw->plw_split_next ^ bin0_nsz) & plw->plw_ceq_mask[nszc])) { \
2645 plw->plw_split_next = \
2646 INC_MASKED(plw->plw_split_next, \
2647 neq_mask, plw->plw_color_mask); \
2651 uint_t
2652 page_list_walk_next_bin(uchar_t szc, uint_t bin, page_list_walker_t *plw)
2654 uint_t neq_mask = ~plw->plw_ceq_mask[szc] & plw->plw_color_mask;
2655 uint_t bin0_nsz, nbin_nsz, nbin0, nbin;
2656 uchar_t nszc = szc + 1;
2658 nbin = ADD_MASKED(bin,
2659 plw->plw_bin_step, neq_mask, plw->plw_color_mask);
2661 if (plw->plw_do_split) {
2662 plw->plw_bin_split_prev = bin;
2663 PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw);
2664 plw->plw_do_split = 0;
2667 if (szc == 0) {
2668 if (plw->plw_count != 0 || plw->plw_ceq_dif == vac_colors) {
2669 if (nbin == plw->plw_bin0 &&
2670 (vac_colors == 1 || nbin != plw->plw_bin_marker)) {
2671 nbin = ADD_MASKED(nbin, plw->plw_bin_step,
2672 neq_mask, plw->plw_color_mask);
2673 plw->plw_bin_split_prev = plw->plw_bin0;
2676 if (vac_colors > 1 && nbin == plw->plw_bin_marker) {
2677 plw->plw_bin_marker =
2678 nbin = INC_MASKED(nbin, neq_mask,
2679 plw->plw_color_mask);
2680 plw->plw_bin_split_prev = plw->plw_bin0;
2682 * large pages all have the same vac color
2683 * so by now we should be done with next
2684 * size page splitting process
2686 ASSERT(plw->plw_bins[1] == 0);
2687 plw->plw_do_split = 0;
2688 return (nbin);
2691 } else {
2692 uint_t bin_jump = (vac_colors == 1) ?
2693 (BIN_STEP & ~3) - (plw->plw_bin0 & 3) : BIN_STEP;
2695 bin_jump &= ~(vac_colors - 1);
2697 nbin0 = ADD_MASKED(plw->plw_bin0, bin_jump, neq_mask,
2698 plw->plw_color_mask);
2700 if ((nbin0 ^ plw->plw_bin0) & plw->plw_ceq_mask[szc]) {
2702 plw->plw_bin_marker = nbin = nbin0;
2704 if (plw->plw_bins[nszc] != 0) {
2706 * check if next page size bin is the
2707 * same as the next page size bin for
2708 * bin0
2710 nbin_nsz = PAGE_GET_NSZ_COLOR(szc,
2711 nbin);
2712 bin0_nsz = PAGE_GET_NSZ_COLOR(szc,
2713 plw->plw_bin0);
2715 if ((bin0_nsz ^ nbin_nsz) &
2716 plw->plw_ceq_mask[nszc])
2717 plw->plw_do_split = 1;
2719 return (nbin);
2724 if (plw->plw_bins[nszc] != 0) {
2725 nbin_nsz = PAGE_GET_NSZ_COLOR(szc, nbin);
2726 if (!((plw->plw_split_next ^ nbin_nsz) &
2727 plw->plw_ceq_mask[nszc]))
2728 plw->plw_do_split = 1;
2731 return (nbin);
2734 page_t *
2735 page_get_mnode_freelist(int mnode, uint_t bin, int mtype, uchar_t szc,
2736 uint_t flags)
2738 kmutex_t *pcm;
2739 page_t *pp, *first_pp;
2740 uint_t sbin;
2741 int plw_initialized;
2742 page_list_walker_t plw;
2744 ASSERT(szc < mmu_page_sizes);
2746 VM_STAT_ADD(vmm_vmstats.pgmf_alloc[szc]);
2748 MTYPE_START(mnode, mtype, flags);
2749 if (mtype < 0) { /* mnode does not have memory in mtype range */
2750 VM_STAT_ADD(vmm_vmstats.pgmf_allocempty[szc]);
2751 return (NULL);
2753 try_again:
2755 plw_initialized = 0;
2756 plw.plw_ceq_dif = 1;
2759 * Only hold one freelist lock at a time, that way we
2760 * can start anywhere and not have to worry about lock
2761 * ordering.
2763 for (plw.plw_count = 0;
2764 plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
2765 sbin = bin;
2766 do {
2767 if (!PAGE_FREELISTS(mnode, szc, bin, mtype))
2768 goto bin_empty_1;
2770 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
2771 mutex_enter(pcm);
2772 pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
2773 if (pp == NULL)
2774 goto bin_empty_0;
2777 * These were set before the page
2778 * was put on the free list,
2779 * they must still be set.
2781 ASSERT(PP_ISFREE(pp));
2782 ASSERT(PP_ISAGED(pp));
2783 VERIFY(pp->p_object == NULL);
2784 ASSERT(pp->p_vnode == NULL);
2785 ASSERT(pp->p_offset == (uoff_t)-1);
2786 ASSERT(pp->p_szc == szc);
2787 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
2790 * Walk down the hash chain. 4k/8k pages are linked
2791 * on p_next and p_prev fields. Large pages are a
2792 * contiguous group of constituent pages linked
2793 * together on their p_next and p_prev fields. The
2794 * large pages are linked together on the hash chain
2795 * using p_list.largepg of the base constituent page
2796 * of each large page.
2798 first_pp = pp;
2799 while (!page_trylock_cons(pp, SE_EXCL)) {
2800 if (szc == 0) {
2801 pp = pp->p_next;
2802 } else {
2803 pp = pp->p_list.largepg.next;
2806 ASSERT(PP_ISFREE(pp));
2807 ASSERT(PP_ISAGED(pp));
2808 VERIFY(pp->p_object == NULL);
2809 ASSERT(pp->p_vnode == NULL);
2810 ASSERT(pp->p_offset == (uoff_t)-1);
2811 ASSERT(pp->p_szc == szc);
2812 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
2814 if (pp == first_pp)
2815 goto bin_empty_0;
2818 ASSERT(pp != NULL);
2819 ASSERT(mtype == PP_2_MTYPE(pp));
2820 ASSERT(pp->p_szc == szc);
2821 if (szc == 0) {
2822 page_sub(&PAGE_FREELISTS(mnode,
2823 szc, bin, mtype), pp);
2824 } else {
2825 page_lpsub(&PAGE_FREELISTS(mnode,
2826 szc, bin, mtype), pp);
2827 CHK_LPG(pp, szc);
2829 page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
2831 if ((PP_ISFREE(pp) == 0) || (PP_ISAGED(pp) == 0))
2832 panic("free page is not. pp %p", (void *)pp);
2833 mutex_exit(pcm);
2835 VM_STAT_ADD(vmm_vmstats.pgmf_allocok[szc]);
2836 return (pp);
2838 bin_empty_0:
2839 mutex_exit(pcm);
2840 bin_empty_1:
2841 if (plw_initialized == 0) {
2842 page_list_walk_init(szc, flags, bin, 1, 1,
2843 &plw);
2844 plw_initialized = 1;
2845 ASSERT(plw.plw_colors <=
2846 PAGE_GET_PAGECOLORS(szc));
2847 ASSERT(plw.plw_colors > 0);
2848 ASSERT((plw.plw_colors &
2849 (plw.plw_colors - 1)) == 0);
2850 ASSERT(bin < plw.plw_colors);
2851 ASSERT(plw.plw_ceq_mask[szc] < plw.plw_colors);
2853 /* calculate the next bin with equivalent color */
2854 bin = ADD_MASKED(bin, plw.plw_bin_step,
2855 plw.plw_ceq_mask[szc], plw.plw_color_mask);
2856 } while (sbin != bin);
2859 * All the equivalent color bins are empty. Try to
2860 * satisfy the request by breaking up or coalescing
2861 * pages from a different size freelist of the correct
2862 * color that satisfies the ORIGINAL color requested.
2863 * If that fails then try pages of the same size but
2864 * different colors assuming we are not called with
2865 * PG_MATCH_COLOR.
2867 if (plw.plw_do_split &&
2868 (pp = page_freelist_split(szc, bin, mnode,
2869 mtype, PFNNULL, PFNNULL, &plw)) != NULL)
2870 return (pp);
2872 if (szc > 0 && (pp = page_freelist_coalesce(mnode, szc,
2873 bin, plw.plw_ceq_mask[szc], mtype, PFNNULL)) != NULL)
2874 return (pp);
2876 if (plw.plw_ceq_dif > 1)
2877 bin = page_list_walk_next_bin(szc, bin, &plw);
2880 /* if allowed, cycle through additional mtypes */
2881 MTYPE_NEXT(mnode, mtype, flags);
2882 if (mtype >= 0)
2883 goto try_again;
2885 VM_STAT_ADD(vmm_vmstats.pgmf_allocfailed[szc]);
2887 return (NULL);
2891 * Returns the count of free pages for 'pp' with size code 'szc'.
2892 * Note: This function does not return an exact value as the page freelist
2893 * locks are not held and thus the values in the page_counters may be
2894 * changing as we walk through the data.
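/*
 * Sketch of the counter walk below: the szc-level counter contributes
 * cnt << PNUM_SHIFT(szc - 1) pages for the cnt full next-size-down
 * regions it has recorded; the loop then walks r down toward 1, adding
 * cnt << PNUM_SHIFT(r - 1) only for sub-regions that are not already
 * full, since full ones were accounted for one level up.
 */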
2896 static int
2897 page_freecnt(int mnode, page_t *pp, uchar_t szc)
2899 pgcnt_t pgfree;
2900 pgcnt_t cnt;
2901 ssize_t r = szc; /* region size */
2902 ssize_t idx;
2903 int i;
2904 int full, range;
2906 /* Make sure pagenum passed in is aligned properly */
2907 ASSERT((pp->p_pagenum & (PNUM_SIZE(szc) - 1)) == 0);
2908 ASSERT(szc > 0);
2910 /* Prevent page_counters dynamic memory from being freed */
2911 rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
2912 idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
2913 cnt = PAGE_COUNTERS(mnode, r, idx);
2914 pgfree = cnt << PNUM_SHIFT(r - 1);
2915 range = FULL_REGION_CNT(szc);
2917 /* Check for completely full region */
2918 if (cnt == range) {
2919 rw_exit(&page_ctrs_rwlock[mnode]);
2920 return (pgfree);
2923 while (--r > 0) {
2924 idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
2925 full = FULL_REGION_CNT(r);
2926 for (i = 0; i < range; i++, idx++) {
2927 cnt = PAGE_COUNTERS(mnode, r, idx);
2929 * If cnt here is full, that means we have already
2930 * accounted for these pages earlier.
2932 if (cnt != full) {
2933 pgfree += (cnt << PNUM_SHIFT(r - 1));
2936 range *= full;
2938 rw_exit(&page_ctrs_rwlock[mnode]);
2939 return (pgfree);
2943 * Called from page_geti_contig_pages to exclusively lock constituent pages
2944 * starting from 'spp' for page size code 'szc'.
2946 * If 'ptcpthreshold' is set, at least pgcnt/ptcpthreshold of the pages in
2947 * the 'szc' region must already be free before the trylock is attempted.
2949 static int
2950 page_trylock_contig_pages(int mnode, page_t *spp, uchar_t szc, int flags)
2952 pgcnt_t pgcnt = PNUM_SIZE(szc);
2953 pgcnt_t pgfree, i;
2954 page_t *pp;
2956 VM_STAT_ADD(vmm_vmstats.ptcp[szc]);
2959 if ((ptcpthreshold == 0) || (flags & PGI_PGCPHIPRI))
2960 goto skipptcpcheck;
2962 * check if there are sufficient free pages available before attempting
2963 * to trylock. Count is approximate as page counters can change.
2965 pgfree = page_freecnt(mnode, spp, szc);
2967 /* attempt to trylock if there are sufficient already free pages */
2968 if (pgfree < pgcnt/ptcpthreshold) {
2969 VM_STAT_ADD(vmm_vmstats.ptcpfreethresh[szc]);
2970 return (0);
2973 skipptcpcheck:
2975 for (i = 0; i < pgcnt; i++) {
2976 pp = &spp[i];
2977 if (!page_trylock(pp, SE_EXCL)) {
2978 VM_STAT_ADD(vmm_vmstats.ptcpfailexcl[szc]);
2979 while (--i != (pgcnt_t)-1) {
2980 pp = &spp[i];
2981 ASSERT(PAGE_EXCL(pp));
2982 page_unlock_nocapture(pp);
2984 return (0);
2986 ASSERT(spp[i].p_pagenum == spp->p_pagenum + i);
2987 if ((pp->p_szc > szc || (szc && pp->p_szc == szc)) &&
2988 !PP_ISFREE(pp)) {
2989 VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]);
2990 ASSERT(i == 0);
2991 page_unlock_nocapture(pp);
2992 return (0);
2996 * If a page has been marked non-relocatable or has been
2997 * explicitly locked in memory, we don't want to relocate it;
2998 * unlock the pages and fail the operation.
3000 if (PP_ISNORELOC(pp) ||
3001 pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
3002 VM_STAT_ADD(vmm_vmstats.ptcpfailcage[szc]);
3003 while (i != (pgcnt_t)-1) {
3004 pp = &spp[i];
3005 ASSERT(PAGE_EXCL(pp));
3006 page_unlock_nocapture(pp);
3007 i--;
3009 return (0);
3012 VM_STAT_ADD(vmm_vmstats.ptcpok[szc]);
3013 return (1);
3017 * Claim large page pointed to by 'pp'. 'pp' is the starting set
3018 * of 'szc' constituent pages that had been locked exclusively previously.
3019 * Will attempt to relocate constituent pages in use.
3021 static page_t *
3022 page_claim_contig_pages(page_t *pp, uchar_t szc, int flags)
3024 spgcnt_t pgcnt, npgs, i;
3025 page_t *targpp, *rpp, *hpp;
3026 page_t *replpp = NULL;
3027 page_t *pplist = NULL;
3029 ASSERT(pp != NULL);
3031 pgcnt = page_get_pagecnt(szc);
3032 while (pgcnt) {
3033 ASSERT(PAGE_EXCL(pp));
3034 ASSERT(!PP_ISNORELOC(pp));
3035 if (PP_ISFREE(pp)) {
3037 * If this is a PG_FREE_LIST page then its
3038 * size code can change underneath us due to
3039 * page promotion or demotion. As an optimization
3040 * use page_list_sub_pages() instead of
3041 * page_list_sub().
3043 if (PP_ISAGED(pp)) {
3044 page_list_sub_pages(pp, szc);
3045 if (pp->p_szc == szc) {
3046 return (pp);
3048 ASSERT(pp->p_szc < szc);
3049 npgs = page_get_pagecnt(pp->p_szc);
3050 hpp = pp;
3051 for (i = 0; i < npgs; i++, pp++) {
3052 pp->p_szc = szc;
3054 page_list_concat(&pplist, &hpp);
3055 pgcnt -= npgs;
3056 continue;
3058 ASSERT(!PP_ISAGED(pp));
3059 ASSERT(pp->p_szc == 0);
3060 page_list_sub(pp, PG_CACHE_LIST);
3061 page_hashout(pp, false);
3062 PP_SETAGED(pp);
3063 pp->p_szc = szc;
3064 page_list_concat(&pplist, &pp);
3065 pp++;
3066 pgcnt--;
3067 continue;
3069 npgs = page_get_pagecnt(pp->p_szc);
3072 * page_create_wait freemem accounting done by caller of
3073 * page_get_freelist and not necessary to call it prior to
3074 * calling page_get_replacement_page.
3076 * page_get_replacement_page can call page_get_contig_pages
3077 * to acquire a large page (szc > 0); the replacement must be
3078 * smaller than the contig page size to avoid looping or
3079 * szc == 0 and PGI_PGCPSZC0 is set.
3081 if (pp->p_szc < szc || (szc == 0 && (flags & PGI_PGCPSZC0))) {
3082 replpp = page_get_replacement_page(pp, NULL, 0);
3083 if (replpp) {
3084 npgs = page_get_pagecnt(pp->p_szc);
3085 ASSERT(npgs <= pgcnt);
3086 targpp = pp;
3091 * If replacement is NULL or do_page_relocate fails, fail
3092 * coalescing of pages.
3094 if (replpp == NULL || (do_page_relocate(&targpp, &replpp, 0,
3095 &npgs, NULL) != 0)) {
3097 * Unlock un-processed target list
3099 while (pgcnt--) {
3100 ASSERT(PAGE_EXCL(pp));
3101 page_unlock_nocapture(pp);
3102 pp++;
3105 * Free the processed target list.
3107 while (pplist) {
3108 pp = pplist;
3109 page_sub(&pplist, pp);
3110 ASSERT(PAGE_EXCL(pp));
3111 ASSERT(pp->p_szc == szc);
3112 ASSERT(PP_ISFREE(pp));
3113 ASSERT(PP_ISAGED(pp));
3114 pp->p_szc = 0;
3115 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
3116 page_unlock_nocapture(pp);
3119 if (replpp != NULL)
3120 page_free_replacement_page(replpp);
3122 return (NULL);
3124 ASSERT(pp == targpp);
3126 ASSERT(hpp = pp); /* That's right, it's an assignment */
3128 pp += npgs;
3129 pgcnt -= npgs;
3131 while (npgs--) {
3132 ASSERT(PAGE_EXCL(targpp));
3133 ASSERT(!PP_ISFREE(targpp));
3134 ASSERT(!PP_ISNORELOC(targpp));
3135 PP_SETFREE(targpp);
3136 ASSERT(PP_ISAGED(targpp));
3137 ASSERT(targpp->p_szc < szc || (szc == 0 &&
3138 (flags & PGI_PGCPSZC0)));
3139 targpp->p_szc = szc;
3140 targpp = targpp->p_next;
3142 rpp = replpp;
3143 ASSERT(rpp != NULL);
3144 page_sub(&replpp, rpp);
3145 ASSERT(PAGE_EXCL(rpp));
3146 ASSERT(!PP_ISFREE(rpp));
3147 page_unlock_nocapture(rpp);
3149 ASSERT(targpp == hpp);
3150 ASSERT(replpp == NULL);
3151 page_list_concat(&pplist, &targpp);
3153 CHK_LPG(pplist, szc);
3154 return (pplist);
3158 * Trim kernel cage from pfnlo-pfnhi and store result in lo-hi. Return code
3159 * of 0 means nothing left after trim.
3162 trimkcage(struct memseg *mseg, pfn_t *lo, pfn_t *hi, pfn_t pfnlo, pfn_t pfnhi)
3164 pfn_t kcagepfn;
3165 int decr;
3166 int rc = 0;
3168 if (PP_ISNORELOC(mseg->pages)) {
3169 if (PP_ISNORELOC(mseg->epages - 1) == 0) {
3171 /* lower part of this mseg inside kernel cage */
3172 decr = kcage_current_pfn(&kcagepfn);
3174 /* kernel cage may have transitioned past mseg */
3175 if (kcagepfn >= mseg->pages_base &&
3176 kcagepfn < mseg->pages_end) {
3177 ASSERT(decr == 0);
3178 *lo = MAX(kcagepfn, pfnlo);
3179 *hi = MIN(pfnhi, (mseg->pages_end - 1));
3180 rc = 1;
3183 /* else entire mseg in the cage */
3184 } else {
3185 if (PP_ISNORELOC(mseg->epages - 1)) {
3187 /* upper part of this mseg inside kernel cage */
3188 decr = kcage_current_pfn(&kcagepfn);
3190 /* kernel cage may have transitioned past mseg */
3191 if (kcagepfn >= mseg->pages_base &&
3192 kcagepfn < mseg->pages_end) {
3193 ASSERT(decr);
3194 *hi = MIN(kcagepfn, pfnhi);
3195 *lo = MAX(pfnlo, mseg->pages_base);
3196 rc = 1;
3198 } else {
3199 /* entire mseg outside of kernel cage */
3200 *lo = MAX(pfnlo, mseg->pages_base);
3201 *hi = MIN(pfnhi, (mseg->pages_end - 1));
3202 rc = 1;
3205 return (rc);
3209 * called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to claim a
3210 * page with size code 'szc'. Claiming such a page requires acquiring
3211 * exclusive locks on all constituent pages (page_trylock_contig_pages),
3212 * relocating pages in use and concatenating these constituent pages into a
3213 * large page.
3215 * The page lists do not have such a large page and page_freelist_split has
3216 * already failed to demote larger pages and/or coalesce smaller free pages.
3218 * 'flags' may specify PG_COLOR_MATCH which would limit the search of large
3219 * pages with the same color as 'bin'.
3221 * 'pfnflag' specifies the subset of the pfn range to search.
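/*
 * Note: page_get_contig_pages() below derives the initial [pfnlo, pfnhi]
 * from MNODETYPE_2_PFN() and, when pgcplimitsearch is set and
 * PGI_PGCPHIPRI is not, passes pgcpfailcnt[szc] as 'pfnflag', which is
 * what progressively narrows the slot searched here after repeated
 * failures.
 */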
3224 static page_t *
3225 page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags,
3226 pfn_t pfnlo, pfn_t pfnhi, pgcnt_t pfnflag)
3228 struct memseg *mseg;
3229 pgcnt_t szcpgcnt = page_get_pagecnt(szc);
3230 pgcnt_t szcpgmask = szcpgcnt - 1;
3231 pfn_t randpfn;
3232 page_t *pp, *randpp, *endpp;
3233 uint_t colors, ceq_mask;
3234 uint_t color_mask;
3235 pfn_t hi, lo;
3236 uint_t skip;
3237 MEM_NODE_ITERATOR_DECL(it);
3239 ASSERT(szc != 0 || (flags & PGI_PGCPSZC0));
3241 pfnlo = P2ROUNDUP(pfnlo, szcpgcnt);
3243 if ((pfnhi - pfnlo) + 1 < szcpgcnt || pfnlo >= pfnhi)
3244 return (NULL);
3246 ASSERT(szc < mmu_page_sizes);
3248 colors = PAGE_GET_PAGECOLORS(szc);
3249 color_mask = colors - 1;
3250 if ((colors > 1) && (flags & PG_MATCH_COLOR)) {
3251 uchar_t ceq = colorequivszc[szc];
3252 uint_t ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));
3254 ASSERT(ceq_dif > 0);
3255 ceq_mask = (ceq_dif - 1) << (ceq & 0xf);
3256 } else {
3257 ceq_mask = 0;
3260 ASSERT(bin < colors);
3262 /* clear "non-significant" color bits */
3263 bin &= ceq_mask;
3266 * trim the pfn range to search based on pfnflag. pfnflag is set
3267 * when there have been previous page_get_contig_pages failures to
3268 * limit the search.
3270 * The high bit in pfnflag specifies the number of 'slots' in the
3271 * pfn range and the remainder of pfnflag specifies which slot.
3272 * For example, a value of 1010b selects slot 2 (counting from 0) of
3273 * a pfn range that has been divided into 8 slots.
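/*
 * Worked out with those example bits: slots == 1 << (highbit(1010b) - 1)
 * == 8 and slotid == 1010b & (8 - 1) == 2, so the search below is
 * confined to one eighth of the aligned pfn range.
 */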
3275 if (pfnflag > 1) {
3276 int slots = 1 << (highbit(pfnflag) - 1);
3277 int slotid = pfnflag & (slots - 1);
3278 pgcnt_t szcpages;
3279 int slotlen;
3281 pfnhi = P2ALIGN((pfnhi + 1), szcpgcnt) - 1;
3282 szcpages = ((pfnhi - pfnlo) + 1) / szcpgcnt;
3283 slotlen = howmany(szcpages, slots);
3284 /* skip if 'slotid' slot is empty */
3285 if (slotid * slotlen >= szcpages)
3286 return (NULL);
3287 pfnlo = pfnlo + (((slotid * slotlen) % szcpages) * szcpgcnt);
3288 ASSERT(pfnlo < pfnhi);
3289 if (pfnhi > pfnlo + (slotlen * szcpgcnt))
3290 pfnhi = pfnlo + (slotlen * szcpgcnt) - 1;
3294 * This routine can be called recursively, so we shouldn't
3295 * acquire a reader lock if a write request is pending. This
3296 * could lead to a deadlock with the DR thread.
3298 * Returning NULL informs the caller that we could not get
3299 * a contig page with the required characteristics.
3302 if (!memsegs_trylock(0))
3303 return (NULL);
3306 * loop through memsegs to look for contig page candidates
3309 for (mseg = memsegs; mseg != NULL; mseg = mseg->next) {
3310 if (pfnhi < mseg->pages_base || pfnlo >= mseg->pages_end) {
3311 /* no overlap */
3312 continue;
3315 if (mseg->pages_end - mseg->pages_base < szcpgcnt)
3316 /* mseg too small */
3317 continue;
3320 * trim off kernel cage pages from pfn range and check for
3321 * a trimmed pfn range returned that does not span the
3322 * desired large page size.
3324 if (kcage_on) {
3325 if (trimkcage(mseg, &lo, &hi, pfnlo, pfnhi) == 0 ||
3326 lo >= hi || ((hi - lo) + 1) < szcpgcnt)
3327 continue;
3328 } else {
3329 lo = MAX(pfnlo, mseg->pages_base);
3330 hi = MIN(pfnhi, (mseg->pages_end - 1));
3333 /* round to szcpgcnt boundaries */
3334 lo = P2ROUNDUP(lo, szcpgcnt);
3336 MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
3337 hi = P2ALIGN((hi + 1), szcpgcnt) - 1;
3339 if (hi <= lo)
3340 continue;
3343 * set lo to point to the pfn for the desired bin. Large
3344 * page sizes may only have a single page color
3346 skip = szcpgcnt;
3347 if (ceq_mask > 0 || interleaved_mnodes) {
3348 /* set lo to point at appropriate color */
3349 if (((PFN_2_COLOR(lo, szc, &it) ^ bin) & ceq_mask) ||
3350 (interleaved_mnodes &&
3351 PFN_2_MEM_NODE(lo) != mnode)) {
3352 PAGE_NEXT_PFN_FOR_COLOR(lo, szc, bin, ceq_mask,
3353 color_mask, &it);
3355 if (hi <= lo)
3356 /* mseg cannot satisfy color request */
3357 continue;
3360 /* randomly choose a point between lo and hi to begin search */
3362 randpfn = (pfn_t)GETTICK();
3363 randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1);
3364 MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc, &it);
3365 if (ceq_mask || interleaved_mnodes || randpfn == (pfn_t)-1) {
3366 if (randpfn != (pfn_t)-1) {
3367 PAGE_NEXT_PFN_FOR_COLOR(randpfn, szc, bin,
3368 ceq_mask, color_mask, &it);
3370 if (randpfn >= hi) {
3371 randpfn = lo;
3372 MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc,
3373 &it);
3376 randpp = mseg->pages + (randpfn - mseg->pages_base);
3378 ASSERT(randpp->p_pagenum == randpfn);
3380 pp = randpp;
3381 endpp = mseg->pages + (hi - mseg->pages_base) + 1;
3383 ASSERT(randpp + szcpgcnt <= endpp);
3385 do {
3386 ASSERT(!(pp->p_pagenum & szcpgmask));
3387 ASSERT(((PP_2_BIN(pp) ^ bin) & ceq_mask) == 0);
3389 if (page_trylock_contig_pages(mnode, pp, szc, flags)) {
3390 /* pages unlocked by page_claim on failure */
3391 if (page_claim_contig_pages(pp, szc, flags)) {
3392 memsegs_unlock(0);
3393 return (pp);
3397 if (ceq_mask == 0 && !interleaved_mnodes) {
3398 pp += skip;
3399 } else {
3400 pfn_t pfn = pp->p_pagenum;
3402 PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, bin,
3403 ceq_mask, color_mask, &it);
3404 if (pfn == (pfn_t)-1) {
3405 pp = endpp;
3406 } else {
3407 pp = mseg->pages +
3408 (pfn - mseg->pages_base);
3411 if (pp >= endpp) {
3412 /* start from the beginning */
3413 MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
3414 pp = mseg->pages + (lo - mseg->pages_base);
3415 ASSERT(pp->p_pagenum == lo);
3416 ASSERT(pp + szcpgcnt <= endpp);
3418 } while (pp != randpp);
3420 memsegs_unlock(0);
3421 return (NULL);
3426 * controlling routine that searches through physical memory in an attempt to
3427 * claim a large page based on the input parameters when no such page
3428 * is available on the page free lists.
3430 * calls page_geti_contig_pages with an initial pfn range from the mnode
3431 * and mtype. page_geti_contig_pages will trim off the parts of the pfn range
3432 * that overlaps with the kernel cage or does not match the requested page
3433 * color if PG_MATCH_COLOR is set. Since this search is very expensive,
3434 * page_geti_contig_pages may further limit the search range based on
3435 * previous failure counts (pgcpfailcnt[]).
3437 * for PGI_PGCPSZC0 requests, page_get_contig_pages will relocate a base
3438 * pagesize page that satisfies mtype.
3440 page_t *
3441 page_get_contig_pages(int mnode, uint_t bin, int mtype, uchar_t szc,
3442 uint_t flags)
3444 pfn_t pfnlo, pfnhi; /* contig pages pfn range */
3445 page_t *pp;
3446 pgcnt_t pfnflag = 0; /* no limit on search if 0 */
3448 VM_STAT_ADD(vmm_vmstats.pgcp_alloc[szc]);
3450 /* no allocations from cage */
3451 flags |= PGI_NOCAGE;
3453 MTYPE_START(mnode, mtype, flags);
3454 if (mtype < 0) { /* mnode does not have memory in mtype range */
3455 VM_STAT_ADD(vmm_vmstats.pgcp_allocempty[szc]);
3456 return (NULL);
3459 ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));
3461 /* do not limit search and ignore color if hi pri */
3463 if (pgcplimitsearch && ((flags & PGI_PGCPHIPRI) == 0))
3464 pfnflag = pgcpfailcnt[szc];
3466 /* remove color match to improve chances */
3468 if (flags & PGI_PGCPHIPRI || pfnflag)
3469 flags &= ~PG_MATCH_COLOR;
3471 do {
3472 /* get pfn range based on mnode and mtype */
3473 MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi);
3475 ASSERT(pfnhi >= pfnlo);
3477 pp = page_geti_contig_pages(mnode, bin, szc, flags,
3478 pfnlo, pfnhi, pfnflag);
3480 if (pp != NULL) {
3481 pfnflag = pgcpfailcnt[szc];
3482 if (pfnflag) {
3483 /* double the search size */
3484 pgcpfailcnt[szc] = pfnflag >> 1;
3486 VM_STAT_ADD(vmm_vmstats.pgcp_allocok[szc]);
3487 return (pp);
3489 MTYPE_NEXT(mnode, mtype, flags);
3490 } while (mtype >= 0);
3492 VM_STAT_ADD(vmm_vmstats.pgcp_allocfailed[szc]);
3493 return (NULL);
3496 #if defined(__i386) || defined(__amd64)
3498 * Determine the likelihood of finding/coalescing a szc page.
3499 * Return 0 if the likelihood is small, otherwise return 1.
3501 * For now, be conservative and check only 1g pages and return 0
3502 * if there had been previous coalescing failures and the szc pages
3503 * needed to satisfy request would exhaust most of freemem.
3506 page_chk_freelist(uint_t szc)
3508 pgcnt_t pgcnt;
3510 if (szc <= 1)
3511 return (1);
3513 pgcnt = page_get_pagecnt(szc);
3514 if (pgcpfailcnt[szc] && pgcnt + throttlefree >= freemem) {
3515 VM_STAT_ADD(vmm_vmstats.pcf_deny[szc]);
3516 return (0);
3518 VM_STAT_ADD(vmm_vmstats.pcf_allow[szc]);
3519 return (1);
3521 #endif
3524 * Find the `best' page on the freelist for this (obj,off) (as,vaddr) pair.
3526 * Does its own locking and accounting.
3527 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
3528 * pages of the proper color even if there are pages of a different color.
3530 * Finds a page, removes it, THEN locks it.
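/*
 * In outline, the search below proceeds: local mnode freelists via
 * page_get_mnode_freelist() first; plain PAGESIZE requests (no
 * PGI_PGCPSZC0) then bail out so the caller can try page_get_cachelist();
 * otherwise remote mnodes are tried (unless PG_LOCAL), and finally the
 * whole walk is retried with page_get_contig_pages() unless PG_NORELOC
 * is set, pg_contig_disable is nonzero, or the cage checks below
 * forbid it.
 */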
3533 /*ARGSUSED*/
3534 page_t *
3535 page_get_freelist(struct vmobject *obj, uoff_t off, struct seg *seg,
3536 caddr_t vaddr, size_t size, uint_t flags, struct lgrp *lgrp)
3538 struct as *as = seg->s_as;
3539 page_t *pp = NULL;
3540 ulong_t bin;
3541 uchar_t szc;
3542 int mnode;
3543 int mtype;
3544 page_t *(*page_get_func)(int, uint_t, int, uchar_t, uint_t);
3545 lgrp_mnode_cookie_t lgrp_cookie;
3547 page_get_func = page_get_mnode_freelist;
3550 * If we aren't passed a specific lgroup, or passed a freed lgrp
3551 * assume we wish to allocate near to the current thread's home.
3553 if (!LGRP_EXISTS(lgrp))
3554 lgrp = lgrp_home_lgrp();
3556 if (kcage_on) {
3557 if ((flags & (PG_NORELOC | PG_PANIC)) == PG_NORELOC &&
3558 kcage_freemem < kcage_throttlefree + btop(size) &&
3559 curthread != kcage_cageout_thread) {
3561 * Set a "reserve" of kcage_throttlefree pages for
3562 * PG_PANIC and cageout thread allocations.
3564 * Everybody else has to serialize in
3565 * page_create_get_something() to get a cage page, so
3566 * that we don't deadlock cageout!
3568 return (NULL);
3570 } else {
3571 flags &= ~PG_NORELOC;
3572 flags |= PGI_NOCAGE;
3575 MTYPE_INIT(mtype, obj->vnode, vaddr, flags, size);
3578 * Convert size to page size code.
3580 if ((szc = page_szc(size)) == (uchar_t)-1)
3581 panic("page_get_freelist: illegal page size request");
3582 ASSERT(szc < mmu_page_sizes);
3584 VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc]);
3586 AS_2_BIN(as, seg, obj->vnode, vaddr, bin, szc);
3588 ASSERT(bin < PAGE_GET_PAGECOLORS(szc));
3591 * Try to get a local page first, but try remote if we can't
3592 * get a page of the right color.
3594 pgretry:
3595 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
3596 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3597 pp = page_get_func(mnode, bin, mtype, szc, flags);
3598 if (pp != NULL) {
3599 VM_STAT_ADD(vmm_vmstats.pgf_allocok[szc]);
3600 DTRACE_PROBE4(page__get,
3601 lgrp_t *, lgrp,
3602 int, mnode,
3603 ulong_t, bin,
3604 uint_t, flags);
3605 return (pp);
3608 ASSERT(pp == NULL);
3611 * For PAGESIZE requests without PGI_PGCPSZC0 set, defer to the cachelist
3612 * before checking remote free lists. The caller is expected to call
3613 * page_get_cachelist, which checks local cache lists and remote free lists.
3615 if (szc == 0 && ((flags & PGI_PGCPSZC0) == 0)) {
3616 VM_STAT_ADD(vmm_vmstats.pgf_allocdeferred);
3617 return (NULL);
3620 ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));
3622 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3624 if (!(flags & PG_LOCAL)) {
3626 * Try to get a non-local freelist page.
3628 LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
3629 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3630 pp = page_get_func(mnode, bin, mtype, szc, flags);
3631 if (pp != NULL) {
3632 DTRACE_PROBE4(page__get,
3633 lgrp_t *, lgrp,
3634 int, mnode,
3635 ulong_t, bin,
3636 uint_t, flags);
3637 VM_STAT_ADD(vmm_vmstats.pgf_allocokrem[szc]);
3638 return (pp);
3641 ASSERT(pp == NULL);
3645 * when the cage is off chances are page_get_contig_pages() will fail
3646 * to lock a large page chunk therefore when the cage is off it's not
3647 * called by default. this can be changed via /etc/system.
3649 * page_get_contig_pages() also called to acquire a base pagesize page
3650 * for page_create_get_something().
3652 if (!(flags & PG_NORELOC) && (pg_contig_disable == 0) &&
3653 (kcage_on || pg_lpgcreate_nocage || szc == 0) &&
3654 (page_get_func != page_get_contig_pages)) {
3656 VM_STAT_ADD(vmm_vmstats.pgf_allocretry[szc]);
3657 page_get_func = page_get_contig_pages;
3658 goto pgretry;
3661 if (!(flags & PG_LOCAL) && pgcplimitsearch &&
3662 page_get_func == page_get_contig_pages)
3663 SETPGCPFAILCNT(szc);
3665 VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc]);
3666 return (NULL);
3670 * Find the `best' page on the cachelist for this (obj,off) (as,vaddr) pair.
3672 * Does its own locking.
3673 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
3674 * pages of the proper color even if there are pages of a different color.
3675 * Otherwise, scan the bins for ones with pages. For each bin with pages,
3676 * try to lock one of them. If no page can be locked, try the
3677 * next bin. Return NULL if a page can not be found and locked.
3679 * Finds a page, tries to lock it, then removes it.
3682 /*ARGSUSED*/
3683 struct page *
3684 page_get_cachelist(struct vmobject *obj, uoff_t off, struct seg *seg,
3685 caddr_t vaddr, uint_t flags, struct lgrp *lgrp)
3687 page_t *pp;
3688 struct as *as = seg->s_as;
3689 ulong_t bin;
3690 int mnode;
3691 int mtype;
3692 lgrp_mnode_cookie_t lgrp_cookie;
3695 * If we aren't passed a specific lgroup, or passed a freed lgrp
3696 * assume we wish to allocate near to the current thread's home.
3698 if (!LGRP_EXISTS(lgrp))
3699 lgrp = lgrp_home_lgrp();
3701 if (!kcage_on) {
3702 flags &= ~PG_NORELOC;
3703 flags |= PGI_NOCAGE;
3706 if ((flags & (PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == PG_NORELOC &&
3707 kcage_freemem <= kcage_throttlefree) {
3709 * Reserve kcage_throttlefree pages for critical kernel
3710 * threads.
3712 * Everybody else has to go to page_create_get_something()
3713 * to get a cage page, so we don't deadlock cageout.
3715 return (NULL);
3718 AS_2_BIN(as, seg, obj->vnode, vaddr, bin, 0);
3720 ASSERT(bin < PAGE_GET_PAGECOLORS(0));
3722 MTYPE_INIT(mtype, obj->vnode, vaddr, flags, MMU_PAGESIZE);
3724 VM_STAT_ADD(vmm_vmstats.pgc_alloc);
3727 * Try local cachelists first
3729 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
3730 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3731 pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
3732 if (pp != NULL) {
3733 VM_STAT_ADD(vmm_vmstats.pgc_allocok);
3734 DTRACE_PROBE4(page__get,
3735 lgrp_t *, lgrp,
3736 int, mnode,
3737 ulong_t, bin,
3738 uint_t, flags);
3739 return (pp);
3743 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3746 * Try freelists/cachelists that are farther away
3747 * This is our only chance to allocate remote pages for PAGESIZE
3748 * requests.
3750 LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
3751 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3752 pp = page_get_mnode_freelist(mnode, bin, mtype,
3753 0, flags);
3754 if (pp != NULL) {
3755 VM_STAT_ADD(vmm_vmstats.pgc_allocokdeferred);
3756 DTRACE_PROBE4(page__get,
3757 lgrp_t *, lgrp,
3758 int, mnode,
3759 ulong_t, bin,
3760 uint_t, flags);
3761 return (pp);
3763 pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
3764 if (pp != NULL) {
3765 VM_STAT_ADD(vmm_vmstats.pgc_allocokrem);
3766 DTRACE_PROBE4(page__get,
3767 lgrp_t *, lgrp,
3768 int, mnode,
3769 ulong_t, bin,
3770 uint_t, flags);
3771 return (pp);
3775 VM_STAT_ADD(vmm_vmstats.pgc_allocfailed);
3776 return (NULL);
3779 page_t *
3780 page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype)
3782 kmutex_t *pcm;
3783 page_t *pp, *first_pp;
3784 uint_t sbin;
3785 int plw_initialized;
3786 page_list_walker_t plw;
3788 VM_STAT_ADD(vmm_vmstats.pgmc_alloc);
3790 MTYPE_START(mnode, mtype, flags);
3791 if (mtype < 0) { /* mnode does not have memory in mtype range */
3792 VM_STAT_ADD(vmm_vmstats.pgmc_allocempty);
3793 return (NULL);
3796 try_again:
3798 plw_initialized = 0;
3799 plw.plw_ceq_dif = 1;
3802 * Only hold one cachelist lock at a time, that way we
3803 * can start anywhere and not have to worry about lock
3804 * ordering.
3807 for (plw.plw_count = 0;
3808 plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
3809 sbin = bin;
3810 do {
3812 if (!PAGE_CACHELISTS(mnode, bin, mtype))
3813 goto bin_empty_1;
3814 pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
3815 mutex_enter(pcm);
3816 pp = PAGE_CACHELISTS(mnode, bin, mtype);
3817 if (pp == NULL)
3818 goto bin_empty_0;
3820 first_pp = pp;
3821 VERIFY(pp->p_object);
3822 ASSERT(pp->p_vnode);
3823 ASSERT(PP_ISAGED(pp) == 0);
3824 ASSERT(pp->p_szc == 0);
3825 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
3826 while (!page_trylock(pp, SE_EXCL)) {
3827 pp = pp->p_next;
3828 ASSERT(pp->p_szc == 0);
3829 if (pp == first_pp) {
3831 * We have searched the complete list!
3832 * And all of them (might only be one)
3833 * are locked. This can happen since
3834 * these pages can also be found via
3835 * the hash list. When found via the
3836 * hash list, they are locked first,
3837 * then removed. We give up to let the
3838 * other thread run.
3840 pp = NULL;
3841 break;
3843 VERIFY(pp->p_object);
3844 ASSERT(pp->p_vnode);
3845 ASSERT(PP_ISFREE(pp));
3846 ASSERT(PP_ISAGED(pp) == 0);
3847 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) ==
3848 mnode);
3851 if (pp) {
3852 page_t **ppp;
3854 * Found and locked a page.
3855 * Pull it off the list.
3857 ASSERT(mtype == PP_2_MTYPE(pp));
3858 ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
3859 page_sub(ppp, pp);
3861 * Subtract counters before releasing pcm mutex
3862 * to avoid a race with page_freelist_coalesce
3863 * and page_freelist_split.
3865 page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);
3866 mutex_exit(pcm);
3867 VERIFY(pp->p_object);
3868 ASSERT(pp->p_vnode);
3869 ASSERT(PP_ISAGED(pp) == 0);
3870 VM_STAT_ADD(vmm_vmstats.pgmc_allocok);
3871 return (pp);
3873 bin_empty_0:
3874 mutex_exit(pcm);
3875 bin_empty_1:
3876 if (plw_initialized == 0) {
3877 page_list_walk_init(0, flags, bin, 0, 1, &plw);
3878 plw_initialized = 1;
3880 /* calculate the next bin with equivalent color */
3881 bin = ADD_MASKED(bin, plw.plw_bin_step,
3882 plw.plw_ceq_mask[0], plw.plw_color_mask);
3883 } while (sbin != bin);
3885 if (plw.plw_ceq_dif > 1)
3886 bin = page_list_walk_next_bin(0, bin, &plw);
3889 MTYPE_NEXT(mnode, mtype, flags);
3890 if (mtype >= 0)
3891 goto try_again;
3893 VM_STAT_ADD(vmm_vmstats.pgmc_allocfailed);
3894 return (NULL);
3897 #ifdef DEBUG
3898 #define REPL_PAGE_STATS
3899 #endif /* DEBUG */
3901 #ifdef REPL_PAGE_STATS
3902 struct repl_page_stats {
3903 uint_t ngets;
3904 uint_t ngets_noreloc;
3905 uint_t npgr_noreloc;
3906 uint_t nnopage_first;
3907 uint_t nnopage;
3908 uint_t nhashout;
3909 uint_t nnofree;
3910 uint_t nnext_pp;
3911 } repl_page_stats;
3912 #define REPL_STAT_INCR(v) atomic_inc_32(&repl_page_stats.v)
3913 #else /* REPL_PAGE_STATS */
3914 #define REPL_STAT_INCR(v)
3915 #endif /* REPL_PAGE_STATS */
3917 int pgrppgcp;
3920 * The freemem accounting must be done by the caller.
3921 * First we try to get a replacement page of the same size as like_pp;
3922 * if that is not possible, then we just get a set of discontiguous
3923 * PAGESIZE pages.
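/*
 * Illustrative use, modeled on page_claim_contig_pages() above (replpp,
 * targpp and npgs are that caller's locals):
 *
 *	replpp = page_get_replacement_page(pp, NULL, 0);
 *	if (replpp == NULL ||
 *	    do_page_relocate(&targpp, &replpp, 0, &npgs, NULL) != 0)
 *		... back out and free any partial list ...
 *
 * The target page is held SE_EXCL by the caller, which, as noted above,
 * also does the freemem accounting.
 */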
3925 page_t *
3926 page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target,
3927 uint_t pgrflags)
3929 page_t *like_pp;
3930 page_t *pp, *pplist;
3931 page_t *pl = NULL;
3932 ulong_t bin;
3933 int mnode, page_mnode;
3934 int szc;
3935 spgcnt_t npgs, pg_cnt;
3936 pfn_t pfnum;
3937 int mtype;
3938 int flags = 0;
3939 lgrp_mnode_cookie_t lgrp_cookie;
3940 lgrp_t *lgrp;
3942 REPL_STAT_INCR(ngets);
3943 like_pp = orig_like_pp;
3944 ASSERT(PAGE_EXCL(like_pp));
3946 szc = like_pp->p_szc;
3947 npgs = page_get_pagecnt(szc);
3949 * Now we reset like_pp to the base page_t.
3950 * That way, we won't walk past the end of this 'szc' page.
3952 pfnum = PFN_BASE(like_pp->p_pagenum, szc);
3953 like_pp = page_numtopp_nolock(pfnum);
3954 ASSERT(like_pp->p_szc == szc);
3956 if (PP_ISNORELOC(like_pp)) {
3957 ASSERT(kcage_on);
3958 REPL_STAT_INCR(ngets_noreloc);
3959 flags = PGI_RELOCONLY;
3960 } else if (pgrflags & PGR_NORELOC) {
3961 ASSERT(kcage_on);
3962 REPL_STAT_INCR(npgr_noreloc);
3963 flags = PG_NORELOC;
3967 * Kernel pages must always be replaced with the same size
3968 * pages, since we cannot properly handle demotion of kernel
3969 * pages.
3971 if (PP_ISKAS(like_pp))
3972 pgrflags |= PGR_SAMESZC;
3974 MTYPE_PGR_INIT(mtype, flags, like_pp, page_mnode, npgs);
3976 while (npgs) {
3977 pplist = NULL;
3978 for (;;) {
3979 pg_cnt = page_get_pagecnt(szc);
3980 bin = PP_2_BIN(like_pp);
3981 ASSERT(like_pp->p_szc == orig_like_pp->p_szc);
3982 ASSERT(pg_cnt <= npgs);
3985 * If an lgroup was specified, try to get the
3986 * page from that lgroup.
3987 * NOTE: Must be careful with code below because
3988 * lgroup may disappear and reappear since there
3989 * is no locking for lgroup here.
3991 if (LGRP_EXISTS(lgrp_target)) {
3993 * Keep local variable for lgroup separate
3994 * from lgroup argument since this code should
3995 * only be exercised when lgroup argument
3996 * exists....
3998 lgrp = lgrp_target;
4000 /* Try the lgroup's freelists first */
4001 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4002 LGRP_SRCH_LOCAL);
4003 while ((pplist == NULL) &&
4004 (mnode = lgrp_memnode_choose(&lgrp_cookie))
4005 != -1) {
4006 pplist =
4007 page_get_mnode_freelist(mnode, bin,
4008 mtype, szc, flags);
4012 * Now try its cachelists if this is a
4013 * small page. Don't need to do it for
4014 * larger ones since page_freelist_coalesce()
4015 * already failed.
4017 if (pplist != NULL || szc != 0)
4018 break;
4020 /* Now try its cachelists */
4021 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4022 LGRP_SRCH_LOCAL);
4024 while ((pplist == NULL) &&
4025 (mnode = lgrp_memnode_choose(&lgrp_cookie))
4026 != -1) {
4027 pplist =
4028 page_get_mnode_cachelist(bin, flags,
4029 mnode, mtype);
4031 if (pplist != NULL) {
4032 page_hashout(pplist, false);
4033 PP_SETAGED(pplist);
4034 REPL_STAT_INCR(nhashout);
4035 break;
4037 /* Done looking in this lgroup. Bail out. */
4038 break;
4042 * No lgroup was specified (or the lgroup was removed by
4043 * DR), so just try to get the page as close to
4044 * like_pp's mnode as possible.
4045 * First try the local freelist...
4047 mnode = PP_2_MEM_NODE(like_pp);
4048 pplist = page_get_mnode_freelist(mnode, bin,
4049 mtype, szc, flags);
4050 if (pplist != NULL)
4051 break;
4053 REPL_STAT_INCR(nnofree);
4056 * ...then the local cachelist. Don't need to do it for
4057 * larger pages because page_freelist_coalesce() already
4058 * failed there anyway.
4060 if (szc == 0) {
4061 pplist = page_get_mnode_cachelist(bin, flags,
4062 mnode, mtype);
4063 if (pplist != NULL) {
4064 page_hashout(pplist, false);
4065 PP_SETAGED(pplist);
4066 REPL_STAT_INCR(nhashout);
4067 break;
4071 /* Now try remote freelists */
4072 page_mnode = mnode;
4073 lgrp =
4074 lgrp_hand_to_lgrp(MEM_NODE_2_LGRPHAND(page_mnode));
4075 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4076 LGRP_SRCH_HIER);
4077 while (pplist == NULL &&
4078 (mnode = lgrp_memnode_choose(&lgrp_cookie))
4079 != -1) {
4081 * Skip local mnode.
4083 if ((mnode == page_mnode) ||
4084 (mem_node_config[mnode].exists == 0))
4085 continue;
4087 pplist = page_get_mnode_freelist(mnode,
4088 bin, mtype, szc, flags);
4091 if (pplist != NULL)
4092 break;
4095 /* Now try remote cachelists */
4096 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4097 LGRP_SRCH_HIER);
4098 while (pplist == NULL && szc == 0) {
4099 mnode = lgrp_memnode_choose(&lgrp_cookie);
4100 if (mnode == -1)
4101 break;
4103 * Skip local mnode.
4105 if ((mnode == page_mnode) ||
4106 (mem_node_config[mnode].exists == 0))
4107 continue;
4109 pplist = page_get_mnode_cachelist(bin,
4110 flags, mnode, mtype);
4112 if (pplist != NULL) {
4113 page_hashout(pplist, false);
4114 PP_SETAGED(pplist);
4115 REPL_STAT_INCR(nhashout);
4116 break;
4121 * Break out of while loop under the following cases:
4122 * - If we successfully got a page.
4123 * - If pgrflags specified only returning a specific
4124 * page size and we could not find that page size.
4125 * - If we could not satisfy the request with PAGESIZE
4126 * or larger pages.
4128 if (pplist != NULL || szc == 0)
4129 break;
4131 if ((pgrflags & PGR_SAMESZC) || pgrppgcp) {
4132 /* try to find contig page */
4134 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4135 LGRP_SRCH_HIER);
4137 while ((pplist == NULL) &&
4138 (mnode =
4139 lgrp_memnode_choose(&lgrp_cookie))
4140 != -1) {
4141 pplist = page_get_contig_pages(
4142 mnode, bin, mtype, szc,
4143 flags | PGI_PGCPHIPRI);
4145 break;
4149 * The correct thing to do here is try the next
4150 * page size down using szc--. Due to a bug
4151 * with the processing of HAT_RELOAD_SHARE
4152 * where the sfmmu_ttecnt arrays of all
4153 * hats sharing an ISM segment don't get updated,
4154 * using intermediate size pages for relocation
4155 * can lead to continuous page faults.
4157 szc = 0;
4160 if (pplist != NULL) {
4161 DTRACE_PROBE4(page__get,
4162 lgrp_t *, lgrp,
4163 int, mnode,
4164 ulong_t, bin,
4165 uint_t, flags);
4167 while (pplist != NULL && pg_cnt--) {
4168 ASSERT(pplist != NULL);
4169 pp = pplist;
4170 page_sub(&pplist, pp);
4171 PP_CLRFREE(pp);
4172 PP_CLRAGED(pp);
4173 page_list_concat(&pl, &pp);
4174 npgs--;
4175 like_pp = like_pp + 1;
4176 REPL_STAT_INCR(nnext_pp);
4178 ASSERT(pg_cnt == 0);
4179 } else {
4180 break;
4184 if (npgs) {
4186 * We were unable to allocate the necessary number
4187 * of pages.
4188 * We need to free up any pl.
4190 REPL_STAT_INCR(nnopage);
4191 page_free_replacement_page(pl);
4192 return (NULL);
4193 } else {
4194 return (pl);
4199 * demote a free large page to its constituent pages
4201 void
4202 page_demote_free_pages(page_t *pp)
4205 int mnode;
4207 ASSERT(pp != NULL);
4208 ASSERT(PAGE_LOCKED(pp));
4209 ASSERT(PP_ISFREE(pp));
4210 ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
4212 mnode = PP_2_MEM_NODE(pp);
4213 page_freelist_lock(mnode);
4214 if (pp->p_szc != 0) {
4215 (void) page_demote(mnode, PFN_BASE(pp->p_pagenum,
4216 pp->p_szc), 0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
4218 page_freelist_unlock(mnode);
4219 ASSERT(pp->p_szc == 0);
4223 * Factor in colorequiv to check additional 'equivalent' bins.
4224 * colorequiv may be set in /etc/system
4226 void
4227 page_set_colorequiv_arr(void)
4229 if (colorequiv > 1) {
4230 int i;
4231 uint_t sv_a = lowbit(colorequiv) - 1;
4233 if (sv_a > 15)
4234 sv_a = 15;
4236 for (i = 0; i < MMU_PAGE_SIZES; i++) {
4237 uint_t colors;
4238 uint_t a = sv_a;
4240 if ((colors = hw_page_array[i].hp_colors) <= 1) {
4241 continue;
4243 while ((colors >> a) == 0)
4244 a--;
4245 if ((a << 4) > colorequivszc[i]) {
4246 colorequivszc[i] = (a << 4);