[unleashed/tickless.git] kernel/vm/vm_pagelist.c
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
26 * Copyright 2012 Joyent, Inc. All rights reserved.
29 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
30 /* All Rights Reserved */
33 * Portions of this source code were derived from Berkeley 4.3 BSD
34 * under license from the Regents of the University of California.
39 * This file contains common functions to access and manage the page lists.
40 * Many of these routines originated from platform dependent modules
41 * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and were modified to function in
42 * a platform independent manner.
44 * vm/vm_dep.h provides for platform specific support.
47 #include <sys/types.h>
48 #include <sys/debug.h>
49 #include <sys/cmn_err.h>
50 #include <sys/systm.h>
51 #include <sys/atomic.h>
52 #include <sys/sysmacros.h>
53 #include <vm/as.h>
54 #include <vm/page.h>
55 #include <vm/seg_kmem.h>
56 #include <vm/seg_vn.h>
57 #include <sys/vmsystm.h>
58 #include <sys/memnode.h>
59 #include <vm/vm_dep.h>
60 #include <sys/lgrp.h>
61 #include <sys/mem_config.h>
62 #include <sys/callb.h>
63 #include <sys/mem_cage.h>
64 #include <sys/sdt.h>
65 #include <sys/dumphdr.h>
66 #include <sys/swap.h>
68 extern uint_t vac_colors;
70 #define MAX_PRAGMA_ALIGN 128
72 /* vm_cpu_data0 for the boot cpu before kmem is initialized */
74 #if L2CACHE_ALIGN_MAX <= MAX_PRAGMA_ALIGN
75 #pragma align L2CACHE_ALIGN_MAX(vm_cpu_data0)
76 #else
77 #pragma align MAX_PRAGMA_ALIGN(vm_cpu_data0)
78 #endif
79 char vm_cpu_data0[VM_CPU_DATA_PADSIZE];
82 * number of page colors equivalent to the requested color in page_get routines.
83 * If set, keeps large pages intact longer and keeps MPO allocation
84 * from the local mnode in favor of acquiring the 'correct' page color from
85 * a demoted large page or from a remote mnode.
87 uint_t colorequiv;
90 * color equivalency mask for each page size.
91 * Mask is computed based on cpu L2$ way sizes and colorequiv global.
92 * High 4 bits determine the number of high order bits of the color to ignore.
93 * Low 4 bits determine the number of low order bits of color to ignore (it's only
94 * relevant for hashed index based page coloring).
96 uchar_t colorequivszc[MMU_PAGE_SIZES];
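/*
 * For illustration only: a hypothetical colorequivszc[szc] value of 0x21
 * would mean "ignore the 2 high order bits and the 1 low order bit of the
 * color for this page size", i.e. 2^3 = 8 colors are treated as equivalent
 * when searching the freelists.
 */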
99 * if set, specifies the percentage of large pages that are free from within
100 * a large page region before attempting to lock those pages for
101 * page_get_contig_pages processing.
103 * Should be turned on when kpr is available, since page_trylock_contig_pages
104 * can then be more selective.
107 int ptcpthreshold;
110 * Limit page get contig page search based on failure cnts in pgcpfailcnt[].
111 * Enabled by default via pgcplimitsearch.
113 * pgcpfailcnt[] is bounded by PGCPFAILMAX (>= 1/2 of installed
114 * memory). When reached, pgcpfailcnt[] is reset to 1/2 of this upper
115 * bound. This upper bound range guarantees:
116 * - all large page 'slots' will be searched over time
117 * - at least one large page candidate is considered on each pgcp call
118 * - count doesn't wrap around to 0
120 pgcnt_t pgcpfailcnt[MMU_PAGE_SIZES];
121 int pgcplimitsearch = 1;
123 #define PGCPFAILMAX (1 << (highbit(physinstalled) - 1))
124 #define SETPGCPFAILCNT(szc) \
125 if (++pgcpfailcnt[szc] >= PGCPFAILMAX) \
126 pgcpfailcnt[szc] = PGCPFAILMAX / 2;
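/*
 * Worked example with a hypothetical configuration: if physinstalled is
 * 0x180000 base pages, highbit(physinstalled) is 21 and PGCPFAILMAX is
 * 1 << 20 = 0x100000, which is at least half of installed memory. Once
 * pgcpfailcnt[szc] reaches that bound, SETPGCPFAILCNT() resets it to
 * 0x80000 instead of letting it grow without bound.
 */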
128 #ifdef VM_STATS
129 struct vmm_vmstats_str vmm_vmstats;
131 #endif /* VM_STATS */
133 /* enable page_get_contig_pages */
134 #define LPGCREATE 1
136 int pg_contig_disable;
137 int pg_lpgcreate_nocage = LPGCREATE;
140 * page_freelist_split pfn flag to signify no lo or hi pfn requirement.
142 #define PFNNULL 0
144 /* Flags involved in promotion and demotion routines */
145 #define PC_FREE 0x1 /* put page on freelist */
146 #define PC_ALLOC 0x2 /* return page for allocation */
149 * Flag for page_demote to be used with PC_FREE to denote that we don't care
150 * what the color is as the color parameter to the function is ignored.
152 #define PC_NO_COLOR (-1)
154 /* mtype value for page_promote to use when mtype does not matter */
155 #define PC_MTYPE_ANY (-1)
158 * page counters candidates info
159 * See page_ctrs_cands comment below for more details.
160 * fields are as follows:
161 * pcc_pages_free: # pages which freelist coalesce can create
162 * pcc_color_free: pointer to page free counts per color
164 typedef struct pcc_info {
165 pgcnt_t pcc_pages_free;
166 pgcnt_t *pcc_color_free;
167 uint_t pad[12];
168 } pcc_info_t;
171 * On big machines it can take a long time to check page_counters
172 * arrays. page_ctrs_cands is a summary array whose elements are a dynamically
173 * updated sum of all elements of the corresponding page_counters arrays.
174 * page_freelist_coalesce() searches page_counters only if an appropriate
175 * element of page_ctrs_cands array is greater than 0.
177 * page_ctrs_cands is indexed by mutex (i), region (r), mnode (m), mrange (g)
179 pcc_info_t **page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES];
182 * Return in val the total number of free pages which can be created
183 * for the given mnode (m), mrange (g), and region size (r)
185 #define PGCTRS_CANDS_GETVALUE(m, g, r, val) { \
186 int i; \
187 val = 0; \
188 for (i = 0; i < NPC_MUTEX; i++) { \
189 val += page_ctrs_cands[i][(r)][(m)][(g)].pcc_pages_free; \
194 * Return in val the total number of free pages which can be created
195 * for the given mnode (m), mrange (g), region size (r), and color (c)
197 #define PGCTRS_CANDS_GETVALUECOLOR(m, g, r, c, val) { \
198 int i; \
199 val = 0; \
200 ASSERT((c) < PAGE_GET_PAGECOLORS(r)); \
201 for (i = 0; i < NPC_MUTEX; i++) { \
202 val += \
203 page_ctrs_cands[i][(r)][(m)][(g)].pcc_color_free[(c)]; \
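/*
 * Minimal usage sketch (illustrative only; see page_freelist_coalesce()
 * for how the candidate counts are actually consulted):
 *
 *	pgcnt_t cands;
 *
 *	PGCTRS_CANDS_GETVALUECOLOR(mnode, mrange, szc, color, cands);
 *	if (cands == 0)
 *		return (NULL);
 *
 * i.e. if no free pages of the requested size and color can be assembled,
 * the expensive page_counters scan is skipped entirely.
 */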
208 * We can only allow a single thread to update a counter within the physical
209 * range of the largest supported page size. That is the finest granularity
210 * possible since the counter values are dependent on each other
211 * as you move across region sizes. PP_CTR_LOCK_INDX is used to determine the
212 * ctr_mutex lock index for a particular physical range.
214 static kmutex_t *ctr_mutex[NPC_MUTEX];
216 #define PP_CTR_LOCK_INDX(pp) \
217 (((pp)->p_pagenum >> \
218 (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1))
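/*
 * For example, assuming NPC_MUTEX is 16 and the largest page size spans
 * 512 base pages (PAGE_BSZS_SHIFT(mmu_page_sizes - 1) == 9), a page with
 * p_pagenum 0x12345 hashes to index (0x12345 >> 9) & 0xf == 0x1, and every
 * page inside the same largest-size region hashes to the same ctr_mutex
 * entry.
 */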
220 #define INVALID_COLOR 0xffffffff
221 #define INVALID_MASK 0xffffffff
224 * Local functions prototypes.
227 void page_ctr_add(int, int, page_t *, int);
228 void page_ctr_add_internal(int, int, page_t *, int);
229 void page_ctr_sub(int, int, page_t *, int);
230 void page_ctr_sub_internal(int, int, page_t *, int);
231 void page_freelist_lock(int);
232 void page_freelist_unlock(int);
233 page_t *page_promote(int, pfn_t, uchar_t, int, int);
234 page_t *page_demote(int, pfn_t, pfn_t, uchar_t, uchar_t, int, int);
235 page_t *page_freelist_split(uchar_t,
236 uint_t, int, int, pfn_t, pfn_t, page_list_walker_t *);
237 page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);
238 static int page_trylock_cons(page_t *pp, se_t se);
241 * The page_counters array below is used to keep track of free contiguous
242 * physical memory. A hw_page_map_t will be allocated per mnode per szc.
243 * This contains an array of counters, the size of the array, a shift value
244 * used to convert a pagenum into a counter array index or vice versa, as
245 * well as a cache of the last successful index to be promoted to a larger
246 * page size. As an optimization, we keep track of the last successful index
247 * to be promoted per page color for the given size region, and this is
248 * allocated dynamically based upon the number of colors for a given
249 * region size.
251 * Conceptually, the page counters are represented as:
253 * page_counters[region_size][mnode]
255 * region_size: size code of a candidate larger page made up
256 * of contiguous free smaller pages.
258 * page_counters[region_size][mnode].hpm_counters[index]:
259 * represents how many (region_size - 1) pages either
260 * exist or can be created within the given index range.
262 * Let's look at a sparc example:
263 * If we want to create a free 512k page, we look at region_size 2
264 * for the mnode we want. We calculate the index and look at a specific
265 * hpm_counters location. If we see 8 (FULL_REGION_CNT on sparc) at
266 * this location, it means that 8 64k pages either exist or can be created
267 * from 8K pages in order to make a single free 512k page at the given
268 * index. Note that when a region is full, it will contribute to the
269 * counts in the region above it. Thus we will not know what page
270 * size the free pages that can be promoted into this new free page
271 * will be unless we look at all regions below the current region.
275 * Note: hpmctr_t is defined in platform vm_dep.h
276 * hw_page_map_t contains all the information needed for the page_counters
277 * logic. The fields are as follows:
279 * hpm_counters: dynamically allocated array to hold counter data
280 * hpm_entries: entries in hpm_counters
281 * hpm_shift: shift for pnum/array index conv
282 * hpm_base: PFN mapped to counter index 0
283 * hpm_color_current: last index in counter array for this color at
284 * which we successfully created a large page
286 typedef struct hw_page_map {
287 hpmctr_t *hpm_counters;
288 size_t hpm_entries;
289 int hpm_shift;
290 pfn_t hpm_base;
291 size_t *hpm_color_current[MAX_MNODE_MRANGES];
292 } hw_page_map_t;
295 * Element zero is not used, but is allocated for convenience.
297 static hw_page_map_t *page_counters[MMU_PAGE_SIZES];
300 * Cached value of MNODE_RANGE_CNT(mnode).
301 * This is a function call in x86.
303 static int mnode_nranges[MAX_MEM_NODES];
304 static int mnode_maxmrange[MAX_MEM_NODES];
307 * The following macros are convenient ways to get access to the individual
308 * elements of the page_counters arrays. They can be used on both
309 * the left side and right side of equations.
311 #define PAGE_COUNTERS(mnode, rg_szc, idx) \
312 (page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)])
314 #define PAGE_COUNTERS_COUNTERS(mnode, rg_szc) \
315 (page_counters[(rg_szc)][(mnode)].hpm_counters)
317 #define PAGE_COUNTERS_SHIFT(mnode, rg_szc) \
318 (page_counters[(rg_szc)][(mnode)].hpm_shift)
320 #define PAGE_COUNTERS_ENTRIES(mnode, rg_szc) \
321 (page_counters[(rg_szc)][(mnode)].hpm_entries)
323 #define PAGE_COUNTERS_BASE(mnode, rg_szc) \
324 (page_counters[(rg_szc)][(mnode)].hpm_base)
326 #define PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc, g) \
327 (page_counters[(rg_szc)][(mnode)].hpm_color_current[(g)])
329 #define PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color, mrange) \
330 (page_counters[(rg_szc)][(mnode)]. \
331 hpm_color_current[(mrange)][(color)])
333 #define PNUM_TO_IDX(mnode, rg_szc, pnum) \
334 (((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >> \
335 PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))
337 #define IDX_TO_PNUM(mnode, rg_szc, index) \
338 (PAGE_COUNTERS_BASE((mnode), (rg_szc)) + \
339 ((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc))))
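/*
 * Worked example (hypothetical numbers): if for region size 1 hpm_base is
 * 0x10000 and hpm_shift is 3 (8 base pages per region, as in the sparc
 * example above), then for pnum 0x10025:
 *
 *	PNUM_TO_IDX(mnode, 1, 0x10025) == (0x10025 - 0x10000) >> 3 == 0x4
 *	IDX_TO_PNUM(mnode, 1, 0x4)     == 0x10000 + (0x4 << 3)    == 0x10020
 *
 * i.e. converting back yields the first pfn of the region, which is the
 * identity property the ASSERTs in page_ctrs_alloc() check at the region
 * aligned base.
 */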
342 * Protects the hpm_counters and hpm_color_current memory from changing while
343 * looking at page counters information.
344 * Grab the write lock to modify what these fields point at.
345 * Grab the read lock to prevent any pointers from changing.
346 * The write lock can not be held during memory allocation due to a possible
347 * recursion deadlock with trying to grab the read lock while the
348 * write lock is already held.
350 krwlock_t page_ctrs_rwlock[MAX_MEM_NODES];
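/*
 * Intended usage pattern, roughly (see page_ctrs_adjust() and
 * page_freelist_coalesce() below): readers wrap any walk of hpm_counters
 * or hpm_color_current in rw_enter(..., RW_READER)/rw_exit(), while a
 * writer preallocates its replacement arrays with KM_NOSLEEP first, then
 * takes PAGE_CTRS_WRITE_LOCK(mnode), swaps the pointers, drops the lock,
 * and only then frees the old arrays.
 */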
354 * initialize cpu_vm_data to point at cache aligned vm_cpu_data_t.
356 void
357 cpu_vm_data_init(struct cpu *cp)
359 if (cp == CPU0) {
360 cp->cpu_vm_data = (void *)&vm_cpu_data0;
361 } else {
362 void *kmptr;
363 int align;
364 size_t sz;
366 align = (L2CACHE_ALIGN) ? L2CACHE_ALIGN : L2CACHE_ALIGN_MAX;
367 sz = P2ROUNDUP(sizeof (vm_cpu_data_t), align) + align;
368 kmptr = kmem_zalloc(sz, KM_SLEEP);
369 cp->cpu_vm_data = (void *) P2ROUNDUP((uintptr_t)kmptr, align);
370 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr = kmptr;
371 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize = sz;
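/*
 * Alignment arithmetic, with hypothetical numbers: for an L2 line size of
 * 64 bytes and sizeof (vm_cpu_data_t) == 200, sz is
 * P2ROUNDUP(200, 64) + 64 == 256 + 64 == 320. If kmem_zalloc() returns
 * 0x30000028, cpu_vm_data is rounded up to 0x30000040; the extra "align"
 * bytes of slop guarantee the aligned pointer still has room for the
 * whole structure within the allocation.
 */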
376 * free cpu_vm_data
378 void
379 cpu_vm_data_destroy(struct cpu *cp)
381 if (cp->cpu_seqid && cp->cpu_vm_data) {
382 ASSERT(cp != CPU0);
383 kmem_free(((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr,
384 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize);
386 cp->cpu_vm_data = NULL;
391 * page size to page size code
394 page_szc(size_t pagesize)
396 int i = 0;
398 while (hw_page_array[i].hp_size) {
399 if (pagesize == hw_page_array[i].hp_size)
400 return (i);
401 i++;
403 return (-1);
407 * page size to page size code with the restriction that it be a supported
408 * user page size. If it's not a supported user page size, -1 will be returned.
411 page_szc_user_filtered(size_t pagesize)
413 int szc = page_szc(pagesize);
414 if ((szc != -1) && (SZC_2_USERSZC(szc) != -1)) {
415 return (szc);
417 return (-1);
421 * Return how many page sizes are available for the user to use. This is
422 * what the hardware supports and not based upon how the OS implements the
423 * support of different page sizes.
425 * If legacy is non-zero, return the number of pagesizes available to legacy
426 * applications. The number of legacy page sizes might be less than the
427 * exported user page sizes. This is to prevent legacy applications that
428 * use the largest page size returned from getpagesizes(3c) from inadvertently
429 * using the 'new' large pagesizes.
431 uint_t
432 page_num_user_pagesizes(int legacy)
434 if (legacy)
435 return (mmu_legacy_page_sizes);
436 return (mmu_exported_page_sizes);
439 uint_t
440 page_num_pagesizes(void)
442 return (mmu_page_sizes);
446 * returns the number of base pagesize pages associated with szc
448 pgcnt_t
449 page_get_pagecnt(uint_t szc)
451 if (szc >= mmu_page_sizes)
452 panic("page_get_pagecnt: out of range %d", szc);
453 return (hw_page_array[szc].hp_pgcnt);
456 size_t
457 page_get_pagesize(uint_t szc)
459 if (szc >= mmu_page_sizes)
460 panic("page_get_pagesize: out of range %d", szc);
461 return (hw_page_array[szc].hp_size);
465 * Return the size of a page based upon the index passed in. An index of
466 * zero refers to the smallest page size in the system, and as index increases
467 * it refers to the next larger supported page size in the system.
468 * Note that szc and userszc may not be the same due to unsupported szc's on
469 * some systems.
471 size_t
472 page_get_user_pagesize(uint_t userszc)
474 uint_t szc = USERSZC_2_SZC(userszc);
476 if (szc >= mmu_page_sizes)
477 panic("page_get_user_pagesize: out of range %d", szc);
478 return (hw_page_array[szc].hp_size);
481 uint_t
482 page_get_shift(uint_t szc)
484 if (szc >= mmu_page_sizes)
485 panic("page_get_shift: out of range %d", szc);
486 return (PAGE_GET_SHIFT(szc));
489 uint_t
490 page_get_pagecolors(uint_t szc)
492 if (szc >= mmu_page_sizes)
493 panic("page_get_pagecolors: out of range %d", szc);
494 return (PAGE_GET_PAGECOLORS(szc));
498 * this assigns the desired equivalent color after a split
500 uint_t
501 page_correct_color(uchar_t szc, uchar_t nszc, uint_t color,
502 uint_t ncolor, uint_t ceq_mask)
504 ASSERT(nszc > szc);
505 ASSERT(szc < mmu_page_sizes);
506 ASSERT(color < PAGE_GET_PAGECOLORS(szc));
507 ASSERT(ncolor < PAGE_GET_PAGECOLORS(nszc));
509 color &= ceq_mask;
510 ncolor = PAGE_CONVERT_COLOR(ncolor, szc, nszc);
511 return (color | (ncolor & ~ceq_mask));
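/*
 * For example (hypothetical values): with ceq_mask 0x07, a requested color
 * of 0x2b and a parent color that PAGE_CONVERT_COLOR() maps to 0x58, the
 * result is (0x2b & 0x07) | (0x58 & ~0x07) == 0x03 | 0x58 == 0x5b, i.e.
 * the equivalent low order bits come from the requested color and the
 * remaining bits are inherited from the large page being split.
 */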
515 * The interleaved_mnodes flag is set when mnodes overlap in
516 * the physbase..physmax range, but have disjoint slices.
517 * In this case hpm_counters is shared by all mnodes.
518 * This flag is set dynamically by the platform.
520 int interleaved_mnodes = 0;
523 * Called by startup().
524 * Size up the per page size free list counters based on physmax
525 * of each node and max_mem_nodes.
527 * If interleaved_mnodes is set we need to find the first mnode that
528 * exists. hpm_counters for the first mnode will then be shared by
529 * all other mnodes. If interleaved_mnodes is not set, just set
530 * first=mnode each time. That means there will be no sharing.
532 size_t
533 page_ctrs_sz(void)
535 int r; /* region size */
536 int mnode;
537 int firstmn; /* first mnode that exists */
538 int nranges;
539 pfn_t physbase;
540 pfn_t physmax;
541 uint_t ctrs_sz = 0;
542 int i;
543 pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
546 * We need to determine how many page colors there are for each
547 * page size in order to allocate memory for any color specific
548 * arrays.
550 for (i = 0; i < mmu_page_sizes; i++) {
551 colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
554 for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {
556 pgcnt_t r_pgcnt;
557 pfn_t r_base;
558 pgcnt_t r_align;
560 if (mem_node_config[mnode].exists == 0)
561 continue;
563 HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);
564 nranges = MNODE_RANGE_CNT(mnode);
565 mnode_nranges[mnode] = nranges;
566 mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
569 * determine size needed for page counter arrays with
570 * base aligned to large page size.
572 for (r = 1; r < mmu_page_sizes; r++) {
573 /* add in space for hpm_color_current */
574 ctrs_sz += sizeof (size_t) *
575 colors_per_szc[r] * nranges;
577 if (firstmn != mnode)
578 continue;
580 /* add in space for hpm_counters */
581 r_align = page_get_pagecnt(r);
582 r_base = physbase;
583 r_base &= ~(r_align - 1);
584 r_pgcnt = howmany(physmax - r_base + 1, r_align);
587 * Round up to always allocate on pointer sized
588 * boundaries.
590 ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)),
591 sizeof (hpmctr_t *));
595 for (r = 1; r < mmu_page_sizes; r++) {
596 ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t));
599 /* add in space for page_ctrs_cands and pcc_color_free */
600 ctrs_sz += sizeof (pcc_info_t *) * max_mem_nodes *
601 mmu_page_sizes * NPC_MUTEX;
603 for (mnode = 0; mnode < max_mem_nodes; mnode++) {
605 if (mem_node_config[mnode].exists == 0)
606 continue;
608 nranges = mnode_nranges[mnode];
609 ctrs_sz += sizeof (pcc_info_t) * nranges *
610 mmu_page_sizes * NPC_MUTEX;
611 for (r = 1; r < mmu_page_sizes; r++) {
612 ctrs_sz += sizeof (pgcnt_t) * nranges *
613 colors_per_szc[r] * NPC_MUTEX;
617 /* ctr_mutex */
618 ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t));
620 /* size for page list counts */
621 PLCNT_SZ(ctrs_sz);
624 * add some slop for roundups. page_ctrs_alloc will round up the start
625 * address of the counters to ecache_alignsize boundary for every
626 * memory node.
628 return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN);
631 caddr_t
632 page_ctrs_alloc(caddr_t alloc_base)
634 int mnode;
635 int mrange, nranges;
636 int r; /* region size */
637 int i;
638 int firstmn; /* first mnode that exists */
639 pfn_t physbase;
640 pfn_t physmax;
641 pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
644 * We need to determine how many page colors there are for each
645 * page size in order to allocate memory for any color specific
646 * arrays.
648 for (i = 0; i < mmu_page_sizes; i++) {
649 colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
652 for (r = 1; r < mmu_page_sizes; r++) {
653 page_counters[r] = (hw_page_map_t *)alloc_base;
654 alloc_base += (max_mem_nodes * sizeof (hw_page_map_t));
657 /* page_ctrs_cands and pcc_color_free array */
658 for (i = 0; i < NPC_MUTEX; i++) {
659 for (r = 1; r < mmu_page_sizes; r++) {
661 page_ctrs_cands[i][r] = (pcc_info_t **)alloc_base;
662 alloc_base += sizeof (pcc_info_t *) * max_mem_nodes;
664 for (mnode = 0; mnode < max_mem_nodes; mnode++) {
665 pcc_info_t *pi;
667 if (mem_node_config[mnode].exists == 0)
668 continue;
670 nranges = mnode_nranges[mnode];
672 pi = (pcc_info_t *)alloc_base;
673 alloc_base += sizeof (pcc_info_t) * nranges;
674 page_ctrs_cands[i][r][mnode] = pi;
676 for (mrange = 0; mrange < nranges; mrange++) {
677 pi->pcc_color_free =
678 (pgcnt_t *)alloc_base;
679 alloc_base += sizeof (pgcnt_t) *
680 colors_per_szc[r];
681 pi++;
687 /* ctr_mutex */
688 for (i = 0; i < NPC_MUTEX; i++) {
689 ctr_mutex[i] = (kmutex_t *)alloc_base;
690 alloc_base += (max_mem_nodes * sizeof (kmutex_t));
693 /* initialize page list counts */
694 PLCNT_INIT(alloc_base);
696 for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {
698 pgcnt_t r_pgcnt;
699 pfn_t r_base;
700 pgcnt_t r_align;
701 int r_shift;
702 int nranges = mnode_nranges[mnode];
704 if (mem_node_config[mnode].exists == 0)
705 continue;
707 HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);
709 for (r = 1; r < mmu_page_sizes; r++) {
711 * the page_counters base has to be aligned to the
712 * page count of page size code r otherwise the counts
713 * will cross large page boundaries.
715 r_align = page_get_pagecnt(r);
716 r_base = physbase;
717 /* base needs to be aligned - lower to aligned value */
718 r_base &= ~(r_align - 1);
719 r_pgcnt = howmany(physmax - r_base + 1, r_align);
720 r_shift = PAGE_BSZS_SHIFT(r);
722 PAGE_COUNTERS_SHIFT(mnode, r) = r_shift;
723 PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt;
724 PAGE_COUNTERS_BASE(mnode, r) = r_base;
725 for (mrange = 0; mrange < nranges; mrange++) {
726 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
727 r, mrange) = (size_t *)alloc_base;
728 alloc_base += sizeof (size_t) *
729 colors_per_szc[r];
731 for (i = 0; i < colors_per_szc[r]; i++) {
732 uint_t color_mask = colors_per_szc[r] - 1;
733 pfn_t pfnum = r_base;
734 size_t idx;
735 int mrange;
736 MEM_NODE_ITERATOR_DECL(it);
738 MEM_NODE_ITERATOR_INIT(pfnum, mnode, r, &it);
739 if (pfnum == (pfn_t)-1) {
740 idx = 0;
741 } else {
742 PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
743 color_mask, color_mask, &it);
744 idx = PNUM_TO_IDX(mnode, r, pfnum);
745 idx = (idx >= r_pgcnt) ? 0 : idx;
747 for (mrange = 0; mrange < nranges; mrange++) {
748 PAGE_COUNTERS_CURRENT_COLOR(mnode,
749 r, i, mrange) = idx;
753 /* hpm_counters may be shared by all mnodes */
754 if (firstmn == mnode) {
755 PAGE_COUNTERS_COUNTERS(mnode, r) =
756 (hpmctr_t *)alloc_base;
757 alloc_base +=
758 P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt),
759 sizeof (hpmctr_t *));
760 } else {
761 PAGE_COUNTERS_COUNTERS(mnode, r) =
762 PAGE_COUNTERS_COUNTERS(firstmn, r);
766 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
767 * satisfy the identity requirement.
768 * We should be able to go from one to the other
769 * and get consistent values.
771 ASSERT(PNUM_TO_IDX(mnode, r,
772 (IDX_TO_PNUM(mnode, r, 0))) == 0);
773 ASSERT(IDX_TO_PNUM(mnode, r,
774 (PNUM_TO_IDX(mnode, r, r_base))) == r_base);
777 * Round up the start address of the page_counters to
778 * cache aligned boundary for every memory node.
779 * page_ctrs_sz() has added some slop for these roundups.
781 alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base,
782 L2CACHE_ALIGN);
785 /* Initialize other page counter specific data structures. */
786 for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) {
787 rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL);
790 return (alloc_base);
794 * Functions to adjust region counters for each size free list.
795 * The caller is responsible for acquiring the ctr_mutex lock if necessary,
796 * and thus these functions can be called during startup without locks.
798 /* ARGSUSED */
799 void
800 page_ctr_add_internal(int mnode, int mtype, page_t *pp, int flags)
802 ssize_t r; /* region size */
803 ssize_t idx;
804 pfn_t pfnum;
805 int lckidx;
807 ASSERT(mnode == PP_2_MEM_NODE(pp));
808 ASSERT(mtype == PP_2_MTYPE(pp));
810 ASSERT(pp->p_szc < mmu_page_sizes);
812 PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
814 /* no counter update needed for largest page size */
815 if (pp->p_szc >= mmu_page_sizes - 1) {
816 return;
819 r = pp->p_szc + 1;
820 pfnum = pp->p_pagenum;
821 lckidx = PP_CTR_LOCK_INDX(pp);
824 * Increment the count of free pages for the current
825 * region. Continue looping up in region size, incrementing the
826 * count if the preceding region is full.
828 while (r < mmu_page_sizes) {
829 idx = PNUM_TO_IDX(mnode, r, pfnum);
831 ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
832 ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r));
834 if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r)) {
835 break;
836 } else {
837 int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
838 pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
839 [MTYPE_2_MRANGE(mnode, root_mtype)];
841 cand->pcc_pages_free++;
842 cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]++;
844 r++;
848 void
849 page_ctr_add(int mnode, int mtype, page_t *pp, int flags)
851 int lckidx = PP_CTR_LOCK_INDX(pp);
852 kmutex_t *lock = &ctr_mutex[lckidx][mnode];
854 mutex_enter(lock);
855 page_ctr_add_internal(mnode, mtype, pp, flags);
856 mutex_exit(lock);
859 void
860 page_ctr_sub_internal(int mnode, int mtype, page_t *pp, int flags)
862 int lckidx;
863 ssize_t r; /* region size */
864 ssize_t idx;
865 pfn_t pfnum;
867 ASSERT(mnode == PP_2_MEM_NODE(pp));
868 ASSERT(mtype == PP_2_MTYPE(pp));
870 ASSERT(pp->p_szc < mmu_page_sizes);
872 PLCNT_DECR(pp, mnode, mtype, pp->p_szc, flags);
874 /* no counter update needed for largest page size */
875 if (pp->p_szc >= mmu_page_sizes - 1) {
876 return;
879 r = pp->p_szc + 1;
880 pfnum = pp->p_pagenum;
881 lckidx = PP_CTR_LOCK_INDX(pp);
884 * Decrement the count of free pages for the current
885 * region. Continue looping up in region size, decrementing the
886 * count if the preceding region was full.
888 while (r < mmu_page_sizes) {
889 idx = PNUM_TO_IDX(mnode, r, pfnum);
891 ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
892 ASSERT(PAGE_COUNTERS(mnode, r, idx) > 0);
894 if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1) {
895 break;
896 } else {
897 int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
898 pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
899 [MTYPE_2_MRANGE(mnode, root_mtype)];
901 ASSERT(cand->pcc_pages_free != 0);
902 ASSERT(cand->pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0);
904 cand->pcc_pages_free--;
905 cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]--;
907 r++;
911 void
912 page_ctr_sub(int mnode, int mtype, page_t *pp, int flags)
914 int lckidx = PP_CTR_LOCK_INDX(pp);
915 kmutex_t *lock = &ctr_mutex[lckidx][mnode];
917 mutex_enter(lock);
918 page_ctr_sub_internal(mnode, mtype, pp, flags);
919 mutex_exit(lock);
923 * Adjust page counters following a memory attach, since typically the
924 * size of the array needs to change, and the PFN to counter index
925 * mapping needs to change.
927 * It is possible this mnode did not exist at startup. In that case
928 * allocate pcc_info_t and pcc_color_free arrays. Also, allow for nranges
929 * to change (a theoretical possibility on x86), which means pcc_color_free
930 * arrays must be extended.
932 uint_t
933 page_ctrs_adjust(int mnode)
935 pgcnt_t npgs;
936 int r; /* region size */
937 int i;
938 size_t pcsz, old_csz;
939 hpmctr_t *new_ctr, *old_ctr;
940 pfn_t oldbase, newbase;
941 pfn_t physbase, physmax;
942 size_t old_npgs;
943 hpmctr_t *ctr_cache[MMU_PAGE_SIZES];
944 size_t size_cache[MMU_PAGE_SIZES];
945 size_t *color_cache[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
946 size_t *old_color_array[MAX_MNODE_MRANGES];
947 pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
948 pcc_info_t **cands_cache;
949 pcc_info_t *old_pi, *pi;
950 pgcnt_t *pgcntp;
951 int nr, old_nranges, mrange, nranges = MNODE_RANGE_CNT(mnode);
952 int cands_cache_nranges;
953 int old_maxmrange, new_maxmrange;
954 int rc = 0;
955 int oldmnode;
957 cands_cache = kmem_zalloc(sizeof (pcc_info_t *) * NPC_MUTEX *
958 MMU_PAGE_SIZES, KM_NOSLEEP);
959 if (cands_cache == NULL)
960 return (ENOMEM);
962 i = -1;
963 HPM_COUNTERS_LIMITS(mnode, physbase, physmax, i);
965 newbase = physbase & ~PC_BASE_ALIGN_MASK;
966 npgs = roundup(physmax, PC_BASE_ALIGN) - newbase;
968 /* prepare to free non-null pointers on the way out */
969 cands_cache_nranges = nranges;
970 bzero(ctr_cache, sizeof (ctr_cache));
971 bzero(color_cache, sizeof (color_cache));
974 * We need to determine how many page colors there are for each
975 * page size in order to allocate memory for any color specific
976 * arrays.
978 for (r = 0; r < mmu_page_sizes; r++) {
979 colors_per_szc[r] = PAGE_GET_PAGECOLORS(r);
983 * Preallocate all of the new hpm_counters arrays as we can't
984 * hold the page_ctrs_rwlock as a writer and allocate memory.
985 * If we can't allocate all of the arrays, undo our work so far
986 * and return failure.
988 for (r = 1; r < mmu_page_sizes; r++) {
989 pcsz = npgs >> PAGE_BSZS_SHIFT(r);
990 size_cache[r] = pcsz;
991 ctr_cache[r] = kmem_zalloc(pcsz *
992 sizeof (hpmctr_t), KM_NOSLEEP);
993 if (ctr_cache[r] == NULL) {
994 rc = ENOMEM;
995 goto cleanup;
1000 * Preallocate all of the new color current arrays as we can't
1001 * hold the page_ctrs_rwlock as a writer and allocate memory.
1002 * If we can't allocate all of the arrays, undo our work so far
1003 * and return failure.
1005 for (r = 1; r < mmu_page_sizes; r++) {
1006 for (mrange = 0; mrange < nranges; mrange++) {
1007 color_cache[r][mrange] = kmem_zalloc(sizeof (size_t) *
1008 colors_per_szc[r], KM_NOSLEEP);
1009 if (color_cache[r][mrange] == NULL) {
1010 rc = ENOMEM;
1011 goto cleanup;
1017 * Preallocate all of the new pcc_info_t arrays as we can't
1018 * hold the page_ctrs_rwlock as a writer and allocate memory.
1019 * If we can't allocate all of the arrays, undo our work so far
1020 * and return failure.
1022 for (r = 1; r < mmu_page_sizes; r++) {
1023 for (i = 0; i < NPC_MUTEX; i++) {
1024 pi = kmem_zalloc(nranges * sizeof (pcc_info_t),
1025 KM_NOSLEEP);
1026 if (pi == NULL) {
1027 rc = ENOMEM;
1028 goto cleanup;
1030 cands_cache[i * MMU_PAGE_SIZES + r] = pi;
1032 for (mrange = 0; mrange < nranges; mrange++, pi++) {
1033 pgcntp = kmem_zalloc(colors_per_szc[r] *
1034 sizeof (pgcnt_t), KM_NOSLEEP);
1035 if (pgcntp == NULL) {
1036 rc = ENOMEM;
1037 goto cleanup;
1039 pi->pcc_color_free = pgcntp;
1045 * Grab the write lock to prevent others from walking these arrays
1046 * while we are modifying them.
1048 PAGE_CTRS_WRITE_LOCK(mnode);
1051 * For interleaved mnodes, find the first mnode
1052 * with valid page counters since the current
1053 * mnode may have just been added and not have
1054 * valid page counters.
1056 if (interleaved_mnodes) {
1057 for (i = 0; i < max_mem_nodes; i++)
1058 if (PAGE_COUNTERS_COUNTERS(i, 1) != NULL)
1059 break;
1060 ASSERT(i < max_mem_nodes);
1061 oldmnode = i;
1062 } else
1063 oldmnode = mnode;
1065 old_nranges = mnode_nranges[mnode];
1066 cands_cache_nranges = old_nranges;
1067 mnode_nranges[mnode] = nranges;
1068 old_maxmrange = mnode_maxmrange[mnode];
1069 mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
1070 new_maxmrange = mnode_maxmrange[mnode];
1072 for (r = 1; r < mmu_page_sizes; r++) {
1073 PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r);
1074 old_ctr = PAGE_COUNTERS_COUNTERS(oldmnode, r);
1075 old_csz = PAGE_COUNTERS_ENTRIES(oldmnode, r);
1076 oldbase = PAGE_COUNTERS_BASE(oldmnode, r);
1077 old_npgs = old_csz << PAGE_COUNTERS_SHIFT(oldmnode, r);
1078 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1079 old_color_array[mrange] =
1080 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
1081 r, mrange);
1084 pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r);
1085 new_ctr = ctr_cache[r];
1086 ctr_cache[r] = NULL;
1087 if (old_ctr != NULL &&
1088 (oldbase + old_npgs > newbase) &&
1089 (newbase + npgs > oldbase)) {
1091 * Map the intersection of the old and new
1092 * counters into the new array.
1094 size_t offset;
1095 if (newbase > oldbase) {
1096 offset = (newbase - oldbase) >>
1097 PAGE_COUNTERS_SHIFT(mnode, r);
1098 bcopy(old_ctr + offset, new_ctr,
1099 MIN(pcsz, (old_csz - offset)) *
1100 sizeof (hpmctr_t));
1101 } else {
1102 offset = (oldbase - newbase) >>
1103 PAGE_COUNTERS_SHIFT(mnode, r);
1104 bcopy(old_ctr, new_ctr + offset,
1105 MIN(pcsz - offset, old_csz) *
1106 sizeof (hpmctr_t));
1110 PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr;
1111 PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz;
1112 PAGE_COUNTERS_BASE(mnode, r) = newbase;
1114 /* update shared hpm_counters in other mnodes */
1115 if (interleaved_mnodes) {
1116 for (i = 0; i < max_mem_nodes; i++) {
1117 if ((i == mnode) ||
1118 (mem_node_config[i].exists == 0))
1119 continue;
1120 ASSERT(
1121 PAGE_COUNTERS_COUNTERS(i, r) == old_ctr ||
1122 PAGE_COUNTERS_COUNTERS(i, r) == NULL);
1123 PAGE_COUNTERS_COUNTERS(i, r) = new_ctr;
1124 PAGE_COUNTERS_ENTRIES(i, r) = pcsz;
1125 PAGE_COUNTERS_BASE(i, r) = newbase;
1129 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1130 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r, mrange) =
1131 color_cache[r][mrange];
1132 color_cache[r][mrange] = NULL;
1135 * for now, just reset on these events as it's probably
1136 * not worthwhile to try and optimize this.
1138 for (i = 0; i < colors_per_szc[r]; i++) {
1139 uint_t color_mask = colors_per_szc[r] - 1;
1140 int mlo = interleaved_mnodes ? 0 : mnode;
1141 int mhi = interleaved_mnodes ? max_mem_nodes :
1142 (mnode + 1);
1143 int m;
1144 pfn_t pfnum;
1145 size_t idx;
1146 MEM_NODE_ITERATOR_DECL(it);
1148 for (m = mlo; m < mhi; m++) {
1149 if (mem_node_config[m].exists == 0)
1150 continue;
1151 pfnum = newbase;
1152 MEM_NODE_ITERATOR_INIT(pfnum, m, r, &it);
1153 if (pfnum == (pfn_t)-1) {
1154 idx = 0;
1155 } else {
1156 PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
1157 color_mask, color_mask, &it);
1158 idx = PNUM_TO_IDX(m, r, pfnum);
1159 idx = (idx < pcsz) ? idx : 0;
1161 for (mrange = 0; mrange < nranges; mrange++) {
1162 if (PAGE_COUNTERS_CURRENT_COLOR_ARRAY(m,
1163 r, mrange) != NULL)
1164 PAGE_COUNTERS_CURRENT_COLOR(m,
1165 r, i, mrange) = idx;
1170 /* cache info for freeing out of the critical path */
1171 if ((caddr_t)old_ctr >= kernelheap &&
1172 (caddr_t)old_ctr < ekernelheap) {
1173 ctr_cache[r] = old_ctr;
1174 size_cache[r] = old_csz;
1176 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1177 size_t *tmp = old_color_array[mrange];
1178 if ((caddr_t)tmp >= kernelheap &&
1179 (caddr_t)tmp < ekernelheap) {
1180 color_cache[r][mrange] = tmp;
1184 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
1185 * satisfy the identity requirement.
1186 * We should be able to go from one to the other
1187 * and get consistent values.
1189 ASSERT(PNUM_TO_IDX(mnode, r,
1190 (IDX_TO_PNUM(mnode, r, 0))) == 0);
1191 ASSERT(IDX_TO_PNUM(mnode, r,
1192 (PNUM_TO_IDX(mnode, r, newbase))) == newbase);
1194 /* pcc_info_t and pcc_color_free */
1195 for (i = 0; i < NPC_MUTEX; i++) {
1196 pcc_info_t *epi;
1197 pcc_info_t *eold_pi;
1199 pi = cands_cache[i * MMU_PAGE_SIZES + r];
1200 old_pi = page_ctrs_cands[i][r][mnode];
1201 page_ctrs_cands[i][r][mnode] = pi;
1202 cands_cache[i * MMU_PAGE_SIZES + r] = old_pi;
1204 /* preserve old pcc_color_free values, if any */
1205 if (old_pi == NULL)
1206 continue;
1209 * when/if x86 does DR, must account for
1210 * possible change in range index when
1211 * preserving pcc_info
1213 epi = &pi[nranges];
1214 eold_pi = &old_pi[old_nranges];
1215 if (new_maxmrange > old_maxmrange) {
1216 pi += new_maxmrange - old_maxmrange;
1217 } else if (new_maxmrange < old_maxmrange) {
1218 old_pi += old_maxmrange - new_maxmrange;
1220 for (; pi < epi && old_pi < eold_pi; pi++, old_pi++) {
1221 pcc_info_t tmp = *pi;
1222 *pi = *old_pi;
1223 *old_pi = tmp;
1227 PAGE_CTRS_WRITE_UNLOCK(mnode);
1230 * Now that we have dropped the write lock, it is safe to free all
1231 * of the memory we have cached above.
1232 * We come through here to free memory when pre-alloc fails, and also to
1233 * free old pointers which were recorded while locked.
1235 cleanup:
1236 for (r = 1; r < mmu_page_sizes; r++) {
1237 if (ctr_cache[r] != NULL) {
1238 kmem_free(ctr_cache[r],
1239 size_cache[r] * sizeof (hpmctr_t));
1241 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1242 if (color_cache[r][mrange] != NULL) {
1243 kmem_free(color_cache[r][mrange],
1244 colors_per_szc[r] * sizeof (size_t));
1247 for (i = 0; i < NPC_MUTEX; i++) {
1248 pi = cands_cache[i * MMU_PAGE_SIZES + r];
1249 if (pi == NULL)
1250 continue;
1251 nr = cands_cache_nranges;
1252 for (mrange = 0; mrange < nr; mrange++, pi++) {
1253 pgcntp = pi->pcc_color_free;
1254 if (pgcntp == NULL)
1255 continue;
1256 if ((caddr_t)pgcntp >= kernelheap &&
1257 (caddr_t)pgcntp < ekernelheap) {
1258 kmem_free(pgcntp,
1259 colors_per_szc[r] *
1260 sizeof (pgcnt_t));
1263 pi = cands_cache[i * MMU_PAGE_SIZES + r];
1264 if ((caddr_t)pi >= kernelheap &&
1265 (caddr_t)pi < ekernelheap) {
1266 kmem_free(pi, nr * sizeof (pcc_info_t));
1271 kmem_free(cands_cache,
1272 sizeof (pcc_info_t *) * NPC_MUTEX * MMU_PAGE_SIZES);
1273 return (rc);
1277 * Clean up the hpm_counters field in the page counters
1278 * array.
1280 void
1281 page_ctrs_cleanup(void)
1283 int r; /* region size */
1284 int i; /* mnode index */
1287 * Get the page counters write lock while we are
1288 * setting the page hpm_counters field to NULL
1289 * for non-existent mnodes.
1291 for (i = 0; i < max_mem_nodes; i++) {
1292 PAGE_CTRS_WRITE_LOCK(i);
1293 if (mem_node_config[i].exists) {
1294 PAGE_CTRS_WRITE_UNLOCK(i);
1295 continue;
1297 for (r = 1; r < mmu_page_sizes; r++) {
1298 PAGE_COUNTERS_COUNTERS(i, r) = NULL;
1300 PAGE_CTRS_WRITE_UNLOCK(i);
1304 #ifdef DEBUG
1307 * confirm pp is a large page corresponding to szc
1309 void
1310 chk_lpg(page_t *pp, uchar_t szc)
1312 spgcnt_t npgs = page_get_pagecnt(pp->p_szc);
1313 uint_t noreloc;
1315 if (npgs == 1) {
1316 ASSERT(pp->p_szc == 0);
1317 ASSERT(pp->p_next == pp);
1318 ASSERT(pp->p_prev == pp);
1319 return;
1322 ASSERT(pp->p_list.largepg.next == pp || pp->p_list.largepg.next == NULL);
1323 ASSERT(pp->p_list.largepg.prev == pp || pp->p_list.largepg.prev == NULL);
1325 ASSERT(IS_P2ALIGNED(pp->p_pagenum, npgs));
1326 ASSERT(pp->p_pagenum == (pp->p_next->p_pagenum - 1));
1327 ASSERT(pp->p_prev->p_pagenum == (pp->p_pagenum + (npgs - 1)));
1328 ASSERT(pp->p_prev == (pp + (npgs - 1)));
1331 * Check list of pages.
1333 noreloc = PP_ISNORELOC(pp);
1334 while (npgs--) {
1335 if (npgs != 0) {
1336 ASSERT(pp->p_pagenum == pp->p_next->p_pagenum - 1);
1337 ASSERT(pp->p_next == (pp + 1));
1339 ASSERT(pp->p_szc == szc);
1340 ASSERT(PP_ISFREE(pp));
1341 ASSERT(PP_ISAGED(pp));
1342 ASSERT(pp->p_list.largepg.next == pp || pp->p_list.largepg.next == NULL);
1343 ASSERT(pp->p_list.largepg.prev == pp || pp->p_list.largepg.prev == NULL);
1344 VERIFY(pp->p_object == NULL);
1345 ASSERT(pp->p_vnode == NULL);
1346 ASSERT(PP_ISNORELOC(pp) == noreloc);
1348 pp = pp->p_next;
1351 #endif /* DEBUG */
1353 void
1354 page_freelist_lock(int mnode)
1356 int i;
1357 for (i = 0; i < NPC_MUTEX; i++) {
1358 mutex_enter(FPC_MUTEX(mnode, i));
1359 mutex_enter(CPC_MUTEX(mnode, i));
1363 void
1364 page_freelist_unlock(int mnode)
1366 int i;
1367 for (i = 0; i < NPC_MUTEX; i++) {
1368 mutex_exit(FPC_MUTEX(mnode, i));
1369 mutex_exit(CPC_MUTEX(mnode, i));
1374 * add pp to the specified page list. Defaults to head of the page list
1375 * unless PG_LIST_TAIL is specified.
1377 void
1378 page_list_add(page_t *pp, int flags)
1380 page_t **ppp;
1381 kmutex_t *pcm;
1382 uint_t bin, mtype;
1383 int mnode;
1385 ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
1386 ASSERT(PP_ISFREE(pp));
1387 ASSERT(!hat_page_is_mapped(pp));
1388 ASSERT(hat_page_getshare(pp) == 0);
1391 * Large pages should be freed via page_list_add_pages().
1393 ASSERT(pp->p_szc == 0);
1396 * Don't need to lock the freelist first here
1397 * because the page isn't on the freelist yet.
1398 * This means p_szc can't change on us.
1401 bin = PP_2_BIN(pp);
1402 mnode = PP_2_MEM_NODE(pp);
1403 mtype = PP_2_MTYPE(pp);
1405 if (flags & PG_LIST_ISINIT) {
1407 * PG_LIST_ISINIT is set during system startup (i.e. single
1408 * threaded), so add the page to the free list and to the
1409 * free region counters w/o any locking
1411 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1413 /* inline version of page_add() */
1414 if (*ppp != NULL) {
1415 pp->p_next = *ppp;
1416 pp->p_prev = (*ppp)->p_prev;
1417 (*ppp)->p_prev = pp;
1418 pp->p_prev->p_next = pp;
1419 } else
1420 *ppp = pp;
1422 page_ctr_add_internal(mnode, mtype, pp, flags);
1423 VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
1424 } else {
1425 pcm = PC_BIN_MUTEX(mnode, bin, flags);
1427 if (flags & PG_FREE_LIST) {
1428 VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
1429 ASSERT(PP_ISAGED(pp));
1430 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1432 } else {
1433 VM_STAT_ADD(vmm_vmstats.pladd_cache);
1434 VERIFY(pp->p_object);
1435 ASSERT(pp->p_vnode);
1436 ASSERT((pp->p_offset & PAGEOFFSET) == 0);
1437 ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1439 mutex_enter(pcm);
1440 page_add(ppp, pp);
1442 if (flags & PG_LIST_TAIL)
1443 *ppp = (*ppp)->p_next;
1445 * Add counters before releasing pcm mutex to avoid a race with
1446 * page_freelist_coalesce and page_freelist_split.
1448 page_ctr_add(mnode, mtype, pp, flags);
1449 mutex_exit(pcm);
1454 * It is up to the caller to unlock the page!
1456 ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
1461 /* ARGSUSED */
1462 void
1463 page_list_noreloc_startup(page_t *pp)
1465 panic("page_list_noreloc_startup: should be here only for sparc");
1468 void
1469 page_list_add_pages(page_t *pp, int flags)
1471 kmutex_t *pcm;
1472 pgcnt_t pgcnt;
1473 uint_t bin, mtype, i;
1474 int mnode;
1476 /* default to freelist/head */
1477 ASSERT((flags & (PG_CACHE_LIST | PG_LIST_TAIL)) == 0);
1479 CHK_LPG(pp, pp->p_szc);
1480 VM_STAT_ADD(vmm_vmstats.pladd_free[pp->p_szc]);
1482 bin = PP_2_BIN(pp);
1483 mnode = PP_2_MEM_NODE(pp);
1484 mtype = PP_2_MTYPE(pp);
1486 if (flags & PG_LIST_ISINIT) {
1487 ASSERT(pp->p_szc == mmu_page_sizes - 1);
1488 page_lpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1489 ASSERT(!PP_ISNORELOC(pp));
1490 PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
1491 } else {
1493 ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
1495 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
1497 mutex_enter(pcm);
1498 page_lpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1499 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
1500 mutex_exit(pcm);
1502 pgcnt = page_get_pagecnt(pp->p_szc);
1503 for (i = 0; i < pgcnt; i++, pp++)
1504 page_unlock_nocapture(pp);
1509 * During boot, need to demote a large page to base
1510 * pagesize pages for seg_kmem for use in boot_alloc()
1512 void
1513 page_boot_demote(page_t *pp)
1515 ASSERT(pp->p_szc != 0);
1516 ASSERT(PP_ISFREE(pp));
1517 ASSERT(PP_ISAGED(pp));
1519 (void) page_demote(PP_2_MEM_NODE(pp),
1520 PFN_BASE(pp->p_pagenum, pp->p_szc), 0, pp->p_szc, 0, PC_NO_COLOR,
1521 PC_FREE);
1523 ASSERT(PP_ISFREE(pp));
1524 ASSERT(PP_ISAGED(pp));
1525 ASSERT(pp->p_szc == 0);
1529 * Take a particular page off of whatever freelist the page
1530 * is claimed to be on.
1532 * NOTE: Only used for PAGESIZE pages.
1534 void
1535 page_list_sub(page_t *pp, int flags)
1537 int bin;
1538 uint_t mtype;
1539 int mnode;
1540 kmutex_t *pcm;
1541 page_t **ppp;
1543 ASSERT(PAGE_EXCL(pp));
1544 ASSERT(PP_ISFREE(pp));
1547 * The p_szc field can only be changed by page_promote()
1548 * and page_demote(). Only free pages can be promoted and
1549 * demoted and the free list MUST be locked during these
1550 * operations. So to prevent a race in page_list_sub()
1551 * between computing which bin of the freelist lock to
1552 * grab and actually grabbing the lock, we check again that
1553 * the bin we locked is still the correct one. Notice that
1554 * the p_szc field could have actually changed on us but
1555 * if the bin happens to still be the same we are safe.
1557 try_again:
1558 bin = PP_2_BIN(pp);
1559 mnode = PP_2_MEM_NODE(pp);
1560 pcm = PC_BIN_MUTEX(mnode, bin, flags);
1561 mutex_enter(pcm);
1562 if (PP_2_BIN(pp) != bin) {
1563 mutex_exit(pcm);
1564 goto try_again;
1566 mtype = PP_2_MTYPE(pp);
1568 if (flags & PG_FREE_LIST) {
1569 VM_STAT_ADD(vmm_vmstats.plsub_free[0]);
1570 ASSERT(PP_ISAGED(pp));
1571 ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
1572 } else {
1573 VM_STAT_ADD(vmm_vmstats.plsub_cache);
1574 ASSERT(!PP_ISAGED(pp));
1575 ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1579 * Common PAGESIZE case.
1581 * Note that we locked the freelist. This prevents
1582 * any page promotion/demotion operations. Therefore
1583 * the p_szc will not change until we drop pcm mutex.
1585 if (pp->p_szc == 0) {
1586 page_sub(ppp, pp);
1588 * Subtract counters before releasing pcm mutex
1589 * to avoid race with page_freelist_coalesce.
1591 page_ctr_sub(mnode, mtype, pp, flags);
1592 mutex_exit(pcm);
1594 return;
1598 * Large pages on the cache list are not supported.
1600 if (flags & PG_CACHE_LIST)
1601 panic("page_list_sub: large page on cachelist");
1604 * Slow but rare.
1606 * Somebody wants this particular page which is part
1607 * of a large page. In this case we just demote the page
1608 * if it's on the freelist.
1610 * We have to drop pcm before locking the entire freelist.
1611 * Once we have re-locked the freelist check to make sure
1612 * the page hasn't already been demoted or completely
1613 * freed.
1615 mutex_exit(pcm);
1616 page_freelist_lock(mnode);
1617 if (pp->p_szc != 0) {
1619 * Large page is on freelist.
1621 (void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc),
1622 0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
1624 ASSERT(PP_ISFREE(pp));
1625 ASSERT(PP_ISAGED(pp));
1626 ASSERT(pp->p_szc == 0);
1629 * Subtract counters before releasing pcm mutex
1630 * to avoid race with page_freelist_coalesce.
1632 bin = PP_2_BIN(pp);
1633 mtype = PP_2_MTYPE(pp);
1634 ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
1636 page_sub(ppp, pp);
1637 page_ctr_sub(mnode, mtype, pp, flags);
1638 page_freelist_unlock(mnode);
1642 void
1643 page_list_sub_pages(page_t *pp, uint_t szc)
1645 kmutex_t *pcm;
1646 uint_t bin, mtype;
1647 int mnode;
1649 ASSERT(PAGE_EXCL(pp));
1650 ASSERT(PP_ISFREE(pp));
1651 ASSERT(PP_ISAGED(pp));
1654 * See comment in page_list_sub().
1656 try_again:
1657 bin = PP_2_BIN(pp);
1658 mnode = PP_2_MEM_NODE(pp);
1659 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
1660 mutex_enter(pcm);
1661 if (PP_2_BIN(pp) != bin) {
1662 mutex_exit(pcm);
1663 goto try_again;
1667 * If we're called with a page larger than szc or it got
1668 * promoted above szc before we locked the freelist then
1669 * drop pcm and re-lock the entire freelist. If the page is still larger
1670 * than szc then demote it.
1672 if (pp->p_szc > szc) {
1673 mutex_exit(pcm);
1674 pcm = NULL;
1675 page_freelist_lock(mnode);
1676 if (pp->p_szc > szc) {
1677 VM_STAT_ADD(vmm_vmstats.plsubpages_szcbig);
1678 (void) page_demote(mnode,
1679 PFN_BASE(pp->p_pagenum, pp->p_szc), 0,
1680 pp->p_szc, szc, PC_NO_COLOR, PC_FREE);
1682 bin = PP_2_BIN(pp);
1684 ASSERT(PP_ISFREE(pp));
1685 ASSERT(PP_ISAGED(pp));
1686 ASSERT(pp->p_szc <= szc);
1687 ASSERT(pp == PP_PAGEROOT(pp));
1689 VM_STAT_ADD(vmm_vmstats.plsub_free[pp->p_szc]);
1691 mtype = PP_2_MTYPE(pp);
1692 if (pp->p_szc != 0) {
1693 page_lpsub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1694 CHK_LPG(pp, pp->p_szc);
1695 } else {
1696 VM_STAT_ADD(vmm_vmstats.plsubpages_szc0);
1697 page_sub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1699 page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
1701 if (pcm != NULL) {
1702 mutex_exit(pcm);
1703 } else {
1704 page_freelist_unlock(mnode);
1710 * Add the page to the front of a linked list of pages
1711 * using the p_next & p_prev pointers for the list.
1712 * The caller is responsible for protecting the list pointers.
1714 void
1715 mach_page_add(page_t **ppp, page_t *pp)
1717 if (*ppp == NULL) {
1718 pp->p_next = pp->p_prev = pp;
1719 } else {
1720 pp->p_next = *ppp;
1721 pp->p_prev = (*ppp)->p_prev;
1722 (*ppp)->p_prev = pp;
1723 pp->p_prev->p_next = pp;
1725 *ppp = pp;
1729 * Remove this page from a linked list of pages
1730 * using the p_next & p_prev pointers for the list.
1732 * The caller is responsible for protecting the list pointers.
1734 void
1735 mach_page_sub(page_t **ppp, page_t *pp)
1737 ASSERT(PP_ISFREE(pp));
1739 if (*ppp == NULL || pp == NULL)
1740 panic("mach_page_sub");
1742 if (*ppp == pp)
1743 *ppp = pp->p_next; /* go to next page */
1745 if (*ppp == pp)
1746 *ppp = NULL; /* page list is gone */
1747 else {
1748 pp->p_prev->p_next = pp->p_next;
1749 pp->p_next->p_prev = pp->p_prev;
1751 pp->p_prev = pp->p_next = pp; /* make pp a list of one */
1755 * Routine fsflush uses to gradually coalesce the free list into larger pages.
1757 void
1758 page_promote_size(page_t *pp, uint_t cur_szc)
1760 pfn_t pfn;
1761 int mnode;
1762 int idx;
1763 int new_szc = cur_szc + 1;
1764 int full = FULL_REGION_CNT(new_szc);
1766 pfn = page_pptonum(pp);
1767 mnode = PFN_2_MEM_NODE(pfn);
1769 page_freelist_lock(mnode);
1771 idx = PNUM_TO_IDX(mnode, new_szc, pfn);
1772 if (PAGE_COUNTERS(mnode, new_szc, idx) == full)
1773 (void) page_promote(mnode, pfn, new_szc, PC_FREE, PC_MTYPE_ANY);
1775 page_freelist_unlock(mnode);
1778 static uint_t page_promote_err;
1779 static uint_t page_promote_noreloc_err;
1782 * Create a single larger page (of szc new_szc) from smaller contiguous pages
1783 * for the given mnode starting at pfnum. Pages involved are on the freelist
1784 * before the call and may be returned to the caller if requested, otherwise
1785 * they will be placed back on the freelist.
1786 * If flags is PC_ALLOC, then the large page will be returned to the user in
1787 * a state which is consistent with a page being taken off the freelist. If
1788 * we failed to lock the new large page, then we will return NULL to the
1789 * caller and put the large page on the freelist instead.
1790 * If flags is PC_FREE, then the large page will be placed on the freelist,
1791 * and NULL will be returned.
1792 * The caller is responsible for locking the freelist as well as any other
1793 * accounting which needs to be done for a returned page.
1795 * RFE: For performance pass in pp instead of pfnum so
1796 * we can avoid excessive calls to page_numtopp_nolock().
1797 * This would depend on an assumption that all contiguous
1798 * pages are in the same memseg so we can just add/dec
1799 * our pp.
1801 * Lock ordering:
1803 * There is a potential but rare deadlock situation
1804 * for page promotion and demotion operations. The problem
1805 * is there are two paths into the freelist manager and
1806 * they have different lock orders:
1808 * page_create()
1809 * lock freelist
1810 * page_lock(EXCL)
1811 * unlock freelist
1812 * return
1813 * caller drops page_lock
1815 * page_free() and page_reclaim()
1816 * caller grabs page_lock(EXCL)
1818 * lock freelist
1819 * unlock freelist
1820 * drop page_lock
1822 * What prevents a thread in page_create() from deadlocking
1823 * with a thread freeing or reclaiming the same page is the
1824 * page_trylock() in page_get_freelist(). If the trylock fails
1825 * it skips the page.
1827 * The lock ordering for promotion and demotion is the same as
1828 * for page_create(). Since the same deadlock could occur during
1829 * page promotion and freeing or reclaiming of a page on the
1830 * cache list we might have to fail the operation and undo what we
1831 * have done so far. Again this is rare.
1833 page_t *
1834 page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags, int mtype)
1836 page_t *pp, *pplist, *tpp, *start_pp;
1837 pgcnt_t new_npgs, npgs;
1838 uint_t bin;
1839 pgcnt_t tmpnpgs, pages_left;
1840 uint_t noreloc;
1841 int which_list;
1842 ulong_t index;
1843 kmutex_t *phm;
1846 * General algorithm:
1847 * Find the starting page
1848 * Walk each page struct removing it from the freelist,
1849 * and linking it to all the other pages removed.
1850 * Once all pages are off the freelist,
851 * walk the list, modifying p_szc to new_szc and doing whatever
852 * other work needs to be done to create a large free page.
1853 * According to the flags, either return the page or put it
1854 * on the freelist.
1857 start_pp = page_numtopp_nolock(pfnum);
1858 ASSERT(start_pp && (start_pp->p_pagenum == pfnum));
1859 new_npgs = page_get_pagecnt(new_szc);
1860 ASSERT(IS_P2ALIGNED(pfnum, new_npgs));
1862 /* don't return page of the wrong mtype */
1863 if (mtype != PC_MTYPE_ANY && mtype != PP_2_MTYPE(start_pp))
1864 return (NULL);
1867 * Loop through smaller pages to confirm that all pages
1868 * give the same result for PP_ISNORELOC().
1869 * We can check this reliably here as the protocol for setting
1870 * P_NORELOC requires pages to be taken off the free list first.
1872 noreloc = PP_ISNORELOC(start_pp);
1873 for (pp = start_pp + new_npgs; --pp > start_pp; ) {
1874 if (noreloc != PP_ISNORELOC(pp)) {
1875 page_promote_noreloc_err++;
1876 page_promote_err++;
1877 return (NULL);
1881 pages_left = new_npgs;
1882 pplist = NULL;
1883 pp = start_pp;
1885 /* Loop around coalescing the smaller pages into a big page. */
1886 while (pages_left) {
1888 * Remove from the freelist.
1890 ASSERT(PP_ISFREE(pp));
1891 bin = PP_2_BIN(pp);
1892 ASSERT(mnode == PP_2_MEM_NODE(pp));
1893 mtype = PP_2_MTYPE(pp);
1894 if (PP_ISAGED(pp)) {
1897 * PG_FREE_LIST
1899 if (pp->p_szc) {
1900 page_lpsub(&PAGE_FREELISTS(mnode,
1901 pp->p_szc, bin, mtype), pp);
1902 } else {
1903 mach_page_sub(&PAGE_FREELISTS(mnode, 0,
1904 bin, mtype), pp);
1906 which_list = PG_FREE_LIST;
1907 } else {
1908 struct vmobject *obj;
1910 ASSERT(pp->p_szc == 0);
1913 * PG_CACHE_LIST
1915 * Since this page comes from the
1916 * cachelist, we must destroy the
1917 * vnode association.
1919 if (!page_trylock(pp, SE_EXCL)) {
1920 goto fail_promote;
1923 obj = &pp->p_vnode->v_object;
1926 * We need to be careful not to deadlock
1927 * with another thread in page_lookup().
1928 * The page_lookup() thread could be holding
1929 * the same phm that we need if the two
1930 * pages happen to hash to the same phm lock.
1931 * At this point we have locked the entire
1932 * freelist and page_lookup() could be trying
1933 * to grab a freelist lock.
1935 if (!vmobject_trylock(obj)) {
1936 page_unlock_nocapture(pp);
1937 goto fail_promote;
1940 mach_page_sub(&PAGE_CACHELISTS(mnode, bin, mtype), pp);
1941 page_hashout(pp, true);
1942 vmobject_unlock(obj);
1943 PP_SETAGED(pp);
1944 page_unlock_nocapture(pp);
1945 which_list = PG_CACHE_LIST;
1947 page_ctr_sub(mnode, mtype, pp, which_list);
1950 * Concatenate the smaller page(s) onto
1951 * the large page list.
1953 tmpnpgs = npgs = page_get_pagecnt(pp->p_szc);
1954 pages_left -= npgs;
1955 tpp = pp;
1956 while (npgs--) {
1957 tpp->p_szc = new_szc;
1958 tpp = tpp->p_next;
1960 page_list_concat(&pplist, &pp);
1961 pp += tmpnpgs;
1963 CHK_LPG(pplist, new_szc);
1966 * return the page to the user if requested
1967 * in the properly locked state.
1969 if (flags == PC_ALLOC && (page_trylock_cons(pplist, SE_EXCL))) {
1970 return (pplist);
1974 * Otherwise place the new large page on the freelist
1976 bin = PP_2_BIN(pplist);
1977 mnode = PP_2_MEM_NODE(pplist);
1978 mtype = PP_2_MTYPE(pplist);
1979 page_lpadd(&PAGE_FREELISTS(mnode, new_szc, bin, mtype), pplist);
1981 page_ctr_add(mnode, mtype, pplist, PG_FREE_LIST);
1982 return (NULL);
1984 fail_promote:
1986 * A thread must have still been freeing or
1987 * reclaiming the page on the cachelist.
1988 * To prevent a deadlock, undo what we have
1989 * done so far and return failure. This
1990 * situation can only happen while promoting
1991 * PAGESIZE pages.
1993 page_promote_err++;
1994 while (pplist) {
1995 pp = pplist;
1996 mach_page_sub(&pplist, pp);
1997 pp->p_szc = 0;
1998 bin = PP_2_BIN(pp);
1999 mtype = PP_2_MTYPE(pp);
2000 mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, mtype), pp);
2001 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
2003 return (NULL);
2008 * Break up a large page into smaller size pages.
2009 * Pages involved are on the freelist before the call and may
2010 * be returned to the caller if requested, otherwise they will
2011 * be placed back on the freelist.
2012 * The caller is responsible for locking the freelist as well as any other
2013 * accounting which needs to be done for a returned page.
2014 * If flags is not PC_ALLOC, the color argument is ignored, and thus
2015 * technically, any value may be passed in but PC_NO_COLOR is the standard
2016 * which should be followed for clarity's sake.
2017 * Returns a page whose pfn is < pfnmax
2019 page_t *
2020 page_demote(int mnode, pfn_t pfnum, pfn_t pfnmax, uchar_t cur_szc,
2021 uchar_t new_szc, int color, int flags)
2023 page_t *pp, *pplist, *npplist;
2024 pgcnt_t npgs, n;
2025 uint_t bin;
2026 uint_t mtype;
2027 page_t *ret_pp = NULL;
2029 ASSERT(cur_szc != 0);
2030 ASSERT(new_szc < cur_szc);
2032 pplist = page_numtopp_nolock(pfnum);
2033 ASSERT(pplist != NULL);
2035 ASSERT(pplist->p_szc == cur_szc);
2037 bin = PP_2_BIN(pplist);
2038 ASSERT(mnode == PP_2_MEM_NODE(pplist));
2039 mtype = PP_2_MTYPE(pplist);
2040 page_lpsub(&PAGE_FREELISTS(mnode, cur_szc, bin, mtype), pplist);
2042 CHK_LPG(pplist, cur_szc);
2043 page_ctr_sub(mnode, mtype, pplist, PG_FREE_LIST);
2046 * Number of PAGESIZE pages for smaller new_szc
2047 * page.
2049 npgs = page_get_pagecnt(new_szc);
2051 while (pplist) {
2052 pp = pplist;
2054 ASSERT(pp->p_szc == cur_szc);
2057 * We either break it up into PAGESIZE pages or larger.
2059 if (npgs == 1) { /* PAGESIZE case */
2060 mach_page_sub(&pplist, pp);
2061 ASSERT(pp->p_szc == cur_szc);
2062 ASSERT(new_szc == 0);
2063 ASSERT(mnode == PP_2_MEM_NODE(pp));
2064 pp->p_szc = new_szc;
2065 bin = PP_2_BIN(pp);
2066 if ((bin == color) && (flags == PC_ALLOC) &&
2067 (ret_pp == NULL) && (pfnmax == 0 ||
2068 pp->p_pagenum < pfnmax) &&
2069 page_trylock_cons(pp, SE_EXCL)) {
2070 ret_pp = pp;
2071 } else {
2072 mtype = PP_2_MTYPE(pp);
2073 mach_page_add(&PAGE_FREELISTS(mnode, 0, bin,
2074 mtype), pp);
2075 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
2077 } else {
2078 page_t *try_to_return_this_page = NULL;
2079 int count = 0;
2082 * Break down into smaller lists of pages.
2084 page_list_break(&pplist, &npplist, npgs);
2086 pp = pplist;
2087 n = npgs;
2088 while (n--) {
2089 ASSERT(pp->p_szc == cur_szc);
2091 * Check whether all the pages in this list
2092 * fit the request criteria.
2094 if (pfnmax == 0 || pp->p_pagenum < pfnmax) {
2095 count++;
2097 pp->p_szc = new_szc;
2098 pp = pp->p_next;
2101 if (count == npgs &&
2102 (pfnmax == 0 || pp->p_pagenum < pfnmax)) {
2103 try_to_return_this_page = pp;
2106 CHK_LPG(pplist, new_szc);
2108 bin = PP_2_BIN(pplist);
2109 if (try_to_return_this_page)
2110 ASSERT(mnode ==
2111 PP_2_MEM_NODE(try_to_return_this_page));
2112 if ((bin == color) && (flags == PC_ALLOC) &&
2113 (ret_pp == NULL) && try_to_return_this_page &&
2114 page_trylock_cons(try_to_return_this_page,
2115 SE_EXCL)) {
2116 ret_pp = try_to_return_this_page;
2117 } else {
2118 mtype = PP_2_MTYPE(pp);
2119 page_lpadd(&PAGE_FREELISTS(mnode, new_szc,
2120 bin, mtype), pplist);
2122 page_ctr_add(mnode, mtype, pplist,
2123 PG_FREE_LIST);
2125 pplist = npplist;
2128 return (ret_pp);
2131 int mpss_coalesce_disable = 0;
2134 * Coalesce free pages into a page of the given szc and color if possible.
2135 * Return the pointer to the page created, otherwise, return NULL.
2137 * If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
2139 page_t *
2140 page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask,
2141 int mtype, pfn_t pfnhi)
2143 int r = szc; /* region size */
2144 int mrange;
2145 uint_t full, bin, color_mask, wrap = 0;
2146 pfn_t pfnum, lo, hi;
2147 size_t len, idx, idx0;
2148 pgcnt_t cands = 0, szcpgcnt = page_get_pagecnt(szc);
2149 page_t *ret_pp;
2150 MEM_NODE_ITERATOR_DECL(it);
2152 if (mpss_coalesce_disable) {
2153 ASSERT(szc < MMU_PAGE_SIZES);
2154 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[szc][0]);
2155 return (NULL);
2158 ASSERT(szc < mmu_page_sizes);
2159 color_mask = PAGE_GET_PAGECOLORS(szc) - 1;
2160 ASSERT(ceq_mask <= color_mask);
2161 ASSERT(color <= color_mask);
2162 color &= ceq_mask;
2164 /* Prevent page_counters dynamic memory from being freed */
2165 rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
2167 mrange = MTYPE_2_MRANGE(mnode, mtype);
2168 ASSERT(mrange < mnode_nranges[mnode]);
2169 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[r][mrange]);
2171 /* get pfn range for mtype */
2172 len = PAGE_COUNTERS_ENTRIES(mnode, r);
2173 MNODETYPE_2_PFN(mnode, mtype, lo, hi);
2174 hi++;
2176 /* cap hi at pfnhi when that upper limit is given */
2177 if (pfnhi != PFNNULL && pfnhi < hi)
2178 hi = pfnhi;
2180 /* round to szcpgcnt boundaries */
2181 lo = P2ROUNDUP(lo, szcpgcnt);
2182 MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
2183 if (lo == (pfn_t)-1) {
2184 rw_exit(&page_ctrs_rwlock[mnode]);
2185 return (NULL);
2187 hi = hi & ~(szcpgcnt - 1);
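/*
 * Illustrative arithmetic (example numbers only): with szcpgcnt == 8,
 * lo == 21 rounds up to 24 and hi == 35 rounds down to 32, leaving the
 * single fully aligned region [24, 32) to be scanned.
 */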
2189 /* set lo to the closest pfn of the right color */
2190 if (((PFN_2_COLOR(lo, szc, &it) ^ color) & ceq_mask) ||
2191 (interleaved_mnodes && PFN_2_MEM_NODE(lo) != mnode)) {
2192 PAGE_NEXT_PFN_FOR_COLOR(lo, szc, color, ceq_mask, color_mask,
2193 &it);
2196 if (hi <= lo) {
2197 rw_exit(&page_ctrs_rwlock[mnode]);
2198 return (NULL);
2201 full = FULL_REGION_CNT(r);
2203 /* calculate the number of page candidates and initial search index */
2204 bin = color;
2205 idx0 = (size_t)(-1);
2206 do {
2207 pgcnt_t acand;
2209 PGCTRS_CANDS_GETVALUECOLOR(mnode, mrange, r, bin, acand);
2210 if (acand) {
2211 idx = PAGE_COUNTERS_CURRENT_COLOR(mnode,
2212 r, bin, mrange);
2213 idx0 = MIN(idx0, idx);
2214 cands += acand;
2216 bin = ADD_MASKED(bin, 1, ceq_mask, color_mask);
2217 } while (bin != color);
2219 if (cands == 0) {
2220 VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip[r][mrange]);
2221 rw_exit(&page_ctrs_rwlock[mnode]);
2222 return (NULL);
2225 pfnum = IDX_TO_PNUM(mnode, r, idx0);
2226 if (pfnum < lo || pfnum >= hi) {
2227 pfnum = lo;
2228 } else {
2229 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2230 if (pfnum == (pfn_t)-1) {
2231 pfnum = lo;
2232 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2233 ASSERT(pfnum != (pfn_t)-1);
2234 } else if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) & ceq_mask ||
2235 (interleaved_mnodes && PFN_2_MEM_NODE(pfnum) != mnode)) {
2236 /* invalid color, get the closest correct pfn */
2237 PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
2238 color_mask, &it);
2239 if (pfnum >= hi) {
2240 pfnum = lo;
2241 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2246 /* set starting index */
2247 idx0 = PNUM_TO_IDX(mnode, r, pfnum);
2248 ASSERT(idx0 < len);
2251 for (idx = idx0; wrap == 0 || (idx < idx0 && wrap < 2); ) {
2254 if (PAGE_COUNTERS(mnode, r, idx) != full)
2255 goto next;
2258 * RFE: For performance maybe we can do something less
2259 * brutal than locking the entire freelist. So far
2260 * this doesn't seem to be a performance problem?
2262 page_freelist_lock(mnode);
2263 if (PAGE_COUNTERS(mnode, r, idx) == full) {
2264 ret_pp =
2265 page_promote(mnode, pfnum, r, PC_ALLOC, mtype);
2266 if (ret_pp != NULL) {
2267 VM_STAT_ADD(vmm_vmstats.pfc_coalok[r][mrange]);
2268 PAGE_COUNTERS_CURRENT_COLOR(mnode, r,
2269 PFN_2_COLOR(pfnum, szc, &it), mrange) = idx;
2270 page_freelist_unlock(mnode);
2271 rw_exit(&page_ctrs_rwlock[mnode]);
2272 return (ret_pp);
2274 } else {
2275 VM_STAT_ADD(vmm_vmstats.page_ctrs_changed[r][mrange]);
2278 page_freelist_unlock(mnode);
2280 * No point looking for another page if we've
2281 * already tried all of the ones that
2282 * page_ctr_cands indicated. Stash off where we left
2283 * off.
2284 * Note: this is not exact since we don't hold the
2285 * page_freelist_locks before we initially get the
2286 * value of cands for performance reasons, but should
2287 * be a decent approximation.
2289 if (--cands == 0) {
2290 PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color, mrange) =
2291 idx;
2292 break;
2294 next:
2295 PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
2296 color_mask, &it);
2297 idx = PNUM_TO_IDX(mnode, r, pfnum);
2298 if (idx >= len || pfnum >= hi) {
2299 wrapit:
2300 pfnum = lo;
2301 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2302 idx = PNUM_TO_IDX(mnode, r, pfnum);
2303 wrap++;
2307 rw_exit(&page_ctrs_rwlock[mnode]);
2308 VM_STAT_ADD(vmm_vmstats.page_ctrs_failed[r][mrange]);
2309 return (NULL);
2313 * For the given mnode, promote as many small pages to large pages as possible.
2314 * mnode can be -1, which means do them all
2316 void
2317 page_freelist_coalesce_all(int mnode)
2319 int r; /* region size */
2320 int idx, full;
2321 size_t len;
2322 int doall = interleaved_mnodes || mnode < 0;
2323 int mlo = doall ? 0 : mnode;
2324 int mhi = doall ? max_mem_nodes : (mnode + 1);
2326 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all);
2328 if (mpss_coalesce_disable) {
2329 return;
2333 * Lock the entire freelist and coalesce what we can.
2335 * Always promote to the largest page possible
2336 * first to reduce the number of page promotions.
2338 for (mnode = mlo; mnode < mhi; mnode++) {
2339 rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
2340 page_freelist_lock(mnode);
2342 for (r = mmu_page_sizes - 1; r > 0; r--) {
2343 for (mnode = mlo; mnode < mhi; mnode++) {
2344 pgcnt_t cands = 0;
2345 int mrange, nranges = mnode_nranges[mnode];
2347 for (mrange = 0; mrange < nranges; mrange++) {
2348 PGCTRS_CANDS_GETVALUE(mnode, mrange, r, cands);
2349 if (cands != 0)
2350 break;
2352 if (cands == 0) {
2353 VM_STAT_ADD(vmm_vmstats.
2354 page_ctrs_cands_skip_all);
2355 continue;
2358 full = FULL_REGION_CNT(r);
2359 len = PAGE_COUNTERS_ENTRIES(mnode, r);
2361 for (idx = 0; idx < len; idx++) {
2362 if (PAGE_COUNTERS(mnode, r, idx) == full) {
2363 pfn_t pfnum =
2364 IDX_TO_PNUM(mnode, r, idx);
2365 int tmnode = interleaved_mnodes ?
2366 PFN_2_MEM_NODE(pfnum) : mnode;
2368 ASSERT(pfnum >=
2369 mem_node_config[tmnode].physbase &&
2370 pfnum <
2371 mem_node_config[tmnode].physmax);
2373 (void) page_promote(tmnode,
2374 pfnum, r, PC_FREE, PC_MTYPE_ANY);
2377 /* shared hpm_counters covers all mnodes, so we quit */
2378 if (interleaved_mnodes)
2379 break;
2382 for (mnode = mlo; mnode < mhi; mnode++) {
2383 page_freelist_unlock(mnode);
2384 rw_exit(&page_ctrs_rwlock[mnode]);
2389 * This is where all policies for moving pages around
2390 * to different page size free lists are implemented.
2391 * Returns a page on success, NULL on failure.
2393 * So far these are the priorities for this algorithm in descending
2394 * order:
2396 * 1) When servicing a request try to do so with a free page
2397 * from next size up. Helps defer fragmentation as long
2398 * as possible.
2400 * 2) Page coalesce on demand. Only when a freelist
2401 * larger than PAGESIZE is empty and step 1
2402 * will not work since all larger size lists are
2403 * also empty.
2405 * If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
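/*
 * For illustration: when page_get_mnode_freelist() below finds every
 * equivalent-color bin of the requested szc empty, it first calls
 * page_freelist_split() to demote a page from the next size up
 * (priority 1); only for szc > 0, and only if splitting fails, does it
 * fall back to page_freelist_coalesce() (priority 2).
 */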
2408 page_t *
2409 page_freelist_split(uchar_t szc, uint_t color, int mnode, int mtype,
2410 pfn_t pfnlo, pfn_t pfnhi, page_list_walker_t *plw)
2412 uchar_t nszc = szc + 1;
2413 uint_t bin, sbin, bin_prev;
2414 page_t *pp, *firstpp;
2415 page_t *ret_pp = NULL;
2416 uint_t color_mask;
2418 if (nszc == mmu_page_sizes)
2419 return (NULL);
2421 ASSERT(nszc < mmu_page_sizes);
2422 color_mask = PAGE_GET_PAGECOLORS(nszc) - 1;
2423 bin = sbin = PAGE_GET_NSZ_COLOR(szc, color);
2424 bin_prev = (plw->plw_bin_split_prev == color) ? INVALID_COLOR :
2425 PAGE_GET_NSZ_COLOR(szc, plw->plw_bin_split_prev);
2427 VM_STAT_ADD(vmm_vmstats.pfs_req[szc]);
2429 * First try to break up a larger page to fill current size freelist.
2431 while (plw->plw_bins[nszc] != 0) {
2433 ASSERT(nszc < mmu_page_sizes);
2436 * If page found then demote it.
2438 if (PAGE_FREELISTS(mnode, nszc, bin, mtype)) {
2439 page_freelist_lock(mnode);
2440 firstpp = pp = PAGE_FREELISTS(mnode, nszc, bin, mtype);
2443 * If pfnhi is not PFNNULL, look for large page below
2444 * pfnhi. PFNNULL signifies no pfn requirement.
2446 if (pp &&
2447 ((pfnhi != PFNNULL && pp->p_pagenum >= pfnhi) ||
2448 (pfnlo != PFNNULL && pp->p_pagenum < pfnlo))) {
2449 do {
2450 pp = pp->p_list.largepg.next;
2451 if (pp == firstpp) {
2452 pp = NULL;
2453 break;
2455 } while ((pfnhi != PFNNULL &&
2456 pp->p_pagenum >= pfnhi) ||
2457 (pfnlo != PFNNULL &&
2458 pp->p_pagenum < pfnlo));
2460 if (pfnhi != PFNNULL && pp != NULL)
2461 ASSERT(pp->p_pagenum < pfnhi);
2463 if (pfnlo != PFNNULL && pp != NULL)
2464 ASSERT(pp->p_pagenum >= pfnlo);
2466 if (pp) {
2467 uint_t ccolor = page_correct_color(szc, nszc,
2468 color, bin, plw->plw_ceq_mask[szc]);
2470 ASSERT(pp->p_szc == nszc);
2471 VM_STAT_ADD(vmm_vmstats.pfs_demote[nszc]);
2472 ret_pp = page_demote(mnode, pp->p_pagenum,
2473 pfnhi, pp->p_szc, szc, ccolor, PC_ALLOC);
2474 if (ret_pp) {
2475 page_freelist_unlock(mnode);
2476 return (ret_pp);
2479 page_freelist_unlock(mnode);
2482 /* loop through next size bins */
2483 bin = ADD_MASKED(bin, 1, plw->plw_ceq_mask[nszc], color_mask);
2484 plw->plw_bins[nszc]--;
2486 if (bin == sbin) {
2487 uchar_t nnszc = nszc + 1;
2489 /* we are done with this page size - check next */
2490 if (plw->plw_bins[nnszc] == 0)
2491 /* we have already checked next size bins */
2492 break;
2494 bin = sbin = PAGE_GET_NSZ_COLOR(nszc, bin);
2495 if (bin_prev != INVALID_COLOR) {
2496 bin_prev = PAGE_GET_NSZ_COLOR(nszc, bin_prev);
2497 if (!((bin ^ bin_prev) &
2498 plw->plw_ceq_mask[nnszc]))
2499 break;
2501 ASSERT(nnszc < mmu_page_sizes);
2502 color_mask = PAGE_GET_PAGECOLORS(nnszc) - 1;
2503 nszc = nnszc;
2504 ASSERT(nszc < mmu_page_sizes);
2508 return (ret_pp);
2512 * Helper routine used only by the freelist code to lock
2513 * a page. If the page is a large page then it succeeds in
2514 * locking all the constituent pages or none at all.
2515 * Returns 1 on success, 0 on failure.
2517 static int
2518 page_trylock_cons(page_t *pp, se_t se)
2520 page_t *tpp, *first_pp = pp;
2523 * Fail if can't lock first or only page.
2525 if (!page_trylock(pp, se)) {
2526 return (0);
2530 * PAGESIZE: common case.
2532 if (pp->p_szc == 0) {
2533 return (1);
2537 * Large page case.
2539 tpp = pp->p_next;
2540 while (tpp != pp) {
2541 if (!page_trylock(tpp, se)) {
2543 * On failure unlock what we have locked so far.
2544 * We want to avoid attempting to capture these
2545 * pages as the pcm mutex may be held which could
2546 * lead to a recursive mutex panic.
2548 while (first_pp != tpp) {
2549 page_unlock_nocapture(first_pp);
2550 first_pp = first_pp->p_next;
2552 return (0);
2554 tpp = tpp->p_next;
2556 return (1);
2560 * init context for walking page lists
2561 * Called when a page of the given szc is unavailable. Sets markers
2562 * for the beginning of the search to detect when search has
2563 * completed a full cycle. Sets flags for splitting larger pages
2564 * and coalescing smaller pages. Page walking proceeds until a page
2565 * of the desired equivalent color is found.
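/*
 * Typical walker usage, as in page_get_mnode_freelist() and
 * page_get_mnode_cachelist() below (sketch only):
 *
 *	page_list_walk_init(szc, flags, bin, can_split, use_ceq, &plw);
 *	do {
 *		... try the list for 'bin' ...
 *		bin = ADD_MASKED(bin, plw.plw_bin_step,
 *		    plw.plw_ceq_mask[szc], plw.plw_color_mask);
 *	} while (sbin != bin);
 *	if (plw.plw_ceq_dif > 1)
 *		bin = page_list_walk_next_bin(szc, bin, &plw);
 */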
2567 void
2568 page_list_walk_init(uchar_t szc, uint_t flags, uint_t bin, int can_split,
2569 int use_ceq, page_list_walker_t *plw)
2571 uint_t nszc, ceq_mask, colors;
2572 uchar_t ceq = use_ceq ? colorequivszc[szc] : 0;
2574 ASSERT(szc < mmu_page_sizes);
2575 colors = PAGE_GET_PAGECOLORS(szc);
2577 plw->plw_colors = colors;
2578 plw->plw_color_mask = colors - 1;
2579 plw->plw_bin_marker = plw->plw_bin0 = bin;
2580 plw->plw_bin_split_prev = bin;
2581 plw->plw_bin_step = (szc == 0) ? vac_colors : 1;
2584 * if vac aliasing is possible make sure lower order color
2585 * bits are never ignored
2587 if (vac_colors > 1)
2588 ceq &= 0xf0;
2591 * calculate the number of non-equivalent colors and
2592 * color equivalency mask
2594 plw->plw_ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));
2595 ASSERT(szc > 0 || plw->plw_ceq_dif >= vac_colors);
2596 ASSERT(plw->plw_ceq_dif > 0);
2597 plw->plw_ceq_mask[szc] = (plw->plw_ceq_dif - 1) << (ceq & 0xf);
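/*
 * Worked example (illustrative values, assuming vac_colors == 1): with
 * colors == 64 and ceq == 0x21, plw_ceq_dif == 64 >> (2 + 1) == 8 and
 * plw_ceq_mask[szc] == 7 << 1 == 0x0e, i.e. the two high-order and one
 * low-order color bits are ignored when matching colors.
 */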
2599 if (flags & PG_MATCH_COLOR) {
2600 if (cpu_page_colors < 0) {
2602 * this is a heterogeneous machine with different CPUs
2603 * having different size e$ (not supported for ni2/rock)
2605 uint_t cpucolors = CPUSETSIZE() >> PAGE_GET_SHIFT(szc);
2606 cpucolors = MAX(cpucolors, 1);
2607 ceq_mask = plw->plw_color_mask & (cpucolors - 1);
2608 plw->plw_ceq_mask[szc] =
2609 MIN(ceq_mask, plw->plw_ceq_mask[szc]);
2611 plw->plw_ceq_dif = 1;
2614 /* we can split pages in the freelist, but not the cachelist */
2615 if (can_split) {
2616 plw->plw_do_split = (szc + 1 < mmu_page_sizes) ? 1 : 0;
2618 /* set next szc color masks and number of free list bins */
2619 for (nszc = szc + 1; nszc < mmu_page_sizes; nszc++, szc++) {
2620 plw->plw_ceq_mask[nszc] = PAGE_GET_NSZ_MASK(szc,
2621 plw->plw_ceq_mask[szc]);
2622 plw->plw_bins[nszc] = PAGE_GET_PAGECOLORS(nszc);
2624 plw->plw_ceq_mask[nszc] = INVALID_MASK;
2625 plw->plw_bins[nszc] = 0;
2627 } else {
2628 ASSERT(szc == 0);
2629 plw->plw_do_split = 0;
2630 plw->plw_bins[1] = 0;
2631 plw->plw_ceq_mask[1] = INVALID_MASK;
2636 * set mark to flag where next split should occur
2638 #define PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw) { \
2639 uint_t bin_nsz = PAGE_GET_NSZ_COLOR(szc, bin); \
2640 uint_t bin0_nsz = PAGE_GET_NSZ_COLOR(szc, plw->plw_bin0); \
2641 uint_t neq_mask = ~plw->plw_ceq_mask[nszc] & plw->plw_color_mask; \
2642 plw->plw_split_next = \
2643 INC_MASKED(bin_nsz, neq_mask, plw->plw_color_mask); \
2644 if (!((plw->plw_split_next ^ bin0_nsz) & plw->plw_ceq_mask[nszc])) { \
2645 plw->plw_split_next = \
2646 INC_MASKED(plw->plw_split_next, \
2647 neq_mask, plw->plw_color_mask); \
2651 uint_t
2652 page_list_walk_next_bin(uchar_t szc, uint_t bin, page_list_walker_t *plw)
2654 uint_t neq_mask = ~plw->plw_ceq_mask[szc] & plw->plw_color_mask;
2655 uint_t bin0_nsz, nbin_nsz, nbin0, nbin;
2656 uchar_t nszc = szc + 1;
2658 nbin = ADD_MASKED(bin,
2659 plw->plw_bin_step, neq_mask, plw->plw_color_mask);
2661 if (plw->plw_do_split) {
2662 plw->plw_bin_split_prev = bin;
2663 PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw);
2664 plw->plw_do_split = 0;
2667 if (szc == 0) {
2668 if (plw->plw_count != 0 || plw->plw_ceq_dif == vac_colors) {
2669 if (nbin == plw->plw_bin0 &&
2670 (vac_colors == 1 || nbin != plw->plw_bin_marker)) {
2671 nbin = ADD_MASKED(nbin, plw->plw_bin_step,
2672 neq_mask, plw->plw_color_mask);
2673 plw->plw_bin_split_prev = plw->plw_bin0;
2676 if (vac_colors > 1 && nbin == plw->plw_bin_marker) {
2677 plw->plw_bin_marker =
2678 nbin = INC_MASKED(nbin, neq_mask,
2679 plw->plw_color_mask);
2680 plw->plw_bin_split_prev = plw->plw_bin0;
2682 * large pages all have the same vac color
2683 * so by now we should be done with next
2684 * size page splitting process
2686 ASSERT(plw->plw_bins[1] == 0);
2687 plw->plw_do_split = 0;
2688 return (nbin);
2691 } else {
2692 uint_t bin_jump = (vac_colors == 1) ?
2693 (BIN_STEP & ~3) - (plw->plw_bin0 & 3) : BIN_STEP;
2695 bin_jump &= ~(vac_colors - 1);
2697 nbin0 = ADD_MASKED(plw->plw_bin0, bin_jump, neq_mask,
2698 plw->plw_color_mask);
2700 if ((nbin0 ^ plw->plw_bin0) & plw->plw_ceq_mask[szc]) {
2702 plw->plw_bin_marker = nbin = nbin0;
2704 if (plw->plw_bins[nszc] != 0) {
2706 * check if next page size bin is the
2707 * same as the next page size bin for
2708 * bin0
2710 nbin_nsz = PAGE_GET_NSZ_COLOR(szc,
2711 nbin);
2712 bin0_nsz = PAGE_GET_NSZ_COLOR(szc,
2713 plw->plw_bin0);
2715 if ((bin0_nsz ^ nbin_nsz) &
2716 plw->plw_ceq_mask[nszc])
2717 plw->plw_do_split = 1;
2719 return (nbin);
2724 if (plw->plw_bins[nszc] != 0) {
2725 nbin_nsz = PAGE_GET_NSZ_COLOR(szc, nbin);
2726 if (!((plw->plw_split_next ^ nbin_nsz) &
2727 plw->plw_ceq_mask[nszc]))
2728 plw->plw_do_split = 1;
2731 return (nbin);
2734 page_t *
2735 page_get_mnode_freelist(int mnode, uint_t bin, int mtype, uchar_t szc,
2736 uint_t flags)
2738 kmutex_t *pcm;
2739 page_t *pp, *first_pp;
2740 uint_t sbin;
2741 int plw_initialized;
2742 page_list_walker_t plw;
2744 ASSERT(szc < mmu_page_sizes);
2746 VM_STAT_ADD(vmm_vmstats.pgmf_alloc[szc]);
2748 MTYPE_START(mnode, mtype, flags);
2749 if (mtype < 0) { /* mnode does not have memory in mtype range */
2750 VM_STAT_ADD(vmm_vmstats.pgmf_allocempty[szc]);
2751 return (NULL);
2753 try_again:
2755 plw_initialized = 0;
2756 plw.plw_ceq_dif = 1;
2759 * Only hold one freelist lock at a time, that way we
2760 * can start anywhere and not have to worry about lock
2761 * ordering.
2763 for (plw.plw_count = 0;
2764 plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
2765 sbin = bin;
2766 do {
2767 if (!PAGE_FREELISTS(mnode, szc, bin, mtype))
2768 goto bin_empty_1;
2770 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
2771 mutex_enter(pcm);
2772 pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
2773 if (pp == NULL)
2774 goto bin_empty_0;
2777 * These were set before the page
2778 * was put on the free list,
2779 * they must still be set.
2781 ASSERT(PP_ISFREE(pp));
2782 ASSERT(PP_ISAGED(pp));
2783 VERIFY(pp->p_object == NULL);
2784 ASSERT(pp->p_vnode == NULL);
2785 ASSERT(pp->p_offset == (uoff_t)-1);
2786 ASSERT(pp->p_szc == szc);
2787 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
2790 * Walk down the hash chain. 4k/8k pages are linked
2791 * on p_next and p_prev fields. Large pages are a
2792 * contiguous group of constituent pages linked
2793 * together on their p_next and p_prev fields. The
2794 * large pages are linked together on the hash chain
2795 * using p_list.largepg of the base constituent page
2796 * of each large page.
2798 first_pp = pp;
2799 while (!page_trylock_cons(pp, SE_EXCL)) {
2800 if (szc == 0) {
2801 pp = pp->p_next;
2802 } else {
2803 pp = pp->p_list.largepg.next;
2806 ASSERT(PP_ISFREE(pp));
2807 ASSERT(PP_ISAGED(pp));
2808 VERIFY(pp->p_object == NULL);
2809 ASSERT(pp->p_vnode == NULL);
2810 ASSERT(pp->p_offset == (uoff_t)-1);
2811 ASSERT(pp->p_szc == szc);
2812 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
2814 if (pp == first_pp)
2815 goto bin_empty_0;
2818 ASSERT(pp != NULL);
2819 ASSERT(mtype == PP_2_MTYPE(pp));
2820 ASSERT(pp->p_szc == szc);
2821 if (szc == 0) {
2822 page_sub(&PAGE_FREELISTS(mnode,
2823 szc, bin, mtype), pp);
2824 } else {
2825 page_lpsub(&PAGE_FREELISTS(mnode,
2826 szc, bin, mtype), pp);
2827 CHK_LPG(pp, szc);
2829 page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
2831 if ((PP_ISFREE(pp) == 0) || (PP_ISAGED(pp) == 0))
2832 panic("free page is not. pp %p", (void *)pp);
2833 mutex_exit(pcm);
2835 VM_STAT_ADD(vmm_vmstats.pgmf_allocok[szc]);
2836 return (pp);
2838 bin_empty_0:
2839 mutex_exit(pcm);
2840 bin_empty_1:
2841 if (plw_initialized == 0) {
2842 page_list_walk_init(szc, flags, bin, 1, 1,
2843 &plw);
2844 plw_initialized = 1;
2845 ASSERT(plw.plw_colors <=
2846 PAGE_GET_PAGECOLORS(szc));
2847 ASSERT(plw.plw_colors > 0);
2848 ASSERT((plw.plw_colors &
2849 (plw.plw_colors - 1)) == 0);
2850 ASSERT(bin < plw.plw_colors);
2851 ASSERT(plw.plw_ceq_mask[szc] < plw.plw_colors);
2853 /* calculate the next bin with equivalent color */
2854 bin = ADD_MASKED(bin, plw.plw_bin_step,
2855 plw.plw_ceq_mask[szc], plw.plw_color_mask);
2856 } while (sbin != bin);
2859 * All the equivalent color bins are empty. Try to
2860 * satisfy the request by breaking up or coalescing
2861 * pages from a different size freelist of the correct
2862 * color that satisfies the ORIGINAL color requested.
2863 * If that fails then try pages of the same size but
2864 * different colors assuming we are not called with
2865 * PG_MATCH_COLOR.
2867 if (plw.plw_do_split &&
2868 (pp = page_freelist_split(szc, bin, mnode,
2869 mtype, PFNNULL, PFNNULL, &plw)) != NULL)
2870 return (pp);
2872 if (szc > 0 && (pp = page_freelist_coalesce(mnode, szc,
2873 bin, plw.plw_ceq_mask[szc], mtype, PFNNULL)) != NULL)
2874 return (pp);
2876 if (plw.plw_ceq_dif > 1)
2877 bin = page_list_walk_next_bin(szc, bin, &plw);
2880 /* if allowed, cycle through additional mtypes */
2881 MTYPE_NEXT(mnode, mtype, flags);
2882 if (mtype >= 0)
2883 goto try_again;
2885 VM_STAT_ADD(vmm_vmstats.pgmf_allocfailed[szc]);
2887 return (NULL);
2891 * Returns the count of free pages for 'pp' with size code 'szc'.
2892 * Note: This function does not return an exact value as the page freelist
2893 * locks are not held and thus the values in the page_counters may be
2894 * changing as we walk through the data.
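/*
 * Sketch of the counter walk below: the szc-level counter contributes
 * cnt << PNUM_SHIFT(szc - 1) pages for the cnt full next-size-down
 * regions it has recorded; the loop then walks r down toward 1, adding
 * cnt << PNUM_SHIFT(r - 1) only for sub-regions that are not already
 * full, since full ones were accounted for one level up.
 */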
2896 static int
2897 page_freecnt(int mnode, page_t *pp, uchar_t szc)
2899 pgcnt_t pgfree;
2900 pgcnt_t cnt;
2901 ssize_t r = szc; /* region size */
2902 ssize_t idx;
2903 int i;
2904 int full, range;
2906 /* Make sure pagenum passed in is aligned properly */
2907 ASSERT((pp->p_pagenum & (PNUM_SIZE(szc) - 1)) == 0);
2908 ASSERT(szc > 0);
2910 /* Prevent page_counters dynamic memory from being freed */
2911 rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
2912 idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
2913 cnt = PAGE_COUNTERS(mnode, r, idx);
2914 pgfree = cnt << PNUM_SHIFT(r - 1);
2915 range = FULL_REGION_CNT(szc);
2917 /* Check for completely full region */
2918 if (cnt == range) {
2919 rw_exit(&page_ctrs_rwlock[mnode]);
2920 return (pgfree);
2923 while (--r > 0) {
2924 idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
2925 full = FULL_REGION_CNT(r);
2926 for (i = 0; i < range; i++, idx++) {
2927 cnt = PAGE_COUNTERS(mnode, r, idx);
2929 * If cnt here is full, that means we have already
2930 * accounted for these pages earlier.
2932 if (cnt != full) {
2933 pgfree += (cnt << PNUM_SHIFT(r - 1));
2936 range *= full;
2938 rw_exit(&page_ctrs_rwlock[mnode]);
2939 return (pgfree);
2943 * Called from page_geti_contig_pages to exclusively lock constituent pages
2944 * starting from 'spp' for page size code 'szc'.
2946 * If 'ptcpthreshold' is set, at least pgcnt/ptcpthreshold of the pages in
2947 * the 'szc' region must already be free before the trylock is attempted.
2949 static int
2950 page_trylock_contig_pages(int mnode, page_t *spp, uchar_t szc, int flags)
2952 pgcnt_t pgcnt = PNUM_SIZE(szc);
2953 pgcnt_t pgfree, i;
2954 page_t *pp;
2956 VM_STAT_ADD(vmm_vmstats.ptcp[szc]);
2959 if ((ptcpthreshold == 0) || (flags & PGI_PGCPHIPRI))
2960 goto skipptcpcheck;
2962 * check if there are sufficient free pages available before attempting
2963 * to trylock. Count is approximate as page counters can change.
2965 pgfree = page_freecnt(mnode, spp, szc);
2967 /* attempt to trylock if there are sufficient already free pages */
2968 if (pgfree < pgcnt/ptcpthreshold) {
2969 VM_STAT_ADD(vmm_vmstats.ptcpfreethresh[szc]);
2970 return (0);
2973 skipptcpcheck:
2975 for (i = 0; i < pgcnt; i++) {
2976 pp = &spp[i];
2977 if (!page_trylock(pp, SE_EXCL)) {
2978 VM_STAT_ADD(vmm_vmstats.ptcpfailexcl[szc]);
2979 while (--i != (pgcnt_t)-1) {
2980 pp = &spp[i];
2981 ASSERT(PAGE_EXCL(pp));
2982 page_unlock_nocapture(pp);
2984 return (0);
2986 ASSERT(spp[i].p_pagenum == spp->p_pagenum + i);
2987 if ((pp->p_szc > szc || (szc && pp->p_szc == szc)) &&
2988 !PP_ISFREE(pp)) {
2989 VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]);
2990 ASSERT(i == 0);
2991 page_unlock_nocapture(pp);
2992 return (0);
2996 * If a page has been marked non-relocatable or has been
2997 * explicitly locked in memory, we don't want to relocate it;
2998 * unlock the pages and fail the operation.
3000 if (PP_ISNORELOC(pp) ||
3001 pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
3002 VM_STAT_ADD(vmm_vmstats.ptcpfailcage[szc]);
3003 while (i != (pgcnt_t)-1) {
3004 pp = &spp[i];
3005 ASSERT(PAGE_EXCL(pp));
3006 page_unlock_nocapture(pp);
3007 i--;
3009 return (0);
3012 VM_STAT_ADD(vmm_vmstats.ptcpok[szc]);
3013 return (1);
3017 * Claim large page pointed to by 'pp'. 'pp' is the starting set
3018 * of 'szc' constituent pages that had been locked exclusively previously.
3019 * Will attempt to relocate constituent pages in use.
3021 static page_t *
3022 page_claim_contig_pages(page_t *pp, uchar_t szc, int flags)
3024 spgcnt_t pgcnt, npgs, i;
3025 page_t *targpp, *rpp, *hpp;
3026 page_t *replpp = NULL;
3027 page_t *pplist = NULL;
3029 ASSERT(pp != NULL);
3031 pgcnt = page_get_pagecnt(szc);
3032 while (pgcnt) {
3033 ASSERT(PAGE_EXCL(pp));
3034 ASSERT(!PP_ISNORELOC(pp));
3035 if (PP_ISFREE(pp)) {
3037 * If this is a PG_FREE_LIST page then its
3038 * size code can change underneath us due to
3039 * page promotion or demotion. As an optimization
3040 * use page_list_sub_pages() instead of
3041 * page_list_sub().
3043 if (PP_ISAGED(pp)) {
3044 page_list_sub_pages(pp, szc);
3045 if (pp->p_szc == szc) {
3046 return (pp);
3048 ASSERT(pp->p_szc < szc);
3049 npgs = page_get_pagecnt(pp->p_szc);
3050 hpp = pp;
3051 for (i = 0; i < npgs; i++, pp++) {
3052 pp->p_szc = szc;
3054 page_list_concat(&pplist, &hpp);
3055 pgcnt -= npgs;
3056 continue;
3058 ASSERT(!PP_ISAGED(pp));
3059 ASSERT(pp->p_szc == 0);
3060 page_list_sub(pp, PG_CACHE_LIST);
3061 page_hashout(pp, false);
3062 PP_SETAGED(pp);
3063 pp->p_szc = szc;
3064 page_list_concat(&pplist, &pp);
3065 pp++;
3066 pgcnt--;
3067 continue;
3069 npgs = page_get_pagecnt(pp->p_szc);
3072 * page_create_wait freemem accounting done by caller of
3073 * page_get_freelist and not necessary to call it prior to
3074 * calling page_get_replacement_page.
3076 * page_get_replacement_page can call page_get_contig_pages
3077 * to acquire a large page (szc > 0); the replacement must be
3078 * smaller than the contig page size to avoid looping or
3079 * szc == 0 and PGI_PGCPSZC0 is set.
3081 if (pp->p_szc < szc || (szc == 0 && (flags & PGI_PGCPSZC0))) {
3082 replpp = page_get_replacement_page(pp, NULL, 0);
3083 if (replpp) {
3084 npgs = page_get_pagecnt(pp->p_szc);
3085 ASSERT(npgs <= pgcnt);
3086 targpp = pp;
3091 * If replacement is NULL or do_page_relocate fails, fail
3092 * coalescing of pages.
3094 if (replpp == NULL || (do_page_relocate(&targpp, &replpp, 0,
3095 &npgs, NULL) != 0)) {
3097 * Unlock un-processed target list
3099 while (pgcnt--) {
3100 ASSERT(PAGE_EXCL(pp));
3101 page_unlock_nocapture(pp);
3102 pp++;
3105 * Free the processed target list.
3107 while (pplist) {
3108 pp = pplist;
3109 page_sub(&pplist, pp);
3110 ASSERT(PAGE_EXCL(pp));
3111 ASSERT(pp->p_szc == szc);
3112 ASSERT(PP_ISFREE(pp));
3113 ASSERT(PP_ISAGED(pp));
3114 pp->p_szc = 0;
3115 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
3116 page_unlock_nocapture(pp);
3119 if (replpp != NULL)
3120 page_free_replacement_page(replpp);
3122 return (NULL);
3124 ASSERT(pp == targpp);
3126 ASSERT(hpp = pp); /* That's right, it's an assignment */
3128 pp += npgs;
3129 pgcnt -= npgs;
3131 while (npgs--) {
3132 ASSERT(PAGE_EXCL(targpp));
3133 ASSERT(!PP_ISFREE(targpp));
3134 ASSERT(!PP_ISNORELOC(targpp));
3135 PP_SETFREE(targpp);
3136 ASSERT(PP_ISAGED(targpp));
3137 ASSERT(targpp->p_szc < szc || (szc == 0 &&
3138 (flags & PGI_PGCPSZC0)));
3139 targpp->p_szc = szc;
3140 targpp = targpp->p_next;
3142 rpp = replpp;
3143 ASSERT(rpp != NULL);
3144 page_sub(&replpp, rpp);
3145 ASSERT(PAGE_EXCL(rpp));
3146 ASSERT(!PP_ISFREE(rpp));
3147 page_unlock_nocapture(rpp);
3149 ASSERT(targpp == hpp);
3150 ASSERT(replpp == NULL);
3151 page_list_concat(&pplist, &targpp);
3153 CHK_LPG(pplist, szc);
3154 return (pplist);
3158 * Trim kernel cage from pfnlo-pfnhi and store result in lo-hi. Return code
3159 * of 0 means nothing left after trim.
3162 trimkcage(struct memseg *mseg, pfn_t *lo, pfn_t *hi, pfn_t pfnlo, pfn_t pfnhi)
3164 pfn_t kcagepfn;
3165 int decr;
3166 int rc = 0;
3168 if (PP_ISNORELOC(mseg->pages)) {
3169 if (PP_ISNORELOC(mseg->epages - 1) == 0) {
3171 /* lower part of this mseg inside kernel cage */
3172 decr = kcage_current_pfn(&kcagepfn);
3174 /* kernel cage may have transitioned past mseg */
3175 if (kcagepfn >= mseg->pages_base &&
3176 kcagepfn < mseg->pages_end) {
3177 ASSERT(decr == 0);
3178 *lo = MAX(kcagepfn, pfnlo);
3179 *hi = MIN(pfnhi, (mseg->pages_end - 1));
3180 rc = 1;
3183 /* else entire mseg in the cage */
3184 } else {
3185 if (PP_ISNORELOC(mseg->epages - 1)) {
3187 /* upper part of this mseg inside kernel cage */
3188 decr = kcage_current_pfn(&kcagepfn);
3190 /* kernel cage may have transitioned past mseg */
3191 if (kcagepfn >= mseg->pages_base &&
3192 kcagepfn < mseg->pages_end) {
3193 ASSERT(decr);
3194 *hi = MIN(kcagepfn, pfnhi);
3195 *lo = MAX(pfnlo, mseg->pages_base);
3196 rc = 1;
3198 } else {
3199 /* entire mseg outside of kernel cage */
3200 *lo = MAX(pfnlo, mseg->pages_base);
3201 *hi = MIN(pfnhi, (mseg->pages_end - 1));
3202 rc = 1;
3205 return (rc);
3209 * called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to claim a
3210 * page with size code 'szc'. Claiming such a page requires acquiring
3211 * exclusive locks on all constituent pages (page_trylock_contig_pages),
3212 * relocating pages in use and concatenating these constituent pages into a
3213 * large page.
3215 * The page lists do not have such a large page and page_freelist_split has
3216 * already failed to demote larger pages and/or coalesce smaller free pages.
3218 * 'flags' may specify PG_COLOR_MATCH which would limit the search of large
3219 * pages with the same color as 'bin'.
3221 * 'pfnflag' specifies the subset of the pfn range to search.
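/*
 * Note: page_get_contig_pages() below derives the initial [pfnlo, pfnhi]
 * from MNODETYPE_2_PFN() and, when pgcplimitsearch is set and
 * PGI_PGCPHIPRI is not, passes pgcpfailcnt[szc] as 'pfnflag', which is
 * what progressively narrows the slot searched here after repeated
 * failures.
 */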
3224 static page_t *
3225 page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags,
3226 pfn_t pfnlo, pfn_t pfnhi, pgcnt_t pfnflag)
3228 struct memseg *mseg;
3229 pgcnt_t szcpgcnt = page_get_pagecnt(szc);
3230 pgcnt_t szcpgmask = szcpgcnt - 1;
3231 pfn_t randpfn;
3232 page_t *pp, *randpp, *endpp;
3233 uint_t colors, ceq_mask;
3234 uint_t color_mask;
3235 pfn_t hi, lo;
3236 uint_t skip;
3237 MEM_NODE_ITERATOR_DECL(it);
3239 ASSERT(szc != 0 || (flags & PGI_PGCPSZC0));
3241 pfnlo = P2ROUNDUP(pfnlo, szcpgcnt);
3243 if ((pfnhi - pfnlo) + 1 < szcpgcnt || pfnlo >= pfnhi)
3244 return (NULL);
3246 ASSERT(szc < mmu_page_sizes);
3248 colors = PAGE_GET_PAGECOLORS(szc);
3249 color_mask = colors - 1;
3250 if ((colors > 1) && (flags & PG_MATCH_COLOR)) {
3251 uchar_t ceq = colorequivszc[szc];
3252 uint_t ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));
3254 ASSERT(ceq_dif > 0);
3255 ceq_mask = (ceq_dif - 1) << (ceq & 0xf);
3256 } else {
3257 ceq_mask = 0;
3260 ASSERT(bin < colors);
3262 /* clear "non-significant" color bits */
3263 bin &= ceq_mask;
3266 * trim the pfn range to search based on pfnflag. pfnflag is set
3267 * when there have been previous page_get_contig_pages failures to
3268 * limit the search.
3270 * The high bit in pfnflag specifies the number of 'slots' in the
3271 * pfn range and the remainder of pfnflag specifies which slot.
3272 * For example, a value of 1010b selects slot 2 (counting from 0) of
3273 * a pfn range that has been divided into 8 slots.
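/*
 * Worked out with those example bits: slots == 1 << (highbit(1010b) - 1)
 * == 8 and slotid == 1010b & (8 - 1) == 2, so the search below is
 * confined to one eighth of the aligned pfn range.
 */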
3275 if (pfnflag > 1) {
3276 int slots = 1 << (highbit(pfnflag) - 1);
3277 int slotid = pfnflag & (slots - 1);
3278 pgcnt_t szcpages;
3279 int slotlen;
3281 pfnhi = P2ALIGN((pfnhi + 1), szcpgcnt) - 1;
3282 szcpages = ((pfnhi - pfnlo) + 1) / szcpgcnt;
3283 slotlen = howmany(szcpages, slots);
3284 /* skip if 'slotid' slot is empty */
3285 if (slotid * slotlen >= szcpages)
3286 return (NULL);
3287 pfnlo = pfnlo + (((slotid * slotlen) % szcpages) * szcpgcnt);
3288 ASSERT(pfnlo < pfnhi);
3289 if (pfnhi > pfnlo + (slotlen * szcpgcnt))
3290 pfnhi = pfnlo + (slotlen * szcpgcnt) - 1;
3294 * This routine can be called recursively, so we shouldn't
3295 * acquire a reader lock if a write request is pending. This
3296 * could lead to a deadlock with the DR thread.
3298 * Returning NULL informs the caller that we could not get
3299 * a contig page with the required characteristics.
3302 if (!memsegs_trylock(0))
3303 return (NULL);
3306 * loop through memsegs to look for contig page candidates
3309 for (mseg = memsegs; mseg != NULL; mseg = mseg->next) {
3310 if (pfnhi < mseg->pages_base || pfnlo >= mseg->pages_end) {
3311 /* no overlap */
3312 continue;
3315 if (mseg->pages_end - mseg->pages_base < szcpgcnt)
3316 /* mseg too small */
3317 continue;
3320 * trim off kernel cage pages from pfn range and check for
3321 * a trimmed pfn range returned that does not span the
3322 * desired large page size.
3324 if (kcage_on) {
3325 if (trimkcage(mseg, &lo, &hi, pfnlo, pfnhi) == 0 ||
3326 lo >= hi || ((hi - lo) + 1) < szcpgcnt)
3327 continue;
3328 } else {
3329 lo = MAX(pfnlo, mseg->pages_base);
3330 hi = MIN(pfnhi, (mseg->pages_end - 1));
3333 /* round to szcpgcnt boundaries */
3334 lo = P2ROUNDUP(lo, szcpgcnt);
3336 MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
3337 hi = P2ALIGN((hi + 1), szcpgcnt) - 1;
3339 if (hi <= lo)
3340 continue;
3343 * set lo to point to the pfn for the desired bin. Large
3344 * page sizes may only have a single page color
3346 skip = szcpgcnt;
3347 if (ceq_mask > 0 || interleaved_mnodes) {
3348 /* set lo to point at appropriate color */
3349 if (((PFN_2_COLOR(lo, szc, &it) ^ bin) & ceq_mask) ||
3350 (interleaved_mnodes &&
3351 PFN_2_MEM_NODE(lo) != mnode)) {
3352 PAGE_NEXT_PFN_FOR_COLOR(lo, szc, bin, ceq_mask,
3353 color_mask, &it);
3355 if (hi <= lo)
3356 /* mseg cannot satisfy color request */
3357 continue;
3360 /* randomly choose a point between lo and hi to begin search */
3362 randpfn = (pfn_t)GETTICK();
3363 randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1);
3364 MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc, &it);
3365 if (ceq_mask || interleaved_mnodes || randpfn == (pfn_t)-1) {
3366 if (randpfn != (pfn_t)-1) {
3367 PAGE_NEXT_PFN_FOR_COLOR(randpfn, szc, bin,
3368 ceq_mask, color_mask, &it);
3370 if (randpfn >= hi) {
3371 randpfn = lo;
3372 MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc,
3373 &it);
3376 randpp = mseg->pages + (randpfn - mseg->pages_base);
3378 ASSERT(randpp->p_pagenum == randpfn);
3380 pp = randpp;
3381 endpp = mseg->pages + (hi - mseg->pages_base) + 1;
3383 ASSERT(randpp + szcpgcnt <= endpp);
3385 do {
3386 ASSERT(!(pp->p_pagenum & szcpgmask));
3387 ASSERT(((PP_2_BIN(pp) ^ bin) & ceq_mask) == 0);
3389 if (page_trylock_contig_pages(mnode, pp, szc, flags)) {
3390 /* pages unlocked by page_claim on failure */
3391 if (page_claim_contig_pages(pp, szc, flags)) {
3392 memsegs_unlock(0);
3393 return (pp);
3397 if (ceq_mask == 0 && !interleaved_mnodes) {
3398 pp += skip;
3399 } else {
3400 pfn_t pfn = pp->p_pagenum;
3402 PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, bin,
3403 ceq_mask, color_mask, &it);
3404 if (pfn == (pfn_t)-1) {
3405 pp = endpp;
3406 } else {
3407 pp = mseg->pages +
3408 (pfn - mseg->pages_base);
3411 if (pp >= endpp) {
3412 /* start from the beginning */
3413 MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
3414 pp = mseg->pages + (lo - mseg->pages_base);
3415 ASSERT(pp->p_pagenum == lo);
3416 ASSERT(pp + szcpgcnt <= endpp);
3418 } while (pp != randpp);
3420 memsegs_unlock(0);
3421 return (NULL);
3426 * controlling routine that searches through physical memory in an attempt to
3427 * claim a large page based on the input parameters when no such page
3428 * is available on the page free lists.
3430 * calls page_geti_contig_pages with an initial pfn range from the mnode
3431 * and mtype. page_geti_contig_pages will trim off the parts of the pfn range
3432 * that overlaps with the kernel cage or does not match the requested page
3433 * color if PG_MATCH_COLOR is set. Since this search is very expensive,
3434 * page_geti_contig_pages may further limit the search range based on
3435 * previous failure counts (pgcpfailcnt[]).
3437 * for PGI_PGCPSZC0 requests, page_get_contig_pages will relocate a base
3438 * pagesize page that satisfies mtype.
3440 page_t *
3441 page_get_contig_pages(int mnode, uint_t bin, int mtype, uchar_t szc,
3442 uint_t flags)
3444 pfn_t pfnlo, pfnhi; /* contig pages pfn range */
3445 page_t *pp;
3446 pgcnt_t pfnflag = 0; /* no limit on search if 0 */
3448 VM_STAT_ADD(vmm_vmstats.pgcp_alloc[szc]);
3450 /* no allocations from cage */
3451 flags |= PGI_NOCAGE;
3453 MTYPE_START(mnode, mtype, flags);
3454 if (mtype < 0) { /* mnode does not have memory in mtype range */
3455 VM_STAT_ADD(vmm_vmstats.pgcp_allocempty[szc]);
3456 return (NULL);
3459 ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));
3461 /* do not limit search and ignore color if hi pri */
3463 if (pgcplimitsearch && ((flags & PGI_PGCPHIPRI) == 0))
3464 pfnflag = pgcpfailcnt[szc];
3466 /* remove color match to improve chances */
3468 if (flags & PGI_PGCPHIPRI || pfnflag)
3469 flags &= ~PG_MATCH_COLOR;
3471 do {
3472 /* get pfn range based on mnode and mtype */
3473 MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi);
3475 ASSERT(pfnhi >= pfnlo);
3477 pp = page_geti_contig_pages(mnode, bin, szc, flags,
3478 pfnlo, pfnhi, pfnflag);
3480 if (pp != NULL) {
3481 pfnflag = pgcpfailcnt[szc];
3482 if (pfnflag) {
3483 /* double the search size */
3484 pgcpfailcnt[szc] = pfnflag >> 1;
3486 VM_STAT_ADD(vmm_vmstats.pgcp_allocok[szc]);
3487 return (pp);
3489 MTYPE_NEXT(mnode, mtype, flags);
3490 } while (mtype >= 0);
3492 VM_STAT_ADD(vmm_vmstats.pgcp_allocfailed[szc]);
3493 return (NULL);
3496 #if defined(__i386) || defined(__amd64)
3498 * Determine the likelihood of finding/coalescing a szc page.
3499 * Return 0 if the likelihood is small, otherwise return 1.
3501 * For now, be conservative and check only 1g pages and return 0
3502 * if there had been previous coalescing failures and the szc pages
3503 * needed to satisfy request would exhaust most of freemem.
3506 page_chk_freelist(uint_t szc)
3508 pgcnt_t pgcnt;
3510 if (szc <= 1)
3511 return (1);
3513 pgcnt = page_get_pagecnt(szc);
3514 if (pgcpfailcnt[szc] && pgcnt + throttlefree >= freemem) {
3515 VM_STAT_ADD(vmm_vmstats.pcf_deny[szc]);
3516 return (0);
3518 VM_STAT_ADD(vmm_vmstats.pcf_allow[szc]);
3519 return (1);
3521 #endif
3524 * Find the `best' page on the freelist for this (obj,off) (as,vaddr) pair.
3526 * Does its own locking and accounting.
3527 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
3528 * pages of the proper color even if there are pages of a different color.
3530 * Finds a page, removes it, THEN locks it.
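/*
 * In outline, the search below proceeds: local mnode freelists via
 * page_get_mnode_freelist() first; plain PAGESIZE requests (no
 * PGI_PGCPSZC0) then bail out so the caller can try page_get_cachelist();
 * otherwise remote mnodes are tried (unless PG_LOCAL), and finally the
 * whole walk is retried with page_get_contig_pages() unless PG_NORELOC
 * is set, pg_contig_disable is nonzero, or the cage checks below
 * forbid it.
 */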
3533 /*ARGSUSED*/
3534 page_t *
3535 page_get_freelist(struct vmobject *obj, uoff_t off, struct seg *seg,
3536 caddr_t vaddr, size_t size, uint_t flags, struct lgrp *lgrp)
3538 struct as *as = seg->s_as;
3539 page_t *pp = NULL;
3540 ulong_t bin;
3541 uchar_t szc;
3542 int mnode;
3543 int mtype;
3544 page_t *(*page_get_func)(int, uint_t, int, uchar_t, uint_t);
3545 lgrp_mnode_cookie_t lgrp_cookie;
3547 page_get_func = page_get_mnode_freelist;
3550 * If we aren't passed a specific lgroup, or passed a freed lgrp
3551 * assume we wish to allocate near to the current thread's home.
3553 if (!LGRP_EXISTS(lgrp))
3554 lgrp = lgrp_home_lgrp();
3556 if (kcage_on) {
3557 if ((flags & (PG_NORELOC | PG_PANIC)) == PG_NORELOC &&
3558 kcage_freemem < kcage_throttlefree + btop(size) &&
3559 curthread != kcage_cageout_thread) {
3561 * Set a "reserve" of kcage_throttlefree pages for
3562 * PG_PANIC and cageout thread allocations.
3564 * Everybody else has to serialize in
3565 * page_create_get_something() to get a cage page, so
3566 * that we don't deadlock cageout!
3568 return (NULL);
3570 } else {
3571 flags &= ~PG_NORELOC;
3572 flags |= PGI_NOCAGE;
3575 MTYPE_INIT(mtype, obj->vnode, vaddr, flags, size);
3578 * Convert size to page size code.
3580 if ((szc = page_szc(size)) == (uchar_t)-1)
3581 panic("page_get_freelist: illegal page size request");
3582 ASSERT(szc < mmu_page_sizes);
3584 VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc]);
3586 AS_2_BIN(as, seg, obj->vnode, vaddr, bin, szc);
3588 ASSERT(bin < PAGE_GET_PAGECOLORS(szc));
3591 * Try to get a local page first, but try remote if we can't
3592 * get a page of the right color.
3594 pgretry:
3595 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
3596 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3597 pp = page_get_func(mnode, bin, mtype, szc, flags);
3598 if (pp != NULL) {
3599 VM_STAT_ADD(vmm_vmstats.pgf_allocok[szc]);
3600 DTRACE_PROBE4(page__get,
3601 lgrp_t *, lgrp,
3602 int, mnode,
3603 ulong_t, bin,
3604 uint_t, flags);
3605 return (pp);
3608 ASSERT(pp == NULL);
3611 * For PAGESIZE requests without PGI_PGCPSZC0 set, defer to the cachelist
3612 * before checking remote free lists. The caller is expected to call
3613 * page_get_cachelist, which checks local cache lists and remote free lists.
3615 if (szc == 0 && ((flags & PGI_PGCPSZC0) == 0)) {
3616 VM_STAT_ADD(vmm_vmstats.pgf_allocdeferred);
3617 return (NULL);
3620 ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));
3622 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3624 if (!(flags & PG_LOCAL)) {
3626 * Try to get a non-local freelist page.
3628 LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
3629 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3630 pp = page_get_func(mnode, bin, mtype, szc, flags);
3631 if (pp != NULL) {
3632 DTRACE_PROBE4(page__get,
3633 lgrp_t *, lgrp,
3634 int, mnode,
3635 ulong_t, bin,
3636 uint_t, flags);
3637 VM_STAT_ADD(vmm_vmstats.pgf_allocokrem[szc]);
3638 return (pp);
3641 ASSERT(pp == NULL);
3645 * when the cage is off chances are page_get_contig_pages() will fail
3646 * to lock a large page chunk therefore when the cage is off it's not
3647 * called by default. this can be changed via /etc/system.
3649 * page_get_contig_pages() also called to acquire a base pagesize page
3650 * for page_create_get_something().
3652 if (!(flags & PG_NORELOC) && (pg_contig_disable == 0) &&
3653 (kcage_on || pg_lpgcreate_nocage || szc == 0) &&
3654 (page_get_func != page_get_contig_pages)) {
3656 VM_STAT_ADD(vmm_vmstats.pgf_allocretry[szc]);
3657 page_get_func = page_get_contig_pages;
3658 goto pgretry;
3661 if (!(flags & PG_LOCAL) && pgcplimitsearch &&
3662 page_get_func == page_get_contig_pages)
3663 SETPGCPFAILCNT(szc);
3665 VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc]);
3666 return (NULL);
3670 * Find the `best' page on the cachelist for this (obj,off) (as,vaddr) pair.
3672 * Does its own locking.
3673 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
3674 * pages of the proper color even if there are pages of a different color.
3675 * Otherwise, scan the bins for ones with pages. For each bin with pages,
3676 * try to lock one of them. If no page can be locked, try the
3677 * next bin. Return NULL if a page can not be found and locked.
3679 * Finds a page, tries to lock it, then removes it.
3682 /*ARGSUSED*/
3683 struct page *
3684 page_get_cachelist(struct vmobject *obj, uoff_t off, struct seg *seg,
3685 caddr_t vaddr, uint_t flags, struct lgrp *lgrp)
3687 page_t *pp;
3688 struct as *as = seg->s_as;
3689 ulong_t bin;
3690 int mnode;
3691 int mtype;
3692 lgrp_mnode_cookie_t lgrp_cookie;
3695 * If we aren't passed a specific lgroup, or passed a freed lgrp
3696 * assume we wish to allocate near to the current thread's home.
3698 if (!LGRP_EXISTS(lgrp))
3699 lgrp = lgrp_home_lgrp();
3701 if (!kcage_on) {
3702 flags &= ~PG_NORELOC;
3703 flags |= PGI_NOCAGE;
3706 if ((flags & (PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == PG_NORELOC &&
3707 kcage_freemem <= kcage_throttlefree) {
3709 * Reserve kcage_throttlefree pages for critical kernel
3710 * threads.
3712 * Everybody else has to go to page_create_get_something()
3713 * to get a cage page, so we don't deadlock cageout.
3715 return (NULL);
3718 AS_2_BIN(as, seg, obj->vnode, vaddr, bin, 0);
3720 ASSERT(bin < PAGE_GET_PAGECOLORS(0));
3722 MTYPE_INIT(mtype, obj->vnode, vaddr, flags, MMU_PAGESIZE);
3724 VM_STAT_ADD(vmm_vmstats.pgc_alloc);
3727 * Try local cachelists first
3729 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
3730 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3731 pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
3732 if (pp != NULL) {
3733 VM_STAT_ADD(vmm_vmstats.pgc_allocok);
3734 DTRACE_PROBE4(page__get,
3735 lgrp_t *, lgrp,
3736 int, mnode,
3737 ulong_t, bin,
3738 uint_t, flags);
3739 return (pp);
3743 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3746 * Try freelists/cachelists that are farther away
3747 * This is our only chance to allocate remote pages for PAGESIZE
3748 * requests.
3750 LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
3751 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3752 pp = page_get_mnode_freelist(mnode, bin, mtype,
3753 0, flags);
3754 if (pp != NULL) {
3755 VM_STAT_ADD(vmm_vmstats.pgc_allocokdeferred);
3756 DTRACE_PROBE4(page__get,
3757 lgrp_t *, lgrp,
3758 int, mnode,
3759 ulong_t, bin,
3760 uint_t, flags);
3761 return (pp);
3763 pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
3764 if (pp != NULL) {
3765 VM_STAT_ADD(vmm_vmstats.pgc_allocokrem);
3766 DTRACE_PROBE4(page__get,
3767 lgrp_t *, lgrp,
3768 int, mnode,
3769 ulong_t, bin,
3770 uint_t, flags);
3771 return (pp);
3775 VM_STAT_ADD(vmm_vmstats.pgc_allocfailed);
3776 return (NULL);
3779 page_t *
3780 page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype)
3782 kmutex_t *pcm;
3783 page_t *pp, *first_pp;
3784 uint_t sbin;
3785 int plw_initialized;
3786 page_list_walker_t plw;
3788 VM_STAT_ADD(vmm_vmstats.pgmc_alloc);
3790 MTYPE_START(mnode, mtype, flags);
3791 if (mtype < 0) { /* mnode does not have memory in mtype range */
3792 VM_STAT_ADD(vmm_vmstats.pgmc_allocempty);
3793 return (NULL);
3796 try_again:
3798 plw_initialized = 0;
3799 plw.plw_ceq_dif = 1;
3802 * Only hold one cachelist lock at a time, that way we
3803 * can start anywhere and not have to worry about lock
3804 * ordering.
3807 for (plw.plw_count = 0;
3808 plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
3809 sbin = bin;
3810 do {
3812 if (!PAGE_CACHELISTS(mnode, bin, mtype))
3813 goto bin_empty_1;
3814 pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
3815 mutex_enter(pcm);
3816 pp = PAGE_CACHELISTS(mnode, bin, mtype);
3817 if (pp == NULL)
3818 goto bin_empty_0;
3820 first_pp = pp;
3821 VERIFY(pp->p_object);
3822 ASSERT(pp->p_vnode);
3823 ASSERT(PP_ISAGED(pp) == 0);
3824 ASSERT(pp->p_szc == 0);
3825 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
3826 while (!page_trylock(pp, SE_EXCL)) {
3827 pp = pp->p_next;
3828 ASSERT(pp->p_szc == 0);
3829 if (pp == first_pp) {
3831 * We have searched the complete list!
3832 * And all of them (might only be one)
3833 * are locked. This can happen since
3834 * these pages can also be found via
3835 * the hash list. When found via the
3836 * hash list, they are locked first,
3837 * then removed. We give up to let the
3838 * other thread run.
3840 pp = NULL;
3841 break;
3843 VERIFY(pp->p_object);
3844 ASSERT(pp->p_vnode);
3845 ASSERT(PP_ISFREE(pp));
3846 ASSERT(PP_ISAGED(pp) == 0);
3847 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) ==
3848 mnode);
3851 if (pp) {
3852 page_t **ppp;
3854 * Found and locked a page.
3855 * Pull it off the list.
3857 ASSERT(mtype == PP_2_MTYPE(pp));
3858 ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
3859 page_sub(ppp, pp);
3861 * Subtract counters before releasing pcm mutex
3862 * to avoid a race with page_freelist_coalesce
3863 * and page_freelist_split.
3865 page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);
3866 mutex_exit(pcm);
3867 VERIFY(pp->p_object);
3868 ASSERT(pp->p_vnode);
3869 ASSERT(PP_ISAGED(pp) == 0);
3870 VM_STAT_ADD(vmm_vmstats.pgmc_allocok);
3871 return (pp);
3873 bin_empty_0:
3874 mutex_exit(pcm);
3875 bin_empty_1:
3876 if (plw_initialized == 0) {
3877 page_list_walk_init(0, flags, bin, 0, 1, &plw);
3878 plw_initialized = 1;
3880 /* calculate the next bin with equivalent color */
3881 bin = ADD_MASKED(bin, plw.plw_bin_step,
3882 plw.plw_ceq_mask[0], plw.plw_color_mask);
3883 } while (sbin != bin);
3885 if (plw.plw_ceq_dif > 1)
3886 bin = page_list_walk_next_bin(0, bin, &plw);
3889 MTYPE_NEXT(mnode, mtype, flags);
3890 if (mtype >= 0)
3891 goto try_again;
3893 VM_STAT_ADD(vmm_vmstats.pgmc_allocfailed);
3894 return (NULL);
3897 #ifdef DEBUG
3898 #define REPL_PAGE_STATS
3899 #endif /* DEBUG */
3901 #ifdef REPL_PAGE_STATS
3902 struct repl_page_stats {
3903 uint_t ngets;
3904 uint_t ngets_noreloc;
3905 uint_t npgr_noreloc;
3906 uint_t nnopage_first;
3907 uint_t nnopage;
3908 uint_t nhashout;
3909 uint_t nnofree;
3910 uint_t nnext_pp;
3911 } repl_page_stats;
3912 #define REPL_STAT_INCR(v) atomic_inc_32(&repl_page_stats.v)
3913 #else /* REPL_PAGE_STATS */
3914 #define REPL_STAT_INCR(v)
3915 #endif /* REPL_PAGE_STATS */
3917 int pgrppgcp;
3920 * The freemem accounting must be done by the caller.
3921 * First we try to get a replacement page of the same size as like_pp;
3922 * if that is not possible, then we just get a set of discontiguous
3923 * PAGESIZE pages.
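/*
 * Illustrative use, modeled on page_claim_contig_pages() above (replpp,
 * targpp and npgs are that caller's locals):
 *
 *	replpp = page_get_replacement_page(pp, NULL, 0);
 *	if (replpp == NULL ||
 *	    do_page_relocate(&targpp, &replpp, 0, &npgs, NULL) != 0)
 *		... back out and free any partial list ...
 *
 * The target page is held SE_EXCL by the caller, which, as noted above,
 * also does the freemem accounting.
 */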
3925 page_t *
3926 page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target,
3927 uint_t pgrflags)
3929 page_t *like_pp;
3930 page_t *pp, *pplist;
3931 page_t *pl = NULL;
3932 ulong_t bin;
3933 int mnode, page_mnode;
3934 int szc;
3935 spgcnt_t npgs, pg_cnt;
3936 pfn_t pfnum;
3937 int mtype;
3938 int flags = 0;
3939 lgrp_mnode_cookie_t lgrp_cookie;
3940 lgrp_t *lgrp;
3942 REPL_STAT_INCR(ngets);
3943 like_pp = orig_like_pp;
3944 ASSERT(PAGE_EXCL(like_pp));
3946 szc = like_pp->p_szc;
3947 npgs = page_get_pagecnt(szc);
3949 * Now we reset like_pp to the base page_t.
3950 * That way, we won't walk past the end of this 'szc' page.
3952 pfnum = PFN_BASE(like_pp->p_pagenum, szc);
3953 like_pp = page_numtopp_nolock(pfnum);
3954 ASSERT(like_pp->p_szc == szc);
3956 if (PP_ISNORELOC(like_pp)) {
3957 ASSERT(kcage_on);
3958 REPL_STAT_INCR(ngets_noreloc);
3959 flags = PGI_RELOCONLY;
3960 } else if (pgrflags & PGR_NORELOC) {
3961 ASSERT(kcage_on);
3962 REPL_STAT_INCR(npgr_noreloc);
3963 flags = PG_NORELOC;
3967 * Kernel pages must always be replaced with the same size
3968 * pages, since we cannot properly handle demotion of kernel
3969 * pages.
3971 if (PP_ISKAS(like_pp))
3972 pgrflags |= PGR_SAMESZC;
3974 MTYPE_PGR_INIT(mtype, flags, like_pp, page_mnode, npgs);
3976 while (npgs) {
3977 pplist = NULL;
3978 for (;;) {
3979 pg_cnt = page_get_pagecnt(szc);
3980 bin = PP_2_BIN(like_pp);
3981 ASSERT(like_pp->p_szc == orig_like_pp->p_szc);
3982 ASSERT(pg_cnt <= npgs);
3985 * If an lgroup was specified, try to get the
3986 * page from that lgroup.
3987 * NOTE: Must be careful with code below because
3988 * lgroup may disappear and reappear since there
3989 * is no locking for lgroup here.
3991 if (LGRP_EXISTS(lgrp_target)) {
3993 * Keep local variable for lgroup separate
3994 * from lgroup argument since this code should
3995 * only be exercised when lgroup argument
3996 * exists....
3998 lgrp = lgrp_target;
4000 /* Try the lgroup's freelists first */
4001 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4002 LGRP_SRCH_LOCAL);
4003 while ((pplist == NULL) &&
4004 (mnode = lgrp_memnode_choose(&lgrp_cookie))
4005 != -1) {
4006 pplist =
4007 page_get_mnode_freelist(mnode, bin,
4008 mtype, szc, flags);
4012 * Now try its cachelists if this is a
4013 * small page. Don't need to do it for
4014 * larger ones since page_freelist_coalesce()
4015 * already failed.
4017 if (pplist != NULL || szc != 0)
4018 break;
4020 /* Now try its cachelists */
4021 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4022 LGRP_SRCH_LOCAL);
4024 while ((pplist == NULL) &&
4025 (mnode = lgrp_memnode_choose(&lgrp_cookie))
4026 != -1) {
4027 pplist =
4028 page_get_mnode_cachelist(bin, flags,
4029 mnode, mtype);
4031 if (pplist != NULL) {
4032 page_hashout(pplist, false);
4033 PP_SETAGED(pplist);
4034 REPL_STAT_INCR(nhashout);
4035 break;
4037 /* Done looking in this lgroup. Bail out. */
4038 break;
4042 * No lgroup was specified (or the lgroup was removed by
4043 * DR), so just try to get the page as close to
4044 * like_pp's mnode as possible.
4045 * First try the local freelist...
4047 mnode = PP_2_MEM_NODE(like_pp);
4048 pplist = page_get_mnode_freelist(mnode, bin,
4049 mtype, szc, flags);
4050 if (pplist != NULL)
4051 break;
4053 REPL_STAT_INCR(nnofree);
4056 * ...then the local cachelist. Don't need to do it for
4057 * larger pages because page_freelist_coalesce() already
4058 * failed there anyway.
4060 if (szc == 0) {
4061 pplist = page_get_mnode_cachelist(bin, flags,
4062 mnode, mtype);
4063 if (pplist != NULL) {
4064 page_hashout(pplist, false);
4065 PP_SETAGED(pplist);
4066 REPL_STAT_INCR(nhashout);
4067 break;
4071 /* Now try remote freelists */
4072 page_mnode = mnode;
4073 lgrp =
4074 lgrp_hand_to_lgrp(MEM_NODE_2_LGRPHAND(page_mnode));
4075 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4076 LGRP_SRCH_HIER);
4077 while (pplist == NULL &&
4078 (mnode = lgrp_memnode_choose(&lgrp_cookie))
4079 != -1) {
4081 * Skip local mnode.
4083 if ((mnode == page_mnode) ||
4084 (mem_node_config[mnode].exists == 0))
4085 continue;
4087 pplist = page_get_mnode_freelist(mnode,
4088 bin, mtype, szc, flags);
4091 if (pplist != NULL)
4092 break;
4095 /* Now try remote cachelists */
4096 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4097 LGRP_SRCH_HIER);
4098 while (pplist == NULL && szc == 0) {
4099 mnode = lgrp_memnode_choose(&lgrp_cookie);
4100 if (mnode == -1)
4101 break;
4103 * Skip local mnode.
4105 if ((mnode == page_mnode) ||
4106 (mem_node_config[mnode].exists == 0))
4107 continue;
4109 pplist = page_get_mnode_cachelist(bin,
4110 flags, mnode, mtype);
4112 if (pplist != NULL) {
4113 page_hashout(pplist, false);
4114 PP_SETAGED(pplist);
4115 REPL_STAT_INCR(nhashout);
4116 break;
4121 * Break out of while loop under the following cases:
4122 * - If we successfully got a page.
4123 * - If pgrflags specified only returning a specific
4124 * page size and we could not find that page size.
4125 * - If we could not satisfy the request with PAGESIZE
4126 * or larger pages.
4128 if (pplist != NULL || szc == 0)
4129 break;
4131 if ((pgrflags & PGR_SAMESZC) || pgrppgcp) {
4132 /* try to find contig page */
4134 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4135 LGRP_SRCH_HIER);
4137 while ((pplist == NULL) &&
4138 (mnode =
4139 lgrp_memnode_choose(&lgrp_cookie))
4140 != -1) {
4141 pplist = page_get_contig_pages(
4142 mnode, bin, mtype, szc,
4143 flags | PGI_PGCPHIPRI);
4145 break;
4149 * The correct thing to do here is try the next
4150 * page size down using szc--. Due to a bug
4151 * with the processing of HAT_RELOAD_SHARE
4152 * where the sfmmu_ttecnt arrays of all
4153 * hats sharing an ISM segment don't get updated,
4154 * using intermediate size pages for relocation
4155 * can lead to continuous page faults.
4157 szc = 0;
4160 if (pplist != NULL) {
4161 DTRACE_PROBE4(page__get,
4162 lgrp_t *, lgrp,
4163 int, mnode,
4164 ulong_t, bin,
4165 uint_t, flags);
4167 while (pplist != NULL && pg_cnt--) {
4168 ASSERT(pplist != NULL);
4169 pp = pplist;
4170 page_sub(&pplist, pp);
4171 PP_CLRFREE(pp);
4172 PP_CLRAGED(pp);
4173 page_list_concat(&pl, &pp);
4174 npgs--;
4175 like_pp = like_pp + 1;
4176 REPL_STAT_INCR(nnext_pp);
4178 ASSERT(pg_cnt == 0);
4179 } else {
4180 break;
4184 if (npgs) {
4186 * We were unable to allocate the necessary number
4187 * of pages.
4188 * We need to free up any pl.
4190 REPL_STAT_INCR(nnopage);
4191 page_free_replacement_page(pl);
4192 return (NULL);
4193 } else {
4194 return (pl);
4199 * demote a free large page to its constituent pages
4201 void
4202 page_demote_free_pages(page_t *pp)
4205 int mnode;
4207 ASSERT(pp != NULL);
4208 ASSERT(PAGE_LOCKED(pp));
4209 ASSERT(PP_ISFREE(pp));
4210 ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
4212 mnode = PP_2_MEM_NODE(pp);
4213 page_freelist_lock(mnode);
4214 if (pp->p_szc != 0) {
4215 (void) page_demote(mnode, PFN_BASE(pp->p_pagenum,
4216 pp->p_szc), 0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
4218 page_freelist_unlock(mnode);
4219 ASSERT(pp->p_szc == 0);
4223 * Factor in colorequiv to check additional 'equivalent' bins.
4224 * colorequiv may be set in /etc/system
4226 void
4227 page_set_colorequiv_arr(void)
4229 if (colorequiv > 1) {
4230 int i;
4231 uint_t sv_a = lowbit(colorequiv) - 1;
4233 if (sv_a > 15)
4234 sv_a = 15;
4236 for (i = 0; i < MMU_PAGE_SIZES; i++) {
4237 uint_t colors;
4238 uint_t a = sv_a;
4240 if ((colors = hw_page_array[i].hp_colors) <= 1) {
4241 continue;
4243 while ((colors >> a) == 0)
4244 a--;
4245 if ((a << 4) > colorequivszc[i]) {
4246 colorequivszc[i] = (a << 4);