4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 * Copyright 2015, Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
25 * Copyright (c) 2018, Joyent, Inc.
28 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
29 /* All Rights Reserved */
32 * University Copyright- Copyright (c) 1982, 1986, 1988
33 * The Regents of the University of California
36 * University Acknowledgment- Portions of this document are derived from
37 * software developed by the University of California, Berkeley, and its
42 * VM - segment management.
45 #include <sys/types.h>
46 #include <sys/inttypes.h>
47 #include <sys/t_lock.h>
48 #include <sys/param.h>
49 #include <sys/systm.h>
51 #include <sys/sysmacros.h>
52 #include <sys/vmsystm.h>
53 #include <sys/tuneable.h>
54 #include <sys/debug.h>
55 #include <sys/fs/swapnode.h>
56 #include <sys/cmn_err.h>
57 #include <sys/callb.h>
58 #include <sys/mem_config.h>
64 #include <vm/seg_kmem.h>
65 #include <vm/seg_spt.h>
66 #include <vm/seg_vn.h>
70 * kstats for segment advise
/*
 * Named kstat records for segment advice: MADV_FREE_hit and MADV_FREE_miss,
 * both of type KSTAT_DATA_ULONG. segadvstat_ptr views the same storage as
 * an array of kstat_named_t, and segadvstat_ndata is the number of
 * kstat_named_t records in it (structure size divided by the record size).
 */
72 segadvstat_t segadvstat
= {
73 { "MADV_FREE_hit", KSTAT_DATA_ULONG
},
74 { "MADV_FREE_miss", KSTAT_DATA_ULONG
},
77 kstat_named_t
*segadvstat_ptr
= (kstat_named_t
*)&segadvstat
;
78 uint_t segadvstat_ndata
= sizeof (segadvstat
) / sizeof (kstat_named_t
);
81 * entry in the segment page cache
/*
 * Members of one pagelock cache entry (the struct header line is not
 * visible in this extract). p_hnext and p_hprev come first so that a hash
 * bucket, which starts with the same two members, can stand in as the
 * sentinel of the circular hash chain. The purge paths in this file also
 * reuse p_hprev as the link of a singly linked list of entries that have
 * been unlinked and are waiting for their reclaim callback.
 */
84 struct seg_pcache
*p_hnext
; /* list for hashed blocks */
85 struct seg_pcache
*p_hprev
; /* previous hashed block; reused as the deleted list link */
86 pcache_link_t p_plink
; /* per segment/amp list */
87 void *p_htag0
; /* segment/amp pointer */
88 caddr_t p_addr
; /* base address/anon_idx */
89 size_t p_len
; /* total bytes */
90 size_t p_wlen
; /* writable bytes at p_addr */
91 struct page
**p_pp
; /* pp shadow list */
92 seg_preclaim_cbfunc_t p_callback
; /* reclaim callback function */
93 clock_t p_lbolt
; /* lbolt from last use */
94 struct seg_phash
*p_hashp
; /* our pcache hash bucket */
95 uint_t p_active
; /* active count */
96 uchar_t p_write
; /* true if S_WRITE */
97 uchar_t p_ref
; /* reference byte */
98 ushort_t p_flags
; /* bit flags */
/*
 * Members of a hash bucket for non wired entries (the struct header line is
 * not visible in this extract). The chain is circular: code in this file
 * casts the bucket pointer to struct seg_pcache * and uses it as the loop
 * terminator, which relies on p_hnext and p_hprev being the first two
 * members here as well. p_halink holds one linkage per active bucket list;
 * a bucket whose linkage pointers are NULL is on neither list.
 */
102 struct seg_pcache
*p_hnext
; /* list for hashed blocks */
103 struct seg_pcache
*p_hprev
;
104 kmutex_t p_hmutex
; /* protects hash bucket */
105 pcache_link_t p_halink
[2]; /* active bucket linkages */
/*
 * Hash bucket for wired entries. It has the same leading members as
 * struct seg_phash but no active list linkages; P_HASHBP() casts a pointer
 * to one of these buckets to struct seg_phash *, so only the shared leading
 * members may be touched through that pointer.
 */
108 struct seg_phash_wired
{
109 struct seg_pcache
*p_hnext
; /* list for hashed blocks */
110 struct seg_pcache
*p_hprev
;
111 kmutex_t p_hmutex
; /* protects hash bucket */
115 * A parameter to control a maximum number of bytes that can be
116 * purged from pcache at a time.
/*
 * Compile time defaults and the tunable globals they seed.
 * P_MAX_APURGE_BYTES (1024 * 1024 * 1024 bytes) initializes
 * segpcache_maxapurge_bytes and P_SHRINK_SHFT (5) initializes
 * segpcache_shrink_shift. Several tunables start at 0.
 * NOTE(review): the code that consumes these values is not visible in this
 * extract; presumably the zero valued ones are computed at initialization
 * unless an administrator sets them - confirm.
 */
118 #define P_MAX_APURGE_BYTES (1024 * 1024 * 1024)
121 * log2(fraction of pcache to reclaim at a time).
123 #define P_SHRINK_SHFT (5)
126 * The following variables can be tuned via /etc/system.
129 int segpcache_enabled
= 1; /* if 1, shadow lists are cached */
130 pgcnt_t segpcache_maxwindow
= 0; /* max # of pages that can be cached */
131 ulong_t segpcache_hashsize_win
= 0; /* # of non wired buckets */
132 ulong_t segpcache_hashsize_wired
= 0; /* # of wired buckets */
133 int segpcache_reap_sec
= 1; /* reap check rate in secs */
134 clock_t segpcache_reap_ticks
= 0; /* reap interval in ticks */
135 int segpcache_pcp_maxage_sec
= 1; /* pcp max age in secs */
136 clock_t segpcache_pcp_maxage_ticks
= 0; /* pcp max age in ticks */
137 int segpcache_shrink_shift
= P_SHRINK_SHFT
; /* log2 reap fraction */
138 pgcnt_t segpcache_maxapurge_bytes
= P_MAX_APURGE_BYTES
; /* max purge bytes */
/*
 * File scope synchronization objects, followed by pragmas that request
 * 64 byte alignment for the three control structures pctrl1, pctrl2 and
 * pctrl3 defined below.
 */
140 static kmutex_t seg_pcache_mtx
; /* protects seg_pdisabled counter */
141 static kmutex_t seg_pasync_mtx
; /* protects async thread scheduling */
142 static kcondvar_t seg_pasync_cv
;
144 #pragma align 64(pctrl1)
145 #pragma align 64(pctrl2)
146 #pragma align 64(pctrl3)
149 * Keep frequently used variables together in one cache line.
/*
 * Control block 1: the disable counter, the window limit, both hash tables
 * with their sizes, and the kmem cache for entries. The rest of the file
 * reaches these members through the seg_pdisabled, seg_pmaxwindow,
 * seg_phashsize_win, seg_phashtab_win, seg_phashsize_wired,
 * seg_phashtab_wired and seg_pkmcache macros.
 */
151 static struct p_ctrl1
{
152 uint_t p_disabled
; /* if not 0, caching temporarily off */
153 pgcnt_t p_maxwin
; /* max # of pages that can be cached */
154 size_t p_hashwin_sz
; /* # of non wired buckets */
155 struct seg_phash
*p_htabwin
; /* hash table for non wired entries */
156 size_t p_hashwired_sz
; /* # of wired buckets */
157 struct seg_phash_wired
*p_htabwired
; /* hash table for wired entries */
158 kmem_cache_t
*p_kmcache
; /* kmem cache for seg_pcache structs */
/*
 * Control block 2: the page counters and the active bucket list state,
 * together with the mutex that guards them. Accessed through the
 * seg_pmem_mtx, seg_plocked_window, seg_plocked, seg_pahcur, seg_pathr_on
 * and seg_pahhead macros. p_ahhead holds the two list heads that the
 * p_halink linkages of the hash buckets are chained onto.
 */
164 static struct p_ctrl2
{
165 kmutex_t p_mem_mtx
; /* protects window counter and p_halinks */
166 pgcnt_t p_locked_win
; /* # pages from window */
167 pgcnt_t p_locked
; /* # of pages cached by pagelock */
168 uchar_t p_ahcur
; /* current active links for insert/delete */
169 uchar_t p_athr_on
; /* async reclaim thread is running. */
170 pcache_link_t p_ahhead
[2]; /* active buckets linkages */
/*
 * Control block 3: parameters and statistics used by the asynchronous purge
 * code. Accessed through the seg_pmax_pcpage, seg_pathr_empty_ahb,
 * seg_pathr_full_ahb, seg_pmaxapurge_npages and seg_pshrink_shift macros.
 */
173 static struct p_ctrl3
{
174 clock_t p_pcp_maxage
; /* max pcp age in ticks */
175 ulong_t p_athr_empty_ahb
; /* athread walk stats */
176 ulong_t p_athr_full_ahb
; /* athread walk stats */
177 pgcnt_t p_maxapurge_npages
; /* max pages to purge at a time */
178 int p_shrink_shft
; /* reap shift factor */
/*
 * Shorthand names for the members of pctrl1, pctrl2 and pctrl3, followed by
 * the hash masks. P_HASHWIN_MASK and P_HASHWIRED_MASK are (table size - 1)
 * and are ANDed with a hash value to pick a bucket.
 * NOTE(review): masking with (size - 1) only covers the whole table when
 * the size is a power of two; the code that sets the sizes is not visible
 * here - confirm. P_BASESHIFT is the right shift applied to the tag when
 * hashing wired entries (see P_HASHBP).
 */
184 #define seg_pdisabled pctrl1.p_disabled
185 #define seg_pmaxwindow pctrl1.p_maxwin
186 #define seg_phashsize_win pctrl1.p_hashwin_sz
187 #define seg_phashtab_win pctrl1.p_htabwin
188 #define seg_phashsize_wired pctrl1.p_hashwired_sz
189 #define seg_phashtab_wired pctrl1.p_htabwired
190 #define seg_pkmcache pctrl1.p_kmcache
191 #define seg_pmem_mtx pctrl2.p_mem_mtx
192 #define seg_plocked_window pctrl2.p_locked_win
193 #define seg_plocked pctrl2.p_locked
194 #define seg_pahcur pctrl2.p_ahcur
195 #define seg_pathr_on pctrl2.p_athr_on
196 #define seg_pahhead pctrl2.p_ahhead
197 #define seg_pmax_pcpage pctrl3.p_pcp_maxage
198 #define seg_pathr_empty_ahb pctrl3.p_athr_empty_ahb
199 #define seg_pathr_full_ahb pctrl3.p_athr_full_ahb
200 #define seg_pshrink_shift pctrl3.p_shrink_shft
201 #define seg_pmaxapurge_npages pctrl3.p_maxapurge_npages
203 #define P_HASHWIN_MASK (seg_phashsize_win - 1)
204 #define P_HASHWIRED_MASK (seg_phashsize_wired - 1)
205 #define P_BASESHIFT (6)
/*
 * seg_pasync_thr is a kthread pointer that is not assigned on any line
 * visible in this extract (presumably the asynchronous purge thread -
 * confirm). segvn_ops and segspt_shmops are ops vectors defined elsewhere.
 *
 * IS_PFLAGS_WIRED(flags) is non zero when SEGP_FORCE_WIRED is set in flags;
 * IS_PCP_WIRED(pcp) applies the same test to the p_flags of an entry.
 * LBOLT_DELTA(t) is the number of ticks between t and the current
 * ddi_get_lbolt() value, converted to ulong_t; PCP_AGE(pcp) is that delta
 * measured from the p_lbolt stamp of the entry.
 */
207 kthread_t
*seg_pasync_thr
;
209 extern const struct seg_ops segvn_ops
;
210 extern const struct seg_ops segspt_shmops
;
212 #define IS_PFLAGS_WIRED(flags) ((flags) & SEGP_FORCE_WIRED)
213 #define IS_PCP_WIRED(pcp) IS_PFLAGS_WIRED((pcp)->p_flags)
215 #define LBOLT_DELTA(t) ((ulong_t)(ddi_get_lbolt() - (t)))
217 #define PCP_AGE(pcp) LBOLT_DELTA((pcp)->p_lbolt)
220 * htag0 argument can be a seg or amp pointer.
/*
 * P_HASHBP(seg, htag0, addr, flags) - select the hash bucket for a request.
 *
 *   seg   - segment pointer; only read (for s_szc) on the non wired path
 *           when SEGP_PSHIFT is not set in flags.
 *   htag0 - lookup tag, a seg or amp pointer.
 *   addr  - start address of the request (non wired hashing only).
 *   flags - SEGP_* request flags.
 *
 * Wired requests index seg_phashtab_wired by the tag shifted right by
 * P_BASESHIFT, and the bucket is returned cast to struct seg_phash *.
 * Non wired requests index seg_phashtab_win by (tag >> 3) XORed with the
 * address shifted right by a page shift. That shift is taken from bits 16
 * and up of flags when SEGP_PSHIFT is set, and from
 * page_get_shift(seg->s_szc) otherwise.
 *
 * Every use of flags is parenthesized so that an expression argument such
 * as (a | b) is evaluated as a whole before the mask and shift are applied.
 */
#define	P_HASHBP(seg, htag0, addr, flags) \
	(IS_PFLAGS_WIRED((flags)) ? \
	((struct seg_phash *)&seg_phashtab_wired[P_HASHWIRED_MASK & \
	((uintptr_t)(htag0) >> P_BASESHIFT)]) : \
	(&seg_phashtab_win[P_HASHWIN_MASK & \
	(((uintptr_t)(htag0) >> 3) ^ \
	((uintptr_t)(addr) >> (((flags) & SEGP_PSHIFT) ? \
	((flags) >> 16) : page_get_shift((seg)->s_szc))))]))
232 * htag0 argument can be a seg or amp pointer.
/*
 * P_MATCH(pcp, htag0, addr, len) is true when entry pcp has the same tag
 * and start address as the request and covers at least len bytes
 * (p_len >= len), so a longer cached range satisfies a shorter request.
 * P_MATCH_PP() applies the same test and additionally requires the entry
 * to hold exactly the shadow list pointer pp.
 */
234 #define P_MATCH(pcp, htag0, addr, len) \
235 ((pcp)->p_htag0 == (htag0) && \
236 (pcp)->p_addr == (addr) && \
237 (pcp)->p_len >= (len))
239 #define P_MATCH_PP(pcp, htag0, addr, len, pp) \
240 ((pcp)->p_pp == (pp) && \
241 (pcp)->p_htag0 == (htag0) && \
242 (pcp)->p_addr == (addr) && \
243 (pcp)->p_len >= (len))
/*
 * Container lookups. plink2pcache(pl) converts a pointer to the p_plink
 * member back to the enclosing struct seg_pcache, and hlink2phash(hl, l)
 * converts a pointer to p_halink[l] back to the enclosing struct seg_phash.
 * Both subtract the member offset from the address.
 */
245 #define plink2pcache(pl) ((struct seg_pcache *)((uintptr_t)(pl) - \
246 offsetof(struct seg_pcache, p_plink)))
248 #define hlink2phash(hl, l) ((struct seg_phash *)((uintptr_t)(hl) - \
249 offsetof(struct seg_phash, p_halink[l])))
252 * seg_padd_abuck()/seg_premove_abuck() link and unlink hash buckets from
253 * active hash bucket lists. We maintain active bucket lists to reduce the
254 * overhead of finding active buckets during asynchronous purging since there
255 * can be 10s of millions of buckets on a large system but only a small subset
256 * of them in actual use.
258 * There are two active bucket lists. The current active list (as per seg_pahcur) is
259 * used by seg_pinsert()/seg_pinactive()/seg_ppurge() to add and delete
260 * buckets. The other list is used by asynchronous purge thread. This allows
261 * the purge thread to walk its active list without holding seg_pmem_mtx for a
262 * long time. When the asynchronous thread is done with its list it switches to
263 * the current active list and makes the list it just finished processing the
264 * new current active list.
266 * seg_padd_abuck() only adds the bucket to current list if the bucket is not
267 * yet on any list. seg_premove_abuck() may remove the bucket from either
268 * list. If the bucket is on current list it will be always removed. Otherwise
269 * the bucket is only removed if asynchronous purge thread is not currently
270 * running or seg_premove_abuck() is called by asynchronous purge thread
271 * itself. A given bucket can only be on one of active lists at a time. These
272 * routines should be called with per bucket lock held. The routines use
273 * seg_pmem_mtx to protect list updates. seg_padd_abuck() must be called after
274 * the first entry is added to the bucket chain and seg_premove_abuck() must
275 * be called after the last pcp entry is deleted from its chain. Per bucket
276 * lock should be held by the callers. This avoids a potential race condition
277 * when seg_premove_abuck() removes a bucket after pcp entries are added to
278 * its list after the caller checked that the bucket has no entries. (this
279 * race would cause a loss of an active bucket from the active lists).
281 * Both lists are circular doubly linked lists anchored at seg_pahhead heads.
282 * New entries are added to the end of the list since LRU is used as the
/*
 * seg_padd_abuck: put hash bucket hp on the current active bucket list.
 *
 * hp - non wired hash bucket. The asserts require that the caller holds
 *      hp->p_hmutex, that hp lies inside seg_phashtab_win, and that the
 *      bucket chain holds exactly one entry, which is not wired.
 *
 * Under seg_pmem_mtx the routine inspects p_halink[lix] and then
 * p_halink[!lix]. If either linkage is already set the bucket is on an
 * active list and the mutex is dropped without changing anything.
 * Otherwise the bucket is appended at the tail of the circular list
 * anchored at seg_pahhead[lix], that is, linked in just before the head.
 *
 * NOTE(review): the declaration and assignment of lix and the return
 * statements of the two early exits are not visible in this extract;
 * presumably lix is loaded from seg_pahcur after the mutex is taken -
 * confirm against the complete source.
 */
286 seg_padd_abuck(struct seg_phash
*hp
)
290 ASSERT(MUTEX_HELD(&hp
->p_hmutex
));
291 ASSERT((struct seg_phash
*)hp
->p_hnext
!= hp
);
292 ASSERT((struct seg_phash
*)hp
->p_hprev
!= hp
);
293 ASSERT(hp
->p_hnext
== hp
->p_hprev
);
294 ASSERT(!IS_PCP_WIRED(hp
->p_hnext
));
295 ASSERT(hp
->p_hnext
->p_hnext
== (struct seg_pcache
*)hp
);
296 ASSERT(hp
->p_hprev
->p_hprev
== (struct seg_pcache
*)hp
);
297 ASSERT(hp
>= seg_phashtab_win
&&
298 hp
< &seg_phashtab_win
[seg_phashsize_win
]);
301 * This bucket can already be on one of active lists
302 * since seg_premove_abuck() may have failed to remove it
305 mutex_enter(&seg_pmem_mtx
);
307 ASSERT(lix
>= 0 && lix
<= 1);
308 if (hp
->p_halink
[lix
].p_lnext
!= NULL
) {
309 ASSERT(hp
->p_halink
[lix
].p_lprev
!= NULL
);
310 ASSERT(hp
->p_halink
[!lix
].p_lnext
== NULL
);
311 ASSERT(hp
->p_halink
[!lix
].p_lprev
== NULL
);
312 mutex_exit(&seg_pmem_mtx
);
315 ASSERT(hp
->p_halink
[lix
].p_lprev
== NULL
);
318 * If this bucket is still on list !lix async thread can't yet remove
319 * it since we hold here per bucket lock. In this case just return
320 * since async thread will eventually find and process this bucket.
322 if (hp
->p_halink
[!lix
].p_lnext
!= NULL
) {
323 ASSERT(hp
->p_halink
[!lix
].p_lprev
!= NULL
);
324 mutex_exit(&seg_pmem_mtx
);
327 ASSERT(hp
->p_halink
[!lix
].p_lprev
== NULL
);
329 * This bucket is not on any active bucket list yet.
330 * Add the bucket to the tail of current active list.
332 hp
->p_halink
[lix
].p_lnext
= &seg_pahhead
[lix
];
333 hp
->p_halink
[lix
].p_lprev
= seg_pahhead
[lix
].p_lprev
;
334 seg_pahhead
[lix
].p_lprev
->p_lnext
= &hp
->p_halink
[lix
];
335 seg_pahhead
[lix
].p_lprev
= &hp
->p_halink
[lix
];
336 mutex_exit(&seg_pmem_mtx
);
/*
 * seg_premove_abuck: unlink hash bucket hp from an active bucket list.
 *
 * hp   - non wired bucket inside seg_phashtab_win. The asserts require that
 *        the caller holds hp->p_hmutex and that the chain is empty (p_hnext
 *        and p_hprev point back at hp).
 * athr - callers in this file pass 1 from seg_ppurge_async() and 0 from
 *        seg_pinactive().
 *
 * Three unlink sequences are visible:
 *  1. one that follows ASSERT(seg_pathr_on) and runs before seg_pmem_mtx is
 *     taken; per the existing comment this is the asynchronous thread
 *     removing the bucket from the list that is not current;
 *  2. one under seg_pmem_mtx, taken when p_halink[lix] is linked;
 *  3. a final one under seg_pmem_mtx after lix has been asserted to be
 *     linked again, which implies lix was switched to the other list.
 * Each sequence splices the two neighbours together and then resets both
 * linkage pointers of the bucket to NULL, which is the state
 * seg_padd_abuck() tests to decide that a bucket is on no list.
 *
 * NOTE(review): the branch conditions (the test of athr, the assignments to
 * lix, and any test of seg_pathr_on guarding the final unlink) are not
 * visible in this extract; confirm them against the complete source.
 */
340 seg_premove_abuck(struct seg_phash
*hp
, int athr
)
344 ASSERT(MUTEX_HELD(&hp
->p_hmutex
));
345 ASSERT((struct seg_phash
*)hp
->p_hnext
== hp
);
346 ASSERT((struct seg_phash
*)hp
->p_hprev
== hp
);
347 ASSERT(hp
>= seg_phashtab_win
&&
348 hp
< &seg_phashtab_win
[seg_phashsize_win
]);
351 ASSERT(seg_pathr_on
);
352 ASSERT(seg_pahcur
<= 1);
354 * We are called by asynchronous thread that found this bucket
355 * on not currently active (i.e. !seg_pahcur) list. Remove it
356 * from there. Per bucket lock we are holding makes sure
357 * seg_pinsert() can't sneak in and add pcp entries to this
358 * bucket right before we remove the bucket from its list.
361 ASSERT(hp
->p_halink
[lix
].p_lnext
!= NULL
);
362 ASSERT(hp
->p_halink
[lix
].p_lprev
!= NULL
);
363 ASSERT(hp
->p_halink
[!lix
].p_lnext
== NULL
);
364 ASSERT(hp
->p_halink
[!lix
].p_lprev
== NULL
);
365 hp
->p_halink
[lix
].p_lnext
->p_lprev
= hp
->p_halink
[lix
].p_lprev
;
366 hp
->p_halink
[lix
].p_lprev
->p_lnext
= hp
->p_halink
[lix
].p_lnext
;
367 hp
->p_halink
[lix
].p_lnext
= NULL
;
368 hp
->p_halink
[lix
].p_lprev
= NULL
;
372 mutex_enter(&seg_pmem_mtx
);
374 ASSERT(lix
>= 0 && lix
<= 1);
377 * If the bucket is on currently active list just remove it from
380 if (hp
->p_halink
[lix
].p_lnext
!= NULL
) {
381 ASSERT(hp
->p_halink
[lix
].p_lprev
!= NULL
);
382 ASSERT(hp
->p_halink
[!lix
].p_lnext
== NULL
);
383 ASSERT(hp
->p_halink
[!lix
].p_lprev
== NULL
);
384 hp
->p_halink
[lix
].p_lnext
->p_lprev
= hp
->p_halink
[lix
].p_lprev
;
385 hp
->p_halink
[lix
].p_lprev
->p_lnext
= hp
->p_halink
[lix
].p_lnext
;
386 hp
->p_halink
[lix
].p_lnext
= NULL
;
387 hp
->p_halink
[lix
].p_lprev
= NULL
;
388 mutex_exit(&seg_pmem_mtx
);
391 ASSERT(hp
->p_halink
[lix
].p_lprev
== NULL
);
394 * If asynchronous thread is not running we can remove the bucket from
395 * not currently active list. The bucket must be on this list since we
396 * already checked that it's not on the other list and the bucket from
397 * which we just deleted the last pcp entry must be still on one of the
398 * active bucket lists.
401 ASSERT(hp
->p_halink
[lix
].p_lnext
!= NULL
);
402 ASSERT(hp
->p_halink
[lix
].p_lprev
!= NULL
);
405 hp
->p_halink
[lix
].p_lnext
->p_lprev
= hp
->p_halink
[lix
].p_lprev
;
406 hp
->p_halink
[lix
].p_lprev
->p_lnext
= hp
->p_halink
[lix
].p_lnext
;
407 hp
->p_halink
[lix
].p_lnext
= NULL
;
408 hp
->p_halink
[lix
].p_lprev
= NULL
;
410 mutex_exit(&seg_pmem_mtx
);
414 * Check if bucket pointed by hp already has a pcp entry that matches request
415 * htag0, addr and len. Set *found to 1 if match is found and to 0 otherwise.
416 * Also delete matching entries that cover smaller address range but start
417 * at the same address as addr argument. Return the list of deleted entries if
418 * any. This is an internal helper function called from seg_pinsert() only
419 * for non wired shadow lists. The caller already holds a per seg/amp list
/*
 * seg_plookup_checkdup: scan bucket hp for entries that start at addr for
 * tag htag0.
 *
 * hp    - hash bucket to scan; hp->p_hmutex must be held (asserted).
 * htag0 - seg or amp pointer used as the lookup tag.
 * addr  - start address of the request.
 * len   - length of the request in bytes.
 * found - out parameter. Per the existing comment it is set to 1 when a
 *         match is found and to 0 otherwise; the stores themselves are not
 *         visible in this extract.
 *
 * Entries with the same tag and address are asserted to be non wired. One
 * whose p_len is smaller than len is unlinked from the per seg/amp list
 * (p_plink) and from the hash chain, and its p_hprev member is then reused
 * to chain it onto the list of removed entries.
 *
 * Returns delcallb_list, the head of the removed entries list, which is
 * NULL when nothing was removed.
 *
 * NOTE(review): the handling of active entries and of entries with
 * p_len >= len is not visible here - confirm against the complete source.
 */
422 static struct seg_pcache
*
423 seg_plookup_checkdup(struct seg_phash
*hp
, void *htag0
,
424 caddr_t addr
, size_t len
, int *found
)
426 struct seg_pcache
*pcp
;
427 struct seg_pcache
*delcallb_list
= NULL
;
429 ASSERT(MUTEX_HELD(&hp
->p_hmutex
));
432 for (pcp
= hp
->p_hnext
; pcp
!= (struct seg_pcache
*)hp
;
433 pcp
= pcp
->p_hnext
) {
434 ASSERT(pcp
->p_hashp
== hp
);
435 if (pcp
->p_htag0
== htag0
&& pcp
->p_addr
== addr
) {
436 ASSERT(!IS_PCP_WIRED(pcp
));
437 if (pcp
->p_len
< len
) {
438 pcache_link_t
*plinkp
;
442 plinkp
= &pcp
->p_plink
;
443 plinkp
->p_lprev
->p_lnext
= plinkp
->p_lnext
;
444 plinkp
->p_lnext
->p_lprev
= plinkp
->p_lprev
;
445 pcp
->p_hprev
->p_hnext
= pcp
->p_hnext
;
446 pcp
->p_hnext
->p_hprev
= pcp
->p_hprev
;
447 pcp
->p_hprev
= delcallb_list
;
455 return (delcallb_list
);
459 * lookup an address range in pagelock cache. Return shadow list and bump up
460 * active count. If amp is not NULL use amp as a lookup tag otherwise use seg
/*
 * seg_plookup: look up the range starting at addr in the pagelock cache.
 *
 * seg, amp - amp is the lookup tag when it is not NULL, otherwise seg is.
 * addr/len - range being looked up.
 * rw       - S_READ or S_WRITE (asserted).
 * flags    - SEGP_* flags; they select the wired or non wired hash table
 *            through P_HASHBP().
 *
 * The bucket chain is walked under hp->p_hmutex with P_MATCH(), so an entry
 * matches when its tag and address are equal to the request and its p_len
 * is at least len. Three tests are visible on a matching entry: a write
 * request with p_wlen < len, an active count already at UINT_MAX, and a
 * write request against an entry whose p_write is clear. The bucket lock is
 * released on both visible exit paths.
 *
 * NOTE(review): the return type, the return statements and the body of
 * each of those three tests are not visible in this extract. The comment
 * above the function says the shadow list is returned and the active count
 * is bumped - confirm against the complete source.
 */
464 seg_plookup(struct seg
*seg
, struct anon_map
*amp
, caddr_t addr
, size_t len
,
465 enum seg_rw rw
, uint_t flags
)
467 struct seg_pcache
*pcp
;
468 struct seg_phash
*hp
;
472 ASSERT(rw
== S_READ
|| rw
== S_WRITE
);
475 * Skip pagelock cache, while DR is in progress or
481 ASSERT(seg_phashsize_win
!= 0);
483 htag0
= (amp
== NULL
? (void *)seg
: (void *)amp
);
484 hp
= P_HASHBP(seg
, htag0
, addr
, flags
);
485 mutex_enter(&hp
->p_hmutex
);
486 for (pcp
= hp
->p_hnext
; pcp
!= (struct seg_pcache
*)hp
;
487 pcp
= pcp
->p_hnext
) {
488 ASSERT(pcp
->p_hashp
== hp
);
489 if (P_MATCH(pcp
, htag0
, addr
, len
)) {
490 ASSERT(IS_PFLAGS_WIRED(flags
) == IS_PCP_WIRED(pcp
));
492 * If this request wants to write pages
493 * but write permissions starting from
494 * addr don't cover the entire length len
495 * return lookup failure back to the caller.
496 * It will check protections and fail this
497 * pagelock operation with EACCESS error.
499 if (rw
== S_WRITE
&& pcp
->p_wlen
< len
) {
502 if (pcp
->p_active
== UINT_MAX
) {
506 if (rw
== S_WRITE
&& !pcp
->p_write
) {
509 mutex_exit(&hp
->p_hmutex
);
513 mutex_exit(&hp
->p_hmutex
);
518 * mark address range inactive. If the cache is off or the address range is
519 * not in the cache or another shadow list that covers bigger range is found
520 * we call the segment driver to reclaim the pages. Otherwise just decrement
521 * active count and set ref bit. If amp is not NULL use amp as a lookup tag
522 * otherwise use seg as a lookup tag.
/*
 * seg_pinactive: drop one active reference on a cached shadow list.
 *
 * seg, amp - amp is the lookup tag when it is not NULL, otherwise seg is.
 * addr/len - range that was locked.
 * pp       - shadow list; an entry is treated as the one owned by the
 *            caller only when P_MATCH_PP() also matches this pointer.
 * rw       - S_READ or S_WRITE (asserted).
 * flags    - SEGP_* flags (wired or non wired request).
 * callback - reclaim function, invoked as
 *            (*callback)(htag0, addr, len, pp, rw, 0) near the end of the
 *            function.
 *
 * Behaviour that can be read from the visible lines:
 *  - a test of seg_phashsize_win == 0 skips the lookup when the cache is
 *    not configured;
 *  - for a non wired request made while seg_pdisabled is set, the per
 *    seg/amp list head is selected before the bucket lock is taken;
 *  - an entry matching P_MATCH_PP() whose p_active is 0 is unlinked from
 *    the hash chain (and from the per seg/amp list) and freed to
 *    seg_pkmcache, provided pmtx is set or caching is disabled for a wired
 *    request; seg_premove_abuck(hp, 0) is called when that leaves a non
 *    wired bucket empty;
 *  - otherwise an inactive non wired entry has p_lbolt refreshed so that it
 *    is not reclaimed immediately;
 *  - a non wired entry that only satisfies P_MATCH() is a duplicate; pmtx
 *    is then taken with mutex_tryenter() because it ranks above the bucket
 *    lock, and on failure the bucket lock is dropped and taken again;
 *  - finally seg_plocked, and seg_plocked_window for non wired requests,
 *    are reduced by npages under seg_pmem_mtx.
 *
 * NOTE(review): this extract is missing many lines, including the decrement
 * of p_active, every assignment to pmtx and npages, the stores to p_ref and
 * the labels and jumps that connect these steps. Two lines read
 * pheadp = &->a_phead, which looks like the identifier amp was lost during
 * extraction. Confirm all of the above against the complete source.
 */
525 seg_pinactive(struct seg
*seg
, struct anon_map
*amp
, caddr_t addr
,
526 size_t len
, struct page
**pp
, enum seg_rw rw
, uint_t flags
,
527 seg_preclaim_cbfunc_t callback
)
529 struct seg_pcache
*pcp
;
530 struct seg_phash
*hp
;
531 kmutex_t
*pmtx
= NULL
;
532 pcache_link_t
*pheadp
;
538 ASSERT(rw
== S_READ
|| rw
== S_WRITE
);
540 htag0
= (amp
== NULL
? (void *)seg
: (void *)amp
);
543 * Skip lookup if pcache is not configured.
545 if (seg_phashsize_win
== 0) {
550 * Grab per seg/amp lock before hash lock if we are going to remove
551 * inactive entry from pcache.
553 if (!IS_PFLAGS_WIRED(flags
) && seg_pdisabled
) {
555 pheadp
= &seg
->s_phead
;
558 pheadp
= &
->a_phead
;
564 hp
= P_HASHBP(seg
, htag0
, addr
, flags
);
565 mutex_enter(&hp
->p_hmutex
);
567 for (pcp
= hp
->p_hnext
; pcp
!= (struct seg_pcache
*)hp
;
568 pcp
= pcp
->p_hnext
) {
569 ASSERT(pcp
->p_hashp
== hp
);
570 if (P_MATCH_PP(pcp
, htag0
, addr
, len
, pp
)) {
571 ASSERT(IS_PFLAGS_WIRED(flags
) == IS_PCP_WIRED(pcp
));
572 ASSERT(pcp
->p_active
);
575 * Don't remove this pcp entry
576 * if we didn't find duplicate
577 * shadow lists on second search.
578 * Somebody removed those duplicates
579 * since we dropped hash lock after first
582 ASSERT(pmtx
!= NULL
);
583 ASSERT(!IS_PFLAGS_WIRED(flags
));
588 if (pcp
->p_active
== 0 && (pmtx
!= NULL
||
589 (seg_pdisabled
&& IS_PFLAGS_WIRED(flags
)))) {
592 * This entry is no longer active. Remove it
593 * now either because pcaching is temporarily
594 * disabled or there're other pcp entries that
595 * can match this pagelock request (i.e. this
596 * entry is a duplicate).
599 ASSERT(callback
== pcp
->p_callback
);
601 pcache_link_t
*plinkp
= &pcp
->p_plink
;
602 ASSERT(!IS_PCP_WIRED(pcp
));
603 ASSERT(pheadp
->p_lnext
!= pheadp
);
604 ASSERT(pheadp
->p_lprev
!= pheadp
);
605 plinkp
->p_lprev
->p_lnext
=
607 plinkp
->p_lnext
->p_lprev
=
610 pcp
->p_hprev
->p_hnext
= pcp
->p_hnext
;
611 pcp
->p_hnext
->p_hprev
= pcp
->p_hprev
;
612 if (!IS_PCP_WIRED(pcp
) &&
613 hp
->p_hnext
== (struct seg_pcache
*)hp
) {
615 * We removed the last entry from this
616 * bucket. Now remove the bucket from
619 seg_premove_abuck(hp
, 0);
621 mutex_exit(&hp
->p_hmutex
);
627 if (rw
!= S_WRITE
&& pcp
->p_write
) {
630 kmem_cache_free(seg_pkmcache
, pcp
);
634 * We found a matching pcp entry but will not
635 * free it right away even if it's no longer
638 if (!pcp
->p_active
&& !IS_PCP_WIRED(pcp
)) {
640 * Set the reference bit and mark the
641 * time of last access to this pcp
642 * so that asynchronous thread doesn't
643 * free it immediately since
644 * it may be reactivated very soon.
646 pcp
->p_lbolt
= ddi_get_lbolt();
649 mutex_exit(&hp
->p_hmutex
);
655 } else if (!IS_PFLAGS_WIRED(flags
) &&
656 P_MATCH(pcp
, htag0
, addr
, len
)) {
658 * This is a duplicate pcp entry. This situation may
659 * happen if a bigger shadow list that covers our
660 * range was added while our entry was still active.
661 * Now we can free our pcp entry if it becomes
664 if (!pcp
->p_active
) {
666 * Mark this entry as referenced just in case
667 * we'll free our own pcp entry soon.
669 pcp
->p_lbolt
= ddi_get_lbolt();
674 * we are already holding pmtx and found a
675 * duplicate. Don't keep our own pcp entry.
681 * We have to use mutex_tryenter to attempt to lock
682 * seg/amp list lock since we already hold hash lock
683 * and seg/amp list lock is above hash lock in lock
684 * order. If mutex_tryenter fails drop hash lock and
685 * retake both locks in correct order and research
690 pheadp
= &seg
->s_phead
;
693 pheadp
= &
->a_phead
;
696 if (!mutex_tryenter(pmtx
)) {
697 mutex_exit(&hp
->p_hmutex
);
699 mutex_enter(&hp
->p_hmutex
);
701 * If we don't find bigger shadow list on
702 * second search (it may happen since we
703 * dropped bucket lock) keep the entry that
704 * matches our own shadow list.
711 mutex_exit(&hp
->p_hmutex
);
716 (*callback
)(htag0
, addr
, len
, pp
, rw
, 0);
718 mutex_enter(&seg_pmem_mtx
);
719 ASSERT(seg_plocked
>= npages
);
720 seg_plocked
-= npages
;
721 if (!IS_PFLAGS_WIRED(flags
)) {
722 ASSERT(seg_plocked_window
>= npages
);
723 seg_plocked_window
-= npages
;
725 mutex_exit(&seg_pmem_mtx
);
/*
 * Fault injection knob for seg_pinsert_check(): when non zero, the check
 * tests gethrtime() % p_insert_chk_mtbf and takes a special path when the
 * remainder is 0. Defaults to 0, which disables the test.
 * NOTE(review): the body of that path and any conditional compilation
 * around this variable are not visible in this extract.
 */
731 static uint32_t p_insert_chk_mtbf
= 0;
735 * The seg_pinsert_check() is used by segment drivers to predict whether
736 * a call to seg_pinsert will fail and thereby avoid wasteful pre-processing.
/*
 * seg_pinsert_check: predict whether a later seg_pinsert() for this request
 * would be accepted.
 *
 * len   - request length in bytes; converted to pages with btop().
 * flags - SEGP_* flags; a wired request returns SEGP_SUCCESS immediately.
 * seg, amp and addr are not referenced on any visible line.
 *
 * Visible checks, in order: the optional fault injection test driven by
 * p_insert_chk_mtbf; the wired shortcut; the window limit
 * (seg_plocked_window + btop(len) > seg_pmaxwindow); and memory pressure
 * (freemem < desfree). When no check fires the function returns
 * SEGP_SUCCESS. No lock is taken on any visible line, so the counters are
 * sampled without seg_pmem_mtx and the answer is only a prediction.
 *
 * NOTE(review): the values returned when a check fires are not visible in
 * this extract (presumably SEGP_FAIL) - confirm.
 */
740 seg_pinsert_check(struct seg
*seg
, struct anon_map
*amp
, caddr_t addr
,
741 size_t len
, uint_t flags
)
746 if (p_insert_chk_mtbf
&& !(gethrtime() % p_insert_chk_mtbf
)) {
754 ASSERT(seg_phashsize_win
!= 0);
756 if (IS_PFLAGS_WIRED(flags
)) {
757 return (SEGP_SUCCESS
);
760 if (seg_plocked_window
+ btop(len
) > seg_pmaxwindow
) {
764 if (freemem
< desfree
) {
768 return (SEGP_SUCCESS
);
/*
 * Fault injection knob for seg_pinsert(): when non zero, the insert path
 * tests gethrtime() % p_insert_mtbf and takes a special path when the
 * remainder is 0. Defaults to 0, which disables the test.
 * NOTE(review): the body of that path and any conditional compilation
 * around this variable are not visible in this extract.
 */
772 static uint32_t p_insert_mtbf
= 0;
776 * Insert address range with shadow list into pagelock cache if there's no
777 * shadow list already cached for this address range. If the cache is off or
778 * caching is temporarily disabled or the allowed 'window' is exceeded return
779 * SEGP_FAIL. Otherwise return SEGP_SUCCESS.
781 * For non wired shadow lists (segvn case) include address in the hashing
782 * function to avoid linking all the entries from the same segment or amp on
783 * the same bucket. amp is used instead of seg if amp is not NULL. Non wired
784 * pcache entries are also linked on a per segment/amp list so that all
785 * entries can be found quickly during seg/amp purge without walking the
786 * entire pcache hash table. For wired shadow lists (segspt case) we
787 * don't use address hashing and per segment linking because the caller
788 * currently inserts only one entry per segment that covers the entire
789 * segment. If we used per segment linking even for segspt it would complicate
790 * seg_ppurge_wiredpp() locking.
792 * Both hash bucket and per seg/amp locks need to be held before adding a non
793 * wired entry to hash and per seg/amp lists. per seg/amp lock should be taken
796 * This function will also remove from pcache old inactive shadow lists that
797 * overlap with this request but cover smaller range for the same start
/*
 * seg_pinsert: add a shadow list for the range starting at addr to the
 * pagelock cache.
 *
 * seg, amp - one visible path stores seg as the entry tag; the other
 *            stores amp and sets SEGP_AMP in p_flags.
 * addr/len - range covered; len must be a multiple of the page size
 *            (asserted through PAGEOFFSET).
 * wlen     - writable bytes starting at addr; asserted equal to len for
 *            S_WRITE requests and for amp based entries, and no larger
 *            than len for S_READ requests.
 * pp       - shadow list.
 * rw       - S_READ or S_WRITE; recorded in p_write.
 * flags    - SEGP_* flags; only the low 16 bits are stored in p_flags.
 * callback - reclaim function stored in p_callback.
 *
 * Behaviour that can be read from the visible lines:
 *  - accounting: under seg_pmem_mtx a non wired request that would push
 *    seg_plocked_window past seg_pmaxwindow leaves early; otherwise
 *    seg_plocked_window (non wired only) and seg_plocked grow by npages;
 *  - the entry is allocated from seg_pkmcache with KM_SLEEP;
 *  - for non wired entries seg_plookup_checkdup() runs under the bucket
 *    lock; one visible path then undoes the accounting and frees the new
 *    entry, the other links the entry at the head of the per seg/amp list;
 *  - the entry is linked at the front of the hash chain, and the test
 *    hp->p_hprev == pcp recognises that it is the only entry of a non
 *    wired bucket;
 *  - after the bucket lock is dropped, entries handed back by
 *    seg_plookup_checkdup() have their callbacks run with a last argument
 *    of 0 and are freed, and their pages are subtracted from both counters
 *    under seg_pmem_mtx.
 * Returns SEGP_SUCCESS on the final visible path.
 *
 * NOTE(review): this extract is missing many lines, among them the
 * computation of npages, the stores to the remaining entry members, the
 * store that completes the hash chain insert, the call that would follow
 * the only entry test, all per seg/amp lock operations and the failure
 * returns. Lines of the form pheadp = &->a_phead look like the identifier
 * amp was lost during extraction. Confirm against the complete source.
 */
801 seg_pinsert(struct seg
*seg
, struct anon_map
*amp
, caddr_t addr
, size_t len
,
802 size_t wlen
, struct page
**pp
, enum seg_rw rw
, uint_t flags
,
803 seg_preclaim_cbfunc_t callback
)
805 struct seg_pcache
*pcp
;
806 struct seg_phash
*hp
;
808 pcache_link_t
*pheadp
;
810 struct seg_pcache
*delcallb_list
= NULL
;
813 ASSERT(rw
== S_READ
|| rw
== S_WRITE
);
814 ASSERT(rw
== S_READ
|| wlen
== len
);
815 ASSERT(rw
== S_WRITE
|| wlen
<= len
);
816 ASSERT(amp
== NULL
|| wlen
== len
);
819 if (p_insert_mtbf
&& !(gethrtime() % p_insert_mtbf
)) {
827 ASSERT(seg_phashsize_win
!= 0);
829 ASSERT((len
& PAGEOFFSET
) == 0);
831 mutex_enter(&seg_pmem_mtx
);
832 if (!IS_PFLAGS_WIRED(flags
)) {
833 if (seg_plocked_window
+ npages
> seg_pmaxwindow
) {
834 mutex_exit(&seg_pmem_mtx
);
837 seg_plocked_window
+= npages
;
839 seg_plocked
+= npages
;
840 mutex_exit(&seg_pmem_mtx
);
842 pcp
= kmem_cache_alloc(seg_pkmcache
, KM_SLEEP
);
844 * If amp is not NULL set htag0 to amp otherwise set it to seg.
847 pcp
->p_htag0
= (void *)seg
;
848 pcp
->p_flags
= flags
& 0xffff;
850 pcp
->p_htag0
= (void *)amp
;
851 pcp
->p_flags
= (flags
& 0xffff) | SEGP_AMP
;
857 pcp
->p_write
= (rw
== S_WRITE
);
858 pcp
->p_callback
= callback
;
861 hp
= P_HASHBP(seg
, pcp
->p_htag0
, addr
, flags
);
862 if (!IS_PFLAGS_WIRED(flags
)) {
866 pheadp
= &seg
->s_phead
;
870 pheadp
= &
->a_phead
;
875 mutex_enter(&hp
->p_hmutex
);
876 delcallb_list
= seg_plookup_checkdup(hp
, htag0
, addr
,
879 mutex_exit(&hp
->p_hmutex
);
881 mutex_enter(&seg_pmem_mtx
);
882 seg_plocked
-= npages
;
883 seg_plocked_window
-= npages
;
884 mutex_exit(&seg_pmem_mtx
);
885 kmem_cache_free(seg_pkmcache
, pcp
);
888 pcp
->p_plink
.p_lnext
= pheadp
->p_lnext
;
889 pcp
->p_plink
.p_lprev
= pheadp
;
890 pheadp
->p_lnext
->p_lprev
= &pcp
->p_plink
;
891 pheadp
->p_lnext
= &pcp
->p_plink
;
893 mutex_enter(&hp
->p_hmutex
);
896 pcp
->p_hnext
= hp
->p_hnext
;
897 pcp
->p_hprev
= (struct seg_pcache
*)hp
;
898 hp
->p_hnext
->p_hprev
= pcp
;
900 if (!IS_PFLAGS_WIRED(flags
) &&
901 hp
->p_hprev
== pcp
) {
904 mutex_exit(&hp
->p_hmutex
);
905 if (!IS_PFLAGS_WIRED(flags
)) {
911 while (delcallb_list
!= NULL
) {
913 delcallb_list
= pcp
->p_hprev
;
914 ASSERT(!IS_PCP_WIRED(pcp
) && !pcp
->p_active
);
915 (void) (*pcp
->p_callback
)(pcp
->p_htag0
, pcp
->p_addr
,
916 pcp
->p_len
, pcp
->p_pp
, pcp
->p_write
? S_WRITE
: S_READ
, 0);
917 npages
+= btop(pcp
->p_len
);
918 kmem_cache_free(seg_pkmcache
, pcp
);
921 ASSERT(!IS_PFLAGS_WIRED(flags
));
922 mutex_enter(&seg_pmem_mtx
);
923 ASSERT(seg_plocked
>= npages
);
924 ASSERT(seg_plocked_window
>= npages
);
925 seg_plocked
-= npages
;
926 seg_plocked_window
-= npages
;
927 mutex_exit(&seg_pmem_mtx
);
930 return (SEGP_SUCCESS
);
934 * purge entries from the pagelock cache if not active
935 * and not recently used.
/*
 * seg_ppurge_async: reclaim inactive entries from the pagelock cache.
 *
 * force - selects between the two modes tested throughout the function.
 *         With force clear the early exit also fires when
 *         seg_plocked_window is 0, and entries whose p_ref is set and
 *         whose PCP_AGE() is below seg_pmax_pcpage get separate handling.
 *
 * Behaviour that can be read from the visible lines:
 *  - sizing: free memory (freemem against lotsfree + needfree, desfree and
 *    fractions of lotsfree) and window usage (seg_plocked_window against
 *    fractions of availrmem_initial and 7/8 of seg_pmaxwindow) feed the
 *    lowmem and trim decisions; npgs_to_purge starts from
 *    seg_plocked_window shifted right and is capped with MIN() against
 *    seg_pmaxapurge_npages or MAX(seg_pmaxapurge_npages, desfree);
 *  - wired table: each non empty wired bucket is walked under its lock and
 *    entries that reach the unlink code are pushed on delcallb_list
 *    through p_hprev;
 *  - active buckets: the list at seg_pahhead[hlix] is walked, counting
 *    empty and full buckets; in a full bucket each candidate needs its per
 *    seg/amp mutex, taken with mutex_tryenter(), before it is unlinked
 *    from both lists and pushed on delcallb_list, and npgs_purged grows by
 *    btop(p_len); seg_premove_abuck(hp, 1) is called when a bucket ends
 *    up empty;
 *  - stop tests: npgs_purged reaching seg_plocked_window or
 *    npgs_to_purge, and, when not trimming, a free memory check on every
 *    16th full bucket (seg_pathr_full_ahb & 15);
 *  - if the walk reached the list head, seg_pmem_mtx is taken and
 *    seg_pahcur is asserted to be !hlix; otherwise the list head is
 *    spliced in again directly before hlinkp so the next walk starts
 *    there;
 *  - finally the callbacks of all collected entries run with a last
 *    argument of 1, the entries are freed to seg_pkmcache, and seg_plocked
 *    and seg_plocked_window are reduced under seg_pmem_mtx.
 *
 * NOTE(review): this extract is missing many lines, among them the
 * declarations and assignments of npages, hlix, lowmem and trim, the
 * stores to seg_pathr_on and seg_pahcur, the bodies of most tests
 * (continue, break, return) and all labels and jumps. Confirm against the
 * complete source.
 */
938 seg_ppurge_async(int force
)
940 struct seg_pcache
*delcallb_list
= NULL
;
941 struct seg_pcache
*pcp
;
942 struct seg_phash
*hp
;
944 pgcnt_t npages_window
= 0;
945 pgcnt_t npgs_to_purge
;
946 pgcnt_t npgs_purged
= 0;
949 pcache_link_t
*hlinkp
;
950 pcache_link_t
*hlnextp
= NULL
;
954 ASSERT(seg_phashsize_win
!= 0);
957 * if the cache is off or empty, return
959 if (seg_plocked
== 0 || (!force
&& seg_plocked_window
== 0)) {
966 if (freemem
< lotsfree
+ needfree
) {
967 spgcnt_t fmem
= MAX((spgcnt_t
)(freemem
- needfree
), 0);
968 if (fmem
<= 5 * (desfree
>> 2)) {
970 } else if (fmem
<= 7 * (lotsfree
>> 3)) {
971 if (seg_plocked_window
>=
972 (availrmem_initial
>> 1)) {
975 } else if (fmem
< lotsfree
) {
976 if (seg_plocked_window
>=
977 3 * (availrmem_initial
>> 2)) {
982 if (seg_plocked_window
>= 7 * (seg_pmaxwindow
>> 3)) {
985 if (!lowmem
&& !trim
) {
988 npgs_to_purge
= seg_plocked_window
>>
991 npgs_to_purge
= MIN(npgs_to_purge
,
992 MAX(seg_pmaxapurge_npages
, desfree
));
994 npgs_to_purge
= MIN(npgs_to_purge
,
995 seg_pmaxapurge_npages
);
997 if (npgs_to_purge
== 0) {
1001 struct seg_phash_wired
*hpw
;
1003 ASSERT(seg_phashsize_wired
!= 0);
1005 for (hpw
= seg_phashtab_wired
;
1006 hpw
< &seg_phashtab_wired
[seg_phashsize_wired
]; hpw
++) {
1008 if (hpw
->p_hnext
== (struct seg_pcache
*)hpw
) {
1012 mutex_enter(&hpw
->p_hmutex
);
1014 for (pcp
= hpw
->p_hnext
;
1015 pcp
!= (struct seg_pcache
*)hpw
;
1016 pcp
= pcp
->p_hnext
) {
1018 ASSERT(IS_PCP_WIRED(pcp
));
1019 ASSERT(pcp
->p_hashp
==
1020 (struct seg_phash
*)hpw
);
1022 if (pcp
->p_active
) {
1025 pcp
->p_hprev
->p_hnext
= pcp
->p_hnext
;
1026 pcp
->p_hnext
->p_hprev
= pcp
->p_hprev
;
1027 pcp
->p_hprev
= delcallb_list
;
1028 delcallb_list
= pcp
;
1030 mutex_exit(&hpw
->p_hmutex
);
1034 mutex_enter(&seg_pmem_mtx
);
1036 mutex_exit(&seg_pmem_mtx
);
1040 mutex_exit(&seg_pmem_mtx
);
1041 ASSERT(seg_pahcur
<= 1);
1045 for (hlinkp
= seg_pahhead
[hlix
].p_lnext
; hlinkp
!= &seg_pahhead
[hlix
];
1048 hlnextp
= hlinkp
->p_lnext
;
1049 ASSERT(hlnextp
!= NULL
);
1051 hp
= hlink2phash(hlinkp
, hlix
);
1052 if (hp
->p_hnext
== (struct seg_pcache
*)hp
) {
1053 seg_pathr_empty_ahb
++;
1056 seg_pathr_full_ahb
++;
1057 mutex_enter(&hp
->p_hmutex
);
1059 for (pcp
= hp
->p_hnext
; pcp
!= (struct seg_pcache
*)hp
;
1060 pcp
= pcp
->p_hnext
) {
1061 pcache_link_t
*pheadp
;
1062 pcache_link_t
*plinkp
;
1066 ASSERT(!IS_PCP_WIRED(pcp
));
1067 ASSERT(pcp
->p_hashp
== hp
);
1069 if (pcp
->p_active
) {
1072 if (!force
&& pcp
->p_ref
&&
1073 PCP_AGE(pcp
) < seg_pmax_pcpage
) {
1077 plinkp
= &pcp
->p_plink
;
1078 htag0
= pcp
->p_htag0
;
1079 if (pcp
->p_flags
& SEGP_AMP
) {
1080 pheadp
= &((amp_t
*)htag0
)->a_phead
;
1081 pmtx
= &((amp_t
*)htag0
)->a_pmtx
;
1083 pheadp
= &((seg_t
*)htag0
)->s_phead
;
1084 pmtx
= &((seg_t
*)htag0
)->s_pmtx
;
1086 if (!mutex_tryenter(pmtx
)) {
1089 ASSERT(pheadp
->p_lnext
!= pheadp
);
1090 ASSERT(pheadp
->p_lprev
!= pheadp
);
1091 plinkp
->p_lprev
->p_lnext
=
1093 plinkp
->p_lnext
->p_lprev
=
1095 pcp
->p_hprev
->p_hnext
= pcp
->p_hnext
;
1096 pcp
->p_hnext
->p_hprev
= pcp
->p_hprev
;
1098 pcp
->p_hprev
= delcallb_list
;
1099 delcallb_list
= pcp
;
1100 npgs_purged
+= btop(pcp
->p_len
);
1102 if (hp
->p_hnext
== (struct seg_pcache
*)hp
) {
1103 seg_premove_abuck(hp
, 1);
1105 mutex_exit(&hp
->p_hmutex
);
1106 if (npgs_purged
>= seg_plocked_window
) {
1110 if (npgs_purged
>= npgs_to_purge
) {
1113 if (!trim
&& !(seg_pathr_full_ahb
& 15)) {
1115 if (freemem
>= lotsfree
+ needfree
) {
1122 if (hlinkp
== &seg_pahhead
[hlix
]) {
1124 * We processed the entire hlix active bucket list
1125 * but didn't find enough pages to reclaim.
1126 * Switch the lists and walk the other list
1127 * if we haven't done it yet.
1129 mutex_enter(&seg_pmem_mtx
);
1130 ASSERT(seg_pathr_on
);
1131 ASSERT(seg_pahcur
== !hlix
);
1133 mutex_exit(&seg_pmem_mtx
);
1138 } else if ((hlinkp
= hlnextp
) != &seg_pahhead
[hlix
] &&
1139 seg_pahhead
[hlix
].p_lnext
!= hlinkp
) {
1140 ASSERT(hlinkp
!= NULL
);
1141 ASSERT(hlinkp
->p_lprev
!= &seg_pahhead
[hlix
]);
1142 ASSERT(seg_pahhead
[hlix
].p_lnext
!= &seg_pahhead
[hlix
]);
1143 ASSERT(seg_pahhead
[hlix
].p_lprev
!= &seg_pahhead
[hlix
]);
1146 * Reinsert the header to point to hlinkp
1147 * so that we start from hlinkp bucket next time around.
1149 seg_pahhead
[hlix
].p_lnext
->p_lprev
= seg_pahhead
[hlix
].p_lprev
;
1150 seg_pahhead
[hlix
].p_lprev
->p_lnext
= seg_pahhead
[hlix
].p_lnext
;
1151 seg_pahhead
[hlix
].p_lnext
= hlinkp
;
1152 seg_pahhead
[hlix
].p_lprev
= hlinkp
->p_lprev
;
1153 hlinkp
->p_lprev
->p_lnext
= &seg_pahhead
[hlix
];
1154 hlinkp
->p_lprev
= &seg_pahhead
[hlix
];
1157 mutex_enter(&seg_pmem_mtx
);
1158 ASSERT(seg_pathr_on
);
1160 mutex_exit(&seg_pmem_mtx
);
1164 * Run the delayed callback list. segments/amps can't go away until
1165 * callback is executed since they must have non 0 softlockcnt. That's
1166 * why we don't need to hold as/seg/amp locks to execute the callback.
1168 while (delcallb_list
!= NULL
) {
1169 pcp
= delcallb_list
;
1170 delcallb_list
= pcp
->p_hprev
;
1171 ASSERT(!pcp
->p_active
);
1172 (void) (*pcp
->p_callback
)(pcp
->p_htag0
, pcp
->p_addr
,
1173 pcp
->p_len
, pcp
->p_pp
, pcp
->p_write
? S_WRITE
: S_READ
, 1);
1174 npages
+= btop(pcp
->p_len
);
1175 if (!IS_PCP_WIRED(pcp
)) {
1176 npages_window
+= btop(pcp
->p_len
);
1178 kmem_cache_free(seg_pkmcache
, pcp
);
1181 mutex_enter(&seg_pmem_mtx
);
1182 ASSERT(seg_plocked
>= npages
);
1183 ASSERT(seg_plocked_window
>= npages_window
);
1184 seg_plocked
-= npages
;
1185 seg_plocked_window
-= npages_window
;
1186 mutex_exit(&seg_pmem_mtx
);
1191 * Remove cached pages for segment(s) entries from hashtable. The segments
1192 * are identified by pp array. This is useful for multiple seg's cached on
1193 * behalf of dummy segment (ISM/DISM) with common pp array.
1196 seg_ppurge_wiredpp(struct page
**pp
)
1198 struct seg_pcache
*pcp
;
1199 struct seg_phash_wired
*hp
;
1201 struct seg_pcache
*delcallb_list
= NULL
;
1204 * if the cache is empty, return
1206 if (seg_plocked
== 0) {
1209 ASSERT(seg_phashsize_wired
!= 0);
1211 for (hp
= seg_phashtab_wired
;
1212 hp
< &seg_phashtab_wired
[seg_phashsize_wired
]; hp
++) {
1213 if (hp
->p_hnext
== (struct seg_pcache
*)hp
) {
1216 mutex_enter(&hp
->p_hmutex
);
1218 while (pcp
!= (struct seg_pcache
*)hp
) {
1219 ASSERT(pcp
->p_hashp
== (struct seg_phash
*)hp
);
1220 ASSERT(IS_PCP_WIRED(pcp
));
1222 * purge entries which are not active
1224 if (!pcp
->p_active
&& pcp
->p_pp
== pp
) {
1225 ASSERT(pcp
->p_htag0
!= NULL
);
1226 pcp
->p_hprev
->p_hnext
= pcp
->p_hnext
;
1227 pcp
->p_hnext
->p_hprev
= pcp
->p_hprev
;
1228 pcp
->p_hprev
= delcallb_list
;
1229 delcallb_list
= pcp
;
1233 mutex_exit(&hp
->p_hmutex
);
1235 * segments can't go away until callback is executed since
1236 * they must have non 0 softlockcnt. That's why we don't
1237 * need to hold as/seg locks to execute the callback.
1239 while (delcallb_list
!= NULL
) {
1241 pcp
= delcallb_list
;
1242 delcallb_list
= pcp
->p_hprev
;
1243 ASSERT(!pcp
->p_active
);
1244 done
= (*pcp
->p_callback
)(pcp
->p_htag0
, pcp
->p_addr
,
1245 pcp
->p_len
, pcp
->p_pp
,
1246 pcp
->p_write
? S_WRITE
: S_READ
, 1);
1247 npages
+= btop(pcp
->p_len
);
1248 ASSERT(IS_PCP_WIRED(pcp
));
1249 kmem_cache_free(seg_pkmcache
, pcp
);
1251 ASSERT(delcallb_list
== NULL
);
1258 mutex_enter(&seg_pmem_mtx
);
1259 ASSERT(seg_plocked
>= npages
);
1260 seg_plocked
-= npages
;
1261 mutex_exit(&seg_pmem_mtx
);
1265 * purge all entries for a given segment. Since we
1266 * callback into the segment driver directly for page
1267 * reclaim the caller needs to hold the right locks.
1270 seg_ppurge(struct seg
*seg
, struct anon_map
*amp
, uint_t flags
)
1272 struct seg_pcache
*delcallb_list
= NULL
;
1273 struct seg_pcache
*pcp
;
1274 struct seg_phash
*hp
;
1278 if (seg_plocked
== 0) {
1281 ASSERT(seg_phashsize_win
!= 0);
1284 * If amp is not NULL use amp as a lookup tag otherwise use seg
1287 htag0
= (amp
== NULL
? (void *)seg
: (void *)amp
);
1288 ASSERT(htag0
!= NULL
);
1289 if (IS_PFLAGS_WIRED(flags
)) {
1290 hp
= P_HASHBP(seg
, htag0
, 0, flags
);
1291 mutex_enter(&hp
->p_hmutex
);
1293 while (pcp
!= (struct seg_pcache
*)hp
) {
1294 ASSERT(pcp
->p_hashp
== hp
);
1295 ASSERT(IS_PCP_WIRED(pcp
));
1296 if (pcp
->p_htag0
== htag0
) {
1297 if (pcp
->p_active
) {
1300 pcp
->p_hprev
->p_hnext
= pcp
->p_hnext
;
1301 pcp
->p_hnext
->p_hprev
= pcp
->p_hprev
;
1302 pcp
->p_hprev
= delcallb_list
;
1303 delcallb_list
= pcp
;
1307 mutex_exit(&hp
->p_hmutex
);
1309 pcache_link_t
*plinkp
;
1310 pcache_link_t
*pheadp
;
1314 ASSERT(seg
!= NULL
);
1315 pheadp
= &seg
->s_phead
;
1316 pmtx
= &seg
->s_pmtx
;
1318 pheadp
= &
->a_phead
;
1319 pmtx
= &
->a_pmtx
;
1322 while ((plinkp
= pheadp
->p_lnext
) != pheadp
) {
1323 pcp
= plink2pcache(plinkp
);
1324 ASSERT(!IS_PCP_WIRED(pcp
));
1325 ASSERT(pcp
->p_htag0
== htag0
);
1327 mutex_enter(&hp
->p_hmutex
);
1328 if (pcp
->p_active
) {
1329 mutex_exit(&hp
->p_hmutex
);
1332 ASSERT(plinkp
->p_lprev
== pheadp
);
1333 pheadp
->p_lnext
= plinkp
->p_lnext
;
1334 plinkp
->p_lnext
->p_lprev
= pheadp
;
1335 pcp
->p_hprev
->p_hnext
= pcp
->p_hnext
;
1336 pcp
->p_hnext
->p_hprev
= pcp
->p_hprev
;
1337 pcp
->p_hprev
= delcallb_list
;
1338 delcallb_list
= pcp
;
1339 if (hp
->p_hnext
== (struct seg_pcache
*)hp
) {
1340 seg_premove_abuck(hp
, 0);
1342 mutex_exit(&hp
->p_hmutex
);
1346 while (delcallb_list
!= NULL
) {
1347 pcp
= delcallb_list
;
1348 delcallb_list
= pcp
->p_hprev
;
1349 ASSERT(!pcp
->p_active
);
1350 (void) (*pcp
->p_callback
)(pcp
->p_htag0
, pcp
->p_addr
, pcp
->p_len
,
1351 pcp
->p_pp
, pcp
->p_write
? S_WRITE
: S_READ
, 0);
1352 npages
+= btop(pcp
->p_len
);
1353 kmem_cache_free(seg_pkmcache
, pcp
);
1355 mutex_enter(&seg_pmem_mtx
);
1356 ASSERT(seg_plocked
>= npages
);
1357 seg_plocked
-= npages
;
1358 if (!IS_PFLAGS_WIRED(flags
)) {
1359 ASSERT(seg_plocked_window
>= npages
);
1360 seg_plocked_window
-= npages
;
1362 mutex_exit(&seg_pmem_mtx
);
1365 static void seg_pinit_mem_config(void);
1368 * setup the pagelock cache
1373 struct seg_phash
*hp
;
1378 seg_plocked_window
= 0;
1380 if (segpcache_enabled
== 0) {
1381 seg_phashsize_win
= 0;
1382 seg_phashsize_wired
= 0;
1388 seg_pkmcache
= kmem_cache_create("seg_pcache",
1389 sizeof (struct seg_pcache
), 0, NULL
, NULL
, NULL
, NULL
, NULL
, 0);
1390 if (segpcache_pcp_maxage_ticks
<= 0) {
1391 segpcache_pcp_maxage_ticks
= segpcache_pcp_maxage_sec
* hz
;
1393 seg_pmax_pcpage
= segpcache_pcp_maxage_ticks
;
1394 seg_pathr_empty_ahb
= 0;
1395 seg_pathr_full_ahb
= 0;
1396 seg_pshrink_shift
= segpcache_shrink_shift
;
1397 seg_pmaxapurge_npages
= btop(segpcache_maxapurge_bytes
);
1399 mutex_init(&seg_pcache_mtx
, NULL
, MUTEX_DEFAULT
, NULL
);
1400 mutex_init(&seg_pmem_mtx
, NULL
, MUTEX_DEFAULT
, NULL
);
1401 mutex_init(&seg_pasync_mtx
, NULL
, MUTEX_DEFAULT
, NULL
);
1402 cv_init(&seg_pasync_cv
, NULL
, CV_DEFAULT
, NULL
);
1404 physmegs
= physmem
>> (20 - PAGESHIFT
);
1407 * If segpcache_hashsize_win was not set in /etc/system or it has
1408 * absurd value set it to a default.
1410 if (segpcache_hashsize_win
== 0 || segpcache_hashsize_win
> physmem
) {
1412 * Create one bucket per 32K (or at least per 8 pages) of
1415 pgcnt_t pages_per_bucket
= MAX(btop(32 * 1024), 8);
1416 segpcache_hashsize_win
= MAX(1024, physmem
/ pages_per_bucket
);
1418 if (!ISP2(segpcache_hashsize_win
)) {
1419 ulong_t rndfac
= ~(1UL <<
1420 (highbit(segpcache_hashsize_win
) - 1));
1421 rndfac
&= segpcache_hashsize_win
;
1422 segpcache_hashsize_win
+= rndfac
;
1423 segpcache_hashsize_win
= 1 <<
1424 (highbit(segpcache_hashsize_win
) - 1);
1426 seg_phashsize_win
= segpcache_hashsize_win
;
1427 seg_phashtab_win
= kmem_zalloc(
1428 seg_phashsize_win
* sizeof (struct seg_phash
),
1430 for (i
= 0; i
< seg_phashsize_win
; i
++) {
1431 hp
= &seg_phashtab_win
[i
];
1432 hp
->p_hnext
= (struct seg_pcache
*)hp
;
1433 hp
->p_hprev
= (struct seg_pcache
*)hp
;
1434 mutex_init(&hp
->p_hmutex
, NULL
, MUTEX_DEFAULT
, NULL
);
1439 seg_pahhead
[0].p_lnext
= &seg_pahhead
[0];
1440 seg_pahhead
[0].p_lprev
= &seg_pahhead
[0];
1441 seg_pahhead
[1].p_lnext
= &seg_pahhead
[1];
1442 seg_pahhead
[1].p_lprev
= &seg_pahhead
[1];
1445 * If segpcache_hashsize_wired was not set in /etc/system or it has
1446 * absurd value set it to a default.
1448 if (segpcache_hashsize_wired
== 0 ||
1449 segpcache_hashsize_wired
> physmem
/ 4) {
1451 * Choose segpcache_hashsize_wired based on physmem.
1452 * Create a bucket per 128K bytes upto 256K buckets.
1454 if (physmegs
< 20 * 1024) {
1455 segpcache_hashsize_wired
= MAX(1024, physmegs
<< 3);
1457 segpcache_hashsize_wired
= 256 * 1024;
1460 if (!ISP2(segpcache_hashsize_wired
)) {
1461 segpcache_hashsize_wired
= 1 <<
1462 highbit(segpcache_hashsize_wired
);
1464 seg_phashsize_wired
= segpcache_hashsize_wired
;
1465 seg_phashtab_wired
= kmem_zalloc(
1466 seg_phashsize_wired
* sizeof (struct seg_phash_wired
), KM_SLEEP
);
1467 for (i
= 0; i
< seg_phashsize_wired
; i
++) {
1468 hp
= (struct seg_phash
*)&seg_phashtab_wired
[i
];
1469 hp
->p_hnext
= (struct seg_pcache
*)hp
;
1470 hp
->p_hprev
= (struct seg_pcache
*)hp
;
1471 mutex_init(&hp
->p_hmutex
, NULL
, MUTEX_DEFAULT
, NULL
);
1474 if (segpcache_maxwindow
== 0) {
1475 if (physmegs
< 64) {
1477 segpcache_maxwindow
= availrmem
>> 5;
1478 } else if (physmegs
< 512) {
1480 segpcache_maxwindow
= availrmem
>> 3;
1481 } else if (physmegs
< 1024) {
1483 segpcache_maxwindow
= availrmem
>> 2;
1484 } else if (physmegs
< 2048) {
1486 segpcache_maxwindow
= availrmem
>> 1;
1489 segpcache_maxwindow
= (pgcnt_t
)-1;
1492 seg_pmaxwindow
= segpcache_maxwindow
;
1493 seg_pinit_mem_config();
1497 * called by pageout if memory is low
1503 * if the cache is off or empty, return
1505 if (seg_plocked_window
== 0) {
1508 ASSERT(seg_phashsize_win
!= 0);
1511 * If somebody is already purging pcache
1514 if (seg_pdisabled
) {
1518 cv_signal(&seg_pasync_cv
);
1522 * run as a backgroud thread and reclaim pagelock
1523 * pages which have not been used recently
1526 seg_pasync_thread(void)
1528 callb_cpr_t cpr_info
;
1530 if (seg_phashsize_win
== 0) {
1535 seg_pasync_thr
= curthread
;
1537 CALLB_CPR_INIT(&cpr_info
, &seg_pasync_mtx
,
1538 callb_generic_cpr
, "seg_pasync");
1540 if (segpcache_reap_ticks
<= 0) {
1541 segpcache_reap_ticks
= segpcache_reap_sec
* hz
;
1544 mutex_enter(&seg_pasync_mtx
);
1546 CALLB_CPR_SAFE_BEGIN(&cpr_info
);
1547 (void) cv_reltimedwait(&seg_pasync_cv
, &seg_pasync_mtx
,
1548 segpcache_reap_ticks
, TR_CLOCK_TICK
);
1549 CALLB_CPR_SAFE_END(&cpr_info
, &seg_pasync_mtx
);
1550 if (seg_pdisabled
== 0) {
1551 seg_ppurge_async(0);
1556 static struct kmem_cache
*seg_cache
;
1559 * Initialize segment management data structures.
1566 seg_cache
= kmem_cache_create("seg_cache", sizeof (struct seg
),
1567 0, NULL
, NULL
, NULL
, NULL
, NULL
, 0);
1569 ksp
= kstat_create("unix", 0, "segadvstat", "vm", KSTAT_TYPE_NAMED
,
1570 segadvstat_ndata
, KSTAT_FLAG_VIRTUAL
);
1572 ksp
->ks_data
= (void *)segadvstat_ptr
;
1580 * Allocate a segment to cover [base, base+size]
1581 * and attach it to the specified address space.
1584 seg_alloc(struct as
*as
, caddr_t base
, size_t size
)
1590 segbase
= (caddr_t
)((uintptr_t)base
& (uintptr_t)PAGEMASK
);
1591 segsize
= (((uintptr_t)(base
+ size
) + PAGEOFFSET
) & PAGEMASK
) -
1594 if (!valid_va_range(&segbase
, &segsize
, segsize
, AH_LO
))
1595 return (NULL
); /* bad virtual addr range */
1598 valid_usr_range(segbase
, segsize
, 0, as
,
1599 as
->a_userlimit
) != RANGE_OKAY
)
1600 return (NULL
); /* bad virtual addr range */
1602 new = kmem_cache_alloc(seg_cache
, KM_SLEEP
);
1607 mutex_init(&new->s_pmtx
, NULL
, MUTEX_DEFAULT
, NULL
);
1608 new->s_phead
.p_lnext
= &new->s_phead
;
1609 new->s_phead
.p_lprev
= &new->s_phead
;
1610 if (seg_attach(as
, segbase
, segsize
, new) < 0) {
1611 kmem_cache_free(seg_cache
, new);
1614 /* caller must fill in ops, data */
1619 * Attach a segment to the address space. Used by seg_alloc()
1620 * and for kernel startup to attach to static segments.
1623 seg_attach(struct as
*as
, caddr_t base
, size_t size
, struct seg
*seg
)
1630 * as_addseg() will add the segment at the appropraite point
1631 * in the list. It will return -1 if there is overlap with
1632 * an already existing segment.
1634 return (as_addseg(as
, seg
));
1638 * Unmap a segment and free it from its associated address space.
1639 * This should be called by anybody who's finished with a whole segment's
1640 * mapping. Just calls segop_unmap() on the whole mapping . It is the
1641 * responsibility of the segment driver to unlink the the segment
1642 * from the address space, and to free public and private data structures
1643 * associated with the segment. (This is typically done by a call to
1647 seg_unmap(struct seg
*seg
)
1653 ASSERT(seg
->s_as
&& AS_WRITE_HELD(seg
->s_as
));
1655 /* Shouldn't have called seg_unmap if mapping isn't yet established */
1656 ASSERT(seg
->s_data
!= NULL
);
1658 /* Unmap the whole mapping */
1660 ret
= segop_unmap(seg
, seg
->s_base
, seg
->s_size
);
1663 (void) segop_unmap(seg
, seg
->s_base
, seg
->s_size
);
1668 * Free the segment from its associated as. This should only be called
1669 * if a mapping to the segment has not yet been established (e.g., if
1670 * an error occurs in the middle of doing an as_map when the segment
1671 * has already been partially set up) or if it has already been deleted
1672 * (e.g., from a segment driver unmap routine if the unmap applies to the
1673 * entire segment). If the mapping is currently set up then seg_unmap() should
1674 * be called instead.
1677 seg_free(struct seg
*seg
)
1679 register struct as
*as
= seg
->s_as
;
1680 struct seg
*tseg
= as_removeseg(as
, seg
);
1682 ASSERT(tseg
== seg
);
1685 * If the segment private data field is NULL,
1686 * then segment driver is not attached yet.
1688 if (seg
->s_data
!= NULL
)
1691 mutex_destroy(&seg
->s_pmtx
);
1692 ASSERT(seg
->s_phead
.p_lnext
== &seg
->s_phead
);
1693 ASSERT(seg
->s_phead
.p_lprev
== &seg
->s_phead
);
1694 kmem_cache_free(seg_cache
, seg
);
1699 seg_p_mem_config_post_add(
1701 pgcnt_t delta_pages
)
1703 /* Nothing to do. */
1709 mutex_enter(&seg_pcache_mtx
);
1710 ASSERT(seg_pdisabled
!= 0);
1712 mutex_exit(&seg_pcache_mtx
);
1716 * seg_p_disable - disables seg_pcache, and then attempts to empty the
1718 * Returns SEGP_SUCCESS if the cache was successfully emptied, or
1719 * SEGP_FAIL if the cache could not be emptied.
1724 pgcnt_t old_plocked
;
1725 int stall_count
= 0;
1727 mutex_enter(&seg_pcache_mtx
);
1729 ASSERT(seg_pdisabled
!= 0);
1730 mutex_exit(&seg_pcache_mtx
);
1733 * Attempt to empty the cache. Terminate if seg_plocked does not
1734 * diminish with SEGP_STALL_THRESHOLD consecutive attempts.
1736 while (seg_plocked
!= 0) {
1737 ASSERT(seg_phashsize_win
!= 0);
1738 old_plocked
= seg_plocked
;
1739 seg_ppurge_async(1);
1740 if (seg_plocked
== old_plocked
) {
1741 if (stall_count
++ > SEGP_STALL_THRESHOLD
) {
1746 if (seg_plocked
!= 0)
1747 delay(hz
/SEGP_PREDEL_DELAY_FACTOR
);
1749 return (SEGP_SUCCESS
);
1753 * Attempt to purge seg_pcache. May need to return before this has
1754 * completed to allow other pre_del callbacks to unlock pages. This is
1756 * 1) The seg_pdisabled flag has been set so at least we won't
1757 * cache anymore locks and the locks we couldn't purge
1758 * will not be held if they do get released by a subsequent
1759 * pre-delete callback.
1761 * 2) The rest of the memory delete thread processing does not
1762 * depend on the changes made in this pre-delete callback. No
1763 * panics will result, the worst that will happen is that the
1764 * DR code will timeout and cancel the delete.
1768 seg_p_mem_config_pre_del(
1770 pgcnt_t delta_pages
)
1772 if (seg_phashsize_win
== 0) {
1775 if (seg_p_disable() != SEGP_SUCCESS
)
1777 "!Pre-delete couldn't purge"" pagelock cache - continuing");
1783 seg_p_mem_config_post_del(
1785 pgcnt_t delta_pages
,
1788 if (seg_phashsize_win
== 0) {
1794 static kphysm_setup_vector_t seg_p_mem_config_vec
= {
1795 KPHYSM_SETUP_VECTOR_VERSION
,
1796 seg_p_mem_config_post_add
,
1797 seg_p_mem_config_pre_del
,
1798 seg_p_mem_config_post_del
,
1802 seg_pinit_mem_config(void)
1806 ret
= kphysm_setup_func_register(&seg_p_mem_config_vec
, NULL
);
1808 * Want to catch this in the debug kernel. At run time, if the
1809 * callbacks don't get run all will be OK as the disable just makes
1810 * it more likely that the pages can be collected.
1816 * Verify that segment is not a shared anonymous segment which reserves
1817 * swap. zone.max-swap accounting (zone->zone_max_swap) cannot be transfered
1818 * from one zone to another if any segments are shared. This is because the
1819 * last process to exit will credit the swap reservation. This could lead
1820 * to the swap being reserved by one zone, and credited to another.
1823 seg_can_change_zones(struct seg
*seg
)
1825 struct segvn_data
*svd
;
1827 if (seg
->s_ops
== &segspt_shmops
)
1830 if (seg
->s_ops
== &segvn_ops
) {
1831 svd
= (struct segvn_data
*)seg
->s_data
;
1832 if (svd
->type
== MAP_SHARED
&&
1834 svd
->amp
->swresv
> 0)
1841 * Return swap reserved by a segment backing a private mapping.
1844 seg_swresv(struct seg
*seg
)
1846 struct segvn_data
*svd
;
1849 if (seg
->s_ops
== &segvn_ops
) {
1850 svd
= (struct segvn_data
*)seg
->s_data
;
1851 if (svd
->type
== MAP_PRIVATE
&& svd
->swresv
> 0)
1861 segop_dup(struct seg
*seg
, struct seg
*new)
1863 return (seg
->s_ops
->dup(seg
, new));
1867 segop_unmap(struct seg
*seg
, caddr_t addr
, size_t len
)
1869 return (seg
->s_ops
->unmap(seg
, addr
, len
));
1873 segop_free(struct seg
*seg
)
1875 seg
->s_ops
->free(seg
);
1879 segop_fault(struct hat
*hat
, struct seg
*seg
, caddr_t addr
, size_t len
,
1880 enum fault_type type
, enum seg_rw rw
)
1882 return (seg
->s_ops
->fault(hat
, seg
, addr
, len
, type
, rw
));
1886 segop_faulta(struct seg
*seg
, caddr_t addr
)
1888 return (seg
->s_ops
->faulta(seg
, addr
));
1892 segop_setprot(struct seg
*seg
, caddr_t addr
, size_t len
, uint_t prot
)
1894 return (seg
->s_ops
->setprot(seg
, addr
, len
, prot
));
1898 segop_checkprot(struct seg
*seg
, caddr_t addr
, size_t len
, uint_t prot
)
1900 return (seg
->s_ops
->checkprot(seg
, addr
, len
, prot
));
1904 segop_kluster(struct seg
*seg
, caddr_t addr
, ssize_t d
)
1906 return (seg
->s_ops
->kluster(seg
, addr
, d
));
1910 segop_sync(struct seg
*seg
, caddr_t addr
, size_t len
, int atr
, uint_t f
)
1912 return (seg
->s_ops
->sync(seg
, addr
, len
, atr
, f
));
1916 segop_incore(struct seg
*seg
, caddr_t addr
, size_t len
, char *v
)
1918 return (seg
->s_ops
->incore(seg
, addr
, len
, v
));
1922 segop_lockop(struct seg
*seg
, caddr_t addr
, size_t len
, int atr
, int op
,
1923 ulong_t
*b
, size_t p
)
1925 return (seg
->s_ops
->lockop(seg
, addr
, len
, atr
, op
, b
, p
));
1929 segop_getprot(struct seg
*seg
, caddr_t addr
, size_t len
, uint_t
*p
)
1931 return (seg
->s_ops
->getprot(seg
, addr
, len
, p
));
1935 segop_getoffset(struct seg
*seg
, caddr_t addr
)
1937 return (seg
->s_ops
->getoffset(seg
, addr
));
1941 segop_gettype(struct seg
*seg
, caddr_t addr
)
1943 return (seg
->s_ops
->gettype(seg
, addr
));
1947 segop_getvp(struct seg
*seg
, caddr_t addr
, struct vnode
**vpp
)
1949 return (seg
->s_ops
->getvp(seg
, addr
, vpp
));
1953 segop_advise(struct seg
*seg
, caddr_t addr
, size_t len
, uint_t b
)
1955 return (seg
->s_ops
->advise(seg
, addr
, len
, b
));
1959 segop_dump(struct seg
*seg
)
1961 if (seg
->s_ops
->dump
== NULL
)
1964 seg
->s_ops
->dump(seg
);
1968 segop_pagelock(struct seg
*seg
, caddr_t addr
, size_t len
, struct page
***page
,
1969 enum lock_type type
, enum seg_rw rw
)
1971 if (seg
->s_ops
->pagelock
== NULL
)
1974 return (seg
->s_ops
->pagelock(seg
, addr
, len
, page
, type
, rw
));
1978 segop_setpagesize(struct seg
*seg
, caddr_t addr
, size_t len
, uint_t szc
)
1980 if (seg
->s_ops
->setpagesize
== NULL
)
1983 return (seg
->s_ops
->setpagesize(seg
, addr
, len
, szc
));
1987 segop_getmemid(struct seg
*seg
, caddr_t addr
, memid_t
*mp
)
1989 if (seg
->s_ops
->getmemid
== NULL
)
1992 return (seg
->s_ops
->getmemid(seg
, addr
, mp
));
1995 struct lgrp_mem_policy_info
*
1996 segop_getpolicy(struct seg
*seg
, caddr_t addr
)
1998 if (seg
->s_ops
->getpolicy
== NULL
)
2001 return (seg
->s_ops
->getpolicy(seg
, addr
));
2005 segop_capable(struct seg
*seg
, segcapability_t cap
)
2007 if (seg
->s_ops
->capable
== NULL
)
2010 return (seg
->s_ops
->capable(seg
, cap
));
2014 segop_inherit(struct seg
*seg
, caddr_t addr
, size_t len
, uint_t op
)
2016 if (seg
->s_ops
->inherit
== NULL
)
2019 return (seg
->s_ops
->inherit(seg
, addr
, len
, op
));