 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2012 Joyent, Inc.  All rights reserved.
 * For a more complete description of the main ideas, see:
 *
 *	Jeff Bonwick and Jonathan Adams,
 *
 *	Magazines and vmem: Extending the Slab Allocator to Many CPUs and
 *	Arbitrary Resources.
 *
 *	Proceedings of the 2001 Usenix Conference.
 *	Available as /shared/sac/PSARC/2000/550/materials/vmem.pdf.
 *
 * For the "Big Theory Statement", see kernel/os/vmem.c
 *
 * 1. Overview of changes
 * ----------------------
 * There have been a few changes to vmem in order to support umem.  The
 * main areas are:
 *
 *	* VM_SLEEP unsupported
 *
 *	* reaping changes
 *
 *	* initialization changes
 *
 *	* _vmem_extend_alloc
 *
 * 2. VM_SLEEP unsupported
 * -----------------------
 * Since VM_SLEEP allocations can hold locks (in vmem_populate()) for
 * possibly infinite amounts of time, they are not supported in this
 * version of vmem.  Sleep-like behavior can be achieved through
 * UMEM_NOFAIL umem allocations.
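 *
 * For example (a sketch only; 'sz' is an arbitrary size), a caller that
 * would have used a VM_SLEEP vmem allocation can instead allocate at the
 * umem layer, where UMEM_NOFAIL retries or invokes the nofail callback
 * rather than returning NULL:
 *
 *	void *p = umem_alloc(sz, UMEM_NOFAIL);
 *	...
 *	umem_free(p, sz);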
 *
 * 3. Reaping changes
 * ------------------
 * Unlike kmem_reap(), which just asynchronously schedules work, umem_reap()
 * can do allocations and frees synchronously.  This is a problem if it
 * occurs during a vmem_populate() allocation.
 *
 * Instead, we delay reaps while populates are active.
 *
 * 4. Initialization changes
 * -------------------------
 * In the kernel, vmem_init() allows you to create a single, top-level arena,
 * which has vmem_internal_arena as a child.  For umem, we want to be able
 * to extend arenas dynamically.  It is much easier to support this if we
 * allow a two-level "heap" arena:
 *
 *	[diagram: a "fake" parent arena sits above the "heap" arena;
 *	"vmem_internal" and the other children import from "heap"]
 *
 * The new vmem_init() allows you to specify a "parent" of the heap, along
 * with allocation functions.
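 *
 * For example (a sketch; the argument values are illustrative, not the
 * ones umem actually uses), a caller could set up the two-level heap
 * roughly like this:
 *
 *	heap = vmem_init("fake_parent", parent_quantum,
 *	    parent_alloc, parent_free,
 *	    "umem_heap", initial_base, initial_size, heap_quantum,
 *	    heap_alloc, heap_free);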
 *
 * 5. _vmem_extend_alloc
 * ---------------------
 * The other part of extending is _vmem_extend_alloc.  This function allows
 * you to extend (expand current spans, if possible) an arena and allocate
 * a chunk of the newly extended span atomically.  This is needed to support
 * extending the heap while vmem_populate()ing it.
 *
 * In order to increase the usefulness of extending, non-imported spans are
 * sorted in address order.
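 *
 * For example (a sketch; 'newspan', the sizes, and the flag are
 * illustrative), the heap can be grown and a piece of the new space
 * carved out in one step:
 *
 *	p = _vmem_extend_alloc(heap_arena, newspan, newspan_size,
 *	    chunk_size, VM_NOSLEEP);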
#include <sys/vmem_impl_user.h>
#include <sys/sysmacros.h>

#include "vmem_base.h"
#include "umem_base.h"

#define	VMEM_INITIAL		6	/* early vmem arenas */
#define	VMEM_SEG_INITIAL	100	/* early segments */
 * Adding a new span to an arena requires two segment structures: one to
 * represent the span, and one to represent the free segment it contains.
#define	VMEM_SEGS_PER_SPAN_CREATE	2

 * Allocating a piece of an existing segment requires 0-2 segment structures
 * depending on how much of the segment we're allocating.
 *
 * To allocate the entire segment, no new segment structures are needed; we
 * simply move the existing segment structure from the freelist to the
 * allocation hash table.
 *
 * To allocate a piece from the left or right end of the segment, we must
 * split the segment into two pieces (allocated part and remainder), so we
 * need one new segment structure to represent the remainder.
 *
 * To allocate from the middle of a segment, we need two new segment
 * structures to represent the remainders on either side of the allocated
 * part.
#define	VMEM_SEGS_PER_EXACT_ALLOC	0
#define	VMEM_SEGS_PER_LEFT_ALLOC	1
#define	VMEM_SEGS_PER_RIGHT_ALLOC	1
#define	VMEM_SEGS_PER_MIDDLE_ALLOC	2
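
 * For example (illustrative addresses): allocating [0x3000, 0x4000) out of
 * a free segment [0x1000, 0x8000) is a "middle" allocation; it leaves the
 * remainders [0x1000, 0x3000) and [0x4000, 0x8000), so it consumes two new
 * segment structures.  Allocating [0x1000, 0x2000) from the same segment is
 * a "left" allocation and consumes only one.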
 * vmem_populate() preallocates segment structures for vmem to do its work.
 * It must preallocate enough for the worst case, which is when we must import
 * a new span and then allocate from the middle of it.
#define	VMEM_SEGS_PER_ALLOC_MAX		\
	(VMEM_SEGS_PER_SPAN_CREATE + VMEM_SEGS_PER_MIDDLE_ALLOC)
 * The segment structures themselves are allocated from vmem_seg_arena, so
 * we have a recursion problem when vmem_seg_arena needs to populate itself.
 * We address this by working out the maximum number of segment structures
 * this act will require, and multiplying by the maximum number of threads
 * that we'll allow to do it simultaneously.
 *
 * The worst-case segment consumption to populate vmem_seg_arena is as
 * follows (depicted as a stack trace to indicate why events are occurring):
 *
 * vmem_alloc(vmem_seg_arena)		-> 2 segs (span create + exact alloc)
 *  vmem_alloc(vmem_internal_arena)	-> 2 segs (span create + exact alloc)
 *   heap_alloc(heap_arena)
 *    vmem_alloc(heap_arena)		-> 4 segs (span create + alloc)
 *     parent_alloc(parent_arena)
 *      _vmem_extend_alloc(parent_arena) -> 3 segs (span create + left alloc)
 *
 * Note: The reservation for heap_arena must be 4, since vmem_xalloc()
 * is overly pessimistic on allocations where parent_arena has a stricter
 * alignment than heap_arena.
 *
 * The worst-case consumption for any arena is 4 segment structures.
 * For now, we only support VM_NOSLEEP allocations, so as long as we
 * serialize all vmem_populates, a 4-seg reserve is sufficient.
#define	VMEM_POPULATE_SEGS_PER_ARENA	4
#define	VMEM_POPULATE_LOCKS		1

#define	VMEM_POPULATE_RESERVE		\
	(VMEM_POPULATE_SEGS_PER_ARENA * VMEM_POPULATE_LOCKS)
 * vmem_populate() ensures that each arena has VMEM_MINFREE seg structures
 * so that it can satisfy the worst-case allocation *and* participate in
 * worst-case allocation from vmem_seg_arena.
#define	VMEM_MINFREE	(VMEM_POPULATE_RESERVE + VMEM_SEGS_PER_ALLOC_MAX)
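
 * With the values above this works out to:
 *
 *	VMEM_POPULATE_RESERVE	= 4 * 1 = 4
 *	VMEM_MINFREE		= 4 + (2 + 2) = 8
 *
 * i.e. each arena keeps at least 8 segment structures in reserve.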
/* Don't assume new statics are zeroed - see vmem_startup() */
static vmem_t vmem0[VMEM_INITIAL];
static vmem_t *vmem_populator[VMEM_INITIAL];
static uint32_t vmem_id;
static uint32_t vmem_populators;
static vmem_seg_t vmem_seg0[VMEM_SEG_INITIAL];
static vmem_seg_t *vmem_segfree;
static mutex_t vmem_list_lock;
static mutex_t vmem_segfree_lock;
static vmem_populate_lock_t vmem_nosleep_lock;
#define	IN_POPULATE()	(vmem_nosleep_lock.vmpl_thr == thr_self())
static vmem_t *vmem_list;
static vmem_t *vmem_internal_arena;
static vmem_t *vmem_seg_arena;
static vmem_t *vmem_hash_arena;
static vmem_t *vmem_vmem_arena;

vmem_alloc_t *vmem_heap_alloc;
vmem_free_t *vmem_heap_free;

uint32_t vmem_mtbf;	/* mean time between failures [default: off] */
size_t vmem_seg_size = sizeof (vmem_seg_t);
 * Insert/delete from arena list (type 'a') or next-of-kin list (type 'k').
#define	VMEM_INSERT(vprev, vsp, type)					\
{									\
	vmem_seg_t *vnext = (vprev)->vs_##type##next;			\
	(vsp)->vs_##type##next = (vnext);				\
	(vsp)->vs_##type##prev = (vprev);				\
	(vprev)->vs_##type##next = (vsp);				\
	(vnext)->vs_##type##prev = (vsp);				\
}

#define	VMEM_DELETE(vsp, type)						\
{									\
	vmem_seg_t *vprev = (vsp)->vs_##type##prev;			\
	vmem_seg_t *vnext = (vsp)->vs_##type##next;			\
	(vprev)->vs_##type##next = (vnext);				\
	(vnext)->vs_##type##prev = (vprev);				\
}
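
 * For example, VMEM_INSERT(vprev, vsp, k) splices vsp into the 'k' (kin)
 * list right after vprev by rewriting the vs_knext/vs_kprev pointers, and
 * VMEM_DELETE(vsp, k) undoes it.  The same macros with type 'a' operate on
 * the vs_anext/vs_aprev (address-ordered arena) list.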
 * Get a vmem_seg_t from the global segfree list.
vmem_getseg_global(void)
	(void) mutex_lock(&vmem_segfree_lock);
	if ((vsp = vmem_segfree) != NULL)
		vmem_segfree = vsp->vs_knext;
	(void) mutex_unlock(&vmem_segfree_lock);
 * Put a vmem_seg_t on the global segfree list.
vmem_putseg_global(vmem_seg_t *vsp)
	(void) mutex_lock(&vmem_segfree_lock);
	vsp->vs_knext = vmem_segfree;
	(void) mutex_unlock(&vmem_segfree_lock);
 * Get a vmem_seg_t from vmp's segfree list.
vmem_getseg(vmem_t *vmp)
	ASSERT(vmp->vm_nsegfree > 0);

	vsp = vmp->vm_segfree;
	vmp->vm_segfree = vsp->vs_knext;
 * Put a vmem_seg_t on vmp's segfree list.
vmem_putseg(vmem_t *vmp, vmem_seg_t *vsp)
	vsp->vs_knext = vmp->vm_segfree;
	vmp->vm_segfree = vsp;
 * Add vsp to the appropriate freelist.
vmem_freelist_insert(vmem_t *vmp, vmem_seg_t *vsp)
	ASSERT(*VMEM_HASH(vmp, vsp->vs_start) != vsp);

	vprev = (vmem_seg_t *)&vmp->vm_freelist[highbit(VS_SIZE(vsp)) - 1];
	vsp->vs_type = VMEM_FREE;
	vmp->vm_freemap |= VS_SIZE(vprev);
	VMEM_INSERT(vprev, vsp, k);

	(void) cond_broadcast(&vmp->vm_cv);
 * Take vsp from the freelist.
vmem_freelist_delete(vmem_t *vmp, vmem_seg_t *vsp)
	ASSERT(*VMEM_HASH(vmp, vsp->vs_start) != vsp);
	ASSERT(vsp->vs_type == VMEM_FREE);

	if (vsp->vs_knext->vs_start == 0 && vsp->vs_kprev->vs_start == 0) {
		 * The segments on both sides of 'vsp' are freelist heads,
		 * so taking vsp leaves the freelist at vsp->vs_kprev empty.
		ASSERT(vmp->vm_freemap & VS_SIZE(vsp->vs_kprev));
		vmp->vm_freemap ^= VS_SIZE(vsp->vs_kprev);
 * Add vsp to the allocated-segment hash table and update kstats.
vmem_hash_insert(vmem_t *vmp, vmem_seg_t *vsp)
	vsp->vs_type = VMEM_ALLOC;
	bucket = VMEM_HASH(vmp, vsp->vs_start);
	vsp->vs_knext = *bucket;

	if (vmem_seg_size == sizeof (vmem_seg_t)) {
		vsp->vs_depth = (uint8_t)getpcstack(vsp->vs_stack,
		    VMEM_STACK_DEPTH, 0);
		vsp->vs_thread = thr_self();
		vsp->vs_timestamp = gethrtime();

	vmp->vm_kstat.vk_alloc++;
	vmp->vm_kstat.vk_mem_inuse += VS_SIZE(vsp);
 * Remove vsp from the allocated-segment hash table and update kstats.
vmem_hash_delete(vmem_t *vmp, uintptr_t addr, size_t size)
	vmem_seg_t *vsp, **prev_vspp;

	prev_vspp = VMEM_HASH(vmp, addr);
	while ((vsp = *prev_vspp) != NULL) {
		if (vsp->vs_start == addr) {
			*prev_vspp = vsp->vs_knext;
		vmp->vm_kstat.vk_lookup++;
		prev_vspp = &vsp->vs_knext;

	umem_panic("vmem_hash_delete(%p, %lx, %lu): bad free",

	if (VS_SIZE(vsp) != size) {
		umem_panic("vmem_hash_delete(%p, %lx, %lu): wrong size "
		    "(expect %lu)", vmp, addr, size, VS_SIZE(vsp));

	vmp->vm_kstat.vk_free++;
	vmp->vm_kstat.vk_mem_inuse -= size;
 * Create a segment spanning the range [start, end) and add it to the arena.
vmem_seg_create(vmem_t *vmp, vmem_seg_t *vprev, uintptr_t start, uintptr_t end)
	vmem_seg_t *newseg = vmem_getseg(vmp);

	newseg->vs_start = start;
	newseg->vs_end = end;
	newseg->vs_import = 0;

	VMEM_INSERT(vprev, newseg, a);
 * Remove segment vsp from the arena.
vmem_seg_destroy(vmem_t *vmp, vmem_seg_t *vsp)
	ASSERT(vsp->vs_type != VMEM_ROTOR);

	vmem_putseg(vmp, vsp);
 * Add the span [vaddr, vaddr + size) to vmp and update kstats.
vmem_span_create(vmem_t *vmp, void *vaddr, size_t size, uint8_t import)
	vmem_seg_t *newseg, *span;
	uintptr_t start = (uintptr_t)vaddr;
	uintptr_t end = start + size;

	knext = &vmp->vm_seg0;
	if (!import && vmp->vm_source_alloc == NULL) {
		vmem_seg_t *kend, *kprev;
		 * non-imported spans are sorted in address order.  This
		 * makes vmem_extend_unlocked() much more effective.
		 *
		 * We search in reverse order, since new spans are
		 * generally at higher addresses.
		kend = &vmp->vm_seg0;
		for (kprev = kend->vs_kprev; kprev != kend;
		    kprev = kprev->vs_kprev) {
			if (!kprev->vs_import && (kprev->vs_end - 1) < start)
		knext = kprev->vs_knext;

	ASSERT(MUTEX_HELD(&vmp->vm_lock));

	if ((start | end) & (vmp->vm_quantum - 1)) {
		umem_panic("vmem_span_create(%p, %p, %lu): misaligned",

	span = vmem_seg_create(vmp, knext->vs_aprev, start, end);
	span->vs_type = VMEM_SPAN;
	VMEM_INSERT(knext->vs_kprev, span, k);

	newseg = vmem_seg_create(vmp, span, start, end);
	vmem_freelist_insert(vmp, newseg);

	newseg->vs_import = import;

	vmp->vm_kstat.vk_mem_import += size;
	vmp->vm_kstat.vk_mem_total += size;
 * Remove span vsp from vmp and update kstats.
vmem_span_destroy(vmem_t *vmp, vmem_seg_t *vsp)
	vmem_seg_t *span = vsp->vs_aprev;
	size_t size = VS_SIZE(vsp);

	ASSERT(MUTEX_HELD(&vmp->vm_lock));
	ASSERT(span->vs_type == VMEM_SPAN);

	vmp->vm_kstat.vk_mem_import -= size;
	vmp->vm_kstat.vk_mem_total -= size;

	VMEM_DELETE(span, k);

	vmem_seg_destroy(vmp, vsp);
	vmem_seg_destroy(vmp, span);
 * Allocate the subrange [addr, addr + size) from segment vsp.
 * If there are leftovers on either side, place them on the freelist.
 * Returns a pointer to the segment representing [addr, addr + size).
vmem_seg_alloc(vmem_t *vmp, vmem_seg_t *vsp, uintptr_t addr, size_t size)
	uintptr_t vs_start = vsp->vs_start;
	uintptr_t vs_end = vsp->vs_end;
	size_t vs_size = vs_end - vs_start;
	size_t realsize = P2ROUNDUP(size, vmp->vm_quantum);
	uintptr_t addr_end = addr + realsize;

	ASSERT(P2PHASE(vs_start, vmp->vm_quantum) == 0);
	ASSERT(P2PHASE(addr, vmp->vm_quantum) == 0);
	ASSERT(vsp->vs_type == VMEM_FREE);
	ASSERT(addr >= vs_start && addr_end - 1 <= vs_end - 1);
	ASSERT(addr - 1 <= addr_end - 1);

	 * If we're allocating from the start of the segment, and the
	 * remainder will be on the same freelist, we can save quite
	 * a bit of work.
	if (P2SAMEHIGHBIT(vs_size, vs_size - realsize) && addr == vs_start) {
		ASSERT(highbit(vs_size) == highbit(vs_size - realsize));
		vsp->vs_start = addr_end;
		vsp = vmem_seg_create(vmp, vsp->vs_aprev, addr, addr + size);
		vmem_hash_insert(vmp, vsp);

	vmem_freelist_delete(vmp, vsp);

	if (vs_end != addr_end)
		vmem_freelist_insert(vmp,
		    vmem_seg_create(vmp, vsp, addr_end, vs_end));

	if (vs_start != addr)
		vmem_freelist_insert(vmp,
		    vmem_seg_create(vmp, vsp->vs_aprev, vs_start, addr));

	vsp->vs_start = addr;
	vsp->vs_end = addr + size;

	vmem_hash_insert(vmp, vsp);
 * We cannot reap if we are in the middle of a vmem_populate().

 * Populate vmp's segfree list with VMEM_MINFREE vmem_seg_t structures.
vmem_populate(vmem_t *vmp, int vmflag)
	vmem_populate_lock_t *lp;

	while (vmp->vm_nsegfree < VMEM_MINFREE &&
	    (vsp = vmem_getseg_global()) != NULL)
		vmem_putseg(vmp, vsp);

	if (vmp->vm_nsegfree >= VMEM_MINFREE)

	 * If we're already populating, tap the reserve.
	if (vmem_nosleep_lock.vmpl_thr == thr_self()) {
		ASSERT(vmp->vm_cflags & VMC_POPULATOR);

	(void) mutex_unlock(&vmp->vm_lock);

	ASSERT(vmflag & VM_NOSLEEP);	/* we do not allow sleep allocations */
	lp = &vmem_nosleep_lock;

	 * Cannot be just a mutex_lock(), since that has no effect if
	 * libthread is not linked.
	(void) mutex_lock(&lp->vmpl_mutex);
	ASSERT(lp->vmpl_thr == 0);
	lp->vmpl_thr = thr_self();

	nseg = VMEM_MINFREE + vmem_populators * VMEM_POPULATE_RESERVE;
	size = P2ROUNDUP(nseg * vmem_seg_size, vmem_seg_arena->vm_quantum);
	nseg = size / vmem_seg_size;

	 * The following vmem_alloc() may need to populate vmem_seg_arena
	 * and all the things it imports from.  When doing so, it will tap
	 * each arena's reserve to prevent recursion (see the block comment
	 * above the definition of VMEM_POPULATE_RESERVE).
	 *
	 * During this allocation, vmem_reap() is a no-op.  If the allocation
	 * fails, we call vmem_reap() after dropping the population lock.
	p = vmem_alloc(vmem_seg_arena, size, vmflag & VM_UMFLAGS);
		(void) mutex_unlock(&lp->vmpl_mutex);
		(void) mutex_lock(&vmp->vm_lock);
		vmp->vm_kstat.vk_populate_fail++;

	 * Restock the arenas that may have been depleted during population.
	for (i = 0; i < vmem_populators; i++) {
		(void) mutex_lock(&vmem_populator[i]->vm_lock);
		while (vmem_populator[i]->vm_nsegfree < VMEM_POPULATE_RESERVE)
			vmem_putseg(vmem_populator[i],
			    (vmem_seg_t *)(p + --nseg * vmem_seg_size));
		(void) mutex_unlock(&vmem_populator[i]->vm_lock);

	(void) mutex_unlock(&lp->vmpl_mutex);
	(void) mutex_lock(&vmp->vm_lock);

	 * Now take our own segments.
	ASSERT(nseg >= VMEM_MINFREE);
	while (vmp->vm_nsegfree < VMEM_MINFREE)
		vmem_putseg(vmp, (vmem_seg_t *)(p + --nseg * vmem_seg_size));

	 * Give the remainder to charity.
		vmem_putseg_global((vmem_seg_t *)(p + --nseg * vmem_seg_size));
 * Advance a walker from its previous position to 'afterme'.
 * Note: may drop and reacquire vmp->vm_lock.
vmem_advance(vmem_t *vmp, vmem_seg_t *walker, vmem_seg_t *afterme)
	vmem_seg_t *vprev = walker->vs_aprev;
	vmem_seg_t *vnext = walker->vs_anext;
	vmem_seg_t *vsp = NULL;

	VMEM_DELETE(walker, a);

	VMEM_INSERT(afterme, walker, a);

	 * The walker segment's presence may have prevented its neighbors
	 * from coalescing.  If so, coalesce them now.
	if (vprev->vs_type == VMEM_FREE) {
		if (vnext->vs_type == VMEM_FREE) {
			ASSERT(vprev->vs_end == vnext->vs_start);
			vmem_freelist_delete(vmp, vnext);
			vmem_freelist_delete(vmp, vprev);
			vprev->vs_end = vnext->vs_end;
			vmem_freelist_insert(vmp, vprev);
			vmem_seg_destroy(vmp, vnext);
	} else if (vnext->vs_type == VMEM_FREE) {

	 * vsp could represent a complete imported span,
	 * in which case we must return it to the source.
	if (vsp != NULL && vsp->vs_import && vmp->vm_source_free != NULL &&
	    vsp->vs_aprev->vs_type == VMEM_SPAN &&
	    vsp->vs_anext->vs_type == VMEM_SPAN) {
		void *vaddr = (void *)vsp->vs_start;
		size_t size = VS_SIZE(vsp);
		ASSERT(size == VS_SIZE(vsp->vs_aprev));
		vmem_freelist_delete(vmp, vsp);
		vmem_span_destroy(vmp, vsp);
		(void) mutex_unlock(&vmp->vm_lock);
		vmp->vm_source_free(vmp->vm_source, vaddr, size);
		(void) mutex_lock(&vmp->vm_lock);
 * VM_NEXTFIT allocations deliberately cycle through all virtual addresses
 * in an arena, so that we avoid reusing addresses for as long as possible.
 * This helps to catch use-after-free bugs.  It's also the perfect policy
 * for allocating things like process IDs, where we want to cycle through
 * all values in order.
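 *
 * For example (a sketch, not code from this file), an ID allocator built
 * on this policy might look like:
 *
 *	id_arena = vmem_create("ids", (void *)1, 99999, 1,
 *	    NULL, NULL, NULL, 0, VM_NOSLEEP);
 *	id = (uintptr_t)vmem_alloc(id_arena, 1, VM_NOSLEEP | VM_NEXTFIT);
 *
 * so that successive allocations walk through the ID space instead of
 * immediately reusing freed values.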
vmem_nextfit_alloc(vmem_t *vmp, size_t size, int vmflag)
	vmem_seg_t *vsp, *rotor;
	size_t realsize = P2ROUNDUP(size, vmp->vm_quantum);

	(void) mutex_lock(&vmp->vm_lock);

	if (vmp->vm_nsegfree < VMEM_MINFREE && !vmem_populate(vmp, vmflag)) {
		(void) mutex_unlock(&vmp->vm_lock);

	 * The common case is that the segment right after the rotor is free,
	 * and large enough that extracting 'size' bytes won't change which
	 * freelist it's on.  In this case we can avoid a *lot* of work.
	 * Instead of the normal vmem_seg_alloc(), we just advance the start
	 * address of the victim segment.  Instead of moving the rotor, we
	 * create the new segment structure *behind the rotor*, which has
	 * the same effect.  And finally, we know we don't have to coalesce
	 * the rotor's neighbors because the new segment lies between them.
	rotor = &vmp->vm_rotor;
	vsp = rotor->vs_anext;
	if (vsp->vs_type == VMEM_FREE && (vs_size = VS_SIZE(vsp)) > realsize &&
	    P2SAMEHIGHBIT(vs_size, vs_size - realsize)) {
		ASSERT(highbit(vs_size) == highbit(vs_size - realsize));
		addr = vsp->vs_start;
		vsp->vs_start = addr + realsize;
		vmem_hash_insert(vmp,
		    vmem_seg_create(vmp, rotor->vs_aprev, addr, addr + size));
		(void) mutex_unlock(&vmp->vm_lock);
		return ((void *)addr);

	 * Starting at the rotor, look for a segment large enough to
	 * satisfy the allocation.
		vmp->vm_kstat.vk_search++;
		if (vsp->vs_type == VMEM_FREE && VS_SIZE(vsp) >= size)

		 * We've come full circle.  One possibility is that
		 * there's actually enough space, but the rotor itself
		 * is preventing the allocation from succeeding because
		 * it's sitting between two free segments.  Therefore,
		 * we advance the rotor and see if that liberates a
		 * suitable segment.
		vmem_advance(vmp, rotor, rotor->vs_anext);
		vsp = rotor->vs_aprev;
		if (vsp->vs_type == VMEM_FREE && VS_SIZE(vsp) >= size)

		 * If there's a lower arena we can import from, or it's
		 * a VM_NOSLEEP allocation, let vmem_xalloc() handle it.
		 * Otherwise, wait until another thread frees something.
		if (vmp->vm_source_alloc != NULL ||
		    (vmflag & VM_NOSLEEP)) {
			(void) mutex_unlock(&vmp->vm_lock);
			return (vmem_xalloc(vmp, size, vmp->vm_quantum,
			    0, 0, NULL, NULL, vmflag & VM_UMFLAGS));
		vmp->vm_kstat.vk_wait++;
		(void) pthread_setcancelstate(PTHREAD_CANCEL_DISABLE,
		    &cancel_state);
		(void) cond_wait(&vmp->vm_cv, &vmp->vm_lock);
		(void) pthread_setcancelstate(cancel_state, NULL);
		vsp = rotor->vs_anext;

	 * We found a segment.  Extract enough space to satisfy the allocation.
	addr = vsp->vs_start;
	vsp = vmem_seg_alloc(vmp, vsp, addr, size);
	ASSERT(vsp->vs_type == VMEM_ALLOC &&
	    vsp->vs_start == addr && vsp->vs_end == addr + size);

	 * Advance the rotor to right after the newly-allocated segment.
	 * That's where the next VM_NEXTFIT allocation will begin searching.
	vmem_advance(vmp, rotor, vsp);
	(void) mutex_unlock(&vmp->vm_lock);
	return ((void *)addr);
 * Allocate size bytes at offset phase from an align boundary such that the
 * resulting segment [addr, addr + size) is a subset of [minaddr, maxaddr)
 * that does not straddle a nocross-aligned boundary.
vmem_xalloc(vmem_t *vmp, size_t size, size_t align, size_t phase,
    size_t nocross, void *minaddr, void *maxaddr, int vmflag)
	vmem_seg_t *vbest = NULL;
	uintptr_t addr, taddr, start, end;

	if (phase > 0 && phase >= align)
		umem_panic("vmem_xalloc(%p, %lu, %lu, %lu, %lu, %p, %p, %x): "
		    (void *)vmp, size, align, phase, nocross,
		    minaddr, maxaddr, vmflag);

		align = vmp->vm_quantum;

	if ((align | phase | nocross) & (vmp->vm_quantum - 1)) {
		umem_panic("vmem_xalloc(%p, %lu, %lu, %lu, %lu, %p, %p, %x): "
		    "parameters not vm_quantum aligned",
		    (void *)vmp, size, align, phase, nocross,
		    minaddr, maxaddr, vmflag);

	    (align > nocross || P2ROUNDUP(phase + size, align) > nocross)) {
		umem_panic("vmem_xalloc(%p, %lu, %lu, %lu, %lu, %p, %p, %x): "
		    "overconstrained allocation",
		    (void *)vmp, size, align, phase, nocross,
		    minaddr, maxaddr, vmflag);

	if ((mtbf = vmem_mtbf | vmp->vm_mtbf) != 0 && gethrtime() % mtbf == 0 &&
	    (vmflag & (VM_NOSLEEP | VM_PANIC)) == VM_NOSLEEP)

	(void) mutex_lock(&vmp->vm_lock);

		if (vmp->vm_nsegfree < VMEM_MINFREE &&
		    !vmem_populate(vmp, vmflag))

		 * highbit() returns the highest bit + 1, which is exactly
		 * what we want:  we want to search the first freelist whose
		 * members are *definitely* large enough to satisfy our
		 * allocation.  However, there are certain cases in which we
		 * want to look at the next-smallest freelist (which *might*
		 * be able to satisfy the allocation):
		 *
		 * (1)	The size is exactly a power of 2, in which case
		 *	the smaller freelist is always big enough;
		 *
		 * (2)	All other freelists are empty;
		 *
		 * (3)	We're in the highest possible freelist, which is
		 *	always empty (e.g. the 4GB freelist on 32-bit systems);
		 *
		 * (4)	We're doing a best-fit or first-fit allocation.
		if ((size & (size - 1)) == 0) {
			flist = lowbit(P2ALIGN(vmp->vm_freemap, size));
			if ((vmp->vm_freemap >> hb) == 0 ||
			    hb == VMEM_FREELISTS ||
			    (vmflag & (VM_BESTFIT | VM_FIRSTFIT)))
			flist = lowbit(P2ALIGN(vmp->vm_freemap, 1UL << hb));

		for (vbest = NULL, vsp = (flist == 0) ? NULL :
		    vmp->vm_freelist[flist - 1].vs_knext;
		    vsp != NULL; vsp = vsp->vs_knext) {
			vmp->vm_kstat.vk_search++;
			if (vsp->vs_start == 0) {
				 * We're moving up to a larger freelist,
				 * so if we've already found a candidate,
				 * the fit can't possibly get any better.
				 * Find the next non-empty freelist.
				flist = lowbit(P2ALIGN(vmp->vm_freemap,
				vsp = (vmem_seg_t *)&vmp->vm_freelist[flist];
				ASSERT(vsp->vs_knext->vs_type == VMEM_FREE);

			if (vsp->vs_end - 1 < (uintptr_t)minaddr)
			if (vsp->vs_start > (uintptr_t)maxaddr - 1)
			start = MAX(vsp->vs_start, (uintptr_t)minaddr);
			end = MIN(vsp->vs_end - 1, (uintptr_t)maxaddr - 1) + 1;
			taddr = P2PHASEUP(start, align, phase);
			if (P2BOUNDARY(taddr, size, nocross))
				    P2ROUNDUP(P2NPHASE(taddr, nocross), align);
			if ((taddr - start) + size > end - start ||
			    (vbest != NULL && VS_SIZE(vsp) >= VS_SIZE(vbest)))
			if (!(vmflag & VM_BESTFIT) || VS_SIZE(vbest) == size)

			umem_panic("vmem_xalloc(): size == 0");
		if (vmp->vm_source_alloc != NULL && nocross == 0 &&
		    minaddr == NULL && maxaddr == NULL) {
			size_t asize = P2ROUNDUP(size + phase,
			    MAX(align, vmp->vm_source->vm_quantum));
			if (asize < size) {		/* overflow */
				(void) mutex_unlock(&vmp->vm_lock);
				if (vmflag & VM_NOSLEEP)
				umem_panic("vmem_xalloc(): "
				    "overflow on VM_SLEEP allocation");

			 * Determine how many segment structures we'll consume.
			 * The calculation must be precise because if we're
			 * here on behalf of vmem_populate(), we are taking
			 * segments from a very limited reserve.
			resv = (size == asize) ?
			    VMEM_SEGS_PER_SPAN_CREATE +
			    VMEM_SEGS_PER_EXACT_ALLOC :
			    VMEM_SEGS_PER_ALLOC_MAX;
			ASSERT(vmp->vm_nsegfree >= resv);
			vmp->vm_nsegfree -= resv;	/* reserve our segs */
			(void) mutex_unlock(&vmp->vm_lock);
			vaddr = vmp->vm_source_alloc(vmp->vm_source, asize,
			    vmflag & VM_UMFLAGS);
			(void) mutex_lock(&vmp->vm_lock);
			vmp->vm_nsegfree += resv;	/* claim reservation */
				vbest = vmem_span_create(vmp, vaddr, asize, 1);
				addr = P2PHASEUP(vbest->vs_start, align, phase);

		(void) mutex_unlock(&vmp->vm_lock);
		(void) mutex_lock(&vmp->vm_lock);
		if (vmflag & VM_NOSLEEP)
		vmp->vm_kstat.vk_wait++;
		(void) pthread_setcancelstate(PTHREAD_CANCEL_DISABLE,
		    &cancel_state);
		(void) cond_wait(&vmp->vm_cv, &vmp->vm_lock);
		(void) pthread_setcancelstate(cancel_state, NULL);

	ASSERT(vbest->vs_type == VMEM_FREE);
	ASSERT(vbest->vs_knext != vbest);
	(void) vmem_seg_alloc(vmp, vbest, addr, size);
	(void) mutex_unlock(&vmp->vm_lock);
	ASSERT(P2PHASE(addr, align) == phase);
	ASSERT(!P2BOUNDARY(addr, size, nocross));
	ASSERT(addr >= (uintptr_t)minaddr);
	ASSERT(addr + size - 1 <= (uintptr_t)maxaddr - 1);
	return ((void *)addr);

	vmp->vm_kstat.vk_fail++;
	(void) mutex_unlock(&vmp->vm_lock);
	if (vmflag & VM_PANIC)
		umem_panic("vmem_xalloc(%p, %lu, %lu, %lu, %lu, %p, %p, %x): "
		    "cannot satisfy mandatory allocation",
		    (void *)vmp, size, align, phase, nocross,
		    minaddr, maxaddr, vmflag);
 * Free the segment [vaddr, vaddr + size), where vaddr was a constrained
 * allocation.  vmem_xalloc() and vmem_xfree() must always be paired because
 * both routines bypass the quantum caches.
vmem_xfree(vmem_t *vmp, void *vaddr, size_t size)
	vmem_seg_t *vsp, *vnext, *vprev;

	(void) mutex_lock(&vmp->vm_lock);

	vsp = vmem_hash_delete(vmp, (uintptr_t)vaddr, size);
	vsp->vs_end = P2ROUNDUP(vsp->vs_end, vmp->vm_quantum);

	 * Attempt to coalesce with the next segment.
	vnext = vsp->vs_anext;
	if (vnext->vs_type == VMEM_FREE) {
		ASSERT(vsp->vs_end == vnext->vs_start);
		vmem_freelist_delete(vmp, vnext);
		vsp->vs_end = vnext->vs_end;
		vmem_seg_destroy(vmp, vnext);

	 * Attempt to coalesce with the previous segment.
	vprev = vsp->vs_aprev;
	if (vprev->vs_type == VMEM_FREE) {
		ASSERT(vprev->vs_end == vsp->vs_start);
		vmem_freelist_delete(vmp, vprev);
		vprev->vs_end = vsp->vs_end;
		vmem_seg_destroy(vmp, vsp);

	 * If the entire span is free, return it to the source.
	if (vsp->vs_import && vmp->vm_source_free != NULL &&
	    vsp->vs_aprev->vs_type == VMEM_SPAN &&
	    vsp->vs_anext->vs_type == VMEM_SPAN) {
		vaddr = (void *)vsp->vs_start;
		size = VS_SIZE(vsp);
		ASSERT(size == VS_SIZE(vsp->vs_aprev));
		vmem_span_destroy(vmp, vsp);
		(void) mutex_unlock(&vmp->vm_lock);
		vmp->vm_source_free(vmp->vm_source, vaddr, size);

		vmem_freelist_insert(vmp, vsp);
		(void) mutex_unlock(&vmp->vm_lock);
 * Allocate size bytes from arena vmp.  Returns the allocated address
 * on success, NULL on failure.  vmflag specifies VM_SLEEP or VM_NOSLEEP,
 * and may also specify best-fit, first-fit, or next-fit allocation policy
 * instead of the default instant-fit policy.  VM_SLEEP allocations are
 * guaranteed to succeed.
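 *
 * For example (a sketch; 'arena' and the size are illustrative), a caller
 * grabbing and releasing a chunk from an arena would do:
 *
 *	void *p = vmem_alloc(arena, 64 * 1024, VM_NOSLEEP);
 *	if (p != NULL) {
 *		...
 *		vmem_free(arena, p, 64 * 1024);
 *	}
 *
 * Note that the size passed to vmem_free() must match the size passed to
 * vmem_alloc(); vmem_hash_delete() panics on a mismatch.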
vmem_alloc(vmem_t *vmp, size_t size, int vmflag)
	vmflag |= vmem_allocator;

	if (size - 1 < vmp->vm_qcache_max) {
		ASSERT(vmflag & VM_NOSLEEP);
		return (_umem_cache_alloc(vmp->vm_qcache[(size - 1) >>
		    vmp->vm_qshift], UMEM_DEFAULT));

	if ((mtbf = vmem_mtbf | vmp->vm_mtbf) != 0 && gethrtime() % mtbf == 0 &&
	    (vmflag & (VM_NOSLEEP | VM_PANIC)) == VM_NOSLEEP)

	if (vmflag & VM_NEXTFIT)
		return (vmem_nextfit_alloc(vmp, size, vmflag));

	if (vmflag & (VM_BESTFIT | VM_FIRSTFIT))
		return (vmem_xalloc(vmp, size, vmp->vm_quantum, 0, 0,
		    NULL, NULL, vmflag));

	 * Unconstrained instant-fit allocation from the segment list.
	(void) mutex_lock(&vmp->vm_lock);

	if (vmp->vm_nsegfree >= VMEM_MINFREE || vmem_populate(vmp, vmflag)) {
		if ((size & (size - 1)) == 0)
			flist = lowbit(P2ALIGN(vmp->vm_freemap, size));
		else if ((hb = highbit(size)) < VMEM_FREELISTS)
			flist = lowbit(P2ALIGN(vmp->vm_freemap, 1UL << hb));

		(void) mutex_unlock(&vmp->vm_lock);
		return (vmem_xalloc(vmp, size, vmp->vm_quantum,
		    0, 0, NULL, NULL, vmflag));

	ASSERT(size <= (1UL << flist));
	vsp = vmp->vm_freelist[flist].vs_knext;
	addr = vsp->vs_start;
	(void) vmem_seg_alloc(vmp, vsp, addr, size);
	(void) mutex_unlock(&vmp->vm_lock);
	return ((void *)addr);
 * Free the segment [vaddr, vaddr + size).
vmem_free(vmem_t *vmp, void *vaddr, size_t size)
	if (size - 1 < vmp->vm_qcache_max)
		_umem_cache_free(vmp->vm_qcache[(size - 1) >> vmp->vm_qshift],
		    vaddr);
	else
		vmem_xfree(vmp, vaddr, size);
 * Determine whether arena vmp contains the segment [vaddr, vaddr + size).
vmem_contains(vmem_t *vmp, void *vaddr, size_t size)
	uintptr_t start = (uintptr_t)vaddr;
	uintptr_t end = start + size;
	vmem_seg_t *seg0 = &vmp->vm_seg0;

	(void) mutex_lock(&vmp->vm_lock);
	vmp->vm_kstat.vk_contains++;
	for (vsp = seg0->vs_knext; vsp != seg0; vsp = vsp->vs_knext) {
		vmp->vm_kstat.vk_contains_search++;
		ASSERT(vsp->vs_type == VMEM_SPAN);
		if (start >= vsp->vs_start && end - 1 <= vsp->vs_end - 1)
	(void) mutex_unlock(&vmp->vm_lock);
	return (vsp != seg0);
 * Add the span [vaddr, vaddr + size) to arena vmp.
vmem_add(vmem_t *vmp, void *vaddr, size_t size, int vmflag)
	if (vaddr == NULL || size == 0) {
		umem_panic("vmem_add(%p, %p, %lu): bad arguments",

	ASSERT(!vmem_contains(vmp, vaddr, size));

	(void) mutex_lock(&vmp->vm_lock);
	if (vmem_populate(vmp, vmflag))
		(void) vmem_span_create(vmp, vaddr, size, 0);

	(void) cond_broadcast(&vmp->vm_cv);
	(void) mutex_unlock(&vmp->vm_lock);
 * Adds the address range [addr, endaddr) to arena vmp, by either:
 *   1. joining two existing spans, [x, addr), and [endaddr, y) (which
 *      are in that order) into a single [x, y) span,
 *   2. expanding an existing [x, addr) span to [x, endaddr),
 *   3. expanding an existing [endaddr, x) span to [addr, x), or
 *   4. creating a new [addr, endaddr) span.
 *
 * Called with vmp->vm_lock held, and a successful vmem_populate() completed.
 * Cannot fail.  Returns the new segment.
 *
 * NOTE: this algorithm is linear-time in the number of spans, but is
 *       constant-time when you are extending the last (highest-addressed)
 *       span.
*vmp
, uintptr_t addr
, uintptr_t endaddr
)
1198 vmem_seg_t
*end
= &vmp
->vm_seg0
;
1200 ASSERT(MUTEX_HELD(&vmp
->vm_lock
));
1203 * the second "if" clause below relies on the direction of this search
1205 for (span
= end
->vs_kprev
; span
!= end
; span
= span
->vs_kprev
) {
1206 if (span
->vs_end
== addr
|| span
->vs_start
== endaddr
)
1211 return (vmem_span_create(vmp
, (void *)addr
, endaddr
- addr
, 0));
1212 if (span
->vs_kprev
->vs_end
== addr
&& span
->vs_start
== endaddr
) {
1213 vmem_seg_t
*prevspan
= span
->vs_kprev
;
1214 vmem_seg_t
*nextseg
= span
->vs_anext
;
1215 vmem_seg_t
*prevseg
= span
->vs_aprev
;
1218 * prevspan becomes the span marker for the full range
1220 prevspan
->vs_end
= span
->vs_end
;
1223 * Notionally, span becomes a free segment representing
1226 * However, if either of its neighbors are free, we coalesce
1227 * by destroying span and changing the free segment.
1229 if (prevseg
->vs_type
== VMEM_FREE
&&
1230 nextseg
->vs_type
== VMEM_FREE
) {
1232 * coalesce both ways
1234 ASSERT(prevseg
->vs_end
== addr
&&
1235 nextseg
->vs_start
== endaddr
);
1237 vmem_freelist_delete(vmp
, prevseg
);
1238 prevseg
->vs_end
= nextseg
->vs_end
;
1240 vmem_freelist_delete(vmp
, nextseg
);
1241 VMEM_DELETE(span
, k
);
1242 vmem_seg_destroy(vmp
, nextseg
);
1243 vmem_seg_destroy(vmp
, span
);
1246 } else if (prevseg
->vs_type
== VMEM_FREE
) {
1250 ASSERT(prevseg
->vs_end
== addr
);
1252 VMEM_DELETE(span
, k
);
1253 vmem_seg_destroy(vmp
, span
);
1255 vmem_freelist_delete(vmp
, prevseg
);
1256 prevseg
->vs_end
= endaddr
;
1259 } else if (nextseg
->vs_type
== VMEM_FREE
) {
1263 ASSERT(nextseg
->vs_start
== endaddr
);
1265 VMEM_DELETE(span
, k
);
1266 vmem_seg_destroy(vmp
, span
);
1268 vmem_freelist_delete(vmp
, nextseg
);
1269 nextseg
->vs_start
= addr
;
1276 VMEM_DELETE(span
, k
);
1277 span
->vs_start
= addr
;
1278 span
->vs_end
= endaddr
;
1282 } else if (span
->vs_end
== addr
) {
1283 vmem_seg_t
*oldseg
= span
->vs_knext
->vs_aprev
;
1284 span
->vs_end
= endaddr
;
1286 ASSERT(oldseg
->vs_type
!= VMEM_SPAN
);
1287 if (oldseg
->vs_type
== VMEM_FREE
) {
1288 ASSERT(oldseg
->vs_end
== addr
);
1289 vmem_freelist_delete(vmp
, oldseg
);
1290 oldseg
->vs_end
= endaddr
;
1293 vsp
= vmem_seg_create(vmp
, oldseg
, addr
, endaddr
);
1295 vmem_seg_t
*oldseg
= span
->vs_anext
;
1296 ASSERT(span
->vs_start
== endaddr
);
1297 span
->vs_start
= addr
;
1299 ASSERT(oldseg
->vs_type
!= VMEM_SPAN
);
1300 if (oldseg
->vs_type
== VMEM_FREE
) {
1301 ASSERT(oldseg
->vs_start
== endaddr
);
1302 vmem_freelist_delete(vmp
, oldseg
);
1303 oldseg
->vs_start
= addr
;
1306 vsp
= vmem_seg_create(vmp
, span
, addr
, endaddr
);
1308 vmem_freelist_insert(vmp
, vsp
);
1309 vmp
->vm_kstat
.vk_mem_total
+= (endaddr
- addr
);
 * Does some error checking, calls vmem_extend_unlocked to add
 * [vaddr, vaddr+size) to vmp, then allocates alloc bytes from the
 * newly merged segment.
_vmem_extend_alloc(vmem_t *vmp, void *vaddr, size_t size, size_t alloc,
	uintptr_t addr = (uintptr_t)vaddr;
	uintptr_t endaddr = addr + size;

	ASSERT(vaddr != NULL && size != 0 && endaddr > addr);
	ASSERT(alloc <= size && alloc != 0);
	ASSERT(((addr | size | alloc) & (vmp->vm_quantum - 1)) == 0);

	ASSERT(!vmem_contains(vmp, vaddr, size));

	(void) mutex_lock(&vmp->vm_lock);
	if (!vmem_populate(vmp, vmflag)) {
		(void) mutex_unlock(&vmp->vm_lock);

	 * if there is a source, we can't mess with the spans
	if (vmp->vm_source_alloc != NULL)
		vsp = vmem_span_create(vmp, vaddr, size, 0);
	else
		vsp = vmem_extend_unlocked(vmp, addr, endaddr);

	ASSERT(VS_SIZE(vsp) >= alloc);

	addr = vsp->vs_start;
	(void) vmem_seg_alloc(vmp, vsp, addr, alloc);
	vaddr = (void *)addr;

	(void) cond_broadcast(&vmp->vm_cv);
	(void) mutex_unlock(&vmp->vm_lock);
 * Walk the vmp arena, applying func to each segment matching typemask.
 * If VMEM_REENTRANT is specified, the arena lock is dropped across each
 * call to func(); otherwise, it is held for the duration of vmem_walk()
 * to ensure a consistent snapshot.  Note that VMEM_REENTRANT callbacks
 * are *not* necessarily consistent, so they may only be used when a
 * hint is adequate.
*vmp
, int typemask
,
1367 void (*func
)(void *, void *, size_t), void *arg
)
1370 vmem_seg_t
*seg0
= &vmp
->vm_seg0
;
1373 if (typemask
& VMEM_WALKER
)
1376 bzero(&walker
, sizeof (walker
));
1377 walker
.vs_type
= VMEM_WALKER
;
1379 (void) mutex_lock(&vmp
->vm_lock
);
1380 VMEM_INSERT(seg0
, &walker
, a
);
1381 for (vsp
= seg0
->vs_anext
; vsp
!= seg0
; vsp
= vsp
->vs_anext
) {
1382 if (vsp
->vs_type
& typemask
) {
1383 void *start
= (void *)vsp
->vs_start
;
1384 size_t size
= VS_SIZE(vsp
);
1385 if (typemask
& VMEM_REENTRANT
) {
1386 vmem_advance(vmp
, &walker
, vsp
);
1387 (void) mutex_unlock(&vmp
->vm_lock
);
1388 func(arg
, start
, size
);
1389 (void) mutex_lock(&vmp
->vm_lock
);
1392 func(arg
, start
, size
);
1396 vmem_advance(vmp
, &walker
, NULL
);
1397 (void) mutex_unlock(&vmp
->vm_lock
);
 * Return the total amount of memory whose type matches typemask.  Thus:
 *
 *	typemask VMEM_ALLOC yields total memory allocated (in use).
 *	typemask VMEM_FREE yields total memory free (available).
 *	typemask (VMEM_ALLOC | VMEM_FREE) yields total arena size.
vmem_size(vmem_t *vmp, int typemask)
	if (typemask & VMEM_ALLOC)
		size += vmp->vm_kstat.vk_mem_inuse;
	if (typemask & VMEM_FREE)
		size += vmp->vm_kstat.vk_mem_total -
		    vmp->vm_kstat.vk_mem_inuse;
	return ((size_t)size);
 * Create an arena called name whose initial span is [base, base + size).
 * The arena's natural unit of currency is quantum, so vmem_alloc()
 * guarantees quantum-aligned results.  The arena may import new spans
 * by invoking afunc() on source, and may return those spans by invoking
 * ffunc() on source.  To make small allocations fast and scalable,
 * the arena offers high-performance caching for each integer multiple
 * of quantum up to qcache_max.
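 *
 * For example (a sketch; the name, region, and sizes are illustrative), an
 * arena managing a 1MB region with a 4K quantum and quantum caching for
 * requests up to 16K could be created and used like this:
 *
 *	vmem_t *arena = vmem_create("example", region, 1024 * 1024, 4096,
 *	    NULL, NULL, NULL, 16384, VM_NOSLEEP);
 *	void *p = vmem_alloc(arena, 8192, VM_NOSLEEP);
 *	...
 *	vmem_free(arena, p, 8192);
 *	vmem_destroy(arena);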
vmem_create(const char *name, void *base, size_t size, size_t quantum,
    vmem_alloc_t *afunc, vmem_free_t *ffunc, vmem_t *source,
    size_t qcache_max, int vmflag)
	vmem_t *vmp, *cur, **vmpp;
	vmem_freelist_t *vfp;
	uint32_t id = atomic_add_32_nv(&vmem_id, 1);

	if (vmem_vmem_arena != NULL) {
		vmp = vmem_alloc(vmem_vmem_arena, sizeof (vmem_t),
		    vmflag & VM_UMFLAGS);
	} else {
		ASSERT(id <= VMEM_INITIAL);
		vmp = &vmem0[id - 1];

	bzero(vmp, sizeof (vmem_t));

	(void) snprintf(vmp->vm_name, VMEM_NAMELEN, "%s", name);
	(void) mutex_init(&vmp->vm_lock, USYNC_THREAD, NULL);
	(void) cond_init(&vmp->vm_cv, USYNC_THREAD, NULL);
	vmp->vm_cflags = vmflag;
	vmflag &= VM_UMFLAGS;

	vmp->vm_quantum = quantum;
	vmp->vm_qshift = highbit(quantum) - 1;
	nqcache = MIN(qcache_max >> vmp->vm_qshift, VMEM_NQCACHE_MAX);

	for (i = 0; i <= VMEM_FREELISTS; i++) {
		vfp = &vmp->vm_freelist[i];
		vfp->vs_end = 1UL << i;
		vfp->vs_knext = (vmem_seg_t *)(vfp + 1);
		vfp->vs_kprev = (vmem_seg_t *)(vfp - 1);

	vmp->vm_freelist[0].vs_kprev = NULL;
	vmp->vm_freelist[VMEM_FREELISTS].vs_knext = NULL;
	vmp->vm_freelist[VMEM_FREELISTS].vs_end = 0;
	vmp->vm_hash_table = vmp->vm_hash0;
	vmp->vm_hash_mask = VMEM_HASH_INITIAL - 1;
	vmp->vm_hash_shift = highbit(vmp->vm_hash_mask);

	vsp = &vmp->vm_seg0;
	vsp->vs_anext = vsp;
	vsp->vs_aprev = vsp;
	vsp->vs_knext = vsp;
	vsp->vs_kprev = vsp;
	vsp->vs_type = VMEM_SPAN;

	vsp = &vmp->vm_rotor;
	vsp->vs_type = VMEM_ROTOR;
	VMEM_INSERT(&vmp->vm_seg0, vsp, a);

		vmp->vm_kstat.vk_source_id = source->vm_id;
	vmp->vm_source = source;
	vmp->vm_source_alloc = afunc;
	vmp->vm_source_free = ffunc;

	vmp->vm_qcache_max = nqcache << vmp->vm_qshift;
	for (i = 0; i < nqcache; i++) {
		char buf[VMEM_NAMELEN + 21];
		(void) snprintf(buf, sizeof (buf), "%s_%lu",
		    vmp->vm_name, (long)((i + 1) * quantum));
		vmp->vm_qcache[i] = umem_cache_create(buf,
		    (i + 1) * quantum, quantum, NULL, NULL, NULL,
		    NULL, vmp, UMC_QCACHE | UMC_NOTOUCH);
		if (vmp->vm_qcache[i] == NULL) {
			vmp->vm_qcache_max = i * quantum;

	(void) mutex_lock(&vmem_list_lock);
	while ((cur = *vmpp) != NULL)
		vmpp = &cur->vm_next;
	(void) mutex_unlock(&vmem_list_lock);

	if (vmp->vm_cflags & VMC_POPULATOR) {
		uint_t pop_id = atomic_add_32_nv(&vmem_populators, 1);
		ASSERT(pop_id <= VMEM_INITIAL);
		vmem_populator[pop_id - 1] = vmp;
		(void) mutex_lock(&vmp->vm_lock);
		(void) vmem_populate(vmp, vmflag | VM_PANIC);
		(void) mutex_unlock(&vmp->vm_lock);

	if ((base || size) && vmem_add(vmp, base, size, vmflag) == NULL) {
 * Destroy arena vmp.
vmem_destroy(vmem_t *vmp)
	vmem_t *cur, **vmpp;
	vmem_seg_t *seg0 = &vmp->vm_seg0;

	(void) mutex_lock(&vmem_list_lock);
	while ((cur = *vmpp) != vmp)
		vmpp = &cur->vm_next;
	*vmpp = vmp->vm_next;
	(void) mutex_unlock(&vmem_list_lock);

	for (i = 0; i < VMEM_NQCACHE_MAX; i++)
		if (vmp->vm_qcache[i])
			umem_cache_destroy(vmp->vm_qcache[i]);

	leaked = vmem_size(vmp, VMEM_ALLOC);
		umem_printf("vmem_destroy('%s'): leaked %lu bytes",
		    vmp->vm_name, leaked);

	if (vmp->vm_hash_table != vmp->vm_hash0)
		vmem_free(vmem_hash_arena, vmp->vm_hash_table,
		    (vmp->vm_hash_mask + 1) * sizeof (void *));

	 * Give back the segment structures for anything that's left in the
	 * arena, e.g. the primary spans and their free segments.
	VMEM_DELETE(&vmp->vm_rotor, a);
	for (vsp = seg0->vs_anext; vsp != seg0; vsp = vsp->vs_anext)
		vmem_putseg_global(vsp);

	while (vmp->vm_nsegfree > 0)
		vmem_putseg_global(vmem_getseg(vmp));

	(void) mutex_destroy(&vmp->vm_lock);
	(void) cond_destroy(&vmp->vm_cv);
	vmem_free(vmem_vmem_arena, vmp, sizeof (vmem_t));
 * Resize vmp's hash table to keep the average lookup depth near 1.0.
vmem_hash_rescale(vmem_t *vmp)
	vmem_seg_t **old_table, **new_table, *vsp;
	size_t old_size, new_size, h, nseg;

	nseg = (size_t)(vmp->vm_kstat.vk_alloc - vmp->vm_kstat.vk_free);

	new_size = MAX(VMEM_HASH_INITIAL, 1 << (highbit(3 * nseg + 4) - 2));
	old_size = vmp->vm_hash_mask + 1;

	if ((old_size >> 1) <= new_size && new_size <= (old_size << 1))

	new_table = vmem_alloc(vmem_hash_arena, new_size * sizeof (void *),
	if (new_table == NULL)

	bzero(new_table, new_size * sizeof (void *));

	(void) mutex_lock(&vmp->vm_lock);

	old_size = vmp->vm_hash_mask + 1;
	old_table = vmp->vm_hash_table;

	vmp->vm_hash_mask = new_size - 1;
	vmp->vm_hash_table = new_table;
	vmp->vm_hash_shift = highbit(vmp->vm_hash_mask);

	for (h = 0; h < old_size; h++) {
		while (vsp != NULL) {
			uintptr_t addr = vsp->vs_start;
			vmem_seg_t *next_vsp = vsp->vs_knext;
			vmem_seg_t **hash_bucket = VMEM_HASH(vmp, addr);
			vsp->vs_knext = *hash_bucket;

	(void) mutex_unlock(&vmp->vm_lock);

	if (old_table != vmp->vm_hash0)
		vmem_free(vmem_hash_arena, old_table,
		    old_size * sizeof (void *));
1639 vmem_update(void *dummy
)
1643 (void) mutex_lock(&vmem_list_lock
);
1644 for (vmp
= vmem_list
; vmp
!= NULL
; vmp
= vmp
->vm_next
) {
1646 * If threads are waiting for resources, wake them up
1647 * periodically so they can issue another vmem_reap()
1648 * to reclaim resources cached by the slab allocator.
1650 (void) cond_broadcast(&vmp
->vm_cv
);
1653 * Rescale the hash table to keep the hash chains short.
1655 vmem_hash_rescale(vmp
);
1657 (void) mutex_unlock(&vmem_list_lock
);
 * If vmem_init is called again, we need to be able to reset the world.
 * That includes resetting the statics back to their original values.
#ifdef UMEM_STANDALONE
	vmem_populators = 0;
	vmem_segfree = NULL;
	vmem_internal_arena = NULL;
	vmem_seg_arena = NULL;
	vmem_hash_arena = NULL;
	vmem_vmem_arena = NULL;
	vmem_heap_alloc = NULL;
	vmem_heap_free = NULL;

	bzero(vmem0, sizeof (vmem0));
	bzero(vmem_populator, sizeof (vmem_populator));
	bzero(vmem_seg0, sizeof (vmem_seg0));
 * Prepare vmem for use.
vmem_init(const char *parent_name, size_t parent_quantum,
    vmem_alloc_t *parent_alloc, vmem_free_t *parent_free,
    const char *heap_name, void *heap_start, size_t heap_size,
    size_t heap_quantum, vmem_alloc_t *heap_alloc, vmem_free_t *heap_free)
	int nseg = VMEM_SEG_INITIAL;
	vmem_t *parent, *heap;

	ASSERT(vmem_internal_arena == NULL);

		vmem_putseg_global(&vmem_seg0[nseg]);

	if (parent_name != NULL) {
		parent = vmem_create(parent_name,
		    heap_start, heap_size, parent_quantum,
		    NULL, NULL, NULL, 0,
		    VM_SLEEP | VMC_POPULATOR);
	} else {
		ASSERT(parent_alloc == NULL && parent_free == NULL);

	heap = vmem_create(heap_name,
	    heap_start, heap_size, heap_quantum,
	    parent_alloc, parent_free, parent, 0,
	    VM_SLEEP | VMC_POPULATOR);

	vmem_heap_alloc = heap_alloc;
	vmem_heap_free = heap_free;

	vmem_internal_arena = vmem_create("vmem_internal",
	    NULL, 0, heap_quantum,
	    heap_alloc, heap_free, heap, 0,
	    VM_SLEEP | VMC_POPULATOR);

	vmem_seg_arena = vmem_create("vmem_seg",
	    NULL, 0, heap_quantum,
	    vmem_alloc, vmem_free, vmem_internal_arena, 0,
	    VM_SLEEP | VMC_POPULATOR);

	vmem_hash_arena = vmem_create("vmem_hash",
	    vmem_alloc, vmem_free, vmem_internal_arena, 0,

	vmem_vmem_arena = vmem_create("vmem_vmem",
	    vmem0, sizeof (vmem0), 1,
	    vmem_alloc, vmem_free, vmem_internal_arena, 0,

	for (id = 0; id < vmem_id; id++)
		(void) vmem_xalloc(vmem_vmem_arena, sizeof (vmem_t),
		    1, 0, 0, &vmem0[id], &vmem0[id + 1],
		    VM_NOSLEEP | VM_BESTFIT | VM_PANIC);

	 * This size must be a multiple of the minimum required alignment,
	 * since vmem_populate allocates them compactly.
	vmem_seg_size = P2ROUNDUP(offsetof(vmem_seg_t, vs_thread),
 * Lockup and release, for fork1(2) handling.
	(void) mutex_lock(&vmem_list_lock);
	(void) mutex_lock(&vmem_nosleep_lock.vmpl_mutex);

	 * Lock up and broadcast all arenas.
	for (cur = vmem_list; cur != NULL; cur = cur->vm_next) {
		(void) mutex_lock(&cur->vm_lock);
		(void) cond_broadcast(&cur->vm_cv);

	(void) mutex_lock(&vmem_segfree_lock);

	(void) mutex_unlock(&vmem_nosleep_lock.vmpl_mutex);

	for (cur = vmem_list; cur != NULL; cur = cur->vm_next)
		(void) mutex_unlock(&cur->vm_lock);

	(void) mutex_unlock(&vmem_segfree_lock);
	(void) mutex_unlock(&vmem_list_lock);