 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * This file implements the getvmusage() private system call.
 * getvmusage() counts the resident memory pages and the swap
 * reserved by the specified process collective.  A "process collective" is
 * the set of processes owned by a particular zone, project, task, or user.
 *
 * rss and swap are counted so that for a given process collective, a page is
 * only counted once.  For example, this means that if multiple processes in
 * the same project map the same page, then the project will only be charged
 * once for that page.  On the other hand, if two processes in different
 * projects map the same page, then both projects will be charged
 * for the page.
 *
 * The vm_getusage() calculation is implemented so that the first thread
 * performs the rss/swap counting.  Other callers will wait for that thread
 * to finish and copy its results.  This enables multiple rcapds and prstats
 * to consume data from the same calculation.  The results are also cached so
 * that a caller interested in recent results can just copy them instead of
 * starting a new calculation.  The caller passes the maximum age (in seconds)
 * of the data.  If the cached data is young enough, the cache is copied;
 * otherwise, a new calculation is executed and the cache is replaced with
 * the new data.
 *
 * The rss calculation for each process collective is as follows:
 *
 *   - Inspect flags, determine if counting rss for zones, projects, tasks,
 *     and/or users.
 *   - For each proc:
 *      - Figure out proc's collectives (zone, project, task, and/or user).
 *      - For each seg in proc's address space:
 *          - If seg is private:
 *              - Lookup anons in the amp.
 *              - For incore pages not previously visited for each of the
 *                proc's collectives, add incore pagesize to each collective.
 *                Anons with a refcnt of 1 can be assumed to be not
 *                previously visited.
 *              - For address ranges without anons in the amp:
 *                  - Lookup pages in underlying vnode.
 *                  - For incore pages not previously visited for
 *                    each of the proc's collectives, add incore
 *                    pagesize to each collective.
 *          - If seg is shared:
 *              - Lookup pages in the shared amp or vnode.
 *              - For incore pages not previously visited for each of
 *                the proc's collectives, add incore pagesize to each
 *                collective.
 *
 * Swap is reserved by private segments, and shared anonymous segments.
 * The only shared anon segments which do not reserve swap are ISM segments
 * and schedctl segments, both of which can be identified by having a
 * swap reservation (swresv) of 0.
 *
 * The swap calculation for each collective is as follows:
 *
 *   - Inspect flags, determine if counting rss for zones, projects, tasks,
 *     and/or users.
 *   - For each proc:
 *      - Figure out proc's collectives (zone, project, task, and/or user).
 *      - For each seg in proc's address space:
 *          - If seg is private:
 *              - Add svd->swresv pages to swap count for each of the
 *                proc's collectives.
 *          - If seg is anon, shared, and amp->swresv != 0
 *              - For address ranges in amp not previously visited for
 *                each of the proc's collectives, add size of address
 *                range to the swap count for each collective.
 *
 * These two calculations are done simultaneously, with most of the work
 * being done in vmu_calculate_seg().  The results of the calculation are
 * copied into "vmu_data.vmu_cache_results".
 *
 * To perform the calculation, various things are tracked and cached:
 *
 *    - incore/not-incore page ranges for all vnodes.
 *      (vmu_data.vmu_all_vnodes_hash)
 *      This eliminates looking up the same page more than once.
 *
 *    - incore/not-incore page ranges for all shared amps.
 *      (vmu_data.vmu_all_amps_hash)
 *      This eliminates looking up the same page more than once.
 *
 *    - visited page ranges for each collective.
 *        - per vnode (entity->vme_vnode_hash)
 *        - per shared amp (entity->vme_amp_hash)
 *      For accurate counting of map-shared and COW-shared pages.
 *
 *    - visited private anons (refcnt > 1) for each collective.
 *      (entity->vme_anon_hash)
 *      For accurate counting of COW-shared pages.
 *
 * The common accounting structure is the vmu_entity_t, which represents
 * collectives:
 *
 *    - A zone.
 *    - A project, task, or user within a zone.
 *    - The entire system (vmu_data.vmu_system).
 *    - Each collapsed (col) project and user.  This means a given projid or
 *      uid, regardless of which zone the process is in.  For instance,
 *      project 0 in the global zone and project 0 in a non-global zone are
 *      the same collapsed project.
 *
 * Each entity structure tracks which pages have already been visited for
 * that entity (via previously inspected processes) so that these pages are
 * not double counted.
 */
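
/*
 * Illustrative userland usage (a sketch, not part of this file): consumers
 * such as prstat and rcapd call the getvmusage() libc wrapper, typically
 * sizing the result buffer first and then fetching the results:
 *
 *	size_t nres = 0;
 *	(void) getvmusage(VMUSAGE_ALL_ZONES, 30, NULL, &nres);
 *	vmusage_t *buf = malloc(nres * sizeof (vmusage_t));
 *	if (getvmusage(VMUSAGE_ALL_ZONES, 30, buf, &nres) != 0)
 *		handle EINVAL, EFAULT, EINTR, or ENOSPC (buffer too small);
 *	free(buf);
 */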
#include <sys/errno.h>
#include <sys/types.h>
#include <sys/zone.h>
#include <sys/proc.h>
#include <sys/project.h>
#include <sys/task.h>
#include <sys/thread.h>
#include <sys/time.h>
#include <sys/mman.h>
#include <sys/modhash.h>
#include <sys/modhash_impl.h>
#include <sys/swap.h>
#include <sys/synch.h>
#include <sys/systm.h>
#include <sys/var.h>
#include <sys/vm_usage.h>
#include <sys/zone.h>
#include <sys/sunddi.h>
#include <sys/avl.h>
#include <vm/anon.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/seg_vn.h>
#include <vm/seg_spt.h>
#define	VMUSAGE_HASH_SIZE		512

#define	VMUSAGE_TYPE_VNODE		1
#define	VMUSAGE_TYPE_AMP		2
#define	VMUSAGE_TYPE_ANON		3

#define	VMUSAGE_BOUND_UNKNOWN		0
#define	VMUSAGE_BOUND_INCORE		1
#define	VMUSAGE_BOUND_NOT_INCORE	2

#define	ISWITHIN(node, addr)	((node)->vmb_start <= addr && \
				(node)->vmb_end >= addr ? 1 : 0)
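
/*
 * Illustrative note: both endpoints are inclusive, so a bound covering pages
 * [10, 20] satisfies ISWITHIN(bound, 10) and ISWITHIN(bound, 20), but not
 * ISWITHIN(bound, 21).
 */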
/*
 * bounds for vnodes and shared amps
 * Each bound is either entirely incore, entirely not in core, or
 * entirely unknown.  bounds are stored in an avl tree sorted by start member
 * when in use, otherwise (free or temporary lists) they're strung
 * together off of vmb_next.
 */
typedef struct vmu_bound {
    avl_node_t vmb_node;	/* AVL tree linkage */
    struct vmu_bound *vmb_next; /* NULL in tree else on free or temp list */
    pgcnt_t vmb_start;	/* page offset in vnode/amp on which bound starts */
    pgcnt_t vmb_end;	/* page offset in vnode/amp on which bound ends */
    char    vmb_type;	/* One of VMUSAGE_BOUND_* */
} vmu_bound_t;
/*
 * hash of visited objects (vnodes or shared amps)
 * key is address of vnode or amp.  Bounds lists known incore/non-incore
 * bounds for vnode/amp.
 */
typedef struct vmu_object {
    struct vmu_object *vmo_next;	/* free list */
    caddr_t	vmo_key;
    short	vmo_type;
    avl_tree_t	vmo_bounds;
} vmu_object_t;
/*
 * Entity by which to count results.
 *
 * The entity structure keeps the current rss/swap counts for each entity
 * (zone, project, etc), and hashes of vm structures that have already
 * been visited for the entity.
 *
 * vme_next:	links the list of all entities currently being counted by
 *		vmu_calculate().
 *
 * vme_next_calc: links the list of entities related to the current process
 *		being counted by vmu_calculate_proc().
 *
 * vmu_calculate_proc() walks all processes.  For each process, it makes a
 * list of the entities related to that process using vme_next_calc.  This
 * list changes each time vmu_calculate_proc() is called.
 */
typedef struct vmu_entity {
    struct vmu_entity *vme_next;
    struct vmu_entity *vme_next_calc;
    mod_hash_t	*vme_vnode_hash; /* vnodes visited for entity */
    mod_hash_t	*vme_amp_hash;	 /* shared amps visited for entity */
    mod_hash_t	*vme_anon_hash;	 /* COW anons visited for entity */
    vmusage_t	vme_result;	 /* identifies entity and results */
} vmu_entity_t;
/*
 * Hash of entities visited within a zone, and an entity for the zone
 * itself.
 */
typedef struct vmu_zone {
    struct vmu_zone *vmz_next;	/* free list */
    id_t	vmz_id;
    vmu_entity_t *vmz_zone;
    mod_hash_t	*vmz_projects_hash;
    mod_hash_t	*vmz_tasks_hash;
    mod_hash_t	*vmz_rusers_hash;
    mod_hash_t	*vmz_eusers_hash;
} vmu_zone_t;
/*
 * Cache of results from last calculation
 */
typedef struct vmu_cache {
    vmusage_t	*vmc_results;	/* Results from last call to vm_getusage() */
    uint64_t	vmc_nresults;	/* Count of cached results */
    uint64_t	vmc_refcnt;	/* refcnt for free */
    uint_t	vmc_flags;	/* Flags for vm_getusage() */
    hrtime_t	vmc_timestamp;	/* when cache was created */
} vmu_cache_t;
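
/*
 * Illustrative note: a caller passing age=30 to vm_getusage() accepts any
 * cache whose vmc_timestamp is within the last 30 seconds and whose
 * vmc_flags cover the requested flags; otherwise a fresh calculation is
 * started and the cache is replaced.
 */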
/*
 * top level rss info for the system
 */
typedef struct vmu_data {
    kmutex_t	vmu_lock;		/* Protects vmu_data */
    kcondvar_t	vmu_cv;			/* Used to signal threads */
					/* waiting for the */
					/* Rss_calc_thread to finish */
    vmu_entity_t *vmu_system;		/* Entity for tracking */
					/* rss/swap for all processes */
    mod_hash_t	*vmu_zones_hash;	/* Zones visited */
    mod_hash_t	*vmu_projects_col_hash;	/* These *_col_hash hashes */
    mod_hash_t	*vmu_rusers_col_hash;	/* keep track of entities, */
    mod_hash_t	*vmu_eusers_col_hash;	/* ignoring zoneid, in order */
					/* to implement VMUSAGE_COL_* */
					/* flags, which aggregate by */
					/* project or user regardless */
					/* of zoneid. */
    mod_hash_t	*vmu_all_vnodes_hash;	/* System wide visited vnodes */
					/* to track incore/not-incore */
    mod_hash_t	*vmu_all_amps_hash;	/* System wide visited shared */
					/* amps to track incore/not- */
					/* incore */
    vmu_entity_t *vmu_entities;		/* Linked list of entities */
    size_t	vmu_nentities;		/* Count of entities in list */
    vmu_cache_t	*vmu_cache;		/* Cached results */
    kthread_t	*vmu_calc_thread;	/* NULL, or thread running */
					/* vmu_calculate() */
    uint_t	vmu_calc_flags;		/* Flags being used by */
					/* currently running calc */
    uint_t	vmu_pending_flags;	/* Flags of vm_getusage() */
					/* threads waiting for */
					/* calc thread to finish */
    uint_t	vmu_pending_waiters;	/* Number of threads waiting */
					/* for calc thread */
    vmu_bound_t	*vmu_free_bounds;
    vmu_object_t *vmu_free_objects;
    vmu_entity_t *vmu_free_entities;
    vmu_zone_t	*vmu_free_zones;
} vmu_data_t;
extern struct as kas;
extern proc_t *practive;
extern zone_t *global_zone;
extern const struct seg_ops segvn_ops;
extern const struct seg_ops segspt_shmops;

static vmu_data_t vmu_data;
static kmem_cache_t *vmu_bound_cache;
static kmem_cache_t *vmu_object_cache;
/*
 * Comparison routine for AVL tree.  We base our comparison on vmb_start.
 */
static int
bounds_cmp(const void *bnd1, const void *bnd2)
{
    const vmu_bound_t *bound1 = bnd1;
    const vmu_bound_t *bound2 = bnd2;

    if (bound1->vmb_start == bound2->vmb_start) {
        return (0);
    }
    if (bound1->vmb_start < bound2->vmb_start) {
        return (-1);
    }

    return (1);
}
/*
 * Save a bound on the free list.
 */
static void
vmu_free_bound(vmu_bound_t *bound)
{
    bound->vmb_next = vmu_data.vmu_free_bounds;
    bound->vmb_start = 0;
    vmu_data.vmu_free_bounds = bound;
}
/*
 * Free an object, and all visited bound info.
 */
static void
vmu_free_object(mod_hash_val_t val)
{
    vmu_object_t *obj = (vmu_object_t *)val;
    avl_tree_t *tree = &(obj->vmo_bounds);
    vmu_bound_t *bound;
    void *cookie = NULL;

    while ((bound = avl_destroy_nodes(tree, &cookie)) != NULL)
        vmu_free_bound(bound);

    obj->vmo_next = vmu_data.vmu_free_objects;
    vmu_data.vmu_free_objects = obj;
}
/*
 * Free an entity, and hashes of visited objects for that entity.
 */
static void
vmu_free_entity(mod_hash_val_t val)
{
    vmu_entity_t *entity = (vmu_entity_t *)val;

    if (entity->vme_vnode_hash != NULL)
        i_mod_hash_clear_nosync(entity->vme_vnode_hash);
    if (entity->vme_amp_hash != NULL)
        i_mod_hash_clear_nosync(entity->vme_amp_hash);
    if (entity->vme_anon_hash != NULL)
        i_mod_hash_clear_nosync(entity->vme_anon_hash);

    entity->vme_next = vmu_data.vmu_free_entities;
    vmu_data.vmu_free_entities = entity;
}
/*
 * Free zone entity, and all hashes of entities inside that zone,
 * which are projects, tasks, and users.
 */
static void
vmu_free_zone(mod_hash_val_t val)
{
    vmu_zone_t *zone = (vmu_zone_t *)val;

    if (zone->vmz_zone != NULL) {
        vmu_free_entity((mod_hash_val_t)zone->vmz_zone);
        zone->vmz_zone = NULL;
    }
    if (zone->vmz_projects_hash != NULL)
        i_mod_hash_clear_nosync(zone->vmz_projects_hash);
    if (zone->vmz_tasks_hash != NULL)
        i_mod_hash_clear_nosync(zone->vmz_tasks_hash);
    if (zone->vmz_rusers_hash != NULL)
        i_mod_hash_clear_nosync(zone->vmz_rusers_hash);
    if (zone->vmz_eusers_hash != NULL)
        i_mod_hash_clear_nosync(zone->vmz_eusers_hash);
    zone->vmz_next = vmu_data.vmu_free_zones;
    vmu_data.vmu_free_zones = zone;
}
/*
 * Initialize synchronization primitives and hashes for system-wide tracking
 * of visited vnodes and shared amps.  Initialize results cache.
 */
static void
vmu_init(void)
{
    mutex_init(&vmu_data.vmu_lock, NULL, MUTEX_DEFAULT, NULL);
    cv_init(&vmu_data.vmu_cv, NULL, CV_DEFAULT, NULL);

    vmu_data.vmu_system = NULL;
    vmu_data.vmu_zones_hash = NULL;
    vmu_data.vmu_projects_col_hash = NULL;
    vmu_data.vmu_rusers_col_hash = NULL;
    vmu_data.vmu_eusers_col_hash = NULL;

    vmu_data.vmu_free_bounds = NULL;
    vmu_data.vmu_free_objects = NULL;
    vmu_data.vmu_free_entities = NULL;
    vmu_data.vmu_free_zones = NULL;

    vmu_data.vmu_all_vnodes_hash = mod_hash_create_ptrhash(
        "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object,
        sizeof (vnode_t));
    vmu_data.vmu_all_amps_hash = mod_hash_create_ptrhash(
        "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object,
        sizeof (struct anon_map));
    vmu_data.vmu_projects_col_hash = mod_hash_create_idhash(
        "vmusage collapsed project hash", VMUSAGE_HASH_SIZE,
        vmu_free_entity);
    vmu_data.vmu_rusers_col_hash = mod_hash_create_idhash(
        "vmusage collapsed ruser hash", VMUSAGE_HASH_SIZE,
        vmu_free_entity);
    vmu_data.vmu_eusers_col_hash = mod_hash_create_idhash(
        "vmusage collapsed euser hash", VMUSAGE_HASH_SIZE,
        vmu_free_entity);
    vmu_data.vmu_zones_hash = mod_hash_create_idhash(
        "vmusage zone hash", VMUSAGE_HASH_SIZE, vmu_free_zone);

    vmu_bound_cache = kmem_cache_create("vmu_bound_cache",
        sizeof (vmu_bound_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
    vmu_object_cache = kmem_cache_create("vmu_object_cache",
        sizeof (vmu_object_t), 0, NULL, NULL, NULL, NULL, NULL, 0);

    vmu_data.vmu_entities = NULL;
    vmu_data.vmu_nentities = 0;

    vmu_data.vmu_cache = NULL;
    vmu_data.vmu_calc_thread = NULL;
    vmu_data.vmu_calc_flags = 0;
    vmu_data.vmu_pending_flags = 0;
    vmu_data.vmu_pending_waiters = 0;
}
/*
 * Allocate hashes for tracking vm objects visited for an entity.
 * Update list of entities.
 */
static vmu_entity_t *
vmu_alloc_entity(id_t id, int type, id_t zoneid)
{
    vmu_entity_t *entity;

    if (vmu_data.vmu_free_entities != NULL) {
        entity = vmu_data.vmu_free_entities;
        vmu_data.vmu_free_entities =
            vmu_data.vmu_free_entities->vme_next;
        bzero(&entity->vme_result, sizeof (vmusage_t));
    } else {
        entity = kmem_zalloc(sizeof (vmu_entity_t), KM_SLEEP);
    }
    entity->vme_result.vmu_id = id;
    entity->vme_result.vmu_zoneid = zoneid;
    entity->vme_result.vmu_type = type;

    if (entity->vme_vnode_hash == NULL)
        entity->vme_vnode_hash = mod_hash_create_ptrhash(
            "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object,
            sizeof (vnode_t));

    if (entity->vme_amp_hash == NULL)
        entity->vme_amp_hash = mod_hash_create_ptrhash(
            "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object,
            sizeof (struct anon_map));

    if (entity->vme_anon_hash == NULL)
        entity->vme_anon_hash = mod_hash_create_ptrhash(
            "vmusage anon hash", VMUSAGE_HASH_SIZE,
            mod_hash_null_valdtor, sizeof (struct anon));

    entity->vme_next = vmu_data.vmu_entities;
    vmu_data.vmu_entities = entity;
    vmu_data.vmu_nentities++;

    return (entity);
}
/*
 * Allocate a zone entity, and hashes for tracking visited vm objects
 * for projects, tasks, and users within that zone.
 */
static vmu_zone_t *
vmu_alloc_zone(id_t id)
{
    vmu_zone_t *zone;

    if (vmu_data.vmu_free_zones != NULL) {
        zone = vmu_data.vmu_free_zones;
        vmu_data.vmu_free_zones =
            vmu_data.vmu_free_zones->vmz_next;
        zone->vmz_next = NULL;
        zone->vmz_zone = NULL;
    } else {
        zone = kmem_zalloc(sizeof (vmu_zone_t), KM_SLEEP);
    }

    zone->vmz_id = id;

    if ((vmu_data.vmu_calc_flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) != 0)
        zone->vmz_zone = vmu_alloc_entity(id, VMUSAGE_ZONE, id);

    if ((vmu_data.vmu_calc_flags & (VMUSAGE_PROJECTS |
        VMUSAGE_ALL_PROJECTS)) != 0 && zone->vmz_projects_hash == NULL)
        zone->vmz_projects_hash = mod_hash_create_idhash(
            "vmusage project hash", VMUSAGE_HASH_SIZE, vmu_free_entity);

    if ((vmu_data.vmu_calc_flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
        != 0 && zone->vmz_tasks_hash == NULL)
        zone->vmz_tasks_hash = mod_hash_create_idhash(
            "vmusage task hash", VMUSAGE_HASH_SIZE, vmu_free_entity);

    if ((vmu_data.vmu_calc_flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS))
        != 0 && zone->vmz_rusers_hash == NULL)
        zone->vmz_rusers_hash = mod_hash_create_idhash(
            "vmusage ruser hash", VMUSAGE_HASH_SIZE, vmu_free_entity);

    if ((vmu_data.vmu_calc_flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS))
        != 0 && zone->vmz_eusers_hash == NULL)
        zone->vmz_eusers_hash = mod_hash_create_idhash(
            "vmusage euser hash", VMUSAGE_HASH_SIZE, vmu_free_entity);

    return (zone);
}
/*
 * Allocate a structure for tracking visited bounds for a vm object.
 */
static vmu_object_t *
vmu_alloc_object(caddr_t key, int type)
{
    vmu_object_t *object;

    if (vmu_data.vmu_free_objects != NULL) {
        object = vmu_data.vmu_free_objects;
        vmu_data.vmu_free_objects =
            vmu_data.vmu_free_objects->vmo_next;
    } else {
        object = kmem_cache_alloc(vmu_object_cache, KM_SLEEP);
    }

    object->vmo_next = NULL;
    object->vmo_key = key;
    object->vmo_type = type;
    avl_create(&(object->vmo_bounds), bounds_cmp, sizeof (vmu_bound_t), 0);

    return (object);
}
/*
 * Allocate and return a bound structure.
 */
static vmu_bound_t *
vmu_alloc_bound(void)
{
    vmu_bound_t *bound;

    if (vmu_data.vmu_free_bounds != NULL) {
        bound = vmu_data.vmu_free_bounds;
        vmu_data.vmu_free_bounds =
            vmu_data.vmu_free_bounds->vmb_next;
    } else {
        bound = kmem_cache_alloc(vmu_bound_cache, KM_SLEEP);
    }

    bound->vmb_next = NULL;
    bound->vmb_start = 0;
    return (bound);
}
/*
 * vmu_find_insert_* functions implement hash lookup or allocate and
 * insert operations.
 */
static vmu_object_t *
vmu_find_insert_object(mod_hash_t *hash, caddr_t key, uint_t type)
{
    int ret;
    vmu_object_t *object;

    ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key,
        (mod_hash_val_t *)&object);
    if (ret != 0) {
        object = vmu_alloc_object(key, type);
        ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key,
            (mod_hash_val_t)object, (mod_hash_hndl_t)0);
        ASSERT(ret == 0);
    }
    return (object);
}

static int
vmu_find_insert_anon(mod_hash_t *hash, caddr_t key)
{
    int ret;
    caddr_t val;

    ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key,
        (mod_hash_val_t *)&val);
    if (ret == 0)
        return (0);

    ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key,
        (mod_hash_val_t)key, (mod_hash_hndl_t)0);
    ASSERT(ret == 0);

    return (1);
}

static vmu_entity_t *
vmu_find_insert_entity(mod_hash_t *hash, id_t id, uint_t type, id_t zoneid)
{
    int ret;
    vmu_entity_t *entity;

    ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)(uintptr_t)id,
        (mod_hash_val_t *)&entity);
    if (ret != 0) {
        entity = vmu_alloc_entity(id, type, zoneid);
        ret = i_mod_hash_insert_nosync(hash,
            (mod_hash_key_t)(uintptr_t)id, (mod_hash_val_t)entity,
            (mod_hash_hndl_t)0);
        ASSERT(ret == 0);
    }
    return (entity);
}
/*
 * Returns list of object bounds between start and end.  New bounds inserted
 * by this call are given type.
 *
 * Returns the number of pages covered if new bounds are created.  Returns 0
 * if region between start/end consists of all existing bounds.
 */
static pgcnt_t
vmu_insert_lookup_object_bounds(vmu_object_t *ro, pgcnt_t start, pgcnt_t
    end, char type, vmu_bound_t **first, vmu_bound_t **last)
{
    avl_tree_t *tree = &(ro->vmo_bounds);
    avl_index_t where;
    vmu_bound_t *walker, *tmp;
    pgcnt_t ret = 0;

    ASSERT(start <= end);

    *first = *last = NULL;

    tmp = vmu_alloc_bound();
    tmp->vmb_start = start;
    tmp->vmb_type = type;

    /* Hopelessly optimistic case. */
    if (walker = avl_find(tree, tmp, &where)) {
        /* We found the start point in the map. */
        *first = walker;
    }

    if (walker == NULL) {
        /* Is start in the previous node? */
        walker = avl_nearest(tree, where, AVL_BEFORE);
        if (walker != NULL) {
            if (ISWITHIN(walker, start)) {
                /* We found start. */
                *first = walker;
            }
        }
    }

    /*
     * At this point, if *first is still NULL, then we
     * didn't get a direct hit and start isn't covered
     * by the previous node. We know that the next node
     * must have a greater start value than we require
     * because avl_find tells us where the AVL routines would
     * insert our new node. We have some gap between the
     * start we want and the next node.
     */
    if (*first == NULL) {
        walker = avl_nearest(tree, where, AVL_AFTER);
        if (walker != NULL && walker->vmb_start <= end) {
            /* Fill the gap up to the next node. */
            tmp->vmb_end = walker->vmb_start - 1;
            *first = tmp;
        } else {
            /* We have a gap over [start, end]. */
            tmp->vmb_end = end;
            *first = *last = tmp;
        }
        ret += tmp->vmb_end - tmp->vmb_start + 1;
        avl_insert(tree, tmp, where);
    }

    ASSERT(*first != NULL);

    if (*last != NULL)
        return (ret);

    /*
     * If we are here we still need to set *last and
     * that may involve filling in some gaps.
     */
    *last = *first;
    for (;;) {
        if (ISWITHIN(*last, end))
            break;
        walker = AVL_NEXT(tree, *last);
        if (walker == NULL || walker->vmb_start > end) {
            /* Bottom or mid tree with gap. */
            tmp = vmu_alloc_bound();
            tmp->vmb_start = (*last)->vmb_end + 1;
            tmp->vmb_end = end;
            tmp->vmb_type = type;
            ret += tmp->vmb_end - tmp->vmb_start + 1;
            avl_insert_here(tree, tmp, *last, AVL_AFTER);
            *last = tmp;
            break;
        } else if ((*last)->vmb_end + 1 != walker->vmb_start) {
            /* Non-contiguous. */
            tmp = vmu_alloc_bound();
            tmp->vmb_start = (*last)->vmb_end + 1;
            tmp->vmb_end = walker->vmb_start - 1;
            tmp->vmb_type = type;
            ret += tmp->vmb_end - tmp->vmb_start + 1;
            avl_insert_here(tree, tmp, *last, AVL_AFTER);
            *last = tmp;
        } else {
            *last = walker;
        }
    }

    return (ret);
}
/*
 * vmu_update_bounds()
 *
 * tree: avl_tree in which first and last hang.
 *
 * first, last: list of continuous bounds, of which zero or more are of
 * type VMUSAGE_BOUND_UNKNOWN.
 *
 * new_tree: avl_tree in which new_first and new_last hang.
 *
 * new_first, new_last: list of continuous bounds, of which none are of
 * type VMUSAGE_BOUND_UNKNOWN.  These bounds are used to
 * update the types of bounds in (first,last) with
 * type VMUSAGE_BOUND_UNKNOWN.
 *
 * For the list of bounds (first,last), this function updates any bounds
 * with type VMUSAGE_BOUND_UNKNOWN using the type of the corresponding bound in
 * the list (new_first, new_last).
 *
 * If a bound of type VMUSAGE_BOUND_UNKNOWN spans multiple bounds in the list
 * (new_first, new_last), it will be split into multiple bounds.
 *
 * Returns:
 *	The number of pages in the list of bounds (first,last) that were of
 *	type VMUSAGE_BOUND_UNKNOWN, which have been updated to be of type
 *	VMUSAGE_BOUND_INCORE.
 */
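
/*
 * Illustrative example (not from the original source): if (first,last) is a
 * single UNKNOWN bound covering pages [5, 12], and (new_first, new_last) is
 * [0, 7] INCORE followed by [8, 20] NOT_INCORE, then the bound is split into
 * [5, 7] INCORE and [8, 12] NOT_INCORE, and 3 is returned (pages 5-7).
 */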
static pgcnt_t
vmu_update_bounds(avl_tree_t *tree, vmu_bound_t **first, vmu_bound_t **last,
    avl_tree_t *new_tree, vmu_bound_t *new_first, vmu_bound_t *new_last)
{
    vmu_bound_t *next, *new_next, *tmp;
    pgcnt_t rss = 0;

    next = *first;
    new_next = new_first;

    /*
     * Verify first and last bound are covered by new bounds if they
     * have unknown type.
     */
    ASSERT((*first)->vmb_type != VMUSAGE_BOUND_UNKNOWN ||
        (*first)->vmb_start >= new_first->vmb_start);
    ASSERT((*last)->vmb_type != VMUSAGE_BOUND_UNKNOWN ||
        (*last)->vmb_end <= new_last->vmb_end);
    for (;;) {
        /* If bound already has type, proceed to next bound. */
        if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
            if (next == *last)
                break;
            next = AVL_NEXT(tree, next);
            continue;
        }
        while (new_next->vmb_end < next->vmb_start)
            new_next = AVL_NEXT(new_tree, new_next);
        ASSERT(new_next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
        next->vmb_type = new_next->vmb_type;
        if (new_next->vmb_end < next->vmb_end) {
            /* need to split bound */
            tmp = vmu_alloc_bound();
            tmp->vmb_type = VMUSAGE_BOUND_UNKNOWN;
            tmp->vmb_start = new_next->vmb_end + 1;
            tmp->vmb_end = next->vmb_end;
            avl_insert_here(tree, tmp, next, AVL_AFTER);
            next->vmb_end = new_next->vmb_end;
            if (*last == next)
                *last = tmp;
            if (next->vmb_type == VMUSAGE_BOUND_INCORE)
                rss += next->vmb_end - next->vmb_start + 1;
            next = tmp;
        } else {
            if (next->vmb_type == VMUSAGE_BOUND_INCORE)
                rss += next->vmb_end - next->vmb_start + 1;
            if (next == *last)
                break;
            next = AVL_NEXT(tree, next);
        }
    }

    return (rss);
}
/*
 * Merges adjacent bounds with same type between first and last bound.
 * After merge, last pointer may point to a different bound, as (incoming)
 * last bound may have been merged away.
 */
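
/*
 * Illustrative example (not from the original source): [3, 5] INCORE
 * immediately followed by [6, 9] INCORE collapses into a single [3, 9]
 * INCORE bound.
 */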
static void
vmu_merge_bounds(avl_tree_t *tree, vmu_bound_t **first, vmu_bound_t **last)
{
    vmu_bound_t *current;
    vmu_bound_t *next;

    ASSERT(tree != NULL);
    ASSERT(*first != NULL);
    ASSERT(*last != NULL);

    current = *first;
    while (current != *last) {
        next = AVL_NEXT(tree, current);
        if ((current->vmb_end + 1) == next->vmb_start &&
            current->vmb_type == next->vmb_type) {
            current->vmb_end = next->vmb_end;
            avl_remove(tree, next);
            vmu_free_bound(next);
            if (next == *last)
                *last = current;
        } else {
            current = AVL_NEXT(tree, current);
        }
    }
}
/*
 * Given an amp and a list of bounds, updates each bound's type with
 * VMUSAGE_BOUND_INCORE or VMUSAGE_BOUND_NOT_INCORE.
 *
 * If a bound is partially incore, it will be split into two bounds.
 * first and last may be modified, as bounds may be split into multiple
 * bounds if they are partially incore/not-incore.
 *
 * Set incore to non-zero if bounds are already known to be incore.
 */
static void
vmu_amp_update_incore_bounds(avl_tree_t *tree, struct anon_map *amp,
    vmu_bound_t **first, vmu_bound_t **last, boolean_t incore)
{
    vmu_bound_t *next;
    vmu_bound_t *tmp;
    pgcnt_t index;
    char bound_type;
    char page_type;
    vnode_t *vn;
    anoff_t off;
    struct anon *ap;

    next = *first;
    /* Shared anon slots don't change once set. */
    ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
    for (;;) {
        if (incore == B_TRUE)
            next->vmb_type = VMUSAGE_BOUND_INCORE;

        if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
            if (next == *last)
                break;
            next = AVL_NEXT(tree, next);
            continue;
        }

        bound_type = next->vmb_type;
        index = next->vmb_start;
        while (index <= next->vmb_end) {
            /*
             * These are used to determine how much to increment
             * index when a large page is found.
             */
            page_t *page;
            pgcnt_t pgcnt = 1;
            uint_t pgshft;
            pgcnt_t pgmsk;

            ap = anon_get_ptr(amp->ahp, index);
            if (ap != NULL)
                swap_xlate(ap, &vn, &off);

            if (ap != NULL && vn != NULL && vn_has_cached_data(vn) &&
                (page = page_exists(&vn->v_object, off)) != NULL) {
                page_type = VMUSAGE_BOUND_INCORE;
                if (page->p_szc > 0) {
                    pgcnt = page_get_pagecnt(page->p_szc);
                    pgshft = page_get_shift(page->p_szc);
                    pgmsk = (0x1 << (pgshft - PAGESHIFT)) - 1;
                }
            } else {
                page_type = VMUSAGE_BOUND_NOT_INCORE;
            }
            if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
                next->vmb_type = page_type;
            } else if (next->vmb_type != page_type) {
                /*
                 * If current bound type does not match page
                 * type, need to split off new bound.
                 */
                tmp = vmu_alloc_bound();
                tmp->vmb_type = page_type;
                tmp->vmb_start = index;
                tmp->vmb_end = next->vmb_end;
                avl_insert_here(tree, tmp, next, AVL_AFTER);
                next->vmb_end = index - 1;
                if (*last == next)
                    *last = tmp;
                next = tmp;
            }
            if (pgcnt > 1) {
                /*
                 * If inside large page, jump to next large
                 * page
                 */
                index = (index & ~pgmsk) + pgcnt;
            } else {
                index++;
            }
        }
        if (next == *last) {
            ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
            break;
        } else
            next = AVL_NEXT(tree, next);
    }
    ANON_LOCK_EXIT(&amp->a_rwlock);
}
/*
 * Same as vmu_amp_update_incore_bounds(), except for tracking
 * incore-/not-incore for vnodes.
 */
static void
vmu_vnode_update_incore_bounds(avl_tree_t *tree, vnode_t *vnode,
    vmu_bound_t **first, vmu_bound_t **last)
{
    vmu_bound_t *next;
    vmu_bound_t *tmp;
    pgcnt_t index;
    char bound_type;
    char page_type;

    next = *first;
    for (;;) {
        if (!vn_has_cached_data(vnode))
            next->vmb_type = VMUSAGE_BOUND_NOT_INCORE;

        if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
            if (next == *last)
                break;
            next = AVL_NEXT(tree, next);
            continue;
        }

        bound_type = next->vmb_type;
        index = next->vmb_start;
        while (index <= next->vmb_end) {
            /*
             * These are used to determine how much to increment
             * index when a large page is found.
             */
            page_t *page;
            pgcnt_t pgcnt = 1;
            uint_t pgshft;
            pgcnt_t pgmsk;

            if (vn_has_cached_data(vnode) &&
                (page = page_exists(&vnode->v_object, ptob(index))) != NULL) {
                page_type = VMUSAGE_BOUND_INCORE;
                if (page->p_szc > 0) {
                    pgcnt = page_get_pagecnt(page->p_szc);
                    pgshft = page_get_shift(page->p_szc);
                    pgmsk = (0x1 << (pgshft - PAGESHIFT)) - 1;
                }
            } else {
                page_type = VMUSAGE_BOUND_NOT_INCORE;
            }
            if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
                next->vmb_type = page_type;
            } else if (next->vmb_type != page_type) {
                /*
                 * If current bound type does not match page
                 * type, need to split off new bound.
                 */
                tmp = vmu_alloc_bound();
                tmp->vmb_type = page_type;
                tmp->vmb_start = index;
                tmp->vmb_end = next->vmb_end;
                avl_insert_here(tree, tmp, next, AVL_AFTER);
                next->vmb_end = index - 1;
                if (*last == next)
                    *last = tmp;
                next = tmp;
            }
            if (pgcnt > 1) {
                /*
                 * If inside large page, jump to next large
                 * page
                 */
                index = (index & ~pgmsk) + pgcnt;
            } else {
                index++;
            }
        }
        if (next == *last) {
            ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
            break;
        } else
            next = AVL_NEXT(tree, next);
    }
}
/*
 * Calculate the rss and swap consumed by a segment.  vmu_entities is the
 * list of entities to visit.  For shared segments, the vnode or amp
 * is looked up in each entity to see if it has been already counted.  Private
 * anon pages are checked per entity to ensure that COW pages are not
 * double counted.
 *
 * For private mapped files, first the amp is checked for private pages.
 * Bounds not backed by the amp are looked up in the vnode for each entity
 * to avoid double counting of private COW vnode pages.
 */
static void
vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg)
{
    struct segvn_data *svd;
    struct shm_data *shmd;
    struct spt_data *sptd;
    vmu_object_t *shared_object = NULL;
    vmu_object_t *entity_object = NULL;
    vmu_entity_t *entity;
    vmusage_t *result;
    vmu_bound_t *first = NULL;
    vmu_bound_t *last = NULL;
    vmu_bound_t *cur = NULL;
    vmu_bound_t *e_first = NULL;
    vmu_bound_t *e_last = NULL;
    vmu_bound_t *tmp;
    pgcnt_t p_index, s_index, p_start, p_end, s_start, s_end, rss, virt;
    struct anon_map *private_amp = NULL;
    boolean_t incore = B_FALSE;
    boolean_t shared = B_FALSE;
    boolean_t file = B_FALSE;
    size_t swresv = 0;
    pgcnt_t panon = 0;

    /* Can zero-length segments exist?  Not sure, so paranoia. */
    if (seg->s_size <= 0)
        return;

    /*
     * Figure out if there is a shared object (such as a named vnode or
     * a shared amp), then figure out if there is a private amp, which
     * identifies private pages.
     */
    if (seg->s_ops == &segvn_ops) {
        svd = (struct segvn_data *)seg->s_data;
        if (svd->type == MAP_SHARED) {
            shared = B_TRUE;
        } else {
            swresv = svd->swresv;

            if (SEGVN_LOCK_TRYENTER(seg->s_as, &svd->lock,
                RW_READER) != 0) {
                /*
                 * Text replication anon maps can be shared
                 * across all zones. Space used for text
                 * replication is typically capped as a small %
                 * of memory.  To keep it simple for now we
                 * don't account for swap and memory space used
                 * for text replication.
                 */
                if (svd->tr_state == SEGVN_TR_OFF &&
                    svd->amp != NULL) {
                    private_amp = svd->amp;
                    p_start = svd->anon_index;
                    p_end = svd->anon_index +
                        btop(seg->s_size) - 1;
                }
                SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
            }
        }
        if (svd->vp != NULL) {
            file = B_TRUE;
            shared_object = vmu_find_insert_object(
                vmu_data.vmu_all_vnodes_hash, (caddr_t)svd->vp,
                VMUSAGE_TYPE_VNODE);
            s_start = btop(svd->offset);
            s_end = btop(svd->offset + seg->s_size) - 1;
        }
        if (svd->amp != NULL && svd->type == MAP_SHARED) {
            ASSERT(shared_object == NULL);
            shared_object = vmu_find_insert_object(
                vmu_data.vmu_all_amps_hash, (caddr_t)svd->amp,
                VMUSAGE_TYPE_AMP);
            s_start = svd->anon_index;
            s_end = svd->anon_index + btop(seg->s_size) - 1;
            /* schedctl mappings are always in core */
            if (svd->amp->swresv == 0)
                incore = B_TRUE;
        }
    } else if (seg->s_ops == &segspt_shmops) {
        shared = B_TRUE;
        shmd = (struct shm_data *)seg->s_data;
        shared_object = vmu_find_insert_object(
            vmu_data.vmu_all_amps_hash, (caddr_t)shmd->shm_amp,
            VMUSAGE_TYPE_AMP);
        s_start = 0;
        s_end = btop(seg->s_size) - 1;
        sptd = shmd->shm_sptseg->s_data;

        /* ism segments are always incore and do not reserve swap */
        if (sptd->spt_flags & SHM_SHARE_MMU)
            incore = B_TRUE;
    } else {
        return;
    }

    /*
     * If there is a private amp, count anon pages that exist.  If an
     * anon has a refcnt > 1 (COW sharing), then save the anon in a
     * hash so that it is not double counted.
     *
     * If there is also a shared object, then figure out the bounds
     * which are not mapped by the private amp.
     */
    if (private_amp != NULL) {

        /* Enter as writer to prevent COW anons from being freed */
        ANON_LOCK_ENTER(&private_amp->a_rwlock, RW_WRITER);

        p_index = p_start;
        s_index = s_start;

        while (p_index <= p_end) {

            pgcnt_t p_index_next;
            pgcnt_t p_bound_size;
            int cnt;
            anoff_t off;
            struct vnode *vn;
            struct anon *ap;
            page_t *page;		/* For handling of large */
            pgcnt_t pgcnt = 1;		/* pages */
            pgcnt_t pgstart;
            pgcnt_t pgend;
            uint_t pgshft;
            pgcnt_t pgmsk;

            p_index_next = p_index;
            ap = anon_get_next_ptr(private_amp->ahp,
                &p_index_next);

            /*
             * If next anon is past end of mapping, simulate
             * end of anon so loop terminates.
             */
            if (p_index_next > p_end) {
                p_index_next = p_end + 1;
                ap = NULL;
            }
            /*
             * For COW segments, keep track of bounds not
             * backed by private amp so they can be looked
             * up in the backing vnode
             */
            if (p_index_next != p_index) {

                /*
                 * Compute index difference between anon and
                 * previous anon.
                 */
                p_bound_size = p_index_next - p_index - 1;

                if (shared_object != NULL) {
                    cur = vmu_alloc_bound();
                    cur->vmb_start = s_index;
                    cur->vmb_end = s_index + p_bound_size;
                    cur->vmb_type = VMUSAGE_BOUND_UNKNOWN;
                    if (first == NULL) {
                        first = cur;
                        last = cur;
                    } else {
                        last->vmb_next = cur;
                        last = cur;
                    }
                }
                p_index = p_index + p_bound_size + 1;
                s_index = s_index + p_bound_size + 1;
            }

            /* Detect end of anons in amp */
            if (ap == NULL)
                break;

            cnt = ap->an_refcnt;
            swap_xlate(ap, &vn, &off);

            if (vn == NULL || !vn_has_cached_data(vn) ||
                (page = page_exists(&vn->v_object, off)) == NULL) {
                p_index++;
                s_index++;
                continue;
            }

            /*
             * If large page is found, compute portion of large
             * page in mapping, and increment indices to the next
             * large page.
             */
            if (page->p_szc > 0) {

                pgcnt = page_get_pagecnt(page->p_szc);
                pgshft = page_get_shift(page->p_szc);
                pgmsk = (0x1 << (pgshft - PAGESHIFT)) - 1;

                /* First page in large page */
                pgstart = p_index & ~pgmsk;
                /* Last page in large page */
                pgend = pgstart + pgcnt - 1;

                /*
                 * Artificially end page if page extends past
                 * end of mapping.
                 */
                if (pgend > p_end)
                    pgend = p_end;

                /*
                 * Compute number of pages from large page
                 * which are mapped.
                 */
                pgcnt = pgend - p_index + 1;

                /*
                 * Point indices at page after large page,
                 * or at page after end of mapping.
                 */
                p_index += pgcnt;
                s_index += pgcnt;
            } else {
                p_index++;
                s_index++;
            }

            /*
             * Assume anon structs with a refcnt
             * of 1 are not COW shared, so there
             * is no reason to track them per entity.
             */
            if (cnt == 1) {
                panon += pgcnt;
                continue;
            }
            for (entity = vmu_entities; entity != NULL;
                entity = entity->vme_next_calc) {

                result = &entity->vme_result;
                /*
                 * Track COW anons per entity so
                 * they are not double counted.
                 */
                if (vmu_find_insert_anon(entity->vme_anon_hash,
                    (caddr_t)ap) == 0)
                    continue;

                result->vmu_rss_all += (pgcnt << PAGESHIFT);
                result->vmu_rss_private +=
                    (pgcnt << PAGESHIFT);
            }
        }
        ANON_LOCK_EXIT(&private_amp->a_rwlock);
    }

    /* Add up resident anon and swap reserved for private mappings */
    if (swresv > 0 || panon > 0) {
        for (entity = vmu_entities; entity != NULL;
            entity = entity->vme_next_calc) {
            result = &entity->vme_result;
            result->vmu_swap_all += swresv;
            result->vmu_swap_private += swresv;
            result->vmu_rss_all += (panon << PAGESHIFT);
            result->vmu_rss_private += (panon << PAGESHIFT);
        }
    }

    /* Compute resident pages backing shared amp or named vnode */
    if (shared_object != NULL) {
        avl_tree_t *tree = &(shared_object->vmo_bounds);

        if (first == NULL) {
            /*
             * No private amp, or private amp has no anon
             * structs.  This means entire segment is backed by
             * the shared object.
             */
            first = vmu_alloc_bound();
            first->vmb_start = s_start;
            first->vmb_end = s_end;
            first->vmb_type = VMUSAGE_BOUND_UNKNOWN;
        }
        /*
         * Iterate bounds not backed by private amp, and compute
         * resident pages.
         */
        cur = first;
        while (cur != NULL) {

            if (vmu_insert_lookup_object_bounds(shared_object,
                cur->vmb_start, cur->vmb_end, VMUSAGE_BOUND_UNKNOWN,
                &first, &last) > 0) {
                /* new bounds, find incore/not-incore */
                if (shared_object->vmo_type ==
                    VMUSAGE_TYPE_VNODE) {
                    vmu_vnode_update_incore_bounds(
                        tree,
                        (vnode_t *)
                        shared_object->vmo_key, &first,
                        &last);
                } else {
                    vmu_amp_update_incore_bounds(
                        tree,
                        (struct anon_map *)
                        shared_object->vmo_key, &first,
                        &last, incore);
                }
                vmu_merge_bounds(tree, &first, &last);
            }
            for (entity = vmu_entities; entity != NULL;
                entity = entity->vme_next_calc) {
                avl_tree_t *e_tree;

                result = &entity->vme_result;

                entity_object = vmu_find_insert_object(
                    shared_object->vmo_type ==
                    VMUSAGE_TYPE_VNODE ? entity->vme_vnode_hash :
                    entity->vme_amp_hash,
                    shared_object->vmo_key,
                    shared_object->vmo_type);

                virt = vmu_insert_lookup_object_bounds(
                    entity_object, cur->vmb_start, cur->vmb_end,
                    VMUSAGE_BOUND_UNKNOWN, &e_first, &e_last);

                if (virt == 0)
                    continue;
                /*
                 * Range visited for this entity
                 */
                e_tree = &(entity_object->vmo_bounds);
                rss = vmu_update_bounds(e_tree, &e_first,
                    &e_last, tree, first, last);
                result->vmu_rss_all += (rss << PAGESHIFT);
                if (shared == B_TRUE && file == B_FALSE) {
                    /* shared anon mapping */
                    result->vmu_swap_all +=
                        (virt << PAGESHIFT);
                    result->vmu_swap_shared +=
                        (virt << PAGESHIFT);
                    result->vmu_rss_shared +=
                        (rss << PAGESHIFT);
                } else if (shared == B_TRUE && file == B_TRUE) {
                    /* shared file mapping */
                    result->vmu_rss_shared +=
                        (rss << PAGESHIFT);
                } else if (shared == B_FALSE &&
                    file == B_TRUE) {
                    /* private file mapping */
                    result->vmu_rss_private +=
                        (rss << PAGESHIFT);
                }
                vmu_merge_bounds(e_tree, &e_first, &e_last);
            }
            tmp = cur;
            cur = cur->vmb_next;
            vmu_free_bound(tmp);
        }
    }
}
/*
 * Based on the current calculation flags, find the relevant entities
 * which are relative to the process.  Then calculate each segment
 * in the process's address space for each relevant entity.
 */
static void
vmu_calculate_proc(proc_t *p)
{
    vmu_entity_t *entities = NULL;
    vmu_zone_t *zone;
    vmu_entity_t *tmp;
    struct as *as;
    struct seg *seg;
    int ret;

    /* Figure out which entities are being computed */
    if ((vmu_data.vmu_system) != NULL) {
        tmp = vmu_data.vmu_system;
        tmp->vme_next_calc = entities;
        entities = tmp;
    }
    if (vmu_data.vmu_calc_flags &
        (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_PROJECTS |
        VMUSAGE_ALL_PROJECTS | VMUSAGE_TASKS | VMUSAGE_ALL_TASKS |
        VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_EUSERS |
        VMUSAGE_ALL_EUSERS)) {
        ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash,
            (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
            (mod_hash_val_t *)&zone);
        if (ret != 0) {
            zone = vmu_alloc_zone(p->p_zone->zone_id);
            ret = i_mod_hash_insert_nosync(vmu_data.vmu_zones_hash,
                (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
                (mod_hash_val_t)zone, (mod_hash_hndl_t)0);
            ASSERT(ret == 0);
        }
        if (zone->vmz_zone != NULL) {
            tmp = zone->vmz_zone;
            tmp->vme_next_calc = entities;
            entities = tmp;
        }
        if (vmu_data.vmu_calc_flags &
            (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS)) {
            tmp = vmu_find_insert_entity(zone->vmz_projects_hash,
                p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS,
                zone->vmz_id);
            tmp->vme_next_calc = entities;
            entities = tmp;
        }
        if (vmu_data.vmu_calc_flags &
            (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS)) {
            tmp = vmu_find_insert_entity(zone->vmz_tasks_hash,
                p->p_task->tk_tkid, VMUSAGE_TASKS, zone->vmz_id);
            tmp->vme_next_calc = entities;
            entities = tmp;
        }
        if (vmu_data.vmu_calc_flags &
            (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS)) {
            tmp = vmu_find_insert_entity(zone->vmz_rusers_hash,
                crgetruid(p->p_cred), VMUSAGE_RUSERS, zone->vmz_id);
            tmp->vme_next_calc = entities;
            entities = tmp;
        }
        if (vmu_data.vmu_calc_flags &
            (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS)) {
            tmp = vmu_find_insert_entity(zone->vmz_eusers_hash,
                crgetuid(p->p_cred), VMUSAGE_EUSERS, zone->vmz_id);
            tmp->vme_next_calc = entities;
            entities = tmp;
        }
    }
    /* Entities which collapse projects and users for all zones */
    if (vmu_data.vmu_calc_flags & VMUSAGE_COL_PROJECTS) {
        tmp = vmu_find_insert_entity(vmu_data.vmu_projects_col_hash,
            p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS, ALL_ZONES);
        tmp->vme_next_calc = entities;
        entities = tmp;
    }
    if (vmu_data.vmu_calc_flags & VMUSAGE_COL_RUSERS) {
        tmp = vmu_find_insert_entity(vmu_data.vmu_rusers_col_hash,
            crgetruid(p->p_cred), VMUSAGE_RUSERS, ALL_ZONES);
        tmp->vme_next_calc = entities;
        entities = tmp;
    }
    if (vmu_data.vmu_calc_flags & VMUSAGE_COL_EUSERS) {
        tmp = vmu_find_insert_entity(vmu_data.vmu_eusers_col_hash,
            crgetuid(p->p_cred), VMUSAGE_EUSERS, ALL_ZONES);
        tmp->vme_next_calc = entities;
        entities = tmp;
    }

    ASSERT(entities != NULL);
    /* process all segs in process's address space */
    as = p->p_as;
    AS_LOCK_ENTER(as, RW_READER);
    for (seg = AS_SEGFIRST(as); seg != NULL;
        seg = AS_SEGNEXT(as, seg)) {
        vmu_calculate_seg(entities, seg);
    }
    AS_LOCK_EXIT(as);
}
1550 if (vmu_data
.vmu_system
!= NULL
) {
1551 vmu_free_entity(vmu_data
.vmu_system
);
1552 vmu_data
.vmu_system
= NULL
;
1554 if (vmu_data
.vmu_zones_hash
!= NULL
)
1555 i_mod_hash_clear_nosync(vmu_data
.vmu_zones_hash
);
1556 if (vmu_data
.vmu_projects_col_hash
!= NULL
)
1557 i_mod_hash_clear_nosync(vmu_data
.vmu_projects_col_hash
);
1558 if (vmu_data
.vmu_rusers_col_hash
!= NULL
)
1559 i_mod_hash_clear_nosync(vmu_data
.vmu_rusers_col_hash
);
1560 if (vmu_data
.vmu_eusers_col_hash
!= NULL
)
1561 i_mod_hash_clear_nosync(vmu_data
.vmu_eusers_col_hash
);
1563 i_mod_hash_clear_nosync(vmu_data
.vmu_all_vnodes_hash
);
1564 i_mod_hash_clear_nosync(vmu_data
.vmu_all_amps_hash
);
/*
 * Free unused data structures.  These can result if the system workload
 * decreases between calculations.
 */
static void
vmu_free_extra(void)
{
    vmu_bound_t *tb;
    vmu_object_t *to;
    vmu_entity_t *te;
    vmu_zone_t *tz;

    while (vmu_data.vmu_free_bounds != NULL) {
        tb = vmu_data.vmu_free_bounds;
        vmu_data.vmu_free_bounds = vmu_data.vmu_free_bounds->vmb_next;
        kmem_cache_free(vmu_bound_cache, tb);
    }
    while (vmu_data.vmu_free_objects != NULL) {
        to = vmu_data.vmu_free_objects;
        vmu_data.vmu_free_objects =
            vmu_data.vmu_free_objects->vmo_next;
        kmem_cache_free(vmu_object_cache, to);
    }
    while (vmu_data.vmu_free_entities != NULL) {
        te = vmu_data.vmu_free_entities;
        vmu_data.vmu_free_entities =
            vmu_data.vmu_free_entities->vme_next;
        if (te->vme_vnode_hash != NULL)
            mod_hash_destroy_hash(te->vme_vnode_hash);
        if (te->vme_amp_hash != NULL)
            mod_hash_destroy_hash(te->vme_amp_hash);
        if (te->vme_anon_hash != NULL)
            mod_hash_destroy_hash(te->vme_anon_hash);
        kmem_free(te, sizeof (vmu_entity_t));
    }
    while (vmu_data.vmu_free_zones != NULL) {
        tz = vmu_data.vmu_free_zones;
        vmu_data.vmu_free_zones =
            vmu_data.vmu_free_zones->vmz_next;
        if (tz->vmz_projects_hash != NULL)
            mod_hash_destroy_hash(tz->vmz_projects_hash);
        if (tz->vmz_tasks_hash != NULL)
            mod_hash_destroy_hash(tz->vmz_tasks_hash);
        if (tz->vmz_rusers_hash != NULL)
            mod_hash_destroy_hash(tz->vmz_rusers_hash);
        if (tz->vmz_eusers_hash != NULL)
            mod_hash_destroy_hash(tz->vmz_eusers_hash);
        kmem_free(tz, sizeof (vmu_zone_t));
    }
}
extern kcondvar_t *pr_pid_cv;
/*
 * Determine which entity types are relevant and allocate the hashes to
 * track them.  Then walk the process table and count rss and swap
 * for each process's address space.  Address space objects such as
 * vnodes, amps and anons are tracked per entity, so that they are
 * not double counted in the results.
 */
static void
vmu_calculate(void)
{
    int i = 0;
    int ret;
    proc_t *p;

    vmu_clear_calc();

    if (vmu_data.vmu_calc_flags & VMUSAGE_SYSTEM)
        vmu_data.vmu_system = vmu_alloc_entity(0, VMUSAGE_SYSTEM,
            ALL_ZONES);

    /*
     * Walk process table and calculate rss of each proc.
     *
     * Pidlock and p_lock cannot be held while doing the rss calculation.
     * This is because:
     *	1.  The calculation allocates using KM_SLEEP.
     *	2.  The calculation grabs a_lock, which cannot be grabbed
     *	    after p_lock.
     *
     * Since pidlock must be dropped, we cannot simply just walk the
     * practive list.  Instead, we walk the process table, and sprlock
     * each process to ensure that it does not exit during the
     * calculation.
     */
    mutex_enter(&pidlock);
    for (i = 0; i < v.v_proc; i++) {
again:
        p = pid_entry(i);
        if (p == NULL)
            continue;

        mutex_enter(&p->p_lock);
        mutex_exit(&pidlock);

        if (panicstr) {
            mutex_exit(&p->p_lock);
            return;
        }

        /* Try to set P_PR_LOCK */
        ret = sprtrylock_proc(p);
        if (ret == -1) {
            /* Process in invalid state */
            mutex_exit(&p->p_lock);
            mutex_enter(&pidlock);
            continue;
        } else if (ret == 1) {
            /*
             * P_PR_LOCK is already set.  Wait and try again.
             * This also drops p_lock.
             */
            sprwaitlock_proc(p);
            mutex_enter(&pidlock);
            goto again;
        }
        mutex_exit(&p->p_lock);

        vmu_calculate_proc(p);

        mutex_enter(&p->p_lock);
        sprunlock(p);
        mutex_enter(&pidlock);
    }
    mutex_exit(&pidlock);

    vmu_free_extra();
}
/*
 * allocate a new cache for N results satisfying flags
 */
static vmu_cache_t *
vmu_cache_alloc(size_t nres, uint_t flags)
{
    vmu_cache_t *cache;

    cache = kmem_zalloc(sizeof (vmu_cache_t), KM_SLEEP);
    cache->vmc_results = kmem_zalloc(sizeof (vmusage_t) * nres, KM_SLEEP);
    cache->vmc_nresults = nres;
    cache->vmc_flags = flags;
    cache->vmc_refcnt = 1;
    return (cache);
}

/*
 * Make sure cached results are not freed
 */
static void
vmu_cache_hold(vmu_cache_t *cache)
{
    ASSERT(MUTEX_HELD(&vmu_data.vmu_lock));
    cache->vmc_refcnt++;
}

static void
vmu_cache_rele(vmu_cache_t *cache)
{
    ASSERT(MUTEX_HELD(&vmu_data.vmu_lock));
    ASSERT(cache->vmc_refcnt > 0);
    cache->vmc_refcnt--;
    if (cache->vmc_refcnt == 0) {
        kmem_free(cache->vmc_results, sizeof (vmusage_t) *
            cache->vmc_nresults);
        kmem_free(cache, sizeof (vmu_cache_t));
    }
}
/*
 * Copy out the cached results to a caller.  Inspect the caller's flags
 * and zone to determine which cached results should be copied.
 */
static int
vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres,
    uint_t flags, int cpflg)
{
    vmusage_t *result, *out_result;
    vmusage_t dummy;
    size_t i, count = 0;
    size_t bufsize;
    int ret = 0;
    uint_t types = 0;

    if (ddi_copyin((caddr_t)nres, &bufsize, sizeof (size_t), cpflg))
        return (set_errno(EFAULT));

    /* figure out what results the caller is interested in. */
    if ((flags & VMUSAGE_SYSTEM) && curproc->p_zone == global_zone)
        types |= VMUSAGE_SYSTEM;
    if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES))
        types |= VMUSAGE_ZONE;
    if (flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
        VMUSAGE_COL_PROJECTS))
        types |= VMUSAGE_PROJECTS;
    if (flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
        types |= VMUSAGE_TASKS;
    if (flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS))
        types |= VMUSAGE_RUSERS;
    if (flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS))
        types |= VMUSAGE_EUSERS;

    /* count results for current zone */
    out_result = buf;
    for (result = cache->vmc_results, i = 0;
        i < cache->vmc_nresults; result++, i++) {

        /* Do not return "other-zone" results to non-global zones */
        if (curproc->p_zone != global_zone &&
            curproc->p_zone->zone_id != result->vmu_zoneid)
            continue;

        /*
         * If non-global zone requests VMUSAGE_SYSTEM, fake
         * up VMUSAGE_ZONE result as VMUSAGE_SYSTEM result.
         */
        if (curproc->p_zone != global_zone &&
            (flags & VMUSAGE_SYSTEM) != 0 &&
            result->vmu_type == VMUSAGE_ZONE) {
            count++;
            if (out_result != NULL) {
                if (bufsize < count) {
                    ret = set_errno(EOVERFLOW);
                } else {
                    dummy = *result;
                    dummy.vmu_zoneid = ALL_ZONES;
                    dummy.vmu_type = VMUSAGE_SYSTEM;
                    if (ddi_copyout(&dummy, out_result,
                        sizeof (vmusage_t), cpflg))
                        return (set_errno(EFAULT));
                    out_result++;
                }
            }
        }

        /* Skip results that do not match requested type */
        if ((result->vmu_type & types) == 0)
            continue;

        /* Skip collated results if not requested */
        if (result->vmu_zoneid == ALL_ZONES) {
            if (result->vmu_type == VMUSAGE_PROJECTS &&
                (flags & VMUSAGE_COL_PROJECTS) == 0)
                continue;
            if (result->vmu_type == VMUSAGE_EUSERS &&
                (flags & VMUSAGE_COL_EUSERS) == 0)
                continue;
            if (result->vmu_type == VMUSAGE_RUSERS &&
                (flags & VMUSAGE_COL_RUSERS) == 0)
                continue;
        }

        /* Skip "other zone" results if not requested */
        if (result->vmu_zoneid != curproc->p_zone->zone_id) {
            if (result->vmu_type == VMUSAGE_ZONE &&
                (flags & VMUSAGE_ALL_ZONES) == 0)
                continue;
            if (result->vmu_type == VMUSAGE_PROJECTS &&
                (flags & (VMUSAGE_ALL_PROJECTS |
                VMUSAGE_COL_PROJECTS)) == 0)
                continue;
            if (result->vmu_type == VMUSAGE_TASKS &&
                (flags & VMUSAGE_ALL_TASKS) == 0)
                continue;
            if (result->vmu_type == VMUSAGE_RUSERS &&
                (flags & (VMUSAGE_ALL_RUSERS |
                VMUSAGE_COL_RUSERS)) == 0)
                continue;
            if (result->vmu_type == VMUSAGE_EUSERS &&
                (flags & (VMUSAGE_ALL_EUSERS |
                VMUSAGE_COL_EUSERS)) == 0)
                continue;
        }

        count++;
        if (out_result != NULL) {
            if (bufsize < count) {
                ret = set_errno(EOVERFLOW);
            } else {
                if (ddi_copyout(result, out_result,
                    sizeof (vmusage_t), cpflg))
                    return (set_errno(EFAULT));
                out_result++;
            }
        }
    }
    if (ddi_copyout(&count, (void *)nres, sizeof (size_t), cpflg))
        return (set_errno(EFAULT));

    return (ret);
}
/*
 * vm_getusage()
 *
 * Counts rss and swap by zone, project, task, and/or user.  The flags argument
 * determines the type of results structures returned.  Flags requesting
 * results from more than one zone are "flattened" to the local zone if the
 * caller is not the global zone.
 *
 * args:
 *	flags:	bitmap consisting of one or more of VMUSAGE_*.
 *	age:	maximum allowable age (time since counting was done) in
 *		seconds of the results.  Results from previous callers are
 *		cached.
 *	buf:	pointer to buffer array of vmusage_t.  If NULL, then only nres
 *		is set on success.
 *	nres:	Set to number of vmusage_t structures pointed to by buf
 *		before calling vm_getusage().
 *		On return 0 (success) or ENOSPC, is set to the number of result
 *		structures returned or attempted to return.
 *
 * returns 0 on success, -1 on failure:
 *	EINTR (interrupted)
 *	ENOSPC (nres too small for results, nres set to needed value for
 *		success)
 *	EINVAL (flags invalid)
 *	EFAULT (bad address for buf or nres)
 */
int
vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg)
{
    vmu_entity_t *entity;
    vmusage_t *result;
    vmu_cache_t *cache;
    int ret = 0;
    int cacherecent = 0;
    hrtime_t now;
    uint_t flags_orig;

    /*
     * Non-global zones cannot request system wide and/or collated
     * results, or the system result, so munge the flags accordingly.
     */
    flags_orig = flags;
    if (curproc->p_zone != global_zone) {
        if (flags & (VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS)) {
            flags &= ~(VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS);
            flags |= VMUSAGE_PROJECTS;
        }
        if (flags & (VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS)) {
            flags &= ~(VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS);
            flags |= VMUSAGE_RUSERS;
        }
        if (flags & (VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS)) {
            flags &= ~(VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS);
            flags |= VMUSAGE_EUSERS;
        }
        if (flags & VMUSAGE_SYSTEM) {
            flags &= ~VMUSAGE_SYSTEM;
            flags |= VMUSAGE_ZONE;
        }
    }

    /* Check for unknown flags */
    if ((flags & (~VMUSAGE_MASK)) != 0)
        return (set_errno(EINVAL));

    /* Check for no flags */
    if ((flags & VMUSAGE_MASK) == 0)
        return (set_errno(EINVAL));

    mutex_enter(&vmu_data.vmu_lock);
    now = gethrtime();

    if (vmu_data.vmu_cache != NULL) {

        if ((vmu_data.vmu_cache->vmc_timestamp +
            ((hrtime_t)age * NANOSEC)) > now)
            cacherecent = 1;

        if ((vmu_data.vmu_cache->vmc_flags & flags) == flags &&
            cacherecent == 1) {
            cache = vmu_data.vmu_cache;
            vmu_cache_hold(cache);
            mutex_exit(&vmu_data.vmu_lock);

            ret = vmu_copyout_results(cache, buf, nres, flags_orig,
                cpflg);
            mutex_enter(&vmu_data.vmu_lock);
            vmu_cache_rele(cache);
            if (vmu_data.vmu_pending_waiters > 0)
                cv_broadcast(&vmu_data.vmu_cv);
            mutex_exit(&vmu_data.vmu_lock);

            return (ret);
        }
        /*
         * If the cache is recent, it is likely that there are other
         * consumers of vm_getusage running, so add their flags to the
         * desired flags for the calculation.
         */
        if (cacherecent == 1)
            flags = vmu_data.vmu_cache->vmc_flags | flags;
    }
    if (vmu_data.vmu_calc_thread == NULL) {

        vmu_data.vmu_calc_thread = curthread;
        vmu_data.vmu_calc_flags = flags;
        vmu_data.vmu_entities = NULL;
        vmu_data.vmu_nentities = 0;
        if (vmu_data.vmu_pending_waiters > 0)
            vmu_data.vmu_calc_flags |=
                vmu_data.vmu_pending_flags;

        vmu_data.vmu_pending_flags = 0;
        mutex_exit(&vmu_data.vmu_lock);
        vmu_calculate();
        mutex_enter(&vmu_data.vmu_lock);
        /* copy results to cache */
        if (vmu_data.vmu_cache != NULL)
            vmu_cache_rele(vmu_data.vmu_cache);
        cache = vmu_data.vmu_cache =
            vmu_cache_alloc(vmu_data.vmu_nentities,
            vmu_data.vmu_calc_flags);

        result = cache->vmc_results;
        for (entity = vmu_data.vmu_entities; entity != NULL;
            entity = entity->vme_next) {
            *result = entity->vme_result;
            result++;
        }
        cache->vmc_timestamp = gethrtime();
        vmu_cache_hold(cache);

        vmu_data.vmu_calc_flags = 0;
        vmu_data.vmu_calc_thread = NULL;

        if (vmu_data.vmu_pending_waiters > 0)
            cv_broadcast(&vmu_data.vmu_cv);

        mutex_exit(&vmu_data.vmu_lock);

        ret = vmu_copyout_results(cache, buf, nres, flags_orig, cpflg);
        mutex_enter(&vmu_data.vmu_lock);
        vmu_cache_rele(cache);
        mutex_exit(&vmu_data.vmu_lock);

        return (ret);
    }
    vmu_data.vmu_pending_flags |= flags;
    vmu_data.vmu_pending_waiters++;
    while (vmu_data.vmu_calc_thread != NULL) {
        if (cv_wait_sig(&vmu_data.vmu_cv,
            &vmu_data.vmu_lock) == 0) {
            vmu_data.vmu_pending_waiters--;
            mutex_exit(&vmu_data.vmu_lock);
            return (set_errno(EINTR));
        }
    }
    vmu_data.vmu_pending_waiters--;