 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * This file implements the getvmusage() private system call.
 * getvmusage() counts the resident memory pages and the swap
 * reserved by the specified process collective.  A "process collective" is
 * the set of processes owned by a particular zone, project, task, or user.
 *
 * rss and swap are counted so that for a given process collective, a page is
 * only counted once.  For example, this means that if multiple processes in
 * the same project map the same page, then the project will only be charged
 * once for that page.  On the other hand, if two processes in different
 * projects map the same page, then both projects will be charged
 * for the page.
 *
 * The vm_getusage() calculation is implemented so that the first thread
 * performs the rss/swap counting.  Other callers will wait for that thread
 * to finish and copy its results.  This enables multiple rcapds and prstats
 * to consume data from the same calculation.  The results are also cached so
 * that a caller interested in recent results can just copy them instead of
 * starting a new calculation.  The caller passes the maximum age (in seconds)
 * of the data.  If the cached data is young enough, the cache is copied;
 * otherwise, a new calculation is executed and the cache is replaced with
 * the new data.
 *
 * The rss calculation for each process collective is as follows:
 *
 *   - Inspect flags, determine if counting rss for zones, projects, tasks,
 *     and/or users.
 *   - For each proc:
 *      - Figure out proc's collectives (zone, project, task, and/or user).
 *      - For each seg in proc's address space:
 *          - If seg is private:
 *              - Lookup anons in the amp.
 *              - For incore pages not previously visited for each of the
 *                proc's collectives, add incore pagesize to each collective.
 *                Anons with a refcnt of 1 can be assumed to be not
 *                previously visited.
 *              - For address ranges without anons in the amp:
 *                  - Lookup pages in underlying vnode.
 *                  - For incore pages not previously visited for
 *                    each of the proc's collectives, add incore
 *                    pagesize to each collective.
 *          - If seg is shared:
 *              - Lookup pages in the shared amp or vnode.
 *              - For incore pages not previously visited for each of
 *                the proc's collectives, add incore pagesize to each
 *                collective.
 *
 * Swap is reserved by private segments, and shared anonymous segments.
 * The only shared anon segments which do not reserve swap are ISM segments
 * and schedctl segments, both of which can be identified by having a
 * swap reservation (swresv) of 0.
 *
 * The swap calculation for each collective is as follows:
 *
 *   - Inspect flags, determine if counting rss for zones, projects, tasks,
 *     and/or users.
 *   - For each proc:
 *      - Figure out proc's collectives (zone, project, task, and/or user).
 *      - For each seg in proc's address space:
 *          - If seg is private:
 *              - Add svd->swresv pages to swap count for each of the
 *                proc's collectives.
 *          - If seg is anon, shared, and amp->swresv != 0
 *              - For address ranges in amp not previously visited for
 *                each of the proc's collectives, add size of address
 *                range to the swap count for each collective.
 *
 * These two calculations are done simultaneously, with most of the work
 * being done in vmu_calculate_seg().  The results of the calculation are
 * copied into "vmu_data.vmu_cache_results".
 *
 * To perform the calculation, various things are tracked and cached:
 *
 *    - incore/not-incore page ranges for all vnodes.
 *      (vmu_data.vmu_all_vnodes_hash)
 *      This eliminates looking up the same page more than once.
 *
 *    - incore/not-incore page ranges for all shared amps.
 *      (vmu_data.vmu_all_amps_hash)
 *      This eliminates looking up the same page more than once.
 *
 *    - visited page ranges for each collective.
 *        - per vnode (entity->vme_vnode_hash)
 *        - per shared amp (entity->vme_amp_hash)
 *      For accurate counting of map-shared and COW-shared pages.
 *
 *    - visited private anons (refcnt > 1) for each collective.
 *      (entity->vme_anon_hash)
 *      For accurate counting of COW-shared pages.
 *
 * The common accounting structure is the vmu_entity_t, which represents
 * collectives:
 *
 *    - A zone.
 *    - A project, task, or user within a zone.
 *    - The entire system (vmu_data.vmu_system).
 *    - Each collapsed (col) project and user.  This means a given projid or
 *      uid, regardless of which zone the process is in.  For instance,
 *      project 0 in the global zone and project 0 in a non-global zone are
 *      the same collapsed project.
 *
 * Each entity structure tracks which pages have already been visited for
 * that entity (via previously inspected processes) so that these pages are
 * not double counted.
 */
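
/*
 * Illustrative userland usage (a sketch, not part of this file): consumers
 * such as prstat and rcapd call the getvmusage() libc wrapper, typically
 * sizing the result buffer first and then fetching the results:
 *
 *	size_t nres = 0;
 *	(void) getvmusage(VMUSAGE_ALL_ZONES, 30, NULL, &nres);
 *	vmusage_t *buf = malloc(nres * sizeof (vmusage_t));
 *	if (getvmusage(VMUSAGE_ALL_ZONES, 30, buf, &nres) != 0)
 *		handle EINVAL, EFAULT, EINTR, or ENOSPC (buffer too small);
 *	free(buf);
 */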
#include <sys/errno.h>
#include <sys/types.h>
#include <sys/zone.h>
#include <sys/proc.h>
#include <sys/project.h>
#include <sys/task.h>
#include <sys/thread.h>
#include <sys/time.h>
#include <sys/mman.h>
#include <sys/modhash.h>
#include <sys/modhash_impl.h>
#include <sys/swap.h>
#include <sys/synch.h>
#include <sys/systm.h>
#include <sys/var.h>
#include <sys/vm_usage.h>
#include <sys/zone.h>
#include <sys/sunddi.h>
#include <sys/avl.h>
#include <vm/anon.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/seg_vn.h>
#include <vm/seg_spt.h>
#define	VMUSAGE_HASH_SIZE		512

#define	VMUSAGE_TYPE_VNODE		1
#define	VMUSAGE_TYPE_AMP		2
#define	VMUSAGE_TYPE_ANON		3

#define	VMUSAGE_BOUND_UNKNOWN		0
#define	VMUSAGE_BOUND_INCORE		1
#define	VMUSAGE_BOUND_NOT_INCORE	2

#define	ISWITHIN(node, addr)	((node)->vmb_start <= addr && \
				(node)->vmb_end >= addr ? 1 : 0)
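
/*
 * Illustrative note: both endpoints are inclusive, so a bound covering pages
 * [10, 20] satisfies ISWITHIN(bound, 10) and ISWITHIN(bound, 20), but not
 * ISWITHIN(bound, 21).
 */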
/*
 * bounds for vnodes and shared amps
 * Each bound is either entirely incore, entirely not in core, or
 * entirely unknown.  bounds are stored in an avl tree sorted by start member
 * when in use, otherwise (free or temporary lists) they're strung
 * together off of vmb_next.
 */
typedef struct vmu_bound {
    avl_node_t vmb_node;	/* AVL tree linkage */
    struct vmu_bound *vmb_next; /* NULL in tree else on free or temp list */
    pgcnt_t vmb_start;	/* page offset in vnode/amp on which bound starts */
    pgcnt_t vmb_end;	/* page offset in vnode/amp on which bound ends */
    char    vmb_type;	/* One of VMUSAGE_BOUND_* */
} vmu_bound_t;
/*
 * hash of visited objects (vnodes or shared amps)
 * key is address of vnode or amp.  Bounds lists known incore/non-incore
 * bounds for vnode/amp.
 */
typedef struct vmu_object {
    struct vmu_object *vmo_next;	/* free list */
    caddr_t	vmo_key;
    short	vmo_type;
    avl_tree_t	vmo_bounds;
} vmu_object_t;
/*
 * Entity by which to count results.
 *
 * The entity structure keeps the current rss/swap counts for each entity
 * (zone, project, etc), and hashes of vm structures that have already
 * been visited for the entity.
 *
 * vme_next:	links the list of all entities currently being counted by
 *		vmu_calculate().
 *
 * vme_next_calc: links the list of entities related to the current process
 *		being counted by vmu_calculate_proc().
 *
 * vmu_calculate_proc() walks all processes.  For each process, it makes a
 * list of the entities related to that process using vme_next_calc.  This
 * list changes each time vmu_calculate_proc() is called.
 */
typedef struct vmu_entity {
    struct vmu_entity *vme_next;
    struct vmu_entity *vme_next_calc;
    mod_hash_t	*vme_vnode_hash; /* vnodes visited for entity */
    mod_hash_t	*vme_amp_hash;	 /* shared amps visited for entity */
    mod_hash_t	*vme_anon_hash;	 /* COW anons visited for entity */
    vmusage_t	vme_result;	 /* identifies entity and results */
} vmu_entity_t;
/*
 * Hash of entities visited within a zone, and an entity for the zone
 * itself.
 */
typedef struct vmu_zone {
    struct vmu_zone *vmz_next;	/* free list */
    id_t	vmz_id;
    vmu_entity_t *vmz_zone;
    mod_hash_t	*vmz_projects_hash;
    mod_hash_t	*vmz_tasks_hash;
    mod_hash_t	*vmz_rusers_hash;
    mod_hash_t	*vmz_eusers_hash;
} vmu_zone_t;
/*
 * Cache of results from last calculation
 */
typedef struct vmu_cache {
    vmusage_t	*vmc_results;	/* Results from last call to vm_getusage() */
    uint64_t	vmc_nresults;	/* Count of cached results */
    uint64_t	vmc_refcnt;	/* refcnt for free */
    uint_t	vmc_flags;	/* Flags for vm_getusage() */
    hrtime_t	vmc_timestamp;	/* when cache was created */
} vmu_cache_t;
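
/*
 * Illustrative note: a caller passing age=30 to vm_getusage() accepts any
 * cache whose vmc_timestamp is within the last 30 seconds and whose
 * vmc_flags cover the requested flags; otherwise a fresh calculation is
 * started and the cache is replaced.
 */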
/*
 * top level rss info for the system
 */
typedef struct vmu_data {
    kmutex_t	vmu_lock;		/* Protects vmu_data */
    kcondvar_t	vmu_cv;			/* Used to signal threads */
					/* waiting for the */
					/* Rss_calc_thread to finish */
    vmu_entity_t *vmu_system;		/* Entity for tracking */
					/* rss/swap for all processes */
    mod_hash_t	*vmu_zones_hash;	/* Zones visited */
    mod_hash_t	*vmu_projects_col_hash;	/* These *_col_hash hashes */
    mod_hash_t	*vmu_rusers_col_hash;	/* keep track of entities, */
    mod_hash_t	*vmu_eusers_col_hash;	/* ignoring zoneid, in order */
					/* to implement VMUSAGE_COL_* */
					/* flags, which aggregate by */
					/* project or user regardless */
					/* of zoneid. */
    mod_hash_t	*vmu_all_vnodes_hash;	/* System wide visited vnodes */
					/* to track incore/not-incore */
    mod_hash_t	*vmu_all_amps_hash;	/* System wide visited shared */
					/* amps to track incore/not- */
					/* incore */
    vmu_entity_t *vmu_entities;		/* Linked list of entities */
    size_t	vmu_nentities;		/* Count of entities in list */
    vmu_cache_t	*vmu_cache;		/* Cached results */
    kthread_t	*vmu_calc_thread;	/* NULL, or thread running */
					/* vmu_calculate() */
    uint_t	vmu_calc_flags;		/* Flags being used by */
					/* currently running calc */
    uint_t	vmu_pending_flags;	/* Flags of vm_getusage() */
					/* threads waiting for */
					/* calc thread to finish */
    uint_t	vmu_pending_waiters;	/* Number of threads waiting */
					/* for calc thread */
    vmu_bound_t	*vmu_free_bounds;
    vmu_object_t *vmu_free_objects;
    vmu_entity_t *vmu_free_entities;
    vmu_zone_t	*vmu_free_zones;
} vmu_data_t;
extern struct as kas;
extern proc_t *practive;
extern zone_t *global_zone;
extern const struct seg_ops segvn_ops;
extern const struct seg_ops segspt_shmops;

static vmu_data_t vmu_data;
static kmem_cache_t *vmu_bound_cache;
static kmem_cache_t *vmu_object_cache;
/*
 * Comparison routine for AVL tree.  We base our comparison on vmb_start.
 */
static int
bounds_cmp(const void *bnd1, const void *bnd2)
{
    const vmu_bound_t *bound1 = bnd1;
    const vmu_bound_t *bound2 = bnd2;

    if (bound1->vmb_start == bound2->vmb_start) {
        return (0);
    }
    if (bound1->vmb_start < bound2->vmb_start) {
        return (-1);
    }

    return (1);
}
/*
 * Save a bound on the free list.
 */
static void
vmu_free_bound(vmu_bound_t *bound)
{
    bound->vmb_next = vmu_data.vmu_free_bounds;
    bound->vmb_start = 0;
    vmu_data.vmu_free_bounds = bound;
}
/*
 * Free an object, and all visited bound info.
 */
static void
vmu_free_object(mod_hash_val_t val)
{
    vmu_object_t *obj = (vmu_object_t *)val;
    avl_tree_t *tree = &(obj->vmo_bounds);
    vmu_bound_t *bound;
    void *cookie = NULL;

    while ((bound = avl_destroy_nodes(tree, &cookie)) != NULL)
        vmu_free_bound(bound);

    obj->vmo_next = vmu_data.vmu_free_objects;
    vmu_data.vmu_free_objects = obj;
}
/*
 * Free an entity, and hashes of visited objects for that entity.
 */
static void
vmu_free_entity(mod_hash_val_t val)
{
    vmu_entity_t *entity = (vmu_entity_t *)val;

    if (entity->vme_vnode_hash != NULL)
        i_mod_hash_clear_nosync(entity->vme_vnode_hash);
    if (entity->vme_amp_hash != NULL)
        i_mod_hash_clear_nosync(entity->vme_amp_hash);
    if (entity->vme_anon_hash != NULL)
        i_mod_hash_clear_nosync(entity->vme_anon_hash);

    entity->vme_next = vmu_data.vmu_free_entities;
    vmu_data.vmu_free_entities = entity;
}
/*
 * Free zone entity, and all hashes of entities inside that zone,
 * which are projects, tasks, and users.
 */
static void
vmu_free_zone(mod_hash_val_t val)
{
    vmu_zone_t *zone = (vmu_zone_t *)val;

    if (zone->vmz_zone != NULL) {
        vmu_free_entity((mod_hash_val_t)zone->vmz_zone);
        zone->vmz_zone = NULL;
    }
    if (zone->vmz_projects_hash != NULL)
        i_mod_hash_clear_nosync(zone->vmz_projects_hash);
    if (zone->vmz_tasks_hash != NULL)
        i_mod_hash_clear_nosync(zone->vmz_tasks_hash);
    if (zone->vmz_rusers_hash != NULL)
        i_mod_hash_clear_nosync(zone->vmz_rusers_hash);
    if (zone->vmz_eusers_hash != NULL)
        i_mod_hash_clear_nosync(zone->vmz_eusers_hash);
    zone->vmz_next = vmu_data.vmu_free_zones;
    vmu_data.vmu_free_zones = zone;
}
/*
 * Initialize synchronization primitives and hashes for system-wide tracking
 * of visited vnodes and shared amps.  Initialize results cache.
 */
static void
vmu_init(void)
{
    mutex_init(&vmu_data.vmu_lock, NULL, MUTEX_DEFAULT, NULL);
    cv_init(&vmu_data.vmu_cv, NULL, CV_DEFAULT, NULL);

    vmu_data.vmu_system = NULL;
    vmu_data.vmu_zones_hash = NULL;
    vmu_data.vmu_projects_col_hash = NULL;
    vmu_data.vmu_rusers_col_hash = NULL;
    vmu_data.vmu_eusers_col_hash = NULL;

    vmu_data.vmu_free_bounds = NULL;
    vmu_data.vmu_free_objects = NULL;
    vmu_data.vmu_free_entities = NULL;
    vmu_data.vmu_free_zones = NULL;

    vmu_data.vmu_all_vnodes_hash = mod_hash_create_ptrhash(
        "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object,
        sizeof (vnode_t));
    vmu_data.vmu_all_amps_hash = mod_hash_create_ptrhash(
        "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object,
        sizeof (struct anon_map));
    vmu_data.vmu_projects_col_hash = mod_hash_create_idhash(
        "vmusage collapsed project hash", VMUSAGE_HASH_SIZE,
        vmu_free_entity);
    vmu_data.vmu_rusers_col_hash = mod_hash_create_idhash(
        "vmusage collapsed ruser hash", VMUSAGE_HASH_SIZE,
        vmu_free_entity);
    vmu_data.vmu_eusers_col_hash = mod_hash_create_idhash(
        "vmusage collapsed euser hash", VMUSAGE_HASH_SIZE,
        vmu_free_entity);
    vmu_data.vmu_zones_hash = mod_hash_create_idhash(
        "vmusage zone hash", VMUSAGE_HASH_SIZE, vmu_free_zone);

    vmu_bound_cache = kmem_cache_create("vmu_bound_cache",
        sizeof (vmu_bound_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
    vmu_object_cache = kmem_cache_create("vmu_object_cache",
        sizeof (vmu_object_t), 0, NULL, NULL, NULL, NULL, NULL, 0);

    vmu_data.vmu_entities = NULL;
    vmu_data.vmu_nentities = 0;

    vmu_data.vmu_cache = NULL;
    vmu_data.vmu_calc_thread = NULL;
    vmu_data.vmu_calc_flags = 0;
    vmu_data.vmu_pending_flags = 0;
    vmu_data.vmu_pending_waiters = 0;
}
/*
 * Allocate hashes for tracking vm objects visited for an entity.
 * Update list of entities.
 */
static vmu_entity_t *
vmu_alloc_entity(id_t id, int type, id_t zoneid)
{
    vmu_entity_t *entity;

    if (vmu_data.vmu_free_entities != NULL) {
        entity = vmu_data.vmu_free_entities;
        vmu_data.vmu_free_entities =
            vmu_data.vmu_free_entities->vme_next;
        bzero(&entity->vme_result, sizeof (vmusage_t));
    } else {
        entity = kmem_zalloc(sizeof (vmu_entity_t), KM_SLEEP);
    }
    entity->vme_result.vmu_id = id;
    entity->vme_result.vmu_zoneid = zoneid;
    entity->vme_result.vmu_type = type;

    if (entity->vme_vnode_hash == NULL)
        entity->vme_vnode_hash = mod_hash_create_ptrhash(
            "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object,
            sizeof (vnode_t));

    if (entity->vme_amp_hash == NULL)
        entity->vme_amp_hash = mod_hash_create_ptrhash(
            "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object,
            sizeof (struct anon_map));

    if (entity->vme_anon_hash == NULL)
        entity->vme_anon_hash = mod_hash_create_ptrhash(
            "vmusage anon hash", VMUSAGE_HASH_SIZE,
            mod_hash_null_valdtor, sizeof (struct anon));

    entity->vme_next = vmu_data.vmu_entities;
    vmu_data.vmu_entities = entity;
    vmu_data.vmu_nentities++;

    return (entity);
}
/*
 * Allocate a zone entity, and hashes for tracking visited vm objects
 * for projects, tasks, and users within that zone.
 */
static vmu_zone_t *
vmu_alloc_zone(id_t id)
{
    vmu_zone_t *zone;

    if (vmu_data.vmu_free_zones != NULL) {
        zone = vmu_data.vmu_free_zones;
        vmu_data.vmu_free_zones =
            vmu_data.vmu_free_zones->vmz_next;
        zone->vmz_next = NULL;
        zone->vmz_zone = NULL;
    } else {
        zone = kmem_zalloc(sizeof (vmu_zone_t), KM_SLEEP);
    }

    zone->vmz_id = id;

    if ((vmu_data.vmu_calc_flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) != 0)
        zone->vmz_zone = vmu_alloc_entity(id, VMUSAGE_ZONE, id);

    if ((vmu_data.vmu_calc_flags & (VMUSAGE_PROJECTS |
        VMUSAGE_ALL_PROJECTS)) != 0 && zone->vmz_projects_hash == NULL)
        zone->vmz_projects_hash = mod_hash_create_idhash(
            "vmusage project hash", VMUSAGE_HASH_SIZE, vmu_free_entity);

    if ((vmu_data.vmu_calc_flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
        != 0 && zone->vmz_tasks_hash == NULL)
        zone->vmz_tasks_hash = mod_hash_create_idhash(
            "vmusage task hash", VMUSAGE_HASH_SIZE, vmu_free_entity);

    if ((vmu_data.vmu_calc_flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS))
        != 0 && zone->vmz_rusers_hash == NULL)
        zone->vmz_rusers_hash = mod_hash_create_idhash(
            "vmusage ruser hash", VMUSAGE_HASH_SIZE, vmu_free_entity);

    if ((vmu_data.vmu_calc_flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS))
        != 0 && zone->vmz_eusers_hash == NULL)
        zone->vmz_eusers_hash = mod_hash_create_idhash(
            "vmusage euser hash", VMUSAGE_HASH_SIZE, vmu_free_entity);

    return (zone);
}
/*
 * Allocate a structure for tracking visited bounds for a vm object.
 */
static vmu_object_t *
vmu_alloc_object(caddr_t key, int type)
{
    vmu_object_t *object;

    if (vmu_data.vmu_free_objects != NULL) {
        object = vmu_data.vmu_free_objects;
        vmu_data.vmu_free_objects =
            vmu_data.vmu_free_objects->vmo_next;
    } else {
        object = kmem_cache_alloc(vmu_object_cache, KM_SLEEP);
    }

    object->vmo_next = NULL;
    object->vmo_key = key;
    object->vmo_type = type;
    avl_create(&(object->vmo_bounds), bounds_cmp, sizeof (vmu_bound_t), 0);

    return (object);
}
/*
 * Allocate and return a bound structure.
 */
static vmu_bound_t *
vmu_alloc_bound(void)
{
    vmu_bound_t *bound;

    if (vmu_data.vmu_free_bounds != NULL) {
        bound = vmu_data.vmu_free_bounds;
        vmu_data.vmu_free_bounds =
            vmu_data.vmu_free_bounds->vmb_next;
    } else {
        bound = kmem_cache_alloc(vmu_bound_cache, KM_SLEEP);
    }

    bound->vmb_next = NULL;
    bound->vmb_start = 0;
    return (bound);
}
/*
 * vmu_find_insert_* functions implement hash lookup or allocate and
 * insert operations.
 */
static vmu_object_t *
vmu_find_insert_object(mod_hash_t *hash, caddr_t key, uint_t type)
{
    int ret;
    vmu_object_t *object;

    ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key,
        (mod_hash_val_t *)&object);
    if (ret != 0) {
        object = vmu_alloc_object(key, type);
        ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key,
            (mod_hash_val_t)object, (mod_hash_hndl_t)0);
        ASSERT(ret == 0);
    }
    return (object);
}

static int
vmu_find_insert_anon(mod_hash_t *hash, caddr_t key)
{
    int ret;
    caddr_t val;

    ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key,
        (mod_hash_val_t *)&val);
    if (ret == 0)
        return (0);

    ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key,
        (mod_hash_val_t)key, (mod_hash_hndl_t)0);
    ASSERT(ret == 0);

    return (1);
}

static vmu_entity_t *
vmu_find_insert_entity(mod_hash_t *hash, id_t id, uint_t type, id_t zoneid)
{
    int ret;
    vmu_entity_t *entity;

    ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)(uintptr_t)id,
        (mod_hash_val_t *)&entity);
    if (ret != 0) {
        entity = vmu_alloc_entity(id, type, zoneid);
        ret = i_mod_hash_insert_nosync(hash,
            (mod_hash_key_t)(uintptr_t)id, (mod_hash_val_t)entity,
            (mod_hash_hndl_t)0);
        ASSERT(ret == 0);
    }
    return (entity);
}
/*
 * Returns list of object bounds between start and end.  New bounds inserted
 * by this call are given type.
 *
 * Returns the number of pages covered if new bounds are created.  Returns 0
 * if region between start/end consists of all existing bounds.
 */
static pgcnt_t
vmu_insert_lookup_object_bounds(vmu_object_t *ro, pgcnt_t start, pgcnt_t
    end, char type, vmu_bound_t **first, vmu_bound_t **last)
{
    avl_tree_t *tree = &(ro->vmo_bounds);
    avl_index_t where;
    vmu_bound_t *walker, *tmp;
    pgcnt_t ret = 0;

    ASSERT(start <= end);

    *first = *last = NULL;

    tmp = vmu_alloc_bound();
    tmp->vmb_start = start;
    tmp->vmb_type = type;

    /* Hopelessly optimistic case. */
    if (walker = avl_find(tree, tmp, &where)) {
        /* We found the start point in the map. */
        *first = walker;
    }

    if (walker == NULL) {
        /* Is start in the previous node? */
        walker = avl_nearest(tree, where, AVL_BEFORE);
        if (walker != NULL) {
            if (ISWITHIN(walker, start)) {
                /* We found start. */
                *first = walker;
            }
        }
    }

    /*
     * At this point, if *first is still NULL, then we
     * didn't get a direct hit and start isn't covered
     * by the previous node. We know that the next node
     * must have a greater start value than we require
     * because avl_find tells us where the AVL routines would
     * insert our new node. We have some gap between the
     * start we want and the next node.
     */
    if (*first == NULL) {
        walker = avl_nearest(tree, where, AVL_AFTER);
        if (walker != NULL && walker->vmb_start <= end) {
            /* Fill the gap up to the next node. */
            tmp->vmb_end = walker->vmb_start - 1;
            *first = tmp;
        } else {
            /* We have a gap over [start, end]. */
            tmp->vmb_end = end;
            *first = *last = tmp;
        }
        ret += tmp->vmb_end - tmp->vmb_start + 1;
        avl_insert(tree, tmp, where);
    }

    ASSERT(*first != NULL);

    if (*last != NULL)
        return (ret);

    /*
     * If we are here we still need to set *last and
     * that may involve filling in some gaps.
     */
    *last = *first;
    for (;;) {
        if (ISWITHIN(*last, end))
            break;
        walker = AVL_NEXT(tree, *last);
        if (walker == NULL || walker->vmb_start > end) {
            /* Bottom or mid tree with gap. */
            tmp = vmu_alloc_bound();
            tmp->vmb_start = (*last)->vmb_end + 1;
            tmp->vmb_end = end;
            tmp->vmb_type = type;
            ret += tmp->vmb_end - tmp->vmb_start + 1;
            avl_insert_here(tree, tmp, *last, AVL_AFTER);
            *last = tmp;
            break;
        } else if ((*last)->vmb_end + 1 != walker->vmb_start) {
            /* Non-contiguous. */
            tmp = vmu_alloc_bound();
            tmp->vmb_start = (*last)->vmb_end + 1;
            tmp->vmb_end = walker->vmb_start - 1;
            tmp->vmb_type = type;
            ret += tmp->vmb_end - tmp->vmb_start + 1;
            avl_insert_here(tree, tmp, *last, AVL_AFTER);
            *last = tmp;
        } else {
            *last = walker;
        }
    }

    return (ret);
}
/*
 * vmu_update_bounds()
 *
 * tree: avl_tree in which first and last hang.
 *
 * first, last: list of continuous bounds, of which zero or more are of
 * type VMUSAGE_BOUND_UNKNOWN.
 *
 * new_tree: avl_tree in which new_first and new_last hang.
 *
 * new_first, new_last: list of continuous bounds, of which none are of
 * type VMUSAGE_BOUND_UNKNOWN.  These bounds are used to
 * update the types of bounds in (first,last) with
 * type VMUSAGE_BOUND_UNKNOWN.
 *
 * For the list of bounds (first,last), this function updates any bounds
 * with type VMUSAGE_BOUND_UNKNOWN using the type of the corresponding bound in
 * the list (new_first, new_last).
 *
 * If a bound of type VMUSAGE_BOUND_UNKNOWN spans multiple bounds in the list
 * (new_first, new_last), it will be split into multiple bounds.
 *
 * Returns:
 *	The number of pages in the list of bounds (first,last) that were of
 *	type VMUSAGE_BOUND_UNKNOWN, which have been updated to be of type
 *	VMUSAGE_BOUND_INCORE.
 */
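
/*
 * Illustrative example (not from the original source): if (first,last) is a
 * single UNKNOWN bound covering pages [5, 12], and (new_first, new_last) is
 * [0, 7] INCORE followed by [8, 20] NOT_INCORE, then the bound is split into
 * [5, 7] INCORE and [8, 12] NOT_INCORE, and 3 is returned (pages 5-7).
 */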
static pgcnt_t
vmu_update_bounds(avl_tree_t *tree, vmu_bound_t **first, vmu_bound_t **last,
    avl_tree_t *new_tree, vmu_bound_t *new_first, vmu_bound_t *new_last)
{
    vmu_bound_t *next, *new_next, *tmp;
    pgcnt_t rss = 0;

    next = *first;
    new_next = new_first;

    /*
     * Verify first and last bound are covered by new bounds if they
     * have unknown type.
     */
    ASSERT((*first)->vmb_type != VMUSAGE_BOUND_UNKNOWN ||
        (*first)->vmb_start >= new_first->vmb_start);
    ASSERT((*last)->vmb_type != VMUSAGE_BOUND_UNKNOWN ||
        (*last)->vmb_end <= new_last->vmb_end);
    for (;;) {
        /* If bound already has type, proceed to next bound. */
        if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
            if (next == *last)
                break;
            next = AVL_NEXT(tree, next);
            continue;
        }
        while (new_next->vmb_end < next->vmb_start)
            new_next = AVL_NEXT(new_tree, new_next);
        ASSERT(new_next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
        next->vmb_type = new_next->vmb_type;
        if (new_next->vmb_end < next->vmb_end) {
            /* need to split bound */
            tmp = vmu_alloc_bound();
            tmp->vmb_type = VMUSAGE_BOUND_UNKNOWN;
            tmp->vmb_start = new_next->vmb_end + 1;
            tmp->vmb_end = next->vmb_end;
            avl_insert_here(tree, tmp, next, AVL_AFTER);
            next->vmb_end = new_next->vmb_end;
            if (*last == next)
                *last = tmp;
            if (next->vmb_type == VMUSAGE_BOUND_INCORE)
                rss += next->vmb_end - next->vmb_start + 1;
            next = tmp;
        } else {
            if (next->vmb_type == VMUSAGE_BOUND_INCORE)
                rss += next->vmb_end - next->vmb_start + 1;
            if (next == *last)
                break;
            next = AVL_NEXT(tree, next);
        }
    }

    return (rss);
}
/*
 * Merges adjacent bounds with same type between first and last bound.
 * After merge, last pointer may point to a different bound, as (incoming)
 * last bound may have been merged away.
 */
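
/*
 * Illustrative example (not from the original source): [3, 5] INCORE
 * immediately followed by [6, 9] INCORE collapses into a single [3, 9]
 * INCORE bound.
 */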
static void
vmu_merge_bounds(avl_tree_t *tree, vmu_bound_t **first, vmu_bound_t **last)
{
    vmu_bound_t *current;
    vmu_bound_t *next;

    ASSERT(tree != NULL);
    ASSERT(*first != NULL);
    ASSERT(*last != NULL);

    current = *first;
    while (current != *last) {
        next = AVL_NEXT(tree, current);
        if ((current->vmb_end + 1) == next->vmb_start &&
            current->vmb_type == next->vmb_type) {
            current->vmb_end = next->vmb_end;
            avl_remove(tree, next);
            vmu_free_bound(next);
            if (next == *last)
                *last = current;
        } else {
            current = AVL_NEXT(tree, current);
        }
    }
}
/*
 * Given an amp and a list of bounds, updates each bound's type with
 * VMUSAGE_BOUND_INCORE or VMUSAGE_BOUND_NOT_INCORE.
 *
 * If a bound is partially incore, it will be split into two bounds.
 * first and last may be modified, as bounds may be split into multiple
 * bounds if they are partially incore/not-incore.
 *
 * Set incore to non-zero if bounds are already known to be incore.
 */
static void
vmu_amp_update_incore_bounds(avl_tree_t *tree, struct anon_map *amp,
    vmu_bound_t **first, vmu_bound_t **last, boolean_t incore)
{
    vmu_bound_t *next;
    vmu_bound_t *tmp;
    pgcnt_t index;
    char bound_type;
    char page_type;
    vnode_t *vn;
    anoff_t off;
    struct anon *ap;

    next = *first;
    /* Shared anon slots don't change once set. */
    ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
    for (;;) {
        if (incore == B_TRUE)
            next->vmb_type = VMUSAGE_BOUND_INCORE;

        if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
            if (next == *last)
                break;
            next = AVL_NEXT(tree, next);
            continue;
        }

        bound_type = next->vmb_type;
        index = next->vmb_start;
        while (index <= next->vmb_end) {
            /*
             * These are used to determine how much to increment
             * index when a large page is found.
             */
            page_t *page;
            pgcnt_t pgcnt = 1;
            uint_t pgshft;
            pgcnt_t pgmsk;

            ap = anon_get_ptr(amp->ahp, index);
            if (ap != NULL)
                swap_xlate(ap, &vn, &off);

            if (ap != NULL && vn != NULL && vn_has_cached_data(vn) &&
                (page = page_exists(&vn->v_object, off)) != NULL) {
                page_type = VMUSAGE_BOUND_INCORE;
                if (page->p_szc > 0) {
                    pgcnt = page_get_pagecnt(page->p_szc);
                    pgshft = page_get_shift(page->p_szc);
                    pgmsk = (0x1 << (pgshft - PAGESHIFT)) - 1;
                }
            } else {
                page_type = VMUSAGE_BOUND_NOT_INCORE;
            }
            if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
                next->vmb_type = page_type;
            } else if (next->vmb_type != page_type) {
                /*
                 * If current bound type does not match page
                 * type, need to split off new bound.
                 */
                tmp = vmu_alloc_bound();
                tmp->vmb_type = page_type;
                tmp->vmb_start = index;
                tmp->vmb_end = next->vmb_end;
                avl_insert_here(tree, tmp, next, AVL_AFTER);
                next->vmb_end = index - 1;
                if (*last == next)
                    *last = tmp;
                next = tmp;
            }
            if (pgcnt > 1) {
                /*
                 * If inside large page, jump to next large
                 * page
                 */
                index = (index & ~pgmsk) + pgcnt;
            } else {
                index++;
            }
        }
        if (next == *last) {
            ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
            break;
        } else
            next = AVL_NEXT(tree, next);
    }
    ANON_LOCK_EXIT(&amp->a_rwlock);
}
/*
 * Same as vmu_amp_update_incore_bounds(), except for tracking
 * incore-/not-incore for vnodes.
 */
static void
vmu_vnode_update_incore_bounds(avl_tree_t *tree, vnode_t *vnode,
    vmu_bound_t **first, vmu_bound_t **last)
{
    vmu_bound_t *next;
    vmu_bound_t *tmp;
    pgcnt_t index;
    char bound_type;
    char page_type;

    next = *first;
    for (;;) {
        if (!vn_has_cached_data(vnode))
            next->vmb_type = VMUSAGE_BOUND_NOT_INCORE;

        if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
            if (next == *last)
                break;
            next = AVL_NEXT(tree, next);
            continue;
        }

        bound_type = next->vmb_type;
        index = next->vmb_start;
        while (index <= next->vmb_end) {
            /*
             * These are used to determine how much to increment
             * index when a large page is found.
             */
            page_t *page;
            pgcnt_t pgcnt = 1;
            uint_t pgshft;
            pgcnt_t pgmsk;

            if (vn_has_cached_data(vnode) &&
                (page = page_exists(&vnode->v_object, ptob(index))) != NULL) {
                page_type = VMUSAGE_BOUND_INCORE;
                if (page->p_szc > 0) {
                    pgcnt = page_get_pagecnt(page->p_szc);
                    pgshft = page_get_shift(page->p_szc);
                    pgmsk = (0x1 << (pgshft - PAGESHIFT)) - 1;
                }
            } else {
                page_type = VMUSAGE_BOUND_NOT_INCORE;
            }
            if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
                next->vmb_type = page_type;
            } else if (next->vmb_type != page_type) {
                /*
                 * If current bound type does not match page
                 * type, need to split off new bound.
                 */
                tmp = vmu_alloc_bound();
                tmp->vmb_type = page_type;
                tmp->vmb_start = index;
                tmp->vmb_end = next->vmb_end;
                avl_insert_here(tree, tmp, next, AVL_AFTER);
                next->vmb_end = index - 1;
                if (*last == next)
                    *last = tmp;
                next = tmp;
            }
            if (pgcnt > 1) {
                /*
                 * If inside large page, jump to next large
                 * page
                 */
                index = (index & ~pgmsk) + pgcnt;
            } else {
                index++;
            }
        }
        if (next == *last) {
            ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
            break;
        } else
            next = AVL_NEXT(tree, next);
    }
}
/*
 * Calculate the rss and swap consumed by a segment.  vmu_entities is the
 * list of entities to visit.  For shared segments, the vnode or amp
 * is looked up in each entity to see if it has been already counted.  Private
 * anon pages are checked per entity to ensure that COW pages are not
 * double counted.
 *
 * For private mapped files, first the amp is checked for private pages.
 * Bounds not backed by the amp are looked up in the vnode for each entity
 * to avoid double counting of private COW vnode pages.
 */
static void
vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg)
{
    struct segvn_data *svd;
    struct shm_data *shmd;
    struct spt_data *sptd;
    vmu_object_t *shared_object = NULL;
    vmu_object_t *entity_object = NULL;
    vmu_entity_t *entity;
    vmusage_t *result;
    vmu_bound_t *first = NULL;
    vmu_bound_t *last = NULL;
    vmu_bound_t *cur = NULL;
    vmu_bound_t *e_first = NULL;
    vmu_bound_t *e_last = NULL;
    vmu_bound_t *tmp;
    pgcnt_t p_index, s_index, p_start, p_end, s_start, s_end, rss, virt;
    struct anon_map *private_amp = NULL;
    boolean_t incore = B_FALSE;
    boolean_t shared = B_FALSE;
    boolean_t file = B_FALSE;
    size_t swresv = 0;
    pgcnt_t panon = 0;

    /* Can zero-length segments exist?  Not sure, so paranoia. */
    if (seg->s_size <= 0)
        return;

    /*
     * Figure out if there is a shared object (such as a named vnode or
     * a shared amp), then figure out if there is a private amp, which
     * identifies private pages.
     */
    if (seg->s_ops == &segvn_ops) {
        svd = (struct segvn_data *)seg->s_data;
        if (svd->type == MAP_SHARED) {
            shared = B_TRUE;
        } else {
            swresv = svd->swresv;

            if (SEGVN_LOCK_TRYENTER(seg->s_as, &svd->lock,
                RW_READER) != 0) {
                /*
                 * Text replication anon maps can be shared
                 * across all zones. Space used for text
                 * replication is typically capped as a small %
                 * of memory.  To keep it simple for now we
                 * don't account for swap and memory space used
                 * for text replication.
                 */
                if (svd->tr_state == SEGVN_TR_OFF &&
                    svd->amp != NULL) {
                    private_amp = svd->amp;
                    p_start = svd->anon_index;
                    p_end = svd->anon_index +
                        btop(seg->s_size) - 1;
                }
                SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
            }
        }
        if (svd->vp != NULL) {
            file = B_TRUE;
            shared_object = vmu_find_insert_object(
                vmu_data.vmu_all_vnodes_hash, (caddr_t)svd->vp,
                VMUSAGE_TYPE_VNODE);
            s_start = btop(svd->offset);
            s_end = btop(svd->offset + seg->s_size) - 1;
        }
        if (svd->amp != NULL && svd->type == MAP_SHARED) {
            ASSERT(shared_object == NULL);
            shared_object = vmu_find_insert_object(
                vmu_data.vmu_all_amps_hash, (caddr_t)svd->amp,
                VMUSAGE_TYPE_AMP);
            s_start = svd->anon_index;
            s_end = svd->anon_index + btop(seg->s_size) - 1;
            /* schedctl mappings are always in core */
            if (svd->amp->swresv == 0)
                incore = B_TRUE;
        }
    } else if (seg->s_ops == &segspt_shmops) {
        shared = B_TRUE;
        shmd = (struct shm_data *)seg->s_data;
        shared_object = vmu_find_insert_object(
            vmu_data.vmu_all_amps_hash, (caddr_t)shmd->shm_amp,
            VMUSAGE_TYPE_AMP);
        s_start = 0;
        s_end = btop(seg->s_size) - 1;
        sptd = shmd->shm_sptseg->s_data;

        /* ism segments are always incore and do not reserve swap */
        if (sptd->spt_flags & SHM_SHARE_MMU)
            incore = B_TRUE;
    } else {
        return;
    }

    /*
     * If there is a private amp, count anon pages that exist.  If an
     * anon has a refcnt > 1 (COW sharing), then save the anon in a
     * hash so that it is not double counted.
     *
     * If there is also a shared object, then figure out the bounds
     * which are not mapped by the private amp.
     */
    if (private_amp != NULL) {

        /* Enter as writer to prevent COW anons from being freed */
        ANON_LOCK_ENTER(&private_amp->a_rwlock, RW_WRITER);

        p_index = p_start;
        s_index = s_start;

        while (p_index <= p_end) {

            pgcnt_t p_index_next;
            pgcnt_t p_bound_size;
            int cnt;
            anoff_t off;
            struct vnode *vn;
            struct anon *ap;
            page_t *page;		/* For handling of large */
            pgcnt_t pgcnt = 1;		/* pages */
            pgcnt_t pgstart;
            pgcnt_t pgend;
            uint_t pgshft;
            pgcnt_t pgmsk;

            p_index_next = p_index;
            ap = anon_get_next_ptr(private_amp->ahp,
                &p_index_next);

            /*
             * If next anon is past end of mapping, simulate
             * end of anon so loop terminates.
             */
            if (p_index_next > p_end) {
                p_index_next = p_end + 1;
                ap = NULL;
            }
            /*
             * For COW segments, keep track of bounds not
             * backed by private amp so they can be looked
             * up in the backing vnode
             */
            if (p_index_next != p_index) {

                /*
                 * Compute index difference between anon and
                 * previous anon.
                 */
                p_bound_size = p_index_next - p_index - 1;

                if (shared_object != NULL) {
                    cur = vmu_alloc_bound();
                    cur->vmb_start = s_index;
                    cur->vmb_end = s_index + p_bound_size;
                    cur->vmb_type = VMUSAGE_BOUND_UNKNOWN;
                    if (first == NULL) {
                        first = cur;
                        last = cur;
                    } else {
                        last->vmb_next = cur;
                        last = cur;
                    }
                }
                p_index = p_index + p_bound_size + 1;
                s_index = s_index + p_bound_size + 1;
            }

            /* Detect end of anons in amp */
            if (ap == NULL)
                break;

            cnt = ap->an_refcnt;
            swap_xlate(ap, &vn, &off);

            if (vn == NULL || !vn_has_cached_data(vn) ||
                (page = page_exists(&vn->v_object, off)) == NULL) {
                p_index++;
                s_index++;
                continue;
            }

            /*
             * If large page is found, compute portion of large
             * page in mapping, and increment indices to the next
             * large page.
             */
            if (page->p_szc > 0) {

                pgcnt = page_get_pagecnt(page->p_szc);
                pgshft = page_get_shift(page->p_szc);
                pgmsk = (0x1 << (pgshft - PAGESHIFT)) - 1;

                /* First page in large page */
                pgstart = p_index & ~pgmsk;
                /* Last page in large page */
                pgend = pgstart + pgcnt - 1;

                /*
                 * Artificially end page if page extends past
                 * end of mapping.
                 */
                if (pgend > p_end)
                    pgend = p_end;

                /*
                 * Compute number of pages from large page
                 * which are mapped.
                 */
                pgcnt = pgend - p_index + 1;

                /*
                 * Point indices at page after large page,
                 * or at page after end of mapping.
                 */
                p_index += pgcnt;
                s_index += pgcnt;
            } else {
                p_index++;
                s_index++;
            }

            /*
             * Assume anon structs with a refcnt
             * of 1 are not COW shared, so there
             * is no reason to track them per entity.
             */
            if (cnt == 1) {
                panon += pgcnt;
                continue;
            }
            for (entity = vmu_entities; entity != NULL;
                entity = entity->vme_next_calc) {

                result = &entity->vme_result;
                /*
                 * Track COW anons per entity so
                 * they are not double counted.
                 */
                if (vmu_find_insert_anon(entity->vme_anon_hash,
                    (caddr_t)ap) == 0)
                    continue;

                result->vmu_rss_all += (pgcnt << PAGESHIFT);
                result->vmu_rss_private +=
                    (pgcnt << PAGESHIFT);
            }
        }
        ANON_LOCK_EXIT(&private_amp->a_rwlock);
    }

    /* Add up resident anon and swap reserved for private mappings */
    if (swresv > 0 || panon > 0) {
        for (entity = vmu_entities; entity != NULL;
            entity = entity->vme_next_calc) {
            result = &entity->vme_result;
            result->vmu_swap_all += swresv;
            result->vmu_swap_private += swresv;
            result->vmu_rss_all += (panon << PAGESHIFT);
            result->vmu_rss_private += (panon << PAGESHIFT);
        }
    }

    /* Compute resident pages backing shared amp or named vnode */
    if (shared_object != NULL) {
        avl_tree_t *tree = &(shared_object->vmo_bounds);

        if (first == NULL) {
            /*
             * No private amp, or private amp has no anon
             * structs.  This means entire segment is backed by
             * the shared object.
             */
            first = vmu_alloc_bound();
            first->vmb_start = s_start;
            first->vmb_end = s_end;
            first->vmb_type = VMUSAGE_BOUND_UNKNOWN;
        }
        /*
         * Iterate bounds not backed by private amp, and compute
         * resident pages.
         */
        cur = first;
        while (cur != NULL) {

            if (vmu_insert_lookup_object_bounds(shared_object,
                cur->vmb_start, cur->vmb_end, VMUSAGE_BOUND_UNKNOWN,
                &first, &last) > 0) {
                /* new bounds, find incore/not-incore */
                if (shared_object->vmo_type ==
                    VMUSAGE_TYPE_VNODE) {
                    vmu_vnode_update_incore_bounds(
                        tree,
                        (vnode_t *)
                        shared_object->vmo_key, &first,
                        &last);
                } else {
                    vmu_amp_update_incore_bounds(
                        tree,
                        (struct anon_map *)
                        shared_object->vmo_key, &first,
                        &last, incore);
                }
                vmu_merge_bounds(tree, &first, &last);
            }
            for (entity = vmu_entities; entity != NULL;
                entity = entity->vme_next_calc) {
                avl_tree_t *e_tree;

                result = &entity->vme_result;

                entity_object = vmu_find_insert_object(
                    shared_object->vmo_type ==
                    VMUSAGE_TYPE_VNODE ? entity->vme_vnode_hash :
                    entity->vme_amp_hash,
                    shared_object->vmo_key,
                    shared_object->vmo_type);

                virt = vmu_insert_lookup_object_bounds(
                    entity_object, cur->vmb_start, cur->vmb_end,
                    VMUSAGE_BOUND_UNKNOWN, &e_first, &e_last);

                if (virt == 0)
                    continue;
                /*
                 * Range visited for this entity
                 */
                e_tree = &(entity_object->vmo_bounds);
                rss = vmu_update_bounds(e_tree, &e_first,
                    &e_last, tree, first, last);
                result->vmu_rss_all += (rss << PAGESHIFT);
                if (shared == B_TRUE && file == B_FALSE) {
                    /* shared anon mapping */
                    result->vmu_swap_all +=
                        (virt << PAGESHIFT);
                    result->vmu_swap_shared +=
                        (virt << PAGESHIFT);
                    result->vmu_rss_shared +=
                        (rss << PAGESHIFT);
                } else if (shared == B_TRUE && file == B_TRUE) {
                    /* shared file mapping */
                    result->vmu_rss_shared +=
                        (rss << PAGESHIFT);
                } else if (shared == B_FALSE &&
                    file == B_TRUE) {
                    /* private file mapping */
                    result->vmu_rss_private +=
                        (rss << PAGESHIFT);
                }
                vmu_merge_bounds(e_tree, &e_first, &e_last);
            }
            tmp = cur;
            cur = cur->vmb_next;
            vmu_free_bound(tmp);
        }
    }
}
/*
 * Based on the current calculation flags, find the relevant entities
 * which are relative to the process.  Then calculate each segment
 * in the process's address space for each relevant entity.
 */
static void
vmu_calculate_proc(proc_t *p)
{
    vmu_entity_t *entities = NULL;
    vmu_zone_t *zone;
    vmu_entity_t *tmp;
    struct as *as;
    struct seg *seg;
    int ret;

    /* Figure out which entities are being computed */
    if ((vmu_data.vmu_system) != NULL) {
        tmp = vmu_data.vmu_system;
        tmp->vme_next_calc = entities;
        entities = tmp;
    }
    if (vmu_data.vmu_calc_flags &
        (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_PROJECTS |
        VMUSAGE_ALL_PROJECTS | VMUSAGE_TASKS | VMUSAGE_ALL_TASKS |
        VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_EUSERS |
        VMUSAGE_ALL_EUSERS)) {
        ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash,
            (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
            (mod_hash_val_t *)&zone);
        if (ret != 0) {
            zone = vmu_alloc_zone(p->p_zone->zone_id);
            ret = i_mod_hash_insert_nosync(vmu_data.vmu_zones_hash,
                (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
                (mod_hash_val_t)zone, (mod_hash_hndl_t)0);
            ASSERT(ret == 0);
        }
        if (zone->vmz_zone != NULL) {
            tmp = zone->vmz_zone;
            tmp->vme_next_calc = entities;
            entities = tmp;
        }
        if (vmu_data.vmu_calc_flags &
            (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS)) {
            tmp = vmu_find_insert_entity(zone->vmz_projects_hash,
                p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS,
                zone->vmz_id);
            tmp->vme_next_calc = entities;
            entities = tmp;
        }
        if (vmu_data.vmu_calc_flags &
            (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS)) {
            tmp = vmu_find_insert_entity(zone->vmz_tasks_hash,
                p->p_task->tk_tkid, VMUSAGE_TASKS, zone->vmz_id);
            tmp->vme_next_calc = entities;
            entities = tmp;
        }
        if (vmu_data.vmu_calc_flags &
            (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS)) {
            tmp = vmu_find_insert_entity(zone->vmz_rusers_hash,
                crgetruid(p->p_cred), VMUSAGE_RUSERS, zone->vmz_id);
            tmp->vme_next_calc = entities;
            entities = tmp;
        }
        if (vmu_data.vmu_calc_flags &
            (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS)) {
            tmp = vmu_find_insert_entity(zone->vmz_eusers_hash,
                crgetuid(p->p_cred), VMUSAGE_EUSERS, zone->vmz_id);
            tmp->vme_next_calc = entities;
            entities = tmp;
        }
    }
    /* Entities which collapse projects and users for all zones */
    if (vmu_data.vmu_calc_flags & VMUSAGE_COL_PROJECTS) {
        tmp = vmu_find_insert_entity(vmu_data.vmu_projects_col_hash,
            p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS, ALL_ZONES);
        tmp->vme_next_calc = entities;
        entities = tmp;
    }
    if (vmu_data.vmu_calc_flags & VMUSAGE_COL_RUSERS) {
        tmp = vmu_find_insert_entity(vmu_data.vmu_rusers_col_hash,
            crgetruid(p->p_cred), VMUSAGE_RUSERS, ALL_ZONES);
        tmp->vme_next_calc = entities;
        entities = tmp;
    }
    if (vmu_data.vmu_calc_flags & VMUSAGE_COL_EUSERS) {
        tmp = vmu_find_insert_entity(vmu_data.vmu_eusers_col_hash,
            crgetuid(p->p_cred), VMUSAGE_EUSERS, ALL_ZONES);
        tmp->vme_next_calc = entities;
        entities = tmp;
    }

    ASSERT(entities != NULL);
    /* process all segs in process's address space */
    as = p->p_as;
    AS_LOCK_ENTER(as, RW_READER);
    for (seg = AS_SEGFIRST(as); seg != NULL;
        seg = AS_SEGNEXT(as, seg)) {
        vmu_calculate_seg(entities, seg);
    }
    AS_LOCK_EXIT(as);
}
1550 if (vmu_data
.vmu_system
!= NULL
) {
1551 vmu_free_entity(vmu_data
.vmu_system
);
1552 vmu_data
.vmu_system
= NULL
;
1554 if (vmu_data
.vmu_zones_hash
!= NULL
)
1555 i_mod_hash_clear_nosync(vmu_data
.vmu_zones_hash
);
1556 if (vmu_data
.vmu_projects_col_hash
!= NULL
)
1557 i_mod_hash_clear_nosync(vmu_data
.vmu_projects_col_hash
);
1558 if (vmu_data
.vmu_rusers_col_hash
!= NULL
)
1559 i_mod_hash_clear_nosync(vmu_data
.vmu_rusers_col_hash
);
1560 if (vmu_data
.vmu_eusers_col_hash
!= NULL
)
1561 i_mod_hash_clear_nosync(vmu_data
.vmu_eusers_col_hash
);
1563 i_mod_hash_clear_nosync(vmu_data
.vmu_all_vnodes_hash
);
1564 i_mod_hash_clear_nosync(vmu_data
.vmu_all_amps_hash
);
/*
 * Free unused data structures.  These can result if the system workload
 * decreases between calculations.
 */
static void
vmu_free_extra(void)
{
    vmu_bound_t *tb;
    vmu_object_t *to;
    vmu_entity_t *te;
    vmu_zone_t *tz;

    while (vmu_data.vmu_free_bounds != NULL) {
        tb = vmu_data.vmu_free_bounds;
        vmu_data.vmu_free_bounds = vmu_data.vmu_free_bounds->vmb_next;
        kmem_cache_free(vmu_bound_cache, tb);
    }
    while (vmu_data.vmu_free_objects != NULL) {
        to = vmu_data.vmu_free_objects;
        vmu_data.vmu_free_objects =
            vmu_data.vmu_free_objects->vmo_next;
        kmem_cache_free(vmu_object_cache, to);
    }
    while (vmu_data.vmu_free_entities != NULL) {
        te = vmu_data.vmu_free_entities;
        vmu_data.vmu_free_entities =
            vmu_data.vmu_free_entities->vme_next;
        if (te->vme_vnode_hash != NULL)
            mod_hash_destroy_hash(te->vme_vnode_hash);
        if (te->vme_amp_hash != NULL)
            mod_hash_destroy_hash(te->vme_amp_hash);
        if (te->vme_anon_hash != NULL)
            mod_hash_destroy_hash(te->vme_anon_hash);
        kmem_free(te, sizeof (vmu_entity_t));
    }
    while (vmu_data.vmu_free_zones != NULL) {
        tz = vmu_data.vmu_free_zones;
        vmu_data.vmu_free_zones =
            vmu_data.vmu_free_zones->vmz_next;
        if (tz->vmz_projects_hash != NULL)
            mod_hash_destroy_hash(tz->vmz_projects_hash);
        if (tz->vmz_tasks_hash != NULL)
            mod_hash_destroy_hash(tz->vmz_tasks_hash);
        if (tz->vmz_rusers_hash != NULL)
            mod_hash_destroy_hash(tz->vmz_rusers_hash);
        if (tz->vmz_eusers_hash != NULL)
            mod_hash_destroy_hash(tz->vmz_eusers_hash);
        kmem_free(tz, sizeof (vmu_zone_t));
    }
}
extern kcondvar_t *pr_pid_cv;
/*
 * Determine which entity types are relevant and allocate the hashes to
 * track them.  Then walk the process table and count rss and swap
 * for each process's address space.  Address space objects such as
 * vnodes, amps and anons are tracked per entity, so that they are
 * not double counted in the results.
 */
static void
vmu_calculate(void)
{
    int i = 0;
    int ret;
    proc_t *p;

    vmu_clear_calc();

    if (vmu_data.vmu_calc_flags & VMUSAGE_SYSTEM)
        vmu_data.vmu_system = vmu_alloc_entity(0, VMUSAGE_SYSTEM,
            ALL_ZONES);

    /*
     * Walk process table and calculate rss of each proc.
     *
     * Pidlock and p_lock cannot be held while doing the rss calculation.
     * This is because:
     *	1.  The calculation allocates using KM_SLEEP.
     *	2.  The calculation grabs a_lock, which cannot be grabbed
     *	    after p_lock.
     *
     * Since pidlock must be dropped, we cannot simply just walk the
     * practive list.  Instead, we walk the process table, and sprlock
     * each process to ensure that it does not exit during the
     * calculation.
     */
    mutex_enter(&pidlock);
    for (i = 0; i < v.v_proc; i++) {
again:
        p = pid_entry(i);
        if (p == NULL)
            continue;

        mutex_enter(&p->p_lock);
        mutex_exit(&pidlock);

        if (panicstr) {
            mutex_exit(&p->p_lock);
            return;
        }

        /* Try to set P_PR_LOCK */
        ret = sprtrylock_proc(p);
        if (ret == -1) {
            /* Process in invalid state */
            mutex_exit(&p->p_lock);
            mutex_enter(&pidlock);
            continue;
        } else if (ret == 1) {
            /*
             * P_PR_LOCK is already set.  Wait and try again.
             * This also drops p_lock.
             */
            sprwaitlock_proc(p);
            mutex_enter(&pidlock);
            goto again;
        }
        mutex_exit(&p->p_lock);

        vmu_calculate_proc(p);

        mutex_enter(&p->p_lock);
        sprunlock(p);
        mutex_enter(&pidlock);
    }
    mutex_exit(&pidlock);

    vmu_free_extra();
}
/*
 * allocate a new cache for N results satisfying flags
 */
static vmu_cache_t *
vmu_cache_alloc(size_t nres, uint_t flags)
{
    vmu_cache_t *cache;

    cache = kmem_zalloc(sizeof (vmu_cache_t), KM_SLEEP);
    cache->vmc_results = kmem_zalloc(sizeof (vmusage_t) * nres, KM_SLEEP);
    cache->vmc_nresults = nres;
    cache->vmc_flags = flags;
    cache->vmc_refcnt = 1;
    return (cache);
}

/*
 * Make sure cached results are not freed
 */
static void
vmu_cache_hold(vmu_cache_t *cache)
{
    ASSERT(MUTEX_HELD(&vmu_data.vmu_lock));
    cache->vmc_refcnt++;
}

static void
vmu_cache_rele(vmu_cache_t *cache)
{
    ASSERT(MUTEX_HELD(&vmu_data.vmu_lock));
    ASSERT(cache->vmc_refcnt > 0);
    cache->vmc_refcnt--;
    if (cache->vmc_refcnt == 0) {
        kmem_free(cache->vmc_results, sizeof (vmusage_t) *
            cache->vmc_nresults);
        kmem_free(cache, sizeof (vmu_cache_t));
    }
}
/*
 * Copy out the cached results to a caller.  Inspect the caller's flags
 * and zone to determine which cached results should be copied.
 */
static int
vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres,
    uint_t flags, int cpflg)
{
    vmusage_t *result, *out_result;
    vmusage_t dummy;
    size_t i, count = 0;
    size_t bufsize;
    int ret = 0;
    uint_t types = 0;

    if (ddi_copyin((caddr_t)nres, &bufsize, sizeof (size_t), cpflg))
        return (set_errno(EFAULT));

    /* figure out what results the caller is interested in. */
    if ((flags & VMUSAGE_SYSTEM) && curproc->p_zone == global_zone)
        types |= VMUSAGE_SYSTEM;
    if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES))
        types |= VMUSAGE_ZONE;
    if (flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
        VMUSAGE_COL_PROJECTS))
        types |= VMUSAGE_PROJECTS;
    if (flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
        types |= VMUSAGE_TASKS;
    if (flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS))
        types |= VMUSAGE_RUSERS;
    if (flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS))
        types |= VMUSAGE_EUSERS;

    /* count results for current zone */
    out_result = buf;
    for (result = cache->vmc_results, i = 0;
        i < cache->vmc_nresults; result++, i++) {

        /* Do not return "other-zone" results to non-global zones */
        if (curproc->p_zone != global_zone &&
            curproc->p_zone->zone_id != result->vmu_zoneid)
            continue;

        /*
         * If non-global zone requests VMUSAGE_SYSTEM, fake
         * up VMUSAGE_ZONE result as VMUSAGE_SYSTEM result.
         */
        if (curproc->p_zone != global_zone &&
            (flags & VMUSAGE_SYSTEM) != 0 &&
            result->vmu_type == VMUSAGE_ZONE) {
            count++;
            if (out_result != NULL) {
                if (bufsize < count) {
                    ret = set_errno(EOVERFLOW);
                } else {
                    dummy = *result;
                    dummy.vmu_zoneid = ALL_ZONES;
                    dummy.vmu_type = VMUSAGE_SYSTEM;
                    if (ddi_copyout(&dummy, out_result,
                        sizeof (vmusage_t), cpflg))
                        return (set_errno(EFAULT));
                    out_result++;
                }
            }
        }

        /* Skip results that do not match requested type */
        if ((result->vmu_type & types) == 0)
            continue;

        /* Skip collated results if not requested */
        if (result->vmu_zoneid == ALL_ZONES) {
            if (result->vmu_type == VMUSAGE_PROJECTS &&
                (flags & VMUSAGE_COL_PROJECTS) == 0)
                continue;
            if (result->vmu_type == VMUSAGE_EUSERS &&
                (flags & VMUSAGE_COL_EUSERS) == 0)
                continue;
            if (result->vmu_type == VMUSAGE_RUSERS &&
                (flags & VMUSAGE_COL_RUSERS) == 0)
                continue;
        }

        /* Skip "other zone" results if not requested */
        if (result->vmu_zoneid != curproc->p_zone->zone_id) {
            if (result->vmu_type == VMUSAGE_ZONE &&
                (flags & VMUSAGE_ALL_ZONES) == 0)
                continue;
            if (result->vmu_type == VMUSAGE_PROJECTS &&
                (flags & (VMUSAGE_ALL_PROJECTS |
                VMUSAGE_COL_PROJECTS)) == 0)
                continue;
            if (result->vmu_type == VMUSAGE_TASKS &&
                (flags & VMUSAGE_ALL_TASKS) == 0)
                continue;
            if (result->vmu_type == VMUSAGE_RUSERS &&
                (flags & (VMUSAGE_ALL_RUSERS |
                VMUSAGE_COL_RUSERS)) == 0)
                continue;
            if (result->vmu_type == VMUSAGE_EUSERS &&
                (flags & (VMUSAGE_ALL_EUSERS |
                VMUSAGE_COL_EUSERS)) == 0)
                continue;
        }

        count++;
        if (out_result != NULL) {
            if (bufsize < count) {
                ret = set_errno(EOVERFLOW);
            } else {
                if (ddi_copyout(result, out_result,
                    sizeof (vmusage_t), cpflg))
                    return (set_errno(EFAULT));
                out_result++;
            }
        }
    }
    if (ddi_copyout(&count, (void *)nres, sizeof (size_t), cpflg))
        return (set_errno(EFAULT));

    return (ret);
}
/*
 * vm_getusage()
 *
 * Counts rss and swap by zone, project, task, and/or user.  The flags argument
 * determines the type of results structures returned.  Flags requesting
 * results from more than one zone are "flattened" to the local zone if the
 * caller is not the global zone.
 *
 * args:
 *	flags:	bitmap consisting of one or more of VMUSAGE_*.
 *	age:	maximum allowable age (time since counting was done) in
 *		seconds of the results.  Results from previous callers are
 *		cached.
 *	buf:	pointer to buffer array of vmusage_t.  If NULL, then only nres
 *		is set on success.
 *	nres:	Set to number of vmusage_t structures pointed to by buf
 *		before calling vm_getusage().
 *		On return 0 (success) or ENOSPC, is set to the number of result
 *		structures returned or attempted to return.
 *
 * returns 0 on success, -1 on failure:
 *	EINTR (interrupted)
 *	ENOSPC (nres too small for results, nres set to needed value for
 *		success)
 *	EINVAL (flags invalid)
 *	EFAULT (bad address for buf or nres)
 */
int
vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg)
{
    vmu_entity_t *entity;
    vmusage_t *result;
    vmu_cache_t *cache;
    int ret = 0;
    int cacherecent = 0;
    hrtime_t now;
    uint_t flags_orig;

    /*
     * Non-global zones cannot request system wide and/or collated
     * results, or the system result, so munge the flags accordingly.
     */
    flags_orig = flags;
    if (curproc->p_zone != global_zone) {
        if (flags & (VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS)) {
            flags &= ~(VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS);
            flags |= VMUSAGE_PROJECTS;
        }
        if (flags & (VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS)) {
            flags &= ~(VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS);
            flags |= VMUSAGE_RUSERS;
        }
        if (flags & (VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS)) {
            flags &= ~(VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS);
            flags |= VMUSAGE_EUSERS;
        }
        if (flags & VMUSAGE_SYSTEM) {
            flags &= ~VMUSAGE_SYSTEM;
            flags |= VMUSAGE_ZONE;
        }
    }

    /* Check for unknown flags */
    if ((flags & (~VMUSAGE_MASK)) != 0)
        return (set_errno(EINVAL));

    /* Check for no flags */
    if ((flags & VMUSAGE_MASK) == 0)
        return (set_errno(EINVAL));

    mutex_enter(&vmu_data.vmu_lock);
    now = gethrtime();

    if (vmu_data.vmu_cache != NULL) {

        if ((vmu_data.vmu_cache->vmc_timestamp +
            ((hrtime_t)age * NANOSEC)) > now)
            cacherecent = 1;

        if ((vmu_data.vmu_cache->vmc_flags & flags) == flags &&
            cacherecent == 1) {
            cache = vmu_data.vmu_cache;
            vmu_cache_hold(cache);
            mutex_exit(&vmu_data.vmu_lock);

            ret = vmu_copyout_results(cache, buf, nres, flags_orig,
                cpflg);
            mutex_enter(&vmu_data.vmu_lock);
            vmu_cache_rele(cache);
            if (vmu_data.vmu_pending_waiters > 0)
                cv_broadcast(&vmu_data.vmu_cv);
            mutex_exit(&vmu_data.vmu_lock);

            return (ret);
        }
        /*
         * If the cache is recent, it is likely that there are other
         * consumers of vm_getusage running, so add their flags to the
         * desired flags for the calculation.
         */
        if (cacherecent == 1)
            flags = vmu_data.vmu_cache->vmc_flags | flags;
    }
    if (vmu_data.vmu_calc_thread == NULL) {

        vmu_data.vmu_calc_thread = curthread;
        vmu_data.vmu_calc_flags = flags;
        vmu_data.vmu_entities = NULL;
        vmu_data.vmu_nentities = 0;
        if (vmu_data.vmu_pending_waiters > 0)
            vmu_data.vmu_calc_flags |=
                vmu_data.vmu_pending_flags;

        vmu_data.vmu_pending_flags = 0;
        mutex_exit(&vmu_data.vmu_lock);
        vmu_calculate();
        mutex_enter(&vmu_data.vmu_lock);
        /* copy results to cache */
        if (vmu_data.vmu_cache != NULL)
            vmu_cache_rele(vmu_data.vmu_cache);
        cache = vmu_data.vmu_cache =
            vmu_cache_alloc(vmu_data.vmu_nentities,
            vmu_data.vmu_calc_flags);

        result = cache->vmc_results;
        for (entity = vmu_data.vmu_entities; entity != NULL;
            entity = entity->vme_next) {
            *result = entity->vme_result;
            result++;
        }
        cache->vmc_timestamp = gethrtime();
        vmu_cache_hold(cache);

        vmu_data.vmu_calc_flags = 0;
        vmu_data.vmu_calc_thread = NULL;

        if (vmu_data.vmu_pending_waiters > 0)
            cv_broadcast(&vmu_data.vmu_cv);

        mutex_exit(&vmu_data.vmu_lock);

        ret = vmu_copyout_results(cache, buf, nres, flags_orig, cpflg);
        mutex_enter(&vmu_data.vmu_lock);
        vmu_cache_rele(cache);
        mutex_exit(&vmu_data.vmu_lock);

        return (ret);
    }
    vmu_data.vmu_pending_flags |= flags;
    vmu_data.vmu_pending_waiters++;
    while (vmu_data.vmu_calc_thread != NULL) {
        if (cv_wait_sig(&vmu_data.vmu_cv,
            &vmu_data.vmu_lock) == 0) {
            vmu_data.vmu_pending_waiters--;
            mutex_exit(&vmu_data.vmu_lock);
            return (set_errno(EINTR));
        }
    }
    vmu_data.vmu_pending_waiters--;