Move /var/svc/log to /var/log/svc
[unleashed/lotheac.git] / usr / src / cmd / rcap / rcapd / rcapd_scanner.c
blob090535ab76fcb12a697158369e8d8504b53bbccb
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 #include <sys/mman.h>
27 #include <sys/param.h>
28 #include <sys/stat.h>
29 #include <sys/types.h>
30 #include <assert.h>
31 #include <errno.h>
32 #include <fcntl.h>
33 #include <libproc.h>
34 #include <limits.h>
35 #include <procfs.h>
36 #include <stdio.h>
37 #include <stdlib.h>
38 #include <strings.h>
39 #include <time.h>
40 #include <unistd.h>
41 #include "rcapd.h"
42 #include "rcapd_rfd.h"
43 #include "rcapd_mapping.h"
44 #include "utils.h"
46 static int lpc_xmap_update(lprocess_t *);
47 #ifdef DEBUG
48 extern int lmapping_dump_diff(lmapping_t *lm1, lmapping_t *lm2);
49 #endif /* DEBUG */
52 * The number of file descriptors required to grab a process and create an
53 * agent in it.
55 #define PGRAB_FD_COUNT 10
58 * Record a position in an address space as it corresponds to a prpageheader_t
59 * and affiliated structures.
61 typedef struct prpageheader_cur {
62 int pr_nmap; /* number of mappings in address space */
63 int pr_map; /* number of this mapping */
64 uint64_t pr_pgoff; /* page offset into mapping */
65 uint64_t pr_npage; /* number of pages in mapping */
66 uint64_t pr_pagesize; /* page size of mapping */
67 uintptr_t pr_addr; /* base of mapping */
68 prpageheader_t *pr_prpageheader; /* associated page header */
69 void *pr_pdaddr; /* address of page's byte in pagedata */
70 prxmap_t *pr_xmap; /* array containing per-segment information */
71 int pr_nxmap; /* number of xmaps in array */
72 int64_t pr_rss; /* number of resident pages in mapping, */
73 /* or -1 if xmap is out of sync */
74 int64_t pr_pg_rss; /* number of pageable pages in mapping, or -1 */
75 } prpageheader_cur_t;
/*
 * Handle of the currently-scanned process.  scan() resets it to NULL after
 * releasing the process.
 */
static struct ps_prochandle *scan_pr;

/*
 * Verbosity classes accepted by st_debug().
 */
typedef enum {
	STDL_NORMAL,
	STDL_HIGH
} st_debug_level_t;
85 * Output a scanning-related debug message.
87 /*PRINTFLIKE3*/ /*ARGSUSED*/
88 static void
89 st_debug(st_debug_level_t level, lcollection_t *lcol, char *msg, ...)
91 #ifdef DEBUG_MSG
92 va_list alist;
93 char *buf;
94 size_t len;
96 if (get_message_priority() < ((level == STDL_HIGH) ? RCM_DEBUG_HIGH
97 : RCM_DEBUG))
98 return;
100 len = strlen(msg) + LINELEN;
101 buf = malloc(len);
102 if (buf == NULL)
103 return;
104 (void) snprintf(buf, len, "%s %s scanner %s",
105 (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? "project" : "zone"),
106 lcol->lcol_name, msg);
108 va_start(alist, msg);
109 vdprintfe(RCM_DEBUG, buf, alist);
110 va_end(alist);
112 free(buf);
113 #endif /* DEBUG_MSG */
117 * Determine the collection's current victim, based on its last. The last will
118 * be returned, or, if invalid, any other valid process, if the collection has
119 * any.
121 static lprocess_t *
122 get_valid_victim(lcollection_t *lcol, lprocess_t *lpc)
124 if (lpc == NULL || !lcollection_member(lcol, lpc))
125 lpc = lcol->lcol_lprocess;
128 * Find the next scannable process, and make it the victim.
130 while (lpc != NULL && lpc->lpc_unscannable != 0)
131 lpc = lpc->lpc_next;
133 return (lpc);
137 * Get a process's combined current pagedata (per-page referenced and modified
138 * bits) and set the supplied pointer to it. The caller is responsible for
139 * freeing the data. If the pagedata is unreadable, a nonzero value is
140 * returned, and errno is set. Otherwise, 0 is returned.
142 static int
143 get_pagedata(prpageheader_t **pghpp, int fd)
145 int res;
146 struct stat st;
148 redo:
149 errno = 0;
150 if (fstat(fd, &st) != 0) {
151 debug("cannot stat pagedata\n");
152 return (-1);
155 errno = 0;
156 *pghpp = malloc(st.st_size);
157 if (*pghpp == NULL) {
158 debug("cannot malloc() %ld bytes for pagedata", st.st_size);
159 return (-1);
161 (void) bzero(*pghpp, st.st_size);
163 errno = 0;
164 if ((res = read(fd, *pghpp, st.st_size)) != st.st_size) {
165 free(*pghpp);
166 *pghpp = NULL;
167 if (res > 0 || errno == E2BIG) {
168 debug("pagedata changed size, retrying\n");
169 goto redo;
170 } else {
171 debug("cannot read pagedata");
172 return (-1);
176 return (0);
180 * Return the count of kilobytes of pages represented by the given pagedata
181 * which meet the given criteria, having pages which are in all of the states
182 * specified by the mask, and in none of the states in the notmask. If the
183 * CP_CLEAR flag is set, the pagedata will also be cleared.
185 #define CP_CLEAR 1
186 static uint64_t
187 count_pages(prpageheader_t *pghp, int flags, int mask, int notmask)
189 int map;
190 caddr_t cur, end;
191 prpageheader_t pgh = *pghp;
192 prasmap_t *asmapp;
193 uint64_t count = 0;
195 cur = (caddr_t)pghp + sizeof (*pghp);
196 for (map = 0; map < pgh.pr_nmap; map++) {
197 asmapp = (prasmap_t *)(uintptr_t)cur;
198 cur += sizeof (*asmapp);
199 end = cur + asmapp->pr_npage;
200 while (cur < end) {
201 if ((*cur & mask) == mask && (*cur & notmask) == 0)
202 count += asmapp->pr_pagesize / 1024;
203 if ((flags & CP_CLEAR) != 0)
204 *cur = 0;
205 cur++;
209 * Skip to next 64-bit-aligned address to get the next
210 * prasmap_t.
212 cur = (caddr_t)((intptr_t)(cur + 7) & ~7);
215 return (count);
219 * Return the amount of memory (in kilobytes) that hasn't been referenced or
220 * modified, which memory which will be paged out first. Should be written to
221 * exclude nonresident pages when sufficient interfaces exist.
223 static uint64_t
224 unrm_size(lprocess_t *lpc)
226 return (count_pages(lpc->lpc_prpageheader, CP_CLEAR,
227 0, PG_MODIFIED | PG_REFERENCED));
231 * Advance a prpageheader_cur_t to the address space's next mapping, returning
232 * its address, or NULL if there is none. Any known nonpageable or nonresident
233 * mappings will be skipped over.
235 static uintptr_t
236 advance_prpageheader_cur_nextmapping(prpageheader_cur_t *pcp)
238 prasmap_t *pap;
239 int i;
241 next:
242 ASSERT(pcp->pr_map < pcp->pr_nmap);
243 if ((pcp->pr_map + 1) == pcp->pr_nmap)
244 return ((uintptr_t)NULL);
245 pcp->pr_map++;
246 if (pcp->pr_pgoff < pcp->pr_npage) {
247 pcp->pr_pdaddr = (caddr_t)(uintptr_t)
248 ((uintptr_t)pcp->pr_pdaddr +
249 (pcp->pr_npage - pcp->pr_pgoff));
250 pcp->pr_pgoff = pcp->pr_npage;
253 * Skip to next 64-bit-aligned address to get the next prasmap_t.
255 pcp->pr_pdaddr = (caddr_t)(((uintptr_t)pcp->pr_pdaddr + 7) & ~7);
256 pap = (prasmap_t *)pcp->pr_pdaddr;
257 pcp->pr_pgoff = 0;
258 pcp->pr_npage = pap->pr_npage;
259 pcp->pr_pagesize = pap->pr_pagesize;
260 pcp->pr_addr = pap->pr_vaddr;
261 pcp->pr_pdaddr = pap + 1;
264 * Skip any known nonpageable mappings. Currently, the only one
265 * detected is the schedctl page.
267 if ((pap->pr_mflags ^ (MA_SHARED | MA_READ | MA_WRITE | MA_EXEC |
268 MA_ANON)) == 0 && pap->pr_npage == 1) {
269 debug("identified nonpageable schedctl mapping at %p\n",
270 (void *)pcp->pr_addr);
271 goto next;
275 * Skip mappings with no resident pages. If the xmap does not
276 * correspond to the pagedata for any reason, it will be ignored.
278 pcp->pr_rss = -1;
279 pcp->pr_pg_rss = -1;
280 for (i = 0; i < pcp->pr_nxmap; i++) {
281 prxmap_t *xmap = &pcp->pr_xmap[i];
283 if (pcp->pr_addr == xmap->pr_vaddr && xmap->pr_size ==
284 (pcp->pr_npage * pcp->pr_pagesize)) {
285 pcp->pr_rss = xmap->pr_rss;
287 * Remove COW pages from the pageable RSS count.
289 if ((xmap->pr_mflags & MA_SHARED) == 0)
290 pcp->pr_pg_rss = xmap->pr_anon;
291 break;
294 if (pcp->pr_rss == 0) {
295 debug("identified nonresident mapping at 0x%p\n",
296 (void *)pcp->pr_addr);
297 goto next;
298 } else if (pcp->pr_pg_rss == 0) {
299 debug("identified unpageable mapping at 0x%p\n",
300 (void *)pcp->pr_addr);
301 goto next;
304 return (pcp->pr_addr);
308 * Advance a prpageheader_cur_t to the mapping's next page, returning its
309 * address, or NULL if there is none.
311 static void *
312 advance_prpageheader_cur(prpageheader_cur_t *pcp)
314 ASSERT(pcp->pr_pgoff < pcp->pr_npage);
315 if ((pcp->pr_pgoff + 1) == pcp->pr_npage)
316 return (NULL);
317 pcp->pr_pdaddr = (caddr_t)pcp->pr_pdaddr + 1;
318 pcp->pr_pgoff++;
320 ASSERT((*(char *)pcp->pr_pdaddr & ~(PG_MODIFIED | PG_REFERENCED)) == 0);
321 return ((caddr_t)pcp->pr_addr + pcp->pr_pgoff * pcp->pr_pagesize);
325 * Initialize a prpageheader_cur_t, positioned at the first page of the mapping
326 * of an address space.
328 static void *
329 set_prpageheader_cur(prpageheader_cur_t *pcp, prpageheader_t *php,
330 prxmap_t *xmap, int nxmap)
332 bzero(pcp, sizeof (*pcp));
333 pcp->pr_nmap = php->pr_nmap;
334 pcp->pr_map = -1;
335 pcp->pr_prpageheader = php;
336 pcp->pr_xmap = xmap;
337 pcp->pr_nxmap = nxmap;
338 pcp->pr_pdaddr = (prpageheader_t *)php + 1;
340 return ((void *)advance_prpageheader_cur_nextmapping(pcp));
344 * Position a prpageheader_cur_t to the mapped address greater or equal to the
345 * given value.
347 static void *
348 set_prpageheader_cur_addr(prpageheader_cur_t *pcp, prpageheader_t *php,
349 prxmap_t *xmap, int nxmap, void *naddr)
351 void *addr = set_prpageheader_cur(pcp, php, xmap, nxmap);
353 while (addr != NULL && addr <= naddr)
354 if (naddr < (void *)((caddr_t)pcp->pr_addr +
355 pcp->pr_pagesize * pcp->pr_npage)) {
356 uint64_t pgdiff = ((uintptr_t)naddr -
357 (uintptr_t)pcp->pr_addr) / pcp->pr_pagesize;
358 pcp->pr_pgoff += pgdiff;
359 pcp->pr_pdaddr = (caddr_t)pcp->pr_pdaddr + pgdiff;
360 addr = (caddr_t)pcp->pr_addr + pcp->pr_pagesize *
361 pcp->pr_pgoff;
362 break;
363 } else
364 addr =
365 (void *)advance_prpageheader_cur_nextmapping(pcp);
367 return (addr);
370 static void
371 revoke_pagedata(rfd_t *rfd)
373 lprocess_t *lpc = rfd->rfd_data;
375 st_debug(STDL_NORMAL, lpc->lpc_collection, "revoking pagedata for"
376 " process %d\n", (int)lpc->lpc_pid);
377 ASSERT(lpc->lpc_pgdata_fd != -1);
378 lpc->lpc_pgdata_fd = -1;
#ifdef DEBUG
/*
 * Build an lmapping list holding the base address and byte size of each
 * mapping that the cursor visits in the given pagedata.  *lm must be NULL on
 * entry.
 */
static void
mklmapping(lmapping_t **lm, prpageheader_t *pgh)
{
	prpageheader_cur_t cur;
	void *addr;

	addr = set_prpageheader_cur(&cur, pgh, NULL, -1);
	ASSERT(*lm == NULL);
	for (; addr != NULL;
	    addr = (void *)advance_prpageheader_cur_nextmapping(&cur)) {
		(void) lmapping_insert(lm, cur.pr_addr,
		    cur.pr_npage * cur.pr_pagesize);
	}
}

/*
 * Print each entry (address and size) of an lmapping list via debug().
 */
static void
lmapping_dump(lmapping_t *lm)
{
	lmapping_t *ent;

	debug("lm: %p\n", (void *)lm);
	for (ent = lm; ent != NULL; ent = ent->lm_next) {
		debug("\t(%p, %llx\n", (void *)ent->lm_addr,
		    (unsigned long long)ent->lm_size);
	}
}
#endif /* DEBUG */
410 * OR two prpagedata_t which are supposedly snapshots of the same address
411 * space. Intersecting mappings with different page sizes are tolerated but
412 * not normalized (not accurate). If the mappings of the two snapshots differ
413 * in any regard, the supplied mappings_changed flag will be set.
415 static void
416 OR_pagedata(prpageheader_t *src, prpageheader_t *dst, int *mappings_changedp)
418 prpageheader_cur_t src_cur;
419 prpageheader_cur_t dst_cur;
420 uintptr_t src_addr;
421 uintptr_t dst_addr;
422 int mappings_changed = 0;
425 * OR source pagedata with the destination, for pages of intersecting
426 * mappings.
428 src_addr = (uintptr_t)set_prpageheader_cur(&src_cur, src, NULL, -1);
429 dst_addr = (uintptr_t)set_prpageheader_cur(&dst_cur, dst, NULL, -1);
430 while (src_addr != (uintptr_t)NULL && dst_addr != (uintptr_t)NULL) {
431 while (src_addr == dst_addr && src_addr != (uintptr_t)NULL) {
432 *(char *)dst_cur.pr_pdaddr |=
433 *(char *)src_cur.pr_pdaddr;
434 src_addr = (uintptr_t)advance_prpageheader_cur(
435 &src_cur);
436 dst_addr = (uintptr_t)advance_prpageheader_cur(
437 &dst_cur);
439 if (src_addr != dst_addr)
440 mappings_changed = 1;
441 src_addr = advance_prpageheader_cur_nextmapping(&src_cur);
442 dst_addr = advance_prpageheader_cur_nextmapping(&dst_cur);
443 while (src_addr != dst_addr && src_addr != (uintptr_t)NULL &&
444 dst_addr != (uintptr_t)NULL) {
445 mappings_changed = 1;
446 if (src_addr < dst_addr)
447 src_addr = advance_prpageheader_cur_nextmapping(
448 &src_cur);
449 else
450 dst_addr = advance_prpageheader_cur_nextmapping(
451 &dst_cur);
455 *mappings_changedp = mappings_changed;
459 * Merge the current pagedata with that on hand. If the pagedata is
460 * unretrievable for any reason, such as the process having exited or being a
461 * zombie, a nonzero value is returned, the process should be marked
462 * unscannable, and future attempts to scan it should be avoided, since the
463 * symptom is probably permament. If the mappings of either pagedata
464 * differ in any respect, the supplied callback will be invoked once.
466 static int
467 merge_current_pagedata(lprocess_t *lpc,
468 void(*mappings_changed_cb) (lprocess_t *))
470 prpageheader_t *pghp;
471 int mappings_changed = 0;
472 uint64_t cnt;
474 if (lpc->lpc_pgdata_fd < 0 || get_pagedata(&pghp, lpc->lpc_pgdata_fd) !=
475 0) {
476 char pathbuf[PROC_PATH_MAX];
478 (void) snprintf(pathbuf, sizeof (pathbuf), "/proc/%d/pagedata",
479 (int)lpc->lpc_pid);
480 if ((lpc->lpc_pgdata_fd = rfd_open(pathbuf, 1, RFD_PAGEDATA,
481 revoke_pagedata, lpc, O_RDONLY, 0)) < 0 ||
482 get_pagedata(&pghp, lpc->lpc_pgdata_fd) != 0)
483 return (-1);
484 debug("starting/resuming pagedata collection for %d\n",
485 (int)lpc->lpc_pid);
488 cnt = count_pages(pghp, 0, PG_MODIFIED | PG_REFERENCED, 0);
489 if (cnt != 0 || lpc->lpc_rss != 0)
490 debug("process %d: %llu/%llukB rfd/mdfd since last read\n",
491 (int)lpc->lpc_pid, (unsigned long long)cnt,
492 (unsigned long long)lpc->lpc_rss);
493 if (lpc->lpc_prpageheader != NULL) {
495 * OR the two snapshots.
497 #ifdef DEBUG
498 lmapping_t *old = NULL;
499 lmapping_t *new = NULL;
501 mklmapping(&new, pghp);
502 mklmapping(&old, lpc->lpc_prpageheader);
503 #endif /* DEBUG */
504 OR_pagedata(lpc->lpc_prpageheader, pghp, &mappings_changed);
505 #ifdef DEBUG
506 if (((mappings_changed != 0) ^
507 (lmapping_dump_diff(old, new) != 0))) {
508 debug("lmapping_changed inconsistent with lmapping\n");
509 debug("old\n");
510 lmapping_dump(old);
511 debug("new\n");
512 lmapping_dump(new);
513 debug("ignored\n");
514 lmapping_dump(lpc->lpc_ignore);
515 ASSERT(0);
517 lmapping_free(&new);
518 lmapping_free(&old);
519 #endif /* DEBUG */
520 free(lpc->lpc_prpageheader);
521 } else
522 mappings_changed = 1;
523 lpc->lpc_prpageheader = pghp;
525 cnt = count_pages(pghp, 0, PG_MODIFIED | PG_REFERENCED, 0);
526 if (cnt != 0 || lpc->lpc_rss != 0)
527 debug("process %d: %llu/%llukB rfd/mdfd since hand swept\n",
528 (int)lpc->lpc_pid, (unsigned long long)cnt,
529 (unsigned long long)lpc->lpc_rss);
530 if (mappings_changed != 0) {
531 debug("process %d: mappings changed\n", (int)lpc->lpc_pid);
532 if (mappings_changed_cb != NULL)
533 mappings_changed_cb(lpc);
535 return (0);
539 * Attempt to page out a region of the given process's address space. May
540 * return nonzero if not all of the pages may are pageable, for any reason.
542 static int
543 pageout(pid_t pid, struct ps_prochandle *Pr, caddr_t start, caddr_t end)
545 int res;
547 if (end <= start)
548 return (0);
550 errno = 0;
551 res = pr_memcntl(Pr, start, (end - start), MC_SYNC,
552 (caddr_t)(MS_ASYNC | MS_INVALIDATE), 0, 0);
553 debug_high("pr_memcntl [%p-%p): %d", (void *)start, (void *)end, res);
556 * EBUSY indicates none of the pages have backing store allocated, or
557 * some pages were locked, which are less interesting than other
558 * conditions, which are noted.
560 if (res != 0)
561 if (errno == EBUSY)
562 res = 0;
563 else
564 debug("%d: can't pageout %p+%llx (errno %d)", (int)pid,
565 (void *)start, (long long)(end - start), errno);
567 return (res);
571 * Compute the delta of the victim process's RSS since the last call. If the
572 * psinfo cannot be obtained, no work is done, and no error is returned; it is
573 * up to the caller to detect the process' termination via other means.
575 static int64_t
576 rss_delta(psinfo_t *new_psinfo, psinfo_t *old_psinfo, lprocess_t *vic)
578 int64_t d_rss = 0;
580 if (get_psinfo(vic->lpc_pid, new_psinfo, vic->lpc_psinfo_fd,
581 lprocess_update_psinfo_fd_cb, vic, vic) == 0) {
582 d_rss = (int64_t)new_psinfo->pr_rssize -
583 (int64_t)old_psinfo->pr_rssize;
584 if (d_rss < 0)
585 vic->lpc_collection->lcol_stat.lcols_pg_eff +=
586 (- d_rss);
587 *old_psinfo = *new_psinfo;
590 return (d_rss);
593 static void
594 unignore_mappings(lprocess_t *lpc)
596 lmapping_free(&lpc->lpc_ignore);
599 static void
600 unignore_referenced_mappings(lprocess_t *lpc)
602 prpageheader_cur_t cur;
603 void *vicaddr;
605 vicaddr = set_prpageheader_cur(&cur, lpc->lpc_prpageheader, NULL, -1);
606 while (vicaddr != NULL) {
607 if (((*(char *)cur.pr_pdaddr) & (PG_REFERENCED | PG_MODIFIED))
608 != 0) {
609 if (lmapping_remove(&lpc->lpc_ignore, cur.pr_addr,
610 cur.pr_npage * cur.pr_pagesize) == 0)
611 debug("removed mapping 0x%p+0t%llukB from"
612 " ignored set\n", (void *)cur.pr_addr,
613 (unsigned long long)(cur.pr_npage *
614 cur.pr_pagesize / 1024));
615 vicaddr = (void *)advance_prpageheader_cur_nextmapping(
616 &cur);
617 } else if ((vicaddr = advance_prpageheader_cur(&cur)) == NULL)
618 vicaddr = (void *)advance_prpageheader_cur_nextmapping(
619 &cur);
624 * Resume scanning, starting with the last victim, if it is still valid, or any
625 * other one, otherwise.
627 void
628 scan(lcollection_t *lcol, int64_t excess)
630 lprocess_t *vic, *lpc;
631 void *vicaddr, *endaddr, *nvicaddr;
632 prpageheader_cur_t cur;
633 psinfo_t old_psinfo, new_psinfo;
634 hrtime_t scan_start;
635 int res, resumed;
636 uint64_t col_unrm_size;
638 st_debug(STDL_NORMAL, lcol, "starting to scan, excess %lldk\n",
639 (long long)excess);
642 * Determine the address to start scanning at, depending on whether
643 * scanning can be resumed.
645 endaddr = NULL;
646 if ((vic = get_valid_victim(lcol, lcol->lcol_victim)) ==
647 lcol->lcol_victim && lcol->lcol_resaddr != NULL) {
648 vicaddr = lcol->lcol_resaddr;
649 st_debug(STDL_NORMAL, lcol, "resuming process %d\n",
650 (int)vic->lpc_pid);
651 resumed = 1;
652 } else {
653 vicaddr = NULL;
654 resumed = 0;
657 scan_start = gethrtime();
659 * Obtain the most current pagedata for the processes that might be
660 * scanned, and remove from the ignored set any mappings which have
661 * referenced or modified pages (in the hopes that the pageability of
662 * the mapping's pages may have changed). Determine if the
663 * unreferenced and unmodified portion is impossibly small to suffice
664 * to reduce the excess completely. If so, ignore these bits so that
665 * even working set will be paged out.
667 col_unrm_size = 0;
668 lpc = vic;
669 while (lpc != NULL && should_run) {
670 if (merge_current_pagedata(lpc, unignore_mappings) != 0) {
671 st_debug(STDL_NORMAL, lcol, "process %d:"
672 " exited/temporarily unscannable",
673 (int)lpc->lpc_pid);
674 goto next;
676 debug("process %d: %llu/%llukB scannable\n", (int)lpc->lpc_pid,
677 (unsigned long long)(lpc->lpc_unrm = unrm_size(lpc)),
678 (unsigned long long)lpc->lpc_size);
679 col_unrm_size += lpc->lpc_unrm = unrm_size(lpc);
681 if ((lcol->lcol_stat.lcols_scan_count %
682 RCAPD_IGNORED_SET_FLUSH_IVAL) == 0) {
684 * Periodically clear the set of ignored mappings.
685 * This will allow processes whose ignored segments'
686 * pageability have changed (without a corresponding
687 * reference or modification to a page) to be
688 * recognized.
690 if (lcol->lcol_stat.lcols_scan_count > 0)
691 unignore_mappings(lpc);
692 } else {
694 * Ensure mappings with referenced or modified pages
695 * are not in the ignored set. Their usage might mean
696 * the condition which made them unpageable is gone.
698 unignore_referenced_mappings(lpc);
700 next:
701 lpc = lpc->lpc_next != NULL ? get_valid_victim(lcol,
702 lpc->lpc_next) : NULL;
704 if (col_unrm_size < excess) {
705 lpc = vic;
706 debug("will not reduce excess with only unreferenced pages\n");
707 while (lpc != NULL && should_run) {
708 if (lpc->lpc_prpageheader != NULL) {
709 (void) count_pages(lpc->lpc_prpageheader,
710 CP_CLEAR, 0, 0);
711 if (lpc->lpc_pgdata_fd >= 0) {
712 if (rfd_close(lpc->lpc_pgdata_fd) != 0)
713 debug("coud not close %d"
714 " lpc_pgdata_fd %d",
715 (int)lpc->lpc_pid,
716 lpc->lpc_pgdata_fd);
717 lpc->lpc_pgdata_fd = -1;
720 lpc = lpc->lpc_next != NULL ? get_valid_victim(lcol,
721 lpc->lpc_next) : NULL;
726 * Examine each process for pages to remove until the excess is
727 * reduced.
729 while (vic != NULL && excess > 0 && should_run) {
731 * Skip processes whose death was reported when the merging of
732 * pagedata was attempted.
734 if (vic->lpc_prpageheader == NULL)
735 goto nextproc;
738 * Obtain optional segment residency information.
740 if (lpc_xmap_update(vic) != 0)
741 st_debug(STDL_NORMAL, lcol, "process %d: xmap"
742 " unreadable; ignoring", (int)vic->lpc_pid);
744 #ifdef DEBUG_MSG
746 void *ovicaddr = vicaddr;
747 #endif /* DEBUG_MSG */
748 vicaddr = set_prpageheader_cur_addr(&cur, vic->lpc_prpageheader,
749 vic->lpc_xmap, vic->lpc_nxmap, vicaddr);
750 #ifdef DEBUG_MSG
751 st_debug(STDL_NORMAL, lcol, "trying to resume from"
752 " 0x%p, next 0x%p\n", ovicaddr, vicaddr);
754 #endif /* DEBUG_MSG */
757 * Take control of the victim.
759 if (get_psinfo(vic->lpc_pid, &old_psinfo,
760 vic->lpc_psinfo_fd, lprocess_update_psinfo_fd_cb,
761 vic, vic) != 0) {
762 st_debug(STDL_NORMAL, lcol, "cannot get %d psinfo",
763 (int)vic->lpc_pid);
764 goto nextproc;
766 (void) rfd_reserve(PGRAB_FD_COUNT);
767 if ((scan_pr = Pgrab(vic->lpc_pid, 0, &res)) == NULL) {
768 st_debug(STDL_NORMAL, lcol, "cannot grab %d (%d)",
769 (int)vic->lpc_pid, res);
770 goto nextproc;
772 if (Pcreate_agent(scan_pr) != 0) {
773 st_debug(STDL_NORMAL, lcol, "cannot control %d",
774 (int)vic->lpc_pid);
775 goto nextproc;
778 * Be very pessimistic about the state of the agent LWP --
779 * verify it's actually stopped.
781 errno = 0;
782 while (Pstate(scan_pr) == PS_RUN)
783 (void) Pwait(scan_pr, 0);
784 if (Pstate(scan_pr) != PS_STOP) {
785 st_debug(STDL_NORMAL, lcol, "agent not in expected"
786 " state (%d)", Pstate(scan_pr));
787 goto nextproc;
791 * Within the victim's address space, find contiguous ranges of
792 * unreferenced pages to page out.
794 st_debug(STDL_NORMAL, lcol, "paging out process %d\n",
795 (int)vic->lpc_pid);
796 while (excess > 0 && vicaddr != NULL && should_run) {
798 * Skip mappings in the ignored set. Mappings get
799 * placed in the ignored set when all their resident
800 * pages are unreference and unmodified, yet unpageable
801 * -- such as when they are locked, or involved in
802 * asynchronous I/O. They will be scanned again when
803 * some page is referenced or modified.
805 if (lmapping_contains(vic->lpc_ignore, cur.pr_addr,
806 cur.pr_npage * cur.pr_pagesize)) {
807 debug("ignored mapping at 0x%p\n",
808 (void *)cur.pr_addr);
810 * Update statistics.
812 lcol->lcol_stat.lcols_pg_att +=
813 cur.pr_npage * cur.pr_pagesize / 1024;
815 vicaddr = (void *)
816 advance_prpageheader_cur_nextmapping(&cur);
817 continue;
821 * Determine a range of unreferenced pages to page out,
822 * and clear the R/M bits in the preceding referenced
823 * range.
825 st_debug(STDL_HIGH, lcol, "start from mapping at 0x%p,"
826 " npage %llu\n", vicaddr,
827 (unsigned long long)cur.pr_npage);
828 while (vicaddr != NULL &&
829 *(caddr_t)cur.pr_pdaddr != 0) {
830 *(caddr_t)cur.pr_pdaddr = 0;
831 vicaddr = advance_prpageheader_cur(&cur);
833 st_debug(STDL_HIGH, lcol, "advance, vicaddr %p, pdaddr"
834 " %p\n", vicaddr, cur.pr_pdaddr);
835 if (vicaddr == NULL) {
837 * The end of mapping was reached before any
838 * unreferenced pages were seen.
840 vicaddr = (void *)
841 advance_prpageheader_cur_nextmapping(&cur);
842 continue;
844 do {
845 endaddr = advance_prpageheader_cur(&cur);
846 } while (endaddr != NULL &&
847 *(caddr_t)cur.pr_pdaddr == 0 &&
848 (((intptr_t)endaddr - (intptr_t)vicaddr) / 1024)
849 < excess);
850 st_debug(STDL_HIGH, lcol, "endaddr %p, *cur %d\n",
851 endaddr, *(caddr_t)cur.pr_pdaddr);
854 * Page out from vicaddr to the end of the mapping, or
855 * endaddr if set, then continue scanning after
856 * endaddr, or the next mapping, if not set.
858 nvicaddr = endaddr;
859 if (endaddr == NULL)
860 endaddr = (caddr_t)cur.pr_addr +
861 cur.pr_pagesize * cur.pr_npage;
862 if (pageout(vic->lpc_pid, scan_pr, vicaddr, endaddr) ==
863 0) {
864 int64_t d_rss, att;
865 int willignore = 0;
867 excess += (d_rss = rss_delta(
868 &new_psinfo, &old_psinfo, vic));
871 * If this pageout attempt was unsuccessful
872 * (the resident portion was not affected), and
873 * was for the whole mapping, put it in the
874 * ignored set, so it will not be scanned again
875 * until some page is referenced or modified.
877 if (d_rss >= 0 && (void *)cur.pr_addr ==
878 vicaddr && (cur.pr_pagesize * cur.pr_npage)
879 == ((uintptr_t)endaddr -
880 (uintptr_t)vicaddr)) {
881 if (lmapping_insert(
882 &vic->lpc_ignore,
883 cur.pr_addr,
884 cur.pr_pagesize *
885 cur.pr_npage) != 0)
886 debug("not enough memory to add"
887 " mapping at %p to ignored"
888 " set\n",
889 (void *)cur.pr_addr);
890 willignore = 1;
894 * Update statistics.
896 lcol->lcol_stat.lcols_pg_att += (att =
897 ((intptr_t)endaddr - (intptr_t)vicaddr) /
898 1024);
899 st_debug(STDL_NORMAL, lcol, "paged out 0x%p"
900 "+0t(%llu/%llu)kB%s\n", vicaddr,
901 (unsigned long long)((d_rss <
902 0) ? - d_rss : 0), (unsigned long long)att,
903 willignore ? " (will ignore)" : "");
904 } else {
905 st_debug(STDL_NORMAL, lcol,
906 "process %d: exited/unscannable\n",
907 (int)vic->lpc_pid);
908 vic->lpc_unscannable = 1;
909 goto nextproc;
913 * Update the statistics file, if it's time.
915 check_update_statistics();
917 vicaddr = (nvicaddr != NULL) ? nvicaddr : (void
918 *)advance_prpageheader_cur_nextmapping(&cur);
920 excess += rss_delta(&new_psinfo, &old_psinfo, vic);
921 st_debug(STDL_NORMAL, lcol, "done, excess %lld\n",
922 (long long)excess);
923 nextproc:
925 * If a process was grabbed, release it, destroying its agent.
927 if (scan_pr != NULL) {
928 (void) Prelease(scan_pr, 0);
929 scan_pr = NULL;
931 lcol->lcol_victim = vic;
933 * Scan the collection at most once. Only if scanning was not
934 * aborted for any reason, and the end of lprocess has not been
935 * reached, determine the next victim and scan it.
937 if (vic != NULL) {
938 if (vic->lpc_next != NULL) {
940 * Determine the next process to be scanned.
942 if (excess > 0) {
943 vic = get_valid_victim(lcol,
944 vic->lpc_next);
945 vicaddr = 0;
947 } else {
949 * A complete scan of the collection was made,
950 * so tick the scan counter and stop scanning
951 * until the next request.
953 lcol->lcol_stat.lcols_scan_count++;
954 lcol->lcol_stat.lcols_scan_time_complete
955 = lcol->lcol_stat.lcols_scan_time;
957 * If an excess still exists, tick the
958 * "ineffective scan" counter, signalling that
959 * the cap may be uneforceable.
961 if (resumed == 0 && excess > 0)
962 lcol->lcol_stat
963 .lcols_scan_ineffective++;
965 * Scanning should start at the beginning of
966 * the process list at the next request.
968 if (excess > 0)
969 vic = NULL;
973 lcol->lcol_stat.lcols_scan_time += (gethrtime() - scan_start);
974 st_debug(STDL_HIGH, lcol, "done scanning; excess %lld\n",
975 (long long)excess);
977 lcol->lcol_resaddr = vicaddr;
978 if (lcol->lcol_resaddr == NULL && lcol->lcol_victim != NULL) {
979 lcol->lcol_victim = get_valid_victim(lcol,
980 lcol->lcol_victim->lpc_next);
985 * Abort the scan in progress, and destroy the agent LWP of any grabbed
986 * processes.
988 void
989 scan_abort(void)
991 if (scan_pr != NULL)
992 (void) Prelease(scan_pr, 0);
995 static void
996 revoke_xmap(rfd_t *rfd)
998 lprocess_t *lpc = rfd->rfd_data;
1000 debug("revoking xmap for process %d\n", (int)lpc->lpc_pid);
1001 ASSERT(lpc->lpc_xmap_fd != -1);
1002 lpc->lpc_xmap_fd = -1;
1006 * Retrieve the process's current xmap , which is used to determine the size of
1007 * the resident portion of its segments. Return zero if successful.
1009 static int
1010 lpc_xmap_update(lprocess_t *lpc)
1012 int res;
1013 struct stat st;
1015 free(lpc->lpc_xmap);
1016 lpc->lpc_xmap = NULL;
1017 lpc->lpc_nxmap = -1;
1019 if (lpc->lpc_xmap_fd == -1) {
1020 char pathbuf[PROC_PATH_MAX];
1022 (void) snprintf(pathbuf, sizeof (pathbuf), "/proc/%d/xmap",
1023 (int)lpc->lpc_pid);
1024 if ((lpc->lpc_xmap_fd = rfd_open(pathbuf, 1, RFD_XMAP,
1025 revoke_xmap, lpc, O_RDONLY, 0)) < 0)
1026 return (-1);
1029 redo:
1030 errno = 0;
1031 if (fstat(lpc->lpc_xmap_fd, &st) != 0) {
1032 debug("cannot stat xmap\n");
1033 (void) rfd_close(lpc->lpc_xmap_fd);
1034 lpc->lpc_xmap_fd = -1;
1035 return (-1);
1038 if ((st.st_size % sizeof (*lpc->lpc_xmap)) != 0) {
1039 debug("xmap wrong size\n");
1040 (void) rfd_close(lpc->lpc_xmap_fd);
1041 lpc->lpc_xmap_fd = -1;
1042 return (-1);
1045 lpc->lpc_xmap = malloc(st.st_size);
1046 if (lpc->lpc_xmap == NULL) {
1047 debug("cannot malloc() %ld bytes for xmap", st.st_size);
1048 (void) rfd_close(lpc->lpc_xmap_fd);
1049 lpc->lpc_xmap_fd = -1;
1050 return (-1);
1053 if ((res = pread(lpc->lpc_xmap_fd, lpc->lpc_xmap, st.st_size, 0)) !=
1054 st.st_size) {
1055 free(lpc->lpc_xmap);
1056 lpc->lpc_xmap = NULL;
1057 if (res > 0) {
1058 debug("xmap changed size, retrying\n");
1059 goto redo;
1060 } else {
1061 debug("cannot read xmap");
1062 return (-1);
1065 lpc->lpc_nxmap = st.st_size / sizeof (*lpc->lpc_xmap);
1067 return (0);