 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 */

/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"
#include <sys/param.h>
#include <sys/types.h>

#include "rcapd_rfd.h"
#include "rcapd_mapping.h"
static int lpc_xmap_update(lprocess_t *);
extern int lmapping_dump_diff(lmapping_t *lm1, lmapping_t *lm2);
/*
 * The number of file descriptors required to grab a process and create an
 * agent in it.
 */
#define	PGRAB_FD_COUNT		10
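/*
 * Illustrative sketch only (not part of the original source): the
 * reserve-then-grab pattern this constant supports, mirroring scan()
 * below.  rfd_reserve() is assumed to make PGRAB_FD_COUNT descriptors
 * available before libproc opens the victim's /proc files.
 */
#if 0
	(void) rfd_reserve(PGRAB_FD_COUNT);
	if ((scan_pr = Pgrab(pid, 0, &res)) == NULL)
		debug("cannot grab %d (%d)", (int)pid, res);
#endif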
/*
 * Record a position in an address space as it corresponds to a prpageheader_t
 * and affiliated structures.
 */
typedef struct prpageheader_cur {
	int pr_nmap;		/* number of mappings in address space */
	int pr_map;		/* number of this mapping */
	uint64_t pr_pgoff;	/* page offset into mapping */
	uint64_t pr_npage;	/* number of pages in mapping */
	uint64_t pr_pagesize;	/* page size of mapping */
	uintptr_t pr_addr;	/* base of mapping */
	prpageheader_t *pr_prpageheader;	/* associated page header */
	void *pr_pdaddr;	/* address of page's byte in pagedata */
	prxmap_t *pr_xmap;	/* array containing per-segment information */
	int pr_nxmap;		/* number of xmaps in array */
	int64_t pr_rss;		/* number of resident pages in mapping, */
				/* or -1 if xmap is out of sync */
	int64_t pr_pg_rss;	/* number of pageable pages in mapping, or -1 */
} prpageheader_cur_t;
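/*
 * Illustrative sketch only (not part of the original source): walking an
 * address space's mappings with a prpageheader_cur_t, mirroring
 * mklmapping() below.  pghp is assumed to hold a pagedata snapshot from
 * get_pagedata().
 */
#if 0
	prpageheader_cur_t cur;
	void *addr;

	addr = set_prpageheader_cur(&cur, pghp, NULL, -1);
	while (addr != NULL) {
		/* cur.pr_addr, cur.pr_npage, and cur.pr_pagesize are valid */
		addr = (void *)advance_prpageheader_cur_nextmapping(&cur);
	}
#endif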
static struct ps_prochandle *scan_pr;	/* currently-scanned process's handle */
/*
 * Output a scanning-related debug message.
 */
/*PRINTFLIKE3*/ /*ARGSUSED*/
static void
st_debug(st_debug_level_t level, lcollection_t *lcol, char *msg, ...)
{
#ifdef DEBUG_MSG
	va_list alist;
	char *buf;
	size_t len;

	if (get_message_priority() < ((level == STDL_HIGH) ? RCM_DEBUG_HIGH
	    : RCM_DEBUG))
		return;

	len = strlen(msg) + LINELEN;
	buf = malloc(len);
	if (buf == NULL)
		return;
	(void) snprintf(buf, len, "%s %s scanner %s",
	    (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? "project" : "zone"),
	    lcol->lcol_name, msg);

	va_start(alist, msg);
	vdprintfe(RCM_DEBUG, buf, alist);
	va_end(alist);

	free(buf);
#endif /* DEBUG_MSG */
}
/*
 * Determine the collection's current victim, based on its last.  The last
 * will be returned, or, if invalid, any other valid process, if the
 * collection has any.
 */
static lprocess_t *
get_valid_victim(lcollection_t *lcol, lprocess_t *lpc)
{
	if (lpc == NULL || !lcollection_member(lcol, lpc))
		lpc = lcol->lcol_lprocess;

	/*
	 * Find the next scannable process, and make it the victim.
	 */
	while (lpc != NULL && lpc->lpc_unscannable != 0)
		lpc = lpc->lpc_next;

	return (lpc);
}
/*
 * Get a process's combined current pagedata (per-page referenced and modified
 * bits) and set the supplied pointer to it.  The caller is responsible for
 * freeing the data.  If the pagedata is unreadable, a nonzero value is
 * returned, and errno is set.  Otherwise, 0 is returned.
 */
static int
get_pagedata(prpageheader_t **pghpp, int fd)
{
	int res;
	struct stat st;

redo:
	if (fstat(fd, &st) != 0) {
		debug("cannot stat pagedata\n");
		return (-1);
	}

	*pghpp = malloc(st.st_size);
	if (*pghpp == NULL) {
		debug("cannot malloc() %ld bytes for pagedata", st.st_size);
		return (-1);
	}
	(void) bzero(*pghpp, st.st_size);

	if ((res = read(fd, *pghpp, st.st_size)) != st.st_size) {
		free(*pghpp);
		*pghpp = NULL;
		if (res > 0 || errno == E2BIG) {
			debug("pagedata changed size, retrying\n");
			goto redo;
		}
		debug("cannot read pagedata");
		return (-1);
	}

	return (0);
}
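/*
 * Illustrative sketch only (not part of the original source): obtaining a
 * process's pagedata snapshot.  fd is assumed to be open on
 * /proc/<pid>/pagedata, as merge_current_pagedata() below arranges via
 * rfd_open().
 */
#if 0
	prpageheader_t *pghp;

	if (get_pagedata(&pghp, fd) == 0) {
		/* ... examine the per-page referenced/modified bytes ... */
		free(pghp);
	}
#endif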
/*
 * Return the count of kilobytes of pages represented by the given pagedata
 * which meet the given criteria, having pages which are in all of the states
 * specified by the mask, and in none of the states in the notmask.  If the
 * CP_CLEAR flag is set, the pagedata will also be cleared.
 */
static int64_t
count_pages(prpageheader_t *pghp, int flags, int mask, int notmask)
{
	int64_t count = 0;
	int map;
	caddr_t cur, end;
	prpageheader_t pgh = *pghp;
	prasmap_t *asmapp;

	cur = (caddr_t)pghp + sizeof (*pghp);
	for (map = 0; map < pgh.pr_nmap; map++) {
		asmapp = (prasmap_t *)(uintptr_t)cur;
		cur += sizeof (*asmapp);
		end = cur + asmapp->pr_npage;
		while (cur < end) {
			if ((*cur & mask) == mask && (*cur & notmask) == 0)
				count += asmapp->pr_pagesize / 1024;
			if ((flags & CP_CLEAR) != 0)
				*cur = 0;
			cur++;
		}

		/*
		 * Skip to next 64-bit-aligned address to get the next
		 * prasmap_t.
		 */
		cur = (caddr_t)((intptr_t)(cur + 7) & ~7);
	}

	return (count);
}
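/*
 * Illustrative sketch only (not part of the original source): counting the
 * kilobytes of pages which are both referenced and modified, without
 * clearing the R/M bits -- the same invocation merge_current_pagedata()
 * below uses.
 */
#if 0
	int64_t cnt = count_pages(pghp, 0, PG_MODIFIED | PG_REFERENCED, 0);
#endif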
/*
 * Return the amount of memory (in kilobytes) that hasn't been referenced or
 * modified, which is the memory that will be paged out first.  Should be
 * written to exclude nonresident pages when sufficient interfaces exist.
 */
static int64_t
unrm_size(lprocess_t *lpc)
{
	return (count_pages(lpc->lpc_prpageheader, CP_CLEAR,
	    0, PG_MODIFIED | PG_REFERENCED));
}
/*
 * Advance a prpageheader_cur_t to the address space's next mapping, returning
 * its address, or NULL if there is none.  Any known nonpageable or nonresident
 * mappings will be skipped over.
 */
static uintptr_t
advance_prpageheader_cur_nextmapping(prpageheader_cur_t *pcp)
{
	prasmap_t *pap;
	int i;

next:
	ASSERT(pcp->pr_map < pcp->pr_nmap);
	if ((pcp->pr_map + 1) == pcp->pr_nmap)
		return (NULL);
	pcp->pr_map++;
	if (pcp->pr_pgoff < pcp->pr_npage) {
		pcp->pr_pdaddr = (caddr_t)(uintptr_t)
		    ((uintptr_t)pcp->pr_pdaddr +
		    (pcp->pr_npage - pcp->pr_pgoff));
		pcp->pr_pgoff = pcp->pr_npage;
	}
	/*
	 * Skip to next 64-bit-aligned address to get the next prasmap_t.
	 */
	pcp->pr_pdaddr = (caddr_t)(((uintptr_t)pcp->pr_pdaddr + 7) & ~7);
	pap = (prasmap_t *)pcp->pr_pdaddr;
	pcp->pr_pgoff = 0;
	pcp->pr_npage = pap->pr_npage;
	pcp->pr_pagesize = pap->pr_pagesize;
	pcp->pr_addr = pap->pr_vaddr;
	pcp->pr_pdaddr = pap + 1;

	/*
	 * Skip any known nonpageable mappings.  Currently, the only one
	 * detected is the schedctl page.
	 */
	if ((pap->pr_mflags ^ (MA_SHARED | MA_READ | MA_WRITE | MA_EXEC |
	    MA_ANON)) == 0 && pap->pr_npage == 1) {
		debug("identified nonpageable schedctl mapping at %p\n",
		    (void *)pcp->pr_addr);
		goto next;
	}

	/*
	 * Skip mappings with no resident pages.  If the xmap does not
	 * correspond to the pagedata for any reason, it will be ignored.
	 */
	pcp->pr_rss = -1;
	pcp->pr_pg_rss = -1;
	for (i = 0; i < pcp->pr_nxmap; i++) {
		prxmap_t *xmap = &pcp->pr_xmap[i];

		if (pcp->pr_addr == xmap->pr_vaddr && xmap->pr_size ==
		    (pcp->pr_npage * pcp->pr_pagesize)) {
			pcp->pr_rss = xmap->pr_rss;
			/*
			 * Remove COW pages from the pageable RSS count.
			 */
			if ((xmap->pr_mflags & MA_SHARED) == 0)
				pcp->pr_pg_rss = xmap->pr_anon;
			break;
		}
	}
	if (pcp->pr_rss == 0) {
		debug("identified nonresident mapping at 0x%p\n",
		    (void *)pcp->pr_addr);
		goto next;
	} else if (pcp->pr_pg_rss == 0) {
		debug("identified unpageable mapping at 0x%p\n",
		    (void *)pcp->pr_addr);
		goto next;
	}

	return (pcp->pr_addr);
}
/*
 * Advance a prpageheader_cur_t to the mapping's next page, returning its
 * address, or NULL if there is none.
 */
static void *
advance_prpageheader_cur(prpageheader_cur_t *pcp)
{
	ASSERT(pcp->pr_pgoff < pcp->pr_npage);
	if ((pcp->pr_pgoff + 1) == pcp->pr_npage)
		return (NULL);
	pcp->pr_pdaddr = (caddr_t)pcp->pr_pdaddr + 1;
	pcp->pr_pgoff++;

	ASSERT((*(char *)pcp->pr_pdaddr & ~(PG_MODIFIED | PG_REFERENCED)) == 0);
	return ((caddr_t)pcp->pr_addr + pcp->pr_pgoff * pcp->pr_pagesize);
}
/*
 * Initialize a prpageheader_cur_t, positioned at the first page of the mapping
 * of an address space.
 */
static void *
set_prpageheader_cur(prpageheader_cur_t *pcp, prpageheader_t *php,
    prxmap_t *xmap, int nxmap)
{
	bzero(pcp, sizeof (*pcp));
	pcp->pr_nmap = php->pr_nmap;
	pcp->pr_map = -1;
	pcp->pr_prpageheader = php;
	pcp->pr_xmap = xmap;
	pcp->pr_nxmap = nxmap;
	pcp->pr_pdaddr = (prpageheader_t *)php + 1;

	return ((void *)advance_prpageheader_cur_nextmapping(pcp));
}
/*
 * Position a prpageheader_cur_t to the mapped address greater or equal to the
 * given value.
 */
static void *
set_prpageheader_cur_addr(prpageheader_cur_t *pcp, prpageheader_t *php,
    prxmap_t *xmap, int nxmap, void *naddr)
{
	void *addr = set_prpageheader_cur(pcp, php, xmap, nxmap);

	while (addr != NULL && addr <= naddr)
		if (naddr < (void *)((caddr_t)pcp->pr_addr +
		    pcp->pr_pagesize * pcp->pr_npage)) {
			uint64_t pgdiff = ((uintptr_t)naddr -
			    (uintptr_t)pcp->pr_addr) / pcp->pr_pagesize;
			pcp->pr_pgoff += pgdiff;
			pcp->pr_pdaddr = (caddr_t)pcp->pr_pdaddr + pgdiff;
			addr = (caddr_t)pcp->pr_addr + pcp->pr_pagesize *
			    pcp->pr_pgoff;
			break;
		} else
			addr = (void *)
			    advance_prpageheader_cur_nextmapping(pcp);

	return (addr);
}
/*
 * Revoke a process's pagedata file descriptor when the rfd subsystem
 * reclaims it.
 */
static void
revoke_pagedata(rfd_t *rfd)
{
	lprocess_t *lpc = rfd->rfd_data;

	st_debug(STDL_NORMAL, lpc->lpc_collection, "revoking pagedata for"
	    " process %d\n", (int)lpc->lpc_pid);
	ASSERT(lpc->lpc_pgdata_fd != -1);
	lpc->lpc_pgdata_fd = -1;
}
/*
 * Build an lmapping_t list describing each mapping in the given pagedata.
 */
static void
mklmapping(lmapping_t **lm, prpageheader_t *pgh)
{
	prpageheader_cur_t cur;
	void *addr;

	addr = set_prpageheader_cur(&cur, pgh, NULL, -1);
	while (addr != NULL) {
		(void) lmapping_insert(lm, cur.pr_addr, cur.pr_npage *
		    cur.pr_pagesize);
		addr = (void *)advance_prpageheader_cur_nextmapping(&cur);
	}
}
/*
 * Dump a mapping list for debugging.
 */
static void
lmapping_dump(lmapping_t *lm)
{
	debug("lm: %p\n", (void *)lm);
	while (lm != NULL) {
		debug("\t(%p, %llx)\n", (void *)lm->lm_addr,
		    (unsigned long long)lm->lm_size);
		lm = lm->lm_next;
	}
}
/*
 * OR two prpagedata_t which are supposedly snapshots of the same address
 * space.  Intersecting mappings with different page sizes are tolerated but
 * not normalized (not accurate).  If the mappings of the two snapshots differ
 * in any regard, the supplied mappings_changed flag will be set.
 */
static void
OR_pagedata(prpageheader_t *src, prpageheader_t *dst, int *mappings_changedp)
{
	prpageheader_cur_t src_cur;
	prpageheader_cur_t dst_cur;
	uintptr_t src_addr;
	uintptr_t dst_addr;
	int mappings_changed = 0;

	/*
	 * OR source pagedata with the destination, for pages of intersecting
	 * mappings.
	 */
	src_addr = (uintptr_t)set_prpageheader_cur(&src_cur, src, NULL, -1);
	dst_addr = (uintptr_t)set_prpageheader_cur(&dst_cur, dst, NULL, -1);
	while (src_addr != NULL && dst_addr != NULL) {
		while (src_addr == dst_addr && src_addr != NULL) {
			*(char *)dst_cur.pr_pdaddr |=
			    *(char *)src_cur.pr_pdaddr;
			src_addr = (uintptr_t)advance_prpageheader_cur(
			    &src_cur);
			dst_addr = (uintptr_t)advance_prpageheader_cur(
			    &dst_cur);
		}
		if (src_addr != dst_addr)
			mappings_changed = 1;
		src_addr = advance_prpageheader_cur_nextmapping(&src_cur);
		dst_addr = advance_prpageheader_cur_nextmapping(&dst_cur);
		while (src_addr != dst_addr && src_addr != NULL && dst_addr !=
		    NULL) {
			mappings_changed = 1;
			if (src_addr < dst_addr)
				src_addr = advance_prpageheader_cur_nextmapping(
				    &src_cur);
			else
				dst_addr = advance_prpageheader_cur_nextmapping(
				    &dst_cur);
		}
	}

	*mappings_changedp = mappings_changed;
}
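/*
 * Illustrative sketch only (not part of the original source): merging a
 * previous snapshot into a fresh one, so a page referenced or modified in
 * either shows as such.  old_pghp and new_pghp are hypothetical names.
 */
#if 0
	int changed;

	OR_pagedata(old_pghp, new_pghp, &changed);
	if (changed)
		debug("mappings changed between snapshots\n");
#endif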
/*
 * Merge the current pagedata with that on hand.  If the pagedata is
 * unretrievable for any reason, such as the process having exited or being a
 * zombie, a nonzero value is returned, the process should be marked
 * unscannable, and future attempts to scan it should be avoided, since the
 * symptom is probably permanent.  If the mappings of either pagedata
 * differ in any respect, the supplied callback will be invoked once.
 */
static int
merge_current_pagedata(lprocess_t *lpc,
    void(*mappings_changed_cb) (lprocess_t *))
{
	prpageheader_t *pghp;
	int mappings_changed = 0;
	int64_t cnt;

	if (lpc->lpc_pgdata_fd < 0 || get_pagedata(&pghp, lpc->lpc_pgdata_fd) !=
	    0) {
		char pathbuf[PROC_PATH_MAX];

		(void) snprintf(pathbuf, sizeof (pathbuf), "/proc/%d/pagedata",
		    (int)lpc->lpc_pid);
		if ((lpc->lpc_pgdata_fd = rfd_open(pathbuf, 1, RFD_PAGEDATA,
		    revoke_pagedata, lpc, O_RDONLY, 0)) < 0 ||
		    get_pagedata(&pghp, lpc->lpc_pgdata_fd) != 0)
			return (-1);
		debug("starting/resuming pagedata collection for %d\n",
		    (int)lpc->lpc_pid);
	}

	cnt = count_pages(pghp, 0, PG_MODIFIED | PG_REFERENCED, 0);
	if (cnt != 0 || lpc->lpc_rss != 0)
		debug("process %d: %llu/%llukB rfd/mdfd since last read\n",
		    (int)lpc->lpc_pid, (unsigned long long)cnt,
		    (unsigned long long)lpc->lpc_rss);
	if (lpc->lpc_prpageheader != NULL) {
		/*
		 * OR the two snapshots.
		 */
		lmapping_t *old = NULL;
		lmapping_t *new = NULL;

		mklmapping(&new, pghp);
		mklmapping(&old, lpc->lpc_prpageheader);

		OR_pagedata(lpc->lpc_prpageheader, pghp, &mappings_changed);

		if (((mappings_changed != 0) ^
		    (lmapping_dump_diff(old, new) != 0))) {
			debug("lmapping_changed inconsistent with lmapping\n");
			lmapping_dump(lpc->lpc_ignore);
		}

		lmapping_free(&old);
		lmapping_free(&new);
		free(lpc->lpc_prpageheader);
	} else
		mappings_changed = 1;
	lpc->lpc_prpageheader = pghp;

	cnt = count_pages(pghp, 0, PG_MODIFIED | PG_REFERENCED, 0);
	if (cnt != 0 || lpc->lpc_rss != 0)
		debug("process %d: %llu/%llukB rfd/mdfd since hand swept\n",
		    (int)lpc->lpc_pid, (unsigned long long)cnt,
		    (unsigned long long)lpc->lpc_rss);
	if (mappings_changed != 0) {
		debug("process %d: mappings changed\n", (int)lpc->lpc_pid);
		if (mappings_changed_cb != NULL)
			mappings_changed_cb(lpc);
	}
	return (0);
}
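/*
 * Illustrative sketch only (not part of the original source): the
 * per-process pattern scan() below follows -- merge the latest pagedata,
 * then size the unreferenced/unmodified portion.
 */
#if 0
	if (merge_current_pagedata(lpc, unignore_mappings) == 0)
		lpc->lpc_unrm = unrm_size(lpc);
#endif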
/*
 * Attempt to page out a region of the given process's address space.  May
 * return nonzero if not all of the pages are pageable, for any reason.
 */
static int
pageout(pid_t pid, struct ps_prochandle *Pr, caddr_t start, caddr_t end)
{
	int res;

	if (end <= start)
		return (0);

	errno = 0;
	res = pr_memcntl(Pr, start, (end - start), MC_SYNC,
	    (caddr_t)(MS_ASYNC | MS_INVALIDATE), 0, 0);
	debug_high("pr_memcntl [%p-%p): %d", (void *)start, (void *)end, res);

	/*
	 * EBUSY indicates none of the pages have backing store allocated, or
	 * some pages were locked, which are less interesting than other
	 * conditions, which are noted.
	 */
	if (res != 0)
		if (errno == EBUSY)
			res = 0;
		else
			debug("%d: can't pageout %p+%llx (errno %d)", (int)pid,
			    (void *)start, (long long)(end - start), errno);

	return (res);
}
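/*
 * Illustrative sketch only (not part of the original source): paging out
 * one whole mapping tracked by a cursor, as scan() below does for ranges
 * of unreferenced pages.
 */
#if 0
	caddr_t start = (caddr_t)cur.pr_addr;
	caddr_t end = start + cur.pr_pagesize * cur.pr_npage;

	if (pageout(pid, scan_pr, start, end) == 0)
		debug("paged out %p+%llx\n", (void *)start,
		    (long long)(end - start));
#endif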
/*
 * Compute the delta of the victim process's RSS since the last call.  If the
 * psinfo cannot be obtained, no work is done, and no error is returned; it is
 * up to the caller to detect the process's termination via other means.
 */
static int64_t
rss_delta(psinfo_t *new_psinfo, psinfo_t *old_psinfo, lprocess_t *vic)
{
	int64_t d_rss = 0;

	if (get_psinfo(vic->lpc_pid, new_psinfo, vic->lpc_psinfo_fd,
	    lprocess_update_psinfo_fd_cb, vic, vic) == 0) {
		d_rss = (int64_t)new_psinfo->pr_rssize -
		    (int64_t)old_psinfo->pr_rssize;
		if (d_rss < 0)
			vic->lpc_collection->lcol_stat.lcols_pg_eff +=
			    (- d_rss);
		*old_psinfo = *new_psinfo;
	}

	return (d_rss);
}
/*
 * Forget any mappings previously placed in the process's ignored set.
 */
static void
unignore_mappings(lprocess_t *lpc)
{
	lmapping_free(&lpc->lpc_ignore);
}
/*
 * Remove from the ignored set any mappings with referenced or modified
 * pages, so that they will be scanned again.
 */
static void
unignore_referenced_mappings(lprocess_t *lpc)
{
	prpageheader_cur_t cur;
	void *vicaddr;

	vicaddr = set_prpageheader_cur(&cur, lpc->lpc_prpageheader, NULL, -1);
	while (vicaddr != NULL) {
		if (((*(char *)cur.pr_pdaddr) & (PG_REFERENCED | PG_MODIFIED))
		    != 0) {
			if (lmapping_remove(&lpc->lpc_ignore, cur.pr_addr,
			    cur.pr_npage * cur.pr_pagesize) == 0)
				debug("removed mapping 0x%p+0t%llukB from"
				    " ignored set\n", (void *)cur.pr_addr,
				    (unsigned long long)(cur.pr_npage *
				    cur.pr_pagesize / 1024));
			vicaddr = (void *)advance_prpageheader_cur_nextmapping(
			    &cur);
		} else if ((vicaddr = advance_prpageheader_cur(&cur)) == NULL)
			vicaddr = (void *)advance_prpageheader_cur_nextmapping(
			    &cur);
	}
}
/*
 * Resume scanning, starting with the last victim, if it is still valid, or
 * any other one, otherwise.
 */
void
scan(lcollection_t *lcol, int64_t excess)
{
	lprocess_t *vic, *lpc;
	void *vicaddr, *endaddr, *nvicaddr;
	prpageheader_cur_t cur;
	psinfo_t old_psinfo, new_psinfo;
	hrtime_t scan_start;
	int res, resumed;
	uint64_t col_unrm_size;
	st_debug(STDL_NORMAL, lcol, "starting to scan, excess %lldk\n",
	    (long long)excess);

	/*
	 * Determine the address to start scanning at, depending on whether
	 * scanning can be resumed.
	 */
	if ((vic = get_valid_victim(lcol, lcol->lcol_victim)) ==
	    lcol->lcol_victim && lcol->lcol_resaddr != NULL) {
		vicaddr = lcol->lcol_resaddr;
		st_debug(STDL_NORMAL, lcol, "resuming process %d\n",
		    (int)vic->lpc_pid);
		resumed = 1;
	} else {
		vicaddr = NULL;
		resumed = 0;
	}
	scan_start = gethrtime();
	/*
	 * Obtain the most current pagedata for the processes that might be
	 * scanned, and remove from the ignored set any mappings which have
	 * referenced or modified pages (in the hope that the pageability of
	 * the mapping's pages may have changed).  Determine whether the
	 * unreferenced and unmodified portion is too small to reduce the
	 * excess completely.  If so, ignore these bits so that even the
	 * working set will be paged out.
	 */
	col_unrm_size = 0;
	lpc = vic;
	while (lpc != NULL && should_run) {
		if (merge_current_pagedata(lpc, unignore_mappings) != 0) {
			st_debug(STDL_NORMAL, lcol, "process %d:"
			    " exited/temporarily unscannable",
			    (int)lpc->lpc_pid);
			goto next;
		}
		lpc->lpc_unrm = unrm_size(lpc);
		col_unrm_size += lpc->lpc_unrm;
		debug("process %d: %llu/%llukB scannable\n", (int)lpc->lpc_pid,
		    (unsigned long long)lpc->lpc_unrm,
		    (unsigned long long)lpc->lpc_size);

		if ((lcol->lcol_stat.lcols_scan_count %
		    RCAPD_IGNORED_SET_FLUSH_IVAL) == 0) {
			/*
			 * Periodically clear the set of ignored mappings.
			 * This allows processes whose ignored segments'
			 * pageability has changed (without a corresponding
			 * reference or modification to a page) to be
			 * scanned again.
			 */
			if (lcol->lcol_stat.lcols_scan_count > 0)
				unignore_mappings(lpc);
		} else {
			/*
			 * Ensure mappings with referenced or modified pages
			 * are not in the ignored set.  Their usage might mean
			 * the condition which made them unpageable is gone.
			 */
			unignore_referenced_mappings(lpc);
		}
next:
		lpc = lpc->lpc_next != NULL ? get_valid_victim(lcol,
		    lpc->lpc_next) : NULL;
	}
	if (col_unrm_size < excess) {
		lpc = vic;
		debug("will not reduce excess with only unreferenced pages\n");
		while (lpc != NULL && should_run) {
			if (lpc->lpc_prpageheader != NULL) {
				(void) count_pages(lpc->lpc_prpageheader,
				    CP_CLEAR, 0, 0);
				if (lpc->lpc_pgdata_fd >= 0) {
					if (rfd_close(lpc->lpc_pgdata_fd) != 0)
						debug("could not close %d"
						    " lpc_pgdata_fd %d",
						    (int)lpc->lpc_pid,
						    lpc->lpc_pgdata_fd);
					lpc->lpc_pgdata_fd = -1;
				}
			}
			lpc = lpc->lpc_next != NULL ? get_valid_victim(lcol,
			    lpc->lpc_next) : NULL;
		}
	}
	/*
	 * Examine each process for pages to remove until the excess is
	 * reduced.
	 */
	while (vic != NULL && excess > 0 && should_run) {
		/*
		 * Skip processes whose death was reported when the merging of
		 * pagedata was attempted.
		 */
		if (vic->lpc_prpageheader == NULL)
			goto nextproc;

		/*
		 * Obtain optional segment residency information.
		 */
		if (lpc_xmap_update(vic) != 0)
			st_debug(STDL_NORMAL, lcol, "process %d: xmap"
			    " unreadable; ignoring", (int)vic->lpc_pid);

#ifdef DEBUG_MSG
		void *ovicaddr = vicaddr;
#endif /* DEBUG_MSG */
		vicaddr = set_prpageheader_cur_addr(&cur, vic->lpc_prpageheader,
		    vic->lpc_xmap, vic->lpc_nxmap, vicaddr);
#ifdef DEBUG_MSG
		st_debug(STDL_NORMAL, lcol, "trying to resume from"
		    " 0x%p, next 0x%p\n", ovicaddr, vicaddr);
#endif /* DEBUG_MSG */

		/*
		 * Take control of the victim.
		 */
		if (get_psinfo(vic->lpc_pid, &old_psinfo,
		    vic->lpc_psinfo_fd, lprocess_update_psinfo_fd_cb,
		    vic, vic) != 0) {
			st_debug(STDL_NORMAL, lcol, "cannot get %d psinfo",
			    (int)vic->lpc_pid);
			goto nextproc;
		}
		(void) rfd_reserve(PGRAB_FD_COUNT);
		if ((scan_pr = Pgrab(vic->lpc_pid, 0, &res)) == NULL) {
			st_debug(STDL_NORMAL, lcol, "cannot grab %d (%d)",
			    (int)vic->lpc_pid, res);
			goto nextproc;
		}
		if (Pcreate_agent(scan_pr) != 0) {
			st_debug(STDL_NORMAL, lcol, "cannot control %d",
			    (int)vic->lpc_pid);
			goto nextproc;
		}
		/*
		 * Be very pessimistic about the state of the agent LWP --
		 * verify it's actually stopped.
		 */
		errno = 0;
		while (Pstate(scan_pr) == PS_RUN)
			(void) Pwait(scan_pr, 0);
		if (Pstate(scan_pr) != PS_STOP) {
			st_debug(STDL_NORMAL, lcol, "agent not in expected"
			    " state (%d)", Pstate(scan_pr));
			goto nextproc;
		}
		/*
		 * Within the victim's address space, find contiguous ranges of
		 * unreferenced pages to page out.
		 */
		st_debug(STDL_NORMAL, lcol, "paging out process %d\n",
		    (int)vic->lpc_pid);
		while (excess > 0 && vicaddr != NULL && should_run) {
			/*
			 * Skip mappings in the ignored set.  Mappings get
			 * placed in the ignored set when all their resident
			 * pages are unreferenced and unmodified, yet unpageable
			 * -- such as when they are locked, or involved in
			 * asynchronous I/O.  They will be scanned again when
			 * some page is referenced or modified.
			 */
			if (lmapping_contains(vic->lpc_ignore, cur.pr_addr,
			    cur.pr_npage * cur.pr_pagesize)) {
				debug("ignored mapping at 0x%p\n",
				    (void *)cur.pr_addr);
				lcol->lcol_stat.lcols_pg_att +=
				    cur.pr_npage * cur.pr_pagesize / 1024;
				vicaddr = (void *)
				    advance_prpageheader_cur_nextmapping(&cur);
				continue;
			}
			/*
			 * Determine a range of unreferenced pages to page out,
			 * and clear the R/M bits in the preceding referenced
			 * range.
			 */
			st_debug(STDL_HIGH, lcol, "start from mapping at 0x%p,"
			    " npage %llu\n", vicaddr,
			    (unsigned long long)cur.pr_npage);
			while (vicaddr != NULL &&
			    *(caddr_t)cur.pr_pdaddr != 0) {
				*(caddr_t)cur.pr_pdaddr = 0;
				vicaddr = advance_prpageheader_cur(&cur);
			}
			st_debug(STDL_HIGH, lcol, "advance, vicaddr %p, pdaddr"
			    " %p\n", vicaddr, cur.pr_pdaddr);
			if (vicaddr == NULL) {
				/*
				 * The end of mapping was reached before any
				 * unreferenced pages were seen.
				 */
				vicaddr = (void *)
				    advance_prpageheader_cur_nextmapping(&cur);
				continue;
			}
			do
				endaddr = advance_prpageheader_cur(&cur);
			while (endaddr != NULL &&
			    *(caddr_t)cur.pr_pdaddr == 0 &&
			    (((intptr_t)endaddr - (intptr_t)vicaddr) /
			    1024) < excess);
			st_debug(STDL_HIGH, lcol, "endaddr %p, *cur %d\n",
			    endaddr, *(caddr_t)cur.pr_pdaddr);
			/*
			 * Page out from vicaddr to the end of the mapping, or
			 * endaddr if set, then continue scanning after
			 * endaddr, or the next mapping, if not set.
			 */
			nvicaddr = endaddr;
			if (endaddr == NULL)
				endaddr = (caddr_t)cur.pr_addr +
				    cur.pr_pagesize * cur.pr_npage;
			if (pageout(vic->lpc_pid, scan_pr, vicaddr, endaddr) ==
			    0) {
				int64_t d_rss;
				int64_t att;
				int willignore = 0;

				excess += (d_rss = rss_delta(
				    &new_psinfo, &old_psinfo, vic));

				/*
				 * If this pageout attempt was unsuccessful
				 * (the resident portion was not affected), and
				 * was for the whole mapping, put it in the
				 * ignored set, so it will not be scanned again
				 * until some page is referenced or modified.
				 */
				if (d_rss >= 0 && (void *)cur.pr_addr ==
				    vicaddr && (cur.pr_pagesize * cur.pr_npage)
				    == ((uintptr_t)endaddr -
				    (uintptr_t)vicaddr)) {
					if (lmapping_insert(&vic->lpc_ignore,
					    cur.pr_addr, cur.pr_pagesize *
					    cur.pr_npage) != 0)
						debug("not enough memory to add"
						    " mapping at %p to ignored"
						    " set\n",
						    (void *)cur.pr_addr);
					willignore = 1;
				}

				lcol->lcol_stat.lcols_pg_att += (att =
				    ((intptr_t)endaddr - (intptr_t)vicaddr) /
				    1024);
				st_debug(STDL_NORMAL, lcol, "paged out 0x%p"
				    "+0t(%llu/%llu)kB%s\n", vicaddr,
				    (unsigned long long)((d_rss <
				    0) ? - d_rss : 0), (unsigned long long)att,
				    willignore ? " (will ignore)" : "");
			} else {
				st_debug(STDL_NORMAL, lcol,
				    "process %d: exited/unscannable\n",
				    (int)vic->lpc_pid);
				vic->lpc_unscannable = 1;
				goto nextproc;
			}

			/*
			 * Update the statistics file, if it's time.
			 */
			check_update_statistics();

			vicaddr = (nvicaddr != NULL) ? nvicaddr : (void
			    *)advance_prpageheader_cur_nextmapping(&cur);
		}

		excess += rss_delta(&new_psinfo, &old_psinfo, vic);
		st_debug(STDL_NORMAL, lcol, "done, excess %lld\n",
		    (long long)excess);
nextproc:
		/*
		 * If a process was grabbed, release it, destroying its agent.
		 */
		if (scan_pr != NULL) {
			(void) Prelease(scan_pr, 0);
			scan_pr = NULL;
		}
		lcol->lcol_victim = vic;
		/*
		 * Scan the collection at most once.  Only if scanning was not
		 * aborted for any reason, and the end of lprocess has not been
		 * reached, determine the next victim and scan it.
		 */
		if (vic->lpc_next != NULL) {
			/*
			 * Determine the next process to be scanned.
			 */
			vic = get_valid_victim(lcol, vic->lpc_next);
			vicaddr = NULL;
		} else {
			/*
			 * A complete scan of the collection was made,
			 * so tick the scan counter and stop scanning
			 * until the next request.
			 */
			lcol->lcol_stat.lcols_scan_count++;
			lcol->lcol_stat.lcols_scan_time_complete
			    = lcol->lcol_stat.lcols_scan_time;
			/*
			 * If an excess still exists, tick the
			 * "ineffective scan" counter, signalling that
			 * the cap may be unenforceable.
			 */
			if (resumed == 0 && excess > 0)
				lcol->lcol_stat
				    .lcols_scan_ineffective++;
			/*
			 * Scanning should start at the beginning of
			 * the process list at the next request.
			 */
			vic = NULL;
			vicaddr = NULL;
		}
	}
	lcol->lcol_stat.lcols_scan_time += (gethrtime() - scan_start);
	st_debug(STDL_HIGH, lcol, "done scanning; excess %lld\n",
	    (long long)excess);

	lcol->lcol_resaddr = vicaddr;
	if (lcol->lcol_resaddr == NULL && lcol->lcol_victim != NULL) {
		lcol->lcol_victim = get_valid_victim(lcol,
		    lcol->lcol_victim->lpc_next);
	}
}
/*
 * Abort the scan in progress, and destroy the agent LWP of any grabbed
 * process.
 */
void
scan_abort(void)
{
	if (scan_pr != NULL)
		(void) Prelease(scan_pr, NULL);
}
/*
 * Revoke a process's xmap file descriptor when the rfd subsystem reclaims
 * it.
 */
static void
revoke_xmap(rfd_t *rfd)
{
	lprocess_t *lpc = rfd->rfd_data;

	debug("revoking xmap for process %d\n", (int)lpc->lpc_pid);
	ASSERT(lpc->lpc_xmap_fd != -1);
	lpc->lpc_xmap_fd = -1;
}
/*
 * Retrieve the process's current xmap, which is used to determine the size of
 * the resident portion of its segments.  Return zero if successful.
 */
static int
lpc_xmap_update(lprocess_t *lpc)
{
	int res;
	struct stat st;

	free(lpc->lpc_xmap);
	lpc->lpc_xmap = NULL;
	lpc->lpc_nxmap = -1;

	if (lpc->lpc_xmap_fd == -1) {
		char pathbuf[PROC_PATH_MAX];

		(void) snprintf(pathbuf, sizeof (pathbuf), "/proc/%d/xmap",
		    (int)lpc->lpc_pid);
		if ((lpc->lpc_xmap_fd = rfd_open(pathbuf, 1, RFD_XMAP,
		    revoke_xmap, lpc, O_RDONLY, 0)) < 0)
			return (-1);
	}

redo:
	if (fstat(lpc->lpc_xmap_fd, &st) != 0) {
		debug("cannot stat xmap\n");
		(void) rfd_close(lpc->lpc_xmap_fd);
		lpc->lpc_xmap_fd = -1;
		return (-1);
	}

	if ((st.st_size % sizeof (*lpc->lpc_xmap)) != 0) {
		debug("xmap wrong size\n");
		(void) rfd_close(lpc->lpc_xmap_fd);
		lpc->lpc_xmap_fd = -1;
		return (-1);
	}

	lpc->lpc_xmap = malloc(st.st_size);
	if (lpc->lpc_xmap == NULL) {
		debug("cannot malloc() %ld bytes for xmap", st.st_size);
		(void) rfd_close(lpc->lpc_xmap_fd);
		lpc->lpc_xmap_fd = -1;
		return (-1);
	}

	if ((res = pread(lpc->lpc_xmap_fd, lpc->lpc_xmap, st.st_size, 0)) !=
	    st.st_size) {
		free(lpc->lpc_xmap);
		lpc->lpc_xmap = NULL;
		if (res > 0) {
			debug("xmap changed size, retrying\n");
			goto redo;
		}
		debug("cannot read xmap");
		return (-1);
	}

	lpc->lpc_nxmap = st.st_size / sizeof (*lpc->lpc_xmap);

	return (0);
}
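/*
 * Illustrative sketch only (not part of the original source): after a
 * successful update, a segment's residency can be read from its xmap
 * entry, as advance_prpageheader_cur_nextmapping() above does.  i is a
 * hypothetical index, 0 <= i < lpc->lpc_nxmap.
 */
#if 0
	prxmap_t *xmap = &lpc->lpc_xmap[i];
	int64_t rss_pages = xmap->pr_rss;	/* resident pages */
	int64_t pg_rss = (xmap->pr_mflags & MA_SHARED) == 0 ?
	    xmap->pr_anon : -1;			/* pageable pages, sans COW */
#endif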