/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
#include <sys/param.h>
#include <sys/types.h>

#include "rcapd_rfd.h"
#include "rcapd_mapping.h"
static int lpc_xmap_update(lprocess_t *);
extern int lmapping_dump_diff(lmapping_t *lm1, lmapping_t *lm2);
/*
 * The number of file descriptors required to grab a process and create an
 * agent in it.
 */
#define	PGRAB_FD_COUNT		10
/*
 * Record a position in an address space as it corresponds to a prpageheader_t
 * and affiliated structures.
 */
typedef struct prpageheader_cur {
	int pr_nmap;		/* number of mappings in address space */
	int pr_map;		/* number of this mapping */
	uint64_t pr_pgoff;	/* page offset into mapping */
	uint64_t pr_npage;	/* number of pages in mapping */
	uint64_t pr_pagesize;	/* page size of mapping */
	uintptr_t pr_addr;	/* base of mapping */
	prpageheader_t *pr_prpageheader;	/* associated page header */
	void *pr_pdaddr;	/* address of page's byte in pagedata */
	prxmap_t *pr_xmap;	/* array containing per-segment information */
	int pr_nxmap;		/* number of xmaps in array */
	int64_t pr_rss;		/* number of resident pages in mapping, */
				/* or -1 if xmap is out of sync */
	int64_t pr_pg_rss;	/* number of pageable pages in mapping, or -1 */
} prpageheader_cur_t;
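/*
 * Layout note: a /proc pagedata snapshot (see proc(4)) is a prpageheader_t
 * followed, for each of its pr_nmap mappings, by a prasmap_t and then one
 * byte per page carrying the PG_REFERENCED and PG_MODIFIED flags; each
 * per-page run is padded to a 64-bit boundary. The cursor above records a
 * position within such a snapshot.
 */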
static struct ps_prochandle *scan_pr;	/* currently-scanned process's handle */
/*
 * Output a scanning-related debug message.
 */
/*PRINTFLIKE3*/ /*ARGSUSED*/
static void
st_debug(st_debug_level_t level, lcollection_t *lcol, char *msg, ...)
{
#ifdef DEBUG_MSG
	va_list alist;
	char *buf;
	size_t len;

	if (get_message_priority() < ((level == STDL_HIGH) ? RCM_DEBUG_HIGH :
	    RCM_DEBUG))
		return;

	len = strlen(msg) + LINELEN;
	buf = malloc(len);
	if (buf == NULL)
		return;
	(void) snprintf(buf, len, "%s %s scanner %s",
	    (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? "project" : "zone"),
	    lcol->lcol_name, msg);

	va_start(alist, msg);
	vdprintfe(RCM_DEBUG, buf, alist);
	va_end(alist);

	free(buf);
#endif /* DEBUG_MSG */
}
/*
 * Determine the collection's current victim, based on its last. The last will
 * be returned, or, if invalid, any other valid process, if the collection has
 * any.
 */
static lprocess_t *
get_valid_victim(lcollection_t *lcol, lprocess_t *lpc)
{
	if (lpc == NULL || !lcollection_member(lcol, lpc))
		lpc = lcol->lcol_lprocess;

	/*
	 * Find the next scannable process, and make it the victim.
	 */
	while (lpc != NULL && lpc->lpc_unscannable != 0)
		lpc = lpc->lpc_next;

	return (lpc);
}
/*
 * Get a process's combined current pagedata (per-page referenced and modified
 * bits) and set the supplied pointer to it. The caller is responsible for
 * freeing the data. If the pagedata is unreadable, a nonzero value is
 * returned, and errno is set. Otherwise, 0 is returned.
 */
static int
get_pagedata(prpageheader_t **pghpp, int fd)
{
	int res;
	struct stat st;

redo:
	if (fstat(fd, &st) != 0) {
		debug("cannot stat pagedata\n");
		return (-1);
	}

	*pghpp = malloc(st.st_size);
	if (*pghpp == NULL) {
		debug("cannot malloc() %ld bytes for pagedata", st.st_size);
		return (-1);
	}
	(void) bzero(*pghpp, st.st_size);
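	/*
	 * The pagedata file can legitimately change size between the fstat()
	 * above and the read() below -- presumably because the process mapped
	 * or unmapped segments in the interim -- so a partial or over-long
	 * read is retried from the top rather than treated as fatal.
	 */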
	if ((res = read(fd, *pghpp, st.st_size)) != st.st_size) {
		free(*pghpp);
		*pghpp = NULL;
		if (res > 0 || errno == E2BIG) {
			debug("pagedata changed size, retrying\n");
			goto redo;
		} else {
			debug("cannot read pagedata");
			return (-1);
		}
	}

	return (0);
}
/*
 * Return the count of kilobytes of pages represented by the given pagedata
 * which meet the given criteria, having pages which are in all of the states
 * specified by the mask, and in none of the states in the notmask. If the
 * CP_CLEAR flag is set, the pagedata will also be cleared.
 */
static int64_t
count_pages(prpageheader_t *pghp, int flags, int mask, int notmask)
{
	int64_t count = 0;
	caddr_t cur, end;
	prpageheader_t pgh = *pghp;
	prasmap_t *asmapp;
	int map;

	cur = (caddr_t)pghp + sizeof (*pghp);
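	/*
	 * Each mapping is described by a prasmap_t followed by one flag byte
	 * per page; a page is counted only when its byte has every bit of
	 * mask set and no bit of notmask. unrm_size() below, for example,
	 * passes mask == 0 and notmask == PG_MODIFIED | PG_REFERENCED to
	 * count only untouched pages.
	 */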
	for (map = 0; map < pgh.pr_nmap; map++) {
		asmapp = (prasmap_t *)(uintptr_t)cur;
		cur += sizeof (*asmapp);
		end = cur + asmapp->pr_npage;
		while (cur < end) {
			if ((*cur & mask) == mask && (*cur & notmask) == 0)
				count += asmapp->pr_pagesize / 1024;
			if ((flags & CP_CLEAR) != 0)
				*cur = 0;
			cur++;
		}

		/*
		 * Skip to next 64-bit-aligned address to get the next
		 * prasmap_t.
		 */
		cur = (caddr_t)((intptr_t)(cur + 7) & ~7);
	}

	return (count);
}
/*
 * Return the amount of memory (in kilobytes) that hasn't been referenced or
 * modified, which is the memory that will be paged out first. Should be
 * rewritten to exclude nonresident pages when sufficient interfaces exist.
 */
static int64_t
unrm_size(lprocess_t *lpc)
{
	return (count_pages(lpc->lpc_prpageheader, CP_CLEAR,
	    0, PG_MODIFIED | PG_REFERENCED));
}
/*
 * Advance a prpageheader_cur_t to the address space's next mapping, returning
 * its address, or NULL if there is none. Any known nonpageable or nonresident
 * mappings will be skipped over.
 */
static uintptr_t
advance_prpageheader_cur_nextmapping(prpageheader_cur_t *pcp)
{
	prasmap_t *pap;
	int i;

next:
	ASSERT(pcp->pr_map < pcp->pr_nmap);
	if ((pcp->pr_map + 1) == pcp->pr_nmap)
		return ((uintptr_t)NULL);
	pcp->pr_map++;
	if (pcp->pr_pgoff < pcp->pr_npage) {
		pcp->pr_pdaddr = (caddr_t)(uintptr_t)
		    ((uintptr_t)pcp->pr_pdaddr +
		    (pcp->pr_npage - pcp->pr_pgoff));
		pcp->pr_pgoff = pcp->pr_npage;
	}
	/*
	 * Skip to next 64-bit-aligned address to get the next prasmap_t.
	 */
	pcp->pr_pdaddr = (caddr_t)(((uintptr_t)pcp->pr_pdaddr + 7) & ~7);
	pap = (prasmap_t *)pcp->pr_pdaddr;
	pcp->pr_pgoff = 0;
	pcp->pr_npage = pap->pr_npage;
	pcp->pr_pagesize = pap->pr_pagesize;
	pcp->pr_addr = pap->pr_vaddr;
	pcp->pr_pdaddr = pap + 1;
	/*
	 * Skip any known nonpageable mappings. Currently, the only one
	 * detected is the schedctl page.
	 */
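	/*
	 * The XOR below is an exact-equality test on the mapping flags: it
	 * succeeds only when the mapping is precisely shared, readable,
	 * writable, executable, and anonymous, and one page long, which is
	 * the signature of the schedctl page.
	 */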
	if ((pap->pr_mflags ^ (MA_SHARED | MA_READ | MA_WRITE | MA_EXEC |
	    MA_ANON)) == 0 && pap->pr_npage == 1) {
		debug("identified nonpageable schedctl mapping at %p\n",
		    (void *)pcp->pr_addr);
		goto next;
	}
	/*
	 * Skip mappings with no resident pages. If the xmap does not
	 * correspond to the pagedata for any reason, it will be ignored.
	 */
	pcp->pr_rss = -1;
	pcp->pr_pg_rss = -1;
	for (i = 0; i < pcp->pr_nxmap; i++) {
		prxmap_t *xmap = &pcp->pr_xmap[i];

		if (pcp->pr_addr == xmap->pr_vaddr && xmap->pr_size ==
		    (pcp->pr_npage * pcp->pr_pagesize)) {
			pcp->pr_rss = xmap->pr_rss;
			/*
			 * Remove COW pages from the pageable RSS count.
			 */
			if ((xmap->pr_mflags & MA_SHARED) == 0)
				pcp->pr_pg_rss = xmap->pr_anon;
			break;
		}
	}
	if (pcp->pr_rss == 0) {
		debug("identified nonresident mapping at 0x%p\n",
		    (void *)pcp->pr_addr);
		goto next;
	} else if (pcp->pr_pg_rss == 0) {
		debug("identified unpageable mapping at 0x%p\n",
		    (void *)pcp->pr_addr);
		goto next;
	}

	return (pcp->pr_addr);
}
/*
 * Advance a prpageheader_cur_t to the mapping's next page, returning its
 * address, or NULL if there is none.
 */
static void *
advance_prpageheader_cur(prpageheader_cur_t *pcp)
{
	ASSERT(pcp->pr_pgoff < pcp->pr_npage);
	if ((pcp->pr_pgoff + 1) == pcp->pr_npage)
		return (NULL);
	pcp->pr_pdaddr = (caddr_t)pcp->pr_pdaddr + 1;
	pcp->pr_pgoff++;

	ASSERT((*(char *)pcp->pr_pdaddr & ~(PG_MODIFIED | PG_REFERENCED)) == 0);
	return ((caddr_t)pcp->pr_addr + pcp->pr_pgoff * pcp->pr_pagesize);
}
/*
 * Initialize a prpageheader_cur_t, positioned at the first page of the mapping
 * of an address space.
 */
static void *
set_prpageheader_cur(prpageheader_cur_t *pcp, prpageheader_t *php,
    prxmap_t *xmap, int nxmap)
{
	bzero(pcp, sizeof (*pcp));
	pcp->pr_nmap = php->pr_nmap;
	pcp->pr_map = -1;
	pcp->pr_prpageheader = php;
	pcp->pr_xmap = xmap;
	pcp->pr_nxmap = nxmap;
	pcp->pr_pdaddr = (prpageheader_t *)php + 1;

	return ((void *)advance_prpageheader_cur_nextmapping(pcp));
}
/*
 * Position a prpageheader_cur_t to the mapped address greater or equal to the
 * given value.
 */
static void *
set_prpageheader_cur_addr(prpageheader_cur_t *pcp, prpageheader_t *php,
    prxmap_t *xmap, int nxmap, void *naddr)
{
	void *addr = set_prpageheader_cur(pcp, php, xmap, nxmap);

	while (addr != NULL && addr <= naddr)
		if (naddr < (void *)((caddr_t)pcp->pr_addr +
		    pcp->pr_pagesize * pcp->pr_npage)) {
			uint64_t pgdiff = ((uintptr_t)naddr -
			    (uintptr_t)pcp->pr_addr) / pcp->pr_pagesize;
			pcp->pr_pgoff += pgdiff;
			pcp->pr_pdaddr = (caddr_t)pcp->pr_pdaddr + pgdiff;
			addr = (caddr_t)pcp->pr_addr + pcp->pr_pagesize *
			    pcp->pr_pgoff;
			break;
		} else
			addr =
			    (void *)advance_prpageheader_cur_nextmapping(pcp);

	return (addr);
}
static void
revoke_pagedata(rfd_t *rfd)
{
	lprocess_t *lpc = rfd->rfd_data;

	st_debug(STDL_NORMAL, lpc->lpc_collection, "revoking pagedata for"
	    " process %d\n", (int)lpc->lpc_pid);
	ASSERT(lpc->lpc_pgdata_fd != -1);
	lpc->lpc_pgdata_fd = -1;
}
static void
mklmapping(lmapping_t **lm, prpageheader_t *pgh)
{
	prpageheader_cur_t cur;
	void *addr;

	addr = set_prpageheader_cur(&cur, pgh, NULL, -1);
	while (addr != NULL) {
		(void) lmapping_insert(lm, cur.pr_addr, cur.pr_npage *
		    cur.pr_pagesize);
		addr = (void *)advance_prpageheader_cur_nextmapping(&cur);
	}
}
static void
lmapping_dump(lmapping_t *lm)
{
	debug("lm: %p\n", (void *)lm);
	while (lm != NULL) {
		debug("\t(%p, %llx\n", (void *)lm->lm_addr,
		    (unsigned long long)lm->lm_size);
		lm = lm->lm_next;
	}
}
/*
 * OR two prpagedata_t which are supposedly snapshots of the same address
 * space. Intersecting mappings with different page sizes are tolerated but
 * not normalized (not accurate). If the mappings of the two snapshots differ
 * in any regard, the supplied mappings_changed flag will be set.
 */
static void
OR_pagedata(prpageheader_t *src, prpageheader_t *dst, int *mappings_changedp)
{
	prpageheader_cur_t src_cur;
	prpageheader_cur_t dst_cur;
	uintptr_t src_addr;
	uintptr_t dst_addr;
	int mappings_changed = 0;

	/*
	 * OR source pagedata with the destination, for pages of intersecting
	 * mappings.
	 */
	src_addr = (uintptr_t)set_prpageheader_cur(&src_cur, src, NULL, -1);
	dst_addr = (uintptr_t)set_prpageheader_cur(&dst_cur, dst, NULL, -1);
	while (src_addr != (uintptr_t)NULL && dst_addr != (uintptr_t)NULL) {
		while (src_addr == dst_addr && src_addr != (uintptr_t)NULL) {
			*(char *)dst_cur.pr_pdaddr |=
			    *(char *)src_cur.pr_pdaddr;
			src_addr = (uintptr_t)advance_prpageheader_cur(
			    &src_cur);
			dst_addr = (uintptr_t)advance_prpageheader_cur(
			    &dst_cur);
		}
		if (src_addr != dst_addr)
			mappings_changed = 1;
		src_addr = advance_prpageheader_cur_nextmapping(&src_cur);
		dst_addr = advance_prpageheader_cur_nextmapping(&dst_cur);
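		/*
		 * The snapshots' mapping lists have diverged; advance
		 * whichever cursor trails until the lists coincide again,
		 * recording that the address space changed between the two
		 * snapshots.
		 */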
		while (src_addr != dst_addr && src_addr != (uintptr_t)NULL &&
		    dst_addr != (uintptr_t)NULL) {
			mappings_changed = 1;
			if (src_addr < dst_addr)
				src_addr = advance_prpageheader_cur_nextmapping(
				    &src_cur);
			else
				dst_addr = advance_prpageheader_cur_nextmapping(
				    &dst_cur);
		}
	}

	*mappings_changedp = mappings_changed;
}
/*
 * Merge the current pagedata with that on hand. If the pagedata is
 * unretrievable for any reason, such as the process having exited or being a
 * zombie, a nonzero value is returned, the process should be marked
 * unscannable, and future attempts to scan it should be avoided, since the
 * symptom is probably permanent. If the mappings of either pagedata
 * differ in any respect, the supplied callback will be invoked once.
 */
static int
merge_current_pagedata(lprocess_t *lpc,
    void(*mappings_changed_cb) (lprocess_t *))
{
	prpageheader_t *pghp;
	int mappings_changed = 0;
	uint64_t cnt;

	if (lpc->lpc_pgdata_fd < 0 || get_pagedata(&pghp, lpc->lpc_pgdata_fd) !=
	    0) {
		char pathbuf[PROC_PATH_MAX];

		(void) snprintf(pathbuf, sizeof (pathbuf), "/proc/%d/pagedata",
		    (int)lpc->lpc_pid);
		if ((lpc->lpc_pgdata_fd = rfd_open(pathbuf, 1, RFD_PAGEDATA,
		    revoke_pagedata, lpc, O_RDONLY, 0)) < 0 ||
		    get_pagedata(&pghp, lpc->lpc_pgdata_fd) != 0)
			return (-1);
		debug("starting/resuming pagedata collection for %d\n",
		    (int)lpc->lpc_pid);
	}

	cnt = count_pages(pghp, 0, PG_MODIFIED | PG_REFERENCED, 0);
	if (cnt != 0 || lpc->lpc_rss != 0)
		debug("process %d: %llu/%llukB rfd/mdfd since last read\n",
		    (int)lpc->lpc_pid, (unsigned long long)cnt,
		    (unsigned long long)lpc->lpc_rss);
	if (lpc->lpc_prpageheader != NULL) {
		/*
		 * OR the two snapshots.
		 */
		lmapping_t *old = NULL;
		lmapping_t *new = NULL;

		mklmapping(&new, pghp);
		mklmapping(&old, lpc->lpc_prpageheader);

		OR_pagedata(lpc->lpc_prpageheader, pghp, &mappings_changed);

		if (((mappings_changed != 0) ^
		    (lmapping_dump_diff(old, new) != 0))) {
			debug("lmapping_changed inconsistent with lmapping\n");
			lmapping_dump(old);
			lmapping_dump(new);
			lmapping_dump(lpc->lpc_ignore);
			ASSERT(0);
		}

		lmapping_free(&new);
		lmapping_free(&old);

		free(lpc->lpc_prpageheader);
	} else {
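		/*
		 * There is no previous snapshot to compare against, so
		 * report the mappings as changed; the callback (typically
		 * unignore_mappings()) then starts this process with a clean
		 * ignored set.
		 */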
		mappings_changed = 1;
	}
	lpc->lpc_prpageheader = pghp;

	cnt = count_pages(pghp, 0, PG_MODIFIED | PG_REFERENCED, 0);
	if (cnt != 0 || lpc->lpc_rss != 0)
		debug("process %d: %llu/%llukB rfd/mdfd since hand swept\n",
		    (int)lpc->lpc_pid, (unsigned long long)cnt,
		    (unsigned long long)lpc->lpc_rss);
	if (mappings_changed != 0) {
		debug("process %d: mappings changed\n", (int)lpc->lpc_pid);
		if (mappings_changed_cb != NULL)
			mappings_changed_cb(lpc);
	}

	return (0);
}
/*
 * Attempt to page out a region of the given process's address space. May
 * return nonzero if not all of the pages are pageable, for any reason.
 */
static int
pageout(pid_t pid, struct ps_prochandle *Pr, caddr_t start, caddr_t end)
{
	int res;

	if (end <= start)
		return (0);
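	/*
	 * Page the range out through the victim's agent LWP: MC_SYNC with
	 * MS_INVALIDATE invalidates resident copies whose contents are
	 * synchronized with their backing store, which is what actually
	 * shrinks the RSS; MS_ASYNC avoids blocking on writeback.
	 */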
	res = pr_memcntl(Pr, start, (end - start), MC_SYNC,
	    (caddr_t)(MS_ASYNC | MS_INVALIDATE), 0, 0);
	debug_high("pr_memcntl [%p-%p): %d", (void *)start, (void *)end, res);

	/*
	 * EBUSY indicates none of the pages have backing store allocated, or
	 * some pages were locked, which are less interesting than other
	 * conditions, which are noted.
	 */
	if (res != 0) {
		if (errno == EBUSY)
			res = 0;
		else
			debug("%d: can't pageout %p+%llx (errno %d)", (int)pid,
			    (void *)start, (long long)(end - start), errno);
	}

	return (res);
}
/*
 * Compute the delta of the victim process's RSS since the last call. If the
 * psinfo cannot be obtained, no work is done, and no error is returned; it is
 * up to the caller to detect the process' termination via other means.
 */
static int64_t
rss_delta(psinfo_t *new_psinfo, psinfo_t *old_psinfo, lprocess_t *vic)
{
	int64_t d_rss = 0;
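	/*
	 * psinfo_t.pr_rssize is reported in kilobytes, so the delta computed
	 * below can be added to the collection's effectively-paged-out
	 * counter without unit conversion.
	 */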
	if (get_psinfo(vic->lpc_pid, new_psinfo, vic->lpc_psinfo_fd,
	    lprocess_update_psinfo_fd_cb, vic, vic) == 0) {
		d_rss = (int64_t)new_psinfo->pr_rssize -
		    (int64_t)old_psinfo->pr_rssize;
		if (d_rss < 0)
			vic->lpc_collection->lcol_stat.lcols_pg_eff +=
			    (- d_rss);
		*old_psinfo = *new_psinfo;
	}

	return (d_rss);
}
static void
unignore_mappings(lprocess_t *lpc)
{
	lmapping_free(&lpc->lpc_ignore);
}
static void
unignore_referenced_mappings(lprocess_t *lpc)
{
	prpageheader_cur_t cur;
	void *vicaddr;

	vicaddr = set_prpageheader_cur(&cur, lpc->lpc_prpageheader, NULL, -1);
	while (vicaddr != NULL) {
		if (((*(char *)cur.pr_pdaddr) & (PG_REFERENCED | PG_MODIFIED))
		    != 0) {
			if (lmapping_remove(&lpc->lpc_ignore, cur.pr_addr,
			    cur.pr_npage * cur.pr_pagesize) == 0)
				debug("removed mapping 0x%p+0t%llukB from"
				    " ignored set\n", (void *)cur.pr_addr,
				    (unsigned long long)(cur.pr_npage *
				    cur.pr_pagesize / 1024));
			vicaddr = (void *)advance_prpageheader_cur_nextmapping(
			    &cur);
		} else if ((vicaddr = advance_prpageheader_cur(&cur)) == NULL)
			vicaddr = (void *)advance_prpageheader_cur_nextmapping(
			    &cur);
	}
}
/*
 * Resume scanning, starting with the last victim, if it is still valid, or any
 * other one, otherwise.
 */
void
scan(lcollection_t *lcol, int64_t excess)
{
	lprocess_t *vic, *lpc;
	void *vicaddr, *endaddr, *nvicaddr;
	prpageheader_cur_t cur;
	psinfo_t old_psinfo, new_psinfo;
	hrtime_t scan_start;
	int res, resumed;
	uint64_t col_unrm_size;

	st_debug(STDL_NORMAL, lcol, "starting to scan, excess %lldk\n",
	    (long long)excess);
	/*
	 * Determine the address to start scanning at, depending on whether
	 * scanning can be resumed.
	 */
	if ((vic = get_valid_victim(lcol, lcol->lcol_victim)) ==
	    lcol->lcol_victim && lcol->lcol_resaddr != NULL) {
		vicaddr = lcol->lcol_resaddr;
		st_debug(STDL_NORMAL, lcol, "resuming process %d\n",
		    (int)vic->lpc_pid);
		resumed = 1;
	} else {
		vicaddr = NULL;
		resumed = 0;
	}

	scan_start = gethrtime();
	/*
	 * Obtain the most current pagedata for the processes that might be
	 * scanned, and remove from the ignored set any mappings which have
	 * referenced or modified pages (in the hopes that the pageability of
	 * the mapping's pages may have changed). Determine if the
	 * unreferenced and unmodified portion is too small to reduce the
	 * excess completely. If so, ignore these bits so that even working
	 * set will be paged out.
	 */
	col_unrm_size = 0;
	lpc = vic;
	while (lpc != NULL && should_run) {
		if (merge_current_pagedata(lpc, unignore_mappings) != 0) {
			st_debug(STDL_NORMAL, lcol, "process %d:"
			    " exited/temporarily unscannable",
			    (int)lpc->lpc_pid);
			goto next;
		}
		debug("process %d: %llu/%llukB scannable\n", (int)lpc->lpc_pid,
		    (unsigned long long)(lpc->lpc_unrm = unrm_size(lpc)),
		    (unsigned long long)lpc->lpc_size);
		col_unrm_size += lpc->lpc_unrm = unrm_size(lpc);

		if ((lcol->lcol_stat.lcols_scan_count %
		    RCAPD_IGNORED_SET_FLUSH_IVAL) == 0) {
			/*
			 * Periodically clear the set of ignored mappings.
			 * This will allow processes whose ignored segments'
			 * pageability have changed (without a corresponding
			 * reference or modification to a page) to be
			 * recognized.
			 */
			if (lcol->lcol_stat.lcols_scan_count > 0)
				unignore_mappings(lpc);
		} else {
			/*
			 * Ensure mappings with referenced or modified pages
			 * are not in the ignored set. Their usage might mean
			 * the condition which made them unpageable is gone.
			 */
			unignore_referenced_mappings(lpc);
		}
next:
		lpc = lpc->lpc_next != NULL ? get_valid_victim(lcol,
		    lpc->lpc_next) : NULL;
	}
	if (col_unrm_size < excess) {
		lpc = vic;
		debug("will not reduce excess with only unreferenced pages\n");
		while (lpc != NULL && should_run) {
			if (lpc->lpc_prpageheader != NULL) {
				(void) count_pages(lpc->lpc_prpageheader,
				    CP_CLEAR, 0, 0);
				if (lpc->lpc_pgdata_fd >= 0) {
					if (rfd_close(lpc->lpc_pgdata_fd) != 0)
						debug("could not close %d"
						    " lpc_pgdata_fd %d",
						    (int)lpc->lpc_pid,
						    lpc->lpc_pgdata_fd);
					lpc->lpc_pgdata_fd = -1;
				}
			}
			lpc = lpc->lpc_next != NULL ? get_valid_victim(lcol,
			    lpc->lpc_next) : NULL;
		}
	}
	/*
	 * Examine each process for pages to remove until the excess is
	 * reduced.
	 */
	while (vic != NULL && excess > 0 && should_run) {
		/*
		 * Skip processes whose death was reported when the merging of
		 * pagedata was attempted.
		 */
		if (vic->lpc_prpageheader == NULL)
			goto nextproc;

		/*
		 * Obtain optional segment residency information.
		 */
		if (lpc_xmap_update(vic) != 0)
			st_debug(STDL_NORMAL, lcol, "process %d: xmap"
			    " unreadable; ignoring", (int)vic->lpc_pid);
#ifdef DEBUG_MSG
		void *ovicaddr = vicaddr;
#endif /* DEBUG_MSG */
		vicaddr = set_prpageheader_cur_addr(&cur, vic->lpc_prpageheader,
		    vic->lpc_xmap, vic->lpc_nxmap, vicaddr);
#ifdef DEBUG_MSG
		st_debug(STDL_NORMAL, lcol, "trying to resume from"
		    " 0x%p, next 0x%p\n", ovicaddr, vicaddr);
#endif /* DEBUG_MSG */
		/*
		 * Take control of the victim.
		 */
		if (get_psinfo(vic->lpc_pid, &old_psinfo,
		    vic->lpc_psinfo_fd, lprocess_update_psinfo_fd_cb,
		    vic, vic) != 0) {
			st_debug(STDL_NORMAL, lcol, "cannot get %d psinfo",
			    (int)vic->lpc_pid);
			goto nextproc;
		}
		(void) rfd_reserve(PGRAB_FD_COUNT);
		if ((scan_pr = Pgrab(vic->lpc_pid, 0, &res)) == NULL) {
			st_debug(STDL_NORMAL, lcol, "cannot grab %d (%d)",
			    (int)vic->lpc_pid, res);
			goto nextproc;
		}
		if (Pcreate_agent(scan_pr) != 0) {
			st_debug(STDL_NORMAL, lcol, "cannot control %d",
			    (int)vic->lpc_pid);
			goto nextproc;
		}
		/*
		 * Be very pessimistic about the state of the agent LWP --
		 * verify it's actually stopped.
		 */
		while (Pstate(scan_pr) == PS_RUN)
			(void) Pwait(scan_pr, 0);
		if (Pstate(scan_pr) != PS_STOP) {
			st_debug(STDL_NORMAL, lcol, "agent not in expected"
			    " state (%d)", Pstate(scan_pr));
			goto nextproc;
		}
		/*
		 * Within the victim's address space, find contiguous ranges of
		 * unreferenced pages to page out.
		 */
		st_debug(STDL_NORMAL, lcol, "paging out process %d\n",
		    (int)vic->lpc_pid);
		while (excess > 0 && vicaddr != NULL && should_run) {
			/*
			 * Skip mappings in the ignored set. Mappings get
			 * placed in the ignored set when all their resident
			 * pages are unreferenced and unmodified, yet unpageable
			 * -- such as when they are locked, or involved in
			 * asynchronous I/O. They will be scanned again when
			 * some page is referenced or modified.
			 */
			if (lmapping_contains(vic->lpc_ignore, cur.pr_addr,
			    cur.pr_npage * cur.pr_pagesize)) {
				debug("ignored mapping at 0x%p\n",
				    (void *)cur.pr_addr);
				/*
				 * Update statistics.
				 */
				lcol->lcol_stat.lcols_pg_att +=
				    cur.pr_npage * cur.pr_pagesize / 1024;

				vicaddr = (void *)
				    advance_prpageheader_cur_nextmapping(&cur);
				continue;
			}
			/*
			 * Determine a range of unreferenced pages to page out,
			 * and clear the R/M bits in the preceding referenced
			 * range.
			 */
			st_debug(STDL_HIGH, lcol, "start from mapping at 0x%p,"
			    " npage %llu\n", vicaddr,
			    (unsigned long long)cur.pr_npage);
			while (vicaddr != NULL &&
			    *(caddr_t)cur.pr_pdaddr != 0) {
				*(caddr_t)cur.pr_pdaddr = 0;
				vicaddr = advance_prpageheader_cur(&cur);
			}
			st_debug(STDL_HIGH, lcol, "advance, vicaddr %p, pdaddr"
			    " %p\n", vicaddr, cur.pr_pdaddr);
			if (vicaddr == NULL) {
				/*
				 * The end of mapping was reached before any
				 * unreferenced pages were seen.
				 */
				vicaddr = (void *)
				    advance_prpageheader_cur_nextmapping(&cur);
				continue;
			}
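			/*
			 * The range below is extended one page at a time and
			 * is capped by the remaining excess (note the
			 * division by 1024: the comparison is in kilobytes),
			 * so a single pageout() call never removes much more
			 * memory than the cap still requires.
			 */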
			do {
				endaddr = advance_prpageheader_cur(&cur);
			} while (endaddr != NULL &&
			    *(caddr_t)cur.pr_pdaddr == 0 &&
			    (((intptr_t)endaddr - (intptr_t)vicaddr) / 1024)
			    < excess);
			st_debug(STDL_HIGH, lcol, "endaddr %p, *cur %d\n",
			    endaddr, *(caddr_t)cur.pr_pdaddr);
			/*
			 * Page out from vicaddr to the end of the mapping, or
			 * endaddr if set, then continue scanning after
			 * endaddr, or the next mapping, if not set.
			 */
			nvicaddr = endaddr;
			if (endaddr == NULL)
				endaddr = (caddr_t)cur.pr_addr +
				    cur.pr_pagesize * cur.pr_npage;
			if (pageout(vic->lpc_pid, scan_pr, vicaddr, endaddr) ==
			    0) {
				int64_t d_rss, att;
				int willignore = 0;
				excess += (d_rss = rss_delta(
				    &new_psinfo, &old_psinfo, vic));

				/*
				 * If this pageout attempt was unsuccessful
				 * (the resident portion was not affected), and
				 * was for the whole mapping, put it in the
				 * ignored set, so it will not be scanned again
				 * until some page is referenced or modified.
				 */
				if (d_rss >= 0 && (void *)cur.pr_addr ==
				    vicaddr && (cur.pr_pagesize * cur.pr_npage)
				    == ((uintptr_t)endaddr -
				    (uintptr_t)vicaddr)) {
					if (lmapping_insert(&vic->lpc_ignore,
					    cur.pr_addr, cur.pr_pagesize *
					    cur.pr_npage) != 0)
						debug("not enough memory to add"
						    " mapping at %p to ignored"
						    " set\n",
						    (void *)cur.pr_addr);
					willignore = 1;
				}
				/*
				 * Update statistics.
				 */
				lcol->lcol_stat.lcols_pg_att += (att =
				    ((intptr_t)endaddr - (intptr_t)vicaddr) /
				    1024);
				st_debug(STDL_NORMAL, lcol, "paged out 0x%p"
				    "+0t(%llu/%llu)kB%s\n", vicaddr,
				    (unsigned long long)((d_rss <
				    0) ? - d_rss : 0), (unsigned long long)att,
				    willignore ? " (will ignore)" : "");
			} else {
				st_debug(STDL_NORMAL, lcol,
				    "process %d: exited/unscannable\n",
				    (int)vic->lpc_pid);
				vic->lpc_unscannable = 1;
				goto nextproc;
			}
			/*
			 * Update the statistics file, if it's time.
			 */
			check_update_statistics();

			vicaddr = (nvicaddr != NULL) ? nvicaddr : (void
			    *)advance_prpageheader_cur_nextmapping(&cur);
		}
		excess += rss_delta(&new_psinfo, &old_psinfo, vic);
		st_debug(STDL_NORMAL, lcol, "done, excess %lld\n",
		    (long long)excess);
nextproc:
		/*
		 * If a process was grabbed, release it, destroying its agent.
		 */
		if (scan_pr != NULL) {
			(void) Prelease(scan_pr, 0);
			scan_pr = NULL;
		}
		lcol->lcol_victim = vic;
		/*
		 * Scan the collection at most once. Only if scanning was not
		 * aborted for any reason, and the end of lprocess has not been
		 * reached, determine the next victim and scan it.
		 */
		if (vic != NULL) {
			if (vic->lpc_next != NULL) {
				/*
				 * Determine the next process to be scanned.
				 */
				if (excess > 0) {
					vic = get_valid_victim(lcol,
					    vic->lpc_next);
					vicaddr = 0;
				}
			} else {
				/*
				 * A complete scan of the collection was made,
				 * so tick the scan counter and stop scanning
				 * until the next request.
				 */
				lcol->lcol_stat.lcols_scan_count++;
				lcol->lcol_stat.lcols_scan_time_complete
				    = lcol->lcol_stat.lcols_scan_time;
				/*
				 * If an excess still exists, tick the
				 * "ineffective scan" counter, signalling that
				 * the cap may be unenforceable.
				 */
				if (resumed == 0 && excess > 0)
					lcol->lcol_stat
					    .lcols_scan_ineffective++;
				/*
				 * Scanning should start at the beginning of
				 * the process list at the next request.
				 */
				if (excess > 0)
					vic = NULL;
			}
		}
	}
	lcol->lcol_stat.lcols_scan_time += (gethrtime() - scan_start);
	st_debug(STDL_HIGH, lcol, "done scanning; excess %lld\n",
	    (long long)excess);

	lcol->lcol_resaddr = vicaddr;
	if (lcol->lcol_resaddr == NULL && lcol->lcol_victim != NULL) {
		lcol->lcol_victim = get_valid_victim(lcol,
		    lcol->lcol_victim->lpc_next);
	}
}
/*
 * Abort the scan in progress, and destroy the agent LWP of any grabbed
 * process.
 */
void
scan_abort(void)
{
	if (scan_pr != NULL)
		(void) Prelease(scan_pr, 0);
}
static void
revoke_xmap(rfd_t *rfd)
{
	lprocess_t *lpc = rfd->rfd_data;

	debug("revoking xmap for process %d\n", (int)lpc->lpc_pid);
	ASSERT(lpc->lpc_xmap_fd != -1);
	lpc->lpc_xmap_fd = -1;
}
/*
 * Retrieve the process's current xmap, which is used to determine the size of
 * the resident portion of its segments. Returns zero if successful.
 */
static int
lpc_xmap_update(lprocess_t *lpc)
{
	int res;
	struct stat st;

	free(lpc->lpc_xmap);
	lpc->lpc_xmap = NULL;
	lpc->lpc_nxmap = -1;

	if (lpc->lpc_xmap_fd == -1) {
		char pathbuf[PROC_PATH_MAX];

		(void) snprintf(pathbuf, sizeof (pathbuf), "/proc/%d/xmap",
		    (int)lpc->lpc_pid);
		if ((lpc->lpc_xmap_fd = rfd_open(pathbuf, 1, RFD_XMAP,
		    revoke_xmap, lpc, O_RDONLY, 0)) < 0)
			return (-1);
	}

redo:
	if (fstat(lpc->lpc_xmap_fd, &st) != 0) {
		debug("cannot stat xmap\n");
		(void) rfd_close(lpc->lpc_xmap_fd);
		lpc->lpc_xmap_fd = -1;
		return (-1);
	}
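	/*
	 * A well-formed xmap file is a whole number of prxmap_t entries; any
	 * remainder means the file is in a format this code does not
	 * understand, and it is treated as unreadable.
	 */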
	if ((st.st_size % sizeof (*lpc->lpc_xmap)) != 0) {
		debug("xmap wrong size\n");
		(void) rfd_close(lpc->lpc_xmap_fd);
		lpc->lpc_xmap_fd = -1;
		return (-1);
	}
	lpc->lpc_xmap = malloc(st.st_size);
	if (lpc->lpc_xmap == NULL) {
		debug("cannot malloc() %ld bytes for xmap", st.st_size);
		(void) rfd_close(lpc->lpc_xmap_fd);
		lpc->lpc_xmap_fd = -1;
		return (-1);
	}
= pread(lpc
->lpc_xmap_fd
, lpc
->lpc_xmap
, st
.st_size
, 0)) !=
1055 free(lpc
->lpc_xmap
);
1056 lpc
->lpc_xmap
= NULL
;
1058 debug("xmap changed size, retrying\n");
1061 debug("cannot read xmap");
1065 lpc
->lpc_nxmap
= st
.st_size
/ sizeof (*lpc
->lpc_xmap
);