Merge remote-tracking branch 'origin/master'
[unleashed/lotheac.git] / usr / src / cmd / rcap / rcapd / rcapd_main.c
blob1c48e0ff9371d9caab57cf2696d0a5568f129a5d
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
23 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
27 * rcapd is a long-running daemon enforcing project-based resource caps (see
28 * rcapd(8)). Each instance of a process aggregate (project or, generically,
29 * "collection") may have a memory cap. A single thread monitors the resource
30 * utilization of capped collections, enforces caps when they are exceeded (and
31 * other conditions are met), and incorporates changes in configuration or
32 * caps. Each of these actions occurs not more frequently than the rate
33 * specified with rcapadm(8).
36 #include <sys/priocntl.h>
37 #include <sys/proc.h>
38 #include <sys/resource.h>
39 #include <sys/sysinfo.h>
40 #include <sys/stat.h>
41 #include <sys/sysmacros.h>
42 #include <sys/time.h>
43 #include <sys/types.h>
44 #include <dirent.h>
45 #include <errno.h>
46 #include <fcntl.h>
47 #include <kstat.h>
48 #include <libintl.h>
49 #include <limits.h>
50 #include <locale.h>
51 #include <priv.h>
52 #include <signal.h>
53 #include <stdarg.h>
54 #include <stdio.h>
55 #include <stdio_ext.h>
56 #include <stdlib.h>
57 #include <libscf.h>
58 #include <strings.h>
59 #include <time.h>
60 #include <unistd.h>
61 #include <zone.h>
62 #include <assert.h>
63 #include <sys/vm_usage.h>
64 #include "rcapd.h"
65 #include "rcapd_mapping.h"
66 #include "rcapd_rfd.h"
67 #include "rcapd_stat.h"
68 #include "utils.h"
/*
 * The smaller of x and y, where a non-positive value means "unset": if x is
 * not positive the result is y, otherwise if y is not positive the result is
 * x.  Arguments may be evaluated more than once.
 */
#define	POSITIVE_MIN(x, y) \
	(((x) <= 0) ? (y) : ((y) <= 0) ? (x) : MIN(x, y))

/*
 * The time `seconds' after `base', in nanoseconds, or zero when the interval
 * is not positive.  Both arguments are parenthesized so that expression
 * arguments (e.g. "a + b") expand as a unit.
 */
#define	NEXT_EVENT_TIME(base, seconds) \
	(((int)(seconds) > 0) ? \
	((base) + (hrtime_t)(seconds) * (hrtime_t)NANOSEC) : (hrtime_t)0)

/*
 * As NEXT_EVENT_TIME(), but zero when no statistics file is configured.
 * NOTE(review): `base' is accepted but not used; the event time is measured
 * from gethrtime() at the point of expansion.
 */
#define	NEXT_REPORT_EVENT_TIME(base, seconds) \
	((rcfg.rcfg_stat_file[0] != 0) ? \
	NEXT_EVENT_TIME(gethrtime(), seconds) : (hrtime_t)0)

/* True when `eventtime' is nonzero and `time' is later than it. */
#define	EVENT_TIME(time, eventtime) \
	(((time) > (eventtime)) && (eventtime) != 0)

#define	STAT_TEMPLATE_SUFFIX	".XXXXXX"	/* suffix of mkstemp() arg */
#define	DAEMON_UID		1		/* uid to use */

/* Bits describing which kinds of capped collections exist. */
#define	CAPPED_PROJECT	0x01
#define	CAPPED_ZONE	0x02
86 typedef struct soft_scan_arg {
87 uint64_t ssa_sum_excess;
88 int64_t ssa_scan_goal;
89 boolean_t ssa_project_over_cap;
90 } soft_scan_arg_t;
92 typedef struct sample_col_arg {
93 boolean_t sca_any_over_cap;
94 boolean_t sca_project_over_cap;
95 } sample_col_arg_t;
98 static int debug_mode = 0; /* debug mode flag */
99 static pid_t rcapd_pid; /* rcapd's pid to ensure it's not */
100 /* scanned */
101 static kstat_ctl_t *kctl; /* kstat chain */
102 static int memory_pressure = 0; /* physical memory utilization (%) */
103 static int memory_pressure_sample = 0; /* count of samples */
104 static long page_size_kb = 0; /* system page size in KB */
105 static size_t nvmu_vals = 0; /* # of kernel RSS/swap vals in array */
106 static size_t vmu_vals_len = 0; /* size of RSS/swap vals array */
107 static vmusage_t *vmu_vals = NULL; /* snapshot of kernel RSS/swap values */
108 static hrtime_t next_report; /* time of next report */
109 static int termination_signal = 0; /* terminating signal */
110 static zoneid_t my_zoneid = (zoneid_t)-1;
111 static lcollection_t *gz_col; /* global zone collection */
113 rcfg_t rcfg;
115 * Updated when we re-read the collection configurations if this rcapd instance
116 * is running in the global zone and the global zone is capped.
118 boolean_t gz_capped = B_FALSE;
121 * Flags.
123 static int ever_ran;
124 int should_run;
125 static int should_reconfigure;
127 static int verify_statistics(void);
128 static int update_statistics(void);
131 * Checks if a process is marked 'system'. Returns FALSE only when it is not.
133 static boolean_t
134 proc_issystem(pid_t pid)
136 char pc_clname[PC_CLNMSZ];
138 if (priocntl(P_PID, pid, PC_GETXPARMS, NULL, PC_KY_CLNAME, pc_clname,
139 PC_KY_NULL) != -1) {
140 return (strcmp(pc_clname, "SYS") == 0);
141 } else {
142 debug("cannot get class-specific scheduling parameters; "
143 "assuming system process\n");
144 return (B_TRUE);
148 static void
149 lprocess_insert_mark(psinfo_t *psinfop)
151 pid_t pid = psinfop->pr_pid;
152 /* flag indicating whether the process should be scanned. */
153 int unscannable = psinfop->pr_nlwp == 0;
154 rcid_t colid;
155 lcollection_t *lcol;
156 lprocess_t *lproc;
159 * Determine which collection to put this process into. We only have
160 * to worry about tracking both zone and project capped processes if
161 * this rcapd instance is running in the global zone, since we'll only
162 * see processes in our own projects in a non-global zone. In the
163 * global zone, if the process belongs to a non-global zone, we only
164 * need to track it for the capped non-global zone collection. For
165 * global zone processes, we first attempt to put the process into a
166 * capped project collection. On the second pass into this function
167 * the projid will be cleared so we will just track the process for the
168 * global zone collection as a whole.
170 if (psinfop->pr_zoneid == my_zoneid && psinfop->pr_projid != -1) {
171 colid.rcid_type = RCIDT_PROJECT;
172 colid.rcid_val = psinfop->pr_projid;
173 } else {
174 /* try to add to zone collection */
175 colid.rcid_type = RCIDT_ZONE;
176 colid.rcid_val = psinfop->pr_zoneid;
179 if ((lcol = lcollection_find(&colid)) == NULL)
180 return;
183 * If the process is already being tracked, update the unscannable flag,
184 * as determined by the caller, from the process's psinfo.
186 lproc = lcol->lcol_lprocess;
187 while (lproc != NULL) {
188 if (lproc->lpc_pid == pid) {
189 lproc->lpc_mark = 1;
190 if (unscannable != 0 && lproc->lpc_unscannable == 0) {
191 debug("process %d: became unscannable\n",
192 (int)lproc->lpc_pid);
193 lproc->lpc_unscannable = 1;
195 return;
197 lproc = lproc->lpc_next;
201 * We've fallen off the list without finding our current process;
202 * insert it at the list head.
204 if ((lproc = malloc(sizeof (*lproc))) == NULL)
205 debug("insufficient memory to track new process %d", (int)pid);
206 else {
207 (void) bzero(lproc, sizeof (*lproc));
208 lproc->lpc_pid = pid;
209 lproc->lpc_mark = 1;
210 lproc->lpc_collection = lcol;
211 lproc->lpc_psinfo_fd = -1;
212 lproc->lpc_pgdata_fd = -1;
213 lproc->lpc_xmap_fd = -1;
216 * If the caller didn't flag this process as unscannable
217 * already, do some more checking.
219 lproc->lpc_unscannable = unscannable || proc_issystem(pid);
221 #ifdef DEBUG
223 * Verify the sanity of lprocess. It should not contain the
224 * process we are about to prepend.
226 if (lcollection_member(lcol, lproc)) {
227 lprocess_t *cur = lcol->lcol_lprocess;
228 debug("The collection %lld already has these members, "
229 "including me, %d!\n",
230 (long long)lcol->lcol_id.rcid_val,
231 (int)lproc->lpc_pid);
232 while (cur != NULL) {
233 debug("\t%d\n", (int)cur->lpc_pid);
234 cur = cur->lpc_next;
236 info(gettext("process already on lprocess\n"));
237 abort();
239 #endif /* DEBUG */
240 lproc->lpc_next = lcol->lcol_lprocess;
241 if (lproc->lpc_next != NULL)
242 lproc->lpc_next->lpc_prev = lproc;
243 lproc->lpc_prev = NULL;
244 lcol->lcol_lprocess = lproc;
246 debug("tracking %s %ld %d %s%s\n",
247 (colid.rcid_type == RCIDT_PROJECT ? "project" : "zone"),
248 (long)colid.rcid_val,
249 (int)pid, psinfop->pr_psargs,
250 (lproc->lpc_unscannable != 0) ? " (not scannable)" : "");
251 lcol->lcol_stat.lcols_proc_in++;
255 static int
256 list_walk_process_cb(lcollection_t *lcol, void *arg)
258 int (*cb)(lcollection_t *, lprocess_t *) =
259 (int(*)(lcollection_t *, lprocess_t *))arg;
260 lprocess_t *member;
261 lprocess_t *next;
263 member = lcol->lcol_lprocess;
264 while (member != NULL) {
265 pid_t pid = member->lpc_pid;
266 next = member->lpc_next;
268 debug_high("list_walk_all lpc %d\n", (int)pid);
269 if (cb(lcol, member) != 0) {
270 debug_high("list_walk_all aborted at lpc %d\n",
271 (int)pid);
272 return (1);
274 member = next;
277 return (0);
281 * Invoke the given callback for each process in each collection. Callbacks
282 * are allowed to change the linkage of the process on which they act.
284 static void
285 list_walk_all(int (*cb)(lcollection_t *, lprocess_t *))
287 list_walk_collection(list_walk_process_cb, (void *)cb);
290 static void
291 revoke_psinfo(rfd_t *rfd)
293 lprocess_t *lpc = (lprocess_t *)rfd->rfd_data;
295 if (lpc != NULL) {
296 debug("revoking psinfo fd for process %d\n", (int)lpc->lpc_pid);
297 ASSERT(lpc->lpc_psinfo_fd != -1);
298 lpc->lpc_psinfo_fd = -1;
299 } else
300 debug("revoking psinfo fd for unknown process\n");
304 * Retrieve a process's psinfo via an already-opened or new file descriptor.
305 * The supplied descriptor will be closed on failure. An optional callback
306 * will be invoked with the last descriptor tried, and a supplied callback
307 * argument, as its arguments, such that the new descriptor may be cached, or
308 * an old one may be invalidated. If the result of the callback is zero, the
309 * the caller is to assume responsibility for the file descriptor, to close it
310 * with rfd_close().
312 * On failure, a nonzero value is returned.
315 get_psinfo(pid_t pid, psinfo_t *psinfo, int cached_fd,
316 int(*fd_update_cb)(void *, int), void *arg, lprocess_t *lpc)
318 int fd;
319 int can_try_uncached;
321 ASSERT(!(cached_fd > 0 && fd_update_cb == NULL));
323 do {
324 if (cached_fd >= 0) {
325 fd = cached_fd;
326 can_try_uncached = 1;
327 debug_high("%d/psinfo, trying cached fd %d\n",
328 (int)pid, fd);
329 } else {
330 char pathbuf[PROC_PATH_MAX];
332 can_try_uncached = 0;
333 (void) snprintf(pathbuf, sizeof (pathbuf),
334 "/proc/%d/psinfo", (int)pid);
335 if ((fd = rfd_open(pathbuf, 1, RFD_PSINFO,
336 revoke_psinfo, lpc, O_RDONLY, 0000)) < 0) {
337 debug("cannot open %s", pathbuf);
338 break;
339 } else
340 debug_high("opened %s, fd %d\n", pathbuf, fd);
343 if (pread(fd, psinfo, sizeof (*psinfo), 0) ==
344 sizeof (*psinfo) && psinfo->pr_pid == pid)
345 break;
346 else {
347 debug_high("closed fd %d\n", fd);
348 if (rfd_close(fd) != 0)
349 debug("could not close fd %d", fd);
350 fd = cached_fd = -1;
352 } while (can_try_uncached == 1);
354 if (fd_update_cb == NULL || fd_update_cb(arg, fd) != 0)
355 if (fd >= 0) {
356 debug_high("closed %s fd %d\n", fd_update_cb == NULL ?
357 "uncached" : "cached", fd);
358 if (rfd_close(fd) != 0)
359 debug("could not close fd %d", fd);
362 debug_high("get_psinfo ret %d, fd %d, %s\n", ((fd >= 0) ? 0 : -1), fd,
363 fd_update_cb != NULL ? "cached" : "uncached");
364 return ((fd >= 0) ? 0 : -1);
368 * Retrieve the collection membership of all processes and update the psinfo of
369 * those non-system, non-zombie ones in collections. For global zone processes,
370 * we first attempt to put the process into a capped project collection. We
371 * also want to track the process for the global zone collection as a whole.
373 static void
374 proc_cb(const pid_t pid)
376 psinfo_t psinfo;
378 if (get_psinfo(pid, &psinfo, -1, NULL, NULL, NULL) == 0) {
379 lprocess_insert_mark(&psinfo);
380 if (gz_capped && psinfo.pr_zoneid == GLOBAL_ZONEID) {
382 * We also want to track this process for the global
383 * zone as a whole so add it to the global zone
384 * collection as well.
386 psinfo.pr_projid = -1;
387 lprocess_insert_mark(&psinfo);
393 * Cache the process' psinfo fd, taking responsibility for freeing it.
396 lprocess_update_psinfo_fd_cb(void *arg, int fd)
398 lprocess_t *lpc = arg;
400 lpc->lpc_psinfo_fd = fd;
401 return (0);
405 * Get the system pagesize.
407 static void
408 get_page_size(void)
410 page_size_kb = sysconf(_SC_PAGESIZE) / 1024;
411 debug("physical page size: %luKB\n", page_size_kb);
414 static void
415 tm_fmt(char *msg, hrtime_t t1, hrtime_t t2)
417 hrtime_t diff = t2 - t1;
419 if (diff < MILLISEC)
420 debug("%s: %lld nanoseconds\n", msg, diff);
421 else if (diff < MICROSEC)
422 debug("%s: %.2f microseconds\n", msg, (float)diff / MILLISEC);
423 else if (diff < NANOSEC)
424 debug("%s: %.2f milliseconds\n", msg, (float)diff / MICROSEC);
425 else
426 debug("%s: %.2f seconds\n", msg, (float)diff / NANOSEC);
430 * Get the zone's & project's RSS from the kernel.
432 static void
433 rss_sample(boolean_t my_zone_only, uint_t col_types)
435 size_t nres;
436 size_t i;
437 uint_t flags;
438 hrtime_t t1, t2;
440 if (my_zone_only) {
441 flags = VMUSAGE_ZONE;
442 } else {
443 flags = 0;
444 if (col_types & CAPPED_PROJECT)
445 flags |= VMUSAGE_PROJECTS;
446 if (col_types & CAPPED_ZONE && my_zoneid == GLOBAL_ZONEID)
447 flags |= VMUSAGE_ALL_ZONES;
450 debug("vmusage sample flags 0x%x\n", flags);
451 if (flags == 0)
452 return;
454 again:
455 /* try the current buffer to see if the list will fit */
456 nres = vmu_vals_len;
457 t1 = gethrtime();
458 if (getvmusage(flags, my_zone_only ? 0 : rcfg.rcfg_rss_sample_interval,
459 vmu_vals, &nres) != 0) {
460 if (errno != EOVERFLOW) {
461 warn(gettext("can't read RSS from kernel\n"));
462 return;
465 t2 = gethrtime();
466 tm_fmt("getvmusage time", t1, t2);
468 debug("kernel nres %lu\n", (ulong_t)nres);
470 if (nres > vmu_vals_len) {
471 /* array size is now too small, increase it and try again */
472 free(vmu_vals);
474 if ((vmu_vals = (vmusage_t *)calloc(nres,
475 sizeof (vmusage_t))) == NULL) {
476 warn(gettext("out of memory: could not read RSS from "
477 "kernel\n"));
478 vmu_vals_len = nvmu_vals = 0;
479 return;
481 vmu_vals_len = nres;
482 goto again;
485 nvmu_vals = nres;
487 debug("vmusage_sample\n");
488 for (i = 0; i < nvmu_vals; i++) {
489 debug("%d: id: %d, type: 0x%x, rss_all: %llu (%lluKB), "
490 "swap: %llu\n", (int)i, (int)vmu_vals[i].vmu_id,
491 vmu_vals[i].vmu_type,
492 (unsigned long long)vmu_vals[i].vmu_rss_all,
493 (unsigned long long)vmu_vals[i].vmu_rss_all / 1024,
494 (unsigned long long)vmu_vals[i].vmu_swap_all);
498 static void
499 update_col_rss(lcollection_t *lcol)
501 int i;
503 lcol->lcol_rss = 0;
504 lcol->lcol_image_size = 0;
506 for (i = 0; i < nvmu_vals; i++) {
507 if (vmu_vals[i].vmu_id != lcol->lcol_id.rcid_val)
508 continue;
510 if (vmu_vals[i].vmu_type == VMUSAGE_ZONE &&
511 lcol->lcol_id.rcid_type != RCIDT_ZONE)
512 continue;
514 if (vmu_vals[i].vmu_type == VMUSAGE_PROJECTS &&
515 lcol->lcol_id.rcid_type != RCIDT_PROJECT)
516 continue;
518 /* we found the right RSS entry, update the collection vals */
519 lcol->lcol_rss = vmu_vals[i].vmu_rss_all / 1024;
520 lcol->lcol_image_size = vmu_vals[i].vmu_swap_all / 1024;
521 break;
526 * Sample the collection RSS, updating the collection's statistics with the
527 * results. Also, sum the rss of all capped projects & return true if
528 * the collection is over cap.
530 static int
531 rss_sample_col_cb(lcollection_t *lcol, void *arg)
533 int64_t excess;
534 uint64_t rss;
535 sample_col_arg_t *col_argp = (sample_col_arg_t *)arg;
537 update_col_rss(lcol);
539 lcol->lcol_stat.lcols_rss_sample++;
540 rss = lcol->lcol_rss;
541 excess = rss - lcol->lcol_rss_cap;
542 if (excess > 0) {
543 lcol->lcol_stat.lcols_rss_act_sum += rss;
544 col_argp->sca_any_over_cap = B_TRUE;
545 if (lcol->lcol_id.rcid_type == RCIDT_PROJECT)
546 col_argp->sca_project_over_cap = B_TRUE;
548 lcol->lcol_stat.lcols_rss_sum += rss;
550 if (lcol->lcol_stat.lcols_min_rss > rss)
551 lcol->lcol_stat.lcols_min_rss = rss;
552 if (lcol->lcol_stat.lcols_max_rss < rss)
553 lcol->lcol_stat.lcols_max_rss = rss;
555 return (0);
559 * Determine if we have capped projects, capped zones or both.
561 static int
562 col_type_cb(lcollection_t *lcol, void *arg)
564 uint_t *col_type = (uint_t *)arg;
566 /* skip uncapped collections */
567 if (lcol->lcol_rss_cap == 0)
568 return (1);
570 if (lcol->lcol_id.rcid_type == RCIDT_PROJECT)
571 *col_type |= CAPPED_PROJECT;
572 else
573 *col_type |= CAPPED_ZONE;
575 /* once we know everything is capped, we can stop looking */
576 if ((*col_type & CAPPED_ZONE) && (*col_type & CAPPED_PROJECT))
577 return (1);
579 return (0);
583 * Open /proc and walk entries.
585 static void
586 proc_walk_all(void (*cb)(const pid_t))
588 DIR *pdir;
589 struct dirent *dirent;
590 pid_t pid;
592 (void) rfd_reserve(1);
593 if ((pdir = opendir("/proc")) == NULL)
594 die(gettext("couldn't open /proc!"));
596 while ((dirent = readdir(pdir)) != NULL) {
597 if (strcmp(".", dirent->d_name) == 0 ||
598 strcmp("..", dirent->d_name) == 0)
599 continue;
600 pid = atoi(dirent->d_name);
601 ASSERT(pid != 0 || strcmp(dirent->d_name, "0") == 0);
602 if (pid == rcapd_pid)
603 continue;
604 else
605 cb(pid);
607 (void) closedir(pdir);
611 * Clear unmarked callback.
613 /*ARGSUSED*/
614 static int
615 sweep_process_cb(lcollection_t *lcol, lprocess_t *lpc)
617 if (lpc->lpc_mark) {
618 lpc->lpc_mark = 0;
619 } else {
620 debug("process %d finished\n", (int)lpc->lpc_pid);
621 lprocess_free(lpc);
624 return (0);
628 * Print, for debugging purposes, a collection's recently-sampled RSS and
629 * excess.
631 /*ARGSUSED*/
632 static int
633 excess_print_cb(lcollection_t *lcol, void *arg)
635 int64_t excess = lcol->lcol_rss - lcol->lcol_rss_cap;
637 debug("%s %s rss/cap: %llu/%llu, excess = %lld kB\n",
638 (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? "project" : "zone"),
639 lcol->lcol_name,
640 (unsigned long long)lcol->lcol_rss,
641 (unsigned long long)lcol->lcol_rss_cap,
642 (long long)excess);
644 return (0);
648 * Scan those collections which have exceeded their caps.
650 * If we're running in the global zone it might have a cap. We don't want to
651 * do any capping for the global zone yet since we might get under the cap by
652 * just capping the projects in the global zone.
654 /*ARGSUSED*/
655 static int
656 scan_cb(lcollection_t *lcol, void *arg)
658 int64_t excess;
660 /* skip over global zone collection for now but keep track for later */
661 if (lcol->lcol_id.rcid_type == RCIDT_ZONE &&
662 lcol->lcol_id.rcid_val == GLOBAL_ZONEID) {
663 gz_col = lcol;
664 return (0);
667 if ((excess = lcol->lcol_rss - lcol->lcol_rss_cap) > 0) {
668 scan(lcol, excess);
669 lcol->lcol_stat.lcols_scan++;
672 return (0);
676 * Scan the global zone collection and see if it still exceeds its cap.
677 * We take into account the effects of capping any global zone projects here.
679 static void
680 scan_gz(lcollection_t *lcol, boolean_t project_over_cap)
682 int64_t excess;
685 * If we had projects over their cap and the global zone was also over
686 * its cap then we need to get the up-to-date global zone rss to
687 * determine if we are still over the global zone cap. We might have
688 * gone under while we scanned the capped projects. If there were no
689 * projects over cap then we can use the rss value we already have for
690 * the global zone.
692 excess = lcol->lcol_rss - lcol->lcol_rss_cap;
693 if (project_over_cap && excess > 0) {
694 rss_sample(B_TRUE, CAPPED_ZONE);
695 update_col_rss(lcol);
696 excess = lcol->lcol_rss - lcol->lcol_rss_cap;
699 if (excess > 0) {
700 debug("global zone excess %lldKB\n", (long long)excess);
701 scan(lcol, excess);
702 lcol->lcol_stat.lcols_scan++;
707 * Do a soft scan of those collections which have excesses. A soft scan is one
708 * in which the cap enforcement pressure is taken into account. The difference
709 * between the utilized physical memory and the cap enforcement pressure will
710 * be scanned-for, and each collection will be scanned proportionally by their
711 * present excesses.
713 static int
714 soft_scan_cb(lcollection_t *lcol, void *a)
716 int64_t excess;
717 soft_scan_arg_t *arg = a;
719 /* skip over global zone collection for now but keep track for later */
720 if (lcol->lcol_id.rcid_type == RCIDT_ZONE &&
721 lcol->lcol_id.rcid_val == GLOBAL_ZONEID) {
722 gz_col = lcol;
723 return (0);
726 if ((excess = lcol->lcol_rss - lcol->lcol_rss_cap) > 0) {
727 int64_t adjusted_excess =
728 excess * arg->ssa_scan_goal / arg->ssa_sum_excess;
730 debug("%s %ld excess %lld scan_goal %lld sum_excess %llu, "
731 "scanning %lld\n",
732 (lcol->lcol_id.rcid_type == RCIDT_PROJECT ?
733 "project" : "zone"),
734 (long)lcol->lcol_id.rcid_val,
735 (long long)excess, (long long)arg->ssa_scan_goal,
736 (unsigned long long)arg->ssa_sum_excess,
737 (long long)adjusted_excess);
739 scan(lcol, adjusted_excess);
740 lcol->lcol_stat.lcols_scan++;
743 return (0);
746 static void
747 soft_scan_gz(lcollection_t *lcol, void *a)
749 int64_t excess;
750 soft_scan_arg_t *arg = a;
753 * If we had projects over their cap and the global zone was also over
754 * its cap then we need to get the up-to-date global zone rss to
755 * determine if we are still over the global zone cap. We might have
756 * gone under while we scanned the capped projects. If there were no
757 * projects over cap then we can use the rss value we already have for
758 * the global zone.
760 excess = lcol->lcol_rss - lcol->lcol_rss_cap;
761 if (arg->ssa_project_over_cap && excess > 0) {
762 rss_sample(B_TRUE, CAPPED_ZONE);
763 update_col_rss(lcol);
764 excess = lcol->lcol_rss - lcol->lcol_rss_cap;
767 if (excess > 0) {
768 int64_t adjusted_excess =
769 excess * arg->ssa_scan_goal / arg->ssa_sum_excess;
771 debug("%s %ld excess %lld scan_goal %lld sum_excess %llu, "
772 "scanning %lld\n",
773 (lcol->lcol_id.rcid_type == RCIDT_PROJECT ?
774 "project" : "zone"),
775 (long)lcol->lcol_id.rcid_val,
776 (long long)excess, (long long)arg->ssa_scan_goal,
777 (unsigned long long)arg->ssa_sum_excess,
778 (long long)adjusted_excess);
780 scan(lcol, adjusted_excess);
781 lcol->lcol_stat.lcols_scan++;
786 * When a scan could happen, but caps aren't enforced tick the
787 * lcols_unenforced_cap counter.
789 /*ARGSUSED*/
790 static int
791 unenforced_cap_cb(lcollection_t *lcol, void *arg)
793 lcol->lcol_stat.lcols_unenforced_cap++;
795 return (0);
799 * Update the count of physically installed memory.
801 static void
802 update_phys_total(void)
804 uint64_t old_phys_total;
806 old_phys_total = phys_total;
807 phys_total = (uint64_t)sysconf(_SC_PHYS_PAGES) * page_size_kb;
808 if (phys_total != old_phys_total)
809 debug("physical memory%s: %lluM\n", (old_phys_total == 0 ?
810 "" : " adjusted"), (unsigned long long)(phys_total / 1024));
814 * Unlink a process from its collection, updating relevant statistics, and
815 * freeing its associated memory.
817 void
818 lprocess_free(lprocess_t *lpc)
820 pid_t pid;
822 lpc->lpc_collection->lcol_stat.lcols_proc_out++;
824 if (lpc->lpc_prev != NULL)
825 lpc->lpc_prev->lpc_next = lpc->lpc_next;
826 if (lpc->lpc_next != NULL)
827 lpc->lpc_next->lpc_prev = lpc->lpc_prev;
828 if (lpc->lpc_collection->lcol_lprocess == lpc)
829 lpc->lpc_collection->lcol_lprocess = (lpc->lpc_next !=
830 lpc ? lpc->lpc_next : NULL);
831 lpc->lpc_next = lpc->lpc_prev = NULL;
833 free(lpc->lpc_prpageheader);
834 free(lpc->lpc_xmap);
835 if (lpc->lpc_psinfo_fd >= 0) {
836 if (rfd_close(lpc->lpc_psinfo_fd) != 0)
837 debug("could not close %d lpc_psinfo_fd %d",
838 (int)lpc->lpc_pid, lpc->lpc_psinfo_fd);
839 lpc->lpc_psinfo_fd = -1;
841 if (lpc->lpc_pgdata_fd >= 0) {
842 if (rfd_close(lpc->lpc_pgdata_fd) != 0)
843 debug("could not close %d lpc_pgdata_fd %d",
844 (int)lpc->lpc_pid, lpc->lpc_pgdata_fd);
845 lpc->lpc_pgdata_fd = -1;
847 if (lpc->lpc_xmap_fd >= 0) {
848 if (rfd_close(lpc->lpc_xmap_fd) != 0)
849 debug("could not close %d lpc_xmap_fd %d",
850 (int)lpc->lpc_pid, lpc->lpc_xmap_fd);
851 lpc->lpc_xmap_fd = -1;
853 if (lpc->lpc_ignore != NULL)
854 lmapping_free(&lpc->lpc_ignore);
855 pid = lpc->lpc_pid;
856 free(lpc);
857 debug_high("process %d freed\n", (int)pid);
861 * Collection clear callback.
863 /*ARGSUSED*/
864 static int
865 collection_clear_cb(lcollection_t *lcol, void *arg)
867 lcol->lcol_mark = 0;
869 return (0);
873 * Respond to a terminating signal by setting a termination flag.
875 /*ARGSUSED*/
876 static void
877 terminate_signal(int signal)
879 if (termination_signal == 0)
880 termination_signal = signal;
881 should_run = 0;
/*
 * Handler for synchronous or asynchronous signals that would ordinarily make
 * the process abort.
 */
/*ARGSUSED*/
static void
abort_signal(int signal)
{
	/*
	 * Give the scanner one last chance to resume any processes it has
	 * stopped, then abort as the signal would have done.
	 */
	scan_abort();

	abort();
}
901 * Clean up collections which have been removed due to configuration. Unlink
902 * the collection from lcollection and free it.
904 /*ARGSUSED*/
905 static int
906 collection_sweep_cb(lcollection_t *lcol, void *arg)
908 if (lcol->lcol_mark == 0) {
909 debug("freeing %s %s\n",
910 (lcol->lcol_id.rcid_type == RCIDT_PROJECT ?
911 "project" : "zone"), lcol->lcol_name);
912 lcollection_free(lcol);
915 return (0);
919 * Set those variables which depend on the global configuration.
921 static void
922 finish_configuration(void)
925 * Warn that any lnode (or non-project) mode specification (by an SRM
926 * 1.3 configuration file, for example) is ignored.
928 if (strcmp(rcfg.rcfg_mode_name, "project") != 0) {
929 warn(gettext("%s mode specification ignored -- using project"
930 " mode\n"), rcfg.rcfg_mode_name);
931 rcfg.rcfg_mode_name = "project";
932 rcfg.rcfg_mode = rctype_project;
937 * Cause the configuration to be reread and applied.
939 static void
940 reread_configuration(void)
942 rcfg_t rcfg_new;
944 if (rcfg_read(&rcfg_new, update_statistics) != E_SUCCESS) {
945 warn(gettext("can't reread configuration \n"));
946 exit(SMF_EXIT_ERR_CONFIG);
947 } else {
949 * Done reading configuration. Remove existing
950 * collections in case there is a change in collection type.
952 if (rcfg.rcfg_mode != rcfg_new.rcfg_mode) {
953 list_walk_collection(collection_clear_cb, NULL);
954 list_walk_collection(collection_sweep_cb, NULL);
958 * Make the newly-read configuration the global one, and update
959 * any variables that depend on it.
961 rcfg = rcfg_new;
962 finish_configuration();
967 * First, examine changes, additions, and deletions to cap definitions.
968 * Then, set the next event time.
970 static void
971 reconfigure(hrtime_t now, hrtime_t *next_configuration,
972 hrtime_t *next_proc_walk, hrtime_t *next_rss_sample)
974 debug("reconfigure...\n");
977 * Walk the lcollection, marking active collections so inactive ones
978 * can be freed.
980 list_walk_collection(collection_clear_cb, NULL);
981 lcollection_update(LCU_ACTIVE_ONLY); /* mark */
982 list_walk_collection(collection_sweep_cb, NULL);
984 *next_configuration = NEXT_EVENT_TIME(now,
985 rcfg.rcfg_reconfiguration_interval);
988 * Reset each event time to the shorter of the previous and new
989 * intervals.
991 if (next_report == 0 && rcfg.rcfg_report_interval > 0)
992 next_report = now;
993 else
994 next_report = POSITIVE_MIN(next_report,
995 NEXT_REPORT_EVENT_TIME(now, rcfg.rcfg_report_interval));
997 if (*next_proc_walk == 0 && rcfg.rcfg_proc_walk_interval > 0)
998 *next_proc_walk = now;
999 else
1000 *next_proc_walk = POSITIVE_MIN(*next_proc_walk,
1001 NEXT_EVENT_TIME(now, rcfg.rcfg_proc_walk_interval));
1003 if (*next_rss_sample == 0 && rcfg.rcfg_rss_sample_interval > 0)
1004 *next_rss_sample = now;
1005 else
1006 *next_rss_sample = POSITIVE_MIN(*next_rss_sample,
1007 NEXT_EVENT_TIME(now, rcfg.rcfg_rss_sample_interval));
1011 * Respond to SIGHUP by triggering the rereading the configuration and cap
1012 * definitions.
1014 /*ARGSUSED*/
1015 static void
1016 sighup(int signal)
1018 should_reconfigure = 1;
1022 * Print, for debugging purposes, each collection's interval statistics.
1024 /*ARGSUSED*/
1025 static int
1026 simple_report_collection_cb(lcollection_t *lcol, void *arg)
1028 #define DELTA(field) \
1029 (unsigned long long)( \
1030 (lcol->lcol_stat.field - lcol->lcol_stat_old.field))
1032 debug("%s %s status: succeeded/attempted (k): %llu/%llu, "
1033 "ineffective/scans/unenforced/samplings: %llu/%llu/%llu/%llu, RSS "
1034 "min/max (k): %llu/%llu, cap %llu kB, processes/thpt: %llu/%llu, "
1035 "%llu scans over %llu ms\n",
1036 (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? "project" : "zone"),
1037 lcol->lcol_name,
1038 DELTA(lcols_pg_eff), DELTA(lcols_pg_att),
1039 DELTA(lcols_scan_ineffective), DELTA(lcols_scan),
1040 DELTA(lcols_unenforced_cap), DELTA(lcols_rss_sample),
1041 (unsigned long long)lcol->lcol_stat.lcols_min_rss,
1042 (unsigned long long)lcol->lcol_stat.lcols_max_rss,
1043 (unsigned long long)lcol->lcol_rss_cap,
1044 (unsigned long long)(lcol->lcol_stat.lcols_proc_in -
1045 lcol->lcol_stat.lcols_proc_out), DELTA(lcols_proc_out),
1046 DELTA(lcols_scan_count),
1047 NSEC2MSEC(DELTA(lcols_scan_time_complete)));
1049 #undef DELTA
1051 return (0);
1055 * Record each collection's interval statistics in the statistics file.
1057 static int
1058 report_collection_cb(lcollection_t *lcol, void *arg)
1060 lcollection_report_t dc;
1061 int fd = (intptr_t)arg;
1064 * Copy the relevant fields to the collection's record.
1066 bzero(&dc, sizeof (dc));
1067 dc.lcol_id = lcol->lcol_id;
1068 (void) strcpy(dc.lcol_name, lcol->lcol_name);
1069 dc.lcol_rss = lcol->lcol_rss;
1070 dc.lcol_image_size = lcol->lcol_image_size;
1071 dc.lcol_rss_cap = lcol->lcol_rss_cap;
1072 dc.lcol_stat = lcol->lcol_stat;
1074 if (write(fd, &dc, sizeof (dc)) == sizeof (dc)) {
1075 lcol->lcol_stat_old = lcol->lcol_stat;
1076 } else {
1077 debug("can't write %s %s statistics",
1078 (lcol->lcol_id.rcid_type == RCIDT_PROJECT ?
1079 "project" : "zone"),
1080 lcol->lcol_name);
1083 return (0);
1087 * Determine the count of pages scanned by the global page scanner, obtained
1088 * from the cpu_stat:*::scan kstats. Return zero on success.
1090 static int
1091 get_globally_scanned_pages(uint64_t *scannedp)
1093 kstat_t *ksp;
1094 uint64_t scanned = 0;
1096 if (kstat_chain_update(kctl) == -1) {
1097 warn(gettext("can't update kstat chain"));
1098 return (0);
1101 for (ksp = kctl->kc_chain; ksp != NULL; ksp = ksp->ks_next) {
1102 if (strcmp(ksp->ks_module, "cpu_stat") == 0) {
1103 if (kstat_read(kctl, ksp, NULL) != -1) {
1104 scanned += ((cpu_stat_t *)
1105 ksp->ks_data)->cpu_vminfo.scan;
1106 } else {
1107 return (-1);
1112 *scannedp = scanned;
1113 return (0);
1117 * Determine if the global page scanner is running, during which no memory
1118 * caps should be enforced, to prevent interference with the global page
1119 * scanner.
1121 static boolean_t
1122 is_global_scanner_running()
1124 /* measure delta in page scan count */
1125 static uint64_t new_sp = 0;
1126 static uint64_t old_sp = 0;
1127 boolean_t res = B_FALSE;
1129 if (get_globally_scanned_pages(&new_sp) == 0) {
1130 if (old_sp != 0 && (new_sp - old_sp) > 0) {
1131 debug("global memory pressure detected (%llu "
1132 "pages scanned since last interval)\n",
1133 (unsigned long long)(new_sp - old_sp));
1134 res = B_TRUE;
1136 old_sp = new_sp;
1137 } else {
1138 warn(gettext("unable to read cpu statistics"));
1139 new_sp = old_sp;
1142 return (res);
1146 * If soft caps are in use, determine if global memory pressure exceeds the
1147 * configured maximum above which soft caps are enforced.
1149 static boolean_t
1150 must_enforce_soft_caps()
1153 * Check for changes to the amount of installed physical memory, to
1154 * compute the current memory pressure.
1156 update_phys_total();
1158 memory_pressure = 100 - (int)((sysconf(_SC_AVPHYS_PAGES) * page_size_kb)
1159 * 100.0 / phys_total);
1160 memory_pressure_sample++;
1161 if (rcfg.rcfg_memory_cap_enforcement_pressure > 0 &&
1162 memory_pressure > rcfg.rcfg_memory_cap_enforcement_pressure) {
1163 return (B_TRUE);
1166 return (B_FALSE);
1170 * Update the shared statistics file with each collection's current statistics.
1171 * Return zero on success.
1173 static int
1174 update_statistics(void)
1176 int fd, res;
1177 static char template[LINELEN];
1180 * Try to create a directory irrespective of whether it is existing
1181 * or not. If it is not there then it will create. Otherwise any way
1182 * it will fail at mkstemp call below.
1184 (void) mkdir(STAT_FILE_DIR, 0755);
1187 * Create a temporary file.
1189 if (sizeof (template) < (strlen(rcfg.rcfg_stat_file) +
1190 strlen(STAT_TEMPLATE_SUFFIX) + 1)) {
1191 debug("temporary file template size too small\n");
1192 return (-1);
1194 (void) strcpy(template, rcfg.rcfg_stat_file);
1195 (void) strcat(template, STAT_TEMPLATE_SUFFIX);
1196 (void) rfd_reserve(1);
1197 fd = mkstemp(template);
1200 * Write the header and per-collection statistics.
1202 if (fd >= 0) {
1203 rcapd_stat_hdr_t rs;
1205 rs.rs_pid = rcapd_pid;
1206 rs.rs_time = gethrtime();
1207 ASSERT(sizeof (rs.rs_mode) > strlen(rcfg.rcfg_mode_name));
1208 (void) strcpy(rs.rs_mode, rcfg.rcfg_mode_name);
1209 rs.rs_pressure_cur = memory_pressure;
1210 rs.rs_pressure_cap = rcfg.rcfg_memory_cap_enforcement_pressure;
1211 rs.rs_pressure_sample = memory_pressure_sample;
1213 if (fchmod(fd, 0644) == 0 && write(fd, &rs, sizeof (rs)) ==
1214 sizeof (rs)) {
1215 list_walk_collection(report_collection_cb,
1216 (void *)(intptr_t)fd);
1218 * Replace the existing statistics file with this new
1219 * one.
1221 res = rename(template, rcfg.rcfg_stat_file);
1222 } else
1223 res = -1;
1224 (void) close(fd);
1225 } else
1226 res = -1;
1228 return (res);
1232 * Verify the statistics file can be created and written to, and die if an
1233 * existing file may be in use by another rcapd.
1235 static int
1236 verify_statistics(void)
1238 pid_t pid;
1241 * Warn if another instance of rcapd might be active.
1243 (void) rfd_reserve(1);
1244 pid = stat_get_rcapd_pid(rcfg.rcfg_stat_file);
1245 if (pid != rcapd_pid && pid != -1)
1246 die(gettext("%s exists; rcapd may already be active\n"),
1247 rcfg.rcfg_stat_file);
1249 return (update_statistics());
1252 static int
1253 sum_excess_cb(lcollection_t *lcol, void *arg)
1255 uint64_t *sum_excess = arg;
1257 *sum_excess += MAX((int64_t)0, (int64_t)(lcol->lcol_rss -
1258 lcol->lcol_rss_cap));
1259 return (0);
1263 * Compute the quantity of memory (in kilobytes) above the cap enforcement
1264 * pressure. Set the scan goal to that quantity (or at most the excess).
1266 static void
1267 compute_soft_scan_goal(soft_scan_arg_t *argp)
1270 * Compute the sum of the collections' excesses, which will be the
1271 * denominator.
1273 argp->ssa_sum_excess = 0;
1274 list_walk_collection(sum_excess_cb, &(argp->ssa_sum_excess));
1276 argp->ssa_scan_goal = MIN((sysconf(_SC_PHYS_PAGES) *
1277 (100 - rcfg.rcfg_memory_cap_enforcement_pressure) / 100 -
1278 sysconf(_SC_AVPHYS_PAGES)) * page_size_kb,
1279 argp->ssa_sum_excess);
/*
 * Emit the (localized) command-line usage message through info().
 */
static void
rcapd_usage(void)
{
	char *usage = gettext("usage: rcapd [-d]\n");

	info(usage);
}
1288 void
1289 check_update_statistics(void)
1291 hrtime_t now = gethrtime();
1293 if (EVENT_TIME(now, next_report)) {
1294 debug("updating statistics...\n");
1295 list_walk_collection(simple_report_collection_cb, NULL);
1296 if (update_statistics() != 0)
1297 debug("couldn't update statistics");
1298 next_report = NEXT_REPORT_EVENT_TIME(now,
1299 rcfg.rcfg_report_interval);
1303 static void
1304 verify_and_set_privileges(void)
1306 priv_set_t *required =
1307 priv_str_to_set("zone,sys_resource,proc_owner", ",", NULL);
1310 * Ensure the required privileges, suitable for controlling processes,
1311 * are possessed.
1313 if (setppriv(PRIV_SET, PRIV_PERMITTED, required) != 0 || setppriv(
1314 PRIV_SET, PRIV_EFFECTIVE, required) != 0)
1315 die(gettext("can't set requisite privileges"));
1318 * Ensure access to /var/run/daemon.
1320 if (setreuid(DAEMON_UID, DAEMON_UID) != 0)
1321 die(gettext("cannot become user daemon"));
1323 priv_freeset(required);
1327 * This function does the top-level work to determine if we should do any
1328 * memory capping, and if so, it invokes the right call-backs to do the work.
1330 static void
1331 do_capping(hrtime_t now, hrtime_t *next_proc_walk)
1333 boolean_t enforce_caps;
1334 /* soft cap enforcement flag, depending on memory pressure */
1335 boolean_t enforce_soft_caps;
1336 /* avoid interference with kernel's page scanner */
1337 boolean_t global_scanner_running;
1338 sample_col_arg_t col_arg;
1339 soft_scan_arg_t arg;
1340 uint_t col_types = 0;
1342 /* check what kind of collections (project/zone) are capped */
1343 list_walk_collection(col_type_cb, &col_types);
1344 debug("collection types: 0x%x\n", col_types);
1346 /* no capped collections, skip checking rss */
1347 if (col_types == 0)
1348 return;
1350 /* Determine if soft caps are enforced. */
1351 enforce_soft_caps = must_enforce_soft_caps();
1353 /* Determine if the global page scanner is running. */
1354 global_scanner_running = is_global_scanner_running();
1357 * Sample collections' member processes RSSes and recompute
1358 * collections' excess.
1360 rss_sample(B_FALSE, col_types);
1362 col_arg.sca_any_over_cap = B_FALSE;
1363 col_arg.sca_project_over_cap = B_FALSE;
1364 list_walk_collection(rss_sample_col_cb, &col_arg);
1365 list_walk_collection(excess_print_cb, NULL);
1366 debug("any collection/project over cap = %d, %d\n",
1367 col_arg.sca_any_over_cap, col_arg.sca_project_over_cap);
1369 if (enforce_soft_caps)
1370 debug("memory pressure %d%%\n", memory_pressure);
1373 * Cap enforcement is determined by the previous conditions.
1375 enforce_caps = !global_scanner_running && col_arg.sca_any_over_cap &&
1376 (rcfg.rcfg_memory_cap_enforcement_pressure == 0 ||
1377 enforce_soft_caps);
1379 debug("%senforcing caps\n", enforce_caps ? "" : "not ");
1382 * If soft caps are in use, determine the size of the portion from each
1383 * collection to scan for.
1385 if (enforce_caps && enforce_soft_caps)
1386 compute_soft_scan_goal(&arg);
1389 * Victimize offending collections.
1391 if (enforce_caps && (!enforce_soft_caps ||
1392 (arg.ssa_scan_goal > 0 && arg.ssa_sum_excess > 0))) {
1395 * Since at least one collection is over its cap & needs
1396 * enforcing, check if it is at least time for a process walk
1397 * (we could be well past time since we only walk /proc when
1398 * we need to) and if so, update each collections process list
1399 * in a single pass through /proc.
1401 if (EVENT_TIME(now, *next_proc_walk)) {
1402 debug("scanning process list...\n");
1403 proc_walk_all(proc_cb); /* insert & mark */
1404 list_walk_all(sweep_process_cb); /* free dead procs */
1405 *next_proc_walk = NEXT_EVENT_TIME(now,
1406 rcfg.rcfg_proc_walk_interval);
1409 gz_col = NULL;
1410 if (enforce_soft_caps) {
1411 debug("scan goal is %lldKB\n",
1412 (long long)arg.ssa_scan_goal);
1413 list_walk_collection(soft_scan_cb, &arg);
1414 if (gz_capped && gz_col != NULL) {
1415 /* process global zone */
1416 arg.ssa_project_over_cap =
1417 col_arg.sca_project_over_cap;
1418 soft_scan_gz(gz_col, &arg);
1420 } else {
1421 list_walk_collection(scan_cb, NULL);
1422 if (gz_capped && gz_col != NULL) {
1423 /* process global zone */
1424 scan_gz(gz_col, col_arg.sca_project_over_cap);
1427 } else if (col_arg.sca_any_over_cap) {
1428 list_walk_collection(unenforced_cap_cb, NULL);
1433 main(int argc, char *argv[])
1435 int res;
1436 int should_fork = 1; /* fork flag */
1437 hrtime_t now; /* current time */
1438 hrtime_t next; /* time of next event */
1439 int sig; /* signal iteration */
1440 struct rlimit rl;
1441 hrtime_t next_proc_walk; /* time of next /proc scan */
1442 hrtime_t next_configuration; /* time of next configuration */
1443 hrtime_t next_rss_sample; /* (latest) time of next RSS sample */
1445 (void) set_message_priority(RCM_INFO);
1446 (void) setpname("rcapd");
1447 rcapd_pid = getpid();
1448 (void) chdir("/");
1449 should_run = 1;
1450 ever_ran = 0;
1452 (void) setlocale(LC_ALL, "");
1453 (void) textdomain(TEXT_DOMAIN);
1456 * Parse command-line options.
1458 while ((res = getopt(argc, argv, "dF")) > 0)
1459 switch (res) {
1460 case 'd':
1461 should_fork = 0;
1462 if (debug_mode == 0) {
1463 debug_mode = 1;
1464 (void) set_message_priority(RCM_DEBUG);
1465 } else
1466 (void) set_message_priority(RCM_DEBUG_HIGH);
1467 break;
1468 case 'F':
1469 should_fork = 0;
1470 break;
1471 default:
1472 rcapd_usage();
1473 return (E_USAGE);
1474 /*NOTREACHED*/
1478 * Read the configuration.
1480 if (rcfg_read(&rcfg, verify_statistics) != E_SUCCESS) {
1481 warn(gettext("resource caps not configured\n"));
1482 return (SMF_EXIT_ERR_CONFIG);
1486 * If not debugging, fork and continue operating, changing the
1487 * destination of messages to syslog().
1489 if (should_fork == 1) {
1490 pid_t child;
1491 debug("forking\n");
1492 child = fork();
1493 if (child == -1)
1494 die(gettext("cannot fork"));
1495 if (child > 0)
1496 return (0);
1497 else {
1498 rcapd_pid = getpid();
1499 (void) set_message_destination(RCD_SYSLOG);
1500 (void) fclose(stdin);
1501 (void) fclose(stdout);
1502 (void) fclose(stderr);
1505 * Start a new session and detatch from the controlling tty.
1507 if (setsid() == (pid_t)-1)
1508 debug(gettext("setsid() failed; cannot detach from "
1509 "terminal"));
1512 finish_configuration();
1513 should_reconfigure = 0;
1516 * Check that required privileges are possessed.
1518 verify_and_set_privileges();
1520 now = next_report = next_proc_walk = next_rss_sample = gethrtime();
1521 next_configuration = NEXT_EVENT_TIME(gethrtime(),
1522 rcfg.rcfg_reconfiguration_interval);
1525 * Open the kstat chain.
1527 kctl = kstat_open();
1528 if (kctl == NULL)
1529 die(gettext("can't open kstats"));
1532 * Set RLIMIT_NOFILE as high as practical, so roughly 10K processes can
1533 * be effectively managed without revoking descriptors (at 3 per
1534 * process).
1536 rl.rlim_cur = 32 * 1024;
1537 rl.rlim_max = 32 * 1024;
1538 if (setrlimit(RLIMIT_NOFILE, &rl) != 0 &&
1539 getrlimit(RLIMIT_NOFILE, &rl) == 0) {
1540 rl.rlim_cur = rl.rlim_max;
1541 (void) setrlimit(RLIMIT_NOFILE, &rl);
1543 (void) enable_extended_FILE_stdio(-1, -1);
1545 if (getrlimit(RLIMIT_NOFILE, &rl) == 0)
1546 debug("fd limit: %lu\n", rl.rlim_cur);
1547 else
1548 debug("fd limit: unknown\n");
1550 get_page_size();
1551 my_zoneid = getzoneid();
1554 * Handle those signals whose (default) exit disposition
1555 * prevents rcapd from finishing scanning before terminating.
1557 (void) sigset(SIGINT, terminate_signal);
1558 (void) sigset(SIGQUIT, abort_signal);
1559 (void) sigset(SIGILL, abort_signal);
1560 (void) sigset(SIGEMT, abort_signal);
1561 (void) sigset(SIGFPE, abort_signal);
1562 (void) sigset(SIGBUS, abort_signal);
1563 (void) sigset(SIGSEGV, abort_signal);
1564 (void) sigset(SIGSYS, abort_signal);
1565 (void) sigset(SIGPIPE, terminate_signal);
1566 (void) sigset(SIGALRM, terminate_signal);
1567 (void) sigset(SIGTERM, terminate_signal);
1568 (void) sigset(SIGUSR1, terminate_signal);
1569 (void) sigset(SIGUSR2, terminate_signal);
1570 (void) sigset(SIGPOLL, terminate_signal);
1571 (void) sigset(SIGVTALRM, terminate_signal);
1572 (void) sigset(SIGXCPU, abort_signal);
1573 (void) sigset(SIGXFSZ, abort_signal);
1574 for (sig = SIGRTMIN; sig <= SIGRTMAX; sig++)
1575 (void) sigset(sig, terminate_signal);
1578 * Install a signal handler for reconfiguration processing.
1580 (void) sigset(SIGHUP, sighup);
1583 * Determine which process collections to cap.
1585 lcollection_update(LCU_COMPLETE);
1588 * Loop forever, monitoring collections' resident set sizes and
1589 * enforcing their caps. Look for changes in caps as well as
1590 * responding to requests to reread the configuration. Update
1591 * per-collection statistics periodically.
1593 while (should_run != 0) {
1594 struct timespec ts;
1597 * Announce that rcapd is starting.
1599 if (ever_ran == 0) {
1600 info(gettext("starting\n"));
1601 ever_ran = 1;
1605 * Check the configuration at every next_configuration interval.
1606 * Update the rss data once every next_rss_sample interval.
1607 * The condition of global memory pressure is also checked at
1608 * the same frequency, if strict caps are in use.
1610 now = gethrtime();
1613 * Detect configuration and cap changes only when SIGHUP
1614 * is received. Call reconfigure to apply new configuration
1615 * parameters.
1617 if (should_reconfigure == 1) {
1618 reread_configuration();
1619 should_reconfigure = 0;
1620 reconfigure(now, &next_configuration, &next_proc_walk,
1621 &next_rss_sample);
1624 if (EVENT_TIME(now, next_configuration)) {
1625 reconfigure(now, &next_configuration, &next_proc_walk,
1626 &next_rss_sample);
1630 * Do the main work for enforcing caps.
1632 if (EVENT_TIME(now, next_rss_sample)) {
1633 do_capping(now, &next_proc_walk);
1635 next_rss_sample = NEXT_EVENT_TIME(now,
1636 rcfg.rcfg_rss_sample_interval);
1640 * Update the statistics file, if it's time.
1642 check_update_statistics();
1645 * Sleep for some time before repeating.
1647 now = gethrtime();
1648 next = next_configuration;
1649 next = POSITIVE_MIN(next, next_report);
1650 next = POSITIVE_MIN(next, next_rss_sample);
1651 if (next > now && should_run != 0) {
1652 debug("sleeping %-4.2f seconds\n", (float)(next -
1653 now) / (float)NANOSEC);
1654 hrt2ts(next - now, &ts);
1655 (void) nanosleep(&ts, NULL);
1658 if (termination_signal != 0)
1659 debug("exiting due to signal %d\n", termination_signal);
1660 if (ever_ran != 0)
1661 info(gettext("exiting\n"));
1664 * Unlink the statistics file before exiting.
1666 if (rcfg.rcfg_stat_file[0] != 0)
1667 (void) unlink(rcfg.rcfg_stat_file);
1669 return (E_SUCCESS);