Merge remote-tracking branch 'origin/master'
[unleashed/lotheac.git] / usr / src / uts / common / conf / param.c
blobdc783f59f8d5e622611949b5cb0464be43811034
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
23 * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
24 * Copyright (c) 1983, 2010, Oracle and/or its affiliates. All rights reserved.
25 * Copyright 2012 Milan Jurik. All rights reserved.
28 #include <sys/cfgparam.h>
29 #include <sys/types.h>
30 #include <sys/time.h>
31 #include <sys/param.h>
32 #include <sys/systm.h>
33 #include <sys/signal.h>
34 #include <sys/sysmacros.h>
35 #include <sys/cmn_err.h>
36 #include <sys/user.h>
37 #include <sys/proc.h>
38 #include <sys/task.h>
39 #include <sys/project.h>
40 #include <sys/klwp.h>
41 #include <sys/vnode.h>
42 #include <sys/file.h>
43 #include <sys/fcntl.h>
44 #include <sys/flock.h>
45 #include <sys/var.h>
46 #include <sys/stream.h>
47 #include <sys/strsubr.h>
48 #include <sys/conf.h>
49 #include <sys/class.h>
50 #include <sys/ts.h>
51 #include <sys/rt.h>
52 #include <sys/exec.h>
53 #include <sys/exechdr.h>
54 #include <sys/buf.h>
55 #include <sys/resource.h>
56 #include <vm/seg.h>
57 #include <vm/pvn.h>
58 #include <vm/seg_kmem.h>
59 #include <sys/vmparam.h>
60 #include <sys/machparam.h>
61 #include <sys/utsname.h>
62 #include <sys/kmem.h>
63 #include <sys/stack.h>
64 #include <sys/modctl.h>
65 #include <sys/fdbuffer.h>
66 #include <sys/cyclic_impl.h>
67 #include <sys/disp.h>
68 #include <sys/tuneable.h>
69 #include <sys/systeminfo.h>
71 #include <sys/vmem.h>
72 #include <sys/clock.h>
73 #include <sys/clock_impl.h>
74 #include <sys/serializer.h>
77 * The following few lines describe generic things that must be compiled
78 * into the booted executable (unix) rather than genunix or any other
79 * module because they're required by crash dump readers, etc.
81 struct modctl modules; /* head of linked list of modules */
82 char *default_path; /* default module loading path */
83 struct swapinfo *swapinfo; /* protected by the swapinfo_lock */
84 proc_t *practive; /* active process list */
85 uint_t nproc; /* current number of processes */
86 proc_t p0; /* process 0 */
87 struct plock p0lock; /* p0's p_lock */
88 klwp_t lwp0; /* t0's lwp */
89 task_t *task0p; /* task 0 */
90 kproject_t *proj0p; /* location of project 0 */
93 * The following are "implementation architecture" dependent constants made
94 * available here in the form of initialized data for use by "implementation
95 * architecture" independent modules. See machparam.h.
97 const unsigned long _pagesize = (unsigned long)PAGESIZE;
98 const unsigned int _pageshift = (unsigned int)PAGESHIFT;
99 const unsigned long _pageoffset = (unsigned long)PAGEOFFSET;
101 * XXX - This value pagemask has to be a 64bit size because
102 * large file support uses this mask on offsets which are 64 bit size.
103 * using unsigned leaves the higher 32 bits value as zero thus
104 * corrupting offset calculations in the file system and VM.
106 const u_longlong_t _pagemask = (u_longlong_t)PAGEMASK;
107 const unsigned long _mmu_pagesize = (unsigned long)MMU_PAGESIZE;
108 const unsigned int _mmu_pageshift = (unsigned int)MMU_PAGESHIFT;
109 const unsigned long _mmu_pageoffset = (unsigned long)MMU_PAGEOFFSET;
110 const unsigned long _mmu_pagemask = (unsigned long)MMU_PAGEMASK;
111 uintptr_t _kernelbase = (uintptr_t)KERNELBASE;
112 uintptr_t _userlimit = (uintptr_t)USERLIMIT;
113 uintptr_t _userlimit32 = (uintptr_t)USERLIMIT32;
114 const uintptr_t _argsbase = (uintptr_t)ARGSBASE;
115 const unsigned int _diskrpm = (unsigned int)DISKRPM;
116 const unsigned long _pgthresh = (unsigned long)PGTHRESH;
117 const unsigned int _maxslp = (unsigned int)MAXSLP;
118 const unsigned long _maxhandspreadpages = (unsigned long)MAXHANDSPREADPAGES;
119 const int _ncpu = (int)NCPU;
120 const int _ncpu_log2 = (int)NCPU_LOG2;
121 const int _ncpu_p2 = (int)NCPU_P2;
122 const unsigned long _defaultstksz = (unsigned long)DEFAULTSTKSZ;
123 const unsigned int _nbpg = (unsigned int)MMU_PAGESIZE;
126 * System parameter formulae.
128 * This file is copied into each directory where we compile
129 * the kernel; it should be modified there to suit local taste
130 * if necessary.
134 * Default hz is 100, but if we set hires_tick we get higher resolution
135 * clock behavior (currently defined to be 1000 hz). Higher values seem
136 * to work, but are not supported.
138 * If we do decide to play with higher values, remember that hz should
139 * satisfy the following constraints to avoid integer round-off problems:
141 * (1) hz should be in the range 100 <= hz <= MICROSEC. If hz exceeds
142 * MICROSEC, usec_per_tick will be zero and lots of stuff will break.
143 * Similarly, if hz < 100 then hz / 100 == 0 and stuff will break.
145 * (2) If hz <= 1000, it should be both a multiple of 100 and a
146 * divisor of 1000.
148 * (3) If hz > 1000, it should be both a multiple of 1000 and a
149 * divisor of MICROSEC.
151 * Thus the only reasonable values of hz (i.e. the values that won't
152 * cause roundoff error) are: 100, 200, 500, 1000, 2000, 4000, 5000,
153 * 8000, 10000, 20000, 25000, 40000, 50000, 100000, 125000, 200000,
154 * 250000, 500000, 1000000. As of this writing (1996) a clock rate
155 * of more than about 10 kHz seems utterly ridiculous, although
156 * this observation will no doubt seem quaintly amusing one day.
158 #define HIRES_HZ_DEFAULT 1000
160 int hz = HZ_DEFAULT;
161 int hires_hz = HIRES_HZ_DEFAULT;
163 int hires_tick = 0;
164 int cpu_decay_factor = 10; /* this is no longer tied to clock */
165 int max_hres_adj; /* maximum adjustment of hrtime per tick */
166 int tick_per_msec; /* clock ticks per millisecond (zero if hz < 1000) */
169 * Milliseconds, Microseconds, and Nanoseconds per clock tick
171 * Note:
172 * msec_per_tick is zero if hz > 1000
174 int msec_per_tick;
175 int usec_per_tick;
176 int nsec_per_tick;
179 * Time Resolution values. These are defined in condvar.h and initialized in
180 * param_init(). Consumers of cv_reltimedwait() and cv_reltimedwait_sig()
181 * need to specify how accurate the timeout argument should be through
182 * one of these values. The intention is to allow the underlying implementation
183 * to anticipate or defer the expiration of timeouts, preventing unnecessary
184 * wakeups by batch processing similarly expiring events.
186 time_res_t time_res[TR_COUNT];
189 * Setting "snooping" to a non-zero value will cause a deadman panic if
190 * snoop_interval microseconds elapse without lbolt increasing. The default
191 * snoop_interval is 50 seconds.
193 #define SNOOP_INTERVAL_MIN (MICROSEC)
194 #define SNOOP_INTERVAL_DEFAULT (50 * MICROSEC)
196 int snooping = 0;
197 uint_t snoop_interval = SNOOP_INTERVAL_DEFAULT;
200 * Tables of initialization functions, called from main().
203 extern void system_taskq_init(void);
204 extern void binit(void);
205 extern void space_init(void);
206 extern void dnlc_init(void);
207 extern void vfsinit(void);
208 extern void finit(void);
209 extern void strinit(void);
210 extern void flk_init(void);
211 extern void ftrace_init(void);
212 extern void softcall_init(void);
213 extern void ttyinit(void);
214 extern void schedctl_init(void);
215 extern void deadman_init(void);
216 extern void clock_timer_init(void);
217 extern void clock_realtime_init(void);
218 extern void clock_highres_init(void);
219 extern void clock_tick_mp_init(void);
220 extern void cu_init(void);
221 extern void callout_mp_init(void);
222 extern void cpu_seq_tbl_init(void);
224 void (*init_tbl[])(void) = {
225 system_taskq_init,
226 binit,
227 space_init,
228 dnlc_init,
229 vfsinit,
230 finit,
231 strinit,
232 serializer_init,
233 softcall_init,
234 ttyinit,
235 as_init,
236 pvn_init,
237 anon_init,
238 segvn_init,
239 flk_init,
240 cpu_seq_tbl_init,
241 schedctl_init,
242 fdb_init,
243 deadman_init,
244 clock_timer_init,
245 clock_realtime_init,
246 clock_highres_init,
253 * Any per cpu resources should be initialized via
254 * an entry in mp_init_tbl().
256 void (*mp_init_tbl[])(void) = {
257 ftrace_init,
258 cyclic_mp_init,
259 clock_tick_mp_init,
260 cu_init,
261 callout_mp_init,
265 int maxusers; /* kitchen-sink knob for dynamic configuration */
268 * pidmax -- highest pid value assigned by the system
269 * Settable in /etc/system
271 int pidmax = DEFAULT_MAXPID;
274 * jump_pid - if set, this value is where pid numbers should start
275 * after the first few system pids (0-3) are used. If 0, pids are
276 * chosen in the usual way. This variable can be used to quickly
277 * create large pids (by setting it to 100000, for example). pids
278 * less than this value will never be chosen.
280 pid_t jump_pid = DEFAULT_JUMPPID;
283 * autoup -- used in struct var for dynamic config of the age a delayed-write
284 * buffer must be in seconds before bdflush will write it out.
286 #define DEFAULT_AUTOUP 30
287 int autoup = DEFAULT_AUTOUP;
290 * bufhwm -- tuneable variable for struct var for v_bufhwm.
291 * high water mark for buffer cache mem usage in units of K bytes.
293 * bufhwm_pct -- ditto, but given in % of physmem.
295 int bufhwm = 0;
296 int bufhwm_pct = 0;
299 * Process table.
301 int maxpid;
302 int max_nprocs; /* set in param_init() */
303 int maxuprc; /* set in param_init() */
304 int reserved_procs;
305 int nthread = 1;
308 * UFS tunables
310 int ufs_ninode; /* declared here due to backwards compatibility */
311 int ndquot; /* declared here due to backwards compatibility */
314 * Exec switch table. This is used by the generic exec module
315 * to switch out to the desired executable type, based on the
316 * magic number. The currently supported types are ELF, a.out
317 * (both NMAGIC and ZMAGIC), interpreter (#!) files,
318 * and Java executables.
321 * Magic numbers
323 short elfmagic = 0x7f45;
324 short intpmagic = 0x2321;
325 short jmagic = 0x504b;
327 short nomagic = 0;
330 * Magic strings
332 #define ELF32MAGIC_STRING "\x7f""ELF\x1"
333 #define ELF64MAGIC_STRING "\x7f""ELF\x2"
334 #define INTPMAGIC_STRING "#!"
335 #define JAVAMAGIC_STRING "PK\003\004"
336 #define AOUT_OMAGIC_STRING "\x1""\x07" /* 0407 */
337 #define AOUT_NMAGIC_STRING "\x1""\x08" /* 0410 */
338 #define AOUT_ZMAGIC_STRING "\x1""\x0b" /* 0413 */
339 #define NOMAGIC_STRING ""
341 char elf32magicstr[] = ELF32MAGIC_STRING;
342 char elf64magicstr[] = ELF64MAGIC_STRING;
343 char intpmagicstr[] = INTPMAGIC_STRING;
344 char javamagicstr[] = JAVAMAGIC_STRING;
345 char nomagicstr[] = NOMAGIC_STRING;
347 char *execswnames[] = {
348 "elfexec", /* Elf32 */
349 #ifdef _LP64
350 "elfexec", /* Elf64 */
351 #endif
352 "intpexec",
353 "javaexec",
354 NULL,
355 NULL,
356 NULL
359 struct execsw execsw[] = {
360 { elf32magicstr, 0, 5, NULL, NULL, NULL },
361 #ifdef _LP64
362 { elf64magicstr, 0, 5, NULL, NULL, NULL },
363 #endif
364 { intpmagicstr, 0, 2, NULL, NULL, NULL },
365 { javamagicstr, 0, 4, NULL, NULL, NULL },
366 { nomagicstr, 0, 0, NULL, NULL, NULL },
367 { nomagicstr, 0, 0, NULL, NULL, NULL },
368 { nomagicstr, 0, 0, NULL, NULL, NULL },
369 { nomagicstr, 0, 0, NULL, NULL, NULL }
371 int nexectype = sizeof (execsw) / sizeof (execsw[0]); /* # of exec types */
372 kmutex_t execsw_lock; /* Used for allocation of execsw entries */
375 * symbols added to make changing proc.max-file-descriptor
376 * simple via /etc/system
378 #define RLIM_FD_CUR 0x10000
379 #define RLIM_FD_MAX 0x10000
381 uint_t rlim_fd_cur = RLIM_FD_CUR;
382 uint_t rlim_fd_max = RLIM_FD_MAX;
385 * (Default resource limits were formerly declared here, but are now provided by
386 * the more general resource controls framework.)
390 * STREAMS tunables
392 int nstrpush = 9; /* maximum # of modules/drivers on a stream */
393 ssize_t strctlsz = 1024; /* maximum size of user-generated M_PROTO */
394 ssize_t strmsgsz = 0x10000; /* maximum size of user-generated M_DATA */
395 /* for `strmsgsz', zero means unlimited */
397 * Filesystem tunables
399 int rstchown = 1; /* POSIX_CHOWN_RESTRICTED is enabled */
400 int ngroups_max = NGROUPS_MAX_DEFAULT;
403 * generic scheduling stuff
405 * Configurable parameters for RT and TS are in the respective
406 * scheduling class modules.
409 pri_t maxclsyspri = MAXCLSYSPRI;
410 pri_t minclsyspri = MINCLSYSPRI;
411 char sys_name[] = "SYS";
413 extern pri_t sys_init(id_t, int, classfuncs_t **);
414 extern classfuncs_t sys_classfuncs;
416 sclass_t sclass[] = {
417 { "SYS", sys_init, &sys_classfuncs, STATIC_SCHED, 0 },
418 { "", NULL, NULL, NULL, 0 },
419 { "", NULL, NULL, NULL, 0 },
420 { "", NULL, NULL, NULL, 0 },
421 { "", NULL, NULL, NULL, 0 },
422 { "", NULL, NULL, NULL, 0 },
423 { "", NULL, NULL, NULL, 0 },
424 { "", NULL, NULL, NULL, 0 },
425 { "", NULL, NULL, NULL, 0 },
426 { "", NULL, NULL, NULL, 0 }
429 int loaded_classes = 1; /* for loaded classes */
430 kmutex_t class_lock; /* lock for class[] */
432 int nclass = sizeof (sclass) / sizeof (sclass_t);
433 char initcls[] = "TS";
434 char *defaultclass = initcls;
437 * Tunable system parameters.
441 * The integers tune_* are done this way so that the tune
442 * data structure may be "tuned" if necessary from the /etc/system
443 * file. The tune data structure is initialized in param_init().
446 tune_t tune;
449 * If freemem < t_gpgslo, then start to steal pages from processes.
451 int tune_t_gpgslo = 25;
454 * Rate at which fsflush is run, in seconds.
456 #define DEFAULT_TUNE_T_FSFLUSHR 1
457 int tune_t_fsflushr = DEFAULT_TUNE_T_FSFLUSHR;
460 * The minimum available resident (not swappable) memory to maintain
461 * in order to avoid deadlock. In pages.
463 int tune_t_minarmem = 25;
466 * The minimum available swappable memory to maintain in order to avoid
467 * deadlock. In pages.
469 int tune_t_minasmem = 25;
471 int tune_t_flckrec = 512; /* max # of active frlocks */
474 * Number of currently available pages that cannot be 'locked'
475 * This is set in init_pages_pp_maximum, and must be initialized
476 * to zero here to detect an override in /etc/system
478 pgcnt_t pages_pp_maximum = 0;
480 int boothowto; /* boot flags passed to kernel */
481 struct var v; /* System Configuration Information */
484 * System Configuration Information
488 * The physical system's host identifier, expressed as a decimal string.
489 * Code should only directly access this value when writing to it (setting the
490 * physical system's host identifier). Code that reads the physical system's
491 * host identifier should use zone_get_hostid(NULL) instead.
493 char hw_serial[HW_HOSTID_LEN] = "0";
495 #if defined(__sparc) || defined(__amd64)
497 char architecture[] = CONFIG_MACH64_STR;
498 char architecture_32[] = CONFIG_MACH_STR;
500 #elif defined(__i386)
502 char architecture[] = CONFIG_MACH_STR;
503 char architecture_32[] = CONFIG_MACH_STR;
505 #else
506 #error "unknown processor architecture"
507 #endif
509 char hw_provider[SYS_NMLN] = "";
510 char srpc_domain[SYS_NMLN] = "";
511 char platform[SYS_NMLN] = ""; /* read from the devinfo root node */
513 /* Initialize isa_list */
514 char *isa_list = architecture;
516 static pgcnt_t original_physmem = 0;
518 #define MIN_DEFAULT_MAXUSERS 8u
519 #define MAX_DEFAULT_MAXUSERS 2048u
520 #define MAX_MAXUSERS 4096u
522 void
523 param_preset(void)
525 original_physmem = physmem;
528 void
529 param_calc(int platform_max_nprocs)
532 * Default to about one "user" per megabyte, taking into
533 * account both physical and virtual constraints.
534 * Note: 2^20 is a meg; shifting right by (20 - PAGESHIFT)
535 * converts pages to megs without integer overflow.
537 if (maxusers == 0) {
538 pgcnt_t physmegs = physmem >> (20 - PAGESHIFT);
539 pgcnt_t virtmegs = vmem_size(heap_arena, VMEM_FREE) >> 20;
540 maxusers = MIN(MAX(MIN(physmegs, virtmegs),
541 MIN_DEFAULT_MAXUSERS), MAX_DEFAULT_MAXUSERS);
543 if (maxusers > MAX_MAXUSERS) {
544 maxusers = MAX_MAXUSERS;
545 cmn_err(CE_NOTE, "maxusers limited to %d", MAX_MAXUSERS);
548 #ifdef DEBUG
550 * The purpose of maxusers is to prevent memory overcommit.
551 * DEBUG kernels take more space, so reduce maxusers a bit.
553 maxusers = (3 * maxusers) / 4;
554 #endif
557 * We need to dynamically change any variables now so that
558 * the setting of maxusers and pidmax propagate to the other
559 * variables that are dependent on them.
561 if (reserved_procs == 0)
562 reserved_procs = 5;
563 if (pidmax < reserved_procs || pidmax > MAX_MAXPID)
564 maxpid = MAX_MAXPID;
565 else
566 maxpid = pidmax;
569 * This allows platform-dependent code to constrain the maximum
570 * number of processes allowed in case there are e.g. VM limitations
571 * with how many contexts are available.
573 if (max_nprocs == 0)
574 max_nprocs = (10 + 16 * maxusers);
575 if (platform_max_nprocs > 0 && max_nprocs > platform_max_nprocs)
576 max_nprocs = platform_max_nprocs;
577 if (max_nprocs > maxpid)
578 max_nprocs = maxpid;
580 if (maxuprc == 0)
581 maxuprc = (max_nprocs - reserved_procs);
584 void
585 param_init(void)
588 * Set each individual element of struct var v to be the
589 * default value. This is done this way
590 * so that a user can set the assigned integer value in the
591 * /etc/system file *IF* tuning is needed.
593 v.v_proc = max_nprocs; /* v_proc - max # of processes system wide */
594 v.v_maxupttl = max_nprocs - reserved_procs;
595 v.v_maxsyspri = (int)maxclsyspri; /* max global pri for sysclass */
596 v.v_maxup = MIN(maxuprc, v.v_maxupttl); /* max procs per user */
597 v.v_autoup = autoup; /* v_autoup - delay for delayed writes */
600 * Set each individual element of struct tune to be the
601 * default value. Each struct element This is done this way
602 * so that a user can set the assigned integer value in the
603 * /etc/system file *IF* tuning is needed.
605 tune.t_gpgslo = tune_t_gpgslo;
606 tune.t_fsflushr = tune_t_fsflushr;
607 tune.t_minarmem = tune_t_minarmem;
608 tune.t_minasmem = tune_t_minasmem;
609 tune.t_flckrec = tune_t_flckrec;
612 * Initialization for file descriptors to correct mistaken settings in
613 * /etc/system. Initialization of limits performed by resource control
614 * system.
616 if (rlim_fd_cur > rlim_fd_max)
617 rlim_fd_cur = rlim_fd_max;
620 * calculations needed if hz was set in /etc/system
622 if (hires_tick)
623 hz = hires_hz;
625 tick_per_msec = hz / MILLISEC;
626 msec_per_tick = MILLISEC / hz;
627 usec_per_tick = MICROSEC / hz;
628 nsec_per_tick = NANOSEC / hz;
629 max_hres_adj = nsec_per_tick >> ADJ_SHIFT;
632 * Consumers of relative timedwait functions must specify how accurately
633 * the given timeout must expire. This is currently TR_CLOCK_TICK for
634 * the vast majority of consumers, but nsec_per_tick becomes an
635 * artificial value in a tickless world. Each caller of such routines
636 * should re-evaluate their usage and specify the appropriate
637 * resolution.
639 time_res[TR_NANOSEC] = NANOSEC / NANOSEC;
640 time_res[TR_MICROSEC] = NANOSEC / MICROSEC;
641 time_res[TR_MILLISEC] = NANOSEC / MILLISEC;
642 time_res[TR_SEC] = NANOSEC / SEC;
643 time_res[TR_CLOCK_TICK] = nsec_per_tick;
647 * Validate tuneable parameters following /etc/system processing,
648 * but prior to param_init().
650 void
651 param_check(void)
653 #if defined(__x86)
654 if (physmem != original_physmem) {
655 cmn_err(CE_NOTE, "physmem cannot be modified to 0x%lx"
656 " via /etc/system. Please use eeprom(8) instead.",
657 physmem);
658 physmem = original_physmem;
660 #endif
661 if (ngroups_max < NGROUPS_UMIN)
662 ngroups_max = NGROUPS_UMIN;
663 if (ngroups_max > NGROUPS_UMAX)
664 ngroups_max = NGROUPS_UMAX;
666 /* If we have many groups then the ucred proto message also grows. */
667 if (ngroups_max > NGROUPS_OLDMAX &&
668 strctlsz < (ngroups_max - NGROUPS_OLDMAX) * sizeof (gid_t) + 1024) {
669 strctlsz = (ngroups_max - NGROUPS_OLDMAX) * sizeof (gid_t) +
670 1024;
673 if (autoup <= 0) {
674 autoup = DEFAULT_AUTOUP;
675 cmn_err(CE_WARN, "autoup <= 0; defaulting to %d", autoup);
678 if (tune_t_fsflushr <= 0) {
679 tune_t_fsflushr = DEFAULT_TUNE_T_FSFLUSHR;
680 cmn_err(CE_WARN, "tune_t_fsflushr <= 0; defaulting to %d",
681 tune_t_fsflushr);
684 if (jump_pid < 0 || jump_pid >= pidmax) {
685 jump_pid = 0;
686 cmn_err(CE_WARN, "jump_pid < 0 or >= pidmax; ignored");
689 if (snoop_interval < SNOOP_INTERVAL_MIN) {
690 snoop_interval = SNOOP_INTERVAL_DEFAULT;
691 cmn_err(CE_WARN, "snoop_interval < minimum (%d); defaulting"
692 " to %d", SNOOP_INTERVAL_MIN, SNOOP_INTERVAL_DEFAULT);