4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
24 * Copyright (c) 1983, 2010, Oracle and/or its affiliates. All rights reserved.
25 * Copyright 2012 Milan Jurik. All rights reserved.
28 #include <sys/cfgparam.h>
29 #include <sys/types.h>
31 #include <sys/param.h>
32 #include <sys/systm.h>
33 #include <sys/signal.h>
34 #include <sys/sysmacros.h>
35 #include <sys/cmn_err.h>
39 #include <sys/project.h>
41 #include <sys/vnode.h>
43 #include <sys/fcntl.h>
44 #include <sys/flock.h>
46 #include <sys/stream.h>
47 #include <sys/strsubr.h>
49 #include <sys/class.h>
53 #include <sys/exechdr.h>
55 #include <sys/resource.h>
58 #include <vm/seg_kmem.h>
59 #include <sys/vmparam.h>
60 #include <sys/machparam.h>
61 #include <sys/utsname.h>
63 #include <sys/stack.h>
64 #include <sys/modctl.h>
65 #include <sys/fdbuffer.h>
66 #include <sys/cyclic_impl.h>
68 #include <sys/tuneable.h>
69 #include <sys/systeminfo.h>
72 #include <sys/clock.h>
73 #include <sys/clock_impl.h>
74 #include <sys/serializer.h>
77 * The following few lines describe generic things that must be compiled
78 * into the booted executable (unix) rather than genunix or any other
79 * module because they're required by crash dump readers, etc.
81 struct modctl modules
; /* head of linked list of modules */
82 char *default_path
; /* default module loading path */
83 struct swapinfo
*swapinfo
; /* protected by the swapinfo_lock */
84 proc_t
*practive
; /* active process list */
85 uint_t nproc
; /* current number of processes */
86 proc_t p0
; /* process 0 */
87 struct plock p0lock
; /* p0's p_lock */
88 klwp_t lwp0
; /* t0's lwp */
89 task_t
*task0p
; /* task 0 */
90 kproject_t
*proj0p
; /* location of project 0 */
93 * The following are "implementation architecture" dependent constants made
94 * available here in the form of initialized data for use by "implementation
95 * architecture" independent modules. See machparam.h.
/*
 * Each variable below is initialized from the like-named machine macro
 * through an explicit cast to the variable's declared type.
 */
97 const unsigned long _pagesize
= (unsigned long)PAGESIZE
;
98 const unsigned int _pageshift
= (unsigned int)PAGESHIFT
;
99 const unsigned long _pageoffset
= (unsigned long)PAGEOFFSET
;
101 * XXX - This value pagemask has to be a 64bit size because
102 * large file support uses this mask on offsets which are 64 bit size.
103 * using unsigned leaves the higher 32 bits value as zero thus
104 * corrupting offset calculations in the file system and VM.
106 const u_longlong_t _pagemask
= (u_longlong_t
)PAGEMASK
;
107 const unsigned long _mmu_pagesize
= (unsigned long)MMU_PAGESIZE
;
108 const unsigned int _mmu_pageshift
= (unsigned int)MMU_PAGESHIFT
;
109 const unsigned long _mmu_pageoffset
= (unsigned long)MMU_PAGEOFFSET
;
/* Note: _mmu_pagemask is unsigned long, whereas _pagemask is u_longlong_t. */
110 const unsigned long _mmu_pagemask
= (unsigned long)MMU_PAGEMASK
;
/*
 * Unlike the surrounding definitions, _kernelbase, _userlimit and
 * _userlimit32 are not declared const.
 */
111 uintptr_t _kernelbase
= (uintptr_t)KERNELBASE
;
112 uintptr_t _userlimit
= (uintptr_t)USERLIMIT
;
113 uintptr_t _userlimit32
= (uintptr_t)USERLIMIT32
;
114 const uintptr_t _argsbase
= (uintptr_t)ARGSBASE
;
115 const unsigned int _diskrpm
= (unsigned int)DISKRPM
;
116 const unsigned long _pgthresh
= (unsigned long)PGTHRESH
;
117 const unsigned int _maxslp
= (unsigned int)MAXSLP
;
118 const unsigned long _maxhandspreadpages
= (unsigned long)MAXHANDSPREADPAGES
;
119 const int _ncpu
= (int)NCPU
;
120 const int _ncpu_log2
= (int)NCPU_LOG2
;
121 const int _ncpu_p2
= (int)NCPU_P2
;
122 const unsigned long _defaultstksz
= (unsigned long)DEFAULTSTKSZ
;
/*
 * _nbpg takes its value from MMU_PAGESIZE, the same macro used for
 * _mmu_pagesize, but is stored as unsigned int rather than unsigned long.
 */
123 const unsigned int _nbpg
= (unsigned int)MMU_PAGESIZE
;
126 * System parameter formulae.
128 * This file is copied into each directory where we compile
129 * the kernel; it should be modified there to suit local taste if necessary.
134 * Default hz is 100, but if we set hires_tick we get higher resolution
135 * clock behavior (currently defined to be 1000 hz). Higher values seem
136 * to work, but are not supported.
138 * If we do decide to play with higher values, remember that hz should
139 * satisfy the following constraints to avoid integer round-off problems:
141 * (1) hz should be in the range 100 <= hz <= MICROSEC. If hz exceeds
142 * MICROSEC, usec_per_tick will be zero and lots of stuff will break.
143 * Similarly, if hz < 100 then hz / 100 == 0 and stuff will break.
145 * (2) If hz <= 1000, it should be both a multiple of 100 and a divisor of 1000.
148 * (3) If hz > 1000, it should be both a multiple of 1000 and a
149 * divisor of MICROSEC.
151 * Thus the only reasonable values of hz (i.e. the values that won't
152 * cause roundoff error) are: 100, 200, 500, 1000, 2000, 4000, 5000,
153 * 8000, 10000, 20000, 25000, 40000, 50000, 100000, 125000, 200000,
154 * 250000, 500000, 1000000. As of this writing (1996) a clock rate
155 * of more than about 10 kHz seems utterly ridiculous, although
156 * this observation will no doubt seem quaintly amusing one day.
/*
 * hz value for high-resolution ticking.  The default of 1000 matches the
 * "currently defined to be 1000 hz" remark in the hz discussion above.
 * NOTE(review): the code that applies hires_hz is not visible in this
 * listing -- confirm where it is consumed.
 */
158 #define HIRES_HZ_DEFAULT 1000
161 int hires_hz
= HIRES_HZ_DEFAULT
;
164 int cpu_decay_factor
= 10; /* this is no longer tied to clock */
165 int max_hres_adj
; /* maximum adjustment of hrtime per tick */
166 int tick_per_msec
; /* clock ticks per millisecond (zero if hz < 1000) */
169 * Milliseconds, Microseconds, and Nanoseconds per clock tick
172 * msec_per_tick is zero if hz > 1000
179 * Time Resolution values. These are defined in condvar.h and initialized in
180 * param_init(). Consumers of cv_reltimedwait() and cv_reltimedwait_sig()
181 * need to specify how accurate the timeout argument should be through
182 * one of these values. The intention is to allow the underlying implementation
183 * to anticipate or defer the expiration of timeouts, preventing unnecessary
184 * wakeups by batch processing similarly expiring events.
186 time_res_t time_res
[TR_COUNT
];
189 * Setting "snooping" to a non-zero value will cause a deadman panic if
190 * snoop_interval microseconds elapse without lbolt increasing. The default
191 * snoop_interval is 50 seconds.
/*
 * Lower bound and default for snoop_interval, in microseconds.  A value
 * below SNOOP_INTERVAL_MIN is reset to SNOOP_INTERVAL_DEFAULT, with a
 * warning, by the tunable validation later in this file.
 */
193 #define SNOOP_INTERVAL_MIN (MICROSEC)
194 #define SNOOP_INTERVAL_DEFAULT (50 * MICROSEC)
197 uint_t snoop_interval
= SNOOP_INTERVAL_DEFAULT
;
200 * Tables of initialization functions, called from main().
203 extern void system_taskq_init(void);
204 extern void binit(void);
205 extern void space_init(void);
206 extern void dnlc_init(void);
207 extern void vfsinit(void);
208 extern void finit(void);
209 extern void strinit(void);
210 extern void flk_init(void);
211 extern void ftrace_init(void);
212 extern void softcall_init(void);
213 extern void ttyinit(void);
214 extern void schedctl_init(void);
215 extern void deadman_init(void);
216 extern void clock_timer_init(void);
217 extern void clock_realtime_init(void);
218 extern void clock_highres_init(void);
219 extern void clock_tick_mp_init(void);
220 extern void cu_init(void);
221 extern void callout_mp_init(void);
222 extern void cpu_seq_tbl_init(void);
224 void (*init_tbl
[])(void) = {
253 * Any per cpu resources should be initialized via
254 * an entry in mp_init_tbl().
256 void (*mp_init_tbl
[])(void) = {
265 int maxusers
; /* kitchen-sink knob for dynamic configuration */
268 * pidmax -- highest pid value assigned by the system
269 * Settable in /etc/system
271 int pidmax
= DEFAULT_MAXPID
;
274 * jump_pid - if set, this value is where pid numbers should start
275 * after the first few system pids (0-3) are used. If 0, pids are
276 * chosen in the usual way. This variable can be used to quickly
277 * create large pids (by setting it to 100000, for example). pids
278 * less than this value will never be chosen.
280 pid_t jump_pid
= DEFAULT_JUMPPID
;
283 * autoup -- used in struct var for dynamic config of the age a delayed-write
284 * buffer must be in seconds before bdflush will write it out.
/*
 * DEFAULT_AUTOUP is the initial value of autoup, in seconds.  It is also
 * the value the tunable validation later in this file restores when it
 * warns that autoup is <= 0.
 */
286 #define DEFAULT_AUTOUP 30
287 int autoup
= DEFAULT_AUTOUP
;
290 * bufhwm -- tuneable variable for struct var for v_bufhwm.
291 * high water mark for buffer cache mem usage in units of K bytes.
293 * bufhwm_pct -- ditto, but given in % of physmem.
302 int max_nprocs
; /* set in param_init() */
303 int maxuprc
; /* set in param_init() */
310 int ufs_ninode
; /* declared here due to backwards compatibility */
311 int ndquot
; /* declared here due to backwards compatibility */
314 * Exec switch table. This is used by the generic exec module
315 * to switch out to the desired executable type, based on the
316 * magic number. The currently supported types are ELF, a.out
317 * (both NMAGIC and ZMAGIC), interpreter (#!) files,
318 * and Java executables.
/*
 * Two-byte magic values.  In ASCII, 0x7f45 is the byte pair 0x7f 'E',
 * 0x2321 is '#' '!', and 0x504b is 'P' 'K' -- the same leading bytes as
 * the ELF, interpreter and Java magic strings defined below.
 */
323 short elfmagic
= 0x7f45;
324 short intpmagic
= 0x2321;
325 short jmagic
= 0x504b;
/*
 * Magic byte strings and the arrays that hold them.  The ELF strings are
 * written as two adjacent literals because a \x escape consumes every hex
 * digit that follows it, and 'E' is a hex digit.
 * NOTE(review): no array is defined from the AOUT_* strings in the lines
 * visible here -- confirm whether they are still used.
 */
332 #define ELF32MAGIC_STRING "\x7f""ELF\x1"
333 #define ELF64MAGIC_STRING "\x7f""ELF\x2"
334 #define INTPMAGIC_STRING "#!"
335 #define JAVAMAGIC_STRING "PK\003\004"
336 #define AOUT_OMAGIC_STRING "\x1""\x07" /* 0407 */
337 #define AOUT_NMAGIC_STRING "\x1""\x08" /* 0410 */
338 #define AOUT_ZMAGIC_STRING "\x1""\x0b" /* 0413 */
339 #define NOMAGIC_STRING ""
341 char elf32magicstr
[] = ELF32MAGIC_STRING
;
342 char elf64magicstr
[] = ELF64MAGIC_STRING
;
343 char intpmagicstr
[] = INTPMAGIC_STRING
;
344 char javamagicstr
[] = JAVAMAGIC_STRING
;
345 char nomagicstr
[] = NOMAGIC_STRING
;
347 char *execswnames
[] = {
348 "elfexec", /* Elf32 */
350 "elfexec", /* Elf64 */
359 struct execsw execsw
[] = {
360 { elf32magicstr
, 0, 5, NULL
, NULL
, NULL
},
362 { elf64magicstr
, 0, 5, NULL
, NULL
, NULL
},
364 { intpmagicstr
, 0, 2, NULL
, NULL
, NULL
},
365 { javamagicstr
, 0, 4, NULL
, NULL
, NULL
},
366 { nomagicstr
, 0, 0, NULL
, NULL
, NULL
},
367 { nomagicstr
, 0, 0, NULL
, NULL
, NULL
},
368 { nomagicstr
, 0, 0, NULL
, NULL
, NULL
},
369 { nomagicstr
, 0, 0, NULL
, NULL
, NULL
}
/*
 * Number of entries in execsw[], derived at compile time from the size
 * of the array divided by the size of one element.
 */
371 int nexectype
= sizeof (execsw
) / sizeof (execsw
[0]); /* # of exec types */
372 kmutex_t execsw_lock
; /* Used for allocation of execsw entries */
375 * symbols added to make changing proc.max-file-descriptor
376 * simple via /etc/system
/*
 * Defaults for rlim_fd_cur and rlim_fd_max; both are 0x10000 (65536).
 * Later in this file rlim_fd_cur is lowered to rlim_fd_max if it was
 * set higher.
 */
378 #define RLIM_FD_CUR 0x10000
379 #define RLIM_FD_MAX 0x10000
381 uint_t rlim_fd_cur
= RLIM_FD_CUR
;
382 uint_t rlim_fd_max
= RLIM_FD_MAX
;
385 * (Default resource limits were formerly declared here, but are now provided by
386 * the more general resource controls framework.)
392 int nstrpush
= 9; /* maximum # of modules/drivers on a stream */
393 ssize_t strctlsz
= 1024; /* maximum size of user-generated M_PROTO */
394 ssize_t strmsgsz
= 0x10000; /* maximum size of user-generated M_DATA */
395 /* for `strmsgsz', zero means unlimited */
397 * Filesystem tunables
399 int rstchown
= 1; /* POSIX_CHOWN_RESTRICTED is enabled */
/*
 * ngroups_max is clamped to the range [NGROUPS_UMIN, NGROUPS_UMAX] by the
 * tunable validation later in this file.
 */
400 int ngroups_max
= NGROUPS_MAX_DEFAULT
;
403 * generic scheduling stuff
405 * Configurable parameters for RT and TS are in the respective
406 * scheduling class modules.
409 pri_t maxclsyspri
= MAXCLSYSPRI
;
410 pri_t minclsyspri
= MINCLSYSPRI
;
411 char sys_name
[] = "SYS";
413 extern pri_t
sys_init(id_t
, int, classfuncs_t
**);
414 extern classfuncs_t sys_classfuncs
;
416 sclass_t sclass
[] = {
417 { "SYS", sys_init
, &sys_classfuncs
, STATIC_SCHED
, 0 },
418 { "", NULL
, NULL
, NULL
, 0 },
419 { "", NULL
, NULL
, NULL
, 0 },
420 { "", NULL
, NULL
, NULL
, 0 },
421 { "", NULL
, NULL
, NULL
, 0 },
422 { "", NULL
, NULL
, NULL
, 0 },
423 { "", NULL
, NULL
, NULL
, 0 },
424 { "", NULL
, NULL
, NULL
, 0 },
425 { "", NULL
, NULL
, NULL
, 0 },
426 { "", NULL
, NULL
, NULL
, 0 }
/*
 * NOTE(review): the initial value of 1 presumably accounts for the single
 * populated ("SYS") entry in sclass[] -- confirm against the class loader.
 */
429 int loaded_classes
= 1; /* for loaded classes */
430 kmutex_t class_lock
; /* lock for class[] */
/* Number of slots in sclass[], counting the empty placeholder entries. */
432 int nclass
= sizeof (sclass
) / sizeof (sclass_t
);
/* defaultclass initially points at initcls, i.e. the name "TS". */
433 char initcls
[] = "TS";
434 char *defaultclass
= initcls
;
437 * Tunable system parameters.
441 * The integers tune_* are done this way so that the tune
442 * data structure may be "tuned" if necessary from the /etc/system
443 * file. The tune data structure is initialized in param_init();
449 * If freemem < t_gpgslo, then start to steal pages from processes.
451 int tune_t_gpgslo
= 25;
454 * Rate at which fsflush is run, in seconds.
/*
 * Default fsflush rate, in seconds.  Also the value restored, with a
 * warning, when tune_t_fsflushr is found to be <= 0 during the tunable
 * validation later in this file.
 */
456 #define DEFAULT_TUNE_T_FSFLUSHR 1
457 int tune_t_fsflushr
= DEFAULT_TUNE_T_FSFLUSHR
;
460 * The minimum available resident (not swappable) memory to maintain
461 * in order to avoid deadlock. In pages.
463 int tune_t_minarmem
= 25;
466 * The minimum available swappable memory to maintain in order to avoid
467 * deadlock. In pages.
469 int tune_t_minasmem
= 25;
471 int tune_t_flckrec
= 512; /* max # of active frlocks */
474 * Number of currently available pages that cannot be 'locked'
475 * This is set in init_pages_pp_maximum, and must be initialized
476 * to zero here to detect an override in /etc/system
478 pgcnt_t pages_pp_maximum
= 0;
480 int boothowto
; /* boot flags passed to kernel */
481 struct var v
; /* System Configuration Information */
484 * System Configuration Information
488 * The physical system's host identifier, expressed as a decimal string.
489 * Code should only directly access this value when writing to it (setting the
490 * physical system's host identifier). Code that reads the physical system's
491 * host identifier should use zone_get_hostid(NULL) instead.
493 char hw_serial
[HW_HOSTID_LEN
] = "0";
495 #if defined(__sparc) || defined(__amd64)
497 char architecture
[] = CONFIG_MACH64_STR
;
498 char architecture_32
[] = CONFIG_MACH_STR
;
500 #elif defined(__i386)
502 char architecture
[] = CONFIG_MACH_STR
;
503 char architecture_32
[] = CONFIG_MACH_STR
;
506 #error "unknown processor architecture"
509 char hw_provider
[SYS_NMLN
] = "";
510 char srpc_domain
[SYS_NMLN
] = "";
511 char platform
[SYS_NMLN
] = ""; /* read from the devinfo root node */
513 /* Initialize isa_list */
514 char *isa_list
= architecture
;
/*
 * Copy of physmem saved elsewhere in this file.  It is used later to
 * detect a physmem change made via /etc/system and to restore the saved
 * value.
 */
516 static pgcnt_t original_physmem
= 0;
/*
 * Bounds on maxusers: the computed default is kept within
 * [MIN_DEFAULT_MAXUSERS, MAX_DEFAULT_MAXUSERS], and a value above
 * MAX_MAXUSERS is cut back to MAX_MAXUSERS with a notice.
 */
518 #define MIN_DEFAULT_MAXUSERS 8u
519 #define MAX_DEFAULT_MAXUSERS 2048u
520 #define MAX_MAXUSERS 4096u
525 original_physmem
= physmem
;
529 param_calc(int platform_max_nprocs
)
532 * Default to about one "user" per megabyte, taking into
533 * account both physical and virtual constraints.
534 * Note: 2^20 is a meg; shifting right by (20 - PAGESHIFT)
535 * converts pages to megs without integer overflow.
538 pgcnt_t physmegs
= physmem
>> (20 - PAGESHIFT
);
539 pgcnt_t virtmegs
= vmem_size(heap_arena
, VMEM_FREE
) >> 20;
540 maxusers
= MIN(MAX(MIN(physmegs
, virtmegs
),
541 MIN_DEFAULT_MAXUSERS
), MAX_DEFAULT_MAXUSERS
);
543 if (maxusers
> MAX_MAXUSERS
) {
544 maxusers
= MAX_MAXUSERS
;
545 cmn_err(CE_NOTE
, "maxusers limited to %d", MAX_MAXUSERS
);
550 * The purpose of maxusers is to prevent memory overcommit.
551 * DEBUG kernels take more space, so reduce maxusers a bit.
553 maxusers
= (3 * maxusers
) / 4;
557 * We need to dynamically change any variables now so that
558 * the setting of maxusers and pidmax propagate to the other
559 * variables that are dependent on them.
561 if (reserved_procs
== 0)
563 if (pidmax
< reserved_procs
|| pidmax
> MAX_MAXPID
)
569 * This allows platform-dependent code to constrain the maximum
570 * number of processes allowed in case there are e.g. VM limitations
571 * with how many contexts are available.
574 max_nprocs
= (10 + 16 * maxusers
);
575 if (platform_max_nprocs
> 0 && max_nprocs
> platform_max_nprocs
)
576 max_nprocs
= platform_max_nprocs
;
577 if (max_nprocs
> maxpid
)
581 maxuprc
= (max_nprocs
- reserved_procs
);
588 * Set each individual element of struct var v to be the
589 * default value. This is done this way
590 * so that a user can set the assigned integer value in the
591 * /etc/system file *IF* tuning is needed.
593 v
.v_proc
= max_nprocs
; /* v_proc - max # of processes system wide */
594 v
.v_maxupttl
= max_nprocs
- reserved_procs
;
595 v
.v_maxsyspri
= (int)maxclsyspri
; /* max global pri for sysclass */
596 v
.v_maxup
= MIN(maxuprc
, v
.v_maxupttl
); /* max procs per user */
597 v
.v_autoup
= autoup
; /* v_autoup - delay for delayed writes */
600 * Set each individual element of struct tune to be the
601 * default value. This is done this way
602 * so that a user can set the assigned integer value in the
603 * /etc/system file *IF* tuning is needed.
605 tune
.t_gpgslo
= tune_t_gpgslo
;
606 tune
.t_fsflushr
= tune_t_fsflushr
;
607 tune
.t_minarmem
= tune_t_minarmem
;
608 tune
.t_minasmem
= tune_t_minasmem
;
609 tune
.t_flckrec
= tune_t_flckrec
;
612 * Initialization for file descriptors to correct mistaken settings in
613 * /etc/system. Initialization of limits performed by resource control system.
616 if (rlim_fd_cur
> rlim_fd_max
)
617 rlim_fd_cur
= rlim_fd_max
;
620 * calculations needed if hz was set in /etc/system
625 tick_per_msec
= hz
/ MILLISEC
;
626 msec_per_tick
= MILLISEC
/ hz
;
627 usec_per_tick
= MICROSEC
/ hz
;
628 nsec_per_tick
= NANOSEC
/ hz
;
629 max_hres_adj
= nsec_per_tick
>> ADJ_SHIFT
;
632 * Consumers of relative timedwait functions must specify how accurately
633 * the given timeout must expire. This is currently TR_CLOCK_TICK for
634 * the vast majority of consumers, but nsec_per_tick becomes an
635 * artificial value in a tickless world. Each caller of such routines
636 * should re-evaluate their usage and specify the appropriate resolution.
639 time_res
[TR_NANOSEC
] = NANOSEC
/ NANOSEC
;
640 time_res
[TR_MICROSEC
] = NANOSEC
/ MICROSEC
;
641 time_res
[TR_MILLISEC
] = NANOSEC
/ MILLISEC
;
642 time_res
[TR_SEC
] = NANOSEC
/ SEC
;
643 time_res
[TR_CLOCK_TICK
] = nsec_per_tick
;
647 * Validate tuneable parameters following /etc/system processing,
648 * but prior to param_init().
654 if (physmem
!= original_physmem
) {
655 cmn_err(CE_NOTE
, "physmem cannot be modified to 0x%lx"
656 " via /etc/system. Please use eeprom(8) instead.",
658 physmem
= original_physmem
;
661 if (ngroups_max
< NGROUPS_UMIN
)
662 ngroups_max
= NGROUPS_UMIN
;
663 if (ngroups_max
> NGROUPS_UMAX
)
664 ngroups_max
= NGROUPS_UMAX
;
666 /* If we have many groups then the ucred proto message also grows. */
667 if (ngroups_max
> NGROUPS_OLDMAX
&&
668 strctlsz
< (ngroups_max
- NGROUPS_OLDMAX
) * sizeof (gid_t
) + 1024) {
669 strctlsz
= (ngroups_max
- NGROUPS_OLDMAX
) * sizeof (gid_t
) +
674 autoup
= DEFAULT_AUTOUP
;
675 cmn_err(CE_WARN
, "autoup <= 0; defaulting to %d", autoup
);
678 if (tune_t_fsflushr
<= 0) {
679 tune_t_fsflushr
= DEFAULT_TUNE_T_FSFLUSHR
;
680 cmn_err(CE_WARN
, "tune_t_fsflushr <= 0; defaulting to %d",
684 if (jump_pid
< 0 || jump_pid
>= pidmax
) {
686 cmn_err(CE_WARN
, "jump_pid < 0 or >= pidmax; ignored");
689 if (snoop_interval
< SNOOP_INTERVAL_MIN
) {
690 snoop_interval
= SNOOP_INTERVAL_DEFAULT
;
691 cmn_err(CE_WARN
, "snoop_interval < minimum (%d); defaulting"
692 " to %d", SNOOP_INTERVAL_MIN
, SNOOP_INTERVAL_DEFAULT
);