4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2013, Joyent, Inc. All rights reserved.
27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
28 /* All Rights Reserved */
30 #include <sys/types.h>
31 #include <sys/param.h>
32 #include <sys/sysmacros.h>
33 #include <sys/signal.h>
35 #include <sys/policy.h>
37 #include <sys/systm.h>
38 #include <sys/cpuvar.h>
40 #include <sys/vnode.h>
42 #include <sys/errno.h>
45 #include <sys/cmn_err.h>
47 #include <sys/tuneable.h>
48 #include <sys/class.h>
50 #include <sys/session.h>
51 #include <sys/ucontext.h>
52 #include <sys/stack.h>
53 #include <sys/procfs.h>
54 #include <sys/prsystm.h>
55 #include <sys/vmsystm.h>
56 #include <sys/vtrace.h>
57 #include <sys/debug.h>
58 #include <sys/shm_impl.h>
59 #include <sys/door_data.h>
64 #include <sys/schedctl.h>
65 #include <sys/utrap.h>
67 #include <sys/resource.h>
68 #include <sys/cyclic.h>
71 #include <sys/contract_impl.h>
72 #include <sys/contract/process_impl.h>
74 #include <sys/dtrace.h>
78 #include <sys/class.h>
79 #include <sys/corectl.h>
80 #include <sys/brand.h>
/* Common fork worker; defined below with parameters (isvfork, isfork1, flags). */
83 static int64_t cfork(int, int, int);
/*
 * Builds the child proc structure; the uint_t argument is tested against
 * the GETPROC_* values defined next.
 */
84 static int getproc(proc_t
**, pid_t
, uint_t
);
85 #define GETPROC_USER 0x0
86 #define GETPROC_KERNEL 0x1
/* Cleanup helpers for a fork that fails part-way through. */
88 static void fork_fail(proc_t
*);
89 static void forklwp_fail(proc_t
*);
/*
 * Adjusted with INCR_COUNT/DECR_COUNT around the delay() in getproc()'s
 * failure path, where it also scales the length of that delay.
 */
91 int fork_fail_pending
;
/* Cache from which getproc() allocates proc_t structures; defined elsewhere. */
93 extern struct kmem_cache
*process_cache
;
96 * The vfork() system call trap is no longer invoked by libc.
97 * It is retained only for the benefit of applications running
98 * within a solaris10 branded zone. It should be eliminated
99 * when we no longer support solaris10 branded zones.
/*
 * NOTE(review): the line naming this function is not visible in this
 * listing; the preceding comment indicates it is the vfork() trap -- confirm.
 * Sets t_post_sys on the calling thread, then returns cfork(1, 1, 0),
 * i.e. isvfork = 1, isfork1 = 1, flags = 0.
 */
104 curthread
->t_post_sys
= 1; /* so vfwait() will be called */
105 return (cfork(1, 1, 0));
109 * forksys system call - forkx, forkallx, vforkx. This is the
110 * interface invoked by libc for fork1(), forkall(), and vfork()
/*
 * Entry point for the forksys system call.
 *   subcode - selects the fork variant. The selecting statements are not
 *             visible in this listing; the trailing comments below label
 *             the three calls as forkx, forkallx and vforkx.
 *   flags   - passed through unchanged to cfork().
 * Returns the value of the chosen cfork() call, or set_errno(EINVAL)
 * cast to int64_t.
 */
113 forksys(int subcode
, int flags
)
117 return (cfork(0, 1, flags
)); /* forkx(flags) */
119 return (cfork(0, 0, flags
)); /* forkallx(flags) */
/* As in the vfork() trap, t_post_sys is set before the vfork-style call. */
121 curthread
->t_post_sys
= 1; /* so vfwait() will be called */
122 return (cfork(1, 1, flags
)); /* vforkx(flags) */
/* NOTE(review): presumably reached when subcode matches no variant -- confirm. */
124 return ((int64_t)set_errno(EINVAL
));
129 * Remove the associations of a child process from its parent and siblings.
/*
 * Unlink child cp from the lists kept on its parent pp.
 *   pp - the parent process
 *   cp - the child being removed
 * The caller must hold pidlock (asserted below).
 */
132 disown_proc(proc_t
*pp
, proc_t
*cp
)
136 ASSERT(MUTEX_HELD(&pidlock
));
/*
 * Walk pp's orphan list (linked through p_nextorph) to the slot that
 * points at cp, then splice cp out. The walk has no NULL check, so cp
 * is expected to be on the list.
 */
138 orphpp
= &pp
->p_orphan
;
139 while (*orphpp
!= cp
)
140 orphpp
= &(*orphpp
)->p_nextorph
;
141 *orphpp
= cp
->p_nextorph
;
/* If cp heads pp's child list, advance the head to cp's sibling. */
143 if (pp
->p_child
== cp
)
144 pp
->p_child
= cp
->p_sibling
;
/*
 * Remove cp from the doubly linked sibling chain.
 * NOTE(review): source lines 145 and 147 are absent from this listing;
 * presumably they guard these two stores against NULL neighbours -- confirm.
 */
146 cp
->p_sibling
->p_psibling
= cp
->p_psibling
;
148 cp
->p_psibling
->p_sibling
= cp
->p_sibling
;
/*
 * Common implementation behind the fork-family entry points.
 *   isvfork - callers pass 1 for the vfork variants and 0 otherwise
 *   isfork1 - selects SHOLDFORK1 rather than SHOLDFORK in holdlwps();
 *             callers pass 0 only for forkallx
 *   flags   - only FORK_NOSIGCHLD and FORK_WAITPID are accepted
 * The failure exit at the end returns set_errno(error) cast to int64_t.
 * NOTE(review): many lines of this function (local declarations, braces,
 * some conditions, labels and gotos) are missing from this listing, so the
 * comments added here describe only the statements that are visible.
 */
153 cfork(int isvfork
, int isfork1
, int flags
)
155 proc_t
*p
= ttoproc(curthread
);
165 rctl_alloc_gp_t
*dup_gp
;
172 * Allow only these two flags.
174 if ((flags
& ~(FORK_NOSIGCHLD
| FORK_WAITPID
)) != 0) {
176 atomic_inc_32(&curproc
->p_zone
->zone_ffmisc
);
181 * fork is not supported for the /proc agent lwp.
183 if (curthread
== p
->p_agenttp
) {
185 atomic_inc_32(&curproc
->p_zone
->zone_ffmisc
);
189 if ((error
= secpolicy_basic_fork(CRED())) != 0) {
190 atomic_inc_32(&p
->p_zone
->zone_ffmisc
);
195 * If the calling lwp is doing a fork1() then the
196 * other lwps in this process are not duplicated and
197 * don't need to be held where their kernel stacks can be
198 * cloned. If doing forkall(), the process is held with
199 * SHOLDFORK, so that the lwps are at a point where their
200 * stacks can be copied which is on entry or exit from
203 if (!holdlwps(isfork1
? SHOLDFORK1
: SHOLDFORK
)) {
206 atomic_inc_32(&p
->p_zone
->zone_ffmisc
);
211 mutex_enter(&p
->p_lock
);
213 * If this is vfork(), cancel any suspend request we might
214 * have gotten from some other thread via lwp_suspend().
215 * Otherwise we could end up with a deadlock on return
216 * from the vfork() in both the parent and the child.
219 curthread
->t_proc_flag
&= ~TP_HOLDLWP
;
221 * Prevent our resource set associations from being changed during fork.
223 pool_barrier_enter();
224 mutex_exit(&p
->p_lock
);
227 * Create a child proc struct. Place a VN_HOLD on appropriate vnodes.
/* getproc() returns -1 on failure, so a negative value means no child exists. */
229 if (getproc(&cp
, 0, GETPROC_USER
) < 0) {
230 mutex_enter(&p
->p_lock
);
233 mutex_exit(&p
->p_lock
);
238 TRACE_2(TR_FAC_PROC
, TR_PROC_FORK
, "proc_fork:cp %p p %p", cp
, p
);
241 * Assign an address space to child
245 * Clear any watched areas and remember the
246 * watched pages for restoring in vfwait().
249 if (avl_numnodes(&as
->a_wpage
) != 0) {
250 AS_LOCK_ENTER(as
, RW_WRITER
);
252 p
->p_wpage
= as
->a_wpage
;
253 avl_create(&as
->a_wpage
, wp_compare
,
254 sizeof (struct watched_page
),
255 offsetof(struct watched_page
, wp_link
));
259 cp
->p_flag
|= SVFORK
;
262 * Use the parent's shm segment list information for
263 * the child as it uses its address space till it execs.
265 cp
->p_segacct
= p
->p_segacct
;
268 * We need to hold P_PR_LOCK until the address space has
269 * been duplicated and we've had a chance to remove from the
270 * child any DTrace probes that were in the parent. Holding
271 * P_PR_LOCK prevents any new probes from being added and any
272 * extant probes from being removed.
274 mutex_enter(&p
->p_lock
);
276 p
->p_flag
|= SFORKING
;
277 mutex_exit(&p
->p_lock
);
279 error
= as_dup(p
->p_as
, cp
);
281 mutex_enter(&p
->p_lock
);
284 mutex_enter(&pidlock
);
286 mutex_enter(&cp
->p_lock
);
289 ASSERT(cp
->p_pool
->pool_ref
> 0);
290 atomic_dec_32(&cp
->p_pool
->pool_ref
);
291 mutex_exit(&cp
->p_lock
);
293 mutex_exit(&pidlock
);
296 mutex_enter(&p
->p_lock
);
297 p
->p_flag
&= ~SFORKING
;
300 mutex_exit(&p
->p_lock
);
302 * Preserve ENOMEM error condition but
303 * map all others to EAGAIN.
305 error
= (error
== ENOMEM
) ? ENOMEM
: EAGAIN
;
306 atomic_inc_32(&p
->p_zone
->zone_ffnomem
);
311 * Remove all DTrace tracepoints from the child process. We
312 * need to do this _before_ duplicating USDT providers since
313 * any associated probes may be immediately enabled.
315 if (p
->p_dtrace_count
> 0)
316 dtrace_fasttrap_fork(p
, cp
);
318 mutex_enter(&p
->p_lock
);
321 /* Duplicate parent's shared memory */
326 * Duplicate any helper actions and providers. The SFORKING
327 * we set above informs the code to enable USDT probes that
328 * sprlock() may fail because the child is being forked.
330 if (p
->p_dtrace_helpers
!= NULL
) {
331 ASSERT(dtrace_helpers_fork
!= NULL
);
332 (*dtrace_helpers_fork
)(p
, cp
);
335 mutex_enter(&p
->p_lock
);
336 p
->p_flag
&= ~SFORKING
;
337 mutex_exit(&p
->p_lock
);
341 * Duplicate parent's resource controls.
343 dup_set
= rctl_set_create();
345 dup_gp
= rctl_set_dup_prealloc(p
->p_rctls
);
346 mutex_enter(&p
->p_rctls
->rcs_lock
);
347 if (rctl_set_dup_ready(p
->p_rctls
, dup_gp
))
349 mutex_exit(&p
->p_rctls
->rcs_lock
);
350 rctl_prealloc_destroy(dup_gp
);
353 e
.rcep_t
= RCENTITY_PROCESS
;
354 cp
->p_rctls
= rctl_set_dup(p
->p_rctls
, p
, cp
, &e
, dup_set
, dup_gp
,
355 RCD_DUP
| RCD_CALLBACK
);
356 mutex_exit(&p
->p_rctls
->rcs_lock
);
358 rctl_prealloc_destroy(dup_gp
);
361 * Allocate the child's lwp directory and lwpid hash table.
366 cp
->p_lwpdir_sz
= p
->p_lwpdir_sz
;
367 cp
->p_lwpdir
= cp
->p_lwpfree
= ldp
=
368 kmem_zalloc(cp
->p_lwpdir_sz
* sizeof (lwpdir_t
), KM_SLEEP
);
/* Chain the directory slots into a free list via ld_next; the last slot stays zeroed. */
369 for (i
= 1; i
< cp
->p_lwpdir_sz
; i
++, ldp
++)
370 ldp
->ld_next
= ldp
+ 1;
371 cp
->p_tidhash_sz
= (cp
->p_lwpdir_sz
+ 2) / 2;
373 kmem_zalloc(cp
->p_tidhash_sz
* sizeof (tidhash_t
), KM_SLEEP
);
376 * Duplicate parent's lwps.
377 * Mutual exclusion is not needed because the process is
378 * in the hold state and only the current lwp is running.
380 klgrpset_clear(cp
->p_lgrpset
);
382 clone
= forklwp(ttolwp(curthread
), cp
, curthread
->t_tid
);
386 * Inherit only the lwp_wait()able flag,
387 * Daemon threads should not call fork1(), but oh well...
389 lwptot(clone
)->t_proc_flag
|=
390 (curthread
->t_proc_flag
& TP_TWAIT
);
392 /* this is forkall(), no one can be in lwp_wait() */
393 ASSERT(p
->p_lwpwait
== 0 && p
->p_lwpdwait
== 0);
394 /* for each entry in the parent's lwp directory... */
395 for (i
= 0, ldp
= p
->p_lwpdir
; i
< p
->p_lwpdir_sz
; i
++, ldp
++) {
399 if ((lep
= ldp
->ld_entry
) == NULL
)
402 if ((t
= lep
->le_thread
) != NULL
) {
403 clwp
= forklwp(ttolwp(t
), cp
, t
->t_tid
);
408 * Inherit lwp_wait()able and daemon flags.
411 (t
->t_proc_flag
& (TP_TWAIT
|TP_DAEMON
));
413 * Keep track of the clone of curthread to
414 * post return values through lwp_setrval().
415 * Mark other threads for special treatment
416 * by lwp_rtt() / post_syscall().
421 ct
->t_flag
|= T_FORKALL
;
424 * Replicate zombie lwps in the child.
426 clep
= kmem_zalloc(sizeof (*clep
), KM_SLEEP
);
427 clep
->le_lwpid
= lep
->le_lwpid
;
428 clep
->le_start
= lep
->le_start
;
429 lwp_hash_in(cp
, clep
,
430 cp
->p_tidhash
, cp
->p_tidhash_sz
, 0);
436 * Put new process in the parent's process contract, or put it
437 * in a new one if there is an active process template. Send a
438 * fork event (if requested) to whatever contract the child is
439 * a member of. Fails if the parent has been SIGKILLed.
441 if (contract_process_fork(NULL
, cp
, p
, B_TRUE
) == NULL
) {
442 atomic_inc_32(&p
->p_zone
->zone_ffmisc
);
447 * No fork failures occur beyond this point.
450 cp
->p_lwpid
= p
->p_lwpid
;
452 cp
->p_lwpdaemon
= p
->p_lwpdaemon
;
453 cp
->p_zombcnt
= p
->p_zombcnt
;
455 * If the parent's lwp ids have wrapped around, so have the
458 cp
->p_flag
|= p
->p_flag
& SLWPWRAP
;
461 mutex_enter(&p
->p_lock
);
462 corectl_path_hold(cp
->p_corefile
= p
->p_corefile
);
463 corectl_content_hold(cp
->p_content
= p
->p_content
);
464 mutex_exit(&p
->p_lock
);
467 * Duplicate process context ops, if any.
473 * If the child process has been marked to stop on exit
474 * from this fork, arrange for all other lwps to stop in
475 * sympathy with the active lwp.
477 if (PTOU(cp
)->u_systrap
&&
478 prismember(&PTOU(cp
)->u_exitmask
, curthread
->t_sysnum
)) {
479 mutex_enter(&cp
->p_lock
);
482 t
->t_proc_flag
|= TP_PRSTOP
;
483 aston(t
); /* so TP_PRSTOP will be seen */
484 } while ((t
= t
->t_forw
) != cp
->p_tlist
);
485 mutex_exit(&cp
->p_lock
);
488 * If the parent process has been marked to stop on exit
489 * from this fork, and its asynchronous-stop flag has not
490 * been set, arrange for all other lwps to stop before
491 * they return back to user level.
493 if (!(p
->p_proc_flag
& P_PR_ASYNC
) && PTOU(p
)->u_systrap
&&
494 prismember(&PTOU(p
)->u_exitmask
, curthread
->t_sysnum
)) {
495 mutex_enter(&p
->p_lock
);
498 t
->t_proc_flag
|= TP_PRSTOP
;
499 aston(t
); /* so TP_PRSTOP will be seen */
500 } while ((t
= t
->t_forw
) != p
->p_tlist
);
501 mutex_exit(&p
->p_lock
);
/*
 * Post (p->p_pid, 1) as the return values of the child's clone lwp,
 * through the brand hook when the parent is branded.
 * NOTE(review): the line between these two calls is not visible;
 * presumably it is an else -- confirm.
 */
504 if (PROC_IS_BRANDED(p
))
505 BROP(p
)->b_lwp_setrval(clone
, p
->p_pid
, 1);
507 lwp_setrval(clone
, p
->p_pid
, 1);
509 /* set return values for parent */
510 r
.r_val1
= (int)cp
->p_pid
;
514 * pool_barrier_exit() can now be called because the child process has:
515 * - all identifying features cloned or set (p_pid, p_task, p_pool)
516 * - all resource sets associated (p_tlist->*->t_cpupart, p_as->a_mset)
517 * - any other fields set which are used in resource set binding.
519 mutex_enter(&p
->p_lock
);
521 mutex_exit(&p
->p_lock
);
523 mutex_enter(&pidlock
);
524 mutex_enter(&cp
->p_lock
);
527 * Set flags telling the child what (not) to do on exit.
529 if (flags
& FORK_NOSIGCHLD
)
530 cp
->p_pidflag
|= CLDNOSIGCHLD
;
531 if (flags
& FORK_WAITPID
)
532 cp
->p_pidflag
|= CLDWAITPID
;
535 * Now that there are lwps and threads attached, add the new
536 * process to the process group.
538 pgjoin(cp
, p
->p_pgidp
);
541 * We are now done with all the lwps in the child process.
546 * Set the lwp_suspend()ed lwps running.
547 * They will suspend properly at syscall exit.
549 if (t
->t_proc_flag
& TP_HOLDLWP
)
552 /* set TS_CREATE to allow continuelwps() to work */
554 ASSERT(t
->t_state
== TS_STOPPED
&&
555 !(t
->t_schedflag
& (TS_CREATE
|TS_CSTART
)));
556 t
->t_schedflag
|= TS_CREATE
;
559 } while ((t
= t
->t_forw
) != cp
->p_tlist
);
560 mutex_exit(&cp
->p_lock
);
563 CPU_STATS_ADDQ(CPU
, sys
, sysvfork
, 1);
564 mutex_enter(&p
->p_lock
);
565 p
->p_flag
|= SVFWAIT
;
566 curthread
->t_flag
|= T_VFPARENT
;
567 DTRACE_PROC1(create
, proc_t
*, cp
);
568 cv_broadcast(&pr_pid_cv
[p
->p_slot
]); /* inform /proc */
569 mutex_exit(&p
->p_lock
);
571 * Grab child's p_lock before dropping pidlock to ensure
572 * the process will not disappear before we set it running.
574 mutex_enter(&cp
->p_lock
);
575 mutex_exit(&pidlock
);
578 mutex_exit(&cp
->p_lock
);
580 CPU_STATS_ADDQ(CPU
, sys
, sysfork
, 1);
581 DTRACE_PROC1(create
, proc_t
*, cp
);
583 * It is CL_FORKRET's job to drop pidlock.
584 * If we do it here, the process could be set running
585 * and disappear before CL_FORKRET() is called.
587 CL_FORKRET(curthread
, cp
->p_tlist
);
588 schedctl_set_cidpri(curthread
);
589 ASSERT(MUTEX_NOT_HELD(&pidlock
));
596 if (avl_numnodes(&p
->p_wpage
) != 0) {
597 /* restore watchpoints to parent */
599 AS_LOCK_ENTER(as
, RW_WRITER
);
600 as
->a_wpage
= p
->p_wpage
;
601 avl_create(&p
->p_wpage
, wp_compare
,
602 sizeof (struct watched_page
),
603 offsetof(struct watched_page
, wp_link
));
/*
 * Free every lwp directory entry present in the child, then the directory
 * array itself and the tid hash table.
 */
616 for (i
= 0, ldp
= cp
->p_lwpdir
; i
< cp
->p_lwpdir_sz
; i
++, ldp
++)
617 if ((lep
= ldp
->ld_entry
) != NULL
)
618 kmem_free(lep
, sizeof (*lep
));
619 kmem_free(cp
->p_lwpdir
,
620 cp
->p_lwpdir_sz
* sizeof (*cp
->p_lwpdir
));
623 cp
->p_lwpfree
= NULL
;
627 kmem_free(cp
->p_tidhash
,
628 cp
->p_tidhash_sz
* sizeof (*cp
->p_tidhash
));
629 cp
->p_tidhash
= NULL
;
630 cp
->p_tidhash_sz
= 0;
634 if (cp
->p_dtrace_helpers
!= NULL
) {
635 ASSERT(dtrace_helpers_cleanup
!= NULL
);
636 (*dtrace_helpers_cleanup
)(cp
);
638 rctl_set_free(cp
->p_rctls
);
639 mutex_enter(&pidlock
);
642 * Detach failed child from task.
644 mutex_enter(&cp
->p_lock
);
647 ASSERT(cp
->p_pool
->pool_ref
> 0);
648 atomic_dec_32(&cp
->p_pool
->pool_ref
);
649 mutex_exit(&cp
->p_lock
);
653 mutex_exit(&pidlock
);
657 mutex_enter(&p
->p_lock
);
660 mutex_exit(&p
->p_lock
);
/* Failure exit: return set_errno(error), widened to int64_t. */
663 return ((int64_t)set_errno(error
));
667 * Free allocated resources from getproc() if a fork failed.
/*
 * Release what was acquired for child cp when a fork fails.
 *   cp - the partially constructed child process
 * Visible steps: call sigdelq(cp, NULL, 0), call upcount_dec() for the
 * child's real uid and zone id under pidlock, free the fi_list array,
 * release vnode and refstr holds, and clear the brand if cp is branded.
 * NOTE(review): the u_cdir/u_rdir/u_cwd releases go through PTOU(curproc),
 * not PTOU(cp). They appear to undo the holds getproc() takes on
 * PTOU(pp) -- confirm curproc is always that parent here.
 */
670 fork_fail(proc_t
*cp
)
672 uf_info_t
*fip
= P_FINFO(cp
);
675 sigdelq(cp
, NULL
, 0);
677 mutex_enter(&pidlock
);
678 upcount_dec(crgetruid(cp
->p_cred
), crgetzoneid(cp
->p_cred
));
679 mutex_exit(&pidlock
);
682 * single threaded, so no locking needed here
686 kmem_free(fip
->fi_list
, fip
->fi_nfiles
* sizeof (uf_entry_t
));
688 VN_RELE(PTOU(curproc
)->u_cdir
);
689 if (PTOU(curproc
)->u_rdir
)
690 VN_RELE(PTOU(curproc
)->u_rdir
);
694 VN_RELE(cp
->p_execdir
);
695 if (PTOU(curproc
)->u_cwd
)
696 refstr_rele(PTOU(curproc
)->u_cwd
);
697 if (PROC_IS_BRANDED(cp
)) {
698 brand_clearbrand(cp
, B_TRUE
);
703 * Clean up the lwps already created for this child process.
704 * The fork failed while duplicating all the lwps of the parent
705 * and those lwps already created must be freed.
706 * This process is invisible to the rest of the system,
707 * so we don't need to hold p->p_lock to protect the list.
/*
 * Tear down the threads already attached to process p.
 *   p - the process whose p_tlist is to be emptied
 * Loops until p->p_tlist is NULL. For each thread t it unlinks t from the
 * p_tlist ring, decrements the project and zone lwp counts under
 * zone_nlwps_lock, frees door data, clears contract templates, removes t
 * from the all-threads list under pidlock, and sets its state to TS_FREE.
 * NOTE(review): several statements of this loop are missing from this
 * listing, so the description covers only what is visible.
 */
710 forklwp_fail(proc_t
*p
)
716 if (PROC_IS_BRANDED(p
))
719 while ((t
= p
->p_tlist
) != NULL
) {
721 * First remove the lwp from the process's p_tlist.
724 p
->p_tlist
= t
->t_forw
;
728 t
->t_forw
->t_back
= t
->t_back
;
729 t
->t_back
->t_forw
= t
->t_forw
;
/* Both lwp counters below are updated while holding zone_nlwps_lock. */
732 mutex_enter(&p
->p_zone
->zone_nlwps_lock
);
734 tk
->tk_proj
->kpj_nlwps
--;
735 p
->p_zone
->zone_nlwps
--;
736 mutex_exit(&p
->p_zone
->zone_nlwps_lock
);
738 ASSERT(t
->t_schedctl
== NULL
);
741 BROP(p
)->b_freelwp(ttolwp(t
));
743 if (t
->t_door
!= NULL
) {
744 kmem_free(t
->t_door
, sizeof (door_data_t
));
747 lwp_ctmpl_clear(ttolwp(t
));
750 * Remove the thread from the all threads list.
751 * We need to hold pidlock for this.
753 mutex_enter(&pidlock
);
754 t
->t_next
->t_prev
= t
->t_prev
;
755 t
->t_prev
->t_next
= t
->t_next
;
756 CL_EXIT(t
); /* tell the scheduler that we're exiting */
757 cv_broadcast(&t
->t_joincv
); /* tell anyone in thread_join */
758 mutex_exit(&pidlock
);
761 * Let the lgroup load averages know that this thread isn't
762 * going to show up (i.e. un-do what was done on behalf of
763 * this thread by the earlier lgrp_move_thread()).
766 lgrp_move_thread(t
, NULL
, 1);
770 * The thread was created TS_STOPPED.
771 * We change it to TS_FREE to avoid an
772 * ASSERT() panic in thread_free().
774 t
->t_state
= TS_FREE
;
/* Address space object defined elsewhere; later code compares a process's p_as against &kas. */
780 extern struct as kas
;
783 * fork a kernel process.
/*
 * Create a new process that starts executing pc(arg).
 *   pc, arg - entry point and argument handed to the lwp creation call
 *   cid     - scheduling class id; must not be sysdccid (asserted)
 *   pri     - priority handed to the lwp creation call
 *   ct      - when used, receives the address of the new process
 *             contract's conp_contract; must be NULL if cid is syscid
 *             (asserted)
 * NOTE(review): the rest of the parameter list, the branch structure and
 * the return statements are not visible in this listing. pid is used
 * below and is presumably one of the missing parameters -- confirm.
 */
786 newproc(void (*pc
)(), caddr_t arg
, id_t cid
, int pri
, struct contract
**ct
,
792 cont_process_t
*ctp
= NULL
;
795 ASSERT(cid
!= sysdccid
);
796 ASSERT(cid
!= syscid
|| ct
== NULL
);
/* Kernel-class case: getproc() is called with GETPROC_KERNEL and the lwp comes from lwp_kernel_create(). */
797 if (CLASS_KERNEL(cid
)) {
798 rctl_alloc_gp_t
*init_gp
;
799 rctl_set_t
*init_set
;
803 if (getproc(&p
, pid
, GETPROC_KERNEL
) < 0)
807 * Release the hold on the p_exec and p_execdir, these
808 * were acquired in getproc()
810 if (p
->p_execdir
!= NULL
)
811 VN_RELE(p
->p_execdir
);
812 if (p
->p_exec
!= NULL
)
814 p
->p_flag
|= SNOWAIT
;
818 init_set
= rctl_set_create();
819 init_gp
= rctl_set_init_prealloc(RCENTITY_PROCESS
);
822 * kernel processes do not inherit /proc tracing flags.
824 sigemptyset(&p
->p_sigmask
);
825 premptyset(&p
->p_fltmask
);
828 premptyset(&(up
->u_entrymask
));
829 premptyset(&(up
->u_exitmask
));
830 mutex_enter(&p
->p_lock
);
832 e
.rcep_t
= RCENTITY_PROCESS
;
833 p
->p_rctls
= rctl_set_init(RCENTITY_PROCESS
, p
, &e
, init_set
,
835 mutex_exit(&p
->p_lock
);
837 rctl_prealloc_destroy(init_gp
);
839 t
= lwp_kernel_create(p
, pc
, arg
, TS_STOPPED
, pri
);
/*
 * NOTE(review): presumably the non-kernel-class branch begins here; the
 * brace/else lines are not visible. This path uses GETPROC_USER, creates
 * a new task, and obtains its lwp from lwp_create().
 */
841 rctl_alloc_gp_t
*init_gp
, *default_gp
;
842 rctl_set_t
*init_set
;
846 if (getproc(&p
, pid
, GETPROC_USER
) < 0)
849 * init creates a new task, distinct from the task
850 * containing kernel "processes".
852 tk
= task_create(0, p
->p_zone
);
853 mutex_enter(&tk
->tk_zone
->zone_nlwps_lock
);
854 tk
->tk_proj
->kpj_ntasks
++;
856 mutex_exit(&tk
->tk_zone
->zone_nlwps_lock
);
858 default_gp
= rctl_rlimit_set_prealloc(RLIM_NLIMITS
);
859 init_gp
= rctl_set_init_prealloc(RCENTITY_PROCESS
);
860 init_set
= rctl_set_create();
862 mutex_enter(&pidlock
);
863 mutex_enter(&p
->p_lock
);
864 tk_old
= p
->p_task
; /* switch to new task */
868 mutex_exit(&pidlock
);
870 mutex_enter(&tk_old
->tk_zone
->zone_nlwps_lock
);
872 mutex_exit(&tk_old
->tk_zone
->zone_nlwps_lock
);
875 e
.rcep_t
= RCENTITY_PROCESS
;
876 p
->p_rctls
= rctl_set_init(RCENTITY_PROCESS
, p
, &e
, init_set
,
878 rctlproc_default_init(p
, default_gp
);
879 mutex_exit(&p
->p_lock
);
882 rctl_prealloc_destroy(default_gp
);
883 rctl_prealloc_destroy(init_gp
);
885 if ((lwp
= lwp_create(pc
, arg
, 0, p
, TS_STOPPED
, pri
,
886 &curthread
->t_hold
, cid
, 1)) == NULL
) {
/* lwp_create() returned NULL: unlink the new process from its parent and drop its pool reference. */
890 mutex_enter(&pidlock
);
891 disown_proc(p
->p_parent
, p
);
893 mutex_enter(&p
->p_lock
);
896 ASSERT(p
->p_pool
->pool_ref
> 0);
897 atomic_add_32(&p
->p_pool
->pool_ref
, -1);
898 mutex_exit(&p
->p_lock
);
901 mutex_exit(&pidlock
);
907 ctp
= contract_process_fork(sys_process_tmpl
, p
, curproc
,
911 *ct
= &ctp
->conp_contract
;
914 ASSERT3U(t
->t_tid
, ==, 1);
/* Join the parent's process group and clear TP_HOLDLWP on the new thread, under pidlock and p_lock. */
916 mutex_enter(&pidlock
);
917 pgjoin(p
, p
->p_parent
->p_pgidp
);
919 mutex_enter(&p
->p_lock
);
920 t
->t_proc_flag
&= ~TP_HOLDLWP
;
922 mutex_exit(&p
->p_lock
);
923 mutex_exit(&pidlock
);
928 * create a child proc struct.
/*
 * Allocate and initialize a new proc_t as a child of the caller (or of p0).
 *   cpp   - NOTE(review): presumably receives the new proc_t; the store
 *           through cpp is not visible in this listing -- confirm
 *   pid   - passed to pid_allocate()
 *   flags - GETPROC_KERNEL makes p0 the parent, binds the child to
 *           pool_default and attaches it to task0p; otherwise the parent
 *           is curproc and its pool and task are inherited
 * Returns -1 on failure; the success return is not visible here.
 */
931 getproc(proc_t
**cpp
, pid_t pid
, uint_t flags
)
945 if (zone_status_get(curproc
->p_zone
) >= ZONE_IS_SHUTTING_DOWN
)
946 return (-1); /* no point in starting new processes */
/* Kernel processes are parented to p0; all others to the calling process. */
948 pp
= (flags
& GETPROC_KERNEL
) ? &p0
: curproc
;
950 proj
= task
->tk_proj
;
/* Test the task, project and zone process-count controls under the parent's p_lock and zone_nlwps_lock. */
953 mutex_enter(&pp
->p_lock
);
954 mutex_enter(&zone
->zone_nlwps_lock
);
955 if (proj
!= proj0p
) {
956 if (task
->tk_nprocs
>= task
->tk_nprocs_ctl
)
957 if (rctl_test(rc_task_nprocs
, task
->tk_rctls
,
958 pp
, 1, 0) & RCT_DENY
)
961 if (proj
->kpj_nprocs
>= proj
->kpj_nprocs_ctl
)
962 if (rctl_test(rc_project_nprocs
, proj
->kpj_rctls
,
963 pp
, 1, 0) & RCT_DENY
)
966 if (zone
->zone_nprocs
>= zone
->zone_nprocs_ctl
)
967 if (rctl_test(rc_zone_nprocs
, zone
->zone_rctls
,
968 pp
, 1, 0) & RCT_DENY
)
972 mutex_exit(&zone
->zone_nlwps_lock
);
973 mutex_exit(&pp
->p_lock
);
974 atomic_inc_32(&zone
->zone_ffcap
);
981 mutex_exit(&zone
->zone_nlwps_lock
);
982 mutex_exit(&pp
->p_lock
);
/* The proc_t comes from process_cache and is zeroed before any field is set. */
984 cp
= kmem_cache_alloc(process_cache
, KM_SLEEP
);
985 bzero(cp
, sizeof (proc_t
));
988 * Make proc entry for child process
990 mutex_init(&cp
->p_splock
, NULL
, MUTEX_DEFAULT
, NULL
);
991 mutex_init(&cp
->p_crlock
, NULL
, MUTEX_DEFAULT
, NULL
);
992 mutex_init(&cp
->p_pflock
, NULL
, MUTEX_DEFAULT
, NULL
);
994 mutex_init(&cp
->p_ldtlock
, NULL
, MUTEX_DEFAULT
, NULL
);
996 mutex_init(&cp
->p_maplock
, NULL
, MUTEX_DEFAULT
, NULL
);
998 cp
->p_mstart
= gethrtime();
1001 * p_zone must be set before we call pid_allocate since the process
1002 * will be visible after that and code such as prfind_zone will
1003 * look at the p_zone field.
1005 cp
->p_zone
= pp
->p_zone
;
1006 cp
->p_t1_lgrpid
= LGRP_NONE
;
1007 cp
->p_tr_lgrpid
= LGRP_NONE
;
1009 if ((newpid
= pid_allocate(cp
, pid
, PID_ALLOC_PROC
)) == -1) {
1010 if (nproc
== v
.v_proc
) {
1011 CPU_STATS_ADDQ(CPU
, sys
, procovf
, 1);
1012 cmn_err(CE_WARN
, "out of processes");
/* Snapshot the parent's p_exec and p_execdir while holding its p_lock. */
1017 mutex_enter(&pp
->p_lock
);
1018 cp
->p_exec
= pp
->p_exec
;
1019 cp
->p_execdir
= pp
->p_execdir
;
1020 mutex_exit(&pp
->p_lock
);
1023 VN_HOLD(cp
->p_exec
);
1025 * Each fop_open() must be paired with a corresponding
1026 * fop_close(). In this case, the executable will be
1027 * closed for the child in either proc_exit() or gexec().
1029 if (fop_open(&cp
->p_exec
, FREAD
, CRED(), NULL
) != 0) {
1030 VN_RELE(cp
->p_exec
);
1031 cp
->p_exec
= NULLVP
;
1032 cp
->p_execdir
= NULLVP
;
1037 VN_HOLD(cp
->p_execdir
);
1040 * If not privileged make sure that this user hasn't exceeded
1041 * v.v_maxup processes, and that users collectively haven't
1042 * exceeded v.v_maxupttl processes.
1044 mutex_enter(&pidlock
);
1045 ASSERT(nproc
< v
.v_proc
); /* otherwise how'd we get our pid? */
1047 ruid
= crgetruid(cr
);
1048 zoneid
= crgetzoneid(cr
);
1049 if (nproc
>= v
.v_maxup
&& /* short-circuit; usually false */
1050 (nproc
>= v
.v_maxupttl
||
1051 upcount_get(ruid
, zoneid
) >= v
.v_maxup
) &&
1052 secpolicy_newproc(cr
) != 0) {
1053 mutex_exit(&pidlock
);
1054 zcmn_err(zoneid
, CE_NOTE
,
1055 "out of per-user processes for uid %d", ruid
);
1060 * Everything is cool, put the new proc on the active process list.
1061 * It is already on the pid list and in /proc.
1062 * Increment the per uid process count (upcount).
1065 upcount_inc(ruid
, zoneid
);
1067 cp
->p_next
= practive
;
1068 practive
->p_prev
= cp
;
1071 cp
->p_ignore
= pp
->p_ignore
;
1072 cp
->p_siginfo
= pp
->p_siginfo
;
/* Only the SJCTL, SNOWAIT and SNOCD bits of the parent's p_flag are copied. */
1073 cp
->p_flag
= pp
->p_flag
& (SJCTL
|SNOWAIT
|SNOCD
);
1074 cp
->p_sessp
= pp
->p_sessp
;
1076 cp
->p_brand
= pp
->p_brand
;
1077 if (PROC_IS_BRANDED(pp
))
1078 BROP(pp
)->b_copy_procdata(cp
, pp
);
1079 cp
->p_bssbase
= pp
->p_bssbase
;
1080 cp
->p_brkbase
= pp
->p_brkbase
;
1081 cp
->p_brksize
= pp
->p_brksize
;
1082 cp
->p_brkpageszc
= pp
->p_brkpageszc
;
1083 cp
->p_stksize
= pp
->p_stksize
;
1084 cp
->p_stkpageszc
= pp
->p_stkpageszc
;
1085 cp
->p_stkprot
= pp
->p_stkprot
;
1086 cp
->p_datprot
= pp
->p_datprot
;
1087 cp
->p_usrstack
= pp
->p_usrstack
;
1088 cp
->p_model
= pp
->p_model
;
1089 cp
->p_ppid
= pp
->p_pid
;
1090 cp
->p_ancpid
= pp
->p_pid
;
1091 cp
->p_portcnt
= pp
->p_portcnt
;
1093 * Security flags are preserved on fork, the inherited copy come into
1096 cp
->p_secflags
= pp
->p_secflags
;
1099 * Initialize watchpoint structures
1101 avl_create(&cp
->p_warea
, wa_compare
, sizeof (struct watched_area
),
1102 offsetof(struct watched_area
, wa_link
));
1105 * Initialize immediate resource control values.
1107 cp
->p_stk_ctl
= pp
->p_stk_ctl
;
1108 cp
->p_fsz_ctl
= pp
->p_fsz_ctl
;
1109 cp
->p_vmem_ctl
= pp
->p_vmem_ctl
;
1110 cp
->p_fno_ctl
= pp
->p_fno_ctl
;
1113 * Link up to parent-child-sibling chain. No need to lock
1114 * in general since only a call to freeproc() (done by the
1115 * same parent as newproc()) diddles with the child chain.
1117 cp
->p_sibling
= pp
->p_child
;
1119 pp
->p_child
->p_psibling
= cp
;
1124 cp
->p_child_ns
= NULL
;
1125 cp
->p_sibling_ns
= NULL
;
1127 cp
->p_nextorph
= pp
->p_orphan
;
1128 cp
->p_nextofkin
= pp
;
1132 * Inherit profiling state; do not inherit REALPROF profiling state.
1134 cp
->p_prof
= pp
->p_prof
;
1135 cp
->p_rprof_cyclic
= CYCLIC_NONE
;
1138 * Inherit pool pointer from the parent. Kernel processes are
1139 * always bound to the default pool.
1141 mutex_enter(&pp
->p_lock
);
1142 if (flags
& GETPROC_KERNEL
) {
1143 cp
->p_pool
= pool_default
;
1146 cp
->p_pool
= pp
->p_pool
;
1148 atomic_inc_32(&cp
->p_pool
->pool_ref
);
1149 mutex_exit(&pp
->p_lock
);
1152 * Add the child process to the current task. Kernel processes
1153 * are always attached to task0.
1155 mutex_enter(&cp
->p_lock
);
1156 if (flags
& GETPROC_KERNEL
)
1157 task_attach(task0p
, cp
);
1159 task_attach(pp
->p_task
, cp
);
1160 mutex_exit(&cp
->p_lock
);
1161 mutex_exit(&pidlock
);
1163 avl_create(&cp
->p_ct_held
, contract_compar
, sizeof (contract_t
),
1164 offsetof(contract_t
, ct_ctlist
));
1167 * Duplicate any audit information kept in the process table
1169 if (audit_active
) /* copy audit data to cp */
1172 crhold(cp
->p_cred
= cr
);
1175 * Bump up the counts on the file structures pointed at by the
1176 * parent's file table since the child will point at them too.
1178 fcnt_add(P_FINFO(pp
), 1);
1180 if (PTOU(pp
)->u_cdir
) {
1181 VN_HOLD(PTOU(pp
)->u_cdir
);
1185 * We must be at or before vfs_mountroot(); it will take care of
1186 * assigning our current directory.
1189 if (PTOU(pp
)->u_rdir
)
1190 VN_HOLD(PTOU(pp
)->u_rdir
);
1191 if (PTOU(pp
)->u_cwd
)
1192 refstr_hold(PTOU(pp
)->u_cwd
);
1195 * copy the parent's uarea.
1198 bcopy(PTOU(pp
), uarea
, sizeof (*uarea
));
1199 flist_fork(P_FINFO(pp
), P_FINFO(cp
));
/* Fields set after the bcopy() replace the values copied from the parent. */
1201 gethrestime(&uarea
->u_start
);
1202 uarea
->u_ticks
= ddi_get_lbolt();
1203 uarea
->u_mem
= rm_asrss(pp
->p_as
);
1204 uarea
->u_acflag
= AFORK
;
1207 * If inherit-on-fork, copy /proc tracing flags to child.
1209 if ((pp
->p_proc_flag
& P_PR_FORK
) != 0) {
1210 cp
->p_proc_flag
|= pp
->p_proc_flag
& (P_PR_TRACE
|P_PR_FORK
);
1211 cp
->p_sigmask
= pp
->p_sigmask
;
1212 cp
->p_fltmask
= pp
->p_fltmask
;
1214 sigemptyset(&cp
->p_sigmask
);
1215 premptyset(&cp
->p_fltmask
);
1216 uarea
->u_systrap
= 0;
1217 premptyset(&uarea
->u_entrymask
);
1218 premptyset(&uarea
->u_exitmask
);
1221 * If microstate accounting is being inherited, mark child
1223 if ((pp
->p_flag
& SMSFORK
) != 0)
1224 cp
->p_flag
|= pp
->p_flag
& (SMSFORK
|SMSACCT
);
1227 * Inherit fixalignment flag from the parent
1229 cp
->p_fixalignment
= pp
->p_fixalignment
;
/*
 * Failure path: destroy the child's mutexes, release its pid entry,
 * return the proc_t to process_cache and decrement the process counts.
 */
1235 ASSERT(MUTEX_NOT_HELD(&pidlock
));
1237 mutex_destroy(&cp
->p_crlock
);
1238 mutex_destroy(&cp
->p_pflock
);
1240 mutex_destroy(&cp
->p_ldtlock
);
1243 proc_entry_free(cp
->p_pidp
);
1244 (void) pid_rele(cp
->p_pidp
);
1246 kmem_cache_free(process_cache
, cp
);
1248 mutex_enter(&zone
->zone_nlwps_lock
);
1251 zone
->zone_nprocs
--;
1252 mutex_exit(&zone
->zone_nlwps_lock
);
1253 atomic_inc_32(&zone
->zone_ffnoproc
);
1257 * We most likely got into this situation because some process is
1258 * forking out of control. As punishment, put it to sleep for a
1259 * bit so it can't eat the machine alive. Sleep interval is chosen
1260 * to allow no more than one fork failure per cpu per clock tick
1261 * on average (yes, I just made this up). This has two desirable
1262 * properties: (1) it sets a constant limit on the fork failure
1263 * rate, and (2) the busier the system is, the harsher the penalty
1264 * for abusing it becomes.
1266 INCR_COUNT(&fork_fail_pending
, &pidlock
);
1267 delay(fork_fail_pending
/ ncpus
+ 1);
1268 DECR_COUNT(&fork_fail_pending
, &pidlock
);
1270 return (-1); /* out of memory or proc slots */
1274 * Release virtual memory.
1275 * In the case of vfork(), the child was given exclusive access to its
1276 * parent's address space. The parent is waiting in vfwait() for the
1277 * child to release its exclusive claim via relvm().
/*
 * NOTE(review): the line naming this function is not visible in this
 * listing; the surrounding comments refer to it as relvm() -- confirm.
 * Operates on curproc, which must have at most one lwp (asserted).
 * When the process is a vfork child (SVFORK set), the visible code clears
 * SVFORK, copies the brk/stack sizes and p_segacct back to the parent,
 * clears the parent's SVFWAIT, restores the parent's watched pages and
 * signals the parent's p_cv, all with both p_locks held.
 * A later section runs only when p_as is not &kas and takes p_lock around
 * steps that are not visible here.
 */
1282 proc_t
*p
= curproc
;
1284 ASSERT((unsigned)p
->p_lwpcnt
<= 1);
1286 prrelvm(); /* inform /proc */
1288 if (p
->p_flag
& SVFORK
) {
1289 proc_t
*pp
= p
->p_parent
;
1291 * The child process is either exec'ing or exit'ing.
1292 * The child is now separated from the parent's address
1293 * space. The parent process is made dispatchable.
1295 * This is a delicate locking maneuver, involving
1296 * both the parent's p_lock and the child's p_lock.
1297 * As soon as the SVFORK flag is turned off, the
1298 * parent is free to run, but it must not run until
1299 * we wake it up using its p_cv because it might
1300 * exit and we would be referencing invalid memory.
1301 * Therefore, we hold the parent with its p_lock
1302 * while protecting our p_flags with our own p_lock.
1305 mutex_enter(&p
->p_lock
); /* grab child's lock first */
1306 prbarrier(p
); /* make sure /proc is blocked out */
1307 mutex_enter(&pp
->p_lock
);
1310 * Check if parent is locked by /proc.
1312 if (pp
->p_proc_flag
& P_PR_LOCK
) {
1314 * Delay until /proc is done with the parent.
1315 * We must drop our (the child's) p->p_lock, wait
1316 * via prbarrier() on the parent, then start over.
1318 mutex_exit(&p
->p_lock
);
1320 mutex_exit(&pp
->p_lock
);
1323 p
->p_flag
&= ~SVFORK
;
1328 * notify hat of change in thread's address space
1330 hat_thread_exit(curthread
);
1334 * child sizes are copied back to parent because
1335 * child may have grown.
1337 pp
->p_brkbase
= p
->p_brkbase
;
1338 pp
->p_brksize
= p
->p_brksize
;
1339 pp
->p_stksize
= p
->p_stksize
;
1342 * Copy back the shm accounting information
1343 * to the parent process.
1345 pp
->p_segacct
= p
->p_segacct
;
1346 p
->p_segacct
= NULL
;
1349 * The parent is no longer waiting for the vfork()d child.
1350 * Restore the parent's watched pages, if any. This is
1351 * safe because we know the parent is not locked by /proc
1353 pp
->p_flag
&= ~SVFWAIT
;
1354 if (avl_numnodes(&pp
->p_wpage
) != 0) {
1355 pp
->p_as
->a_wpage
= pp
->p_wpage
;
1356 avl_create(&pp
->p_wpage
, wp_compare
,
1357 sizeof (struct watched_page
),
1358 offsetof(struct watched_page
, wp_link
));
/* Wake the parent, then release its lock before the child's own. */
1360 cv_signal(&pp
->p_cv
);
1361 mutex_exit(&pp
->p_lock
);
1362 mutex_exit(&p
->p_lock
);
1364 if (p
->p_as
!= &kas
) {
1371 * We grab p_lock for the benefit of /proc
1374 mutex_enter(&p
->p_lock
);
1375 prbarrier(p
); /* make sure /proc is blocked out */
1378 mutex_exit(&p
->p_lock
);
1381 * notify hat of change in thread's address space
1383 hat_thread_exit(curthread
);
1387 p
->p_tr_lgrpid
= LGRP_NONE
;
1393 * Wait for child to exec or exit.
1394 * Called by parent of vfork'ed process.
1395 * See important comments in relvm(), above.
/*
 * NOTE(review): the line naming this function is not visible in this
 * listing; the surrounding comments refer to it as vfwait() -- confirm.
 * Runs in the parent (pp is the calling thread's process). The visible
 * code stops waiting when the child is gone or no longer pp's child, or
 * when the child's SVFORK flag is clear; otherwise it waits on pp->p_cv.
 * Afterwards it restores watchpoints when pr_watch_active(pp) is true and
 * calls prbarrier(pp) with p_lock held.
 * NOTE(review): the loop construct, the lookup that sets cp, and the
 * handling of the signalled flag are not visible here.
 */
1401 proc_t
*pp
= ttoproc(curthread
);
1405 * Wait for child to exec or exit.
1408 mutex_enter(&pidlock
);
1410 if (cp
== NULL
|| cp
->p_parent
!= pp
) {
1412 * Child has exit()ed.
1414 mutex_exit(&pidlock
);
1418 * Grab the child's p_lock before releasing pidlock.
1419 * Otherwise, the child could exit and we would be
1420 * referencing invalid memory.
1422 mutex_enter(&cp
->p_lock
);
1423 mutex_exit(&pidlock
);
1424 if (!(cp
->p_flag
& SVFORK
)) {
1426 * Child has exec()ed or is exit()ing.
1428 mutex_exit(&cp
->p_lock
);
/* Take the parent's p_lock before dropping the child's. */
1431 mutex_enter(&pp
->p_lock
);
1432 mutex_exit(&cp
->p_lock
);
1434 * We might be waked up spuriously from the cv_wait().
1435 * We have to do the whole operation over again to be
1436 * sure the child's SVFORK flag really is turned off.
1437 * We cannot make reference to the child because it can
1438 * exit before we return and we would be referencing
1441 * Because this is potentially a very long-term wait,
1442 * we call cv_wait_sig() (for its jobcontrol and /proc
1443 * side-effects) unless there is a current signal, in
1444 * which case we use cv_wait() because we cannot return
1445 * from this function until the child has released the
1446 * address space. Calling cv_wait_sig() with a current
1447 * signal would lead to an indefinite loop here because
1448 * cv_wait_sig() returns immediately in this case.
1451 cv_wait(&pp
->p_cv
, &pp
->p_lock
);
/* signalled becomes nonzero when cv_wait_sig() returns zero. */
1453 signalled
= !cv_wait_sig(&pp
->p_cv
, &pp
->p_lock
);
1454 mutex_exit(&pp
->p_lock
);
1457 /* restore watchpoints to parent */
1458 if (pr_watch_active(pp
)) {
1459 struct as
*as
= pp
->p_as
;
1460 AS_LOCK_ENTER(as
, RW_WRITER
);
1465 mutex_enter(&pp
->p_lock
);
1466 prbarrier(pp
); /* barrier against /proc locking */
1468 mutex_exit(&pp
->p_lock
);