/*	$NetBSD: kern_exec.c,v 1.292 2009/12/10 14:13:54 matt Exp $	*/

/*-
 * Copyright (c) 2008 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (C) 1993, 1994, 1996 Christopher G. Demetriou
 * Copyright (C) 1992 Wolfgang Solfrank.
 * Copyright (C) 1992 TooLs GmbH.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by TooLs GmbH.
 * 4. The name of TooLs GmbH may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_exec.c,v 1.292 2009/12/10 14:13:54 matt Exp $");

#include "opt_ktrace.h"
#include "opt_modular.h"
#include "opt_syscall_debug.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/mount.h>
#include <sys/malloc.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/ktrace.h>
#include <sys/uidinfo.h>
#include <sys/signalvar.h>
#include <sys/syscall.h>
#include <sys/kauth.h>
#include <sys/lwpctl.h>
#include <sys/module.h>
#include <sys/savar.h>
#include <sys/syscallvar.h>
#include <sys/syscallargs.h>
#if NVERIEXEC > 0
#include <sys/verified_exec.h>
#endif /* NVERIEXEC > 0 */

#include <uvm/uvm_extern.h>

#include <machine/reg.h>

#include <compat/common/compat_util.h>
static int exec_sigcode_map(struct proc *, const struct emul *);
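
/*
 * DPRINTF() takes a doubly parenthesized argument list, e.g.
 * DPRINTF(("execve: failed %d\n", error)), and reports exec debugging
 * detail via uprintf() on DEBUG_EXEC kernels; otherwise it expands to
 * nothing.
 */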
#ifdef DEBUG_EXEC
#define DPRINTF(a) uprintf a
#else
#define DPRINTF(a)
#endif /* DEBUG_EXEC */
/*
 * Exec function switch:
 *
 * Note that each makecmds function is responsible for loading the
 * exec package with the necessary functions for any exec-type-specific
 * handling.
 *
 * Functions for specific exec types should be defined in their own
 * files.
 */
static const struct execsw	**execsw = NULL;
static int			nexecs;

u_int	exec_maxhdrsz;	 /* must not be static - used by netbsd32 */
/* list of dynamically loaded execsw entries */
static LIST_HEAD(execlist_head, exec_entry) ex_head =
    LIST_HEAD_INITIALIZER(ex_head);
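/* One registered exec format; entries are linked on ex_head. */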
struct exec_entry {
	LIST_ENTRY(exec_entry)	ex_list;
	SLIST_ENTRY(exec_entry)	ex_slist;
	const struct execsw	*ex_sw;
};
#ifndef __HAVE_SYSCALL_INTERN
void	syscall(void);
#endif
static struct sa_emul saemul_netbsd = {
	sizeof(struct sa_t *),
	(void (*)(struct lwp *, void *))getucontext_sa,
};
/* NetBSD emul struct */
struct emul emul_netbsd = {
#ifndef __HAVE_MINIMAL_EMUL
	.e_flags =		EMUL_HAS_SYS___syscall,
	.e_nosys =		SYS_syscall,
	.e_nsysent =		SYS_NSYSENT,
#endif
#ifdef SYSCALL_DEBUG
	.e_syscallnames =	syscallnames,
#else
	.e_syscallnames =	NULL,
#endif
	.e_sendsig =		sendsig,
	.e_trapsignal =		trapsignal,
	.e_setregs =		setregs,
#ifdef __HAVE_SYSCALL_INTERN
	.e_syscall_intern =	syscall_intern,
#else
	.e_syscall =		syscall,
#endif
	.e_sysctlovly =		NULL,
	.e_vm_default_addr =	uvm_default_mapaddr,
	.e_sa =			&saemul_netbsd,
	.e_ucsize =		sizeof(ucontext_t),
	.e_startlwp =		startlwp
};
/*
 * Exec lock. Used to control access to execsw[] structures.
 * This must not be static so that netbsd32 can access it, too.
 */
krwlock_t exec_lock;

static kmutex_t sigobject_lock;
static void *
exec_pool_alloc(struct pool *pp, int flags)
{

	return (void *)uvm_km_alloc(kernel_map, NCARGS, 0,
	    UVM_KMF_PAGEABLE | UVM_KMF_WAITVA);
}

static void
exec_pool_free(struct pool *pp, void *addr)
{

	uvm_km_free(kernel_map, (vaddr_t)addr, NCARGS, UVM_KMF_PAGEABLE);
}
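
/*
 * exec_pool supplies the NCARGS-sized, pageable kernel buffers used
 * while assembling the new image's argv/envp strings; allocation and
 * release go through exec_palloc above.
 */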
static struct pool exec_pool;

static struct pool_allocator exec_palloc = {
	.pa_alloc = exec_pool_alloc,
	.pa_free = exec_pool_free,
	.pa_pagesz = NCARGS
};
/*
 * check exec:
 * given an "executable" described in the exec package's namei info,
 * see what we can do with it.
 *
 * ON ENTRY:
 *	exec package with appropriate namei info
 *	lwp pointer of exec'ing lwp
 *	NO SELF-LOCKED VNODES
 *
 * ON EXIT:
 *	error:	nothing held, etc.  exec header still allocated.
 *	ok:	filled exec package, executable's vnode (unlocked).
 *
 * EXEC SWITCH ENTRY:
 *	Locked vnode to check, exec package, proc.
 *
 * EXEC SWITCH EXIT:
 *	ok:	return 0, filled exec package, executable's vnode (unlocked).
 *	error:	destructive:
 *			everything deallocated except exec header.
 *		non-destructive:
 *			error code, executable's vnode (unlocked),
 *			exec header unmodified.
 */
int
check_exec(struct lwp *l, struct exec_package *epp)
{
	int		error, newerror, i;
	struct vnode	*vp;
	struct nameidata *ndp;
	size_t		resid;

	ndp = epp->ep_ndp;
	ndp->ni_cnd.cn_nameiop = LOOKUP;
	ndp->ni_cnd.cn_flags = FOLLOW | LOCKLEAF | SAVENAME | TRYEMULROOT;

	/* first get the vnode */
	if ((error = namei(ndp)) != 0)
		return error;
	epp->ep_vp = vp = ndp->ni_vp;
	/* check access and type */
	if (vp->v_type != VREG) {
		error = EACCES;
		goto bad1;
	}
	if ((error = VOP_ACCESS(vp, VEXEC, l->l_cred)) != 0)
		goto bad1;

	if ((error = VOP_GETATTR(vp, epp->ep_vap, l->l_cred)) != 0)
		goto bad1;

	/* Check mount point */
	if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
		error = EACCES;
		goto bad1;
	}
	if (vp->v_mount->mnt_flag & MNT_NOSUID)
		epp->ep_vap->va_mode &= ~(S_ISUID | S_ISGID);

	if ((error = VOP_OPEN(vp, FREAD, l->l_cred)) != 0)
		goto bad1;

	/* unlock vp, since we need it unlocked from here on out. */
	VOP_UNLOCK(vp, 0);
#if NVERIEXEC > 0
	error = veriexec_verify(l, vp, ndp->ni_cnd.cn_pnbuf,
	    epp->ep_flags & EXEC_INDIR ? VERIEXEC_INDIRECT : VERIEXEC_DIRECT,
	    NULL);
	if (error)
		goto bad2;
#endif /* NVERIEXEC > 0 */

#ifdef PAX_SEGVGUARD
	error = pax_segvguard(l, vp, ndp->ni_cnd.cn_pnbuf, false);
	if (error)
		goto bad2;
#endif /* PAX_SEGVGUARD */
	/* now we have the file, get the exec header */
	error = vn_rdwr(UIO_READ, vp, epp->ep_hdr, epp->ep_hdrlen, 0,
	    UIO_SYSSPACE, 0, l->l_cred, &resid, NULL);
	if (error)
		goto bad2;
	epp->ep_hdrvalid = epp->ep_hdrlen - resid;
	/*
	 * Set up default address space limits.  Can be overridden
	 * by individual exec packages.
	 *
	 * XXX probably should be all done in the exec packages.
	 */
	epp->ep_vm_minaddr = VM_MIN_ADDRESS;
	epp->ep_vm_maxaddr = VM_MAXUSER_ADDRESS;
	/*
	 * set up the vmcmds for creation of the process
	 * address space
	 */
	error = ENOEXEC;
	for (i = 0; i < nexecs; i++) {
		epp->ep_esch = execsw[i];
		newerror = (*execsw[i]->es_makecmds)(l, epp);

		if (!newerror) {
			/* Seems ok: check that entry point is sane */
			if (epp->ep_entry > VM_MAXUSER_ADDRESS) {
				error = ENOEXEC;
				break;
			}

			/* check limits */
			if ((epp->ep_tsize > MAXTSIZ) ||
			    (epp->ep_dsize > (u_quad_t)l->l_proc->p_rlimit
			    [RLIMIT_DATA].rlim_cur)) {
				error = ENOMEM;
				break;
			}
			return 0;
		}

		if (epp->ep_emul_root != NULL) {
			vrele(epp->ep_emul_root);
			epp->ep_emul_root = NULL;
		}
		if (epp->ep_interp != NULL) {
			vrele(epp->ep_interp);
			epp->ep_interp = NULL;
		}

		/* make sure the first "interesting" error code is saved. */
		if (error == ENOEXEC)
			error = newerror;

		if (epp->ep_flags & EXEC_DESTR)
			/* Error from "#!" code, tidied up by recursive call */
			return error;
	}
	/* not found, error */

	/*
	 * free any vmspace-creation commands,
	 * and release their references
	 */
	kill_vmcmds(&epp->ep_vmcmds);
bad2:
	/*
	 * close and release the vnode, restore the old one, free the
	 * pathname buf, and punt.
	 */
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	VOP_CLOSE(vp, FREAD, l->l_cred);
	vput(vp);
	PNBUF_PUT(ndp->ni_cnd.cn_pnbuf);
	return error;
bad1:
	/*
	 * free the namei pathname buffer, and put the vnode
	 * (which we don't yet have open).
	 */
	vput(vp);		/* was still locked */
	PNBUF_PUT(ndp->ni_cnd.cn_pnbuf);
	return error;
}
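
/*
 * Reserve a page at the stack base for thread-library data on machines
 * whose stack grows upward; on other machines no extra space is needed.
 */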
#ifdef __MACHINE_STACK_GROWS_UP
#define STACK_PTHREADSPACE NBPG
#else
#define STACK_PTHREADSPACE 0
#endif
static int
execve_fetch_element(char * const *array, size_t index, char **value)
{

	return copyin(array + index, value, sizeof(*value));
}
/*
 * exec system call
 */
int
sys_execve(struct lwp *l, const struct sys_execve_args *uap, register_t *retval)
{
	/* {
		syscallarg(const char *)	path;
		syscallarg(char * const *)	argp;
		syscallarg(char * const *)	envp;
	} */

	return execve1(l, SCARG(uap, path), SCARG(uap, argp),
	    SCARG(uap, envp), execve_fetch_element);
}
/*
 * Load modules to try and execute an image that we do not understand.
 * If no execsw entries are present, we load those likely to be needed
 * in order to run native images only.  Otherwise, we autoload all
 * possible modules that could let us run the binary.  XXX lame
 */
static void
exec_autoload(void)
{
#ifdef MODULAR
	static const char * const native[] = {
		/* native exec format modules (list elided) */
		NULL
	};
	static const char * const compat[] = {
		/* compat/emulation modules (list elided) */
		NULL
	};
	char const * const *list;
	int i;

	mutex_enter(&module_lock);
	list = (nexecs == 0 ? native : compat);
	for (i = 0; list[i] != NULL; i++) {
		if (module_autoload(list[i], MODULE_CLASS_MISC) != 0) {
			continue;
		}
		mutex_exit(&module_lock);
		yield();
		mutex_enter(&module_lock);
	}
	mutex_exit(&module_lock);
#endif
}
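
/*
 * execve1() is shared by the native system call above and by compat
 * code; the fetch_element callback abstracts how one char * element is
 * read from the user-space argv/envp array, so emulations with a
 * different user pointer size (e.g. netbsd32) can substitute their own.
 */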
int
execve1(struct lwp *l, const char *path, char * const *args,
    char * const *envs, execve_fetch_element_t fetch_element)
{
	int			error;
	struct exec_package	pack;
	struct nameidata	nid;
	struct vattr		attr;
	struct proc		*p;
	char			*argp;
	char			*dp, *sp;
	long			argc, envc;
	size_t			i, len;
	char			*stack;
	struct ps_strings	arginfo;
	struct ps_strings	*aip = &arginfo;
	struct vmspace		*vm;
	struct exec_fakearg	*tmpfap;
	int			szsigcode;
	struct exec_vmcmd	*base_vcp;
	ksiginfo_t		ksi;
	ksiginfoq_t		kq;
	char			*pathbuf;
	size_t			pathbuflen;
	int			oldlwpflags;
	int			modgen;

	p = l->l_proc;
	modgen = 0;
	base_vcp = NULL;
	/*
	 * Check if we have exceeded our number of processes limit.
	 * This is so that we handle the case where a root daemon
	 * forked, ran setuid to become the desired user and is trying
	 * to exec. The obvious place to do the reference counting check
	 * is setuid(), but we don't do the reference counting check there
	 * like other OS's do because then all the programs that use setuid()
	 * must be modified to check the return code of setuid() and exit().
	 * It is dangerous to make setuid() fail, because it fails open and
	 * the program will continue to run as root. If we make it succeed
	 * and return an error code, again we are not enforcing the limit.
	 * The best place to enforce the limit is here, when the process tries
	 * to execute a new image, because eventually the process will need
	 * to call exec in order to do something useful.
	 */
 retry:
	if ((p->p_flag & PK_SUGID) && kauth_authorize_generic(l->l_cred,
	    KAUTH_GENERIC_ISSUSER, NULL) != 0 && chgproccnt(kauth_cred_getuid(
	    l->l_cred), 0) > p->p_rlimit[RLIMIT_NPROC].rlim_cur)
		return EAGAIN;
	oldlwpflags = l->l_flag & (LW_SA | LW_SA_UPCALL);
	if (l->l_flag & LW_SA) {
		lwp_lock(l);
		l->l_flag &= ~(LW_SA | LW_SA_UPCALL);
		lwp_unlock(l);
	}
	/*
	 * Drain existing references and forbid new ones.  The process
	 * should be left alone until we're done here.  This is necessary
	 * to avoid race conditions - e.g. in ptrace() - that might allow
	 * a local user to illicitly obtain elevated privileges.
	 */
	rw_enter(&p->p_reflock, RW_WRITER);
	/*
	 * Init the namei data to point at the user program's name.
	 * This is done here rather than in check_exec(), so that it's
	 * possible to override these settings if any of the makecmd/probe
	 * functions call check_exec() recursively - for example,
	 * see exec_script_makecmds().
	 */
	pathbuf = PNBUF_GET();
	error = copyinstr(path, pathbuf, MAXPATHLEN, &pathbuflen);
	if (error) {
		DPRINTF(("execve: copyinstr path %d", error));
		goto clrflg;
	}

	NDINIT(&nid, LOOKUP, NOFOLLOW | TRYEMULROOT, UIO_SYSSPACE, pathbuf);
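
	/*
	 * TRYEMULROOT asks namei to try the emulation root first (e.g.
	 * /emul/linux) before the real root; check_exec() resets the
	 * remaining lookup flags before the actual namei() call.
	 */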
	/*
	 * initialize the fields of the exec package.
	 */
	pack.ep_hdr = kmem_alloc(exec_maxhdrsz, KM_SLEEP);
	pack.ep_hdrlen = exec_maxhdrsz;
	pack.ep_hdrvalid = 0;
	pack.ep_ndp = &nid;
	pack.ep_emul_arg = NULL;
	pack.ep_vmcmds.evs_cnt = 0;
	pack.ep_vmcmds.evs_used = 0;
	pack.ep_vap = &attr;
	pack.ep_flags = 0;
	pack.ep_emul_root = NULL;
	pack.ep_interp = NULL;
	pack.ep_pax_flags = 0;

	rw_enter(&exec_lock, RW_READER);
	/* see if we can run it. */
	if ((error = check_exec(l, &pack)) != 0) {
		if (error != ENOENT) {
			DPRINTF(("execve: check exec failed %d\n", error));
		}
		goto freehdr;
	}
606 /* allocate an argument buffer */
607 argp
= pool_get(&exec_pool
, PR_WAITOK
);
608 KASSERT(argp
!= NULL
);
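
	/*
	 * 'argp' is one NCARGS-sized buffer from exec_pool; 'dp' walks it
	 * as argument and environment strings are accumulated end to end.
	 */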
	/* copy the fake args list, if there's one, freeing it as we go */
	if (pack.ep_flags & EXEC_HASARGL) {
		tmpfap = pack.ep_fa;
		while (tmpfap->fa_arg != NULL) {
			const char *cp;

			cp = tmpfap->fa_arg;
			while (*cp)
				*dp++ = *cp++;
			*dp++ = '\0';
			ktrexecarg(tmpfap->fa_arg, cp - tmpfap->fa_arg);

			kmem_free(tmpfap->fa_arg, tmpfap->fa_len);
			tmpfap++; argc++;
		}
		kmem_free(pack.ep_fa, pack.ep_fa_len);
		pack.ep_flags &= ~EXEC_HASARGL;
	}
	/* Now get argv & environment */
	if (!args) {
		DPRINTF(("execve: null args\n"));
		error = EINVAL;
		goto bad;
	}
	/* 'i' will index the argp/envp element to be retrieved */
	i = 0;
	if (pack.ep_flags & EXEC_SKIPARG)
		i++;

	while (1) {
		len = argp + ARG_MAX - dp;
		if ((error = (*fetch_element)(args, i, &sp)) != 0) {
			DPRINTF(("execve: fetch_element args %d\n", error));
			goto bad;
		}
		if (!sp)
			break;
		if ((error = copyinstr(sp, dp, len, &len)) != 0) {
			DPRINTF(("execve: copyinstr args %d\n", error));
			if (error == ENAMETOOLONG)
				error = E2BIG;
			goto bad;
		}
		ktrexecarg(dp, len - 1);
		dp += len;
		i++;
		argc++;
	}

	envc = 0;
	/* environment need not be there */
	if (envs) {
		i = 0;
		while (1) {
			len = argp + ARG_MAX - dp;
			if ((error = (*fetch_element)(envs, i, &sp)) != 0) {
				DPRINTF(("execve: fetch_element env %d\n", error));
				goto bad;
			}
			if (!sp)
				break;
			if ((error = copyinstr(sp, dp, len, &len)) != 0) {
				DPRINTF(("execve: copyinstr env %d\n", error));
				if (error == ENAMETOOLONG)
					error = E2BIG;
				goto bad;
			}
			ktrexecenv(dp, len - 1);
			dp += len;
			i++;
			envc++;
		}
	}
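
	/*
	 * The emulation brackets its signal trampoline between e_sigcode
	 * and e_esigcode in kernel text; the difference is the number of
	 * bytes that must be reserved for it in the new stack layout.
	 */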
	dp = (char *) ALIGN(dp);

	szsigcode = pack.ep_esch->es_emul->e_esigcode -
	    pack.ep_esch->es_emul->e_sigcode;
#ifdef __MACHINE_STACK_GROWS_UP
/* See big comment lower down */
#define	RTLD_GAP	32
#else
#define	RTLD_GAP	0
#endif

	/* Now check if args & environ fit into new stack */
	if (pack.ep_flags & EXEC_32)
		len = ((argc + envc + 2 + pack.ep_esch->es_arglen) *
		    sizeof(int) + sizeof(int) + dp + RTLD_GAP +
		    szsigcode + sizeof(struct ps_strings) + STACK_PTHREADSPACE)
		    - argp;
	else
		len = ((argc + envc + 2 + pack.ep_esch->es_arglen) *
		    sizeof(char *) + sizeof(int) + dp + RTLD_GAP +
		    szsigcode + sizeof(struct ps_strings) + STACK_PTHREADSPACE)
		    - argp;

#ifdef PAX_ASLR
	if (pax_aslr_active(l))
		len += (arc4random() % PAGE_SIZE);
#endif /* PAX_ASLR */
#ifdef STACKALIGN	/* arm, etc. */
	len = STACKALIGN(len);	/* make the stack "safely" aligned */
#else
	len = ALIGN(len);	/* make the stack "safely" aligned */
#endif

	if (len > pack.ep_ssize) { /* in effect, compare to initial limit */
		DPRINTF(("execve: stack limit exceeded %zu\n", len));
		error = ENOMEM;
		goto bad;
	}
	/* Get rid of other LWPs. */
	if (p->p_sa || p->p_nlwps > 1) {
		mutex_enter(p->p_lock);
		exit_lwps(l);
		mutex_exit(p->p_lock);
	}
	KDASSERT(p->p_nlwps == 1);

	/* Destroy any lwpctl info. */
	if (p->p_lwpctl != NULL)
		lwp_ctl_exit();

	/* This is now LWP 1 */
	l->l_lid = 1;
	p->p_nlwpid = 1;

	/* Release any SA state. */
	if (p->p_sa)
		sa_release(p);

	/* Remove POSIX timers */
	timers_free(p, TIMERS_POSIX);
	/* adjust "active stack depth" for process VSZ */
	pack.ep_ssize = len;	/* maybe should go elsewhere, but... */

	/*
	 * Do whatever is necessary to prepare the address space
	 * for remapping.  Note that this might replace the current
	 * vmspace with another!
	 */
	uvmspace_exec(l, pack.ep_vm_minaddr, pack.ep_vm_maxaddr);
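
	/*
	 * Point of no return: the old address space is gone, so any
	 * failure from here on aborts via exec_abort, which terminates
	 * the process.
	 */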
	/* record proc's vnode, for use by procfs and others */
	if (p->p_textvp)
		vrele(p->p_textvp);
	vref(pack.ep_vp);
	p->p_textvp = pack.ep_vp;

	/* Now map address space */
	vm = p->p_vmspace;
	vm->vm_taddr = (void *)pack.ep_taddr;
	vm->vm_tsize = btoc(pack.ep_tsize);
	vm->vm_daddr = (void*)pack.ep_daddr;
	vm->vm_dsize = btoc(pack.ep_dsize);
	vm->vm_ssize = btoc(pack.ep_ssize);
	vm->vm_maxsaddr = (void *)pack.ep_maxsaddr;
	vm->vm_minsaddr = (void *)pack.ep_minsaddr;

#ifdef PAX_ASLR
	pax_aslr_init(l, vm);
#endif /* PAX_ASLR */
	/* create the new process's VM space by running the vmcmds */
#ifdef DIAGNOSTIC
	if (pack.ep_vmcmds.evs_used == 0)
		panic("execve: no vmcmds");
#endif
	for (i = 0; i < pack.ep_vmcmds.evs_used && !error; i++) {
		struct exec_vmcmd *vcp;

		vcp = &pack.ep_vmcmds.evs_cmds[i];
		if (vcp->ev_flags & VMCMD_RELATIVE) {
#ifdef DIAGNOSTIC
			if (base_vcp == NULL)
				panic("execve: relative vmcmd with no base");
			if (vcp->ev_flags & VMCMD_BASE)
				panic("execve: illegal base & relative vmcmd");
#endif
			vcp->ev_addr += base_vcp->ev_addr;
		}
		error = (*vcp->ev_proc)(l, vcp);
#ifdef DEBUG_EXEC
		if (error) {
			size_t j;
			struct exec_vmcmd *vp = &pack.ep_vmcmds.evs_cmds[0];
			for (j = 0; j <= i; j++)
				uprintf(
				    "vmcmd[%zu] = %#lx/%#lx fd@%#lx prot=0%o flags=%d\n",
				    j, vp[j].ev_addr, vp[j].ev_len,
				    vp[j].ev_offset, vp[j].ev_prot,
				    vp[j].ev_flags);
		}
#endif /* DEBUG_EXEC */
		if (vcp->ev_flags & VMCMD_BASE)
			base_vcp = vcp;
	}

	/* free the vmspace-creation commands, and release their references */
	kill_vmcmds(&pack.ep_vmcmds);

	vn_lock(pack.ep_vp, LK_EXCLUSIVE | LK_RETRY);
	VOP_CLOSE(pack.ep_vp, FREAD, l->l_cred);
	vput(pack.ep_vp);

	/* if an error happened, deallocate and punt */
	if (error) {
		DPRINTF(("execve: vmcmd %zu failed: %d\n", i - 1, error));
		goto exec_abort;
	}
	/* remember information about the process */
	arginfo.ps_nargvstr = argc;
	arginfo.ps_nenvstr = envc;

	/* set command name & other accounting info */
	i = min(nid.ni_cnd.cn_namelen, MAXCOMLEN);
	(void)memcpy(p->p_comm, nid.ni_cnd.cn_nameptr, i);
	p->p_comm[i] = '\0';

	dp = PNBUF_GET();
	/*
	 * If the path starts with /, we don't need to do any work.
	 * This handles the majority of the cases.
	 * In the future perhaps we could canonicalize it?
	 */
	if (pathbuf[0] == '/')
		(void)strlcpy(pack.ep_path = dp, pathbuf, MAXPATHLEN);
	/*
	 * Although this works most of the time [since the entry was just
	 * entered in the cache] we don't use it because it theoretically
	 * can fail and it is not the cleanest interface, because there
	 * could be races. When the namei cache is re-written, this can
	 * be changed to use the appropriate function.
	 */
	else if (!(error = vnode_to_path(dp, MAXPATHLEN, p->p_textvp, l, p)))
		pack.ep_path = dp;
	else {
		printf("Cannot get path for pid %d [%s] (error %d)",
		    (int)p->p_pid, p->p_comm, error);
		pack.ep_path = NULL;
		PNBUF_PUT(dp);
	}
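
	/*
	 * Carve the argument block out of the new stack, leaving room at
	 * the stack base for the ps_strings record and the signal
	 * trampoline that were accounted for in 'len' above.
	 */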
	stack = (char *)STACK_ALLOC(STACK_GROW(vm->vm_minsaddr,
	    STACK_PTHREADSPACE + sizeof(struct ps_strings) + szsigcode),
	    len - (sizeof(struct ps_strings) + szsigcode));
#ifdef __MACHINE_STACK_GROWS_UP
	/*
	 * The copyargs call always copies into lower addresses
	 * first, moving towards higher addresses, starting with
	 * the stack pointer that we give.  When the stack grows
	 * down, this puts argc/argv/envp very shallow on the
	 * stack, right at the first user stack pointer.
	 * When the stack grows up, the situation is reversed.
	 *
	 * Normally, this is no big deal.  But the ld_elf.so _rtld()
	 * function expects to be called with a single pointer to
	 * a region that has a few words it can stash values into,
	 * followed by argc/argv/envp.  When the stack grows down,
	 * it's easy to decrement the stack pointer a little bit to
	 * allocate the space for these few words and pass the new
	 * stack pointer to _rtld.  When the stack grows up, however,
	 * a few words before argc is part of the signal trampoline, XXX
	 * so we have a problem.
	 *
	 * Instead of changing how _rtld works, we take the easy way
	 * out and steal 32 bytes before we call copyargs.
	 * This extra space was allowed for when 'len' was calculated.
	 */
	stack += RTLD_GAP;
#endif /* __MACHINE_STACK_GROWS_UP */
	/* Now copy argc, args & environ to new stack */
	error = (*pack.ep_esch->es_copyargs)(l, &pack, &arginfo, &stack, argp);
	if (pack.ep_path) {
		PNBUF_PUT(pack.ep_path);
		pack.ep_path = NULL;
	}
	if (error) {
		DPRINTF(("execve: copyargs failed %d\n", error));
		goto exec_abort;
	}
	/* Move the stack back to original point */
	stack = (char *)STACK_GROW(vm->vm_minsaddr, len);

	/* fill process ps_strings info */
	p->p_psstr = (struct ps_strings *)
	    STACK_ALLOC(STACK_GROW(vm->vm_minsaddr, STACK_PTHREADSPACE),
	    sizeof(struct ps_strings));
	p->p_psargv = offsetof(struct ps_strings, ps_argvstr);
	p->p_psnargv = offsetof(struct ps_strings, ps_nargvstr);
	p->p_psenv = offsetof(struct ps_strings, ps_envstr);
	p->p_psnenv = offsetof(struct ps_strings, ps_nenvstr);
	/* copy out the process's ps_strings structure */
	if ((error = copyout(aip, (char *)p->p_psstr,
	    sizeof(arginfo))) != 0) {
		DPRINTF(("execve: ps_strings copyout %p->%p size %ld failed\n",
		    aip, (char *)p->p_psstr, (long)sizeof(arginfo)));
		goto exec_abort;
	}
	fd_closeexec();		/* handle close on exec */
	execsigs(p);		/* reset caught signals */

	l->l_ctxlink = NULL;	/* reset ucontext link */

	p->p_acflag &= ~AFORK;
	mutex_enter(p->p_lock);
	p->p_flag |= PK_EXEC;
	mutex_exit(p->p_lock);
	/*
	 * Stop profiling.
	 */
	if ((p->p_stflag & PST_PROFIL) != 0) {
		mutex_spin_enter(&p->p_stmutex);
		stopprofclock(p);
		mutex_spin_exit(&p->p_stmutex);
	}
	/*
	 * It's OK to test PL_PPWAIT unlocked here, as other LWPs have
	 * exited and exec()/exit() are the only places it will be cleared.
	 */
	if ((p->p_lflag & PL_PPWAIT) != 0) {
		mutex_enter(proc_lock);
		p->p_lflag &= ~PL_PPWAIT;
		cv_broadcast(&p->p_pptr->p_waitcv);
		mutex_exit(proc_lock);
	}
	/*
	 * Deal with set[ug]id.  MNT_NOSUID has already been used to disable
	 * s[ug]id.  It's OK to check for PSL_TRACED here as we have blocked
	 * out additional references on the process for the moment.
	 */
	if ((p->p_slflag & PSL_TRACED) == 0 &&
	    (((attr.va_mode & S_ISUID) != 0 &&
	      kauth_cred_geteuid(l->l_cred) != attr.va_uid) ||
	     ((attr.va_mode & S_ISGID) != 0 &&
	      kauth_cred_getegid(l->l_cred) != attr.va_gid))) {
		/*
		 * Mark the process as SUGID before we do
		 * anything that might block.
		 */
		proc_crmod_enter();
		proc_crmod_leave(NULL, NULL, true);
980 if ((error
= fd_checkstd()) != 0) {
981 DPRINTF(("execve: fdcheckstd failed %d\n", error
));
986 * Copy the credential so other references don't see our
989 l
->l_cred
= kauth_cred_copy(l
->l_cred
);
992 * If the persistent trace flag isn't set, turn off.
995 mutex_enter(&ktrace_lock
);
996 if (!(p
->p_traceflag
& KTRFAC_PERSISTENT
))
998 mutex_exit(&ktrace_lock
);
1001 if (attr
.va_mode
& S_ISUID
)
1002 kauth_cred_seteuid(l
->l_cred
, attr
.va_uid
);
1003 if (attr
.va_mode
& S_ISGID
)
1004 kauth_cred_setegid(l
->l_cred
, attr
.va_gid
);
1006 if (kauth_cred_geteuid(l
->l_cred
) ==
1007 kauth_cred_getuid(l
->l_cred
) &&
1008 kauth_cred_getegid(l
->l_cred
) ==
1009 kauth_cred_getgid(l
->l_cred
))
1010 p
->p_flag
&= ~PK_SUGID
;
	/*
	 * Copy the credential so other references don't see our changes.
	 * Test to see if this is necessary first, since in the common case
	 * we won't need a private reference.
	 */
	if (kauth_cred_geteuid(l->l_cred) != kauth_cred_getsvuid(l->l_cred) ||
	    kauth_cred_getegid(l->l_cred) != kauth_cred_getsvgid(l->l_cred)) {
		l->l_cred = kauth_cred_copy(l->l_cred);
		kauth_cred_setsvuid(l->l_cred, kauth_cred_geteuid(l->l_cred));
		kauth_cred_setsvgid(l->l_cred, kauth_cred_getegid(l->l_cred));
	}
	/* Update the master credentials. */
	if (l->l_cred != p->p_cred) {
		kauth_cred_t ocred;

		kauth_cred_hold(l->l_cred);
		mutex_enter(p->p_lock);
		ocred = p->p_cred;
		p->p_cred = l->l_cred;
		mutex_exit(p->p_lock);
		kauth_cred_free(ocred);
	}
#if defined(__HAVE_RAS)
	/*
	 * Remove all RASs from the address space.
	 */
	ras_purgeall();
#endif /* __HAVE_RAS */
	/* setup new registers and do misc. setup. */
	(*pack.ep_esch->es_emul->e_setregs)(l, &pack, (vaddr_t)stack);
	if (pack.ep_esch->es_setregs)
		(*pack.ep_esch->es_setregs)(l, &pack, (vaddr_t)stack);
	/* map the process's signal trampoline code */
	if ((error = exec_sigcode_map(p, pack.ep_esch->es_emul)) != 0) {
		DPRINTF(("execve: map sigcode failed %d\n", error));
		goto exec_abort;
	}
	pool_put(&exec_pool, argp);

	PNBUF_PUT(nid.ni_cnd.cn_pnbuf);

	/* notify others that we exec'd */
	KNOTE(&p->p_klist, NOTE_EXEC);

	kmem_free(pack.ep_hdr, pack.ep_hdrlen);
	/* The emulation root will usually have been found when we looked
	 * for the elf interpreter (or similar), if not look now. */
	if (pack.ep_esch->es_emul->e_path != NULL &&
	    pack.ep_emul_root == NULL)
		emul_find_root(l, &pack);

	/* Any old emulation root got removed by fdcloseexec */
	rw_enter(&p->p_cwdi->cwdi_lock, RW_WRITER);
	p->p_cwdi->cwdi_edir = pack.ep_emul_root;
	rw_exit(&p->p_cwdi->cwdi_lock);
	pack.ep_emul_root = NULL;
	if (pack.ep_interp != NULL) {
		vrele(pack.ep_interp);
		pack.ep_interp = NULL;
	}
	/*
	 * Call emulation specific exec hook. This can setup per-process
	 * p->p_emuldata or do any other per-process stuff an emulation needs.
	 *
	 * If we are executing a process of a different emulation than the
	 * original forked process, call e_proc_exit() of the old emulation
	 * first, then e_proc_exec() of the new emulation. If the emulation
	 * is the same, the exec hook code should deallocate any old emulation
	 * resources held previously by this process.
	 */
	if (p->p_emul && p->p_emul->e_proc_exit
	    && p->p_emul != pack.ep_esch->es_emul)
		(*p->p_emul->e_proc_exit)(p);
	/*
	 * Call exec hook. Emulation code may NOT store reference to anything
	 * from &pack.
	 */
	if (pack.ep_esch->es_emul->e_proc_exec)
		(*pack.ep_esch->es_emul->e_proc_exec)(p, &pack);
	/* update p_emul, the old value is no longer needed */
	p->p_emul = pack.ep_esch->es_emul;

	/* ...and the same for p_execsw */
	p->p_execsw = pack.ep_esch;

#ifdef __HAVE_SYSCALL_INTERN
	(*p->p_emul->e_syscall_intern)(p);
#endif
	/* Allow new references from the debugger/procfs. */
	rw_exit(&p->p_reflock);
	rw_exit(&exec_lock);

	mutex_enter(proc_lock);

	if ((p->p_slflag & (PSL_TRACED|PSL_SYSCALL)) == PSL_TRACED) {
		KSI_INIT_EMPTY(&ksi);
		ksi.ksi_signo = SIGTRAP;
		ksi.ksi_lid = l->l_lid;
		kpsignal(p, &ksi, NULL);
	}

	if (p->p_sflag & PS_STOPEXEC) {
		KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
		p->p_pptr->p_nstopchild++;
		p->p_pptr->p_waited = 0;
		mutex_enter(p->p_lock);
		ksiginfo_queue_init(&kq);
		sigclearall(p, &contsigmask, &kq);
		lwp_lock(l);
		l->l_stat = LSSTOP;
		p->p_stat = SSTOP;
		p->p_nrlwps--;
		mutex_exit(p->p_lock);
		mutex_exit(proc_lock);
		mi_switch(l);
		ksiginfo_queue_drain(&kq);
		KERNEL_LOCK(l->l_biglocks, l);
	} else {
		mutex_exit(proc_lock);
	}

	PNBUF_PUT(pathbuf);
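
	/*
	 * EJUSTRETURN tells the syscall return path to leave the register
	 * state alone: e_setregs already framed the new image's startup
	 * context.
	 */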
	return (EJUSTRETURN);
 bad:
	/* free the vmspace-creation commands, and release their references */
	kill_vmcmds(&pack.ep_vmcmds);
	/* kill any opened file descriptor, if necessary */
	if (pack.ep_flags & EXEC_HASFD) {
		pack.ep_flags &= ~EXEC_HASFD;
		fd_close(pack.ep_fd);
	}
	/* close and put the exec'd file */
	vn_lock(pack.ep_vp, LK_EXCLUSIVE | LK_RETRY);
	VOP_CLOSE(pack.ep_vp, FREAD, l->l_cred);
	vput(pack.ep_vp);
	PNBUF_PUT(nid.ni_cnd.cn_pnbuf);
	pool_put(&exec_pool, argp);

 freehdr:
	kmem_free(pack.ep_hdr, pack.ep_hdrlen);
	if (pack.ep_emul_root != NULL)
		vrele(pack.ep_emul_root);
	if (pack.ep_interp != NULL)
		vrele(pack.ep_interp);

	rw_exit(&exec_lock);
 clrflg:
	lwp_lock(l);
	l->l_flag |= oldlwpflags;
	lwp_unlock(l);

	rw_exit(&p->p_reflock);

	if (modgen != module_gen && error == ENOEXEC) {
		modgen = module_gen;
		exec_autoload();
		goto retry;
	}

	PNBUF_PUT(pathbuf);
	return error;

 exec_abort:
	rw_exit(&p->p_reflock);
	rw_exit(&exec_lock);
	/*
	 * the old process doesn't exist anymore.  exit gracefully.
	 * get rid of the (new) address space we have created, if any, get rid
	 * of our namei data and vnode, and exit noting failure
	 */
	uvm_deallocate(&vm->vm_map, VM_MIN_ADDRESS,
	    VM_MAXUSER_ADDRESS - VM_MIN_ADDRESS);
	if (pack.ep_emul_arg)
		free(pack.ep_emul_arg, M_TEMP);
	PNBUF_PUT(nid.ni_cnd.cn_pnbuf);
	pool_put(&exec_pool, argp);
	kmem_free(pack.ep_hdr, pack.ep_hdrlen);
	if (pack.ep_emul_root != NULL)
		vrele(pack.ep_emul_root);
	if (pack.ep_interp != NULL)
		vrele(pack.ep_interp);

	/* Acquire the sched-state mutex (exit1() will release it). */
	mutex_enter(p->p_lock);
	exit1(l, W_EXITCODE(error, SIGABRT));

	/* NOTREACHED */
	return 0;
}
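
/*
 * copyargs() lays out the new stack image: argc, the argv pointer
 * vector, a terminating NULL, the envp vector and its NULL, then the
 * strings themselves; *stackp is advanced past everything written.
 */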
int
copyargs(struct lwp *l, struct exec_package *pack, struct ps_strings *arginfo,
    char **stackp, void *argp)
{
	char	**cpp, *dp, *sp;
	size_t	len;
	void	*nullp;
	long	argc, envc;
	int	error;

	cpp = (char **)*stackp;
	nullp = NULL;
	argc = arginfo->ps_nargvstr;
	envc = arginfo->ps_nenvstr;
	if ((error = copyout(&argc, cpp++, sizeof(argc))) != 0)
		return error;

	dp = (char *) (cpp + argc + envc + 2 + pack->ep_esch->es_arglen);
	sp = argp;

	/* XXX don't copy them out, remap them! */
	arginfo->ps_argvstr = cpp; /* remember location of argv for later */

	for (; --argc >= 0; sp += len, dp += len)
		if ((error = copyout(&dp, cpp++, sizeof(dp))) != 0 ||
		    (error = copyoutstr(sp, dp, ARG_MAX, &len)) != 0)
			return error;

	if ((error = copyout(&nullp, cpp++, sizeof(nullp))) != 0)
		return error;

	arginfo->ps_envstr = cpp; /* remember location of envp for later */

	for (; --envc >= 0; sp += len, dp += len)
		if ((error = copyout(&dp, cpp++, sizeof(dp))) != 0 ||
		    (error = copyoutstr(sp, dp, ARG_MAX, &len)) != 0)
			return error;

	if ((error = copyout(&nullp, cpp++, sizeof(nullp))) != 0)
		return error;

	*stackp = (char *)cpp;
	return 0;
}
/*
 * Add execsw[] entries.
 */
int
exec_add(struct execsw *esp, int count)
{
	struct exec_entry	*it;
	int			i, error;

	/* Check for duplicates. */
	rw_enter(&exec_lock, RW_WRITER);
	for (i = 0; i < count; i++) {
		LIST_FOREACH(it, &ex_head, ex_list) {
			/* assume unique (makecmds, probe_func, emulation) */
			if (it->ex_sw->es_makecmds == esp[i].es_makecmds &&
			    it->ex_sw->u.elf_probe_func ==
			    esp[i].u.elf_probe_func &&
			    it->ex_sw->es_emul == esp[i].es_emul) {
				rw_exit(&exec_lock);
				return EEXIST;
			}
		}
	}

	/* Allocate new entries. */
	for (i = 0; i < count; i++) {
		it = kmem_alloc(sizeof(*it), KM_SLEEP);
		it->ex_sw = &esp[i];
		LIST_INSERT_HEAD(&ex_head, it, ex_list);
	}

	/* update execsw[] */
	error = exec_init(0);
	rw_exit(&exec_lock);
	return error;
}
/*
 * Remove execsw[] entry.
 */
int
exec_remove(struct execsw *esp, int count)
{
	struct exec_entry	*it, *next;
	int			i, error;
	const struct proclist_desc *pd;
	proc_t			*p;

	/* Abort if any are busy. */
	rw_enter(&exec_lock, RW_WRITER);
	for (i = 0; i < count; i++) {
		mutex_enter(proc_lock);
		for (pd = proclists; pd->pd_list != NULL; pd++) {
			PROCLIST_FOREACH(p, pd->pd_list) {
				if (p->p_execsw == &esp[i]) {
					mutex_exit(proc_lock);
					rw_exit(&exec_lock);
					return EBUSY;
				}
			}
		}
		mutex_exit(proc_lock);
	}

	/* None are busy, so remove them all. */
	for (i = 0; i < count; i++) {
		for (it = LIST_FIRST(&ex_head); it != NULL; it = next) {
			next = LIST_NEXT(it, ex_list);
			if (it->ex_sw == &esp[i]) {
				LIST_REMOVE(it, ex_list);
				kmem_free(it, sizeof(*it));
				break;
			}
		}
	}

	/* update execsw[] */
	error = exec_init(0);
	rw_exit(&exec_lock);
	return error;
}
/*
 * Initialize exec structures. If init_boot is true, also does necessary
 * one-time initialization (it's called from main() that way).
 * Once system is multiuser, this should be called with exec_lock held,
 * i.e. via exec_{add|remove}().
 */
int
exec_init(int init_boot)
{
	const struct execsw	**sw;
	struct exec_entry	*ex;
	SLIST_HEAD(,exec_entry)	first;
	SLIST_HEAD(,exec_entry)	any;
	SLIST_HEAD(,exec_entry)	last;
	int			i, sz;

	if (init_boot) {
		/* do one-time initializations */
		rw_init(&exec_lock);
		mutex_init(&sigobject_lock, MUTEX_DEFAULT, IPL_NONE);
		pool_init(&exec_pool, NCARGS, 0, 0, PR_NOALIGN|PR_NOTOUCH,
		    "execargs", &exec_palloc, IPL_NONE);
		pool_sethardlimit(&exec_pool, maxexec, "should not happen", 0);
	} else {
		KASSERT(rw_write_held(&exec_lock));
	}
	/* Sort each entry onto the appropriate queue. */
	SLIST_INIT(&first);
	SLIST_INIT(&any);
	SLIST_INIT(&last);
	sz = 0;
	LIST_FOREACH(ex, &ex_head, ex_list) {
		switch(ex->ex_sw->es_prio) {
		case EXECSW_PRIO_FIRST:
			SLIST_INSERT_HEAD(&first, ex, ex_slist);
			break;
		case EXECSW_PRIO_ANY:
			SLIST_INSERT_HEAD(&any, ex, ex_slist);
			break;
		case EXECSW_PRIO_LAST:
			SLIST_INSERT_HEAD(&last, ex, ex_slist);
			break;
		}
		sz++;
	}
	/*
	 * Create new execsw[].  Ensure we do not try a zero-sized
	 * allocation.
	 */
	sw = kmem_alloc(sz * sizeof(struct execsw *) + 1, KM_SLEEP);
	i = 0;
	SLIST_FOREACH(ex, &first, ex_slist) {
		sw[i++] = ex->ex_sw;
	}
	SLIST_FOREACH(ex, &any, ex_slist) {
		sw[i++] = ex->ex_sw;
	}
	SLIST_FOREACH(ex, &last, ex_slist) {
		sw[i++] = ex->ex_sw;
	}

	/* Replace old execsw[] and free used memory. */
	if (execsw != NULL) {
		kmem_free(__UNCONST(execsw),
		    nexecs * sizeof(struct execsw *) + 1);
	}
	execsw = sw;
	nexecs = sz;
	/* Figure out the maximum size of an exec header. */
	exec_maxhdrsz = sizeof(int);
	for (i = 0; i < nexecs; i++) {
		if (execsw[i]->es_hdrsz > exec_maxhdrsz)
			exec_maxhdrsz = execsw[i]->es_hdrsz;
	}

	return 0;
}
static int
exec_sigcode_map(struct proc *p, const struct emul *e)
{
	vaddr_t va;
	vsize_t sz;
	int error;
	struct uvm_object *uobj;

	sz = (vaddr_t)e->e_esigcode - (vaddr_t)e->e_sigcode;

	if (e->e_sigobject == NULL || sz == 0) {
		return 0;
	}
	/*
	 * If we don't have a sigobject for this emulation, create one.
	 *
	 * sigobject is an anonymous memory object (just like SYSV shared
	 * memory) that we keep a permanent reference to and that we map
	 * in all processes that need this sigcode. The creation is simple,
	 * we create an object, add a permanent reference to it, map it in
	 * kernel space, copy out the sigcode to it and unmap it.
	 * We map it with PROT_READ|PROT_EXEC into the process just
	 * the way sys_mmap() would map it.
	 */
	uobj = *e->e_sigobject;
	if (uobj == NULL) {
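		/*
		 * Classic double-checked creation: re-test e_sigobject
		 * under sigobject_lock so only one caller builds the
		 * shared sigcode object.
		 */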
		mutex_enter(&sigobject_lock);
		if ((uobj = *e->e_sigobject) == NULL) {
			uobj = uao_create(sz, 0);
			(*uobj->pgops->pgo_reference)(uobj);
			va = vm_map_min(kernel_map);
			if ((error = uvm_map(kernel_map, &va, round_page(sz),
			    uobj, 0, 0,
			    UVM_MAPFLAG(UVM_PROT_RW, UVM_PROT_RW,
			    UVM_INH_SHARE, UVM_ADV_RANDOM, 0)))) {
				printf("kernel mapping failed %d\n", error);
				(*uobj->pgops->pgo_detach)(uobj);
				mutex_exit(&sigobject_lock);
				return error;
			}
			memcpy((void *)va, e->e_sigcode, sz);
#ifdef PMAP_NEED_PROCWR
			pmap_procwr(&proc0, va, sz);
#endif
			uvm_unmap(kernel_map, va, va + round_page(sz));
			*e->e_sigobject = uobj;
		}
		mutex_exit(&sigobject_lock);
	}
	/* Just a hint to uvm_map where to put it. */
	va = e->e_vm_default_addr(p, (vaddr_t)p->p_vmspace->vm_daddr,
	    round_page(sz));

#ifdef __alpha__
	/*
	 * Tru64 puts /sbin/loader at the end of user virtual memory,
	 * which causes the above calculation to put the sigcode at
	 * an invalid address.  Put it just below the text instead.
	 */
	if (va == (vaddr_t)vm_map_max(&p->p_vmspace->vm_map)) {
		va = (vaddr_t)p->p_vmspace->vm_taddr - round_page(sz);
	}
#endif /* __alpha__ */
->pgops
->pgo_reference
)(uobj
);
1505 error
= uvm_map(&p
->p_vmspace
->vm_map
, &va
, round_page(sz
),
1507 UVM_MAPFLAG(UVM_PROT_RX
, UVM_PROT_RX
, UVM_INH_SHARE
,
1508 UVM_ADV_RANDOM
, 0));
1510 (*uobj
->pgops
->pgo_detach
)(uobj
);
1513 p
->p_sigctx
.ps_sigcode
= (void *)va
;