2 * Copyright (c) 1993, David Greenman
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
30 #include "opt_hwpmc_hooks.h"
31 #include "opt_kdtrace.h"
32 #include "opt_ktrace.h"
36 #include <sys/param.h>
37 #include <sys/systm.h>
38 #include <sys/eventhandler.h>
40 #include <sys/mutex.h>
41 #include <sys/sysproto.h>
42 #include <sys/signalvar.h>
43 #include <sys/kernel.h>
44 #include <sys/mount.h>
45 #include <sys/filedesc.h>
46 #include <sys/fcntl.h>
49 #include <sys/imgact.h>
50 #include <sys/imgact_elf.h>
52 #include <sys/malloc.h>
55 #include <sys/pioctl.h>
56 #include <sys/namei.h>
57 #include <sys/resourcevar.h>
59 #include <sys/sf_buf.h>
60 #include <sys/syscallsubr.h>
61 #include <sys/sysent.h>
63 #include <sys/sysctl.h>
64 #include <sys/vnode.h>
67 #include <sys/ktrace.h>
71 #include <vm/vm_param.h>
73 #include <vm/vm_page.h>
74 #include <vm/vm_map.h>
75 #include <vm/vm_kern.h>
76 #include <vm/vm_extern.h>
77 #include <vm/vm_object.h>
78 #include <vm/vm_pager.h>
81 #include <sys/pmckern.h>
84 #include <machine/reg.h>
86 #include <security/audit/audit.h>
87 #include <security/mac/mac_framework.h>
90 #include <sys/dtrace_bsd.h>
/*
 * DTrace integration.  dtrace_fasttrap_exec is the hook pointer that
 * do_execve() calls, when it is non-NULL, to tell the fasttrap provider
 * about an exec.  The SDT definitions that follow declare the
 * proc:kernel::exec, exec_failure and exec_success probes and give the
 * type of argument 0 of each ("char *", "int" and "char *").
 */
91 dtrace_execexit_func_t dtrace_fasttrap_exec
;
94 SDT_PROVIDER_DECLARE(proc
);
95 SDT_PROBE_DEFINE(proc
, kernel
, , exec
);
96 SDT_PROBE_ARGTYPE(proc
, kernel
, , exec
, 0, "char *");
97 SDT_PROBE_DEFINE(proc
, kernel
, , exec_failure
);
98 SDT_PROBE_ARGTYPE(proc
, kernel
, , exec_failure
, 0, "int");
99 SDT_PROBE_DEFINE(proc
, kernel
, , exec_success
);
100 SDT_PROBE_ARGTYPE(proc
, kernel
, , exec_success
, 0, "char *");
/*
 * M_PARGS is the malloc type ("proc-args") described as holding process
 * arguments.  The prototypes that follow forward-declare the three sysctl
 * handlers and the static helpers do_execve() and exec_free_args() that
 * are defined later in this file.
 * NOTE(review): the tail of the do_execve() prototype is not visible in
 * this view.
 */
102 MALLOC_DEFINE(M_PARGS
, "proc-args", "Process arguments");
104 static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS
);
105 static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS
);
106 static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS
);
107 static int do_execve(struct thread
*td
, struct image_args
*args
,
109 static void exec_free_args(struct image_args
*);
/*
 * Sysctl nodes exported by this file: the read-only kern.ps_strings,
 * kern.usrstack and kern.stackprot, each served by one of the handlers
 * defined below, and the read-write kern.ps_arg_cache_limit.  The latter
 * starts at PAGE_SIZE / 16 and is the bound do_execve() compares against
 * before it allocates an argument cache for the new image.
 */
111 /* XXX This should be vm_size_t. */
112 SYSCTL_PROC(_kern
, KERN_PS_STRINGS
, ps_strings
, CTLTYPE_ULONG
|CTLFLAG_RD
,
113 NULL
, 0, sysctl_kern_ps_strings
, "LU", "");
115 /* XXX This should be vm_size_t. */
116 SYSCTL_PROC(_kern
, KERN_USRSTACK
, usrstack
, CTLTYPE_ULONG
|CTLFLAG_RD
,
117 NULL
, 0, sysctl_kern_usrstack
, "LU", "");
119 SYSCTL_PROC(_kern
, OID_AUTO
, stackprot
, CTLTYPE_INT
|CTLFLAG_RD
,
120 NULL
, 0, sysctl_kern_stackprot
, "I", "");
122 u_long ps_arg_cache_limit
= PAGE_SIZE
/ 16;
123 SYSCTL_ULONG(_kern
, OID_AUTO
, ps_arg_cache_limit
, CTLFLAG_RW
,
124 &ps_arg_cache_limit
, 0, "");
/*
 * Sysctl handler behind kern.ps_strings.  It copies
 * p->p_sysent->sv_psstrings out to the requester with SYSCTL_OUT().  When
 * req->flags has SCTL_MASK32 set, the value is first converted to an
 * unsigned int, stored in val, and that copy is sent instead.  The result
 * of SYSCTL_OUT() is kept in error.
 * NOTE(review): the declarations of p, val and error and the return
 * statement are not visible in this view -- confirm against the full file.
 */
127 sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS
)
134 if (req
->flags
& SCTL_MASK32
) {
136 val
= (unsigned int)p
->p_sysent
->sv_psstrings
;
137 error
= SYSCTL_OUT(req
, &val
, sizeof(val
));
140 error
= SYSCTL_OUT(req
, &p
->p_sysent
->sv_psstrings
,
141 sizeof(p
->p_sysent
->sv_psstrings
));
/*
 * Sysctl handler behind kern.usrstack.  It copies
 * p->p_sysent->sv_usrstack out to the requester with SYSCTL_OUT().  When
 * req->flags has SCTL_MASK32 set, the value is first converted to an
 * unsigned int, stored in val, and that copy is sent instead.  The result
 * of SYSCTL_OUT() is kept in error.
 * NOTE(review): the declarations of p, val and error and the return
 * statement are not visible in this view -- confirm against the full file.
 */
146 sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS
)
153 if (req
->flags
& SCTL_MASK32
) {
155 val
= (unsigned int)p
->p_sysent
->sv_usrstack
;
156 error
= SYSCTL_OUT(req
, &val
, sizeof(val
));
159 error
= SYSCTL_OUT(req
, &p
->p_sysent
->sv_usrstack
,
160 sizeof(p
->p_sysent
->sv_usrstack
));
/*
 * Sysctl handler behind kern.stackprot.  Returns the result of copying
 * p->p_sysent->sv_stackprot out to the requester with SYSCTL_OUT().
 * NOTE(review): the declaration of p is not visible in this view.
 */
165 sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS
)
170 return (SYSCTL_OUT(req
, &p
->p_sysent
->sv_stackprot
,
171 sizeof(p
->p_sysent
->sv_stackprot
)));
175 * Each of the items is a pointer to a `const struct execsw', hence the
176 * double pointer here.
/*
 * Table of registered image activators.  do_execve() indexes it until it
 * reaches a NULL entry; exec_register() and exec_unregister() allocate a
 * new array with malloc() and free the old one.
 */
178 static const struct execsw
**execsw
;
/*
 * execve() system call path.  The file name, argument vector and
 * environment vector named in uap are copied in from user space
 * (UIO_USERSPACE) with exec_copyin_args(), and kern_execve() is called
 * with a NULL MAC argument.
 * NOTE(review): the function header and whatever test of error sits
 * between the two calls are not visible in this view -- confirm against
 * the full file.
 */
180 #ifndef _SYS_SYSPROTO_H_
191 struct execve_args
/* {
198 struct image_args args
;
200 error
= exec_copyin_args(&args
, uap
->fname
, UIO_USERSPACE
,
201 uap
->argv
, uap
->envv
);
203 error
= kern_execve(td
, &args
, NULL
);
/*
 * fexecve() system call path.  No path name is copied in: the file name
 * passed to exec_copyin_args() is NULL (with UIO_SYSSPACE), and only
 * uap->argv and uap->envv are gathered.  kern_execve() is then called with
 * a NULL MAC argument.  do_execve() has a separate branch that obtains the
 * image vnode from args->fd with fgetvp().
 * NOTE(review): the statement that stores the descriptor in args.fd and
 * any test of error between the two calls are not visible in this view.
 */
207 #ifndef _SYS_SYSPROTO_H_
208 struct fexecve_args
{
215 fexecve(struct thread
*td
, struct fexecve_args
*uap
)
218 struct image_args args
;
220 error
= exec_copyin_args(&args
, NULL
, UIO_SYSSPACE
,
221 uap
->argv
, uap
->envv
);
224 error
= kern_execve(td
, &args
, NULL
);
/*
 * __mac_execve() system call path.  Copies in uap->fname, uap->argv and
 * uap->envv from user space (UIO_USERSPACE) with exec_copyin_args() and
 * calls kern_execve(), this time passing uap->mac_p as the MAC argument.
 * NOTE(review): any #ifdef around the body and any test of error between
 * the two calls are not visible in this view.
 */
229 #ifndef _SYS_SYSPROTO_H_
230 struct __mac_execve_args
{
239 __mac_execve(td
, uap
)
241 struct __mac_execve_args
/* {
250 struct image_args args
;
252 error
= exec_copyin_args(&args
, uap
->fname
, UIO_USERSPACE
,
253 uap
->argv
, uap
->envv
);
255 error
= kern_execve(td
, &args
, uap
->mac_p
);
263 * XXX: kern_execve has the astonishing property of not always returning to
264 * the caller. If sufficiently bad things happen during the call to
265 * do_execve(), it can end up calling exit1(); as a result, callers must
266 * avoid doing anything which they might need to undo (e.g., allocating
/*
 * Common back end for the exec system calls in this file.
 *
 * td    - calling thread; its process is td->td_proc.
 * args  - image arguments; passed to do_execve() and, on the early
 *         failure path below, released with exec_free_args().
 * mac_p - passed through unchanged to do_execve().
 *
 * The argument and environment strings are recorded with AUDIT_ARG().  If
 * the process has P_HADTHREADS set, thread_single(SINGLE_BOUNDARY) is
 * attempted first; when that call returns non-zero the arguments are
 * freed and ERESTART is returned so that the call is tried again later.
 * do_execve() then does the real work, and for a P_HADTHREADS process
 * thread_single(SINGLE_EXIT) is used to make the other threads exit.
 * NOTE(review): several lines of this function, including the branch
 * taken when do_execve() fails and the final return, are not visible in
 * this view.
 */
270 kern_execve(td
, args
, mac_p
)
272 struct image_args
*args
;
275 struct proc
*p
= td
->td_proc
;
278 AUDIT_ARG(argv
, args
->begin_argv
, args
->argc
,
279 args
->begin_envv
- args
->begin_argv
);
280 AUDIT_ARG(envv
, args
->begin_envv
, args
->envc
,
281 args
->endp
- args
->begin_envv
);
282 if (p
->p_flag
& P_HADTHREADS
) {
284 if (thread_single(SINGLE_BOUNDARY
)) {
286 exec_free_args(args
);
287 return (ERESTART
); /* Try again later. */
292 error
= do_execve(td
, args
, mac_p
);
294 if (p
->p_flag
& P_HADTHREADS
) {
297 * If success, we upgrade to SINGLE_EXIT state to
298 * force other threads to suicide.
301 thread_single(SINGLE_EXIT
);
311 * In-kernel implementation of execve(). All arguments are assumed to be
312 * userspace pointers from the passed thread.
/*
 * Worker for kern_execve(): replace the image of td->td_proc with the
 * program described by args.
 *
 * td    - calling thread.
 * args  - file name or descriptor plus the argument and environment
 *         strings.
 * mac_p - handed to mac_execve_enter().
 *
 * Outline, as far as the statements visible here show:
 *  - assert that P_INEXEC is clear, then set it;
 *  - initialise the image_params fields and fire the proc:kernel::exec
 *    probe;
 *  - initialise a namei lookup when args->fname is non-NULL; fgetvp() on
 *    args->fd is used to obtain the vnode from a descriptor;
 *  - exec_check_permissions(), take a reference on the vnode's VM object,
 *    set VV_TEXT and map the first page of the file;
 *  - call sv_imgact_try if the sysentvec has one, then each execsw[] entry
 *    for as long as the result is -1;
 *  - for an interpreted image, unmap the first page, clear VV_TEXT, close
 *    the vnode, drop the object reference and re-initialise the lookup for
 *    imgp->interpreter_name;
 *  - copy out the strings (sv_copyout_strings or exec_copyout_strings()),
 *    run sv_fixup or push argc, cache the arguments when they fit under
 *    ps_arg_cache_limit, give the process its own sigacts if they were
 *    shared, and set p_comm and td_name;
 *  - clear P_PPWAIT, then apply set-id / MAC credential changes unless the
 *    mount has MNT_NOSUID or the process has P_TRACED;
 *  - call the fasttrap hook, post NOTE_EXEC on p->p_klist, clear P_INEXEC,
 *    send SIGTRAP to a traced process, run the hwpmc hook and set the
 *    registers with sv_setregs or exec_setregs();
 *  - on the way out, unmap the first page, close the vnode, drop the
 *    object reference and free the arguments.
 * If error is set and imgp->vmspace_destroyed is true at the end, the
 * process is terminated with exit1(td, W_EXITCODE(0, SIGABRT)).
 * NOTE(review): many lines (braces, labels, preprocessor conditionals,
 * returns) are missing from this view, so the exact nesting and order of
 * the steps above must be confirmed against the full file.
 */
315 do_execve(td
, args
, mac_p
)
317 struct image_args
*args
;
320 struct proc
*p
= td
->td_proc
;
321 struct nameidata nd
, *ndp
;
322 struct ucred
*newcred
= NULL
, *oldcred
;
323 struct uidinfo
*euip
;
324 register_t
*stack_base
;
325 int error
, len
= 0, i
;
326 struct image_params image_params
, *imgp
;
328 int (*img_first
)(struct image_params
*);
329 struct pargs
*oldargs
= NULL
, *newargs
= NULL
;
330 struct sigacts
*oldsigacts
, *newsigacts
;
332 struct vnode
*tracevp
= NULL
;
333 struct ucred
*tracecred
= NULL
;
335 struct vnode
*textvp
= NULL
, *binvp
= NULL
;
336 int credential_changing
;
340 struct label
*interpvplabel
= NULL
;
344 struct pmckern_procexec pe
;
346 static const char fexecv_proc_title
[] = "(fexecv)";
349 imgp
= &image_params
;
352 * Lock the process and set the P_INEXEC flag to indicate that
353 * it should be left alone until we're done here. This is
354 * necessary to avoid race conditions - e.g. in ptrace() -
355 * that might allow a local user to illicitly obtain elevated
359 KASSERT((p
->p_flag
& P_INEXEC
) == 0,
360 ("%s(): process already has P_INEXEC flag", __func__
));
361 p
->p_flag
|= P_INEXEC
;
365 * Initialize part of the common data
368 imgp
->execlabel
= NULL
;
370 imgp
->entry_addr
= 0;
371 imgp
->vmspace_destroyed
= 0;
372 imgp
->interpreted
= 0;
374 imgp
->interpreter_name
= args
->buf
+ PATH_MAX
+ ARG_MAX
;
375 imgp
->auxargs
= NULL
;
378 imgp
->firstpage
= NULL
;
379 imgp
->ps_strings
= 0;
380 imgp
->auxarg_size
= 0;
384 error
= mac_execve_enter(imgp
, mac_p
);
389 imgp
->image_header
= NULL
;
391 SDT_PROBE(proc
, kernel
, , exec
, args
->fname
, 0, 0, 0, 0 );
394 * Translate the file name. namei() returns a vnode pointer
395 * in ni_vp amoung other things.
397 * XXXAUDIT: It would be desirable to also audit the name of the
398 * interpreter if this is an interpreted binary.
400 if (args
->fname
!= NULL
) {
402 NDINIT(ndp
, LOOKUP
, ISOPEN
| LOCKLEAF
| FOLLOW
| SAVENAME
403 | MPSAFE
| AUDITVNODE1
, UIO_SYSSPACE
, args
->fname
, td
);
407 if (args
->fname
!= NULL
) {
412 vfslocked
= NDHASGIANT(ndp
);
416 AUDIT_ARG(fd
, args
->fd
);
417 error
= fgetvp(td
, args
->fd
, &binvp
);
420 vfslocked
= VFS_LOCK_GIANT(binvp
->v_mount
);
421 vn_lock(binvp
, LK_EXCLUSIVE
| LK_RETRY
);
422 AUDIT_ARG(vnode
, binvp
, ARG_VNODE1
);
427 * Check file permissions (also 'opens' file)
429 error
= exec_check_permissions(imgp
);
431 goto exec_fail_dealloc
;
433 imgp
->object
= imgp
->vp
->v_object
;
434 if (imgp
->object
!= NULL
)
435 vm_object_reference(imgp
->object
);
438 * Set VV_TEXT now so no one can write to the executable while we're
441 * Remember if this was set before and unset it in case this is not
442 * actually an executable image.
444 textset
= imgp
->vp
->v_vflag
& VV_TEXT
;
445 imgp
->vp
->v_vflag
|= VV_TEXT
;
447 error
= exec_map_first_page(imgp
);
449 goto exec_fail_dealloc
;
451 imgp
->proc
->p_osrel
= 0;
453 * If the current process has a special image activator it
454 * wants to try first, call it. For example, emulating shell
455 * scripts differently.
458 if ((img_first
= imgp
->proc
->p_sysent
->sv_imgact_try
) != NULL
)
459 error
= img_first(imgp
);
462 * Loop through the list of image activators, calling each one.
463 * An activator returns -1 if there is no match, 0 on success,
464 * and an error otherwise.
466 for (i
= 0; error
== -1 && execsw
[i
]; ++i
) {
467 if (execsw
[i
]->ex_imgact
== NULL
||
468 execsw
[i
]->ex_imgact
== img_first
) {
471 error
= (*execsw
[i
]->ex_imgact
)(imgp
);
477 imgp
->vp
->v_vflag
&= ~VV_TEXT
;
480 goto exec_fail_dealloc
;
484 * Special interpreter operation, cleanup and loop up to try to
485 * activate the interpreter.
487 if (imgp
->interpreted
) {
488 exec_unmap_first_page(imgp
);
490 * VV_TEXT needs to be unset for scripts. There is a short
491 * period before we determine that something is a script where
492 * VV_TEXT will be set. The vnode lock is held over this
493 * entire period so nothing should illegitimately be blocked.
495 imgp
->vp
->v_vflag
&= ~VV_TEXT
;
496 /* free name buffer and old vnode */
497 if (args
->fname
!= NULL
)
498 NDFREE(ndp
, NDF_ONLY_PNBUF
);
500 mac_execve_interpreter_enter(binvp
, &interpvplabel
);
503 VOP_CLOSE(binvp
, FREAD
, td
->td_ucred
, td
);
507 vm_object_deallocate(imgp
->object
);
509 VFS_UNLOCK_GIANT(vfslocked
);
511 /* set new name to that of the interpreter */
512 NDINIT(ndp
, LOOKUP
, LOCKLEAF
| FOLLOW
| SAVENAME
| MPSAFE
,
513 UIO_SYSSPACE
, imgp
->interpreter_name
, td
);
514 args
->fname
= imgp
->interpreter_name
;
519 * NB: We unlock the vnode here because it is believed that none
520 * of the sv_copyout_strings/sv_fixup operations require the vnode.
522 VOP_UNLOCK(imgp
->vp
, 0);
524 * Copy out strings (args and env) and initialize stack base
526 if (p
->p_sysent
->sv_copyout_strings
)
527 stack_base
= (*p
->p_sysent
->sv_copyout_strings
)(imgp
);
529 stack_base
= exec_copyout_strings(imgp
);
532 * If custom stack fixup routine present for this process
533 * let it do the stack setup.
534 * Else stuff argument count as first item on stack
536 if (p
->p_sysent
->sv_fixup
!= NULL
)
537 (*p
->p_sysent
->sv_fixup
)(&stack_base
, imgp
);
539 suword(--stack_base
, imgp
->args
->argc
);
542 * For security and other reasons, the file descriptor table cannot
543 * be shared after an exec.
548 * Malloc things before we need locks.
551 euip
= uifind(attr
.va_uid
);
552 i
= imgp
->args
->begin_envv
- imgp
->args
->begin_argv
;
553 /* Cache arguments if they fit inside our allowance */
554 if (ps_arg_cache_limit
>= i
+ sizeof(struct pargs
)) {
555 newargs
= pargs_alloc(i
);
556 bcopy(imgp
->args
->begin_argv
, newargs
->ar_args
, i
);
559 /* close files on exec */
561 vn_lock(imgp
->vp
, LK_EXCLUSIVE
| LK_RETRY
);
563 /* Get a reference to the vnode prior to locking the proc */
567 * For security and other reasons, signal handlers cannot
568 * be shared after an exec. The new process gets a copy of the old
569 * handlers. In execsigs(), the new process will have its signals
573 if (sigacts_shared(p
->p_sigacts
)) {
574 oldsigacts
= p
->p_sigacts
;
576 newsigacts
= sigacts_alloc();
577 sigacts_copy(newsigacts
, oldsigacts
);
579 p
->p_sigacts
= newsigacts
;
586 /* reset caught signals */
589 /* name this process - nameiexec(p, ndp) */
591 len
= min(ndp
->ni_cnd
.cn_namelen
,MAXCOMLEN
);
592 bcopy(ndp
->ni_cnd
.cn_nameptr
, p
->p_comm
, len
);
595 if (vn_commname(binvp
, p
->p_comm
, MAXCOMLEN
+ 1) == 0)
598 len
= sizeof(fexecv_proc_title
);
599 bcopy(fexecv_proc_title
, p
->p_comm
, len
);
603 bcopy(p
->p_comm
, td
->td_name
, sizeof(td
->td_name
));
606 * mark as execed, wakeup the process that vforked (if any) and tell
607 * it that it now has its own resources back
610 if (p
->p_pptr
&& (p
->p_flag
& P_PPWAIT
)) {
611 p
->p_flag
&= ~P_PPWAIT
;
616 * Implement image setuid/setgid.
618 * Don't honor setuid/setgid if the filesystem prohibits it or if
619 * the process is being traced.
621 * XXXMAC: For the time being, use NOSUID to also prohibit
622 * transitions on the file system.
624 oldcred
= p
->p_ucred
;
625 credential_changing
= 0;
626 credential_changing
|= (attr
.va_mode
& S_ISUID
) && oldcred
->cr_uid
!=
628 credential_changing
|= (attr
.va_mode
& S_ISGID
) && oldcred
->cr_gid
!=
631 will_transition
= mac_vnode_execve_will_transition(oldcred
, imgp
->vp
,
632 interpvplabel
, imgp
);
633 credential_changing
|= will_transition
;
636 if (credential_changing
&&
637 (imgp
->vp
->v_mount
->mnt_flag
& MNT_NOSUID
) == 0 &&
638 (p
->p_flag
& P_TRACED
) == 0) {
640 * Turn off syscall tracing for set-id programs, except for
641 * root. Record any set-id flags first to make sure that
642 * we do not regain any tracing during a possible block.
647 if (p
->p_tracevp
!= NULL
&&
648 priv_check_cred(oldcred
, PRIV_DEBUG_DIFFCRED
, 0)) {
649 mtx_lock(&ktrace_mtx
);
651 tracevp
= p
->p_tracevp
;
653 tracecred
= p
->p_tracecred
;
654 p
->p_tracecred
= NULL
;
655 mtx_unlock(&ktrace_mtx
);
659 * Close any file descriptors 0..2 that reference procfs,
660 * then make sure file descriptors 0..2 are in use.
662 * setugidsafety() may call closef() and then pfind()
663 * which may grab the process lock.
664 * fdcheckstd() may call falloc() which may block to
665 * allocate memory, so temporarily drop the process lock.
669 VOP_UNLOCK(imgp
->vp
, 0);
670 error
= fdcheckstd(td
);
671 vn_lock(imgp
->vp
, LK_EXCLUSIVE
| LK_RETRY
);
676 * Set the new credentials.
678 crcopy(newcred
, oldcred
);
679 if (attr
.va_mode
& S_ISUID
)
680 change_euid(newcred
, euip
);
681 if (attr
.va_mode
& S_ISGID
)
682 change_egid(newcred
, attr
.va_gid
);
684 if (will_transition
) {
685 mac_vnode_execve_transition(oldcred
, newcred
, imgp
->vp
,
686 interpvplabel
, imgp
);
690 * Implement correct POSIX saved-id behavior.
692 * XXXMAC: Note that the current logic will save the
693 * uid and gid if a MAC domain transition occurs, even
694 * though maybe it shouldn't.
696 change_svuid(newcred
, newcred
->cr_uid
);
697 change_svgid(newcred
, newcred
->cr_gid
);
698 p
->p_ucred
= newcred
;
701 if (oldcred
->cr_uid
== oldcred
->cr_ruid
&&
702 oldcred
->cr_gid
== oldcred
->cr_rgid
)
703 p
->p_flag
&= ~P_SUGID
;
705 * Implement correct POSIX saved-id behavior.
707 * XXX: It's not clear that the existing behavior is
708 * POSIX-compliant. A number of sources indicate that the
709 * saved uid/gid should only be updated if the new ruid is
710 * not equal to the old ruid, or the new euid is not equal
711 * to the old euid and the new euid is not equal to the old
712 * ruid. The FreeBSD code always updates the saved uid/gid.
713 * Also, this code uses the new (replaced) euid and egid as
714 * the source, which may or may not be the right ones to use.
716 if (oldcred
->cr_svuid
!= oldcred
->cr_uid
||
717 oldcred
->cr_svgid
!= oldcred
->cr_gid
) {
718 crcopy(newcred
, oldcred
);
719 change_svuid(newcred
, newcred
->cr_uid
);
720 change_svgid(newcred
, newcred
->cr_gid
);
721 p
->p_ucred
= newcred
;
727 * Store the vp for use in procfs. This vnode was referenced prior
728 * to locking the proc lock.
730 textvp
= p
->p_textvp
;
735 * Tell the DTrace fasttrap provider about the exec if it
736 * has declared an interest.
738 if (dtrace_fasttrap_exec
)
739 dtrace_fasttrap_exec(p
);
743 * Notify others that we exec'd, and clear the P_INEXEC flag
744 * as we're now a bona fide freshly-execed process.
746 KNOTE_LOCKED(&p
->p_klist
, NOTE_EXEC
);
747 p
->p_flag
&= ~P_INEXEC
;
750 * If tracing the process, trap to debugger so breakpoints
751 * can be set before the program executes.
752 * Use tdsignal to deliver signal to current thread, use
753 * psignal may cause the signal to be delivered to wrong thread
754 * because that thread will exit, remember we are going to enter
755 * single thread mode.
757 if (p
->p_flag
& P_TRACED
)
758 tdsignal(p
, td
, SIGTRAP
, NULL
);
760 /* clear "fork but no exec" flag, as we _are_ execing */
761 p
->p_acflag
&= ~AFORK
;
764 * Free any previous argument cache and replace it with
765 * the new argument cache, if any.
773 * Check if system-wide sampling is in effect or if the
774 * current process is using PMCs. If so, do exec() time
775 * processing. This processing needs to happen AFTER the
776 * P_INEXEC flag is cleared.
778 * The proc lock needs to be released before taking the PMC
781 if (PMC_SYSTEM_SAMPLING_ACTIVE() || PMC_PROC_IS_USING_PMCS(p
)) {
783 pe
.pm_credentialschanged
= credential_changing
;
784 pe
.pm_entryaddr
= imgp
->entry_addr
;
786 PMC_CALL_HOOK_X(td
, PMC_FN_PROCESS_EXEC
, (void *) &pe
);
789 #else /* !HWPMC_HOOKS */
793 /* Set values passed into the program in registers. */
794 if (p
->p_sysent
->sv_setregs
)
795 (*p
->p_sysent
->sv_setregs
)(td
, imgp
->entry_addr
,
796 (u_long
)(uintptr_t)stack_base
, imgp
->ps_strings
);
798 exec_setregs(td
, imgp
->entry_addr
,
799 (u_long
)(uintptr_t)stack_base
, imgp
->ps_strings
);
801 vfs_mark_atime(imgp
->vp
, td
->td_ucred
);
806 * Free any resources malloc'd earlier that we didn't use.
813 VOP_UNLOCK(imgp
->vp
, 0);
815 SDT_PROBE(proc
, kernel
, , exec_success
, args
->fname
, 0, 0, 0, 0);
818 * Handle deferred decrement of ref counts.
820 if (textvp
!= NULL
) {
823 tvfslocked
= VFS_LOCK_GIANT(textvp
->v_mount
);
825 VFS_UNLOCK_GIANT(tvfslocked
);
827 if (binvp
&& error
!= 0)
830 if (tracevp
!= NULL
) {
833 tvfslocked
= VFS_LOCK_GIANT(tracevp
->v_mount
);
835 VFS_UNLOCK_GIANT(tvfslocked
);
837 if (tracecred
!= NULL
)
840 vn_lock(imgp
->vp
, LK_EXCLUSIVE
| LK_RETRY
);
843 if (oldsigacts
!= NULL
)
844 sigacts_free(oldsigacts
);
849 * free various allocated resources
851 if (imgp
->firstpage
!= NULL
)
852 exec_unmap_first_page(imgp
);
854 if (imgp
->vp
!= NULL
) {
856 NDFREE(ndp
, NDF_ONLY_PNBUF
);
858 VOP_CLOSE(imgp
->vp
, FREAD
, td
->td_ucred
, td
);
862 if (imgp
->object
!= NULL
)
863 vm_object_deallocate(imgp
->object
);
867 * Stop the process here if its stop event mask has
868 * the S_EXEC bit set.
870 STOPEVENT(p
, S_EXEC
, 0);
875 /* we're done here, clear P_INEXEC */
877 p
->p_flag
&= ~P_INEXEC
;
880 SDT_PROBE(proc
, kernel
, , exec_failure
, error
, 0, 0, 0, 0);
884 mac_execve_exit(imgp
);
885 mac_execve_interpreter_exit(interpvplabel
);
887 VFS_UNLOCK_GIANT(vfslocked
);
888 exec_free_args(args
);
890 if (error
&& imgp
->vmspace_destroyed
) {
891 /* sorry, no more process anymore. exit gracefully */
892 exit1(td
, W_EXITCODE(0, SIGABRT
));
/*
 * Map the first page of the image file into kernel address space.
 *
 * Any mapping already held in imgp->firstpage is released first.  Page 0
 * of imgp->vp->v_object is grabbed; if it is not fully valid, up to
 * VM_INITIAL_PAGEIN pages (clamped to object->size) are gathered with
 * vm_page_lookup() / vm_page_alloc() and read in with
 * vm_pager_get_pages().  Page 0 is then mapped with sf_buf_alloc() and its
 * kernel virtual address is stored in imgp->image_header.
 * NOTE(review): the error returns and the statements between the
 * vm_page_lock_queues() / vm_page_unlock_queues() pairs are not visible in
 * this view.
 */
899 exec_map_first_page(imgp
)
900 struct image_params
*imgp
;
904 vm_page_t ma
[VM_INITIAL_PAGEIN
];
907 if (imgp
->firstpage
!= NULL
)
908 exec_unmap_first_page(imgp
);
910 object
= imgp
->vp
->v_object
;
913 VM_OBJECT_LOCK(object
);
914 #if VM_NRESERVLEVEL > 0
915 if ((object
->flags
& OBJ_COLORED
) == 0) {
916 object
->flags
|= OBJ_COLORED
;
917 object
->pg_color
= 0;
920 ma
[0] = vm_page_grab(object
, 0, VM_ALLOC_NORMAL
| VM_ALLOC_RETRY
);
921 if ((ma
[0]->valid
& VM_PAGE_BITS_ALL
) != VM_PAGE_BITS_ALL
) {
922 initial_pagein
= VM_INITIAL_PAGEIN
;
923 if (initial_pagein
> object
->size
)
924 initial_pagein
= object
->size
;
925 for (i
= 1; i
< initial_pagein
; i
++) {
926 if ((ma
[i
] = vm_page_lookup(object
, i
)) != NULL
) {
929 if ((ma
[i
]->oflags
& VPO_BUSY
) || ma
[i
]->busy
)
933 ma
[i
] = vm_page_alloc(object
, i
,
934 VM_ALLOC_NORMAL
| VM_ALLOC_IFNOTCACHED
);
940 rv
= vm_pager_get_pages(object
, ma
, initial_pagein
, 0);
941 ma
[0] = vm_page_lookup(object
, 0);
942 if ((rv
!= VM_PAGER_OK
) || (ma
[0] == NULL
) ||
943 (ma
[0]->valid
== 0)) {
945 vm_page_lock_queues();
947 vm_page_unlock_queues();
949 VM_OBJECT_UNLOCK(object
);
953 vm_page_lock_queues();
955 vm_page_unlock_queues();
956 vm_page_wakeup(ma
[0]);
957 VM_OBJECT_UNLOCK(object
);
959 imgp
->firstpage
= sf_buf_alloc(ma
[0], 0);
960 imgp
->image_header
= (char *)sf_buf_kva(imgp
->firstpage
);
/*
 * Undo exec_map_first_page(): when imgp->firstpage is set, fetch its page
 * with sf_buf_page(), free the sf_buf and reset imgp->firstpage to NULL.
 * Nothing is done when imgp->firstpage is already NULL.
 * NOTE(review): the statement executed while the page queues are locked
 * is not visible in this view.
 */
966 exec_unmap_first_page(imgp
)
967 struct image_params
*imgp
;
971 if (imgp
->firstpage
!= NULL
) {
972 m
= sf_buf_page(imgp
->firstpage
);
973 sf_buf_free(imgp
->firstpage
);
974 imgp
->firstpage
= NULL
;
975 vm_page_lock_queues();
977 vm_page_unlock_queues();
982 * Destroy old address space, and allocate a new stack
983 * The new stack is only SGROWSIZ large because it is grown
984 * automatically in trap.c.
/*
 * Give the process in imgp->proc an empty address space laid out for sv.
 *
 * imgp->vmspace_destroyed is set to 1 up front and the process_exec event
 * handlers are invoked.  If the vmspace has a single reference and its
 * map bounds already equal sv->sv_minuser and sv->sv_maxuser, the existing
 * map is emptied with pmap_remove_pages() and vm_map_remove();
 * vmspace_exec() is the call that builds a replacement.  A downward-growing
 * stack ending at sv->sv_usrstack is then created with vm_map_stack(),
 * sized from *sv->sv_maxssiz when that pointer is non-NULL, and vm_ssize
 * and vm_maxsaddr are updated so the stack rlimit can still be checked.
 * NOTE(review): the else branches, the error checks and the condition
 * guarding the IA64_BACKINGSTORE register stack are not visible in this
 * view.
 */
987 exec_new_vmspace(imgp
, sv
)
988 struct image_params
*imgp
;
989 struct sysentvec
*sv
;
992 struct proc
*p
= imgp
->proc
;
993 struct vmspace
*vmspace
= p
->p_vmspace
;
994 vm_offset_t stack_addr
;
998 imgp
->vmspace_destroyed
= 1;
1001 /* May be called with Giant held */
1002 EVENTHANDLER_INVOKE(process_exec
, p
, imgp
);
1005 * Blow away entire process VM, if address space not shared,
1006 * otherwise, create a new VM space so that other threads are
1009 map
= &vmspace
->vm_map
;
1010 if (vmspace
->vm_refcnt
== 1 && vm_map_min(map
) == sv
->sv_minuser
&&
1011 vm_map_max(map
) == sv
->sv_maxuser
) {
1013 pmap_remove_pages(vmspace_pmap(vmspace
));
1014 vm_map_remove(map
, vm_map_min(map
), vm_map_max(map
));
1016 error
= vmspace_exec(p
, sv
->sv_minuser
, sv
->sv_maxuser
);
1019 vmspace
= p
->p_vmspace
;
1020 map
= &vmspace
->vm_map
;
1023 /* Allocate a new stack */
1024 if (sv
->sv_maxssiz
!= NULL
)
1025 ssiz
= *sv
->sv_maxssiz
;
1028 stack_addr
= sv
->sv_usrstack
- ssiz
;
1029 error
= vm_map_stack(map
, stack_addr
, (vm_size_t
)ssiz
,
1030 sv
->sv_stackprot
, VM_PROT_ALL
, MAP_STACK_GROWS_DOWN
);
1035 /* Allocate a new register stack */
1036 stack_addr
= IA64_BACKINGSTORE
;
1037 error
= vm_map_stack(map
, stack_addr
, (vm_size_t
)ssiz
,
1038 sv
->sv_stackprot
, VM_PROT_ALL
, MAP_STACK_GROWS_UP
);
1043 /* vm_ssize and vm_maxsaddr are somewhat antiquated concepts in the
1044 * VM_STACK case, but they are still used to monitor the size of the
1045 * process stack so we can check the stack rlimit.
1047 vmspace
->vm_ssize
= sgrowsiz
>> PAGE_SHIFT
;
1048 vmspace
->vm_maxsaddr
= (char *)sv
->sv_usrstack
- ssiz
;
1054 * Copy out argument and environment strings from the old process address
1055 * space into the temporary string buffer.
/*
 * Fill *args from the caller's file name, argument vector and environment
 * vector.
 *
 * args   - zeroed here with bzero(), then populated.
 * fname  - path to copy, or NULL to skip the file name; copied with
 *          copystr() when segflg is UIO_SYSSPACE and with copyinstr()
 *          otherwise, limited to PATH_MAX bytes.
 * segflg - address space that fname lives in.
 * argv, envv - vectors of string pointers, read one entry at a time with
 *          fuword(); each string is then copied with copyinstr().
 *
 * A buffer of PATH_MAX + ARG_MAX + MAXSHELLCMDLEN bytes is taken from
 * exec_map.  The strings start at the beginning of the buffer
 * (args->stringspace counts down from ARG_MAX, args->endp is the write
 * cursor), the file name is placed at offset ARG_MAX, and
 * args->begin_envv records where the environment strings begin.
 * exec_free_args() is called from here; presumably that is the error
 * exit, but the label and branches leading to it are not visible in this
 * view -- confirm against the full file.
 */
1058 exec_copyin_args(struct image_args
*args
, char *fname
,
1059 enum uio_seg segflg
, char **argv
, char **envv
)
1067 bzero(args
, sizeof(*args
));
1071 * Allocate temporary demand zeroed space for argument and
1072 * environment strings:
1074 * o ARG_MAX for argument and environment;
1075 * o MAXSHELLCMDLEN for the name of interpreters.
1077 args
->buf
= (char *) kmem_alloc_wait(exec_map
,
1078 PATH_MAX
+ ARG_MAX
+ MAXSHELLCMDLEN
);
1079 if (args
->buf
== NULL
)
1081 args
->begin_argv
= args
->buf
;
1082 args
->endp
= args
->begin_argv
;
1083 args
->stringspace
= ARG_MAX
;
1085 * Copy the file name.
1087 if (fname
!= NULL
) {
1088 args
->fname
= args
->buf
+ ARG_MAX
;
1089 error
= (segflg
== UIO_SYSSPACE
) ?
1090 copystr(fname
, args
->fname
, PATH_MAX
, &length
) :
1091 copyinstr(fname
, args
->fname
, PATH_MAX
, &length
);
1098 * extract arguments first
1100 while ((argp
= (caddr_t
) (intptr_t) fuword(argv
++))) {
1101 if (argp
== (caddr_t
) -1) {
1105 if ((error
= copyinstr(argp
, args
->endp
,
1106 args
->stringspace
, &length
))) {
1107 if (error
== ENAMETOOLONG
)
1111 args
->stringspace
-= length
;
1112 args
->endp
+= length
;
1116 args
->begin_envv
= args
->endp
;
1119 * extract environment strings
1122 while ((envp
= (caddr_t
)(intptr_t)fuword(envv
++))) {
1123 if (envp
== (caddr_t
)-1) {
1127 if ((error
= copyinstr(envp
, args
->endp
,
1128 args
->stringspace
, &length
))) {
1129 if (error
== ENAMETOOLONG
)
1133 args
->stringspace
-= length
;
1134 args
->endp
+= length
;
1142 exec_free_args(args
);
/*
 * Give the args->buf buffer, PATH_MAX + ARG_MAX + MAXSHELLCMDLEN bytes
 * long, back to exec_map with kmem_free_wakeup().
 * NOTE(review): any guard around the call is not visible in this view.
 */
1147 exec_free_args(struct image_args
*args
)
1151 kmem_free_wakeup(exec_map
, (vm_offset_t
)args
->buf
,
1152 PATH_MAX
+ ARG_MAX
+ MAXSHELLCMDLEN
);
1158 * Copy strings out to the new process address space, constructing new arg
1159 * and env vector tables. Return a pointer to the base so that it can be used
1160 * as the initial stack pointer.
/*
 * Place the argument and environment strings on the new user stack.
 * do_execve() calls this routine or, when the sysentvec provides one, the
 * sv_copyout_strings hook.
 *
 * Working down from the ps_strings structure at sv_psstrings, the code
 * leaves room for the signal trampoline (szsigcode bytes), SPARE_USRSPACE
 * and the string area rounded up to a multiple of sizeof(char *); below
 * that it reserves the vector table (argc + envc + 2 slots, plus
 * imgp->auxarg_size slots when imgp->auxargs is set).  The signal code
 * and the strings are copied out with copyout(), the ps_strings fields
 * are written with suword(), and one pointer per string is stored in the
 * table.
 *
 * Returns the address of the vector table, which is also the initial
 * stack base.
 * NOTE(review): the declarations, the NULL terminators written between
 * and after the two vector sets, and the bodies of the string-skipping
 * loops are not visible in this view.
 */
1163 exec_copyout_strings(imgp
)
1164 struct image_params
*imgp
;
1168 char *stringp
, *destp
;
1169 register_t
*stack_base
;
1170 struct ps_strings
*arginfo
;
1175 * Calculate string base and vector table pointers.
1176 * Also deal with signal trampoline code for this exec type.
1180 arginfo
= (struct ps_strings
*)p
->p_sysent
->sv_psstrings
;
1181 if (p
->p_sysent
->sv_szsigcode
!= NULL
)
1182 szsigcode
= *(p
->p_sysent
->sv_szsigcode
);
1183 destp
= (caddr_t
)arginfo
- szsigcode
- SPARE_USRSPACE
-
1184 roundup((ARG_MAX
- imgp
->args
->stringspace
), sizeof(char *));
1190 copyout(p
->p_sysent
->sv_sigcode
, ((caddr_t
)arginfo
-
1191 szsigcode
), szsigcode
);
1194 * If we have a valid auxargs ptr, prepare some room
1197 if (imgp
->auxargs
) {
1199 * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for
1200 * lower compatibility.
1202 imgp
->auxarg_size
= (imgp
->auxarg_size
) ? imgp
->auxarg_size
:
1205 * The '+ 2' is for the null pointers at the end of each of
1206 * the arg and env vector sets,and imgp->auxarg_size is room
1207 * for argument of Runtime loader.
1209 vectp
= (char **)(destp
- (imgp
->args
->argc
+
1210 imgp
->args
->envc
+ 2 + imgp
->auxarg_size
) *
1215 * The '+ 2' is for the null pointers at the end of each of
1216 * the arg and env vector sets
1218 vectp
= (char **)(destp
- (imgp
->args
->argc
+ imgp
->args
->envc
+ 2) *
1223 * vectp also becomes our initial stack base
1225 stack_base
= (register_t
*)vectp
;
1227 stringp
= imgp
->args
->begin_argv
;
1228 argc
= imgp
->args
->argc
;
1229 envc
= imgp
->args
->envc
;
1232 * Copy out strings - arguments and environment.
1234 copyout(stringp
, destp
, ARG_MAX
- imgp
->args
->stringspace
);
1237 * Fill in "ps_strings" struct for ps, w, etc.
1239 suword(&arginfo
->ps_argvstr
, (long)(intptr_t)vectp
);
1240 suword(&arginfo
->ps_nargvstr
, argc
);
1243 * Fill in argument portion of vector table.
1245 for (; argc
> 0; --argc
) {
1246 suword(vectp
++, (long)(intptr_t)destp
);
1247 while (*stringp
++ != 0)
1252 /* a null vector table pointer separates the argp's from the envp's */
1255 suword(&arginfo
->ps_envstr
, (long)(intptr_t)vectp
);
1256 suword(&arginfo
->ps_nenvstr
, envc
);
1259 * Fill in environment portion of vector table.
1261 for (; envc
> 0; --envc
) {
1262 suword(vectp
++, (long)(intptr_t)destp
);
1263 while (*stringp
++ != 0)
1268 /* end of vector table is a null pointer */
1271 return (stack_base
);
1275 * Check permissions of file to execute.
1276 * Called with imgp->vp locked.
1277 * Return 0 for success or error code on failure.
/*
 * Decide whether the vnode in imgp->vp may be executed with the
 * credentials in td->td_ucred.
 *
 * The checks visible here, in order: fetch the attributes into imgp->attr
 * with VOP_GETATTR(); mac_vnode_check_exec(); test for a MNT_NOEXEC mount,
 * a file with no execute bit set in va_mode (mask 0111) or a vnode that is
 * not VREG; test for a zero-length file; VOP_ACCESS() for VEXEC; test
 * v_writecount; and finally VOP_OPEN() for FREAD.
 * NOTE(review): the value returned when each check fails and the
 * declaration of td are not visible in this view.
 */
1280 exec_check_permissions(imgp
)
1281 struct image_params
*imgp
;
1283 struct vnode
*vp
= imgp
->vp
;
1284 struct vattr
*attr
= imgp
->attr
;
1290 /* Get file attributes */
1291 error
= VOP_GETATTR(vp
, attr
, td
->td_ucred
);
1296 error
= mac_vnode_check_exec(td
->td_ucred
, imgp
->vp
, imgp
);
1302 * 1) Check if file execution is disabled for the filesystem that this
1304 * 2) Insure that at least one execute bit is on - otherwise root
1305 * will always succeed, and we don't want to happen unless the
1306 * file really is executable.
1307 * 3) Insure that the file is a regular file.
1309 if ((vp
->v_mount
->mnt_flag
& MNT_NOEXEC
) ||
1310 ((attr
->va_mode
& 0111) == 0) ||
1311 (attr
->va_type
!= VREG
))
1315 * Zero length files can't be exec'd
1317 if (attr
->va_size
== 0)
1321 * Check for execute permission to file based on current credentials.
1323 error
= VOP_ACCESS(vp
, VEXEC
, td
->td_ucred
, td
);
1328 * Check number of open-for-writes on the file and deny execution
1331 if (vp
->v_writecount
)
1335 * Call filesystem specific open routine (which does nothing in the
1338 error
= VOP_OPEN(vp
, FREAD
, td
->td_ucred
, td
, NULL
);
1345 * Exec handler registration
/*
 * Add the image activator execsw_arg to the execsw[] table.
 *
 * count starts at 2 to cover the new slot and the trailing NULL.  A new
 * array of count pointers is allocated from M_TEMP with M_WAITOK, the
 * existing entries are walked twice (once before and once after the
 * allocation), and the old array is released with free().
 * NOTE(review): the loop bodies, the insertion of execsw_arg, the update
 * of the execsw pointer and the return value are not visible in this view.
 */
1348 exec_register(execsw_arg
)
1349 const struct execsw
*execsw_arg
;
1351 const struct execsw
**es
, **xs
, **newexecsw
;
1352 int count
= 2; /* New slot and trailing NULL */
1355 for (es
= execsw
; *es
; es
++)
1357 newexecsw
= malloc(count
* sizeof(*es
), M_TEMP
, M_WAITOK
);
1358 if (newexecsw
== NULL
)
1362 for (es
= execsw
; *es
; es
++)
1367 free(execsw
, M_TEMP
);
1373 exec_unregister(execsw_arg
)
1374 const struct execsw
*execsw_arg
;
1376 const struct execsw
**es
, **xs
, **newexecsw
;
1380 panic("unregister with no handlers left?\n");
1382 for (es
= execsw
; *es
; es
++) {
1383 if (*es
== execsw_arg
)
1388 for (es
= execsw
; *es
; es
++)
1389 if (*es
!= execsw_arg
)
1391 newexecsw
= malloc(count
* sizeof(*es
), M_TEMP
, M_WAITOK
);
1392 if (newexecsw
== NULL
)
1395 for (es
= execsw
; *es
; es
++)
1396 if (*es
!= execsw_arg
)
1400 free(execsw
, M_TEMP
);