/*	$NetBSD: kern_exec.c,v 1.292 2009/12/10 14:13:54 matt Exp $	*/

/*-
 * Copyright (c) 2008 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (C) 1993, 1994, 1996 Christopher G. Demetriou
 * Copyright (C) 1992 Wolfgang Solfrank.
 * Copyright (C) 1992 TooLs GmbH.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by TooLs GmbH.
 * 4. The name of TooLs GmbH may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_exec.c,v 1.292 2009/12/10 14:13:54 matt Exp $");

#include "opt_ktrace.h"
#include "opt_modular.h"
#include "opt_syscall_debug.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/mount.h>
#include <sys/malloc.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/ktrace.h>
#include <sys/uidinfo.h>
#include <sys/signalvar.h>
#include <sys/syscall.h>
#include <sys/kauth.h>
#include <sys/lwpctl.h>
#include <sys/module.h>
#include <sys/savar.h>
#include <sys/syscallvar.h>
#include <sys/syscallargs.h>
#if NVERIEXEC > 0
#include <sys/verified_exec.h>
#endif /* NVERIEXEC > 0 */

#include <uvm/uvm_extern.h>

#include <machine/reg.h>

#include <compat/common/compat_util.h>
static int exec_sigcode_map(struct proc *, const struct emul *);
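
/*
 * DPRINTF() takes a doubly parenthesized argument list, e.g.
 * DPRINTF(("execve: failed %d\n", error)), and reports exec debugging
 * detail via uprintf() on DEBUG_EXEC kernels; otherwise it expands to
 * nothing.
 */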
#ifdef DEBUG_EXEC
#define DPRINTF(a) uprintf a
#else
#define DPRINTF(a)
#endif /* DEBUG_EXEC */
/*
 * Exec function switch:
 *
 * Note that each makecmds function is responsible for loading the
 * exec package with the necessary functions for any exec-type-specific
 * handling.
 *
 * Functions for specific exec types should be defined in their own
 * files.
 */
static const struct execsw	**execsw = NULL;
static int			nexecs;

u_int	exec_maxhdrsz;	 /* must not be static - used by netbsd32 */
/* list of dynamically loaded execsw entries */
static LIST_HEAD(execlist_head, exec_entry) ex_head =
    LIST_HEAD_INITIALIZER(ex_head);
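/* One registered exec format; entries are linked on ex_head. */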
struct exec_entry {
	LIST_ENTRY(exec_entry)	ex_list;
	SLIST_ENTRY(exec_entry)	ex_slist;
	const struct execsw	*ex_sw;
};
#ifndef __HAVE_SYSCALL_INTERN
void	syscall(void);
#endif
static struct sa_emul saemul_netbsd = {
	sizeof(struct sa_t *),
	(void (*)(struct lwp *, void *))getucontext_sa,
};
/* NetBSD emul struct */
struct emul emul_netbsd = {
#ifndef __HAVE_MINIMAL_EMUL
	.e_flags =		EMUL_HAS_SYS___syscall,
	.e_nosys =		SYS_syscall,
	.e_nsysent =		SYS_NSYSENT,
#endif
#ifdef SYSCALL_DEBUG
	.e_syscallnames =	syscallnames,
#else
	.e_syscallnames =	NULL,
#endif
	.e_sendsig =		sendsig,
	.e_trapsignal =		trapsignal,
	.e_setregs =		setregs,
#ifdef __HAVE_SYSCALL_INTERN
	.e_syscall_intern =	syscall_intern,
#else
	.e_syscall =		syscall,
#endif
	.e_sysctlovly =		NULL,
	.e_vm_default_addr =	uvm_default_mapaddr,
	.e_sa =			&saemul_netbsd,
	.e_ucsize =		sizeof(ucontext_t),
	.e_startlwp =		startlwp
};
/*
 * Exec lock. Used to control access to execsw[] structures.
 * This must not be static so that netbsd32 can access it, too.
 */
krwlock_t exec_lock;

static kmutex_t sigobject_lock;
static void *
exec_pool_alloc(struct pool *pp, int flags)
{

	return (void *)uvm_km_alloc(kernel_map, NCARGS, 0,
	    UVM_KMF_PAGEABLE | UVM_KMF_WAITVA);
}

static void
exec_pool_free(struct pool *pp, void *addr)
{

	uvm_km_free(kernel_map, (vaddr_t)addr, NCARGS, UVM_KMF_PAGEABLE);
}
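
/*
 * exec_pool supplies the NCARGS-sized, pageable kernel buffers used
 * while assembling the new image's argv/envp strings; allocation and
 * release go through exec_palloc above.
 */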
static struct pool exec_pool;

static struct pool_allocator exec_palloc = {
	.pa_alloc = exec_pool_alloc,
	.pa_free = exec_pool_free,
	.pa_pagesz = NCARGS
};
/*
 * check exec:
 * given an "executable" described in the exec package's namei info,
 * see what we can do with it.
 *
 * ON ENTRY:
 *	exec package with appropriate namei info
 *	lwp pointer of exec'ing lwp
 *	NO SELF-LOCKED VNODES
 *
 * ON EXIT:
 *	error:	nothing held, etc.  exec header still allocated.
 *	ok:	filled exec package, executable's vnode (unlocked).
 *
 * EXEC SWITCH ENTRY:
 *	Locked vnode to check, exec package, proc.
 *
 * EXEC SWITCH EXIT:
 *	ok:	return 0, filled exec package, executable's vnode (unlocked).
 *	error:	destructive:
 *			everything deallocated except exec header.
 *		non-destructive:
 *			error code, executable's vnode (unlocked),
 *			exec header unmodified.
 */
int
check_exec(struct lwp *l, struct exec_package *epp)
{
	int		error, newerror, i;
	struct vnode	*vp;
	struct nameidata *ndp;
	size_t		resid;

	ndp = epp->ep_ndp;
	ndp->ni_cnd.cn_nameiop = LOOKUP;
	ndp->ni_cnd.cn_flags = FOLLOW | LOCKLEAF | SAVENAME | TRYEMULROOT;

	/* first get the vnode */
	if ((error = namei(ndp)) != 0)
		return error;
	epp->ep_vp = vp = ndp->ni_vp;
	/* check access and type */
	if (vp->v_type != VREG) {
		error = EACCES;
		goto bad1;
	}
	if ((error = VOP_ACCESS(vp, VEXEC, l->l_cred)) != 0)
		goto bad1;

	if ((error = VOP_GETATTR(vp, epp->ep_vap, l->l_cred)) != 0)
		goto bad1;

	/* Check mount point */
	if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
		error = EACCES;
		goto bad1;
	}
	if (vp->v_mount->mnt_flag & MNT_NOSUID)
		epp->ep_vap->va_mode &= ~(S_ISUID | S_ISGID);

	if ((error = VOP_OPEN(vp, FREAD, l->l_cred)) != 0)
		goto bad1;

	/* unlock vp, since we need it unlocked from here on out. */
	VOP_UNLOCK(vp, 0);
#if NVERIEXEC > 0
	error = veriexec_verify(l, vp, ndp->ni_cnd.cn_pnbuf,
	    epp->ep_flags & EXEC_INDIR ? VERIEXEC_INDIRECT : VERIEXEC_DIRECT,
	    NULL);
	if (error)
		goto bad2;
#endif /* NVERIEXEC > 0 */

#ifdef PAX_SEGVGUARD
	error = pax_segvguard(l, vp, ndp->ni_cnd.cn_pnbuf, false);
	if (error)
		goto bad2;
#endif /* PAX_SEGVGUARD */
	/* now we have the file, get the exec header */
	error = vn_rdwr(UIO_READ, vp, epp->ep_hdr, epp->ep_hdrlen, 0,
	    UIO_SYSSPACE, 0, l->l_cred, &resid, NULL);
	if (error)
		goto bad2;
	epp->ep_hdrvalid = epp->ep_hdrlen - resid;
	/*
	 * Set up default address space limits.  Can be overridden
	 * by individual exec packages.
	 *
	 * XXX probably should be all done in the exec packages.
	 */
	epp->ep_vm_minaddr = VM_MIN_ADDRESS;
	epp->ep_vm_maxaddr = VM_MAXUSER_ADDRESS;
	/*
	 * set up the vmcmds for creation of the process
	 * address space
	 */
	error = ENOEXEC;
	for (i = 0; i < nexecs; i++) {
		epp->ep_esch = execsw[i];
		newerror = (*execsw[i]->es_makecmds)(l, epp);

		if (!newerror) {
			/* Seems ok: check that entry point is sane */
			if (epp->ep_entry > VM_MAXUSER_ADDRESS) {
				error = ENOEXEC;
				break;
			}

			/* check limits */
			if ((epp->ep_tsize > MAXTSIZ) ||
			    (epp->ep_dsize > (u_quad_t)l->l_proc->p_rlimit
			    [RLIMIT_DATA].rlim_cur)) {
				error = ENOMEM;
				break;
			}
			return 0;
		}

		if (epp->ep_emul_root != NULL) {
			vrele(epp->ep_emul_root);
			epp->ep_emul_root = NULL;
		}
		if (epp->ep_interp != NULL) {
			vrele(epp->ep_interp);
			epp->ep_interp = NULL;
		}

		/* make sure the first "interesting" error code is saved. */
		if (error == ENOEXEC)
			error = newerror;

		if (epp->ep_flags & EXEC_DESTR)
			/* Error from "#!" code, tidied up by recursive call */
			return error;
	}
	/* not found, error */

	/*
	 * free any vmspace-creation commands,
	 * and release their references
	 */
	kill_vmcmds(&epp->ep_vmcmds);
bad2:
	/*
	 * close and release the vnode, restore the old one, free the
	 * pathname buf, and punt.
	 */
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	VOP_CLOSE(vp, FREAD, l->l_cred);
	vput(vp);
	PNBUF_PUT(ndp->ni_cnd.cn_pnbuf);
	return error;
bad1:
	/*
	 * free the namei pathname buffer, and put the vnode
	 * (which we don't yet have open).
	 */
	vput(vp);		/* was still locked */
	PNBUF_PUT(ndp->ni_cnd.cn_pnbuf);
	return error;
}
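
/*
 * Reserve a page at the stack base for thread-library data on machines
 * whose stack grows upward; on other machines no extra space is needed.
 */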
#ifdef __MACHINE_STACK_GROWS_UP
#define STACK_PTHREADSPACE NBPG
#else
#define STACK_PTHREADSPACE 0
#endif
static int
execve_fetch_element(char * const *array, size_t index, char **value)
{

	return copyin(array + index, value, sizeof(*value));
}
/*
 * exec system call
 */
int
sys_execve(struct lwp *l, const struct sys_execve_args *uap, register_t *retval)
{
	/* {
		syscallarg(const char *)	path;
		syscallarg(char * const *)	argp;
		syscallarg(char * const *)	envp;
	} */

	return execve1(l, SCARG(uap, path), SCARG(uap, argp),
	    SCARG(uap, envp), execve_fetch_element);
}
/*
 * Load modules to try and execute an image that we do not understand.
 * If no execsw entries are present, we load those likely to be needed
 * in order to run native images only.  Otherwise, we autoload all
 * possible modules that could let us run the binary.  XXX lame
 */
static void
exec_autoload(void)
{
#ifdef MODULAR
	static const char * const native[] = {
		/* native exec format modules (list elided) */
		NULL
	};
	static const char * const compat[] = {
		/* compat/emulation modules (list elided) */
		NULL
	};
	char const * const *list;
	int i;

	mutex_enter(&module_lock);
	list = (nexecs == 0 ? native : compat);
	for (i = 0; list[i] != NULL; i++) {
		if (module_autoload(list[i], MODULE_CLASS_MISC) != 0) {
			continue;
		}
		mutex_exit(&module_lock);
		yield();
		mutex_enter(&module_lock);
	}
	mutex_exit(&module_lock);
#endif
}
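
/*
 * execve1() is shared by the native system call above and by compat
 * code; the fetch_element callback abstracts how one char * element is
 * read from the user-space argv/envp array, so emulations with a
 * different user pointer size (e.g. netbsd32) can substitute their own.
 */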
int
execve1(struct lwp *l, const char *path, char * const *args,
    char * const *envs, execve_fetch_element_t fetch_element)
{
	int			error;
	struct exec_package	pack;
	struct nameidata	nid;
	struct vattr		attr;
	struct proc		*p;
	char			*argp;
	char			*dp, *sp;
	long			argc, envc;
	size_t			i, len;
	char			*stack;
	struct ps_strings	arginfo;
	struct ps_strings	*aip = &arginfo;
	struct vmspace		*vm;
	struct exec_fakearg	*tmpfap;
	int			szsigcode;
	struct exec_vmcmd	*base_vcp;
	ksiginfo_t		ksi;
	ksiginfoq_t		kq;
	char			*pathbuf;
	size_t			pathbuflen;
	int			oldlwpflags;
	int			modgen;

	p = l->l_proc;
	modgen = 0;
	base_vcp = NULL;
	/*
	 * Check if we have exceeded our number of processes limit.
	 * This is so that we handle the case where a root daemon
	 * forked, ran setuid to become the desired user and is trying
	 * to exec. The obvious place to do the reference counting check
	 * is setuid(), but we don't do the reference counting check there
	 * like other OS's do because then all the programs that use setuid()
	 * must be modified to check the return code of setuid() and exit().
	 * It is dangerous to make setuid() fail, because it fails open and
	 * the program will continue to run as root. If we make it succeed
	 * and return an error code, again we are not enforcing the limit.
	 * The best place to enforce the limit is here, when the process tries
	 * to execute a new image, because eventually the process will need
	 * to call exec in order to do something useful.
	 */
 retry:
	if ((p->p_flag & PK_SUGID) && kauth_authorize_generic(l->l_cred,
	    KAUTH_GENERIC_ISSUSER, NULL) != 0 && chgproccnt(kauth_cred_getuid(
	    l->l_cred), 0) > p->p_rlimit[RLIMIT_NPROC].rlim_cur)
		return EAGAIN;
	oldlwpflags = l->l_flag & (LW_SA | LW_SA_UPCALL);
	if (l->l_flag & LW_SA) {
		lwp_lock(l);
		l->l_flag &= ~(LW_SA | LW_SA_UPCALL);
		lwp_unlock(l);
	}
	/*
	 * Drain existing references and forbid new ones.  The process
	 * should be left alone until we're done here.  This is necessary
	 * to avoid race conditions - e.g. in ptrace() - that might allow
	 * a local user to illicitly obtain elevated privileges.
	 */
	rw_enter(&p->p_reflock, RW_WRITER);
	/*
	 * Init the namei data to point at the user program's name.
	 * This is done here rather than in check_exec(), so that it's
	 * possible to override these settings if any of the makecmd/probe
	 * functions call check_exec() recursively - for example,
	 * see exec_script_makecmds().
	 */
	pathbuf = PNBUF_GET();
	error = copyinstr(path, pathbuf, MAXPATHLEN, &pathbuflen);
	if (error) {
		DPRINTF(("execve: copyinstr path %d", error));
		goto clrflg;
	}

	NDINIT(&nid, LOOKUP, NOFOLLOW | TRYEMULROOT, UIO_SYSSPACE, pathbuf);
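
	/*
	 * TRYEMULROOT asks namei to try the emulation root first (e.g.
	 * /emul/linux) before the real root; check_exec() resets the
	 * remaining lookup flags before the actual namei() call.
	 */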
	/*
	 * initialize the fields of the exec package.
	 */
	pack.ep_hdr = kmem_alloc(exec_maxhdrsz, KM_SLEEP);
	pack.ep_hdrlen = exec_maxhdrsz;
	pack.ep_hdrvalid = 0;
	pack.ep_ndp = &nid;
	pack.ep_emul_arg = NULL;
	pack.ep_vmcmds.evs_cnt = 0;
	pack.ep_vmcmds.evs_used = 0;
	pack.ep_vap = &attr;
	pack.ep_flags = 0;
	pack.ep_emul_root = NULL;
	pack.ep_interp = NULL;
	pack.ep_pax_flags = 0;

	rw_enter(&exec_lock, RW_READER);
	/* see if we can run it. */
	if ((error = check_exec(l, &pack)) != 0) {
		if (error != ENOENT) {
			DPRINTF(("execve: check exec failed %d\n", error));
		}
		goto freehdr;
	}
606 /* allocate an argument buffer */
607 argp
= pool_get(&exec_pool
, PR_WAITOK
);
608 KASSERT(argp
!= NULL
);
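
	/*
	 * 'argp' is one NCARGS-sized buffer from exec_pool; 'dp' walks it
	 * as argument and environment strings are accumulated end to end.
	 */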
	/* copy the fake args list, if there's one, freeing it as we go */
	if (pack.ep_flags & EXEC_HASARGL) {
		tmpfap = pack.ep_fa;
		while (tmpfap->fa_arg != NULL) {
			const char *cp;

			cp = tmpfap->fa_arg;
			while (*cp)
				*dp++ = *cp++;
			*dp++ = '\0';
			ktrexecarg(tmpfap->fa_arg, cp - tmpfap->fa_arg);

			kmem_free(tmpfap->fa_arg, tmpfap->fa_len);
			tmpfap++; argc++;
		}
		kmem_free(pack.ep_fa, pack.ep_fa_len);
		pack.ep_flags &= ~EXEC_HASARGL;
	}
	/* Now get argv & environment */
	if (!args) {
		DPRINTF(("execve: null args\n"));
		error = EINVAL;
		goto bad;
	}
	/* 'i' will index the argp/envp element to be retrieved */
	i = 0;
	if (pack.ep_flags & EXEC_SKIPARG)
		i++;

	while (1) {
		len = argp + ARG_MAX - dp;
		if ((error = (*fetch_element)(args, i, &sp)) != 0) {
			DPRINTF(("execve: fetch_element args %d\n", error));
			goto bad;
		}
		if (!sp)
			break;
		if ((error = copyinstr(sp, dp, len, &len)) != 0) {
			DPRINTF(("execve: copyinstr args %d\n", error));
			if (error == ENAMETOOLONG)
				error = E2BIG;
			goto bad;
		}
		ktrexecarg(dp, len - 1);
		dp += len;
		i++;
		argc++;
	}

	envc = 0;
	/* environment need not be there */
	if (envs) {
		i = 0;
		while (1) {
			len = argp + ARG_MAX - dp;
			if ((error = (*fetch_element)(envs, i, &sp)) != 0) {
				DPRINTF(("execve: fetch_element env %d\n", error));
				goto bad;
			}
			if (!sp)
				break;
			if ((error = copyinstr(sp, dp, len, &len)) != 0) {
				DPRINTF(("execve: copyinstr env %d\n", error));
				if (error == ENAMETOOLONG)
					error = E2BIG;
				goto bad;
			}
			ktrexecenv(dp, len - 1);
			dp += len;
			i++;
			envc++;
		}
	}
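
	/*
	 * The emulation brackets its signal trampoline between e_sigcode
	 * and e_esigcode in kernel text; the difference is the number of
	 * bytes that must be reserved for it in the new stack layout.
	 */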
	dp = (char *) ALIGN(dp);

	szsigcode = pack.ep_esch->es_emul->e_esigcode -
	    pack.ep_esch->es_emul->e_sigcode;
#ifdef __MACHINE_STACK_GROWS_UP
/* See big comment lower down */
#define	RTLD_GAP	32
#else
#define	RTLD_GAP	0
#endif

	/* Now check if args & environ fit into new stack */
	if (pack.ep_flags & EXEC_32)
		len = ((argc + envc + 2 + pack.ep_esch->es_arglen) *
		    sizeof(int) + sizeof(int) + dp + RTLD_GAP +
		    szsigcode + sizeof(struct ps_strings) + STACK_PTHREADSPACE)
		    - argp;
	else
		len = ((argc + envc + 2 + pack.ep_esch->es_arglen) *
		    sizeof(char *) + sizeof(int) + dp + RTLD_GAP +
		    szsigcode + sizeof(struct ps_strings) + STACK_PTHREADSPACE)
		    - argp;

#ifdef PAX_ASLR
	if (pax_aslr_active(l))
		len += (arc4random() % PAGE_SIZE);
#endif /* PAX_ASLR */
#ifdef STACKALIGN	/* arm, etc. */
	len = STACKALIGN(len);	/* make the stack "safely" aligned */
#else
	len = ALIGN(len);	/* make the stack "safely" aligned */
#endif

	if (len > pack.ep_ssize) { /* in effect, compare to initial limit */
		DPRINTF(("execve: stack limit exceeded %zu\n", len));
		error = ENOMEM;
		goto bad;
	}
	/* Get rid of other LWPs. */
	if (p->p_sa || p->p_nlwps > 1) {
		mutex_enter(p->p_lock);
		exit_lwps(l);
		mutex_exit(p->p_lock);
	}
	KDASSERT(p->p_nlwps == 1);

	/* Destroy any lwpctl info. */
	if (p->p_lwpctl != NULL)
		lwp_ctl_exit();

	/* This is now LWP 1 */
	l->l_lid = 1;
	p->p_nlwpid = 1;

	/* Release any SA state. */
	if (p->p_sa)
		sa_release(p);

	/* Remove POSIX timers */
	timers_free(p, TIMERS_POSIX);
	/* adjust "active stack depth" for process VSZ */
	pack.ep_ssize = len;	/* maybe should go elsewhere, but... */

	/*
	 * Do whatever is necessary to prepare the address space
	 * for remapping.  Note that this might replace the current
	 * vmspace with another!
	 */
	uvmspace_exec(l, pack.ep_vm_minaddr, pack.ep_vm_maxaddr);
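
	/*
	 * Point of no return: the old address space is gone, so any
	 * failure from here on aborts via exec_abort, which terminates
	 * the process.
	 */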
	/* record proc's vnode, for use by procfs and others */
	if (p->p_textvp)
		vrele(p->p_textvp);
	vref(pack.ep_vp);
	p->p_textvp = pack.ep_vp;

	/* Now map address space */
	vm = p->p_vmspace;
	vm->vm_taddr = (void *)pack.ep_taddr;
	vm->vm_tsize = btoc(pack.ep_tsize);
	vm->vm_daddr = (void*)pack.ep_daddr;
	vm->vm_dsize = btoc(pack.ep_dsize);
	vm->vm_ssize = btoc(pack.ep_ssize);
	vm->vm_maxsaddr = (void *)pack.ep_maxsaddr;
	vm->vm_minsaddr = (void *)pack.ep_minsaddr;

#ifdef PAX_ASLR
	pax_aslr_init(l, vm);
#endif /* PAX_ASLR */
	/* create the new process's VM space by running the vmcmds */
#ifdef DIAGNOSTIC
	if (pack.ep_vmcmds.evs_used == 0)
		panic("execve: no vmcmds");
#endif
	for (i = 0; i < pack.ep_vmcmds.evs_used && !error; i++) {
		struct exec_vmcmd *vcp;

		vcp = &pack.ep_vmcmds.evs_cmds[i];
		if (vcp->ev_flags & VMCMD_RELATIVE) {
#ifdef DIAGNOSTIC
			if (base_vcp == NULL)
				panic("execve: relative vmcmd with no base");
			if (vcp->ev_flags & VMCMD_BASE)
				panic("execve: illegal base & relative vmcmd");
#endif
			vcp->ev_addr += base_vcp->ev_addr;
		}
		error = (*vcp->ev_proc)(l, vcp);
#ifdef DEBUG_EXEC
		if (error) {
			size_t j;
			struct exec_vmcmd *vp = &pack.ep_vmcmds.evs_cmds[0];
			for (j = 0; j <= i; j++)
				uprintf(
				    "vmcmd[%zu] = %#lx/%#lx fd@%#lx prot=0%o flags=%d\n",
				    j, vp[j].ev_addr, vp[j].ev_len,
				    vp[j].ev_offset, vp[j].ev_prot,
				    vp[j].ev_flags);
		}
#endif /* DEBUG_EXEC */
		if (vcp->ev_flags & VMCMD_BASE)
			base_vcp = vcp;
	}

	/* free the vmspace-creation commands, and release their references */
	kill_vmcmds(&pack.ep_vmcmds);

	vn_lock(pack.ep_vp, LK_EXCLUSIVE | LK_RETRY);
	VOP_CLOSE(pack.ep_vp, FREAD, l->l_cred);
	vput(pack.ep_vp);

	/* if an error happened, deallocate and punt */
	if (error) {
		DPRINTF(("execve: vmcmd %zu failed: %d\n", i - 1, error));
		goto exec_abort;
	}
	/* remember information about the process */
	arginfo.ps_nargvstr = argc;
	arginfo.ps_nenvstr = envc;

	/* set command name & other accounting info */
	i = min(nid.ni_cnd.cn_namelen, MAXCOMLEN);
	(void)memcpy(p->p_comm, nid.ni_cnd.cn_nameptr, i);
	p->p_comm[i] = '\0';

	dp = PNBUF_GET();
	/*
	 * If the path starts with /, we don't need to do any work.
	 * This handles the majority of the cases.
	 * In the future perhaps we could canonicalize it?
	 */
	if (pathbuf[0] == '/')
		(void)strlcpy(pack.ep_path = dp, pathbuf, MAXPATHLEN);
	/*
	 * Although this works most of the time [since the entry was just
	 * entered in the cache] we don't use it because it theoretically
	 * can fail and it is not the cleanest interface, because there
	 * could be races. When the namei cache is re-written, this can
	 * be changed to use the appropriate function.
	 */
	else if (!(error = vnode_to_path(dp, MAXPATHLEN, p->p_textvp, l, p)))
		pack.ep_path = dp;
	else {
		printf("Cannot get path for pid %d [%s] (error %d)",
		    (int)p->p_pid, p->p_comm, error);
		pack.ep_path = NULL;
		PNBUF_PUT(dp);
	}
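
	/*
	 * Carve the argument block out of the new stack, leaving room at
	 * the stack base for the ps_strings record and the signal
	 * trampoline that were accounted for in 'len' above.
	 */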
	stack = (char *)STACK_ALLOC(STACK_GROW(vm->vm_minsaddr,
	    STACK_PTHREADSPACE + sizeof(struct ps_strings) + szsigcode),
	    len - (sizeof(struct ps_strings) + szsigcode));
#ifdef __MACHINE_STACK_GROWS_UP
	/*
	 * The copyargs call always copies into lower addresses
	 * first, moving towards higher addresses, starting with
	 * the stack pointer that we give.  When the stack grows
	 * down, this puts argc/argv/envp very shallow on the
	 * stack, right at the first user stack pointer.
	 * When the stack grows up, the situation is reversed.
	 *
	 * Normally, this is no big deal.  But the ld_elf.so _rtld()
	 * function expects to be called with a single pointer to
	 * a region that has a few words it can stash values into,
	 * followed by argc/argv/envp.  When the stack grows down,
	 * it's easy to decrement the stack pointer a little bit to
	 * allocate the space for these few words and pass the new
	 * stack pointer to _rtld.  When the stack grows up, however,
	 * a few words before argc is part of the signal trampoline, XXX
	 * so we have a problem.
	 *
	 * Instead of changing how _rtld works, we take the easy way
	 * out and steal 32 bytes before we call copyargs.
	 * This extra space was allowed for when 'len' was calculated.
	 */
	stack += RTLD_GAP;
#endif /* __MACHINE_STACK_GROWS_UP */
	/* Now copy argc, args & environ to new stack */
	error = (*pack.ep_esch->es_copyargs)(l, &pack, &arginfo, &stack, argp);
	if (pack.ep_path) {
		PNBUF_PUT(pack.ep_path);
		pack.ep_path = NULL;
	}
	if (error) {
		DPRINTF(("execve: copyargs failed %d\n", error));
		goto exec_abort;
	}
	/* Move the stack back to original point */
	stack = (char *)STACK_GROW(vm->vm_minsaddr, len);

	/* fill process ps_strings info */
	p->p_psstr = (struct ps_strings *)
	    STACK_ALLOC(STACK_GROW(vm->vm_minsaddr, STACK_PTHREADSPACE),
	    sizeof(struct ps_strings));
	p->p_psargv = offsetof(struct ps_strings, ps_argvstr);
	p->p_psnargv = offsetof(struct ps_strings, ps_nargvstr);
	p->p_psenv = offsetof(struct ps_strings, ps_envstr);
	p->p_psnenv = offsetof(struct ps_strings, ps_nenvstr);
	/* copy out the process's ps_strings structure */
	if ((error = copyout(aip, (char *)p->p_psstr,
	    sizeof(arginfo))) != 0) {
		DPRINTF(("execve: ps_strings copyout %p->%p size %ld failed\n",
		    aip, (char *)p->p_psstr, (long)sizeof(arginfo)));
		goto exec_abort;
	}
	fd_closeexec();		/* handle close on exec */
	execsigs(p);		/* reset caught signals */

	l->l_ctxlink = NULL;	/* reset ucontext link */

	p->p_acflag &= ~AFORK;
	mutex_enter(p->p_lock);
	p->p_flag |= PK_EXEC;
	mutex_exit(p->p_lock);
	/*
	 * Stop profiling.
	 */
	if ((p->p_stflag & PST_PROFIL) != 0) {
		mutex_spin_enter(&p->p_stmutex);
		stopprofclock(p);
		mutex_spin_exit(&p->p_stmutex);
	}
	/*
	 * It's OK to test PL_PPWAIT unlocked here, as other LWPs have
	 * exited and exec()/exit() are the only places it will be cleared.
	 */
	if ((p->p_lflag & PL_PPWAIT) != 0) {
		mutex_enter(proc_lock);
		p->p_lflag &= ~PL_PPWAIT;
		cv_broadcast(&p->p_pptr->p_waitcv);
		mutex_exit(proc_lock);
	}
	/*
	 * Deal with set[ug]id.  MNT_NOSUID has already been used to disable
	 * s[ug]id.  It's OK to check for PSL_TRACED here as we have blocked
	 * out additional references on the process for the moment.
	 */
	if ((p->p_slflag & PSL_TRACED) == 0 &&
	    (((attr.va_mode & S_ISUID) != 0 &&
	      kauth_cred_geteuid(l->l_cred) != attr.va_uid) ||
	     ((attr.va_mode & S_ISGID) != 0 &&
	      kauth_cred_getegid(l->l_cred) != attr.va_gid))) {
		/*
		 * Mark the process as SUGID before we do
		 * anything that might block.
		 */
		proc_crmod_enter();
		proc_crmod_leave(NULL, NULL, true);
980 if ((error
= fd_checkstd()) != 0) {
981 DPRINTF(("execve: fdcheckstd failed %d\n", error
));
986 * Copy the credential so other references don't see our
989 l
->l_cred
= kauth_cred_copy(l
->l_cred
);
992 * If the persistent trace flag isn't set, turn off.
995 mutex_enter(&ktrace_lock
);
996 if (!(p
->p_traceflag
& KTRFAC_PERSISTENT
))
998 mutex_exit(&ktrace_lock
);
1001 if (attr
.va_mode
& S_ISUID
)
1002 kauth_cred_seteuid(l
->l_cred
, attr
.va_uid
);
1003 if (attr
.va_mode
& S_ISGID
)
1004 kauth_cred_setegid(l
->l_cred
, attr
.va_gid
);
1006 if (kauth_cred_geteuid(l
->l_cred
) ==
1007 kauth_cred_getuid(l
->l_cred
) &&
1008 kauth_cred_getegid(l
->l_cred
) ==
1009 kauth_cred_getgid(l
->l_cred
))
1010 p
->p_flag
&= ~PK_SUGID
;
	/*
	 * Copy the credential so other references don't see our changes.
	 * Test to see if this is necessary first, since in the common case
	 * we won't need a private reference.
	 */
	if (kauth_cred_geteuid(l->l_cred) != kauth_cred_getsvuid(l->l_cred) ||
	    kauth_cred_getegid(l->l_cred) != kauth_cred_getsvgid(l->l_cred)) {
		l->l_cred = kauth_cred_copy(l->l_cred);
		kauth_cred_setsvuid(l->l_cred, kauth_cred_geteuid(l->l_cred));
		kauth_cred_setsvgid(l->l_cred, kauth_cred_getegid(l->l_cred));
	}
	/* Update the master credentials. */
	if (l->l_cred != p->p_cred) {
		kauth_cred_t ocred;

		kauth_cred_hold(l->l_cred);
		mutex_enter(p->p_lock);
		ocred = p->p_cred;
		p->p_cred = l->l_cred;
		mutex_exit(p->p_lock);
		kauth_cred_free(ocred);
	}
#if defined(__HAVE_RAS)
	/*
	 * Remove all RASs from the address space.
	 */
	ras_purgeall();
#endif /* __HAVE_RAS */
	/* setup new registers and do misc. setup. */
	(*pack.ep_esch->es_emul->e_setregs)(l, &pack, (vaddr_t)stack);
	if (pack.ep_esch->es_setregs)
		(*pack.ep_esch->es_setregs)(l, &pack, (vaddr_t)stack);
	/* map the process's signal trampoline code */
	if ((error = exec_sigcode_map(p, pack.ep_esch->es_emul)) != 0) {
		DPRINTF(("execve: map sigcode failed %d\n", error));
		goto exec_abort;
	}
	pool_put(&exec_pool, argp);

	PNBUF_PUT(nid.ni_cnd.cn_pnbuf);

	/* notify others that we exec'd */
	KNOTE(&p->p_klist, NOTE_EXEC);

	kmem_free(pack.ep_hdr, pack.ep_hdrlen);
	/* The emulation root will usually have been found when we looked
	 * for the elf interpreter (or similar), if not look now. */
	if (pack.ep_esch->es_emul->e_path != NULL &&
	    pack.ep_emul_root == NULL)
		emul_find_root(l, &pack);

	/* Any old emulation root got removed by fdcloseexec */
	rw_enter(&p->p_cwdi->cwdi_lock, RW_WRITER);
	p->p_cwdi->cwdi_edir = pack.ep_emul_root;
	rw_exit(&p->p_cwdi->cwdi_lock);
	pack.ep_emul_root = NULL;
	if (pack.ep_interp != NULL) {
		vrele(pack.ep_interp);
		pack.ep_interp = NULL;
	}
	/*
	 * Call emulation specific exec hook. This can setup per-process
	 * p->p_emuldata or do any other per-process stuff an emulation needs.
	 *
	 * If we are executing a process of a different emulation than the
	 * original forked process, call e_proc_exit() of the old emulation
	 * first, then e_proc_exec() of the new emulation. If the emulation
	 * is the same, the exec hook code should deallocate any old emulation
	 * resources held previously by this process.
	 */
	if (p->p_emul && p->p_emul->e_proc_exit
	    && p->p_emul != pack.ep_esch->es_emul)
		(*p->p_emul->e_proc_exit)(p);
	/*
	 * Call exec hook. Emulation code may NOT store reference to anything
	 * from &pack.
	 */
	if (pack.ep_esch->es_emul->e_proc_exec)
		(*pack.ep_esch->es_emul->e_proc_exec)(p, &pack);
	/* update p_emul, the old value is no longer needed */
	p->p_emul = pack.ep_esch->es_emul;

	/* ...and the same for p_execsw */
	p->p_execsw = pack.ep_esch;

#ifdef __HAVE_SYSCALL_INTERN
	(*p->p_emul->e_syscall_intern)(p);
#endif
	/* Allow new references from the debugger/procfs. */
	rw_exit(&p->p_reflock);
	rw_exit(&exec_lock);

	mutex_enter(proc_lock);

	if ((p->p_slflag & (PSL_TRACED|PSL_SYSCALL)) == PSL_TRACED) {
		KSI_INIT_EMPTY(&ksi);
		ksi.ksi_signo = SIGTRAP;
		ksi.ksi_lid = l->l_lid;
		kpsignal(p, &ksi, NULL);
	}

	if (p->p_sflag & PS_STOPEXEC) {
		KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
		p->p_pptr->p_nstopchild++;
		p->p_pptr->p_waited = 0;
		mutex_enter(p->p_lock);
		ksiginfo_queue_init(&kq);
		sigclearall(p, &contsigmask, &kq);
		lwp_lock(l);
		l->l_stat = LSSTOP;
		p->p_stat = SSTOP;
		p->p_nrlwps--;
		mutex_exit(p->p_lock);
		mutex_exit(proc_lock);
		mi_switch(l);
		ksiginfo_queue_drain(&kq);
		KERNEL_LOCK(l->l_biglocks, l);
	} else {
		mutex_exit(proc_lock);
	}

	PNBUF_PUT(pathbuf);
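
	/*
	 * EJUSTRETURN tells the syscall return path to leave the register
	 * state alone: e_setregs already framed the new image's startup
	 * context.
	 */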
	return (EJUSTRETURN);
 bad:
	/* free the vmspace-creation commands, and release their references */
	kill_vmcmds(&pack.ep_vmcmds);
	/* kill any opened file descriptor, if necessary */
	if (pack.ep_flags & EXEC_HASFD) {
		pack.ep_flags &= ~EXEC_HASFD;
		fd_close(pack.ep_fd);
	}
	/* close and put the exec'd file */
	vn_lock(pack.ep_vp, LK_EXCLUSIVE | LK_RETRY);
	VOP_CLOSE(pack.ep_vp, FREAD, l->l_cred);
	vput(pack.ep_vp);
	PNBUF_PUT(nid.ni_cnd.cn_pnbuf);
	pool_put(&exec_pool, argp);

 freehdr:
	kmem_free(pack.ep_hdr, pack.ep_hdrlen);
	if (pack.ep_emul_root != NULL)
		vrele(pack.ep_emul_root);
	if (pack.ep_interp != NULL)
		vrele(pack.ep_interp);

	rw_exit(&exec_lock);
 clrflg:
	lwp_lock(l);
	l->l_flag |= oldlwpflags;
	lwp_unlock(l);

	rw_exit(&p->p_reflock);

	if (modgen != module_gen && error == ENOEXEC) {
		modgen = module_gen;
		exec_autoload();
		goto retry;
	}

	PNBUF_PUT(pathbuf);
	return error;

 exec_abort:
	rw_exit(&p->p_reflock);
	rw_exit(&exec_lock);
	/*
	 * the old process doesn't exist anymore.  exit gracefully.
	 * get rid of the (new) address space we have created, if any, get rid
	 * of our namei data and vnode, and exit noting failure
	 */
	uvm_deallocate(&vm->vm_map, VM_MIN_ADDRESS,
	    VM_MAXUSER_ADDRESS - VM_MIN_ADDRESS);
	if (pack.ep_emul_arg)
		free(pack.ep_emul_arg, M_TEMP);
	PNBUF_PUT(nid.ni_cnd.cn_pnbuf);
	pool_put(&exec_pool, argp);
	kmem_free(pack.ep_hdr, pack.ep_hdrlen);
	if (pack.ep_emul_root != NULL)
		vrele(pack.ep_emul_root);
	if (pack.ep_interp != NULL)
		vrele(pack.ep_interp);

	/* Acquire the sched-state mutex (exit1() will release it). */
	mutex_enter(p->p_lock);
	exit1(l, W_EXITCODE(error, SIGABRT));

	/* NOTREACHED */
	return 0;
}
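
/*
 * copyargs() lays out the new stack image: argc, the argv pointer
 * vector, a terminating NULL, the envp vector and its NULL, then the
 * strings themselves; *stackp is advanced past everything written.
 */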
int
copyargs(struct lwp *l, struct exec_package *pack, struct ps_strings *arginfo,
    char **stackp, void *argp)
{
	char	**cpp, *dp, *sp;
	size_t	len;
	void	*nullp;
	long	argc, envc;
	int	error;

	cpp = (char **)*stackp;
	nullp = NULL;
	argc = arginfo->ps_nargvstr;
	envc = arginfo->ps_nenvstr;
	if ((error = copyout(&argc, cpp++, sizeof(argc))) != 0)
		return error;

	dp = (char *) (cpp + argc + envc + 2 + pack->ep_esch->es_arglen);
	sp = argp;

	/* XXX don't copy them out, remap them! */
	arginfo->ps_argvstr = cpp; /* remember location of argv for later */

	for (; --argc >= 0; sp += len, dp += len)
		if ((error = copyout(&dp, cpp++, sizeof(dp))) != 0 ||
		    (error = copyoutstr(sp, dp, ARG_MAX, &len)) != 0)
			return error;

	if ((error = copyout(&nullp, cpp++, sizeof(nullp))) != 0)
		return error;

	arginfo->ps_envstr = cpp; /* remember location of envp for later */

	for (; --envc >= 0; sp += len, dp += len)
		if ((error = copyout(&dp, cpp++, sizeof(dp))) != 0 ||
		    (error = copyoutstr(sp, dp, ARG_MAX, &len)) != 0)
			return error;

	if ((error = copyout(&nullp, cpp++, sizeof(nullp))) != 0)
		return error;

	*stackp = (char *)cpp;
	return 0;
}
/*
 * Add execsw[] entries.
 */
int
exec_add(struct execsw *esp, int count)
{
	struct exec_entry	*it;
	int			i, error;

	/* Check for duplicates. */
	rw_enter(&exec_lock, RW_WRITER);
	for (i = 0; i < count; i++) {
		LIST_FOREACH(it, &ex_head, ex_list) {
			/* assume unique (makecmds, probe_func, emulation) */
			if (it->ex_sw->es_makecmds == esp[i].es_makecmds &&
			    it->ex_sw->u.elf_probe_func ==
			    esp[i].u.elf_probe_func &&
			    it->ex_sw->es_emul == esp[i].es_emul) {
				rw_exit(&exec_lock);
				return EEXIST;
			}
		}
	}

	/* Allocate new entries. */
	for (i = 0; i < count; i++) {
		it = kmem_alloc(sizeof(*it), KM_SLEEP);
		it->ex_sw = &esp[i];
		LIST_INSERT_HEAD(&ex_head, it, ex_list);
	}

	/* update execsw[] */
	error = exec_init(0);
	rw_exit(&exec_lock);
	return error;
}
/*
 * Remove execsw[] entry.
 */
int
exec_remove(struct execsw *esp, int count)
{
	struct exec_entry	*it, *next;
	int			i, error;
	const struct proclist_desc *pd;
	proc_t			*p;

	/* Abort if any are busy. */
	rw_enter(&exec_lock, RW_WRITER);
	for (i = 0; i < count; i++) {
		mutex_enter(proc_lock);
		for (pd = proclists; pd->pd_list != NULL; pd++) {
			PROCLIST_FOREACH(p, pd->pd_list) {
				if (p->p_execsw == &esp[i]) {
					mutex_exit(proc_lock);
					rw_exit(&exec_lock);
					return EBUSY;
				}
			}
		}
		mutex_exit(proc_lock);
	}

	/* None are busy, so remove them all. */
	for (i = 0; i < count; i++) {
		for (it = LIST_FIRST(&ex_head); it != NULL; it = next) {
			next = LIST_NEXT(it, ex_list);
			if (it->ex_sw == &esp[i]) {
				LIST_REMOVE(it, ex_list);
				kmem_free(it, sizeof(*it));
				break;
			}
		}
	}

	/* update execsw[] */
	error = exec_init(0);
	rw_exit(&exec_lock);
	return error;
}
/*
 * Initialize exec structures. If init_boot is true, also does necessary
 * one-time initialization (it's called from main() that way).
 * Once system is multiuser, this should be called with exec_lock held,
 * i.e. via exec_{add|remove}().
 */
int
exec_init(int init_boot)
{
	const struct execsw	**sw;
	struct exec_entry	*ex;
	SLIST_HEAD(,exec_entry)	first;
	SLIST_HEAD(,exec_entry)	any;
	SLIST_HEAD(,exec_entry)	last;
	int			i, sz;

	if (init_boot) {
		/* do one-time initializations */
		rw_init(&exec_lock);
		mutex_init(&sigobject_lock, MUTEX_DEFAULT, IPL_NONE);
		pool_init(&exec_pool, NCARGS, 0, 0, PR_NOALIGN|PR_NOTOUCH,
		    "execargs", &exec_palloc, IPL_NONE);
		pool_sethardlimit(&exec_pool, maxexec, "should not happen", 0);
	} else {
		KASSERT(rw_write_held(&exec_lock));
	}
	/* Sort each entry onto the appropriate queue. */
	SLIST_INIT(&first);
	SLIST_INIT(&any);
	SLIST_INIT(&last);
	sz = 0;
	LIST_FOREACH(ex, &ex_head, ex_list) {
		switch(ex->ex_sw->es_prio) {
		case EXECSW_PRIO_FIRST:
			SLIST_INSERT_HEAD(&first, ex, ex_slist);
			break;
		case EXECSW_PRIO_ANY:
			SLIST_INSERT_HEAD(&any, ex, ex_slist);
			break;
		case EXECSW_PRIO_LAST:
			SLIST_INSERT_HEAD(&last, ex, ex_slist);
			break;
		}
		sz++;
	}
	/*
	 * Create new execsw[].  Ensure we do not try a zero-sized
	 * allocation.
	 */
	sw = kmem_alloc(sz * sizeof(struct execsw *) + 1, KM_SLEEP);
	i = 0;
	SLIST_FOREACH(ex, &first, ex_slist) {
		sw[i++] = ex->ex_sw;
	}
	SLIST_FOREACH(ex, &any, ex_slist) {
		sw[i++] = ex->ex_sw;
	}
	SLIST_FOREACH(ex, &last, ex_slist) {
		sw[i++] = ex->ex_sw;
	}

	/* Replace old execsw[] and free used memory. */
	if (execsw != NULL) {
		kmem_free(__UNCONST(execsw),
		    nexecs * sizeof(struct execsw *) + 1);
	}
	execsw = sw;
	nexecs = sz;
	/* Figure out the maximum size of an exec header. */
	exec_maxhdrsz = sizeof(int);
	for (i = 0; i < nexecs; i++) {
		if (execsw[i]->es_hdrsz > exec_maxhdrsz)
			exec_maxhdrsz = execsw[i]->es_hdrsz;
	}

	return 0;
}
static int
exec_sigcode_map(struct proc *p, const struct emul *e)
{
	vaddr_t va;
	vsize_t sz;
	int error;
	struct uvm_object *uobj;

	sz = (vaddr_t)e->e_esigcode - (vaddr_t)e->e_sigcode;

	if (e->e_sigobject == NULL || sz == 0) {
		return 0;
	}
	/*
	 * If we don't have a sigobject for this emulation, create one.
	 *
	 * sigobject is an anonymous memory object (just like SYSV shared
	 * memory) that we keep a permanent reference to and that we map
	 * in all processes that need this sigcode. The creation is simple,
	 * we create an object, add a permanent reference to it, map it in
	 * kernel space, copy out the sigcode to it and unmap it.
	 * We map it with PROT_READ|PROT_EXEC into the process just
	 * the way sys_mmap() would map it.
	 */
	uobj = *e->e_sigobject;
	if (uobj == NULL) {
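		/*
		 * Classic double-checked creation: re-test e_sigobject
		 * under sigobject_lock so only one caller builds the
		 * shared sigcode object.
		 */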
		mutex_enter(&sigobject_lock);
		if ((uobj = *e->e_sigobject) == NULL) {
			uobj = uao_create(sz, 0);
			(*uobj->pgops->pgo_reference)(uobj);
			va = vm_map_min(kernel_map);
			if ((error = uvm_map(kernel_map, &va, round_page(sz),
			    uobj, 0, 0,
			    UVM_MAPFLAG(UVM_PROT_RW, UVM_PROT_RW,
			    UVM_INH_SHARE, UVM_ADV_RANDOM, 0)))) {
				printf("kernel mapping failed %d\n", error);
				(*uobj->pgops->pgo_detach)(uobj);
				mutex_exit(&sigobject_lock);
				return error;
			}
			memcpy((void *)va, e->e_sigcode, sz);
#ifdef PMAP_NEED_PROCWR
			pmap_procwr(&proc0, va, sz);
#endif
			uvm_unmap(kernel_map, va, va + round_page(sz));
			*e->e_sigobject = uobj;
		}
		mutex_exit(&sigobject_lock);
	}
	/* Just a hint to uvm_map where to put it. */
	va = e->e_vm_default_addr(p, (vaddr_t)p->p_vmspace->vm_daddr,
	    round_page(sz));

#ifdef __alpha__
	/*
	 * Tru64 puts /sbin/loader at the end of user virtual memory,
	 * which causes the above calculation to put the sigcode at
	 * an invalid address.  Put it just below the text instead.
	 */
	if (va == (vaddr_t)vm_map_max(&p->p_vmspace->vm_map)) {
		va = (vaddr_t)p->p_vmspace->vm_taddr - round_page(sz);
	}
#endif /* __alpha__ */
->pgops
->pgo_reference
)(uobj
);
1505 error
= uvm_map(&p
->p_vmspace
->vm_map
, &va
, round_page(sz
),
1507 UVM_MAPFLAG(UVM_PROT_RX
, UVM_PROT_RX
, UVM_INH_SHARE
,
1508 UVM_ADV_RANDOM
, 0));
1510 (*uobj
->pgops
->pgo_detach
)(uobj
);
1513 p
->p_sigctx
.ps_sigcode
= (void *)va
;