4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2011, Joyent Inc. All rights reserved.
25 * Copyright (c) 2016 by Delphix. All rights reserved.
28 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
29 /* All Rights Reserved */
31 #include <sys/param.h>
32 #include <sys/types.h>
34 #include <sys/sysmacros.h>
36 #include <sys/systm.h>
39 #include <sys/utsname.h>
40 #include <sys/errno.h>
41 #include <sys/signal.h>
42 #include <sys/siginfo.h>
43 #include <sys/fault.h>
44 #include <sys/syscall.h>
45 #include <sys/ucontext.h>
46 #include <sys/prsystm.h>
47 #include <sys/vnode.h>
50 #include <sys/pathname.h>
53 #include <sys/debug.h>
54 #include <sys/stack.h>
56 #include <sys/schedctl.h>
58 #include <sys/corectl.h>
59 #include <sys/cmn_err.h>
62 #include <sys/nbmlock.h>
65 #include <sys/contract/process_impl.h>
69 * Processes running within a zone potentially dump core in 3 locations,
70 * based on the per-process, per-zone, and the global zone's core settings.
72 * Per-zone and global zone settings are often referred to as "global"
73 * settings since they apply to the system (or zone) as a whole, as
74 * opposed to a particular process.
77 CORE_PROC
, /* Use per-process settings */
78 CORE_ZONE
, /* Use per-zone settings */
79 CORE_GLOBAL
/* Use global zone settings */
83 * Log information about "global" core dumps to syslog.
86 core_log(struct core_globals
*cg
, int error
, const char *why
, const char *path
,
91 char *fn
= PTOU(p
)->u_comm
;
93 if (!(cg
->core_options
& CC_GLOBAL_LOG
))
97 zcmn_err(zoneid
, CE_NOTE
, "core_log: %s[%d] %s", fn
, pid
, why
);
99 zcmn_err(zoneid
, CE_NOTE
, "core_log: %s[%d] %s: %s", fn
, pid
,
102 zcmn_err(zoneid
, CE_NOTE
, "core_log: %s[%d] %s, errno=%d: %s",
103 fn
, pid
, why
, error
, path
);
107 * Private version of vn_remove().
108 * Refuse to unlink a directory or an unwritable file.
109 * Also allow the process to access files normally inaccessible due to
110 * chroot(2) or Zone limitations.
113 remove_core_file(char *fp
, enum core_types core_type
)
115 vnode_t
*vp
= NULL
; /* entry vnode */
116 vnode_t
*dvp
; /* ptr to parent dir vnode */
120 pathname_t pn
; /* name of entry */
121 vnode_t
*startvp
, *rootvp
;
123 if ((error
= pn_get(fp
, UIO_SYSSPACE
, &pn
)) != 0)
126 * Determine what rootvp to use.
128 if (core_type
== CORE_PROC
) {
129 rootvp
= (PTOU(curproc
)->u_rdir
== NULL
?
130 curproc
->p_zone
->zone_rootvp
: PTOU(curproc
)->u_rdir
);
131 startvp
= (fp
[0] == '/' ? rootvp
: PTOU(curproc
)->u_cdir
);
132 } else if (core_type
== CORE_ZONE
) {
133 startvp
= curproc
->p_zone
->zone_rootvp
;
134 rootvp
= curproc
->p_zone
->zone_rootvp
;
136 ASSERT(core_type
== CORE_GLOBAL
);
141 if (rootvp
!= rootdir
)
143 if ((error
= lookuppnvp(&pn
, NULL
, NO_FOLLOW
, &dvp
, &vp
, rootvp
,
144 startvp
, CRED())) != 0) {
149 * Succeed if there is no file.
150 * Fail if the file is not a regular file.
151 * Fail if the filesystem is mounted read-only.
152 * Fail if the file is not writeable.
153 * Fail if the file has NBMAND share reservations.
157 else if (vp
->v_type
!= VREG
)
159 else if ((dvfsp
= dvp
->v_vfsp
) != NULL
&&
160 (dvfsp
->vfs_flag
& VFS_RDONLY
))
162 else if ((error
= VOP_ACCESS(vp
, VWRITE
, 0, CRED(), NULL
)) == 0) {
163 if (nbl_need_check(vp
)) {
164 nbl_start_crit(vp
, RW_READER
);
166 if (nbl_share_conflict(vp
, NBL_REMOVE
, NULL
)) {
171 error
= VOP_REMOVE(dvp
, pn
.pn_path
, CRED(), NULL
, 0);
186 * Create the core file in a location that may be normally inaccessible due
187 * to chroot(2) or Zone limitations.
190 create_core_file(char *fp
, enum core_types core_type
, vnode_t
**vpp
)
193 mode_t perms
= (S_IRUSR
| S_IWUSR
);
199 cred_t
*credp
= CRED();
201 if (core_type
== CORE_PROC
) {
203 dvp
= NULL
; /* regular lookup */
205 vnode_t
*startvp
, *rootvp
;
207 ASSERT(core_type
== CORE_ZONE
|| core_type
== CORE_GLOBAL
);
209 * This is tricky because we want to dump the core in
210 * a location which may normally be inaccessible
211 * to us (due to chroot(2) limitations, or zone
212 * membership), and hence need to overcome u_rdir
213 * restrictions. The basic idea is to separate
214 * the path from the filename, lookup the
215 * pathname separately (starting from the global
216 * zone's root directory), and then open the
217 * file starting at the directory vnode.
219 if (error
= pn_get(fp
, UIO_SYSSPACE
, &pn
))
222 if (core_type
== CORE_ZONE
) {
223 startvp
= rootvp
= curproc
->p_zone
->zone_rootvp
;
225 startvp
= rootvp
= rootdir
;
228 * rootvp and startvp will be VN_RELE()'d by lookuppnvp() if
232 if (rootvp
!= rootdir
)
235 * Do a lookup on the full path, ignoring the actual file, but
236 * finding the vnode for the directory. It's OK if the file
237 * doesn't exist -- it most likely won't since we just removed
240 error
= lookuppnvp(&pn
, NULL
, FOLLOW
, &dvp
, NULLVPP
,
241 rootvp
, startvp
, credp
);
247 * Now find the final component in the path (i.e., the name of
250 if (error
= pn_get(fp
, UIO_SYSSPACE
, &pn
)) {
257 error
= vn_openat(file
, UIO_SYSSPACE
,
258 FWRITE
| FTRUNC
| FEXCL
| FCREAT
| FOFFMAX
,
259 perms
, &vp
, CRCREAT
, PTOU(curproc
)->u_cmask
, dvp
, -1);
260 if (core_type
!= CORE_PROC
) {
265 * Don't dump a core file owned by "nobody".
267 vattr
.va_mask
= AT_UID
;
269 (VOP_GETATTR(vp
, &vattr
, 0, credp
, NULL
) != 0 ||
270 vattr
.va_uid
!= crgetuid(credp
))) {
271 (void) VOP_CLOSE(vp
, FWRITE
, 1, (offset_t
)0,
274 (void) remove_core_file(fp
, core_type
);
282 * Install the specified held cred into the process, and return a pointer to
283 * the held cred which was previously the value of p->p_cred.
286 set_cred(proc_t
*p
, cred_t
*newcr
)
289 uid_t olduid
, newuid
;
292 * Place a hold on the existing cred, and then install the new
293 * cred into the proc structure.
295 mutex_enter(&p
->p_crlock
);
299 mutex_exit(&p
->p_crlock
);
301 ASSERT(crgetzoneid(oldcr
) == crgetzoneid(newcr
));
304 * If the real uid is changing, keep the per-user process
307 olduid
= crgetruid(oldcr
);
308 newuid
= crgetruid(newcr
);
309 if (olduid
!= newuid
) {
310 zoneid_t zoneid
= crgetzoneid(newcr
);
312 mutex_enter(&pidlock
);
313 upcount_dec(olduid
, zoneid
);
314 upcount_inc(newuid
, zoneid
);
315 mutex_exit(&pidlock
);
319 * Broadcast the new cred to all the other threads. The old
320 * cred can be safely returned because we have a hold on it.
327 do_core(char *fp
, int sig
, enum core_types core_type
, struct core_globals
*cg
)
330 cred_t
*credp
= CRED();
335 cred_t
*ocredp
= NULL
;
337 core_content_t content
;
341 if (core_type
== CORE_GLOBAL
|| core_type
== CORE_ZONE
) {
342 mutex_enter(&cg
->core_lock
);
343 content
= cg
->core_content
;
344 mutex_exit(&cg
->core_lock
);
345 rlimit
= cg
->core_rlimit
;
347 mutex_enter(&p
->p_lock
);
348 rlimit
= rctl_enforced_value(rctlproc_legacy
[RLIMIT_CORE
],
350 content
= corectl_content_value(p
->p_content
);
351 mutex_exit(&p
->p_lock
);
358 * If SNOCD is set, or if the effective, real, and saved ids do
359 * not match up, no one but a privileged user is allowed to view
360 * this core file. Set the credentials and the owner to root.
362 if ((p
->p_flag
& SNOCD
) ||
363 (uid
= crgetuid(credp
)) != crgetruid(credp
) ||
364 uid
!= crgetsuid(credp
) ||
365 (gid
= crgetgid(credp
)) != crgetrgid(credp
) ||
366 gid
!= crgetsgid(credp
)) {
368 * Because this is insecure against certain forms of file
369 * system attack, do it only if set-id core files have been
370 * enabled via corectl(CC_GLOBAL_SETID | CC_PROCESS_SETID).
372 if (((core_type
== CORE_GLOBAL
|| core_type
== CORE_ZONE
) &&
373 !(cg
->core_options
& CC_GLOBAL_SETID
)) ||
374 (core_type
== CORE_PROC
&&
375 !(cg
->core_options
& CC_PROCESS_SETID
)))
382 * If we are doing a "global" core dump or a set-id core dump,
383 * use kcred to do the dumping.
385 if (core_type
== CORE_GLOBAL
|| core_type
== CORE_ZONE
|| is_setid
) {
387 * Use the zone's "kcred" to prevent privilege
390 credp
= zone_get_kcred(getzoneid());
391 ASSERT(credp
!= NULL
);
392 ocredp
= set_cred(p
, credp
);
396 * First remove any existing core file, then
397 * open the new core file with (O_EXCL|O_CREAT).
399 * The reasons for doing this are manifold:
401 * For security reasons, we don't want root processes
402 * to dump core through a symlink because that would
403 * allow a malicious user to clobber any file on
404 * the system if they could convince a root process,
405 * perhaps a set-uid root process that they started,
406 * to dump core in a directory writable by that user.
407 * Similar security reasons apply to hard links.
408 * For symmetry we do this unconditionally, not
409 * just for root processes.
411 * If the process has the core file mmap()d into the
412 * address space, we would be modifying the address
413 * space that we are trying to dump if we did not first
414 * remove the core file. (The command "file core"
415 * is the canonical example of this possibility.)
417 * Opening the core file with O_EXCL|O_CREAT ensures that
418 * two concurrent core dumps don't clobber each other.
419 * One is bound to lose; we don't want to make both lose.
421 if ((error
= remove_core_file(fp
, core_type
)) == 0) {
422 error
= create_core_file(fp
, core_type
, &vp
);
426 * Now that vn_open is complete, reset the process's credentials if
427 * we changed them, and make 'credp' point to kcred used
428 * above. We use 'credp' to do i/o on the core file below, but leave
429 * p->p_cred set to the original credential to allow the core file
430 * to record this information.
433 credp
= set_cred(p
, ocredp
);
438 (void) flush_user_windows_to_stack(NULL
);
440 if ((eswp
= PTOU(curproc
)->u_execsw
) == NULL
||
441 (eswp
= findexec_by_magic(eswp
->exec_magic
)) == NULL
) {
444 error
= eswp
->exec_core(vp
, p
, credp
, rlimit
, sig
,
446 rw_exit(eswp
->exec_lock
);
449 closerr
= VOP_CLOSE(vp
, FWRITE
, 1, (offset_t
)0, credp
, NULL
);
462 * Convert a core name pattern to a pathname.
465 expand_string(const char *pat
, char *fp
, int size
, cred_t
*cr
)
473 while ((c
= *pat
++) != '\0') {
475 return (ENAMETOOLONG
);
481 if ((c
= *pat
++) == '\0') {
488 (void) sprintf((s
= buf
), "%d", p
->p_pid
);
490 case 'u': /* effective uid */
491 (void) sprintf((s
= buf
), "%u", crgetuid(p
->p_cred
));
493 case 'g': /* effective gid */
494 (void) sprintf((s
= buf
), "%u", crgetgid(p
->p_cred
));
496 case 'f': /* exec'd filename */
499 case 'd': /* exec'd dirname */
501 * Even if pathname caching is disabled, we should
502 * be able to lookup the pathname for a directory.
504 if (p
->p_execdir
!= NULL
&& vnodetopath(NULL
,
505 p
->p_execdir
, fp
, size
, cr
) == 0) {
506 len
= (int)strlen(fp
);
509 ASSERT(fp
[0] == '/');
512 * Strip off the leading slash.
514 for (i
= 0; i
< len
; i
++) {
527 case 'n': /* system nodename */
530 case 'm': /* machine (sun4u, etc) */
533 case 't': /* decimal value of time(2) */
534 (void) sprintf((s
= buf
), "%ld", gethrestime_sec());
537 s
= p
->p_zone
->zone_name
;
540 /* This is zonepath + "/root/", except for GZ */
541 s
= p
->p_zone
->zone_rootpath
;
544 (void) strcpy((s
= buf
), "%");
553 len
= (int)strlen(s
);
554 if ((size
-= len
) <= 0)
555 return (ENAMETOOLONG
);
556 (void) strcpy(fp
, s
);
557 /* strip trailing "/root/" from non-GZ zonepath string */
558 if (c
== 'Z' && len
> 6) {
560 ASSERT(strncmp(fp
+ len
, "/root/", 6) == 0);
570 dump_one_core(int sig
, rlim64_t rlimit
, enum core_types core_type
,
571 struct core_globals
*cg
, char **name
)
580 ASSERT(core_type
== CORE_ZONE
|| core_type
== CORE_GLOBAL
);
581 zoneid
= (core_type
== CORE_ZONE
? getzoneid() : GLOBAL_ZONEID
);
583 mutex_enter(&cg
->core_lock
);
584 if ((rp
= cg
->core_file
) != NULL
)
586 mutex_exit(&cg
->core_lock
);
588 core_log(cg
, 0, "no global core file pattern exists", NULL
,
590 return (1); /* core file not generated */
592 fp
= kmem_alloc(MAXPATHLEN
, KM_SLEEP
);
593 cr
= zone_get_kcred(getzoneid());
594 error
= expand_string(refstr_value(rp
), fp
, MAXPATHLEN
, cr
);
597 core_log(cg
, 0, "global core file pattern too long",
598 refstr_value(rp
), zoneid
);
599 } else if ((error
= do_core(fp
, sig
, core_type
, cg
)) == 0) {
600 core_log(cg
, 0, "core dumped", fp
, zoneid
);
601 } else if (error
== ENOTSUP
) {
602 core_log(cg
, 0, "setid process, core not dumped", fp
, zoneid
);
603 } else if (error
== ENOSPC
) {
604 core_log(cg
, 0, "no space left on device, core truncated",
606 } else if (error
== EFBIG
) {
608 core_log(cg
, 0, "core rlimit is zero, core not dumped",
611 core_log(cg
, 0, "core rlimit exceeded, core truncated",
614 * In addition to the core result logging, we
615 * may also have explicit actions defined on
616 * core file size violations via the resource
619 mutex_enter(&p
->p_lock
);
620 (void) rctl_action(rctlproc_legacy
[RLIMIT_CORE
],
621 p
->p_rctls
, p
, RCA_SAFE
);
622 mutex_exit(&p
->p_lock
);
624 core_log(cg
, error
, "core dump failed", fp
, zoneid
);
630 kmem_free(fp
, MAXPATHLEN
);
635 core(int sig
, int ext
)
638 klwp_t
*lwp
= ttolwp(curthread
);
640 char *fp_process
= NULL
, *fp_global
= NULL
, *fp_zone
= NULL
;
647 struct core_globals
*my_cg
, *global_cg
;
649 global_cg
= zone_getspecific(core_zone_key
, global_zone
);
650 ASSERT(global_cg
!= NULL
);
652 my_cg
= zone_getspecific(core_zone_key
, curproc
->p_zone
);
653 ASSERT(my_cg
!= NULL
);
655 /* core files suppressed? */
656 if (!(my_cg
->core_options
& (CC_PROCESS_PATH
|CC_GLOBAL_PATH
)) &&
657 !(global_cg
->core_options
& CC_GLOBAL_PATH
)) {
658 if (!ext
&& p
->p_ct_process
!= NULL
)
659 contract_process_core(p
->p_ct_process
, p
, sig
,
665 * Block all signals except SIGHUP, SIGINT, SIGKILL, and SIGTERM; no
666 * other signal may interrupt a core dump. For each signal, we
667 * explicitly unblock it and set it in p_siginfo to allow for some
668 * minimal error reporting. Additionally, we get the current limit on
669 * core file size for handling later error reporting.
671 mutex_enter(&p
->p_lock
);
673 p
->p_flag
|= SDOCORE
;
674 schedctl_finish_sigblock(curthread
);
675 sigmask
= curthread
->t_hold
; /* remember for later */
676 sigfillset(&sighold
);
677 if (!sigismember(&sigmask
, SIGHUP
))
678 sigdelset(&sighold
, SIGHUP
);
679 if (!sigismember(&sigmask
, SIGINT
))
680 sigdelset(&sighold
, SIGINT
);
681 if (!sigismember(&sigmask
, SIGKILL
))
682 sigdelset(&sighold
, SIGKILL
);
683 if (!sigismember(&sigmask
, SIGTERM
))
684 sigdelset(&sighold
, SIGTERM
);
686 sigaddset(&p
->p_siginfo
, SIGHUP
);
687 sigaddset(&p
->p_siginfo
, SIGINT
);
688 sigaddset(&p
->p_siginfo
, SIGKILL
);
689 sigaddset(&p
->p_siginfo
, SIGTERM
);
691 curthread
->t_hold
= sighold
;
693 rlimit
= rctl_enforced_value(rctlproc_legacy
[RLIMIT_CORE
], p
->p_rctls
,
696 mutex_exit(&p
->p_lock
);
699 * Undo any watchpoints.
701 pr_free_watched_pages(p
);
704 * The presence of a current signal prevents file i/o
705 * from succeeding over a network. We copy the current
706 * signal information to the side and cancel the current
707 * signal so that the core dump will succeed.
709 ASSERT(lwp
->lwp_cursig
== sig
);
712 if (lwp
->lwp_curinfo
== NULL
) {
713 bzero(&lwp
->lwp_siginfo
, sizeof (k_siginfo_t
));
714 lwp
->lwp_siginfo
.si_signo
= sig
;
715 lwp
->lwp_siginfo
.si_code
= SI_NOINFO
;
717 bcopy(&lwp
->lwp_curinfo
->sq_info
,
718 &lwp
->lwp_siginfo
, sizeof (k_siginfo_t
));
719 siginfofree(lwp
->lwp_curinfo
);
720 lwp
->lwp_curinfo
= NULL
;
724 * Convert the core file name patterns into path names
725 * and call do_core() to write the core files.
728 if (my_cg
->core_options
& CC_PROCESS_PATH
) {
729 mutex_enter(&p
->p_lock
);
730 if (p
->p_corefile
!= NULL
)
731 rp
= corectl_path_value(p
->p_corefile
);
734 mutex_exit(&p
->p_lock
);
736 fp_process
= kmem_alloc(MAXPATHLEN
, KM_SLEEP
);
737 error1
= expand_string(refstr_value(rp
),
738 fp_process
, MAXPATHLEN
, p
->p_cred
);
740 error1
= do_core(fp_process
, sig
, CORE_PROC
,
746 if (my_cg
->core_options
& CC_GLOBAL_PATH
)
747 error2
= dump_one_core(sig
, rlimit
, CORE_ZONE
, my_cg
,
749 if (global_cg
!= my_cg
&& (global_cg
->core_options
& CC_GLOBAL_PATH
))
750 error3
= dump_one_core(sig
, rlimit
, CORE_GLOBAL
, global_cg
,
754 * Restore the signal hold mask.
756 mutex_enter(&p
->p_lock
);
757 curthread
->t_hold
= sigmask
;
758 mutex_exit(&p
->p_lock
);
760 if (!ext
&& p
->p_ct_process
!= NULL
)
761 contract_process_core(p
->p_ct_process
, p
, sig
,
762 error1
== 0 ? fp_process
: NULL
,
763 error2
== 0 ? fp_global
: NULL
,
764 error3
== 0 ? fp_zone
: NULL
);
766 if (fp_process
!= NULL
)
767 kmem_free(fp_process
, MAXPATHLEN
);
768 if (fp_global
!= NULL
)
769 kmem_free(fp_global
, MAXPATHLEN
);
771 kmem_free(fp_zone
, MAXPATHLEN
);
774 * Return non-zero if no core file was created.
776 return (error1
!= 0 && error2
!= 0 && error3
!= 0);
780 * Maximum chunk size for dumping core files,
781 * size in pages, patchable in /etc/system
/*
 * core_seg() multiplies this page count by PAGESIZE and clamps the length
 * of each individual core_write() request to that many bytes.
 */
783 uint_t core_chunk
= 32;
786 * The delay between core_write() calls, in microseconds. The default
787 * matches one "normal" clock tick, or 10 milliseconds.
/*
 * Default of 10000 microseconds equals 10 milliseconds.
 * NOTE(review): the code that consumes this delay is not visible in this
 * excerpt -- confirm where and how it is applied.
 */
789 clock_t core_delay_usec
= 10000;
792 * Common code to core dump process memory. The core_seg routine does i/o
793 * using core_write() below, and so it has the same failure semantics.
796 core_seg(proc_t
*p
, vnode_t
*vp
, offset_t offset
, caddr_t addr
, size_t size
,
797 rlim64_t rlimit
, cred_t
*credp
)
805 for (base
= addr
; base
< eaddr
; base
+= len
) {
807 if (as_memory(p
->p_as
, &base
, &len
) != 0)
811 * Reduce len to a reasonable value so that we don't
812 * overwhelm the VM system with a monstrously large
813 * single write and cause pageout to stop running.
815 if (len
> (size_t)core_chunk
* PAGESIZE
)
816 len
= (size_t)core_chunk
* PAGESIZE
;
818 err
= core_write(vp
, UIO_USERSPACE
,
819 offset
+ (size_t)(base
- addr
), base
, len
, rlimit
, credp
);
825 * If we have taken a signal, return EINTR to allow the dump
828 if (issig(JUSTLOOKING
) && issig(FORREAL
))
836 * Wrapper around vn_rdwr to perform writes to a core file. For core files,
837 * we always want to write as much as we possibly can, and then make sure to
838 * return either 0 to the caller (for success), or the actual errno value.
839 * By using this function, the caller can omit additional code for handling
840 * retries and errors for partial writes returned by vn_rdwr. If vn_rdwr
841 * unexpectedly returns zero but no progress has been made, we return ENOSPC.
844 core_write(vnode_t
*vp
, enum uio_seg segflg
, offset_t offset
,
845 const void *buf
, size_t len
, rlim64_t rlimit
, cred_t
*credp
)
851 error
= vn_rdwr(UIO_WRITE
, vp
, (caddr_t
)buf
, len
, offset
,
852 segflg
, 0, rlimit
, credp
, &resid
);
860 buf
= (const char *)buf
+ len
- resid
;
861 offset
+= len
- resid
;