4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2011 Joyent Inc.
28 * method.c - method execution functions
30 * This file contains the routines needed to run a method: a fork(2)-exec(2)
31 * invocation monitored using either the contract filesystem or waitpid(2).
32 * (Plain fork1(2) support is provided in fork.c.)
35 * When we restart a service, we want to transfer any contracts that the old
36 * service's contract inherited. This means that (a) we must not abandon the
37 * old contract when the service dies and (b) we must write the id of the old
38 * contract into the terms of the new contract. There should be limits to
39 * (a), though, since we don't want to keep the contract around forever. To
40 * this end we'll say that services in the offline state may have a contract
41 * to be transferred and services in the disabled or maintenance states cannot.
42 * This means that when a service transitions from online (or degraded) to
43 * offline, the contract should be preserved, and when the service transitions
44 * from offline to online (i.e., the start method), we'll transfer inherited
48 #include <sys/contract/process.h>
52 #include <sys/types.h>
59 #include <libcontract.h>
60 #include <libcontract_priv.h>
62 #include <librestart.h>
74 #include <libscf_priv.h>
/* Shell used to interpret method command strings. */
#define	SBIN_SH		"/sbin/sh"

/*
 * Non-zero while a contract is in the process of being stored into the
 * svc.startd internal hash table.  Updated with atomic_add_16() around the
 * fork/store sequence in method_run().
 */
volatile uint16_t storing_contract = 0;
87 * Mapping from restart_on method-type to contract events. Must correspond to
88 * enum method_restart_t.
90 static uint_t method_events
[] = {
91 /* METHOD_RESTART_ALL */
92 CT_PR_EV_HWERR
| CT_PR_EV_SIGNAL
| CT_PR_EV_CORE
| CT_PR_EV_EMPTY
,
93 /* METHOD_RESTART_EXTERNAL_FAULT */
94 CT_PR_EV_HWERR
| CT_PR_EV_SIGNAL
,
95 /* METHOD_RESTART_ANY_FAULT */
96 CT_PR_EV_HWERR
| CT_PR_EV_SIGNAL
| CT_PR_EV_CORE
100 * method_record_start(restarter_inst_t *)
101 * Record a service start for rate limiting. Place the current time
102 * in the circular array of instance starts.
105 method_record_start(restarter_inst_t
*inst
)
107 int index
= inst
->ri_start_index
++ % RINST_START_TIMES
;
109 inst
->ri_start_time
[index
] = gethrtime();
113 * method_rate_critical(restarter_inst_t *)
114 * Return true if the average start interval is less than the permitted
115 * interval. The implicit interval defaults to RINST_FAILURE_RATE_NS and
116 * RINST_START_TIMES but may be overridden with the svc properties
117 * startd/critical_failure_count and startd/critical_failure_period
118 * which represent the number of failures to consider and the amount of
119 * time in seconds in which that number may occur, respectively. Note that
120 * this time is measured as of the transition to 'enabled' rather than wall
122 * Implicit success if insufficient measurements for an average exist.
125 method_rate_critical(restarter_inst_t
*inst
)
127 hrtime_t critical_failure_period
;
128 uint_t critical_failure_count
= RINST_START_TIMES
;
129 uint_t n
= inst
->ri_start_index
;
131 uint64_t scf_fr
, scf_st
;
132 scf_propvec_t
*prop
= NULL
;
133 scf_propvec_t restart_critical
[] = {
134 { "critical_failure_period", NULL
, SCF_TYPE_INTEGER
, NULL
, 0 },
135 { "critical_failure_count", NULL
, SCF_TYPE_INTEGER
, NULL
, 0 },
139 if (instance_is_wait_style(inst
))
140 critical_failure_period
= RINST_WT_SVC_FAILURE_RATE_NS
;
142 critical_failure_period
= RINST_FAILURE_RATE_NS
;
144 restart_critical
[0].pv_ptr
= &scf_fr
;
145 restart_critical
[1].pv_ptr
= &scf_st
;
147 if (scf_read_propvec(inst
->ri_i
.i_fmri
, "startd",
148 B_TRUE
, restart_critical
, &prop
) != SCF_FAILED
) {
150 * critical_failure_period is expressed
151 * in seconds but tracked in ns
153 critical_failure_period
= (hrtime_t
)scf_fr
* NANOSEC
;
154 critical_failure_count
= (uint_t
)scf_st
;
156 if (inst
->ri_start_index
< critical_failure_count
)
160 (inst
->ri_start_time
[(n
- 1) % critical_failure_count
] -
161 inst
->ri_start_time
[n
% critical_failure_count
]) /
162 (critical_failure_count
- 1);
164 return (avg_ns
< critical_failure_period
);
168 * int method_is_transient()
169 * Determine if the method for the given instance is transient,
170 * from a contract perspective. Return 1 if it is, and 0 if it isn't.
173 method_is_transient(restarter_inst_t
*inst
, int type
)
175 if (instance_is_transient_style(inst
) || type
!= METHOD_START
)
182 * void method_store_contract()
183 * Store the newly created contract id into local structures and
184 * the repository. If the repository connection is broken it is rebound.
187 method_store_contract(restarter_inst_t
*inst
, int type
, ctid_t
*cid
)
192 if (errno
= contract_latest(cid
))
193 uu_die("%s: Couldn't get new contract's id", inst
->ri_i
.i_fmri
);
195 primary
= !method_is_transient(inst
, type
);
198 if (inst
->ri_i
.i_transient_ctid
!= 0) {
199 log_framework(LOG_INFO
,
200 "%s: transient ctid expected to be 0 but "
201 "was set to %ld\n", inst
->ri_i
.i_fmri
,
202 inst
->ri_i
.i_transient_ctid
);
205 inst
->ri_i
.i_transient_ctid
= *cid
;
207 if (inst
->ri_i
.i_primary_ctid
!= 0) {
209 * There was an old contract that we transferred.
212 method_remove_contract(inst
, B_TRUE
, B_FALSE
);
215 if (inst
->ri_i
.i_primary_ctid
!= 0) {
216 log_framework(LOG_INFO
,
217 "%s: primary ctid expected to be 0 but "
218 "was set to %ld\n", inst
->ri_i
.i_fmri
,
219 inst
->ri_i
.i_primary_ctid
);
222 inst
->ri_i
.i_primary_ctid
= *cid
;
223 inst
->ri_i
.i_primary_ctid_stopped
= 0;
225 log_framework(LOG_DEBUG
, "Storing primary contract %ld for "
226 "%s.\n", *cid
, inst
->ri_i
.i_fmri
);
228 contract_hash_store(*cid
, inst
->ri_id
);
232 if (inst
->ri_mi_deleted
)
235 r
= restarter_store_contract(inst
->ri_m_inst
, *cid
, primary
?
236 RESTARTER_CONTRACT_PRIMARY
: RESTARTER_CONTRACT_TRANSIENT
);
242 inst
->ri_mi_deleted
= B_TRUE
;
246 libscf_handle_rebind(scf_instance_handle(inst
->ri_m_inst
));
250 libscf_reget_instance(inst
);
257 uu_die("%s: Couldn't store contract id %ld",
258 inst
->ri_i
.i_fmri
, *cid
);
263 bad_error("restarter_store_contract", r
);
268 * void method_remove_contract()
269 * Remove any non-permanent contracts from internal structures and
270 * the repository, then abandon them.
273 * ECANCELED - inst was deleted from the repository
275 * If the repository connection was broken, it is rebound.
278 method_remove_contract(restarter_inst_t
*inst
, boolean_t primary
,
281 ctid_t
* const ctidp
= primary
? &inst
->ri_i
.i_primary_ctid
:
282 &inst
->ri_i
.i_transient_ctid
;
288 log_framework(LOG_DEBUG
, "Removing %s contract %lu for %s.\n",
289 primary
? "primary" : "transient", *ctidp
, inst
->ri_i
.i_fmri
);
292 contract_abandon(*ctidp
);
295 if (inst
->ri_mi_deleted
) {
300 r
= restarter_remove_contract(inst
->ri_m_inst
, *ctidp
, primary
?
301 RESTARTER_CONTRACT_PRIMARY
: RESTARTER_CONTRACT_TRANSIENT
);
307 inst
->ri_mi_deleted
= B_TRUE
;
311 libscf_handle_rebind(scf_instance_handle(inst
->ri_m_inst
));
315 libscf_reget_instance(inst
);
322 log_error(LOG_INFO
, "%s: Couldn't remove contract id %ld: "
323 "%s.\n", inst
->ri_i
.i_fmri
, *ctidp
, strerror(r
));
328 bad_error("restarter_remove_contract", r
);
333 contract_hash_remove(*ctidp
);
/*
 * Human-readable method names, indexed by method type (callers assert that
 * the type lies in the range 0..2 before indexing).  Neither the strings nor
 * the table itself is ever modified.
 */
static const char *const method_names[] = { "start", "stop", "refresh" };
341 * int method_ready_contract(restarter_inst_t *, int, method_restart_t, int)
343 * Activate a contract template for the type method of inst. type,
344 * restart_on, and cte_mask dictate the critical events term of the contract.
347 * ECANCELED - inst has been deleted from the repository
350 method_ready_contract(restarter_inst_t
*inst
, int type
,
351 method_restart_t restart_on
, uint_t cte_mask
)
353 int tmpl
, err
, istrans
, iswait
, ret
;
354 uint_t cevents
, fevents
;
357 * Correctly supporting wait-style services is tricky without
358 * rearchitecting startd to cope with multiple event sources
359 * simultaneously trying to stop an instance. Until a better
360 * solution is implemented, we avoid this problem for
361 * wait-style services by making contract events fatal and
362 * letting the wait code alone handle stopping the service.
364 iswait
= instance_is_wait_style(inst
);
365 istrans
= method_is_transient(inst
, type
);
367 tmpl
= open64(CTFS_ROOT
"/process/template", O_RDWR
);
369 uu_die("Could not create contract template");
372 * We assume non-login processes are unlikely to create
373 * multiple process groups, and set CT_PR_PGRPONLY for all
374 * wait-style services' contracts.
376 err
= ct_pr_tmpl_set_param(tmpl
, CT_PR_INHERIT
| CT_PR_REGENT
|
377 (iswait
? CT_PR_PGRPONLY
: 0));
384 assert(restart_on
>= 0);
385 assert(restart_on
<= METHOD_RESTART_ANY_FAULT
);
386 cevents
= method_events
[restart_on
] & ~cte_mask
;
388 (method_events
[restart_on
] & ~cte_mask
& CT_PR_ALLFATAL
) :
392 err
= ct_tmpl_set_critical(tmpl
, cevents
);
395 err
= ct_tmpl_set_informative(tmpl
, 0);
397 err
= ct_pr_tmpl_set_fatal(tmpl
, fevents
);
400 err
= ct_tmpl_set_cookie(tmpl
, istrans
? METHOD_OTHER_COOKIE
:
401 METHOD_START_COOKIE
);
404 if (type
== METHOD_START
&& inst
->ri_i
.i_primary_ctid
!= 0) {
405 ret
= ct_pr_tmpl_set_transfer(tmpl
, inst
->ri_i
.i_primary_ctid
);
411 /* No contracts for you! */
412 method_remove_contract(inst
, B_TRUE
, B_TRUE
);
413 if (inst
->ri_mi_deleted
) {
423 bad_error("ct_pr_tmpl_set_transfer", ret
);
427 err
= ct_pr_tmpl_set_svc_fmri(tmpl
, inst
->ri_i
.i_fmri
);
429 err
= ct_pr_tmpl_set_svc_aux(tmpl
, method_names
[type
]);
432 err
= ct_tmpl_activate(tmpl
);
445 exec_method(const restarter_inst_t
*inst
, int type
, const char *method
,
446 struct method_context
*mcp
, uint8_t need_session
)
453 cmd
= uu_msprintf("exec %s", method
);
455 if (inst
->ri_utmpx_prefix
[0] != '\0' && inst
->ri_utmpx_prefix
!= NULL
)
456 (void) utmpx_mark_init(getpid(), inst
->ri_utmpx_prefix
);
458 setlog(inst
->ri_logstem
);
459 log_instance(inst
, B_FALSE
, "Executing %s method (\"%s\").",
460 method_names
[type
], method
);
465 /* Set credentials. */
466 rsmc_errno
= restarter_set_method_context(mcp
, &errf
);
467 if (rsmc_errno
!= 0) {
468 log_instance(inst
, B_FALSE
,
469 "svc.startd could not set context for method: ");
471 if (rsmc_errno
== -1) {
472 if (strcmp(errf
, "core_set_process_path") == 0) {
473 log_instance(inst
, B_FALSE
,
474 "Could not set corefile path.");
475 } else if (strcmp(errf
, "setproject") == 0) {
476 log_instance(inst
, B_FALSE
, "%s: a resource "
477 "control assignment failed", errf
);
478 } else if (strcmp(errf
, "pool_set_binding") == 0) {
479 log_instance(inst
, B_FALSE
, "%s: a system "
480 "error occurred", errf
);
483 uu_warn("%s:%d: Bad function name \"%s\" for "
485 "restarter_set_method_context().\n",
486 __FILE__
, __LINE__
, errf
, rsmc_errno
);
494 if (errf
!= NULL
&& strcmp(errf
, "pool_set_binding") == 0) {
495 switch (rsmc_errno
) {
497 log_instance(inst
, B_FALSE
, "%s: the pool "
498 "could not be found", errf
);
502 log_instance(inst
, B_FALSE
, "%s: the "
503 "configuration is invalid", errf
);
507 log_instance(inst
, B_FALSE
, "%s: pool name "
508 "\"%s\" is invalid", errf
,
514 uu_warn("%s:%d: Bad error %d for function %s "
515 "in restarter_set_method_context().\n",
516 __FILE__
, __LINE__
, rsmc_errno
, errf
);
521 exit(SMF_EXIT_ERR_CONFIG
);
524 if (errf
!= NULL
&& strcmp(errf
, "chdir") == 0) {
525 switch (rsmc_errno
) {
534 log_instance(inst
, B_FALSE
, "%s: %s (\"%s\")",
536 strerror(rsmc_errno
), mcp
->working_dir
);
541 uu_warn("%s:%d: Bad error %d for function %s "
542 "in restarter_set_method_context().\n",
543 __FILE__
, __LINE__
, rsmc_errno
, errf
);
548 exit(SMF_EXIT_ERR_CONFIG
);
555 switch (rsmc_errno
) {
562 exit(SMF_EXIT_ERR_CONFIG
);
570 switch (rsmc_errno
) {
572 log_instance(inst
, B_FALSE
, "Out of memory.");
577 log_instance(inst
, B_FALSE
, "Missing passwd entry for "
579 exit(SMF_EXIT_ERR_CONFIG
);
584 uu_warn("%s:%d: Bad miscellaneous error %d from "
585 "restarter_set_method_context().\n", __FILE__
,
586 __LINE__
, rsmc_errno
);
592 nenv
= set_smf_env(mcp
->env
, mcp
->env_sz
, NULL
, inst
,
597 (void) execle(SBIN_SH
, SBIN_SH
, "-c", cmd
, NULL
, nenv
);
603 write_status(restarter_inst_t
*inst
, const char *mname
, int stat
)
608 if (inst
->ri_mi_deleted
)
611 r
= libscf_write_method_status(inst
->ri_m_inst
, mname
, stat
);
617 libscf_reget_instance(inst
);
621 inst
->ri_mi_deleted
= 1;
627 log_framework(LOG_INFO
, "Could not write exit status "
628 "for %s method of %s: %s.\n", mname
,
629 inst
->ri_i
.i_fmri
, strerror(r
));
634 bad_error("libscf_write_method_status", r
);
640 * Execute the type method of instp. If it requires a fork(), wait for it
641 * to return and return its exit code in *exit_code. Otherwise set
642 * *exit_code to 0 if the method succeeds & -1 if it fails. If the
643 * repository connection is broken, it is rebound, but inst may not be
647 * EINVAL - A correct method or method context couldn't be retrieved.
648 * EIO - Contract kill failed.
649 * EFAULT - Method couldn't be executed successfully.
650 * ELOOP - Retry threshold exceeded.
651 * ECANCELED - inst was deleted from the repository before method was run
652 * ERANGE - Timeout retry threshold exceeded.
653 * EAGAIN - Failed due to external cause, retry.
656 method_run(restarter_inst_t
**instp
, int type
, int *exit_code
)
661 method_restart_t restart_on
;
663 uint8_t need_session
;
665 scf_snapshot_t
*snap
;
668 struct method_context
*mcp
;
669 int result
= 0, timeout_fired
= 0;
673 uint8_t timeout_retry
;
676 restarter_inst_t
*inst
= *instp
;
677 int id
= inst
->ri_id
;
680 assert(MUTEX_HELD(&inst
->ri_lock
));
681 assert(instance_in_transition(inst
));
683 if (inst
->ri_mi_deleted
)
688 assert(0 <= type
&& type
<= 2);
689 mname
= method_names
[type
];
691 if (type
== METHOD_START
)
692 inst
->ri_pre_online_hook();
694 h
= scf_instance_handle(inst
->ri_m_inst
);
696 snap
= scf_snapshot_create(h
);
698 scf_instance_get_snapshot(inst
->ri_m_inst
, "running", snap
) != 0) {
699 log_framework(LOG_DEBUG
,
700 "Could not get running snapshot for %s. "
701 "Using editing version to run method %s.\n",
702 inst
->ri_i
.i_fmri
, mname
);
703 scf_snapshot_destroy(snap
);
708 * After this point, we may be logging to the instance log.
709 * Make sure we've noted where that log is as a property of
712 r
= libscf_note_method_log(inst
->ri_m_inst
, st
->st_log_prefix
,
715 log_framework(LOG_WARNING
,
716 "%s: couldn't note log location: %s\n",
717 inst
->ri_i
.i_fmri
, strerror(r
));
720 if ((method
= libscf_get_method(h
, type
, inst
, snap
, &restart_on
,
721 &cte_mask
, &need_session
, &timeout
, &timeout_retry
)) == NULL
) {
722 if (errno
== LIBSCF_PGROUP_ABSENT
) {
723 log_framework(LOG_DEBUG
,
724 "%s: instance has no method property group '%s'.\n",
725 inst
->ri_i
.i_fmri
, mname
);
726 if (type
== METHOD_REFRESH
)
727 log_instance(inst
, B_TRUE
, "No '%s' method "
728 "defined. Treating as :true.", mname
);
730 log_instance(inst
, B_TRUE
, "Method property "
731 "group '%s' is not present.", mname
);
732 scf_snapshot_destroy(snap
);
734 } else if (errno
== LIBSCF_PROPERTY_ABSENT
) {
735 log_framework(LOG_DEBUG
,
736 "%s: instance has no '%s/exec' method property.\n",
737 inst
->ri_i
.i_fmri
, mname
);
738 log_instance(inst
, B_TRUE
, "Method property '%s/exec "
739 "is not present.", mname
);
740 scf_snapshot_destroy(snap
);
743 log_error(LOG_WARNING
,
744 "%s: instance libscf_get_method failed\n",
746 scf_snapshot_destroy(snap
);
751 /* open service contract if stopping a non-transient service */
752 if (type
== METHOD_STOP
&& (!instance_is_transient_style(inst
))) {
753 if (inst
->ri_i
.i_primary_ctid
== 0) {
754 /* service is not running, nothing to stop */
755 log_framework(LOG_DEBUG
, "%s: instance has no primary "
756 "contract, no service to stop.\n",
758 scf_snapshot_destroy(snap
);
761 if ((ctfd
= contract_open(inst
->ri_i
.i_primary_ctid
, "process",
762 "events", O_RDONLY
)) < 0) {
764 log_instance(inst
, B_TRUE
, "Could not open service "
765 "contract %ld. Stop method not run.",
766 inst
->ri_i
.i_primary_ctid
);
771 if (restarter_is_null_method(method
)) {
772 log_framework(LOG_DEBUG
, "%s: null method succeeds\n",
775 log_instance(inst
, B_TRUE
, "Executing %s method (null).",
778 if (type
== METHOD_START
)
779 write_status(inst
, mname
, 0);
783 sig
= restarter_is_kill_method(method
);
786 if (inst
->ri_i
.i_primary_ctid
== 0) {
787 log_error(LOG_ERR
, "%s: :kill with no contract\n",
789 log_instance(inst
, B_TRUE
, "Invalid use of \":kill\" "
790 "as stop method for transient service.");
795 log_framework(LOG_DEBUG
,
796 "%s: :killing contract with signal %d\n",
797 inst
->ri_i
.i_fmri
, sig
);
799 log_instance(inst
, B_TRUE
, "Executing %s method (:kill).",
802 if (contract_kill(inst
->ri_i
.i_primary_ctid
, sig
,
803 inst
->ri_i
.i_fmri
) != 0) {
810 log_framework(LOG_DEBUG
, "%s: forking to run method %s\n",
811 inst
->ri_i
.i_fmri
, method
);
813 m_error
= restarter_get_method_context(RESTARTER_METHOD_CONTEXT_VERSION
,
814 inst
->ri_m_inst
, snap
, mname
, method
, &mcp
);
816 if (m_error
!= NULL
) {
817 log_instance(inst
, B_TRUE
, "%s", m_error
->msg
);
818 restarter_mc_error_destroy(m_error
);
823 r
= method_ready_contract(inst
, type
, restart_on
, cte_mask
);
825 assert(r
== ECANCELED
);
826 assert(inst
->ri_mi_deleted
);
827 restarter_free_method_context(mcp
);
833 * Validate safety of method contexts, to save children work.
835 if (!restarter_rm_libs_loadable())
836 log_framework(LOG_DEBUG
, "%s: method contexts limited "
837 "to root-accessible libraries\n", inst
->ri_i
.i_fmri
);
840 * For wait-style svc, sanity check that method exists to prevent an
843 if (instance_is_wait_style(inst
) && type
== METHOD_START
) {
848 * We need to handle start method strings that have arguments,
849 * such as '/lib/svc/method/console-login %i'.
851 if ((pend
= strchr(method
, ' ')) != NULL
)
854 if (*method
== '/' && stat64(method
, &sbuf
) == -1 &&
856 log_instance(inst
, B_TRUE
, "Missing start method (%s), "
857 "changing state to maintenance.", method
);
858 restarter_free_method_context(mcp
);
867 * If the service is restarting too quickly, send it to
870 if (type
== METHOD_START
) {
871 method_record_start(inst
);
872 if (method_rate_critical(inst
) &&
873 !instance_is_wait_style(inst
)) {
874 log_instance(inst
, B_TRUE
, "Restarting too quickly, "
875 "changing state to maintenance.");
877 restarter_free_method_context(mcp
);
882 atomic_add_16(&storing_contract
, 1);
883 pid
= startd_fork1(&forkerr
);
885 exec_method(inst
, type
, method
, mcp
, need_session
);
888 atomic_add_16(&storing_contract
, -1);
889 if (forkerr
== EAGAIN
)
894 log_error(LOG_WARNING
,
895 "%s: Couldn't fork to execute method %s: %s\n",
896 inst
->ri_i
.i_fmri
, method
, strerror(forkerr
));
898 restarter_free_method_context(mcp
);
904 * Get the contract id, decide whether it is primary or transient, and
905 * stash it in inst & the repository.
907 method_store_contract(inst
, type
, &ctid
);
908 atomic_add_16(&storing_contract
, -1);
910 restarter_free_method_context(mcp
);
913 * Similarly for the start method PID.
915 if (type
== METHOD_START
&& !inst
->ri_mi_deleted
)
916 (void) libscf_write_start_pid(inst
->ri_m_inst
, pid
);
918 if (instance_is_wait_style(inst
) && type
== METHOD_START
) {
919 /* Wait style instances don't get timeouts on start methods. */
920 if (wait_register(pid
, inst
->ri_i
.i_fmri
, 1, 0)) {
921 log_error(LOG_WARNING
,
922 "%s: couldn't register %ld for wait\n",
923 inst
->ri_i
.i_fmri
, pid
);
927 write_status(inst
, mname
, 0);
935 * Because on upgrade/live-upgrade we may have no chance
936 * to override faulty timeout values on the way to
937 * manifest import, all services on the path to manifest
938 * import are treated the same as INFINITE timeout services.
941 start_time
= time(NULL
);
942 if (timeout
!= METHOD_TIMEOUT_INFINITE
&& !is_timeout_ovr(inst
))
943 timeout_insert(inst
, ctid
, timeout
);
945 timeout
= METHOD_TIMEOUT_INFINITE
;
947 /* Unlock the instance while waiting for the method. */
948 MUTEX_UNLOCK(&inst
->ri_lock
);
951 r
= waitpid(pid
, &ret_status
, NULL
);
952 } while (r
== -1 && errno
== EINTR
);
956 /* Re-grab the lock. */
957 inst
= inst_lookup_by_id(id
);
960 * inst can't be removed, as the removal thread waits
961 * for completion of this one.
963 assert(inst
!= NULL
);
966 if (inst
->ri_timeout
!= NULL
&& inst
->ri_timeout
->te_fired
)
969 timeout_remove(inst
, ctid
);
971 log_framework(LOG_DEBUG
,
972 "%s method for %s exited with status %d.\n", mname
,
973 inst
->ri_i
.i_fmri
, WEXITSTATUS(ret_status
));
976 log_error(LOG_WARNING
,
977 "Couldn't waitpid() for %s method of %s (%s).\n",
978 mname
, inst
->ri_i
.i_fmri
, strerror(err
));
983 if (type
== METHOD_START
)
984 write_status(inst
, mname
, ret_status
);
986 /* return ERANGE if this service doesn't retry on timeout */
987 if (timeout_fired
== 1 && timeout_retry
== 0) {
992 if (!WIFEXITED(ret_status
)) {
994 * If method didn't exit itself (it was killed by an
995 * external entity, etc.), consider the entire
996 * method_run as failed.
998 if (WIFSIGNALED(ret_status
)) {
999 char buf
[SIG2STR_MAX
];
1000 (void) sig2str(WTERMSIG(ret_status
), buf
);
1002 log_error(LOG_WARNING
, "%s: Method \"%s\" "
1003 "failed due to signal %s.\n",
1004 inst
->ri_i
.i_fmri
, method
, buf
);
1005 log_instance(inst
, B_TRUE
, "Method \"%s\" "
1006 "failed due to signal %s.", mname
, buf
);
1008 log_error(LOG_WARNING
, "%s: Method \"%s\" "
1009 "failed with exit status %d.\n",
1010 inst
->ri_i
.i_fmri
, method
,
1011 WEXITSTATUS(ret_status
));
1012 log_instance(inst
, B_TRUE
, "Method \"%s\" "
1013 "failed with exit status %d.", mname
,
1014 WEXITSTATUS(ret_status
));
1020 *exit_code
= WEXITSTATUS(ret_status
);
1021 if (*exit_code
!= 0) {
1022 log_error(LOG_WARNING
,
1023 "%s: Method \"%s\" failed with exit status %d.\n",
1024 inst
->ri_i
.i_fmri
, method
, WEXITSTATUS(ret_status
));
1027 log_instance(inst
, B_TRUE
, "Method \"%s\" exited with status "
1028 "%d.", mname
, *exit_code
);
1030 if (*exit_code
!= 0)
1033 end_time
= time(NULL
);
1035 /* Give service contract remaining seconds to empty */
1036 if (timeout
!= METHOD_TIMEOUT_INFINITE
)
1037 timeout
-= (end_time
- start_time
);
1042 * For stop methods, assure that the service contract has emptied
1045 if (type
== METHOD_STOP
&& (!instance_is_transient_style(inst
)) &&
1046 !(contract_is_empty(inst
->ri_i
.i_primary_ctid
))) {
1049 if (timeout
!= METHOD_TIMEOUT_INFINITE
)
1050 timeout_insert(inst
, inst
->ri_i
.i_primary_ctid
,
1055 * Check frequently at first, then back off. This
1056 * keeps startd from idling while shutting down.
1059 (void) poll(NULL
, 0, 5);
1062 (void) poll(NULL
, 0, 100);
1064 if (contract_is_empty(inst
->ri_i
.i_primary_ctid
))
1068 if (timeout
!= METHOD_TIMEOUT_INFINITE
)
1069 if (inst
->ri_timeout
->te_fired
)
1072 timeout_remove(inst
, inst
->ri_i
.i_primary_ctid
);
1076 /* Abandon contracts for transient methods & methods that fail. */
1077 transient
= method_is_transient(inst
, type
);
1078 if ((transient
|| *exit_code
!= 0 || result
!= 0) &&
1079 (restarter_is_kill_method(method
) < 0))
1080 method_remove_contract(inst
, !transient
, B_TRUE
);
1085 scf_snapshot_destroy(snap
);
1091 * The method thread executes a service method to effect a state transition.
1092 * The next_state of info->sf_id should be non-_NONE on entrance, and it will
1093 * be _NONE on exit (state will either be what next_state was (on success), or
1094 * it will be _MAINT (on error)).
1096 * There are six classes of methods to consider: start & other (stop, refresh)
1097 * for each of "normal" services, wait services, and transient services. For
1098 * each, the method must be fetched from the repository & executed. fork()ed
1099 * methods must be waited on, except for the start method of wait services
1100 * (which must be registered with the wait subsystem via wait_register()). If
1101 * the method succeeded (returned 0), then for start methods its contract
1102 * should be recorded as the primary contract for the service. For other
1103 * methods, it should be abandoned. If the method fails, then depending on
1104 * the failure, either the method should be reexecuted or the service should
1105 * be put into maintenance. Either way the contract should be abandoned.
1108 method_thread(void *arg
)
1110 fork_info_t
*info
= arg
;
1111 restarter_inst_t
*inst
;
1112 scf_handle_t
*local_handle
;
1113 scf_instance_t
*s_inst
= NULL
;
1115 boolean_t retryable
;
1116 restarter_str_t reason
;
1118 assert(0 <= info
->sf_method_type
&& info
->sf_method_type
<= 2);
1120 /* Get (and lock) the restarter_inst_t. */
1121 inst
= inst_lookup_by_id(info
->sf_id
);
1123 assert(inst
->ri_method_thread
!= 0);
1124 assert(instance_in_transition(inst
) == 1);
1127 * We cannot leave this function with inst in transition, because
1128 * protocol.c withholds messages for inst otherwise.
1131 log_framework(LOG_DEBUG
, "method_thread() running %s method for %s.\n",
1132 method_names
[info
->sf_method_type
], inst
->ri_i
.i_fmri
);
1134 local_handle
= libscf_handle_create_bound_loop();
1137 /* get scf_instance_t */
1138 switch (r
= libscf_fmri_get_instance(local_handle
, inst
->ri_i
.i_fmri
,
1144 libscf_handle_rebind(local_handle
);
1149 * It's not there, but we need to call this so protocol.c
1150 * doesn't think it's in transition anymore.
1152 (void) restarter_instance_update_states(local_handle
, inst
,
1153 inst
->ri_i
.i_state
, RESTARTER_STATE_NONE
, RERR_NONE
,
1154 restarter_str_none
);
1160 bad_error("libscf_fmri_get_instance", r
);
1163 inst
->ri_m_inst
= s_inst
;
1164 inst
->ri_mi_deleted
= B_FALSE
;
1167 if (info
->sf_method_type
== METHOD_START
)
1168 log_transition(inst
, START_REQUESTED
);
1170 r
= method_run(&inst
, info
->sf_method_type
, &exit_code
);
1172 if (r
== 0 && exit_code
== 0) {
1174 assert(inst
->ri_i
.i_next_state
!= RESTARTER_STATE_NONE
);
1177 * When a stop method succeeds, remove the primary contract of
1178 * the service, unless we're going to offline, in which case
1179 * retain the contract so we can transfer inherited contracts to
1180 * the replacement service.
1183 if (info
->sf_method_type
== METHOD_STOP
&&
1184 inst
->ri_i
.i_primary_ctid
!= 0) {
1185 if (inst
->ri_i
.i_next_state
== RESTARTER_STATE_OFFLINE
)
1186 inst
->ri_i
.i_primary_ctid_stopped
= 1;
1188 method_remove_contract(inst
, B_TRUE
, B_TRUE
);
1191 * We don't care whether the handle was rebound because this is
1192 * the last thing we do with it.
1194 (void) restarter_instance_update_states(local_handle
, inst
,
1195 inst
->ri_i
.i_next_state
, RESTARTER_STATE_NONE
,
1196 info
->sf_event_type
, info
->sf_reason
);
1198 (void) update_fault_count(inst
, FAULT_COUNT_RESET
);
1203 /* Failure. Retry or go to maintenance. */
1205 if (r
!= 0 && r
!= EAGAIN
) {
1206 retryable
= B_FALSE
;
1208 switch (exit_code
) {
1209 case SMF_EXIT_ERR_CONFIG
:
1210 case SMF_EXIT_ERR_NOSMF
:
1211 case SMF_EXIT_ERR_PERM
:
1212 case SMF_EXIT_ERR_FATAL
:
1213 retryable
= B_FALSE
;
1221 if (retryable
&& update_fault_count(inst
, FAULT_COUNT_INCR
) != 1)
1226 log_transition(inst
, START_FAILED_REPEATEDLY
);
1227 else if (r
== ERANGE
)
1228 log_transition(inst
, START_FAILED_TIMEOUT_FATAL
);
1229 else if (exit_code
== SMF_EXIT_ERR_CONFIG
)
1230 log_transition(inst
, START_FAILED_CONFIGURATION
);
1231 else if (exit_code
== SMF_EXIT_ERR_FATAL
)
1232 log_transition(inst
, START_FAILED_FATAL
);
1234 log_transition(inst
, START_FAILED_OTHER
);
1237 reason
= restarter_str_restarting_too_quickly
;
1238 } else if (retryable
) {
1239 reason
= restarter_str_fault_threshold_reached
;
1241 reason
= restarter_str_method_failed
;
1244 (void) restarter_instance_update_states(local_handle
, inst
,
1245 RESTARTER_STATE_MAINT
, RESTARTER_STATE_NONE
, RERR_FAULT
,
1248 if (!method_is_transient(inst
, info
->sf_method_type
) &&
1249 inst
->ri_i
.i_primary_ctid
!= 0)
1250 method_remove_contract(inst
, B_TRUE
, B_TRUE
);
1253 inst
->ri_method_thread
= 0;
1256 * Unlock the mutex after broadcasting to avoid a race condition
1257 * with restarter_delete_inst() when the 'inst' structure is freed.
1259 (void) pthread_cond_broadcast(&inst
->ri_method_cv
);
1260 MUTEX_UNLOCK(&inst
->ri_lock
);
1262 scf_instance_destroy(s_inst
);
1263 scf_handle_destroy(local_handle
);
1264 startd_free(info
, sizeof (fork_info_t
));