/*
  Copyright Red Hat, Inc. 2002-2003

  This program is free software; you can redistribute it and/or modify it
  under the terms of the GNU General Public License as published by the
  Free Software Foundation; either version 2, or (at your option) any
  later version.

  This program is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program; see the file COPYING.  If not, write to the
  Free Software Foundation, Inc., 675 Mass Ave, Cambridge,
  MA 02139, USA.
 */
/*
 * Service Manager for RHCM.  This is the 1.0.x service manager with
 * extras to make it multi-node capable.
 *
 * Author: Brian Stevens (bstevens at redhat.com)
 *         Lon Hohberger (lhh at redhat.com)
 */
/*static const char *version __attribute__ ((unused)) = "$Revision: 1.71 $";*/
#include <sys/types.h>
#include <sys/param.h>
#include <sys/syslog.h>
#include <linux/reboot.h>
#include <sys/reboot.h>

#include <clusterdefs.h>
#include <clushared.h>
#include <sharedstate.h>
#include <namespace.h>
#define reboot(arg) {\
	clulog(LOG_EMERG, "reboot(%s) @ %s:%d\n", #arg, __FILE__, __LINE__); \

	clulog(LOG_DEBUG, "Service %d failed @ %s:%d\n",\
	       x, __FILE__, __LINE__); \
#define HEARTBEAT_INTERVAL	60
#define CHECK_INTERVAL		5
#define MSG_TIMEOUT		10

#define SVCF_START_DISABLED	1
#define SVCF_PENDING		2
#define SVCF_RESTART		4
#define SVCF_CLEAR_FAILURES	8
#define SVCF_RESTARTFAILED	16
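
/*
 * The SVCF_* values above are bit flags; callers OR them together when
 * invoking the service operations below (for example, handle_svc_request()
 * uses SVCF_RESTART|SVCF_RESTARTFAILED when restarting a locally failed
 * service, and SVCF_START_DISABLED for user-requested starts of disabled
 * services).
 */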
static int myNodeState;
static int services_locked = 0;
static int alias_owner = -1;
static char *myNodeName = NULL;
static int ticks[MAX_SERVICES];
static memb_mask_t membership, mask_panic;
static int sighup_received = 0, sigterm_received = 0;

static struct child_sm svc_children[MAX_SERVICES];
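
/*
 * Each svc_children[] slot tracks the child process forked to carry out a
 * request on that service: cs_pid holds the child's PID (0 when no child
 * is active) and cs_rq records the action it is performing.  See
 * handle_svc_request(), cleanup_child() and reap_zombies().
 */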
/*
 * from clusvcmgrd_cfg.c:
 */
int check_config_file(void);
int check_config_data(void);
int rebuild_config_lockless(void);
int rebuild_config(void);
int handle_config_update(memb_mask_t mask, int my_node_id);
int boot_config_init(void);
/*
 * Service action string table
 */
static char *serviceActionStrings[] = {
	SVC_STATUS_INQUIRY_STR,
	SVC_START_PENDING_STR,
	SVC_START_RELOCATE_STR,
	"reconfigure", /* XXX */
extern void daemon_init(char *);

static int svc_stop(int, int);
static int svc_stop_unclean(int);
static int _svc_fail(int svcID);
static int clu_alias(int);
static int request_failback(int);
static int failback(int);
static int relocate_service(int svcID, int request, int target);
static void handle_svc_request(int, int, int, msg_handle_t);

int svc_report_failure(int svcID);
int setServiceStatus(ServiceBlock *svcblk);
int getServiceStatus(int svcNum, ServiceBlock *svcblk);
int removeService(int svcNum);
/**
 * Block the given signal.
 *
 * @param sig		Signal to block.
 * @return		See man sigprocmask.
 */
block_signal(int sig)
{
	sigaddset(&set, sig);

	return(sigprocmask(SIG_BLOCK, &set, NULL));
}
/**
 * Unblock the given signal.
 *
 * @param sig		Signal to unblock.
 * @return		See man sigprocmask.
 */
unblock_signal(int sig)
{
	sigaddset(&set, sig);

	return(sigprocmask(SIG_UNBLOCK, &set, NULL));
}
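
/*
 * These two helpers bracket critical sections elsewhere in this file: for
 * example, the forked service-action child blocks SIGTERM and SIGHUP before
 * running a request (see handle_svc_request()), and sigterm_handler() blocks
 * both signals before setting sigterm_received.
 */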
	if (CFG_Get((char *) "cluster%logfacility", NULL, &p) == CFG_OK) {
/**
 * Send a SVC_FAILBACK request to the given partner member.
 *
 * @param partner		Partner we are sending request to.
 * @return			FAIL or SUCCESS
 */
static int
request_failback(int partner)
{
	msg_handle_t fd_failback;
	SmMessageSt msg_failback;

	if (partner == myNodeID)

	/*
	 * Fork here to avoid deadlock.
	 */

	msg_failback.sm_hdr.gh_magic = GENERIC_HDR_MAGIC;
	msg_failback.sm_hdr.gh_command = SVC_ACTION_REQUEST;
	msg_failback.sm_hdr.gh_length = sizeof (SmMessageSt);
	msg_failback.sm_data.d_action = SVC_FAILBACK;
	msg_failback.sm_data.d_svcOwner = myNodeID;
	msg_failback.sm_data.d_ret = 0;

	if ((fd_failback = msg_open(PROCID_CLUSVCMGRD, partner)) < 0) {
		clulog(LOG_DEBUG, "Failed opening connection to svcmgrd\n");
		return FAIL;
	}

	swab_SmMessageSt(&msg_failback);

	if (msg_send(fd_failback, &msg_failback, sizeof (SmMessageSt)) !=
	    sizeof (SmMessageSt)) {
		msg_close(fd_failback);
		clulog(LOG_ERR, "Error sending failback request.\n");
		return FAIL;
	}

	msg_close(fd_failback);
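
/*
 * The partner member handles this request in failback() below:
 * dispatch_msg() receives the SVC_ACTION_REQUEST, sees that d_action is
 * SVC_FAILBACK, and calls failback() with the requesting member's ID
 * (carried in d_svcOwner).
 */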
/**
 * Handle SVC_FAILBACK from a given node.  This shuts down services which
 * should be running on 'target' instead of 'myNodeID'.  Takes into account
 * service failover domain and preferred node ordering.  Services without
 * a failover domain will never be sent to the requesting node.
 *
 * @param target		Requestor which sent us SVC_FAILBACK.
 * @see request_failback
 */
static int
failback(int target)
{
	for (svcID = 0; svcID < MAX_SERVICES; svcID++) {
		if (serviceExists(svcID) != YES)
			continue;

		if (!svc_has_domain(svcID))
			continue;

		/*
		 * If the service has a failover domain, and the requestor
		 * should run it and I shouldn't, then I will give the
		 * service up.
		 *
		 * This relies on handle_svc_request to determine the
		 * state of the service.
		 */
		if (node_should_start(myNodeID, membership, svcID) <
		    node_should_start(target, membership, svcID))
			handle_svc_request(svcID, SVC_RELOCATE, target, -1);
	}
/**
 * See if a child process operating on a specified service has exited.
 *
 * @param svcID		Service ID's child we're checking out.
 * @return		0 indicates that no child has exited.  1 indicates
 *			that the child for the service has, indeed, been
 *			cleaned up.
 */
cleanup_child(int svcID)
{
	/* Obvious check: is there even a child for this service? */
	if (!svc_children[svcID].cs_pid)

	if (waitpid(svc_children[svcID].cs_pid, NULL, WNOHANG) != -1)

	svc_children[svcID].cs_pid = 0;
/**
 * Clean up children.  This is our SIGCHLD handler.
 */
reap_zombies(int __attribute__ ((unused)) sig)
{
	while ((pid = waitpid(-1, &status, WNOHANG)) != 0) {
			break;	/* No children */

		/*clulog(LOG_DEBUG, "waitpid reaped %d\n", pid);*/
		for (svcID = 0; svcID < MAX_SERVICES; svcID++) {
			if (pid == svc_children[svcID].cs_pid) {
				svc_children[svcID].cs_pid = 0;

	//return (nchildren);
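
/*
 * Clearing cs_pid in the handler above is what lets handle_svc_request()
 * and cleanup_child() notice that the child acting on a service has
 * finished.
 */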
350 * Clean up services and exit.
352 * @param status Return value passed up to parent process.
353 * @param clean This is set to '1' when we're cleanly shutting down
354 * and we have quorum. Without quorum or during an
355 * unclean shutdown, this is 0.
356 * @return If it returns, that's BAD
359 svcmgr_exit(int status
, int clean
)
367 for (svcID
= 0; svcID
< MAX_SERVICES
; svcID
++) {
369 if (serviceExists(svcID
) != YES
)
372 /* Wait for child process acting on this service to exit */
373 while (!cleanup_child(svcID
))
377 switch(svc_stop(svcID
, 0)) {
380 svc_report_failure(svcID
);
383 /* Lock failure during shutdown == switch to
385 clulog(LOG_ERR
, "Failed to acquire cluster lock "
386 "during shutdown\n");
397 svc_stop_unclean(svcID
);
401 * Tell the quorum daemon that we are leaving
403 clulog(LOG_DEBUG
, "Sending message to quorumd that we are exiting\n");
405 if ((fd
= msg_open(PROCID_CLUQUORUMD
, myNodeID
)) < 0) {
406 clulog(LOG_ERR
, "msg_open failed to quorum daemon\n");
410 if (msg_send_simple(fd
, QUORUM_EXIT
, status
, 0) == -1) {
411 clulog(LOG_ERR
, "Failed sending exit message to cluquorumd\n");
417 clulog(LOG_INFO
, "Exiting\n");
/*
 * NOTE: If someone kills the service manager during start, it's possible to
 * have a service script still running the stop phase.  This is OKAY!
 * This is our SIGTERM handler.
 */
sigterm_handler(void)
{
	block_signal(SIGHUP);
	block_signal(SIGTERM);
	sigterm_received = 1;
}
/**
 * Retrieve our log level from the cluster database and set it accordingly.
 */
	if (getSvcMgrLogLevel(&level) == FAIL) {
		clulog(LOG_ERR,
		       "Failed getting log level from config database\n");
	}

	if (clu_set_loglevel(level) == -1) {
		clulog(LOG_ERR, "Failed setting log level\n");
	}
/**
 * Notify the local daemons that the on-disk configuration has changed, and
 * so needs to be reread.
 */
notify_everybody(void)
{
	/*
	 * Notify local daemons of the configuration update...
	 */
	killall("clumembd", SIGHUP);
	killall("cluquorumd", SIGHUP);
	killall("clulockd", SIGHUP);
	killall("clurmtabd", SIGHUP);
476 * Handle an updated configuration. This is called after we receive a SIGHUP.
478 * @see sighup_handler
483 int really_updated
= 1;
485 block_signal(SIGHUP
);
486 /* XXX check for return code?? */
487 /* We reload the msg service stuff inside handle_config_update */
488 really_updated
= handle_config_update(membership
, myNodeID
);
490 if (really_updated
== 0) {
496 * If we fail to update, the other service managers will reread
497 * the shared config in a few seconds anyway.
499 unblock_signal(SIGHUP
);
504 * When we receive SIGHUP, we set the global flag. We soon after call
510 sighup_handler(int __attribute__ ((unused
)) sig
)
517 * Run the service script for a given service. The service scripts handle
518 * the real meat of starting/stopping services.
520 * @param action The action to take (ie, start/stop/status)
521 * @param svcID The service ID we intend to take 'action' on.
522 * @param block Set to 0 if the service script should run in the
523 * background, 1 if we should wait for it to complete
525 * @param ret The return code of the service script.
526 * @return SUCCESS or FAIL.
529 exec_service_script(char *action
, int svcID
, int block
, int *ret
)
536 struct sched_param param
;
538 getSvcName(svcID
, &svcName
);
540 clulog(LOG_DEBUG
, "Exec of script %s, action %s, service %s\n",
541 SVC_ACTION_SCRIPT
, action
, svcName
);
545 clulog(LOG_ERR
, "fork failed: %s", strerror(errno
));
555 pid
= waitpid(pid
, &local_ret
, 0);
568 "Exec of script for service %s returned %d\n",
577 * we need to set the sched_priority back to normal in case clusvcmgrd
578 * is running in a different prio b/c cluquorumd%rtp is set
580 if (sched_getscheduler(0) != SCHED_OTHER
) {
581 memset(¶m
,0,sizeof(param
));
582 param
.sched_priority
= 0;
583 if (sched_setscheduler(0, SCHED_OTHER
, (void *)¶m
) != 0)
584 clulog(LOG_WARNING
, "Setting child to normal priority "
585 "failed: %s\n", strerror(errno
));
587 clulog(LOG_DEBUG
, "Using normal priority\n");
590 /* lhh - Unblock signals so the user script doesn't break */
592 if (sigprocmask(SIG_UNBLOCK
, &set
, NULL
) != 0) {
593 clulog(LOG_WARNING
, "Failed to unblock signals: %s\n",
597 snprintf(svcIDstr
, sizeof (svcIDstr
), "%d", svcID
);
599 execl(SVC_ACTION_SCRIPT
, SVC_ACTION_SCRIPT
, action
, svcIDstr
, NULL
);
601 clulog(LOG_DEBUG
, "Exec failed of %s, action %s, service %s, err %s\n",
602 SVC_ACTION_SCRIPT
, action
, svcName
, strerror(errno
));
616 if (alias_owner
== myNodeID
)
618 alias_owner
= myNodeID
;
619 clulog(LOG_DEBUG
, "Start cluster alias request\n");
621 if (alias_owner
!= myNodeID
)
624 clulog(LOG_DEBUG
, "Stop cluster alias request\n");
629 clulog(LOG_ERR
, "fork failed: %s", strerror(errno
));
634 pid
= waitpid(pid
, &local_ret
, 0);
635 if ((pid
< 0) && (errno
== EINTR
))
639 clulog(LOG_DEBUG
, "Exec of alias script returned %d\n",
644 block_signal(SIGTERM
);
645 block_signal(SIGHUP
);
649 execl(CLU_ALIAS_SCRIPT
, CLU_ALIAS_SCRIPT
, "start", NULL
);
652 execl(CLU_ALIAS_SCRIPT
, CLU_ALIAS_SCRIPT
, "stop", NULL
);
654 clulog(LOG_DEBUG
, "Exec failed of %s, err %s\n", CLU_ALIAS_SCRIPT
,
663 * Initialize an on-disk service block.
665 * @param svcID Service ID whose block we need to update.
666 * @return FAIL or SUCCESS.
671 ServiceBlock svcStatus
;
673 clulog(LOG_DEBUG
, "Initializing service #%d\n", svcID
);
676 * Make sure the service does not exist
679 if (clu_svc_lock(svcID
) == -1) {
680 clulog(LOG_ERR
, "Unable to obtain cluster lock: %s\n",
685 if (getServiceStatus(svcID
, &svcStatus
) == SUCCESS
) {
687 "Service #%d already exists!\n",
689 clu_svc_unlock(svcID
);
693 svcStatus
.sb_id
= svcID
;
694 svcStatus
.sb_owner
= NODE_ID_NONE
;
695 svcStatus
.sb_last_owner
= NODE_ID_NONE
;
696 svcStatus
.sb_state
= SVC_DISABLED
;
697 svcStatus
.sb_transition
= (uint64_t)time(NULL
);
698 svcStatus
.sb_restarts
= 0;
700 if (setServiceStatus(&svcStatus
) != SUCCESS
) {
701 (void) removeService(svcID
);
702 clu_svc_unlock(svcID
);
706 clu_svc_unlock(svcID
);
712 * Set an on-disk service block's state to UNINITIALIZED.
714 * @param svcID Service ID whose block we need to update.
715 * @return FAIL or SUCCESS.
718 svc_remove(int svcID
)
720 clulog(LOG_DEBUG
, "Removing service #%d from database\n", svcID
);
722 if (clu_svc_lock(svcID
) == -1) {
723 clulog(LOG_ERR
, "Unable to obtain cluster lock: %s\n",
728 if (removeService(svcID
) != 0) {
729 clulog(LOG_ERR
, "Failed removing service %d from database\n",
731 clu_svc_unlock(svcID
);
735 clu_svc_unlock(svcID
);
741 * Advise service manager as to whether or not to start a service, given
742 * that we already know it's legal to run the service.
744 * @param svcStatus Current service status.
745 * @param svcName Service name
746 * @param flags Specify whether or not it's legal to start a
747 * disabled service, etc.
748 * @return 0 = DO NOT start service, return FAIL
749 * 1 = START service - return whatever it returns.
750 * 2 = DO NOT start service, return SUCCESS
753 svc_advise_start(ServiceBlock
*svcStatus
, char *svcName
, int flags
)
757 switch(svcStatus
->sb_state
) {
759 clulog(LOG_ERR
, "Service %s has failed on all applicable "
760 "members; can not start.\n", svcName
);
765 getNodeName(svcStatus
->sb_owner
, &nodeName
);
766 if ((svcStatus
->sb_owner
== myNodeID
) ||
767 (memb_online(membership
, svcStatus
->sb_owner
)==1) ||
768 (memb_online(mask_panic
, svcStatus
->sb_owner
)==1)) {
770 * Service is running and the owner is online!
773 "Service is running on member %s.\n",
779 * Service is running but owner is down -> FAILOVER
782 "Taking over service %s from down member %s\n",
788 * Starting failed service...
790 if (flags
& SVCF_PENDING
) {
791 clulog(LOG_NOTICE
, "Starting failed service %s\n",
793 svcStatus
->sb_state
= SVC_STOPPED
;
798 /* Don't start, but return success. */
800 "Not starting %s: pending/transitional state\n",
805 clulog(LOG_NOTICE
, "Starting stopped service %s\n", svcName
);
809 case SVC_UNINITIALIZED
:
810 if (flags
& SVCF_START_DISABLED
) {
811 clulog(LOG_NOTICE
, "Starting disabled service %s\n",
816 clulog(LOG_DEBUG
, "Not starting disabled service %s\n",
822 "Cannot start service %s: Invalid State %d\n",
823 svcName
, svcStatus
->sb_state
);
830 * Start a cluster service.
832 * @param svcID Service ID to start.
833 * @param flags Service-operation specific flags to take into account.
834 * @see svc_advise_start
835 * @return FAIL, SUCCESS
838 svc_start(int svcID
, int flags
)
841 ServiceBlock svcStatus
;
844 getSvcName(svcID
, &svcName
);
845 clulog(LOG_DEBUG
, "Handling start request for service %s\n", svcName
);
847 if (clu_svc_lock(svcID
) == -1) {
848 clulog(LOG_ERR
, "Unable to obtain cluster lock: %s\n",
853 if (getServiceStatus(svcID
, &svcStatus
) != SUCCESS
) {
854 clu_svc_unlock(svcID
);
855 clulog(LOG_ERR
, "Failed getting status for service %s\n",
861 switch (svc_advise_start(&svcStatus
, svcName
, flags
)) {
862 case 0: /* Don't start service, return FAIL */
863 clu_svc_unlock(svcID
);
865 case 1: /* Start service. */
867 case 2: /* Don't start service, return SUCCESS */
868 clu_svc_unlock(svcID
);
875 /* LOCK HELD if we get here */
877 if (flags
& SVCF_CLEAR_FAILURES
)
878 memset(svcStatus
.sb_failed_mask
, 0, sizeof(memb_mask_t
));
881 svcStatus
.sb_owner
= myNodeID
;
882 svcStatus
.sb_state
= SVC_STARTED
;
883 svcStatus
.sb_transition
= (uint64_t)time(NULL
);
884 svcStatus
.sb_checks
= (uint16_t)0;
886 if (flags
& (SVCF_START_DISABLED
|SVCF_PENDING
))
887 svcStatus
.sb_false_starts
= (uint16_t)0;
889 if (flags
& SVCF_RESTARTFAILED
)
890 svcStatus
.sb_restarts
++;
892 svcStatus
.sb_restarts
= 0;
894 if (setServiceStatus(&svcStatus
) != SUCCESS
) {
895 clulog(LOG_ERR
, "Failed changing service status\n");
896 clu_svc_unlock(svcID
);
	clu_svc_unlock(svcID);

	if ((exec_service_script(SVC_START_STR, svcID, 1, &ret) != SUCCESS) ||
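
/**
 * Set a service's state in the on-disk service block.  Used by svc_stop()
 * and svc_disable(): locks the service block, optionally records the
 * current owner in sb_last_owner and clears sb_owner, then writes the new
 * state and transition time back to shared state.
 *
 * @param svcName		Service name (used in log messages).
 * @param svcID			Service ID to update.
 * @param state			New value for sb_state.
 * @param last_owner_flip	Nonzero to copy sb_owner into sb_last_owner
 *				and set sb_owner to NODE_ID_NONE.
 */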
flip_state(char *svcName, int svcID, int state, int last_owner_flip)
{
	ServiceBlock svcStatus;

	if (clu_svc_lock(svcID) == -1) {
		clulog(LOG_ERR, "Unable to obtain cluster lock: %s\n",
		       strerror(errno));
		return FAIL;
	}

	if (getServiceStatus(svcID, &svcStatus) != SUCCESS) {
		clu_svc_unlock(svcID);
		clulog(LOG_ERR, "Failed getting status for service %s\n",
		       svcName);
		return FAIL;
	}

	if (last_owner_flip) {
		svcStatus.sb_last_owner = svcStatus.sb_owner;
		svcStatus.sb_owner = NODE_ID_NONE;
	}

	svcStatus.sb_state = state;
	svcStatus.sb_transition = (uint64_t)time(NULL);
	if (setServiceStatus(&svcStatus) != SUCCESS) {
		clu_svc_unlock(svcID);
		clulog(LOG_ERR, "Failed changing service status\n");
		return FAIL;
	}

	clu_svc_unlock(svcID);
947 * Stop a cluster service.
949 * @param svcID Service ID to stop.
950 * @param flags Service-operation specific flags to take into account.
951 * @see svc_advise_start
952 * @return FAIL, SUCCESS
955 svc_stop(int svcID
, int flags
)
957 ServiceBlock svcStatus
;
961 getSvcName(svcID
, &svcName
);
962 clulog(LOG_DEBUG
, "Handling stop request for service %s\n", svcName
);
964 if (clu_svc_lock(svcID
) == -1) {
965 clulog(LOG_ERR
, "Unable to obtain cluster lock: %s\n",
970 if (getServiceStatus(svcID
, &svcStatus
) != SUCCESS
) {
971 clu_svc_unlock(svcID
);
972 clulog(LOG_ERR
, "Failed getting status for service %s\n",
977 if (((svcStatus
.sb_state
!= SVC_STARTED
) ||
978 (svcStatus
.sb_owner
!= myNodeID
))
979 && (svcStatus
.sb_state
!= SVC_PENDING
)) {
980 clu_svc_unlock(svcID
);
981 clulog(LOG_DEBUG
, "Unable to stop service %s in %s state\n",
982 svcName
, serviceStateStrings
[svcStatus
.sb_state
]);
986 svcStatus
.sb_state
= SVC_STOPPING
;
987 svcStatus
.sb_transition
= (uint64_t)time(NULL
);
988 if (setServiceStatus(&svcStatus
) != SUCCESS
) {
989 clu_svc_unlock(svcID
);
990 clulog(LOG_ERR
, "Failed changing service status\n");
993 clu_svc_unlock(svcID
);
995 if ((exec_service_script(SVC_STOP_STR
, svcID
, 1, &ret
) != SUCCESS
) ||
1000 if (flags
& SVCF_PENDING
)
1005 flip_state(svcName
, svcID
, ret
, 1);
1012 * Stop a cluster service - without updating the on-disk-block.
1014 * @param svcID Service ID to stop.
1015 * @return FAIL, SUCCESS
1018 svc_stop_unclean(int svcID
)
1026 if (svc_children
[svcID
].cs_pid
) {
1027 kill(svc_children
[svcID
].cs_pid
, SIGKILL
);
1030 if ((waitpid(svc_children
[svcID
].cs_pid
, NULL
, 0)==-1)
1031 && (errno
== EINTR
))
1036 getSvcName(svcID
, &svcName
);
1037 clulog(LOG_WARNING
, "Forcing stop of service %s\n", svcName
);
1039 if ((exec_service_script(SVC_STOP_STR
, svcID
, 1, &ret
) != SUCCESS
) ||
1042 "Failed to stop service %s uncleanly - REBOOTING\n",
1045 REBOOT(RB_AUTOBOOT
);
1052 * Disable a cluster service. Services in the disabled state are never
1053 * automatically started by the service manager - one must send a SVC_START
1056 * @param svcID Service ID to stop.
1057 * @return FAIL, SUCCESS
1060 svc_disable(int svcID
)
1062 ServiceBlock svcStatus
;
1066 if (clu_svc_lock(svcID
) == -1) {
1067 clulog(LOG_ERR
, "Unable to obtain cluster lock: %s\n",
1072 getSvcName(svcID
, &svcName
);
1073 clulog(LOG_DEBUG
, "Handling disable request for service %s\n", svcName
);
1075 if (getServiceStatus(svcID
, &svcStatus
) != SUCCESS
) {
1076 clu_svc_unlock(svcID
);
1077 clulog(LOG_ERR
, "Failed getting status for service %s\n",
1082 if (svcStatus
.sb_state
== SVC_DISABLED
) {
1083 clu_svc_unlock(svcID
);
1084 clulog(LOG_DEBUG
, "Service %s already disabled\n", svcName
);
1088 if (((svcStatus
.sb_state
== SVC_STOPPING
) &&
1089 (svcStatus
.sb_owner
!= myNodeID
)) &&
1090 (memb_online(membership
, svcStatus
.sb_owner
)==1)) {
1092 "Service %s is in stop-transition on node %d"
1093 ", cannot disable\n",
1098 if (((svcStatus
.sb_state
== SVC_STARTED
) &&
1099 (svcStatus
.sb_owner
!= myNodeID
))
1100 || ((svcStatus
.sb_state
!= SVC_STARTED
)
1101 && (svcStatus
.sb_state
!= SVC_STOPPING
)
1102 && (svcStatus
.sb_state
!= SVC_STOPPED
)
1103 && (svcStatus
.sb_state
!= SVC_PENDING
)
1104 && (svcStatus
.sb_state
!= SVC_FAILED
))) {
1105 clu_svc_unlock(svcID
);
1106 clulog(LOG_DEBUG
, "Unable to disable service %s in %s state\n",
1107 svcName
, serviceStateStrings
[svcStatus
.sb_state
]);
1111 svcStatus
.sb_state
= SVC_STOPPING
;
1112 svcStatus
.sb_transition
= (uint64_t)time(NULL
);
1113 if (setServiceStatus(&svcStatus
) != SUCCESS
) {
1114 clu_svc_unlock(svcID
);
1115 clulog(LOG_ERR
, "Failed changing service status\n");
1118 clu_svc_unlock(svcID
);
1120 if ((exec_service_script(SVC_STOP_STR
, svcID
, 1, &ret
) != SUCCESS
) ||
1125 flip_state(svcName
, svcID
, SVC_DISABLED
, 1);
1132 * Mark a cluster service as failed. User intervention required.
1134 * @param svcID Service ID to stop.
1135 * @return FAIL, SUCCESS
1138 _svc_fail(int svcID
)
1140 ServiceBlock svcStatus
;
1143 if (clu_svc_lock(svcID
) == -1) {
1144 clulog(LOG_ERR
, "Unable to obtain cluster lock: %s\n",
1149 getSvcName(svcID
, &svcName
);
1150 clulog(LOG_DEBUG
, "Handling failure request for service %s\n", svcName
);
1152 if (getServiceStatus(svcID
, &svcStatus
) != SUCCESS
) {
1153 clu_svc_unlock(svcID
);
1154 clulog(LOG_ERR
, "Failed getting status for service %s\n",
1159 if ((svcStatus
.sb_state
== SVC_STARTED
) &&
1160 (svcStatus
.sb_owner
!= myNodeID
)) {
1161 clu_svc_unlock(svcID
);
1162 clulog(LOG_DEBUG
, "Unable to disable service %s in %s state\n",
1163 svcName
, serviceStateStrings
[svcStatus
.sb_state
]);
1168 * Leave a bread crumb so we can debug the problem with the service!
1170 if (svcStatus
.sb_owner
!= NODE_ID_NONE
) {
1171 svcStatus
.sb_last_owner
= svcStatus
.sb_owner
;
1172 svcStatus
.sb_owner
= NODE_ID_NONE
;
1174 svcStatus
.sb_state
= SVC_FAILED
;
1175 svcStatus
.sb_transition
= (uint64_t)time(NULL
);
1176 svcStatus
.sb_restarts
= 0;
1177 if (setServiceStatus(&svcStatus
) != SUCCESS
) {
1178 clu_svc_unlock(svcID
);
1179 clulog(LOG_ERR
, "Failed changing service status\n");
1182 clu_svc_unlock(svcID
);
1189 * Check the status of a given service. This execs the service script
1190 * with the argument 'status', and evaluates the return code.
1192 * @param svcID Service ID to check.
1193 * @return FAIL or SUCCESS.
1196 svc_check(int svcID
)
1198 ServiceBlock svcStatus
;
1199 char *svcName
, *maxrestarts
, *maxfs
;
1200 int script_ret
, ret
;
1202 getSvcName(svcID
, &svcName
);
1203 clulog(LOG_DEBUG
, "Handling check request for service %s\n", svcName
);
1205 if (clu_svc_lock(svcID
) == -1) {
1206 clulog(LOG_ERR
, "Unable to obtain cluster lock: %s\n",
1211 if (getServiceStatus(svcID
, &svcStatus
) != SUCCESS
) {
1212 clu_svc_unlock(svcID
);
1213 clulog(LOG_ERR
, "Failed getting status for service %s\n",
1218 if ((svcStatus
.sb_state
!= SVC_STARTED
) ||
1219 (svcStatus
.sb_owner
!= myNodeID
)) {
1220 clu_svc_unlock(svcID
);
1221 clulog(LOG_DEBUG
, "Unable to check service %s in %s state\n",
1222 svcName
, serviceStateStrings
[svcStatus
.sb_state
]);
1225 clu_svc_unlock(svcID
);
1227 if ((exec_service_script(SVC_CHECK_STR
, svcID
, 1, &ret
) != SUCCESS
) ||
1231 script_ret
= SUCCESS
;
1233 clu_svc_lock(svcID
);
1234 if (getServiceStatus(svcID
, &svcStatus
) != SUCCESS
) {
1235 clu_svc_unlock(svcID
);
1236 clulog(LOG_ERR
, "Failed getting status for service %s\n",
1241 if (script_ret
== FAIL
) {
1244 if (svcStatus
.sb_checks
== 0 &&
1245 (getSvcMaxFalseStarts(svcID
, &maxfs
) == SUCCESS
) &&
1248 /* If we've exceeded false-start count, relocate */
1249 svcStatus
.sb_false_starts
++;
1251 "Service %s false-start detected (%d/%d)\n",
1252 svcName
, svcStatus
.sb_false_starts
, atoi(maxfs
));
1254 if (svcStatus
.sb_false_starts
> atoi(maxfs
)) {
1255 clulog(LOG_ERR
, "Max false starts for service %s"
1256 " exceeded. Relocating\n", svcName
);
1260 /* Update on-disk with new false start info */
1261 setServiceStatus(&svcStatus
);
1264 if (getSvcMaxRestarts(svcID
, &maxrestarts
) == SUCCESS
) {
1265 if (atoi(maxrestarts
) > 0) {
1266 /* We're about to restart. If we would exceed
1267 our restart count, relocate. */
1268 if (svcStatus
.sb_restarts
>=
1269 atoi(maxrestarts
)) {
1270 clulog(LOG_ERR
, "Max restarts for "
1271 "service %s exceeded. "
1272 "Relocating\n", svcName
);
1275 } else if (atoi(maxrestarts
) < 0) {
1276 clulog(LOG_ERR
, "Service %s failed. "
1277 "Relocating\n", svcName
);
1281 } else { /* SUCCESS */
1283 if (!svcStatus
.sb_checks
) {
1284 svcStatus
.sb_checks
= 1;
1285 svcStatus
.sb_false_starts
= 0;
1286 setServiceStatus(&svcStatus
);
1290 clu_svc_unlock(svcID
);
1300 ServiceBlock svcStatus
;
1303 clulog(LOG_INFO
, "Initializing services\n");
1305 for (svcID
= 0; svcID
< MAX_SERVICES
; svcID
++) {
1307 /* This takes a long time... Abort quickly if necessary */
1308 if (sigterm_received
)
1311 if (serviceExists(svcID
) != YES
)
1314 getSvcName(svcID
, &svcName
);
		 * If service is not on the shared service information disk,
		 * or it is running and owned by this node, reinitialize it.
1321 if (clu_svc_lock(svcID
) == 0) {
1322 if ((getServiceStatus(svcID
, &svcStatus
) != SUCCESS
) ||
1323 ((svcStatus
.sb_owner
== myNodeID
) &&
1324 ((svcStatus
.sb_state
== SVC_STARTED
) ||
1325 (svcStatus
.sb_state
== SVC_STOPPING
))) ||
1326 ((svcStatus
.sb_owner
== NODE_ID_NONE
) &&
1327 (svcStatus
.sb_state
== SVC_PENDING
))) {
1328 svcStatus
.sb_id
= svcID
;
1329 svcStatus
.sb_last_owner
= svcStatus
.sb_owner
;
1330 svcStatus
.sb_owner
= NODE_ID_NONE
;
1331 svcStatus
.sb_state
= SVC_STOPPED
;
1332 svcStatus
.sb_transition
= (uint64_t)time(NULL
);
1333 svcStatus
.sb_restarts
= 0;
1334 if (setServiceStatus(&svcStatus
) != SUCCESS
) {
1335 clulog(LOG_ERR
, "Failed setting "
1336 "service status for %s\n",
1340 clu_svc_unlock(svcID
);
1343 "Unable to obtain lock for service %s: %s\n",
1349 * We stop all services to clean up any state in the case
1350 * that this system came down without gracefully stopping
1353 if (exec_service_script(SVC_STOP_STR
, svcID
, 1, NULL
) !=
1356 "Failed stopping service %s during init\n",
1368 * Send a message to the target node to start the service.
1371 relocate_service(int svcID
, int request
, int target
)
1373 SmMessageSt msg_relo
;
1374 int fd_relo
, msg_ret
;
1376 /* Build the message header */
1377 msg_relo
.sm_hdr
.gh_magic
= GENERIC_HDR_MAGIC
;
1378 msg_relo
.sm_hdr
.gh_command
= SVC_ACTION_REQUEST
;
1379 msg_relo
.sm_hdr
.gh_length
= sizeof (SmMessageSt
);
1380 msg_relo
.sm_data
.d_action
= request
;
1381 msg_relo
.sm_data
.d_svcID
= svcID
;
1382 msg_relo
.sm_data
.d_ret
= 0;
1384 /* Open a connection to the other node */
1386 if ((fd_relo
= msg_open(PROCID_CLUSVCMGRD
, target
)) < 0) {
1387 clulog(LOG_ERR
, "Failed opening connection to member #%d\n",
1393 swab_SmMessageSt(&msg_relo
);
1395 /* Send relocate message to the other node */
1396 if (msg_send(fd_relo
, &msg_relo
, sizeof (SmMessageSt
)) !=
1397 sizeof (SmMessageSt
)) {
1399 "Error sending relocate request to member #%d\n",
1405 clulog(LOG_DEBUG
, "Sent relocate request.\n");
1407 /* Check the response */
1408 msg_ret
= msg_receive(fd_relo
, &msg_relo
, sizeof (SmMessageSt
));
1410 if (msg_ret
!= sizeof (SmMessageSt
)) {
1412 * In this case, we don't restart the service, because the
1413 * service state is actually unknown to us at this time.
1415 clulog(LOG_ERR
, "Mangled reply from member #%d during service "
1416 "relocate\n", target
);
1418 return SUCCESS
; /* XXX really UNKNOWN */
1421 /* Got a valid response from other node. */
1425 swab_SmMessageSt(&msg_relo
);
1427 return msg_relo
.sm_data
.d_ret
;
1432 * Advise whether or not we should drop a particular request for a given
1435 * @param svcID Service ID in question.
1436 * @param req Particular request in question.
1437 * @param svcStatus Current service status block.
1438 * @return 1 for TRUE (drop service request), 0 for FALSE (do not
1439 * drop given request)
1442 svc_advise_drop_request(int svcID
, int req
, ServiceBlock
* svcStatus
)
1445 * Drop the request if it's not a DISABLE and not a START_PENDING
1446 * if the service is in the PENDING state (ie, it failed on one node)
1448 if ((svcStatus
->sb_state
== SVC_PENDING
) &&
1449 ((req
!= SVC_START_PENDING
) && (req
!= SVC_DISABLE
))) {
1451 "Dropping op %d for svc%d: Service Pending Start\n",
1457 * Drop the request if it's an SVC_CHECK and the service isn't started.
1459 if ((req
== SVC_CHECK
) &&
1460 ((svcStatus
->sb_state
!= SVC_STARTED
) ||
1461 (svcStatus
->sb_owner
!= myNodeID
))) {
1462 clulog(LOG_DEBUG
, "Dropping SVC_CHECK for svc%d: Service "
1463 "not running locally\n", svcID
);
1468 * Drop the request if it's an SVC_CHECK and we're already doing
1469 * something to that service so that other requests may continue.
1471 if ((req
== SVC_CHECK
) && svc_children
[svcID
].cs_pid
) {
1473 "Dropping SVC_CHECK for svc%d: PID%d has not completed",
1474 svcID
, svc_children
[svcID
].cs_pid
);
1479 * Drop the request if it's an SVC_START, we are the owner, and
1480 * the service is currently stopping
1482 if ((req
== SVC_START
) && svc_children
[svcID
].cs_pid
) {
1484 "Dropping SVC_START for svc%d: PID%d has not completed",
1485 svcID
, svc_children
[svcID
].cs_pid
);
/**
 * Determine the target node we should relocate the service to if we are
 * not given one from cluadmin.  This checks the failover domain to see
 * the next node online in a given failover group.
 *
 * @param rmask		The nodes allowed to be checked for when we are
 *			trying to determine who should start the service.
 * @param current_owner	The current owner of the service, or the node
 *			who is requesting the information.  This is the
 *			_last_ member allowed to run the service.
 * @param svcID		ID of the service in question.
 */
best_target_node(memb_mask_t rmask, int current_owner, int svcID)
{
	x = current_owner + 1;

		if (node_should_start(x, rmask, svcID) == FOD_BEST) {

	} while (x != current_owner);

	return current_owner;
}
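
/*
 * The search above starts at current_owner + 1 and ends back at
 * current_owner, so if no other online member in the allowed mask
 * qualifies as a FOD_BEST starter, the current owner itself is returned.
 */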
1530 * clear_failure_mask(int svcID)
1533 * @see mark_self_failed
1536 clear_failure_mask(int svcID
)
1538 ServiceBlock svcStatus
;
1540 if (clu_svc_lock(svcID
) == -1) {
1541 clulog(LOG_ERR
, "Couldn't obtain lock for service %d: %s\n",
1542 svcID
, strerror(errno
));
1546 if (getServiceStatus(svcID
, &svcStatus
) != SUCCESS
) {
1547 clulog(LOG_ERR
, "Couldn't obtain status for service %d\n",
1549 clu_svc_unlock(svcID
);
1553 memset(svcStatus
.sb_failed_mask
, 0, sizeof(svcStatus
.sb_failed_mask
));
1554 if (setServiceStatus(&svcStatus
) != SUCCESS
) {
1555 clulog(LOG_ERR
, "Couldn't set FAILURE status for service %d\n",
1560 clu_svc_unlock(svcID
);
1565 * Marks our bit in the failed_nodes bitmask in the service block on disk.
1566 * This is a signal to other members to _not_ send us the service again.
1567 * This mask is cleared when a service is successfully started.
1570 * @see clear_failure_mask
1573 mark_self_failed(int svcID
)
1575 ServiceBlock svcStatus
;
1577 if (clu_svc_lock(svcID
) == -1) {
1578 clulog(LOG_ERR
, "Couldn't obtain lock for service %d: %s\n",
1579 svcID
, strerror(errno
));
1583 if (getServiceStatus(svcID
, &svcStatus
) != SUCCESS
) {
1584 clulog(LOG_ERR
, "Couldn't obtain status for service %d\n",
1586 clu_svc_unlock(svcID
);
1590 /* Mark ourselves as FAILED for this service */
1591 memb_mark_up(svcStatus
.sb_failed_mask
, myNodeID
);
1593 if (setServiceStatus(&svcStatus
) != SUCCESS
) {
1594 clulog(LOG_ERR
, "Couldn't set FAILURE status for service %d\n",
1599 clu_svc_unlock(svcID
);
1607 svc_report_failure(int svcID
)
1609 ServiceBlock svcStatus
;
1613 getSvcName(svcID
, &svcName
);
1615 if (clu_svc_lock(svcID
) == -1) {
1616 clulog(LOG_ERR
, "Couldn't obtain lock for service %s: %s\n",
1617 svcName
, strerror(errno
));
1621 if (getServiceStatus(svcID
, &svcStatus
) != SUCCESS
) {
1622 clulog(LOG_ERR
, "Couldn't obtain status for service %s\n",
1624 clu_svc_unlock(svcID
);
1627 clu_svc_unlock(svcID
);
1629 getNodeName(svcStatus
.sb_last_owner
, &nodeName
);
1632 "Service %s returned failure code. Last Owner: %s\n",
1635 "Administrator intervention required.\n",
1642 * handle_relocate_req - Relocate a service. This seems like a huge
1643 * deal, except it really isn't.
1645 * @param svcID Service ID in question.
1646 * @param flags If (flags & SVCF_PENDING), we were called from
1647 * handle_start_req - and so we should ignore all local
1648 * restarts/stops - since handle_start_req does this
1650 * @param preferred_target When sent a relocate message from the
1651 * management software, a destination node
1652 * is sent as well. This causes us to try
1653 * starting the service on that node *first*,
1654 * but does NOT GUARANTEE that the service
1655 * will end up on that node. It will end up
1656 * on whatever node actually successfully
1658 * @param new_owner Member who actually ends up owning the service.
1661 handle_relocate_req(int svcID
, int flags
, int preferred_target
,
1662 uint32_t *new_owner
)
1664 memb_mask_t allowed_nodes
;
1665 int target
= preferred_target
;
1667 char *nodeName
=NULL
, *svcName
=NULL
;
1669 getSvcName(svcID
, &svcName
);
1670 request
= (flags
& SVCF_PENDING
) ? SVC_START_PENDING
:
1674 * Stop the service - if we haven't already done so.
1676 if (!(flags
& SVCF_PENDING
)) {
1677 if (svc_stop(svcID
, flags
) != SUCCESS
) {
1678 if (svc_start(svcID
, flags
) != SUCCESS
)
1685 * First, see if it's legal to relocate to the target node. Legal
1686 * means: the node is online and is in the [restricted] failover
1687 * domain of the service, or the service has no failover domain.
1689 if (preferred_target
>= 0 && preferred_target
<= MAX_NODES
) {
1691 memset(allowed_nodes
, 0, sizeof(allowed_nodes
));
1692 memb_mark_up(allowed_nodes
, preferred_target
);
1693 target
= best_target_node(allowed_nodes
, myNodeID
, svcID
);
1696 * I am the ONLY one capable of running this service,
1699 if (target
== myNodeID
)
1702 if (target
== preferred_target
) {
1704 * It's legal to start the service on the given
1705 * node. Try to do so.
1707 if (relocate_service(svcID
, request
, target
) ==
1709 *new_owner
= target
;
1711 * Great! We're done...
1719 * Ok, so, we failed to send it to the preferred target node.
1720 * Try to start it on all other nodes.
1722 memcpy(allowed_nodes
, membership
, sizeof(memb_mask_t
));
1723 memb_mark_down(allowed_nodes
, myNodeID
);
	/* Don't try to relocate to the preferred target more than once. */
	if (preferred_target >= 0 && preferred_target <= MAX_NODES)
		memb_mark_down(allowed_nodes, preferred_target);
1729 while (memb_count(allowed_nodes
)) {
1730 target
= best_target_node(allowed_nodes
, myNodeID
, svcID
);
1731 if (target
== myNodeID
)
1734 switch (relocate_service(svcID
, request
, target
)) {
1736 memb_mark_down(allowed_nodes
, target
);
1739 svc_report_failure(svcID
);
1742 *new_owner
= target
;
1743 getNodeName(target
, &nodeName
);
1745 "Service %s now running on member %s\n",
1749 clulog(LOG_ERR
, "Invalid reply from member %d during"
1750 " relocate operation!\n", target
);
1755 * We got sent here from handle_start_req.
1758 if (flags
& SVCF_PENDING
)
1762 * All potential places for the service to start have been exhausted.
1766 clulog(LOG_WARNING
, "Attempting to restart service %s locally.\n",
1768 if (svc_start(svcID
, flags
) == SUCCESS
) {
1769 *new_owner
= myNodeID
;
1773 if (svc_stop(svcID
, 0) != SUCCESS
) {
1775 svc_report_failure(svcID
);
1783 * handle_start_req - Handle a generic start request from a user or during
1784 * service manager boot.
1786 * @param svcID Service ID to start.
1788 * @param new_owner Owner which actually started the service.
1789 * @return FAIL - Failure.
1790 * SUCCESS - The service is running.
1793 handle_start_req(int svcID
, int flags
, uint32_t *new_owner
)
1795 int ret
, tolerance
= FOD_BEST
, target
= -1;
1798 * When a service request is from a user application (eg, clusvcadm),
1799 * accept FOD_GOOD instead of FOD_BEST
1801 if (flags
& SVCF_START_DISABLED
)
1802 tolerance
= FOD_GOOD
;
1804 if (!(flags
& SVCF_RESTART
) &&
1805 (node_should_start(myNodeID
, membership
, svcID
) < tolerance
)) {
1807 /* Try to send to someone else who might care about it */
1808 target
= best_target_node(membership
, myNodeID
, svcID
);
1809 ret
= handle_relocate_req(svcID
, SVCF_PENDING
, target
,
1818 * Strip out all flags which are invalid.
1820 clulog(LOG_DEBUG
, "Starting service %d - flags 0x%08x\n", svcID
,
1825 * This is a 'root' start request. We need to clear out our failure
1826 * mask here - so that we can try all nodes if necessary.
1828 flags
|= SVCF_CLEAR_FAILURES
;
1830 ret
= svc_start(svcID
, flags
);
1833 if (clear_failure_mask(svcID
) != SUCCESS
) {
1834 clulog(LOG_WARNING
, "Could not clear failure bitmask for "
1835 "service #%s!\n", svcName
);
1840 * If we succeeded, then we're done.
1842 if (ret
== SUCCESS
) {
1843 *new_owner
= myNodeID
;
1848 * Keep the state open so the other nodes don't try to start
1849 * it. This allows us to be the 'root' of a given service.
1851 clulog(LOG_DEBUG
, "Stopping failed service %d\n", svcID
);
1852 if (svc_stop(svcID
, SVCF_PENDING
) != SUCCESS
) {
1853 clulog(LOG_CRIT
, "Service %d failed to stop cleanly", svcID
);
1857 * If we failed to stop the service, we're done. At this
1858 * point, we can't determine the service's status - so
1859 * trying to start it on other nodes is right out.
1865 * OK, it failed to start - but succeeded to stop. Now,
1866 * we should relocate the service.
1868 clulog(LOG_WARNING
, "Relocating failed service %d\n", svcID
);
1869 ret
= handle_relocate_req(svcID
, SVCF_PENDING
, -1, new_owner
);
1879 * handle_start_remote_req - Handle a remote start request.
1881 * @param svcID Service ID to start.
1882 * @param flags Flags to use to determine start behavior.
1883 * @return FAIL - Local failure. ABORT - Unrecoverable error:
1884 * the service didn't start, nor stop cleanly. SUCCESS
1885 * - We started the service.
1888 handle_start_remote_req(int svcID
, int flags
)
1891 int tolerance
= FOD_BEST
;
1893 memset(rmask
, 0, sizeof(rmask
));
1894 memb_mark_up(rmask
, myNodeID
);
1896 if (flags
& SVCF_START_DISABLED
)
1897 tolerance
= FOD_GOOD
;
1900 * See if we agree with our ability to start the given service.
1902 if (node_should_start(myNodeID
, rmask
, svcID
) < tolerance
)
1905 if (svc_start(svcID
, flags
) == SUCCESS
)
1909 if (mark_self_failed(svcID
) == FAIL
) {
1915 if (svc_stop(svcID
, 0) == SUCCESS
)
1924 * Handle a request regarding a service.
1926 * @param svcID ID of service in question.
1927 * @param action Action to be performed on the service.
1928 * @param target In the case of a relocate, target/destination node
1929 * we're relocating to.
1930 * @param fd File descriptor on which we send our response.
1933 handle_svc_request(int svcID
, int action
, int target
, msg_handle_t fd
)
1938 ServiceBlock svcStatus
;
1940 uint32_t new_owner
= NODE_ID_NONE
;
1943 getSvcName(svcID
, &svcName
);
1944 clulog(LOG_DEBUG
, "Service %s request %d\n", svcName
, action
);
1946 if (myNodeState
!= NODE_UP
)
1950 * Don't assume the service exists...
1952 if (serviceExists(svcID
) != YES
) {
1956 if (clu_svc_lock(svcID
) == -1) {
1957 clulog(LOG_ERR
, "Unable to obtain cluster lock: %s\n",
1962 if (getServiceStatus(svcID
, &svcStatus
) != SUCCESS
) {
1963 clulog(LOG_ERR
, "Cannot get status for service %d\n", svcID
);
1964 clu_svc_unlock(svcID
);
1968 clu_svc_unlock(svcID
);
	 * Check to see if we should drop the service request.  This
	 * is based on the current service status, the action required,
1975 if (svc_advise_drop_request(svcID
, action
, &svcStatus
)) {
1981 * Fork so that we can run service actions in parallel.
1983 while (svc_children
[svcID
].cs_pid
!= 0) {
1985 if (svcStatus
.sb_state
== SVC_PENDING
) {
1987 * Shouldn't get here, but if so, avoid deadlock.
1989 clulog(LOG_ERR
, "%s failed during "
1990 "relocate-to-preferred member operation",
1997 "Proc %d already running action on service %s\n",
1998 svc_children
[svcID
].cs_pid
, svcName
);
2000 /* See if we missed the SIGCHLD */
2001 if (!cleanup_child(svcID
))
2005 /* Record what the child will be doing */
2006 svc_children
[svcID
].cs_rq
= action
;
2008 if ((svc_children
[svcID
].cs_pid
= fork())) {
2009 if (svc_children
[svcID
].cs_pid
< 0) {
2014 "Fork failed handling action request.\n");
2015 svc_children
[svcID
].cs_pid
= 0;
2016 svc_children
[svcID
].cs_rq
= 0;
2018 /* Send reply, if applicable */
2022 clulog(LOG_DEBUG
, "[M] Pid %d -> %s for service %s\n",
2023 svc_children
[svcID
].cs_pid
, serviceActionStrings
[action
],
2029 block_signal(SIGTERM
);
2030 block_signal(SIGHUP
);
2032 clulog(LOG_DEBUG
, "[C] Pid %d handling %s request for service %s\n",
2033 getpid(), serviceActionStrings
[action
], svcName
);
2038 flags
|= ((fd
== -1) ? 0 : SVCF_START_DISABLED
);
2040 ret
= handle_start_req(svcID
, flags
, &new_owner
);
2043 case SVC_START_PENDING
:
2045 * We allow starting of pending requests only if
2046 * explicitly asked for from someone else - never on
2047 * a local node event.
2049 flags
= SVCF_PENDING
;
2050 case SVC_START_RELOCATE
:
2052 * We use fd as an indicator to see whether or not we
2053 * were called on behalf of a node event. Generally,
2054 * fd is set, but we usually don't handle relocation of
2055 * disabled services -- it's kind of an anomaly.
2057 flags
|= (fd
== -1) ? 0 : SVCF_START_DISABLED
;
2058 ret
= handle_start_remote_req(svcID
, flags
);
2064 if ((ret
= svc_stop(svcID
, 0)) == SUCCESS
) {
2066 * Ok, we did the stop - now do the whole start
2067 * process, including relocating in the case of
2070 if (action
== SVC_RESTART
) {
2072 flags
= SVCF_RESTART
;
2081 if (svc_start(svcID
, 0) == SUCCESS
)
2089 if ((ret
= svc_disable(svcID
)) == SUCCESS
)
2093 * We don't run svc_fail here because svc_fail could
2094 * put us back where we were. Always allow disable.
2101 if ((ret
= svc_check(svcID
)) == SUCCESS
)
2105 /* Try to relocate service at this point */
2106 ret
= handle_relocate_req(svcID
, 0, -1, &new_owner
);
2112 clulog(LOG_WARNING
, "Restarting locally failed service %s\n",
2115 (void) svc_stop(svcID
, 0);
2118 * Try the whole start process, including relocating it in
2119 * the case that it failed to restart locally.
2121 flags
= SVCF_RESTART
| SVCF_RESTARTFAILED
;
2126 if (svcStatus
.sb_state
== SVC_DISABLED
) {
2128 "Can not relocate disabled service %s\n",
2134 if (target
== myNodeID
)
2137 ret
= handle_relocate_req(svcID
, 0, target
, &new_owner
);
2141 clulog(LOG_ERR
, "Invalid service request %d\n", action
);
2147 * If fd is valid, the request was on behalf of a client who is
2148 * blocking for the status reply.
2152 msg_sm
.sm_data
.d_svcOwner
= new_owner
;
2153 msg_sm
.sm_data
.d_ret
= ret
;
2155 /* Encode before responding... */
2156 swab_SmMessageSt(&msg_sm
);
2158 if (msg_send(fd
, &msg_sm
, sizeof (SmMessageSt
)) !=
2159 sizeof (SmMessageSt
)) {
2160 clulog(LOG_ERR
, "Error replying to action request.\n");
2165 exit(ret
); /* child exit */
2170 * Check to see if we need to kill a child process - and do so if necessary.
2171 * We do not need to reset the cs_pid field. This should only be called
2172 * during a remote node-down event to determine if we had a relocate-request
2173 * or other request out to that node. If so, we need to kill the child
2174 * handling that request.
2176 * @param svcID Service ID
2177 * @param svc Service block (status of svcID)
2180 consider_reapage(int svcID
, ServiceBlock
* svc
)
2183 * Since PENDING is only a valid state when BOTH nodes are up, and
2184 * given that the remote node just died, mark the service as
2185 * 'stopped' if it was in the 'pending' state. Kill the child
2186 * process if it exists.
2188 if (svc
->sb_state
== SVC_PENDING
) {
2189 if (svc_children
[svcID
].cs_pid
)
2190 kill(svc_children
[svcID
].cs_pid
, SIGKILL
);
2192 clulog(LOG_DEBUG
, "Marking %d (state %d) as stopped", svcID
,
2195 /* Mark state -> stopped */
2196 if (clu_svc_lock(svcID
) == -1) {
2197 clulog(LOG_ERR
, "Unable to obtain cluster lock: %s\n",
2202 if (getServiceStatus(svcID
, svc
) == SUCCESS
) {
2203 svc
->sb_last_owner
= svc
->sb_owner
;
2204 svc
->sb_owner
= NODE_ID_NONE
;
2205 svc
->sb_state
= SVC_STOPPED
;
2206 if (setServiceStatus(svc
) != SUCCESS
) {
2208 "Failed marking service %d as stopped\n",
2212 clu_svc_unlock(svcID
);
2216 if (!svc_children
[svcID
].cs_pid
)
2220 * The child was SVC_START and the other node is marked as the owner.
2221 * This means we tried to start it locally, failed, and send a
2222 * REMOTE_START to the other node, but the other node died before we
2223 * received a response.
2225 * Simplify: Kill child whenever our partner owns the service.
2227 if (svc
->sb_owner
!= myNodeID
) {
2229 "Killing child PID%d: Remote member went down!",
2230 svc_children
[svcID
].cs_pid
);
2231 kill(svc_children
[svcID
].cs_pid
, SIGKILL
);
2236 * Our last case is an explicit relocate (eg, from cluadmin). The other
2237 * node went down, and we received its node-down event. This could have
2238 * been taken care of above, but we still need to catch the cases where
2239 * it hasn't been taken care of yet...
2241 if (svc_children
[svcID
].cs_rq
== SVC_RELOCATE
) {
2243 "Killing child PID%d: Remote member went down!",
2244 svc_children
[svcID
].cs_pid
);
2245 kill(svc_children
[svcID
].cs_pid
, SIGKILL
);
2251 * Rewrite a service block as 'stopped' if all members of its
2252 * restricted failover domain went offline.
2254 * @param svcID Service ID to stop.
2255 * @return FAIL, SUCCESS
2258 check_rdomain_crash(int svcID
, ServiceBlock
*svcStatus
)
2261 memb_mask_t allowed_nodes
;
2263 if (memb_online(membership
, svcStatus
->sb_owner
) ||
2264 (svcStatus
->sb_state
== SVC_STOPPED
))
2267 memcpy(allowed_nodes
, membership
, sizeof(memb_mask_t
));
2268 memb_mark_down(allowed_nodes
, svcStatus
->sb_owner
);
2269 if (best_target_node(allowed_nodes
, svcStatus
->sb_owner
, svcID
) !=
2270 svcStatus
->sb_owner
)
2273 if (clu_svc_lock(svcID
) == -1) {
2274 clulog(LOG_ERR
, "Unable to obtain cluster lock: %s\n",
2279 if (getServiceStatus(svcID
, svcStatus
) != SUCCESS
) {
2280 clu_svc_unlock(svcID
);
2281 clulog(LOG_ERR
, "Failed getting status for service %s\n",
2286 if ((svcStatus
->sb_state
!= SVC_STARTED
) ||
2287 (svcStatus
->sb_owner
== myNodeID
) ||
2288 memb_online(membership
, svcStatus
->sb_owner
)) {
2289 clu_svc_unlock(svcID
);
2293 svcStatus
->sb_last_owner
= svcStatus
->sb_owner
;
2294 svcStatus
->sb_owner
= NODE_ID_NONE
;
2295 svcStatus
->sb_state
= SVC_STOPPED
;
2296 svcStatus
->sb_transition
= (uint64_t)time(NULL
);
2297 if (setServiceStatus(svcStatus
) != SUCCESS
) {
2298 clu_svc_unlock(svcID
);
2299 clulog(LOG_ERR
, "Failed changing service status\n");
2302 clu_svc_unlock(svcID
);
2308 * Called to decide what services to start locally during a node_event.
2309 * Originally a part of node_event, it is now its own function to cut down
2310 * on the length of node_event.
2315 eval_services(int local
, int nodeStatus
)
2318 char *svcName
, *nodeName
;
2319 ServiceBlock svcStatus
;
2321 if (services_locked
)
2324 for (svcID
= 0; svcID
< MAX_SERVICES
; svcID
++) {
2326 if (serviceExists(svcID
) != YES
)
2329 getSvcName(svcID
, &svcName
);
2332 * Lock the service information and get the current service
2335 if (clu_svc_lock(svcID
) == -1) {
2336 clulog(LOG_ERR
, "Unable to obtain cluster lock: %s\n",
2341 if (getServiceStatus(svcID
, &svcStatus
) != SUCCESS
) {
2342 clulog(LOG_ERR
, "Cannot get status for service %s\n",
2344 clu_svc_unlock(svcID
);
2347 clu_svc_unlock(svcID
);
2349 if (svcStatus
.sb_owner
== NODE_ID_NONE
)
2352 getNodeName(svcStatus
.sb_owner
, &nodeName
);
2354 if ((svcStatus
.sb_state
== SVC_DISABLED
) ||
2355 (svcStatus
.sb_state
== SVC_FAILED
))
2358 clulog(LOG_DEBUG
, "Evaluating service %s, state %s, owner "
2360 serviceStateStrings
[svcStatus
.sb_state
], nodeName
);
2362 if (local
&& (nodeStatus
== NODE_UP
)) {
2365 * Start any stopped services, or started services
2366 * that are owned by a down node.
2368 if (node_should_start(myNodeID
, membership
, svcID
) ==
2370 handle_svc_request(svcID
, SVC_START
, -1, -1);
2375 if (!local
&& (nodeStatus
== NODE_DOWN
)) {
2378 * Start any stopped services, or started services
2379 * that are owned by a down node.
2381 consider_reapage(svcID
, &svcStatus
);
2382 if (node_should_start(myNodeID
, membership
, svcID
) ==
2384 handle_svc_request(svcID
, SVC_START
, -1, -1);
2386 check_rdomain_crash(svcID
, &svcStatus
);
2389 * Mark a service as 'stopped' if no members in its restricted
2390 * fail-over domain are running.
2398 * Called to handle the transition of a cluster member from up->down or
2399 * down->up. This handles initializing services (in the local node-up case),
2400 * exiting due to loss of quorum (local node-down), and service fail-over
2401 * (remote node down).
2403 * @param nodeID ID of the member which has come up/gone down.
2404 * @param nodeStatus New state of the member in question.
2405 * @see eval_services
2408 node_event(int nodeID
, int nodeStatus
)
2413 local
= (nodeID
== myNodeID
);
2415 if (nodeStatus
== NODE_UP
) {
2417 if (myNodeState
== NODE_UP
)
2420 myNodeState
= NODE_UP
;
2422 #ifdef OLD_CLU_ALIAS
2427 "local member up, initializing services\n");
2430 * Initialize all services we own. We needed to wait
2431 * for a NODE_UP event as we need the locking
2432 * subsystem for this.
2435 if (init_services() != SUCCESS
) {
2436 clulog(LOG_ERR
, "Cannot initialize services\n");
2441 if (nodeStatus
== NODE_DOWN
) {
2448 * Nothing to do for events from other nodes if we are not up.
2451 if (myNodeState
!= NODE_UP
)
2455 #ifdef OLD_CLU_ALIAS
2456 if (myNodeID
== memb_high_node(membership
)) {
2463 eval_services(local
, nodeStatus
);
2465 /* If we just came up, and our partner is up request a failback */
2466 if (local
&& (nodeStatus
== NODE_UP
)) {
2468 for (partner
= 0; partner
< MAX_NODES
; partner
++) {
2469 if (partner
== myNodeID
)
2472 if (memb_online(membership
, partner
)) {
2473 if (request_failback(partner
) != SUCCESS
) {
2475 "Unable to inform partner "
2476 "to start failback\n");
2485 * Run service status scripts on all services which (a) we are running and
2486 * (b) have check intervals set.
2488 * @param elapsed Number of elapsed seconds since last time
2489 * check_services was run.
2492 check_services(int elapsed
)
2496 ServiceBlock svcStatus
;
2500 for (svcID
= 0; svcID
< MAX_SERVICES
; svcID
++) {
2502 if (serviceExists(svcID
) != YES
)
2505 getSvcName(svcID
, &svcName
);
2508 * Check service interval first, since it doesn't
2511 if (getSvcCheckInterval(svcID
, &intervalStr
) == SUCCESS
)
2512 interval
= atoi(intervalStr
);
2520 * Check service status
2522 if (clu_svc_lock(svcID
) == -1) {
2523 clulog(LOG_ERR
, "Unable to obtain cluster lock: %s\n",
2528 if (getServiceStatus(svcID
, &svcStatus
) != SUCCESS
) {
2529 clu_svc_unlock(svcID
);
2531 "Failed getting status for service %s\n",
2535 clu_svc_unlock(svcID
);
2537 if ((svcStatus
.sb_owner
!= myNodeID
)
2538 || (svcStatus
.sb_state
!= SVC_STARTED
))
2541 ticks
[svcID
] += elapsed
;
2544 "Check interval for service %s is %d, elapsed %d\n",
2545 svcName
, interval
, ticks
[svcID
]);
2547 if (ticks
[svcID
] < interval
) {
2548 clulog(LOG_DEBUG
, "Too early to check service %s\n",
2554 handle_svc_request(svcID
, SVC_CHECK
, -1, -1);
2560 * Handle a QUORUM or QUORUM_GAINED message from the quorum daemon. This
2561 * updates our local membership view and handles whether or not we should
2562 * exit, as well as determines node transitions (thus, calling node_event()).
 * @param msg_quorum	Cluster event from the quorum daemon.
2569 handle_quorum_msg(cm_event_t
*msg_quorum
)
2571 memb_mask_t node_delta
, old_membership
;
2576 memcpy(old_membership
, membership
, sizeof(memb_mask_t
));
2577 memcpy(membership
, cm_quorum_mask(msg_quorum
), sizeof(memb_mask_t
));
2579 lock_set_quorum_view(cm_quorum_view(msg_quorum
));
2581 clulog(LOG_INFO
, "Quorum Event: View #%d %s\n",
2582 (int)cm_quorum_view(msg_quorum
),
2583 memb_mask_str(cm_quorum_mask(msg_quorum
)));
2586 * Handle nodes lost. Do our local node event first.
2588 memb_mask_lost(node_delta
, old_membership
, membership
);
2590 me
= memb_online(node_delta
, myNodeID
);
2592 /* Should not happen */
2593 clulog(LOG_INFO
, "State change: LOCAL OFFLINE\n");
2594 node_event(myNodeID
, NODE_DOWN
);
2598 for (x
=0; x
<MAX_NODES
; x
++) {
2602 * If a node loses its panic status and is not online,
2603 * take over services. That is - someone decided *for sure*
2604 * that said member is DOWN - so its state is no longer
2605 * unknown. (ie, disk-tiebreaker lost quorum...)
2607 getNodeName(x
, &nodeName
);
2609 if (memb_online(mask_panic
, x
) &&
2610 !memb_online(cm_quorum_mask_panic(msg_quorum
),x
) &&
2611 !memb_online(cm_quorum_mask(msg_quorum
),x
)) {
2612 memb_mark_down(mask_panic
, x
);
2613 node_event(x
, NODE_DOWN
);
2614 clulog(LOG_INFO
, "State change: %s DOWN\n",
2619 if (!memb_online(node_delta
, x
))
2622 if (memb_online(cm_quorum_mask_panic(msg_quorum
), x
)) {
2623 clulog(LOG_WARNING
, "Member %s's state is uncertain: "
2624 "Some services may be unavailable!",
2629 node_event(x
, NODE_DOWN
);
2630 clulog(LOG_INFO
, "State change: %s DOWN\n",
2635 * Store our panic nodemask.
2637 memcpy(mask_panic
, cm_quorum_mask_panic(msg_quorum
),
2638 sizeof(memb_mask_t
));
2641 * Handle nodes gained. Do our local node event first.
2643 me
= memb_mask_gained(node_delta
, old_membership
, membership
);
2645 clulog(LOG_INFO
, "State change: Local UP\n");
2646 node_event(myNodeID
, NODE_UP
);
2649 for (x
=0; x
<MAX_NODES
; x
++) {
2650 if (!memb_online(node_delta
, x
))
2656 node_event(x
, NODE_UP
);
2657 getNodeName(x
, &nodeName
);
2658 clulog(LOG_INFO
, "State change: %s UP\n",
/**
 * Read a message on a file descriptor (the one which is connected to
 * the quorumd daemon) and process it accordingly.
 *
 * @param fd		File descriptor connected to the quorum daemon.
 * @return		FAIL - no message waiting/empty message,
 *			SUCCESS - successfully handled message.
 */
quorum_msg(msg_handle_t fd)
{
	cm_event_t *msg_quorum;

	msg_quorum = cm_ev_read(fd);

	switch (cm_ev_event(msg_quorum)) {
	case EV_QUORUM_LOST:
		clulog(LOG_CRIT, "Halting services due to loss of quorum\n");

	case EV_QUORUM_GAINED:
		handle_quorum_msg(msg_quorum);

		clulog(LOG_DEBUG, "unhandled message request %d\n",
		       cm_ev_event(msg_quorum));
	}

	cm_ev_free(msg_quorum);
2713 * Receive and process a message on a file descriptor and decide what to
2714 * do with it. This function doesn't handle messages from the quorum daemon.
 * @param fd		File descriptor with a waiting message.
2717 * @return FAIL - failed to receive/handle message, or invalid
2718 * data received. SUCCESS - handled message successfully.
2722 dispatch_msg(msg_handle_t fd
)
2725 generic_msg_hdr msg_hdr
;
2729 ret
= msg_peek(fd
, &msg_hdr
, sizeof(msg_hdr
));
2730 if (ret
!= sizeof (generic_msg_hdr
)) {
2731 clulog(LOG_ERR
, "error receiving message header\n");
2735 /* Decode the header */
2736 swab_generic_msg_hdr(&msg_hdr
);
2737 if ((msg_hdr
.gh_magic
!= GENERIC_HDR_MAGIC
)) {
2738 clulog(LOG_ERR
, "Invalid magic: Wanted 0x%08x, got 0x%08x\n",
2739 GENERIC_HDR_MAGIC
, msg_hdr
.gh_magic
);
2743 clulog(LOG_DEBUG
, "received message, fd %d\n", fd
);
2745 switch (msg_hdr
.gh_command
) {
2746 case SVC_CONFIG_UPDATE
:
2747 clulog(LOG_INFO
, "Rereading configuration...\n");
2749 rebuild_config_lockless();
2757 clulog(LOG_NOTICE
, "Service states locked\n");
2758 services_locked
= 1;
2762 clulog(LOG_NOTICE
, "Service states unlocked\n");
2763 services_locked
= 0;
2766 case SVC_QUERY_LOCK
:
2767 msg_send_simple(fd
, services_locked
?SVC_LOCK
:SVC_UNLOCK
, 0, 0);
2770 case SVC_ACTION_REQUEST
:
2772 ret
= msg_receive_timeout(fd
, &msg_sm
, sizeof(msg_sm
),
2774 if (ret
!= sizeof(msg_sm
)) {
2775 clulog(LOG_ERR
, "receiving message data from client "
2776 "error: %d\n", ret
);
2780 /* Decode SmMessageSt message */
2781 swab_SmMessageSt(&msg_sm
);
2783 if (services_locked
) {
2784 msg_sm
.sm_data
.d_ret
= FAIL
;
2785 /* Encode before responding... */
2786 swab_SmMessageSt(&msg_sm
);
2788 if (msg_send(fd
, &msg_sm
, sizeof (SmMessageSt
)) !=
2789 sizeof (SmMessageSt
))
2791 "Error replying to action request.\n");
2796 if (msg_sm
.sm_data
.d_action
== SVC_FAILBACK
) {
2797 failback(msg_sm
.sm_data
.d_svcOwner
);
2801 handle_svc_request(msg_sm
.sm_data
.d_svcID
,
2802 msg_sm
.sm_data
.d_action
,
2803 msg_sm
.sm_data
.d_svcOwner
, fd
);
2807 clulog(LOG_DEBUG
, "unhandled message request %d\n",
2808 msg_hdr
.gh_command
);
2816 main(int argc
, char **argv
)
2818 struct timeval timeout
, tv1
, tv2
;
2820 int check_period
= 0;
2823 msg_handle_t listen_fd
, quorum_fd
;
2826 extern char *optarg
;
2827 int foreground
= 0, debug
= 0, opt
, retries
= 0;
2829 while ((opt
= getopt(argc
, argv
, "fd")) != EOF
) {
2842 (void) clu_set_loglevel(LOG_INFO
);
2844 (void) clu_set_loglevel(LOG_DEBUG
);
2847 daemon_init(argv
[0]);
2852 * Generally, you do this when you know you have quorum.
2853 * However, the service manager simply doesn't get here without
2854 * quorum... (The quorum daemon spawns it when it achieves quorum)
2856 shared_storage_init();
2857 switch(boot_config_init()) {
2859 clulog(LOG_CRIT
, "Configuration invalid!\n");
2869 memset(membership
,0,sizeof(memb_mask_t
));
2875 clulog(LOG_DEBUG
, "Service Manager starting\n");
2878 * daemon_init() blocks most signals, so we need to add the
2879 * ones the Service Manager is interested in.
2882 sigaddset(&set
, SIGINT
);
2883 sigaddset(&set
, SIGTERM
);
2884 sigaddset(&set
, SIGHUP
);
2885 sigaddset(&set
, SIGCHLD
);
2886 sigprocmask(SIG_UNBLOCK
, &set
, NULL
);
2887 (void) signal(SIGINT
, (void (*)(int)) sigterm_handler
);
2888 (void) signal(SIGTERM
, (void (*)(int)) sigterm_handler
);
2889 (void) signal(SIGHUP
, (void (*)(int)) sighup_handler
);
2890 (void) signal(SIGCHLD
, (void (*)(int)) reap_zombies
);
2893 * Retrieve our node id
2895 myNodeID
= memb_local_id();
2897 getNodeName(myNodeID
, &myNodeName
);
2898 myNodeName
= strdup(myNodeName
);
2899 myNodeState
= NODE_DOWN
;
2901 for (i
= 0; i
< MAX_SERVICES
; i
++) {
2903 svc_children
[i
].cs_pid
= 0;
2904 svc_children
[i
].cs_rq
= 0;
2908 * Set up the message service
2911 listen_fd
= msg_listen(PROCID_CLUSVCMGRD
);
2915 if (++retries
< 30) {
2916 sleep(1); /* Arbitrary... */
2920 /* Could be that we lost and regained quorum really quickly */
2921 clulog(LOG_ERR
, "Error setting up message listener: %s\n",
2923 clulog(LOG_ERR
, "%s process may already be running.\n",
2929 * Register for quorum events
2932 quorum_fd
= cm_ev_register(EC_QUORUM
);
2936 if (++retries
< 10) {
2941 clulog(LOG_CRIT
, "Couldn't register with the quorum daemon!");
2947 gettimeofday(&tv1
, NULL
);
2950 * Reap any zombied service scripts, as we do not synchronously
2951 * wait on any of the service scripts. If the process was
2952 * handling a service action, clear out the indication that it
2958 if (sighup_received
) {
2959 sighup_received
= 0;
2963 if (sigterm_received
)
2967 FD_SET(listen_fd
, &rfds
);
2968 FD_SET(quorum_fd
, &rfds
);
2970 timeout
.tv_usec
= 0;
2972 i
= select(MAX(listen_fd
,quorum_fd
) + 1, &rfds
, NULL
, NULL
,
2976 * We used to not check the return from the select call.
2977 * However, this is necessary now because clusvcmgrd needs
2978 * to properly handle SIGHUP
2982 if ((i
== -1) && (errno
!= EINTR
))
2983 clulog(LOG_WARNING
, "select: %s\n",
2987 if (FD_ISSET(listen_fd
, &rfds
)) {
2988 fd
= msg_accept_timeout(listen_fd
, 1);
2990 * Process any waiting messages.
2998 if (FD_ISSET(quorum_fd
, &rfds
)) {
2999 clulog(LOG_DEBUG
, "Processing quorum event\n");
3000 if (quorum_msg(quorum_fd
) == -1) {
3001 clulog(LOG_WARNING
, "Invalid message from "
3002 "Quorum Daemon. Reconnecting\n");
3003 /* Failed to process it? Try reconnecting */
3004 cm_ev_unregister(quorum_fd
);
3007 cm_ev_register(EC_QUORUM
)) == -1) &&
3008 !sigterm_received
) {
3010 clulog(LOG_EMERG
, "Couldn't reconnect "
3011 "to the quorum daemon! "
3013 REBOOT(RB_AUTOBOOT
);
3018 gettimeofday(&tv2
, NULL
);
3019 elapsed_secs
= tv2
.tv_sec
- tv1
.tv_sec
;
3022 * Check the status of running services and the cluster
3023 * configuration file (/etc/cluster.xml).
3025 if ((check_period
+= elapsed_secs
) >= CHECK_INTERVAL
) {
3026 check_config_file();
3027 if (check_config_data() == 1) {
3028 rebuild_config_lockless();
3033 check_services(check_period
);