A new doc, condor_config.html, is being added to show the Condor configuration.
#!/usr/bin/env python
#
# Project:
#   glideinWMS
#
# File Version:
#   $Id: glideinFrontend.py,v 1.80 2011/02/10 21:35:31 parag Exp $
#
# Description:
#   This is the main module of the glideinFrontend
#
# Arguments:
#   $1 = work_dir
#
# Author:
#   Igor Sfiligoi
#
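# Typical invocation (illustrative; work_dir is the frontend working
# directory prepared at installation time):
#   python glideinFrontend.py /path/to/frontend_work_dir
#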
import os
import os.path
import sys

STARTUP_DIR=sys.path[0]

import fcntl
import popen2
import traceback
import signal
import time
import string
import copy
import threading

sys.path.append(os.path.join(STARTUP_DIR,"../lib"))

import glideinFrontendPidLib
import glideinFrontendConfig
import glideinFrontendLib
import glideinFrontendMonitorAggregator
import logSupport

############################################################
def aggregate_stats():
    # Aggregate the per-group monitoring data into the frontend-wide status
    status=glideinFrontendMonitorAggregator.aggregateStatus()

    return

############################################################
def is_crashing_often(startup_time, restart_interval, restart_attempts):
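    # A group is "crashing often" once it has accumulated restart_attempts
    # startup times and (unless restart_attempts is 1) the oldest of them is
    # still within restart_interval seconds of now.
    # Illustrative examples with hypothetical values (t = time.time()):
    #   is_crashing_often([t-100], 1800, 3)                 -> False (attempts not exhausted)
    #   is_crashing_often([t-3600, t-200, t-100], 1800, 3)  -> False (oldest start outside the window)
    #   is_crashing_often([t-900, t-200, t-100], 1800, 3)   -> True  (three starts within 30 minutes)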
    crashing_often = True

    if (len(startup_time) < restart_attempts):
        # We haven't exhausted restart attempts
        crashing_often = False
    else:
        # Check if the service has been restarted often
        if restart_attempts == 1:
            crashing_often = True
        elif (time.time() - startup_time[0]) >= restart_interval:
            crashing_often = False
        else:
            crashing_often = True

    return crashing_often

############################################################
def spawn(sleep_time,advertize_rate,work_dir,
          frontendDescript,groups,restart_attempts,restart_interval):

    global STARTUP_DIR
    childs={}
    childs_uptime={}
    # By default allow max 3 restarts every 30 min before giving up

    glideinFrontendLib.log_files.logActivity("Starting groups %s"%groups)
    try:
        for group_name in groups:
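            # Each group is handled by a separate glideinFrontendElement.py
            # child process, invoked as:
            #   glideinFrontendElement.py <parent_pid> <work_dir> <group_name>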
            childs[group_name]=popen2.Popen3("%s %s %s %s %s"%(sys.executable,os.path.join(STARTUP_DIR,"glideinFrontendElement.py"),os.getpid(),work_dir,group_name),True)
            # Get the startup time. Used to check if the group is crashing
            # periodically and needs to be restarted.
            childs_uptime[group_name]=list()
            childs_uptime[group_name].insert(0,time.time())
        glideinFrontendLib.log_files.logActivity("Group startup times: %s"%childs_uptime)

        for group_name in childs.keys():
            childs[group_name].tochild.close()
            # set it in non blocking mode
            # since we will run for a long time, we do not want to block
            for fd in (childs[group_name].fromchild.fileno(),childs[group_name].childerr.fileno()):
                fl = fcntl.fcntl(fd, fcntl.F_GETFL)
                fcntl.fcntl(fd, fcntl.F_SETFL, fl | os.O_NONBLOCK)

        while 1:
            glideinFrontendLib.log_files.logActivity("Checking groups %s"%groups)
            for group_name in childs.keys():
                child=childs[group_name]

                # empty stdout and stderr
                try:
                    tempOut = child.fromchild.read()
                    if len(tempOut)!=0:
                        print child, tempOut
                except IOError:
                    pass # ignore
                try:
                    tempErr = child.childerr.read()
                    if len(tempErr)!=0:
                        print child, tempErr
                except IOError:
                    pass # ignore

                # look for exited child
                if child.poll()!=-1:
                    # the child exited
                    glideinFrontendLib.log_files.logWarning("Child %s exited. Checking if it should be restarted."%(group_name))
                    tempOut = child.fromchild.readlines()
                    tempErr = child.childerr.readlines()
                    if is_crashing_often(childs_uptime[group_name], restart_interval, restart_attempts):
                        del childs[group_name]
                        raise RuntimeError,"Group '%s' has been crashing too often, quit the whole frontend:\n%s\n%s"%(group_name,tempOut,tempErr)
                        #raise RuntimeError,"Group '%s' exited, quit the whole frontend:\n%s\n%s"%(group_name,tempOut,tempErr)
                    else:
                        # Restart the group setting its restart time
                        glideinFrontendLib.log_files.logWarning("Restarting child %s."%(group_name))
                        del childs[group_name]
                        childs[group_name]=popen2.Popen3("%s %s %s %s %s"%(sys.executable,os.path.join(STARTUP_DIR,"glideinFrontendElement.py"),os.getpid(),work_dir,group_name),True)
                        if len(childs_uptime[group_name]) == restart_attempts:
                            childs_uptime[group_name].pop(0)
                        childs_uptime[group_name].append(time.time())
                        childs[group_name].tochild.close()
                        for fd in (childs[group_name].fromchild.fileno(),childs[group_name].childerr.fileno()):
                            fl = fcntl.fcntl(fd, fcntl.F_GETFL)
                            fcntl.fcntl(fd, fcntl.F_SETFL, fl | os.O_NONBLOCK)
                        glideinFrontendLib.log_files.logWarning("Group's startup/restart times: %s"%childs_uptime)
            glideinFrontendLib.log_files.logActivity("Aggregate monitoring data")
            aggregate_stats()

            # do it just before the sleep
            glideinFrontendLib.log_files.cleanup()

            glideinFrontendLib.log_files.logActivity("Sleep")
            time.sleep(sleep_time)
    finally:
        # cleanup at exit
        for group_name in childs.keys():
            try:
                os.kill(childs[group_name].pid,signal.SIGTERM)
            except OSError:
                pass # ignore failed kills of non-existent processes

############################################################
def cleanup_environ():
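    # Drop any _condor_* or x509_* variables inherited from the caller's
    # environment (e.g. X509_USER_PROXY or a _CONDOR_<param> override;
    # illustrative examples), so they cannot leak into the frontend's own
    # Condor setup.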
    for val in os.environ.keys():
        val_low=val.lower()
        if val_low[:8]=="_condor_":
            # remove any CONDOR environment variables
            # don't want any surprises
            del os.environ[val]
        elif val_low[:5]=="x509_":
            # remove any X509 environment variables
            # don't want any surprises
            del os.environ[val]

############################################################
def main(work_dir):
    startup_time=time.time()

    glideinFrontendConfig.frontendConfig.frontend_descript_file=os.path.join(work_dir,glideinFrontendConfig.frontendConfig.frontend_descript_file)
    frontendDescript=glideinFrontendConfig.FrontendDescript(work_dir)
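    # frontendDescript.data holds the key/value configuration read from the
    # frontend description file inside work_dir; keys used below include
    # LogDir, LoopDelay, AdvertiseDelay, RestartAttempts, RestartInterval,
    # Groups and CondorConfig.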

    # the log dir is shared between the frontend main and the groups, so use a subdir
    log_dir=os.path.join(frontendDescript.data['LogDir'],"frontend")

    # Configure the process to use the proper LogDir as soon as you get the info
    glideinFrontendLib.log_files=glideinFrontendLib.LogFiles(log_dir,
                                                             float(frontendDescript.data['LogRetentionMaxDays']),
                                                             float(frontendDescript.data['LogRetentionMinDays']),
                                                             float(frontendDescript.data['LogRetentionMaxMBs']))

    try:
        cleanup_environ()
        # we use a dedicated config... ignore the system-wide
        os.environ['CONDOR_CONFIG']=frontendDescript.data['CondorConfig']

        sleep_time=int(frontendDescript.data['LoopDelay'])
        advertize_rate=int(frontendDescript.data['AdvertiseDelay'])
        restart_attempts=int(frontendDescript.data['RestartAttempts'])
        restart_interval=int(frontendDescript.data['RestartInterval'])

        groups=string.split(frontendDescript.data['Groups'],',')
        groups.sort()

        glideinFrontendMonitorAggregator.monitorAggregatorConfig.config_frontend(os.path.join(work_dir,"monitor"),groups)
    except:
        tb = traceback.format_exception(sys.exc_info()[0],sys.exc_info()[1],
                                        sys.exc_info()[2])
        glideinFrontendLib.log_files.logWarning("Exception occurred: %s" % tb)
        raise

    # create lock file
    pid_obj=glideinFrontendPidLib.FrontendPidSupport(work_dir)

    # start
    pid_obj.register()
    try:
        try:
            spawn(sleep_time,advertize_rate,work_dir,
                  frontendDescript,groups,restart_attempts,restart_interval)
        except KeyboardInterrupt, e:
            glideinFrontendLib.log_files.logActivity("Received signal...exit")
        except:
            tb = traceback.format_exception(sys.exc_info()[0],sys.exc_info()[1],
                                            sys.exc_info()[2])
            glideinFrontendLib.log_files.logWarning("Exception occurred: %s" % tb)
    finally:
        pid_obj.relinquish()


############################################################
#
# S T A R T U P
#
############################################################

def termsignal(signr,frame):
    raise KeyboardInterrupt, "Received signal %s"%signr

if __name__ == '__main__':
    signal.signal(signal.SIGTERM,termsignal)
    signal.signal(signal.SIGQUIT,termsignal)

    main(sys.argv[1])