7 # $Id: glideinFrontend.py,v 1.80 2011/02/10 21:35:31 parag Exp $
10 # This is the main of the glideinFrontend
23 STARTUP_DIR
=sys
.path
[0]
33 sys
.path
.append(os
.path
.join(STARTUP_DIR
,"../lib"))
35 import glideinFrontendPidLib
36 import glideinFrontendConfig
37 import glideinFrontendLib
38 import glideinFrontendMonitorAggregator
42 ############################################################
def aggregate_stats():
    """Run the frontend-wide aggregation of the monitoring status.

    Delegates to glideinFrontendMonitorAggregator.aggregateStatus().
    """
    # NOTE(review): the result is bound to `status` but no use of it is
    # visible in this part of the file -- confirm whether it is needed.
    status = glideinFrontendMonitorAggregator.aggregateStatus()
48 ############################################################
def is_crashing_often(startup_time, restart_interval, restart_attempts):
    """Tell whether a group has been restarted too many times, too quickly.

    Arguments:
      startup_time     -- list of time.time() stamps of the group's recent
                          (re)starts; element 0 is the oldest one kept
      restart_interval -- length, in seconds, of the observation window
      restart_attempts -- number of (re)starts tolerated within the window

    Returns True when the group should be considered as crashing too
    often, False otherwise.

    NOTE(review): the text this was rebuilt from was missing the bodies of
    the "crashing" branches and the final return; they are inferred from
    the visible `crashing_often = False` branches and from the caller,
    which uses the result as a boolean -- confirm against the original.
    """
    if len(startup_time) < restart_attempts:
        # We haven't exhausted restart attempts
        return False

    # Check if the service has been restarted often
    if restart_attempts == 1:
        # A single allowed attempt has already been used up
        return True

    # Not crashing often once the oldest recorded start has aged out of
    # the observation window
    return (time.time() - startup_time[0]) < restart_interval
66 ############################################################
67 def spawn(sleep_time
,advertize_rate
,work_dir
,
68 frontendDescript
,groups
,restart_attempts
,restart_interval
):
73 # By default allow max 3 restarts every 30 min before giving up
75 glideinFrontendLib
.log_files
.logActivity("Starting groups %s"%groups
)
77 for group_name
in groups
:
78 childs
[group_name
]=popen2
.Popen3("%s %s %s %s %s"%(sys
.executable
,os
.path
.join(STARTUP_DIR
,"glideinFrontendElement.py"),os
.getpid(),work_dir
,group_name
),True)
79 # Get the startup time. Used to check if the group is crashing
80 # periodically and needs to be restarted.
81 childs_uptime
[group_name
]=list()
82 childs_uptime
[group_name
].insert(0,time
.time())
83 glideinFrontendLib
.log_files
.logActivity("Group startup times: %s"%childs
_uptime
)
85 for group_name
in childs
.keys():
86 childs
[group_name
].tochild
.close()
87 # set it in non blocking mode
88 # since we will run for a long time, we do not want to block
89 for fd
in (childs
[group_name
].fromchild
.fileno(),childs
[group_name
].childerr
.fileno()):
90 fl
= fcntl
.fcntl(fd
, fcntl
.F_GETFL
)
91 fcntl
.fcntl(fd
, fcntl
.F_SETFL
, fl | os
.O_NONBLOCK
)
94 glideinFrontendLib
.log_files
.logActivity("Checking groups %s"%groups
)
95 for group_name
in childs
.keys():
96 child
=childs
[group_name
]
98 # empty stdout and stderr
100 tempOut
= child
.fromchild
.read()
106 tempErr
= child
.childerr
.read()
112 # look for exited child
115 glideinFrontendLib
.log_files
.logWarning("Child %s exited. Checking if it should be restarted."%(group_name))
116 tempOut
= child
.fromchild
.readlines()
117 tempErr
= child
.childerr
.readlines()
118 if is_crashing_often(childs_uptime
[group_name
], restart_interval
, restart_attempts
):
119 del childs
[group_name
]
120 raise RuntimeError,"Group '%s' has been crashing too often, quit the whole frontend:\n%s\n%s"%(group_name
,tempOut
,tempErr
)
121 #raise RuntimeError,"Group '%s' exited, quit the whole frontend:\n%s\n%s"%(group_name,tempOut,tempErr)
123 # Restart the group setting its restart time
124 glideinFrontendLib
.log_files
.logWarning("Restarting child %s."%(group_name))
125 del childs
[group_name
]
126 childs
[group_name
]=popen2
.Popen3("%s %s %s %s %s"%(sys
.executable
,os
.path
.join(STARTUP_DIR
,"glideinFrontendElement.py"),os
.getpid(),work_dir
,group_name
),True)
127 if len(childs_uptime
[group_name
]) == restart_attempts
:
128 childs_uptime
[group_name
].pop(0)
129 childs_uptime
[group_name
].append(time
.time())
130 childs
[group_name
].tochild
.close()
131 for fd
in (childs
[group_name
].fromchild
.fileno(),childs
[group_name
].childerr
.fileno()):
132 fl
= fcntl
.fcntl(fd
, fcntl
.F_GETFL
)
133 fcntl
.fcntl(fd
, fcntl
.F_SETFL
, fl | os
.O_NONBLOCK
)
134 glideinFrontendLib
.log_files
.logWarning("Group's startup/restart times: %s"%childs
_uptime
)
135 glideinFrontendLib
.log_files
.logActivity("Aggregate monitoring data")
138 # do it just before the sleep
139 glideinFrontendLib
.log_files
.cleanup()
141 glideinFrontendLib
.log_files
.logActivity("Sleep")
142 time
.sleep(sleep_time
)
145 for group_name
in childs
.keys():
147 os
.kill(childs
[group_name
].pid
,signal
.SIGTERM
)
149 pass # ignore failed kills of non-existent processes
152 ############################################################
def cleanup_environ():
    """Remove Condor and X509 related variables from os.environ.

    A variable is dropped when its lowercased name starts with "_condor_"
    or with "x509_"; everything else is left untouched.

    NOTE(review): the lowercase conversion and the deletions were missing
    from the text this was rebuilt from; they are reconstructed from the
    `val_low` name and the "remove any ... environment variables"
    comments -- confirm against the original.
    """
    # Walk a snapshot of the names: entries are deleted during the loop,
    # and mutating a mapping while iterating its live keys is an error
    # on Python 3.
    for val in list(os.environ.keys()):
        val_low = val.lower()
        # remove any CONDOR or X509 environment variables
        # don't want any surprises
        if val_low[:8] == "_condor_" or val_low[:5] == "x509_":
            del os.environ[val]
166 ############################################################
168 startup_time
=time
.time()
170 glideinFrontendConfig
.frontendConfig
.frontend_descript_file
=os
.path
.join(work_dir
,glideinFrontendConfig
.frontendConfig
.frontend_descript_file
)
171 frontendDescript
=glideinFrontendConfig
.FrontendDescript(work_dir
)
173 # the log dir is shared between the frontend main and the groups, so use a subdir
174 log_dir
=os
.path
.join(frontendDescript
.data
['LogDir'],"frontend")
176 # Configure the process to use the proper LogDir as soon as you get the info
177 glideinFrontendLib
.log_files
=glideinFrontendLib
.LogFiles(log_dir
,
178 float(frontendDescript
.data
['LogRetentionMaxDays']),
179 float(frontendDescript
.data
['LogRetentionMinDays']),
180 float(frontendDescript
.data
['LogRetentionMaxMBs']))
184 # we use a dedicated config... ignore the system-wide
185 os
.environ
['CONDOR_CONFIG']=frontendDescript
.data
['CondorConfig']
187 sleep_time
=int(frontendDescript
.data
['LoopDelay'])
188 advertize_rate
=int(frontendDescript
.data
['AdvertiseDelay'])
189 restart_attempts
=int(frontendDescript
.data
['RestartAttempts'])
190 restart_interval
=int(frontendDescript
.data
['RestartInterval'])
192 groups
=string
.split(frontendDescript
.data
['Groups'],',')
195 glideinFrontendMonitorAggregator
.monitorAggregatorConfig
.config_frontend(os
.path
.join(work_dir
,"monitor"),groups
)
197 tb
= traceback
.format_exception(sys
.exc_info()[0],sys
.exc_info()[1],
199 glideinFrontendLib
.log_files
.logWarning("Exception occurred: %s" % tb
)
203 pid_obj
=glideinFrontendPidLib
.FrontendPidSupport(work_dir
)
209 spawn(sleep_time
,advertize_rate
,work_dir
,
210 frontendDescript
,groups
,restart_attempts
,restart_interval
)
211 except KeyboardInterrupt, e
:
212 glideinFrontendLib
.log_files
.logActivity("Received signal...exit")
214 tb
= traceback
.format_exception(sys
.exc_info()[0],sys
.exc_info()[1],
216 glideinFrontendLib
.log_files
.logWarning("Exception occurred: %s" % tb
)
220 ############################################################
224 ############################################################
def termsignal(signr, frame):
    """Signal handler that turns a received signal into KeyboardInterrupt.

    Arguments:
      signr -- number of the signal that was delivered
      frame -- current stack frame (unused)

    Raises KeyboardInterrupt carrying the signal number in its message.
    """
    # Call form of raise: valid on both Python 2 and Python 3
    raise KeyboardInterrupt("Received signal %s" % signr)
229 if __name__
== '__main__':
230 signal
.signal(signal
.SIGTERM
,termsignal
)
231 signal
.signal(signal
.SIGQUIT
,termsignal
)