new file: cell2loc.py
[GalaxyCodeBases.git] / perl / etc / Jobcluster / Jobguard.pm
blobe209583aa44546985eeb4ead6564597d43b36c5b
1 =head1 NAME && VERSION
3 Jobcluster::Jobguard - Qsub single or multi line shell to cluster, and guard their jobs.
5 Version: V1.4.2
7 This module is based on some functions of the qsub-sge.pl written by Fan Wei and Hu Yujie, Many Thanks to both of them.
8 <Declare> The two names are from script "/share/backup/jiawl/useful/perl_script/qsub-sge.pl".
10 =head1 SYNOPSIS
12 use lib '/share/backup/jiawl/useful/perl_script/module';
13 use Jobcluster::Jobguard;
15 qsubsingle(<"-s:$shell_path">,<"-m:$memory">,["-c:$change_shell_sign"],["-q:$submit_queue"]);
16 qsubmulti(<"-s:$shell_path">,<"-m:$memory">,["-l:$split_shell_line_number"],["-q:$submit_queue"],["-sp:$subsh_prefix"],
17 ["-ct:[0/1]"],["-cu:[0/1]"]]);
18 jobguard(<"-d:$log_directory">,["-t:$interval"],["-r:$reqsub_sign"],["-n:$logname_prefix"],["-e:$max_eqw_time"],["-h:$qhost_timespan"],
19 [<"-ms:$mem_check_sign">,["-mc:$mem_cycle"],["-me:$mem_exceed"],["-mr:$mem_record"]],["-ql:$queue_limit"],
20 ["-df:$disk,$min_space,$rls_space"]*);
22 * means the parameter is multi tyoe, you can set it several times.
24 =head1 DESCRIPTION
26 Routine 'qsubsingle' allows you to qsub single-line shell script to cluster.
27 >> "-s:$shell_path" is the shell(single line) you want to submit to run.
28 >> "-m:$memory" is the memory you apply for your jobs. Default as 1G when your input lacks of units.
29 >> "-c:$change_shell_sign" defaults as 0, which means this routine couldnot change shell script to the right structure(NOTE<1>).
30 If you want your shell change to the structure automatically, input para "-c:1".
31 >> "-q:$submit_queue" is the job-queue where you want your job to run. (strict format: *.q)
33 Routine 'qsubmulti' allows you to qsub multi-line shell script to cluster.
34 >> "-s:$shell_path" is same as above. (multi lines)
35 >> "-m:$memory" is same as above.
36 >> "-l:$split_shell_line_number" is the Line Number of the small shell script splited from your input $shell_path script. Default as 1.
37 >> "-q:$submit_queue" is same as above.
38 >> "-sp:$subsh_prefix" is prefix of each sub shells split from the original shell "-s:$shell_path". Default as 'work'.
39 >> "-ct:[0/1]" is the sign of using system command 'time' to check the time each subshell used. Default as 0, means disable.
40 >> "-cu:[0/1]" is the sign of using system commands 'qstat -j' and 'grep' to get the 'usage' info of each subshell. Default as 0, means disable.
42 Routine 'jobguard' helps you to guard status of jobs that are submitted by 'qsubsingle' or 'qsubmulti' routine, and do something right.
43 >> "-d:$log_directory" is where the guard_job_log will exist, The directory does Not need to exist, for it will be created by this routine.
44 >> "-t:$interval" is the cycle time of guarding jobs, and it is in second(s). Default as 300(s).
45 >> "-r:$reqsub_sign" is default as '0', which means make the reqsub-error-job function disabled. Set $reqsub_sign as '1' to enable it.
46 >> "-n:$logname_prefix" is the prefix of job_guard_log's name.
47 when it is 'multi line shell' or one 'single line shell' job to guard, it will be set as same as the name of shell script in despite of your input.
48 when it is more than one 'single line shell' job to guard, it will be set as the variable $logname_prefix; or it will be set as 'jobguard' by default.
49 >> "-e:$max_eqw_time" is the max time of Eqw status for each job. Default as 60, but is limited below 100.
50 >> "-h:$qhost_timespan" is the time of each qhost check, default as 1, which means one qhost-check one cycle.
51 >> "-ms:$mem_check_sign" is the memory guard sign, default as 0, which means disable the memory-guard function.
52 >> "-mc:$mem_cycle" is similar to "-h:$qhost_timespan", default as 5. (effective with "-ms:1")
53 >> "-me:$mem_exceed" is the limited memory that the job can exceed its required memory(vf_mem). (effective with "-ms:1")
54 It can be decimals(fraction{[0,1]} of vf_mem) or specific memory(in uint 'M/m/G/g').
55 >> "-mr:$mem_record" is the sign of memory-guarding recording, default as 1, which means enable record function. (effective with "-ms:1")
56 If this function is enabled, memory-guard log will be found at the end of LOG file.
57 >> "-ql:$queue_limit" is the sign of job-submit-queue-limitation in the reqsubing error-jobs.
58 Default as 1, which means keep the original job-queue info. Want to resubmit error-jobs to any job-queues, set it as 0.
59 >> "-df:$disk,$min_space,$rls_space" is about the disk of which free space you want to check.
60 $disk is path of the disk to check via command 'df -h $disk';
61 $min_space is the minimum avail space the disk remains, once less, hold all jobs guarded;
62 $rls_space is the minimum avail space the disk remains to release all holded jobs.
63 Requirement: $rls_space should be at least 50G more than $min_space for safty, else it will die as an exception.
64 The available units are 'K,k,M,m,G,g,T,t'.
65 This parameters can be set in several times, so as to you may have several disks to check, but you should set all disks' info validly.
67 =head1 NOTE
69 <1> # guard works method #
70 1) All shell qsubed should have the '.... && perl -e 'print STDERR "This-work-is-completed\n"' structure to allow the routine 'jobguard' works.
71 2) If single-line shell script doesnot have the structure by itself, then set "-c:1" will change it, or routine 'qsubsingle' will die as an exception.
72 3) multi-line shell script is not required to have this structure, for routine 'qsubmulti' will add it at the end of each line automatically.
73 <2> # job guard method #
74 Use `whoami` command to get the userID for the following " qstat -u userID ".
75 Q: IF `whoami` could not get your valid name to " qstat -u userID ", how to solve this problem ?
76 A: First, Add "use Jobcluster::Jobguard qw/:DEFAULT $USER/;" in your perl script to export the our variable $USER;
77 Then, Change $USER to your valid name to let the " qstat -u $USER " works.
78 <3> # reqsub sign warning #
79 If you set the sign as default ('0'), which means you disable the reqsub-error-job function, once there exists any error job, It will die as an
80 exception-Warning when the jobguard routine finishes its all-job guarding. Because the error jobs cannot give you the right results, and your
81 next step which needs those right results will not succeed. The die just gives you chance to reqsub the error jobs by hand.
82 * absolute path of Shell scripts of all error jobs need to reqsub CAN be found at the end of the related guard_job_log.
83 <4> # elapsed time of job-guarding #
84 Everytime routine 'jobguard' runs, it will record the elapsed time in second(s). It will help you to know how much time each step takes.
85 Check it at the end of guard_job_log of each step ("Jobcluster::Jobguard::jobguard Guard time: XXXXXs"), But you know it isn't the cpu time.
86 <5> # memory guard method #
87 Use `qstat -j JobID | grep usage` to get the usage-info.
88 Try to set the "-mc:$mem_cycle" larger to lighten the burden of SGE.
89 When memory-guard function enabled, once there exists memory-error jobs, the routine 'jobguard' will die as an exception when finishs its
90 jobs' guarding work. So it is easy to undertand that any flow will die at the step whose scripts' max_memory turn out error.
91 <6> # max run/qw job number method #
92 Use "qsub ... -hold_jid XXX ..." to achieve the max run/qw job number control.
93 A Variable named as $MAX_R_JOBS, which defaults as 50, is the max number of run/qw jobs.
94 Add "use Jobcluster::Jobguard qw/:DEFAULT $MAX_R_JOBS/;" in your perl script to export the our variable $MAX_R_JOBS, then change it as you like.
95 <7> # delete logs of bad jobs #
96 A our type variable named as $DEL_BAD_LOG, defaults as 0, which means enable the deletion.
97 Add "use Jobcluster::Jobguard qw/:DEFAULT $DEL_BAD_LOG/;" in your perl script to export this variable, and you can change it.
98 <8> # operations when disk are full #
99 As your setting "-df:...." in the Routine 'jobguard', this module can do something right when disk reaches the limitation to avoid writing.
100 To the running jobs, use command 'qmod -s' to let them sleep, while using 'qmod -us' to wake up them when disk is ok;
101 To the qw/hqw jobs, use command 'qhold' to let them hold, while using 'qrls' to unhold them.
103 =head1 AUTHOR && CONTACT
105 Author : Jiawenlong at 2011/03/06
106 Contact: jiawenlong@genomics.org.cn
108 Welcome any question, bug-report or suggestion, TIA.
110 =head1 UPDATE LOG
112 V1.0
113 achieve the basic function.
114 V1.1
115 1) add the WARNING-log for nonexistence of jobs' log.
116 2) optimize some perldoc infos.
117 V1.2
118 1) update the routines' input para model.
119 2) add the single-line shell change function.
120 3) use 'qmod' to deal eqw-jobs, max deal time defaults as 60 for each job. (Many Thanks to Ye Rui and Chen Shuisheng)
121 4) add the qhost time span para (-h) in routine 'jobguard', and default as 1, which means one qhost-check one cycle.
122 5) optimize the deadnode-log.
123 V1.3
124 1) add four memory guard paras in routine 'jobguard'. (-ms,-mc,-me,-mr)
125 2) add the -q para to let user specify the queue to run on.
126 3) add the userID feedback step.
127 4) add the submit-queue-limit para in the routine 'jobguard'.
128 5) optimize the memory-record-log for memory-error-shell-script.
129 6) add the max_run_jobs function.
130 7) add the function that deal the line prefixed by '#' in shell script.
131 8) add the function that check the used time and usage info of each sub shell.
132 9) add the parameter to set prefix of sub-shell.
133 V1.4
134 1) add the check function of disk available space.
135 2) add the save fuction of bad-jobs' logs for users to check errors manually.
136 =cut
138 package Jobcluster::Jobguard;
140 use strict;
141 use warnings;
142 use File::Basename qw/basename dirname/;
143 use Cwd qw/abs_path/;
144 require Exporter;
146 #----- systemic variables -----
147 our (@ISA, @EXPORT, $VERSION, @EXPORT_OK, %EXPORT_TAGS);
148 @ISA = qw(Exporter);
149 @EXPORT = qw(jobguard qsubmulti qsubsingle);
150 @EXPORT_OK = qw(mendnumber $USER $MAX_R_JOBS $DEL_BAD_LOG $CHECK_USER);
151 %EXPORT_TAGS = ( DEFAULT => [qw(jobguard qsubmulti qsubsingle)],
152 OTHER => [qw(mendnumber)]);
153 #----- version --------
154 $VERSION = "1.4.2";
156 #----- required variables ------
157 our ($USER,%JOB_HASH,$CHECK_USER,$MAX_R_JOBS,$DEL_BAD_LOG);
158 chomp($USER = `whoami`);
159 $CHECK_USER = 1; ## default need to be check by user
160 my $SINGLE_CHECK_SIGN = 1;
161 $MAX_R_JOBS = 50; ## initial the max_run_job_number as 50, others hqw
162 my $USAGE_CHECK_PERL_SCRIPT;
163 my (%DEVICE,$DEVICE_HOLD);
164 $DEVICE_HOLD = 0;
165 foreach (@INC){
166 $USAGE_CHECK_PERL_SCRIPT = abs_path("$_/Jobcluster/USAGE_CHECK.pl") if(-e abs_path("$_/Jobcluster/USAGE_CHECK.pl"));
168 die "Cannot locate the USAGE_CHECK_PERL_SCRIPT\nIt should exist in the same directory with this module\n" unless($USAGE_CHECK_PERL_SCRIPT);
169 $DEL_BAD_LOG = 1;
170 #---------- check the status of submitted jobs until all done ----------
171 #----------------- if error, then rm deadlogs and resub ----------------
172 sub jobguard{
173 my %para_hash;
174 &jobguard_para(\%para_hash,@_);
175 #-------- necessary para ---------
176 my ($log_dir,$interval,$reqsub,$max_eqw_time,$qhost_timespan) = ($para_hash{'d'},$para_hash{'t'},$para_hash{'r'},$para_hash{'e'},$para_hash{'h'});
177 #-------- other para ---------
178 my ($mem_check_sign,$mem_cycle,$mem_exceed,$mem_record) = ($para_hash{'ms'},$para_hash{'mc'},$para_hash{'me'},$para_hash{'mr'});
179 my $queue_limit = $para_hash{'ql'};
180 unless(exists($JOB_HASH{START_TIME})){
181 chomp(my $time_anchor = `date +\%s`);
182 $JOB_HASH{START_TIME} = $time_anchor;
184 my $log = ((exists($JOB_HASH{SHELL}) && @{$JOB_HASH{SHELL}} == 1)?(basename @{$JOB_HASH{SHELL}}[0]):($para_hash{'n'} || 'jobguard')).".$JOB_HASH{START_TIME}.log";
185 delete $JOB_HASH{SHELL};
186 delete $JOB_HASH{START_TIME};
187 die "No jobs have been submitted(no jobs to guard), so you cannot not run Jobcluster::Jobguard::jobguard\n" if(scalar(keys %JOB_HASH) == 0);
188 open (LOG,">$log_dir/$log") || die "fail $log_dir/$log: $!\n"; ## open log
189 print "Please check $log_dir/$log for jobs' stat-info\n"; ## tell user to check log
190 my ($finishjob,$reqsub_number,$error_number,$mem_error_number) = (0,0,0,0);
191 my $jobnum = scalar(keys %JOB_HASH);
192 my @finished_jobs = ();
193 my @need_reqsub = ();
194 my %Eqw_Time; # this hash doesnot need to clear real-time
195 my %MEM_USE;
196 chomp(my $start_second = `date +\%s`);
197 my $cycle_time = 0;
198 while(($finishjob + scalar(@need_reqsub)/2 + $mem_error_number) != $jobnum){
199 $cycle_time++;
200 my @need_qmod = ();
201 sleep $interval;
202 ####### get the queue status, find the dead nodes #################
203 my @queue_status;
204 if($cycle_time % $qhost_timespan == 0){
205 HOST:{
206 @queue_status=split /\n/,`qhost`;
207 if(scalar(@queue_status)>0 && ($queue_status[0]!~/HOSTNAME/ || $queue_status[0]!~/LOAD/)){
208 sleep 60;
209 print LOG "\n<WARNING>:\tThe shell order 'qhost' gets invaid info! Redoing it Now!\n\n";
210 redo HOST;
214 else{
215 print LOG "\nINFO:\tQhost-time-span is $qhost_timespan, this time($cycle_time) skip qhost check...\n\n";
217 my %deadnode=map{
218 if(/^compute/){
219 my @ele=split;
220 my $sign='';
221 $sign.=$_ for @ele[3..7];
222 if($sign=~/-/){
223 ($ele[0],1);
225 else{
229 else{
232 } @queue_status;
233 #------------ get job status ------------
234 my @jobstat;
235 STAT:{
236 @jobstat=split /\n/,`qstat -u $USER`;
237 if(scalar(@jobstat)>0 && ($jobstat[0]!~/job-ID/ || $jobstat[0]!~/name/ || $jobstat[0]!~/state/)){
238 sleep 60;
239 print LOG "\n<WARNING>:\tThe shell order 'qstat -u $USER' gets invaid info! Redoing it Now!\n\n";
240 redo STAT;
243 my %jobstat=map{
244 if(/^\s*\d/){
245 s/^\s+//;
246 my @ele=split;
247 if(/@(compute-\d+-\d+)\D/){
248 if(exists($deadnode{$1})){
249 ($ele[0],"dqueue:$1");
251 else{
252 @ele[0,4];
255 else{
256 @ele[0,4];
259 else{
262 } @jobstat;
263 my @jobID=sort {$a<=>$b} keys %JOB_HASH;
264 foreach my $jobID(@jobID){
265 if(exists $jobstat{$jobID}){ ## jobs are on the cluster now
266 my $qdel_sign = 1; ## default as qdel those jobs
267 my $memory_ok = 1; ## default as job's used memory is ok, so can reqsub
268 if($jobstat{$jobID}=~/^[dE]/i){ ## d or E status
269 if($jobstat{$jobID} =~ /^dqueue:(compute.+)$/){ # dead node
270 print LOG "\n<WARNING>:\tJob $jobID is running on a dead node($1)! Try to qdel job $jobID...\n\n";
272 elsif($jobstat{$jobID} eq 'Eqw'){ # Error qw status
273 print LOG "\n<WARNING>:\tJob $jobID is in $jobstat{$jobID} status! ";
274 $Eqw_Time{$jobID} = 0 unless(exists($Eqw_Time{$jobID}));
275 $Eqw_Time{$jobID}++; # Eqw time + 1
276 if($Eqw_Time{$jobID} > $max_eqw_time){ # This JOB Eqw has exceeeded $max_eqw_time times
277 print LOG "But Its Eqw time has exceeded $max_eqw_time times!\n\n";
279 else{
280 print LOG "Try to qmod -cj job $jobID at the end of this guard-cycle...\n\n";
281 push @need_qmod,$jobID; # record the jobs need to be qmod
282 $qdel_sign = 0; # not qdel this job
286 else{ # other error status, like dr
287 print LOG "\n<WARNING>:\tJob $jobID is in $jobstat{$jobID} status! Try to qdel job $jobID...\n\n";
290 else{ ## jobs not in SGE-error status, next check memory and cpu-time(if ok)
291 #-------- running jobs' memory-check -----------
292 if($mem_check_sign && ($cycle_time % $mem_cycle == 0) && $jobstat{$jobID} eq 'r'){ ## running jobs' memory-check do
293 my $memory_max_usage;
294 ($memory_max_usage,$memory_ok,$qdel_sign) = &memory_guard($jobID,$mem_exceed);
295 if($mem_record && $memory_max_usage ne '-1'){ ## record the memory_max_used
296 my $shell = ${$JOB_HASH{$jobID}}[0];
297 ${$MEM_USE{$shell}}[0] = ${$JOB_HASH{$jobID}}[1]; ## required memory
298 ${$MEM_USE{$shell}}[1] = $memory_max_usage; ## max used memory
299 ${$MEM_USE{$shell}}[2] = 'Memory-Error_Need-modify-and-Reqsub' unless($memory_ok); ## memory-error-record-warning
301 print LOG "\n<WARNING>:\tJob $jobID exceeds required memory(${$JOB_HASH{$jobID}}[1]) more than ".(($mem_exceed =~ /m/i)?$mem_exceed:"its $mem_exceed")."! Try to qdel job $jobID...\n\n" if($qdel_sign);
303 else{ ## everything is fine...
304 $qdel_sign = 0; # not qdel this job
307 if($qdel_sign){
308 &qdel_job($jobID); ## `qdel $jobID`;
309 if($DEL_BAD_LOG){
310 push my @deadlog , (@{$JOB_HASH{$jobID}}[0].'.o'.$jobID) , (@{$JOB_HASH{$jobID}}[0].'.e'.$jobID);
311 &check_rm($_) for @deadlog;
312 print LOG "INFO:\tlogs of dead job $jobID have been deleted!\n";
314 if($reqsub){
315 if($memory_ok){
316 print LOG "INFO:\tAnd the new job for running the shell: ".${$JOB_HASH{$jobID}}[0]." is ".&qsubsingle("-s:${$JOB_HASH{$jobID}}[0]","-m:${$JOB_HASH{$jobID}}[1]","-r:2","-q:".(($queue_limit)?${$JOB_HASH{$jobID}}[2]:'NA'))."\n\n";
317 $reqsub_number++;
319 else{
320 print LOG "INFO:\tShell: ${$JOB_HASH{$jobID}}[0] exceeds required memory more than ".(($mem_exceed =~ /m/i)?$mem_exceed:"its $mem_exceed").", So it cannot qsub again!\n";
321 $mem_error_number++;
323 $error_number++;
325 else{
326 print LOG "\n<WARNING>:\t".@{$JOB_HASH{$jobID}}[0]." is not reqsubed!\n\n";
327 if(&test_store(\@need_reqsub,@{$JOB_HASH{$jobID}}[0])){
328 push @need_reqsub,@{$JOB_HASH{$jobID}};
329 $error_number++;
332 delete $JOB_HASH{$jobID};
335 else{ ## jobs are not on the cluster now
336 my $log=@{$JOB_HASH{$jobID}}[0].".e$jobID";
337 my $sign=(-e $log)?`tail -1 $log`:'The log not exists';
338 my $key=0;
339 if($sign=~/This-work-is-completed/){
340 $key=1;
342 else{
343 sleep 120; ## give the second chance to check log
344 $sign=(-e $log)?`tail -1 $log`:'The log not exists';
345 $key=1 if($sign=~/This-work-is-completed/);
347 if($key==1){
348 $finishjob++;
349 print LOG "INFO:\tJob $jobID has finished successfully!\n";
350 delete $JOB_HASH{$jobID};
351 push @finished_jobs,$jobID;
353 else{
354 if($sign eq 'The log not exists'){
355 print LOG "\n<WARNING>:\tCannot find the output log of job $jobID. ($log)\n";
357 else{
358 print LOG "\n<WARNING>:\toutput log of job $jobID doesnot have the 'This-work-is-completed' ($log)\n";
360 print LOG "<WARNING>:\tJob $jobID ruined by some reason! Try to qdel job $jobID...\n\n";
361 &qdel_job($jobID); ## `qdel $jobID`;
362 if($DEL_BAD_LOG){
363 push my @deadlog , (@{$JOB_HASH{$jobID}}[0].'.o'.$jobID) , (@{$JOB_HASH{$jobID}}[0].'.e'.$jobID);
364 &check_rm($_) for @deadlog;
365 print LOG "INFO:\tlogs of dead job $jobID have been deleted!\n";
367 if($reqsub){
368 print LOG "INFO:\tAnd the new job for running the shell: ".${$JOB_HASH{$jobID}}[0]." is ".&qsubsingle("-s:${$JOB_HASH{$jobID}}[0]","-m:${$JOB_HASH{$jobID}}[1]","-r:2","-q:".(($queue_limit)?${$JOB_HASH{$jobID}}[2]:'NA'))."\n\n";
369 $reqsub_number++;
370 $error_number++;
372 else{
373 print LOG "\n<WARNING>:\t".@{$JOB_HASH{$jobID}}[0]." is not reqsubed!\n\n";
374 if(&test_store(\@need_reqsub,@{$JOB_HASH{$jobID}}[0])){
375 push @need_reqsub,@{$JOB_HASH{$jobID}};
376 $error_number++;
379 delete $JOB_HASH{$jobID};
383 #------- do qmod tehe Eqw jobs -------
384 if(@need_qmod != 0){
385 print LOG "Now Try to qmod -cj the below jobs:\n";
386 print LOG "$_\n" for @need_qmod;
387 `qmod -cj @need_qmod`; # clear the Eqw status
388 #------- although the log not exists, but try del them for safe -------
389 foreach my $eqw_jobID (@need_qmod) {
390 push my @deadlog , (@{$JOB_HASH{$eqw_jobID}}[0].'.o'.$eqw_jobID) , (@{$JOB_HASH{$eqw_jobID}}[0].'.e'.$eqw_jobID);
391 &check_rm($_) for @deadlog;
393 print LOG "qmod finished\n";
395 #-------- check the disk free space -------
396 if(scalar(keys %DEVICE) != 0){
397 if($DEVICE_HOLD == 0){ ## no job qhold for the disk reason
398 unless(&disk_free){ ## never do anything else for remaining the user's freedom
399 print LOG "\n<WARNING>:\tDisk Avail space is not ok!\n";
400 &hold_jobs;
401 $DEVICE_HOLD = 1;
402 print LOG "\n<WARNING>:\thold all jobs done!\n";
405 else{
406 if(&disk_free){ ## never do anything else for remaining the user's freedom
407 print LOG "Disk Avail space is ok!\n";
408 &release_jobs;
409 $DEVICE_HOLD = 0;
410 print LOG "release all holded jobs done!\n";
414 #-------- output the each-cycle-info ---------
415 my @running_jobs = sort {$a<=>$b} keys %JOB_HASH;
416 print LOG "\nINFO:\tGuard-$cycle_time-time: all:$jobnum finsih:$finishjob [h]run/[h]qw/s:".scalar(@running_jobs)." error:$error_number ".(($reqsub)?"reqsub:$reqsub_number":("need_reqsub:".(scalar(@need_reqsub)/2)))." memory_error:$mem_error_number".' at '.`date`."\n";
417 print LOG "Jobs have been holded for the reason of not-enough-free-space-in-disk\n" if($DEVICE_HOLD == 1);
419 # undef %JOB_HASH;
420 chomp(my $end_second = `date +\%s`);
421 my @running_jobs = sort {$a<=>$b} keys %JOB_HASH;
422 die "!ERROR!:\tThe Last number of running jobs is not '0'!\n" if(@running_jobs != 0);
423 delete $JOB_HASH{$_} for keys %JOB_HASH;
424 if($mem_check_sign && $mem_record){ ## output the memory recorded of each shell_script
425 my @shells = sort keys %MEM_USE;
426 print LOG "\n---------- Shell scripts' recorded max-memory-usage ------------\n";
427 if(@shells == 0){
428 print LOG "No job's memory usage is record, because the function didnot run during the whole guarding time!\n\n";
430 else{
431 print LOG "Shell scripts' recorded memory usage are as followed:\n(<shell> <vf_memory> <max_used_memory> [warning])\n";
432 print LOG "$_\t${$MEM_USE{$_}}[0]\t${$MEM_USE{$_}}[1]".(defined(${$MEM_USE{$_}}[2])?"\t${$MEM_USE{$_}}[2]":'')."\n" for @shells;
435 print LOG "\nJobcluster::Jobguard::jobguard Guard time: ".($end_second - $start_second)."s\n\n";
436 print LOG "All jobs are finished successfully\n" if($jobnum == $finishjob);
437 if(!$reqsub && @need_reqsub/2 > 0){
438 print LOG "\n";
439 print LOG "------------ As 'reqsub function' is closed -------------\n";
440 print LOG "----------- And Error job exists IN this time -----------\n";
441 print LOG "------------- So Shell need to be reqsubed --------------\n";
442 print LOG "vf=$need_reqsub[($_-1)*2+1]\t$need_reqsub[($_-1)*2]\n" for (1..scalar(@need_reqsub)/2);
443 print LOG "\n\n";
444 print LOG "As 'reqsub function' is closed, so some Jobs Error lead to ALL Flow Shutdown!\nCHECK this log carefully!!!\n";
446 my $final_exception = '';
447 $final_exception .= "\nAs 'reqsub function' is closed, so some Jobs Error lead to ALL Flow Shutdown!\n" if(!$reqsub && @need_reqsub/2 > 0);
448 $final_exception .= "\nAs some shell scripts' max-used-memory have exceeded their required memory more than ".(($mem_exceed =~ /m/i)?$mem_exceed:"their $mem_exceed").", So they have been qdel.\nThis is the end of FLOW. Please optimize the shell script or modify your memory-paras\n" if($mem_error_number != 0);
449 if(length($final_exception) != 0){ ## will die?
450 print LOG $final_exception;
451 die $final_exception; ## yes, this is the exception.
453 close LOG;
455 #-------------------- rm the logs of dead jobs --------------------
456 sub check_rm{
457 my $rmfile=shift;
458 `rm $rmfile` if(-e $rmfile);
460 #-------------------- qsub the jobs -------------------
461 sub qsubsingle{
462 &check_user if($CHECK_USER);
463 my %para_hash;
464 &qsubsingle_para(\%para_hash,@_);
465 my ($sh,$memory,$record,$change_sign,$qsub_queue) = ($para_hash{'s'},$para_hash{'m'},$para_hash{'r'},$para_hash{'c'},$para_hash{'q'});
466 (my $line_sum) = (`wc -l $sh` =~ /^(\d+)\s+\S+$/); ## get the line number of mutil-shell
467 die "Shell $sh is not single-line\n" if($line_sum != 1 && $SINGLE_CHECK_SIGN);
468 chomp(my $line_info = `cat $sh`);
469 if($line_info =~ /^\#/){
470 print STDERR "single line shell but prefixed by '#'!\n";
471 exit();
473 # if($line_info !~ /&&/ || $line_info !~ /This-work-is-completed/ || $line_info !~ /STDERR/){
474 if($line_info !~ /\s+&&\s+\(*perl\s+-e\s+'print\s+STDERR\s+".*This-work-is-completed\\n"'\)*\s*$/){
475 die "$sh should ends with '.... && (perl -e 'print STDERR \"This-work-is-completed\\n\"')\n" unless($change_sign);
476 #------- change the original shell ---------
477 open(SH,">$sh") || die "fail change $sh: $!\n";
478 print SH "($line_info) && perl -e 'print STDERR \"This-work-is-completed\\n\"'\n";
479 close SH;
481 my $bin=dirname $sh;
482 my $queue_para = ($qsub_queue eq 'NA')?'':"-q $qsub_queue";
483 my $qsub = "qsub -o $bin -e $bin $queue_para -l vf=$memory ";
484 ## check the number of qsub jobs, max_run_jobs number works
485 my @sort_r_jobs = grep {/^\d+$/} (keys %JOB_HASH);
486 @sort_r_jobs = sort {$a<=>$b} @sort_r_jobs;
487 my $r_jobs_numbers = scalar(@sort_r_jobs);
488 if($r_jobs_numbers >= $MAX_R_JOBS){
489 $qsub .= ("-hold_jid ".$sort_r_jobs[$r_jobs_numbers - $MAX_R_JOBS]);
491 my $jobID;
492 LIST: {
493 $jobID = `$qsub $sh`;
494 print "$jobID";
495 $jobID = (split /\s+/,$jobID)[2];
496 if ($jobID !~/^\d+$/) {
497 print "qsub has not been done for $sh ...\n";
498 sleep 120;
499 redo LIST;
502 push @{$JOB_HASH{$jobID}},$sh,$memory,$qsub_queue;
503 push @{$JOB_HASH{SHELL}},$sh if($record == 1);
504 return $jobID;
506 #-------------------- qsub mutil line shell -----------------
507 #---- split the shell into single line shell, and qsub ------
508 sub qsubmulti{
509 &check_user if($CHECK_USER);
510 my %para_hash;
511 &qsubmulti_para(\%para_hash,@_);
512 my ($sh,$memory,$splitline,$queue_para,$subsh_prefix) = ($para_hash{'s'},$para_hash{'m'},$para_hash{'l'},$para_hash{'q'},$para_hash{'sp'});
513 chomp(my $time_anchor = `date +\%s`);
514 die "Jobcluster::Jobguard::qsubmult has run once, You should run Jobcluster::Jobguard::jobguard\n" if(exists($JOB_HASH{START_TIME}));
515 $JOB_HASH{START_TIME} = $time_anchor;
516 `mkdir -p $sh.$time_anchor.qsub`;
517 # (my $line_sum) = (`wc -l $sh` =~ /^(\d+)\s+\S+$/); ## get the line number of mutil-shell
518 (my $line_sum) = (`grep -v '^#' $sh | wc -l` =~ /^(\d+)/); ## get the line number of mutil-shell, but not '^#'
519 die "Cannot split $sh, because its executable line number is zero.\n" if($line_sum == 0);
520 $splitline = $line_sum if($splitline > $line_sum);
521 my $work_num = 1;
522 open (OLDSH,"$sh") || die "fail $sh: $!\n";
523 my $out_line = '';
524 while(<OLDSH>){
525 next if(/^\s+$/ || /^\#/);
526 s/\s+$//; ## chomp
527 $out_line .= ($para_hash{'ct'})?"time ($_) && ":"($_) && ";
528 if($. >= ($work_num * $splitline) || $. == $line_sum){
529 my $split_shell = "$sh.$time_anchor.qsub/$subsh_prefix".&mendnumber(length($work_num),length(int($line_sum/$splitline)+1)).$work_num.'.sh';
530 open (SPLITSH,">$split_shell") || die "fail $split_shell: $!\n";
531 # $out_line =~ s/;/ && /g; ## convert all ';' to ' && '
532 $out_line .= ($para_hash{'cu'})?"(perl $USAGE_CHECK_PERL_SCRIPT -u $USER -s $split_shell) && ":'';
533 print SPLITSH "${out_line}(perl -e 'print STDERR \"\\nThis-work-is-completed\\n\"')\n";
534 close SPLITSH;
535 $SINGLE_CHECK_SIGN = 0; ## close the singleline check
536 &qsubsingle("-s:$split_shell","-m:$memory","-r:2","-q:$queue_para"); ## unrecord in qsubsingle
537 $SINGLE_CHECK_SIGN = 1; ## open the singleline check
538 $work_num++;
539 $out_line = '';
542 close OLDSH;
543 push @{$JOB_HASH{SHELL}},$sh;
545 #-------------- mend the number with prefix-'0' ---------------
546 sub mendnumber{
547 my ($length_mend,$length_max) = @_;
548 die "Wrong: $length_max < $length_mend\n" if($length_max < $length_mend);
549 return ('0' x ($length_max - $length_mend));
551 #-------------- qdel job on the cluster ---------------
552 #------------- qdel first and then check --------------
553 sub qdel_job{
554 my $dID = shift;
555 for (my $time = 1;;$time++){
556 `qdel $dID`; ## qdel first and then check
557 sleep 10; ## wait qdel result
558 my $info = `qstat -j $dID 2>&1 | head -2`;
559 if($info =~ /Following\s+jobs\s+do\s+not\s+exist/){ ## qdel successfully ## Following jobs do not exist
560 print LOG "INFO:\tError job $dID has been qdel\n";
561 return;
563 if($time > 2){ ## three times 'qdel' failed
564 print LOG "!ERROR!:\tError job $dID cannot qdel in $time times\n\n";
565 return;
567 sleep 60;
570 #------- test wheather store the new error job's shell --------
571 sub test_store{
572 my ($error_array,$error_shell) = @_;
573 for (my $i=0; $i < @$error_array; $i+=2) {
574 return 0 if($$error_array[$i] eq $error_shell);
576 return 1;
578 #------- deal the para of routine jobguard ---------
579 sub jobguard_para{
580 my $phash = shift;
581 my %test;
582 $test{$_} = 1 for ('d','t','r','n','e','h','ms','mc','me','mr','ql','df'); # $log_dir,$interval,$reqsub,$logname_prefix,$max_eqw_time,$qhost_time_span,$mem_check_sign,$mem_cycle,$mem_exceed,$mem_record,$queue_limit,check-disk-free
583 foreach my $para (@_) {
584 my ($sign,$value) = ($para =~ /^\s*\-+([^\s:]+)\s*[:=]\s*(\S+)\s*$/);
585 die"Para_sign:$sign is not required.\n$para is wrong!\n" unless($sign && $value && exists($test{$sign})); # check the para_sign existence
586 if(exists($$phash{$sign})){
587 die "$sign is not a multi parameter, you can not give it several values!\n" if($sign ne 'df'); ## special for multi parameter 'df'
588 $$phash{$sign} .= ';'.$value;
590 else{
591 $$phash{$sign} = $value; # upload the para_hash
594 #------ check the log_dir --------
595 die "Your para should contain '-d:log_directory'\n" unless($$phash{'d'});
596 `mkdir -p $$phash{'d'}` unless(-d $$phash{'d'});
597 $$phash{'d'} = abs_path($$phash{'d'});
598 #------ check the interval -------
599 $$phash{'t'} ||= 300;
600 $$phash{'t'} =~ s/[a-z][A-Z]//g;
601 $$phash{'t'} = 300 if($$phash{'t'} !~ /^\d+$/ || $$phash{'t'} < 120);
602 #------- check the reqsub sign -----
603 $$phash{'r'} ||= 0; ## default as not reqsub error-jobs
604 #------- check the max eqw time -----
605 $$phash{'e'} = 60 if(!$$phash{'e'} || $$phash{'e'} !~ /^\d+$/ || $$phash{'e'} > 100);
606 #------- check the qhost time span -----
607 $$phash{'h'} = 1 if(!$$phash{'h'} || $$phash{'h'} !~ /^\d+$/ || $$phash{'h'} > 10);
608 #------- check mem_check_sign --------
609 $$phash{'ms'} ||= 0; ## default as close memort check
610 #------- check mem_cycle -------
611 die "Cannot set memory_check_cycle while memory_check_function closed!\n" if(!$$phash{'ms'} && exists($$phash{'mc'}));
612 $$phash{'mc'} = 5 if(!$$phash{'mc'} || $$phash{'mc'} !~ /^\d+$/ || !($$phash{'mc'} > 0));
613 #------- check mem_exceed -------
614 die "Cannot set memory_exceed while memory_check_function closed!\n" if(!$$phash{'ms'} && exists($$phash{'me'}));
615 $$phash{'me'} ||= 0.2; ## default
616 if($$phash{'me'} =~ /^[\d\.]+$/){ ## pecentage mode
617 $$phash{'me'} =~ s/\.+$//;
618 $$phash{'me'} = 0.2 unless($$phash{'me'} >= 0 && $$phash{'me'} <= 1);
620 elsif($$phash{'me'} =~ /^([\d\.]+)([mg])$/i){ ## just the specific memory
621 $$phash{'me'} = &get_m_memory($$phash{'me'}).'m'; ## in unit 'm' for easily memory-compare
623 else{ ## bad memory_exceed input
624 die "memory_exceed: $$phash{'me'} is not right form!\nOnly decimals or specific memory in 'M/m/G/g' unit!\n";
626 #------- check memory_record_sign -------
627 die "Cannot set memory_record while memory_check_function closed!\n" if(!$$phash{'ms'} && exists($$phash{'mr'}));
628 $$phash{'mr'} = 1 if(!exists($$phash{'mr'})); ## default as open memory record while memory_check_function opens
629 #------- job queue limit sign ---------
630 $$phash{'ql'} = 1 unless(exists($$phash{'ql'})); ## default queue-limit
631 #------- disk free space -------
632 if(exists($$phash{'df'})){
633 @{$DEVICE{abs_path((split /,/)[0])}} = ((split /,/)[1,2],0) for (split /;/,$$phash{'df'});
634 foreach my $disk (keys %DEVICE) {
635 my ($min,$rls) = @{$DEVICE{$disk}}[0,1];
636 $min = &transform_space($min);
637 $rls = &transform_space($rls);
638 if($min + 50 > $rls){
639 my @jobs = keys %JOB_HASH;
640 `qdel @jobs`;
641 die "\nThe rls_disk_space should be more than min_disk_space at least 50G for safty!\nBut your input is not!\ndisk: $disk\nmin_disk_space: ${$DEVICE{$disk}}[0]\nrls_disk_space: ${$DEVICE{$disk}}[1]\nQdel all jobs!\nPlease check your jobs by hand!\n\n";
646 #------- deal the para of routine qsubmulti ---------
647 sub qsubmulti_para{
648 my $phash = shift;
649 my %test;
650 $test{$_} = 1 for ('s','m','l','q','ct','cu','sp'); # $sh,$memory,$splitline,$qsub_queue,check_job_time,check_job_usage,$subsh_prefix
651 foreach my $para (@_) {
652 my ($sign,$value) = ($para =~ /^\s*\-+([^\s:]+)\s*[:=]\s*(\S+)\s*$/);
653 die "Cannot identify the para: $para\n" unless($sign && $value);
654 die "Para_sign:$sign is not required.\n$para is wrong!\n" unless(exists($test{$sign})); # check the para_sign existence
655 $$phash{$sign} = $value; # upload the para_hash
657 #------ check the shellpath --------
658 die "Your para should contain '-s:shell_path'\n" unless($$phash{'s'});
659 die "Cannot find the shellpath: $$phash{'s'}\n" unless(-e $$phash{'s'});
660 $$phash{'s'} = abs_path($$phash{'s'});
661 #------ check the interval -------
662 die "Your para should contain '-m:memory'\n" unless($$phash{'m'});
663 if($$phash{'m'} =~ /^\d+$/){
664 print STDERR "WARNING: Your memory for $$phash{'s'} has been reset as 1G\nBecause your input $$phash{'m'} has no unit!\n";
665 $$phash{'m'} = '1G';
667 #------- check the splitline -----
668 if(!$$phash{'l'} || $$phash{'l'} !~ /^\d+$/){
669 $$phash{'l'} = 1;
671 #------- check the submit queue -------
672 $$phash{'q'} ||= 'NA';
673 #------- check the subsh prefix -------
674 $$phash{'sp'} ||= 'work';
676 #------- deal the para of routine qsubsingle ---------
677 sub qsubsingle_para{
678 my $phash = shift;
679 my %test;
680 $test{$_} = 1 for ('s','m','r','c','q'); # $sh,$memory,$record,$change_single_sign,$qsub_queue
681 foreach my $para (@_) {
682 my ($sign,$value) = ($para =~ /^\s*\-+([^\s:]+)\s*[:=]\s*(\S+)\s*$/);
683 die"Para_sign:$sign is not required.\n$para is wrong!\n" unless($sign && $value && exists($test{$sign})); # check the para_sign existence
684 $$phash{$sign} = $value; # upload the para_hash
686 #------ check the shellpath --------
687 die "Your para should contain '-s:shell_path'\n" unless($$phash{'s'});
688 die "Cannot find the shellpath: $$phash{'s'}\n" unless(-e $$phash{'s'});
689 $$phash{'s'} = abs_path($$phash{'s'});
690 #------ check the interval -------
691 die "Your para should contain '-m:memory'\n" unless($$phash{'m'});
692 if($$phash{'m'} =~ /^\d+$/){
693 print STDERR "WARNING: Your memory for $$phash{'s'} has been reset as 1G\nBecause your input $$phash{'m'} has no unit!\n";
694 $$phash{'m'} = '1G';
696 #------- check the splitline -----
697 if(!$$phash{'r'} || $$phash{'r'} !~ /^\d+$/){
698 $$phash{'r'} = 1;
700 #------- check the change shell sign ------
701 $$phash{'c'} ||= 0;
702 #------- check the submit queue -------
703 $$phash{'q'} = (exists($$phash{'q'}) && $$phash{'q'} =~ /(\S+)\.q$/)?"$$phash{'q'}":'NA';
705 #---------- get the memory in unit 'm' -------
706 sub get_m_memory{
707 my $ele = shift;
708 my ($number,$unit) = ($ele =~ /^([\d\.]+)([mg])$/i);
709 die "$ele is not the right format of Jobcluster::Jobguard::get_m_memory\n" unless($number && $unit);
710 $number =~ s/\.+$//; ## do safe
711 if($unit eq 'm'){
712 return $number;
714 elsif($unit eq 'M'){
715 return ($number * 1.024);
717 elsif($unit eq 'g'){
718 return ($number * 1000);
720 elsif($unit eq 'G'){
721 return ($number * 1024);
723 else{
724 die "$ele is not the right format of Jobcluster::Jobguard::get_m_memory\n";
727 #-------- jobs' memory guard --------
728 sub memory_guard{
729 my ($jobID,$mem_exceed) = @_[0,1];
730 my $usage;
731 my $qj_time = 0;
732 UASGE:{
733 $usage = `qstat -j $jobID | grep usage`; ## stderr to stdout
734 $qj_time++;
735 # usage 1: cpu=02:23:06, mem=4481.25708 GBs, io=2.67408, vmem=602.082M, maxvmem=602.082M
736 unless($usage =~ /^usage/ && $usage =~ /cpu=[^,]+,\s+mem=[^,]+,\s+io=[^,]+,\s+vmem=[^,]+,\s+maxvmem=/){
737 if($qj_time > 3){
738 print LOG "\n<WARNING>:\tThe shell order 'qstat -j $jobID | grep usage' gets invaid info for 3 times! Skipping it...\n\n";
739 return ('-1',1,0); ## memory=-1, memory is ok, not qdel
741 if((my $new = `qstat -j $jobID 2>&1`) =~ /Following\s+jobs\s+do\s+not\s+exist/){ ## job not exists
742 print LOG "\n<WARNING>:\t$jobID does not exist! Skipping it...\n\n";
743 return ('-1',1,0); ## memory=-1, memory is ok, not qdel
745 sleep 30;
746 print LOG "\n" if($qj_time == 1);
747 print LOG "<WARNING>:\tThe shell order 'qstat -j $jobID | grep usage' gets invaid info! Redoing it Now!\n";
748 redo UASGE;
751 chomp($usage);
752 my @usage = split /\n+/,$usage;
753 my $max_usage;
754 foreach my $usage_info (@usage) {
755 ($max_usage) = ($usage_info =~ /maxvmem=(\S+)/);
756 if($max_usage =~ /N\/?A/i){
757 return ('NA',1,0); ## memory=NA, memory is ok, not qdel
759 else{
760 my $used_max_memory = &get_m_memory($max_usage);
761 my $required_memory = &get_m_memory(${$JOB_HASH{$jobID}}[1]);
762 my $limit_upper_memory;
763 if($mem_exceed =~ /m/i){
764 (my $mem_excd_number = $mem_exceed) =~ s/m//i;
765 $limit_upper_memory = $required_memory + $mem_excd_number;
767 else{
768 $limit_upper_memory = $required_memory * (1 + $mem_exceed);
770 if($used_max_memory > $limit_upper_memory){
771 return ("${used_max_memory}m",0,1); ## memory=used_max_memory, memory is not ok, qdel
775 return ($max_usage,1,0); ## memory=used_max_memory, memory is ok, not qdel
777 #-------- feedback the user name, ten seconds to kill this backstage task ------
778 sub check_user{
779 my $wait_time = 15;
780 print STDERR "\n".('#' x 26)." <NOTE> ".('#' x 26)."\nUse `whoami` and get your userID is '$USER'.\nIs it right?\nIf not, Your will have $wait_time seconds to kill this running backstage task, OR leave it...\nWant to get your valid(real) userID, please perldoc /share/backup/jiawl/useful/perl_script/module/Jobcluster/Jobguard.pm and see NOTE<2>\n".('#' x 60)."\n";
781 $CHECK_USER = 0;
782 # sleep $wait_time; ## abandon this
783 print STDERR "Counting Down: ";
784 for (my $second = $wait_time;$second > 0;$second--) { ## wait to be killed....oh, my god~~~ come on~~~
785 print STDERR "$second ";
786 sleep 1;
788 print STDERR "\nOK! The USERID is confirmed, all works start NOW...\n";
789 sleep 2;
791 #------ check the disk free space info -------
792 sub disk_free{
793 my ($anchor) = $_[0];
794 my $return = 1;
795 foreach my $disk (keys %DEVICE) {
796 my $disk_avail = &get_disk_info($disk);
797 my $anchor = ${$DEVICE{$disk}}[2];
798 if(&compare_space($disk_avail,${$DEVICE{$disk}}[$anchor]) eq 'NA'){ ## compare the space 'NA'
799 print LOG "\n<WARNING>:\tAvial space ($disk_avail) of disk $disk is less than the minimum amount(${$DEVICE{$disk}}[$anchor])." if($anchor == 0);
800 $return = 0;
801 ${$DEVICE{$disk}}[2] = 1; ## this disk is not ok
803 else{ ## avail space OK
804 print LOG "Avial space ($disk_avail) of disk(ever bad) $disk has reached the release amount(${$DEVICE{$disk}}[$anchor])." if($anchor == 1);
805 ${$DEVICE{$disk}}[2] = 0; ## this disk is ok
807 sleep 5; ## have a rest
809 return $return;
811 #------ get the disk free space info -------
812 sub get_disk_info{
813 my ($disk) = $_[0];
814 my $time = 0;
815 my $avail;
816 DF:
818 my $df_info = `df -h $disk`;
819 $time++;
820 if($df_info !~ /Filesystem.+Size.+Used.+Avail/i){
821 die "Cannot get the disk $disk space info!\n" if($time == 3);
822 redo DF;
824 else{
825 ($df_info) = (split /\n+/,$df_info)[2];
826 $df_info =~ s/^\s+//g;
827 $df_info =~ s/\s+$//g;
828 ($avail) = (split /\s+/,$df_info)[2];
829 if($avail !~ /\d/){
830 die "Cannot get the avail space info of disk $disk!\n" if($time == 3);
831 redo DF;
835 return $avail;
837 #------ compare the spaces -------
838 sub compare_space{
839 my ($want_bigger,$want_smaller) = @_[0,1];
840 $want_bigger = &transform_space($want_bigger);
841 $want_smaller = &transform_space($want_smaller);
842 if($want_bigger >= $want_smaller){
843 return 'OK';
845 else{
846 return 'NA';
849 #------ transform space based on the unit --------
850 sub transform_space{
851 my ($old) = $_[0];
852 my ($num,$unit) = ($old =~ /^([\d\.]+)(.+)?$/);
853 $unit ||= 'k'; ## default
854 my (%multiple,@multiple);
855 push @multiple,(1024**($_)) for (-2 .. 1);
856 @multiple{qw/T t G g M m K k/} = @multiple[3,3,2,2,1,1,0,0];
857 if(exists($multiple{$unit})){
858 $num *= $multiple{$unit};
860 else{
861 die "Cannot distinguish the unit $unit from $old!\n";
863 return $num;
865 #------ hold jobs --------
866 sub hold_jobs{
867 my @running_jobs = sort {$a<=>$b} keys %JOB_HASH;
868 my @qhold_jobs = `qmod -s @running_jobs | grep 'can not be' | cut -d ' ' -f10 | cut -d '.' -f1`; ## first qmod -s running jobs
869 if(@qhold_jobs){
870 chomp(@qhold_jobs);
871 `qhold @qhold_jobs`; ## then qhold other jobs in others status
874 #------ release jobs --------
875 sub release_jobs{
876 my @running_jobs = sort {$a<=>$b} keys %JOB_HASH;
877 my @qrls_jobs = `qmod -us @running_jobs | grep 'can not be' | cut -d ' ' -f10 | cut -d '.' -f1`; ## first qmod -us s jobs
878 if(@qrls_jobs){
879 chomp(@qrls_jobs);
880 `qrls @qrls_jobs`; ## then qhold other jobs in others status
884 1; ## tell the perl script the successful access of this module.