modified: myjupyterlab.sh
[GalaxyCodeBases.git] / perl / soap / classic / runSoapNewVersionMemG.pl
blob5c6ffd1486717d9912114e80b9e0e76844159b39
1 #!/bin/env perl
2 use strict;
3 use warnings;
4 use lib '/share/raid010/resequencing/soft/lib';
5 use Data::Dumper;
6 use TableExtract;
7 #require "/panfs/GAG/junli/raid010/pipeline/flow_script/TableExtract.pm";
8 use Getopt::Long;
9 use FindBin qw($Bin $Script);
11 my %opts = ();
12 GetOptions(\%opts,"fqDir:s","outDir:s","backDir:s","gsoap:i","sort:i","statistic:i","ISconfig:s","ref1:s","ref2:s","chrOrder:s","soapVersion:i","lib:s","mem:s","queue:s");
14 #by fangxd Fri May 23 16:46:23 HKT 2008
15 #modify by lijun3 Tue Dec 1 15:38:15 CST 2009
16 #modify by HXs @ Tue May 18 17:57:18 CST 2010
18 unless(defined $opts{fqDir} && $opts{outDir} && $opts{ref1} && $opts{chrOrder}){
19 print "\n$0 : auto run soap after solexa sequencing finished and provide basic statistics on mapping result,if want to bzip the reads and alignment result,please provide the directory to deposited them\n";
20 print "perl $0\n";
21 print "\t-fqDir\t\tthe directory contain FASTQ files,usually locate in /share/raid9/solexa-work/Run_seq_dir/Project_solexa_fq/PROJECT/LIBRARY\n";
22 print "\t-outDir\t\tthe directory for soap alignment result files\n";
23 print "\t-backDir\tthe directory to compress the fastq files,if not necessary,you can omit it \n";
24 print "\t-gsoap\t\t switch for gsoap\n";
25 print "\t-sort\t\t switch for sort the soap result\n";
26 print "\t-statistic\t switch basic statistics for the alignment\n";
27 print "\t-ISconfig\t configure file for Insert Size,useful for being without Eland alignment,the config file looks like\n";
28 print "\t\t\t\t\tFastqName\tLibrary\tisPE\tRead1Length\tRead2Length\tInsertSize\tMinIS\tMaxIS\n";
29 print "\t-lib\t configure file for Insert Size\n\t\t\tformat: libraryName insertSize [min max]\n";
30 print "\t-ref1\t references in fasta format\n";
31 print "\t\t/share/raid8/fangxd/data/personGenome/data/human.fa\n";
32 print "\t-ref2\t references in binary format(soapVersion2 need)\n";
33 print "\t\t /share/raid8/fangxd/data/personGenome/data/human.fa.index\n";
34 print "\t-chrOrder\tchromomose sort order: /share/raid8/fangxd/data/personGenome/data/chroder \n";
35 print "\t-soapVersion\t 1 for old version,2 for the new version\n\n";
36 print "\t-mem\t\tMemory request in qusb,default 7g\n";
37 print "\t-queue\t\tdesignation the queue: gag_snp.q (the all.q will be added automatically \n";
38 exit 0;
41 my $projectDir = $opts{fqDir};
42 my $outputDir = $opts{outDir};
43 my $compressOutDir = $opts{backDir};
44 my $chrOrder = $opts{chrOrder};
45 my $refSeqFileFlat = $opts{ref1} if (defined $opts{ref1});
46 my $refSeqFileBinary = $opts{ref2} if (defined $opts{ref2});
47 my $queue=" ";
48 my $addque="";
49 if(defined $opts{queue}) {
50 $addque = $opts{queue};
51 $queue=" -q all.q -q $addque";
55 my $soapVersion = 2;
56 if(defined $opts{soapVersion}){
57 $soapVersion = $opts{soapVersion};
58 if( $soapVersion == 2 && ! defined $opts{ref2}){
59 die "SOAP2 need a pre-formated reference\n";
60 }elsif($soapVersion == 1 && ! defined $opts{ref1}){
61 die "SOAP1 need a reference in fasta format\n";
65 print "fqDir\t$opts{fqDir}\n";
66 print "outDir\t$opts{outDir}\n";
67 my $seconds = 1;
70 ######################################### globle variable
73 my %fqInformation = ();
74 mkdir $outputDir unless (-d $outputDir);
75 my $fqPathListFile = "$outputDir/fastq.list";
76 my $fqPathListFileOld = "$outputDir/fastq.list.old";
77 my $statisticsFile = "$outputDir/Statistics.txt";
78 my $statisticsFileRate = "$outputDir/Statistics.rate";
79 my @newFastq = ();
80 my %soapCommand = ();
81 my $soapBinOld = "$Bin/subBin/soap1 -w 5"; # only allow 5 equal best hit to save cpu time
82 my $soapBinNew = "$Bin/subBin/soap2.20 -p 4";
83 my $zipBin = `which bzip2`;
84 chomp $zipBin;
85 my $sortSoapBin = "python $Bin/subBin/SortSoap.py";
86 # python SortSoap.py soapFile chrOrder read1Length read2Length outputFile
87 my %libInfo = ();
88 my %fastqInformation = ();
89 my @fileToCompress = ();
91 ######################################### globle variable
94 findNewFastq();
95 generateSoapShell();
96 submitJob();
97 #CompressFile(\@fileToCompress,$compressOutDir) if (defined $opts{backDir});
99 sub generateSoapShell{
100 if(@newFastq > 0){
101 foreach my $line (@newFastq){ # #FastqID Library isPE Read1Length Read2Length Median MinInsertSize MaxInsertSize FastqFullPath
102 my $refSeqFile = $refSeqFileFlat;
103 my $soapComm = '';
104 my @info = split /\s+/,$line;
105 my $shellFile = '';
106 my $readLength = $info[3];
107 my $soapOutFile = '';
108 my $fqFullPath = $info[-1];
109 my $fqBasename = (split '/',$fqFullPath)[-1];
110 my $unmapReadsOutDir = "$outputDir/unmapReads";
111 if ($readLength>=60 && $soapVersion == 1){
112 warn "soap version 1.0 can not handle read longer than 60 bp.. exit\n";
113 exit 0;
115 my $largeLib = '';
116 if($info[7]>1500){
117 $largeLib = " -R ";
119 my $currentOutDir = "$outputDir/soap/";
120 mkdir $currentOutDir unless (-d $currentOutDir);
122 if($info[2]==1){ # PE
123 $currentOutDir = "$outputDir/soap/PE";
124 mkdir $currentOutDir unless (-d $currentOutDir);
125 $shellFile = "$currentOutDir/soap_$fqBasename.sh";
126 my $fqFullPath2 = $fqFullPath;
127 $fqFullPath2 =~ s/\_1\.fq/\_2\.fq/;
128 $soapOutFile = "$currentOutDir/$fqBasename.soap";
129 if($soapVersion == 1){
130 my $seed = $readLength>30?12:10;
131 $soapComm = "$soapBinOld -a $fqFullPath -b $fqFullPath2 -d $refSeqFileFlat -o $soapOutFile -2 $currentOutDir/$fqBasename.single -t -c 52 -p 8 -s $seed -m $info[6] -x $info[7]";
132 }elsif($soapVersion == 2){
133 my $seedLength = 32;
134 my $maxMismatch = $readLength>70?3:1;
135 my $minLength = 40;
136 $soapComm = "$soapBinNew -a $fqFullPath -b $fqFullPath2 -D $refSeqFileBinary -o $soapOutFile -2 $currentOutDir/$fqBasename.single -m $info[6] -x $info[7] $largeLib -t -s $minLength -l $seedLength -v $maxMismatch";
137 #this modify is just for no trim
138 #$soapComm = "$soapBinNew -a $fqFullPath -b $fqFullPath2 -D $refSeqFileBinary -o $soapOutFile -2 $currentOutDir/$fqBasename.single -m $info[6] -x $info[7] $largeLib -l $seedLength -v $maxMismatch";
139 if(defined($opts{gsoap})){
140 $opts{gsoap}=5 if $opts{gsoap}==0;
141 mkdir $unmapReadsOutDir unless(-d $unmapReadsOutDir);
142 $unmapReadsOutDir = "$unmapReadsOutDir/PE";
143 mkdir $unmapReadsOutDir unless(-d $unmapReadsOutDir);
144 my $unmapFile = "$unmapReadsOutDir/$fqBasename.unmap";
145 #$soapComm .= " -u $unmapFile";
146 $soapComm .= " -g $opts{gsoap} -u $unmapFile";
149 push @fileToCompress,$fqFullPath,$fqFullPath2,$soapOutFile,"$currentOutDir/$fqBasename.single";
150 }else{
152 my $currentOutDir = "$outputDir/soap/SE";
153 mkdir $currentOutDir unless (-d $currentOutDir);
155 $shellFile = "$currentOutDir/soap_$fqBasename.sh";
156 $soapOutFile = "$currentOutDir/$fqBasename.soap";
158 if($soapVersion == 1){
159 my $seed = $readLength>30?12:10;
160 $soapComm = "$soapBinOld -a $fqFullPath -d $refSeqFileFlat -o $soapOutFile -t -p 8 -s $seed";
161 }elsif($soapVersion == 2){
162 my $seedLength = 32;
163 my $maxMismatch = $readLength>70?3:1;
164 my $minLength = 40;
165 $soapComm = "$soapBinNew -a $fqFullPath -D $refSeqFileBinary -o $soapOutFile -t -s $minLength -l $seedLength -v $maxMismatch";
166 #this modify if just for been
167 #$soapComm = "$soapBinNew -a $fqFullPath -D $refSeqFileBinary -o $soapOutFile -l $seedLength -v $maxMismatch";
168 if(defined($opts{gsoap})){
169 $opts{gsoap}=5 if $opts{gsoap}==0;
170 mkdir $unmapReadsOutDir unless(-d $unmapReadsOutDir);
171 $unmapReadsOutDir = "$unmapReadsOutDir/SE";
172 mkdir $unmapReadsOutDir unless(-d $unmapReadsOutDir);
173 my $unmapFile = "$unmapReadsOutDir/$fqBasename.unmap";
174 #$soapComm .= " -u $unmapFile";
175 $soapComm .= " -g $opts{gsoap} -u $unmapFile";
178 push @fileToCompress,$fqFullPath,$soapOutFile;
180 my $memeryRequest = '';
181 #if(defined $opts{mem}){
182 # $memeryRequest = $opts{mem};
183 # $memeryRequest =~ s/g//i;
185 $memeryRequest = $soapVersion==1?"14g":"7g";
187 if(defined $opts{mem}){
188 $memeryRequest = $opts{mem};
189 #$memeryRequest =~ s/g//i;
192 $soapCommand{$shellFile} = join "\t",$memeryRequest,$soapComm;
193 $fastqInformation{$shellFile} = $line . "\t" . $soapOutFile;
197 sub submitJob{
199 my %jobIDs = ();
200 my $psudoJobID = 0;
201 foreach my $shellFile (keys %soapCommand){
202 my $soapOutFile = (split "\t",$fastqInformation{$shellFile})[-1];
203 if(-f $soapOutFile){
204 $psudoJobID++;
205 $jobIDs{"psudoJobID.$psudoJobID"} = $fastqInformation{$shellFile};
206 }else{
207 print "$shellFile\n";
208 my ($memeryRequest,$soapComm) = split "\t",$soapCommand{$shellFile};
209 open OUT,">$shellFile" or die "Error in writing [ $shellFile ]:$!\n";
210 print OUT "$soapComm\n";
211 close OUT;
212 #next;
213 my $qsub = "qsub$queue -cwd -l vf=$memeryRequest $shellFile";
214 my $jobID = `$qsub`;
215 print "$qsub jobID $jobID\n";
216 $jobID = (split /\s+/,$jobID)[2];
217 while($jobID !~ /\d+/){
218 $jobID = `$qsub`;
219 $jobID = (split /\s+/,$jobID)[2];
220 print "\tresub $qsub\n";
221 sleep 10;
223 $jobIDs{$jobID} = $fastqInformation{$shellFile};
224 sleep 10;
227 waitJobsDoneSortSoap2(%jobIDs);
230 sub waitJobsDoneSortSoap2{
231 my (%jobsID) = @_;
232 while(keys %jobsID){
234 foreach my $jobid (keys %jobsID){
235 my $qstat = `qstat -j $jobid`;
236 chomp $qstat;
238 my @qstat_lines = split /\n/, $qstat;
241 if(defined($qstat_lines[1]))
243 # print "test : $qstat_lines[1]\n";
244 my $job = (split /\s+/,$qstat_lines[1])[1];
246 # print "job: $job\n";
249 else
252 print "Finish jobid $jobid\n";
253 sortSoap($jobsID{$jobid}) if(defined $opts{sort});
254 mapStatics($jobsID{$jobid}) if (defined $opts{statistic});
255 delete $jobsID{$jobid};
267 sleep $seconds;
278 sub waitJobsDoneSortSoap{
280 my (%jobsID) = @_;
281 while( keys %jobsID > 0){
282 my $qstat = `qstat`;
283 chomp $qstat;
284 my @qstatLines = split /\n/,$qstat;
285 if(@qstatLines < 2){
286 sleep 10;
287 next;
289 my %runningJobs = ();
290 foreach my $line (@qstatLines){
291 my $job = (split /\s+/,$line)[0];
292 next if (! $job);
293 if($line =~ /Eqw/){
294 my $qresub = `qresub $job`;
295 chomp $qresub;
296 my $qresubJobID = (split /\s+/,$qresub)[2];
297 $jobsID{$qresubJobID} = $jobsID{$job};
298 delete $jobsID{$job};
299 `qdel $job`;
300 $runningJobs{$qresubJobID} = 1;
301 }else{
302 if(exists $jobsID{$job}){
303 $runningJobs{$job} = 1;
307 foreach my $jobID(keys %jobsID){
308 if(! exists $runningJobs{$jobID}){
309 print "Finish jobid $jobID\n";
310 if ($jobID !~ /^\d+$/ && $jobID !~ /psudoJobID/){
311 delete $jobsID{$jobID};
312 next;
314 sortSoap($jobsID{$jobID}) if(defined $opts{sort});
315 mapStatics($jobsID{$jobID}) if (defined $opts{statistic});
316 delete $jobsID{$jobID};
319 sleep $seconds;
328 sub mapStatics{
330 #die "exit in mapStatics\n";
331 my ($line) = @_;
332 my @info = split /\s+/,$line;
333 my $fastqID = $info[0];
334 my $library = $info[1];
335 my $isPE = $info[2];
336 my %finishedStatistics = ();
337 if(-f $statisticsFile){
338 open IN,"$statisticsFile" or die "Error in reading [ $statisticsFile ] $!\n";
339 while(my $line = <IN>){
340 if ($line !~ /^\d+/){
341 next;
343 chomp $line;
344 $finishedStatistics{(split /\s+/,$line)[0]} = 1;
346 close IN;
348 return 0 if (exists $finishedStatistics{$fastqID});
350 print "$fastqID\tStatistics...\n";
351 my ($fastqFile,$soapOutFile) = @info[-2,-1];
352 my @soapOutFiles = ();
353 push @soapOutFiles,$soapOutFile;
354 if($isPE){
355 my $soapSingleFile = $soapOutFile;
356 $soapSingleFile =~ s/\.soap/\.single/;
357 push @soapOutFiles,$soapSingleFile;
360 my %readLength = ();
361 $readLength{a} = $info[3];
362 $readLength{b} = $info[4];
364 my $tmpCount = `wc -l $fastqFile`;
365 chomp $tmpCount;
366 my $readCount = (split /\s+/,$tmpCount)[0];
367 $readCount /= 4;
368 my $baseCount = $readCount * $readLength{a};
369 if($isPE){
370 $baseCount = $readCount * ($readLength{a}+$readLength{b});
371 $readCount *= 2;
374 my $mapReadsCount = 0;
375 my $mapBasesCount = 0;
376 my $uniqMapReadsCount = 0;
377 my $uniqMapBasesCount = 0;
378 my $trimReadsCount = 0;
379 my $trimBasesCount = 0;
380 my %mismatchCount = ();
381 my $dupRate = "NA";
382 $mismatchCount{0} = 0;
383 $mismatchCount{1} = 0;
384 $mismatchCount{2} = 0;
386 my $peMapReadsCount = 0;
387 my $peMapReadsRate = 0;
389 my $peMapBasesCount = 0;
391 foreach my $soapOutFile(@soapOutFiles){
392 print "\treading $soapOutFile\n";
393 open IN,"$soapOutFile" or die "Error in reading file [ $soapOutFile ] $!\n";
394 my @lines = ();
395 my $isPESoap = 0;
396 if($isPE && $soapOutFile =~ /\.soap/){
397 $isPESoap = 1;
399 my @checkDuplicate = ();
400 while(my $line = <IN>){
401 @lines = split /\s+/,$line;
402 next if (@info <10 || @info > 12);
403 $mismatchCount{$lines[9]}++;
404 $mapReadsCount++;
405 $mapBasesCount += $lines[5];
406 if($isPESoap){
407 $peMapBasesCount += $lines[5];
408 $peMapReadsCount++;
409 if($.<=2000000){
410 push @checkDuplicate,"$info[0],$info[7],$info[8]";
413 if($lines[3] == 1){
414 $uniqMapReadsCount++;
415 $uniqMapBasesCount+=$lines[5];
417 if($lines[5] != $readLength{$lines[4]}){
418 $trimReadsCount++;
419 $trimBasesCount+=$readLength{$lines[4]} - $lines[5];
422 if($isPESoap){
423 my $dup = 0;
424 my $total = 0;
425 my %pcrDup = ();
426 foreach my $chr(keys %pcrDup){
427 my %freq = ();
428 for(my $i=0;$i<@checkDuplicate;$i+=2){
429 my($id1,$chr1,$pos1) = split ',',$checkDuplicate[$i];
430 my($id2,$chr2,$pos2) = split ',',$checkDuplicate[$i+1];
431 if($id1 eq $id2 && $chr1 eq $chr2){
432 my $key = '';
433 if($pos1>$pos2){
434 $key = "$pos2,$pos1";
435 }else{
436 $key = "$pos1,$pos2";
438 $pcrDup{$chr1}{$key}++;
441 @checkDuplicate = (); # release memory
442 foreach my $chr(keys %pcrDup){
443 foreach my $pair(keys %{$pcrDup{$chr}}){
444 $total++;
445 if( $pcrDup{$chr}{$pair} > 1 ){
446 $dup += $pcrDup{$chr}{$pair} - 1;
451 if($total == 0){
452 $dupRate = "NA";
453 }else{
454 $dupRate = sprintf("%.4f",$dup / $total * 100);
458 if(-f $statisticsFile){
459 open STAT,">>$statisticsFile" or die "Error in writting [ $statisticsFile ] $!\n";
460 open RATE,">>$statisticsFileRate" or die "Error in writting [ $statisticsFileRate ] $!\n";
461 }else{
462 open STAT,">$statisticsFile" or die "Error in writting [ $statisticsFile ] $!\n";
463 open RATE,">$statisticsFileRate" or die "Error in writting [ $statisticsFileRate ] $!\n";
464 my $title1 = "Fastq\tLibrary\tIsPE\tLength1\tLength2\t#Reads\t#Bases\t#MapableReads\t#MapableBases\t#UniqMapReads\t#PEMapableBases\t#UniqMapableBases\t#TrimedBases\t#U0Reads\t#U1Reads\t#U2Reads\n";
465 print STAT "$title1\n";
466 my $title2 = "Fastq\tLibrary\tIsPE\tLength1\tLength2\t#Reads\t#Bases\tMapableReads(%)\tMapableBases(%)\tpeMapBasesRate(%)\tUniqMapRead\tUniqMapableBases(%)\tTrimedBases(%)\tU0Reads(%)\tU1Reads(%)\tU2Reads(%)\tPCRdupRate\n";
467 print RATE "$title2\n";
470 my $mapReadsRate = sprintf("%.2f",$mapReadsCount / $readCount * 100);
471 my $mapBasesRate = sprintf("%.2f",$mapBasesCount / $baseCount * 100);
472 my $uniqBasesRate = sprintf("%.2f",$uniqMapBasesCount / $mapBasesCount * 100);
473 my $trimBasesRate = sprintf("%.2f",$trimBasesCount / $mapBasesCount * 100);
474 my $peMapBasesRate = sprintf("%.2f",$peMapBasesCount / $mapBasesCount * 100);
475 my $u0Rate = sprintf("%.2f",$mismatchCount{0} / $readCount * 100);
476 my $u1Rate = sprintf("%.2f",$mismatchCount{1} / $readCount * 100);
477 my $u2Rate = sprintf("%.2f",$mismatchCount{2} / $readCount * 100);
478 my $uniqMapReadRate = sprintf("%.2f",$uniqMapReadsCount/$mapReadsCount);
480 print STAT "$fastqID\t$library\t$isPE\t$readLength{a}\t$readLength{b}\t$readCount\t$baseCount\t$mapReadsCount\t$mapBasesCount\t$peMapBasesCount\t$uniqMapReadsCount\t$uniqMapBasesCount\t$trimBasesCount\t$mismatchCount{0}\t$mismatchCount{1}\t$mismatchCount{2}\n";
481 print RATE "$fastqID\t$library\t$isPE\t$readLength{a}\t$readLength{b}\t$readCount\t$baseCount\t$mapReadsRate\t$mapBasesRate\t$peMapBasesRate\t$uniqMapReadRate\t$uniqBasesRate\t$trimBasesRate\t$u0Rate\t$u1Rate\t$u2Rate\t$dupRate\n";
483 close STAT;
484 close RATE;
487 sub sortSoap{
488 my ($info) = @_;
489 # 081103_I328_FC30JLNAAXX_L5_HSKnomeR4AADCABPE_1.fq HSKnomeR4AADCABPE 1 75 75 222 198 249 /share/raid9/solexa-work/Run_seq_dir/Project_solexa_fq/SZC08002_HSKnomeR4_resequencing_DNA/HSKnomeR4AADCABPE/081103_I328_0002_FC30JLNAAXX/081103_I328_FC30JLNAAXX_L5_HSKnomeR4AADCABPE/081103_I328_FC30JLNAAXX_L5_HSKnomeR4AADCABPE_1.fq /share/raid11/resequencing/Knome4/NS12665//PE/081103_I328_FC30JLNAAXX_L5_HSKnomeR4AADCABPE_1.fq.soap
490 my @info = split /\s+/,$info;
491 my $isPE = $info[2];
492 my ($read1Len,$read2Len) = @info[3,4];
493 my $soapOutFile = $info[-1];
494 my $soapOutFileBasename = (split '/',$soapOutFile)[-1];
495 my $type = $isPE>0?"PE":"SE";
496 my $sortOutDir = "$outputDir/SoapSort";
497 mkdir $sortOutDir unless (-d $sortOutDir);
498 $sortOutDir = "$outputDir/SoapSort/$type";
499 mkdir $sortOutDir unless (-d $sortOutDir);
500 my $shellFile = "$sortOutDir/sort_$soapOutFileBasename.sh";
501 my $soapSortOutFile = "$sortOutDir/$soapOutFileBasename.sort";
502 #my $sortComm1 = "$sortSoapBin -in $soapOutFile -out $soapSortOutFile -order $chrOrder -a $read1Len -b $read2Len";
503 my $sortComm1 = "$sortSoapBin $soapOutFile $chrOrder $read1Len $read2Len $soapSortOutFile";
504 my $sortComm2 = "";
505 my @commands = ();
506 push @commands,$sortComm1 if(! -f $soapSortOutFile);
507 if($isPE){
508 my $soapSortOutFile2 = $soapSortOutFile;
509 my $soapOutFile2 = $soapOutFile;
510 $soapSortOutFile2 =~ s/\.soap/\.single/g;
511 $soapOutFile2 =~ s/\.soap/\.single/g;
512 $sortComm2 = "$sortSoapBin $soapOutFile2 $chrOrder $read1Len $read2Len $soapSortOutFile2";
513 if(! -f $soapSortOutFile2){
514 push @commands,$sortComm2;
517 if(@commands > 0){
518 open OUT,">$shellFile" or die "Error in writting file: [ $shellFile ] $!\n";
519 print OUT "$sortComm1\n";
520 print OUT "$sortComm2\n";
521 close OUT;
522 my $memeryRequest = "5g";
523 if($read1Len > 60){
524 $memeryRequest = "6g";
526 my $qsub = "qsub$queue -cwd -l vf=$memeryRequest $shellFile";
527 print "Sorting SoapOut\n$qsub\n";
528 `$qsub`;
532 sub findNewFastq{
534 my %oldFq = ();
535 my %newFq = ();
536 my %oldFqListFileLibCount = ();
537 my %newFqListFileLibCount = ();
539 my @newFqFiles = `find $projectDir -name "*.fq"`;
540 chomp @newFqFiles;
541 print "#########################################\n";
542 print "Fastqs\n";
543 my $fqIndex = 0;
544 map{if($_!~/_2\.fq/){$fqIndex++;print "\t$fqIndex\t$_\n"}}@newFqFiles;
545 print "#########################################\n\n";
546 foreach my $fqFile(@newFqFiles){
547 #next if ($fqFile =~ /\/maq/); # tmep add
548 #next if ($fqFile =~ /FC20B3BAAXX/); # tmep add
549 my $fqBasename = (split '/',$fqFile)[-1];
550 $newFq{$fqBasename} = $fqFile;
551 my $libName = findLibName($fqBasename);
552 $newFqListFileLibCount{$libName}++;
555 if(-f $fqPathListFile){
556 open IN,"$fqPathListFile" or die "Error in Reading [ $fqPathListFile ] $! $.\n";
557 while(my $line = <IN>){
558 chomp $line;
559 my @info = split /\s+/,$line;
560 $oldFq{$info[0]} = $info[1];
561 my $libName = findLibName($info[0]);
562 $oldFqListFileLibCount{$libName}++;
564 close IN;
566 my $old = keys %oldFq;
567 my $new = keys %newFq;
568 if(1){
570 system("cp $fqPathListFile $fqPathListFile.old") if (-f $fqPathListFile);
571 open FQLIST,">>$fqPathListFile" or die "Error in appending [ $fqPathListFile ] $!\n";
572 foreach my $fq(sort keys %newFq){
574 my $fqBasename = (split '/',$fq)[-1];
575 next if ($fqBasename =~ /\_2.fq/);
576 if( ! exists $oldFq{$fqBasename}){
578 if(! exists $fqInformation{$fqBasename}){
579 getFqInformation($newFq{$fq});
582 if(! exists $fqInformation{$fqBasename}){
583 warn "\t!!!!!Lacking information of [ $fqBasename ] !!!!!!\n";
584 next;
585 }else{
586 $fqInformation{$fqBasename} = $fqInformation{$fqBasename} . "\t". "$newFq{$fq}";
587 my @elementCount = split /\s+/,$fqInformation{$fqBasename};
588 if(@elementCount < 8){
589 warn "\t!!!!!Lacking information of $fqBasename,please check it !!!!!\n";
590 next;
592 print FQLIST "$fqBasename\t$fqInformation{$fqBasename}\n";
593 push @newFastq,"$fqBasename\t$fqInformation{$fqBasename}"; # find new Read to run soap
597 close FQLIST;
601 sub getFqInformation{
602 my ($fqFile) = @_;
603 my $isPE = 0;
604 if($fqFile =~ /\_1.fq/){
605 $isPE = 1;
607 my $info = '';
608 my $fqBasename = (split '/',$fqFile)[-1];
609 #my $library = (split '/',$fqFile)[-4];
610 my $library = (split /\_/, $fqFile)[-2];
611 if($isPE == 0){
612 open IN,"$fqFile" or die "Error in reading [ $fqFile ] $!\n";
613 <IN>;
614 my $seq = <IN>;
615 chomp $seq;
616 my $readLen = length $seq;
617 $info = join "\t",$library,$isPE,$readLen,0,0,0,0;
618 $fqInformation{$fqBasename} = $info;
619 }else{
620 my @path = (split '/',$fqFile);
621 pop @path;
622 my $reportPath = join '/',@path;
623 pop @path;
624 my $path = join '/',@path;
625 my @html = `find $path -name Summary.htm`;
626 chomp @html;
627 my $htmlFile = $html[0];
628 my %laneInfo = ();
629 my @reportFile = glob("$reportPath/*.report");
630 my $reportFile = '';
631 if(@reportFile>0){
632 $reportFile = $reportFile[0];
634 if(defined $opts{ISconfig} && -f $opts{ISconfig}){
635 readConfig();
637 if(defined $opts{lib} && -f $opts{lib}){
638 print "read lib\n";
639 readLibInfo();
641 if(! exists $fqInformation{$fqBasename}){
642 if(defined $opts{lib} && !exists $libInfo{$library}){
643 print "no $library\n";
644 return 0;
646 if(exists $libInfo{$library}){
647 #modify 091019 open T,"$fqFile" or die "$!\n";
649 open T,"$fqFile" or print "$!\n";
650 my $tmp = <T>;
651 $tmp = <T>;
652 close T;
653 chomp $tmp;
654 my $readLen = length $tmp;
655 $fqInformation{$fqBasename} = join "\t",$library,$isPE,$readLen,$readLen,$libInfo{$library};
656 print "$fqBasename\t$fqInformation{$fqBasename}\n";
657 }elsif(-f $reportFile){
658 print "ReportFile $reportFile\n";
659 open REPORT,"$reportFile" or die "Error in reading [ $reportFile ] $!\n";
660 my $laneIndex = '';
661 my $insertSize = 0;
662 my @infoArray = ();
663 while(my $line = <REPORT>){
664 chomp $line;
665 if($line =~ /Lane\s+(\d+)/){
666 $laneIndex = $1;
668 if($line =~ /Length\s+(\d+)\;(\d+)/){
669 my $readLength1 = $1;
670 my $readLength2 = $2;
671 push @infoArray,$readLength1,$readLength2;
673 if($line =~ /InsertSize\b/){
674 if($line =~ /unknown/i){
675 if (defined $opts{ISconfig} && -f $opts{ISconfig}){
676 readConfig();
677 print "Find Insert information in $opts{ISconfig}\n";
679 if(! exists $fqInformation{$fqBasename}){
680 warn "\n!!!! No insert size information in $reportFile !!!!\n\n";
682 last;
685 $insertSize = (split /\s+/,$line)[-1];
686 push @infoArray,$insertSize;
688 if($line =~ /InsertSizeSD/){
689 my $sd = (split /\s+/,$line)[-1];
690 my ($blow,$above) = ($sd =~ /\-(\d+)\/\+(\d+)/);
693 my $lowThresh = $insertSize - $blow * 3;
694 if($lowThresh<0){ $lowThresh= 3*$blow - $insertSize;}
695 my $highThresh = $insertSize + $above * 3;
696 push @infoArray,$lowThresh,$highThresh;
699 close REPORT;
700 $laneInfo{$laneIndex} = join "\t",@infoArray;
701 $fqInformation{$fqBasename} = join "\t",$library,$isPE,$laneInfo{$laneIndex}; # without fqFullpath
703 }elsif(-f $htmlFile){
704 print "html $htmlFile\n";
705 extractInfoFromHtml($htmlFile,\%laneInfo);
706 my ($currentLaneIndex) = ($fqBasename =~ /_L(\d+)_/);
707 foreach my $laneIndex(sort{$a<=>$b} keys %laneInfo){
708 my $fqNameCopy = $fqBasename;
709 $fqNameCopy =~ s/_L${currentLaneIndex}_/_L${laneIndex}_/;
710 $fqInformation{$fqNameCopy} = join "\t",$library,$isPE,$laneInfo{$laneIndex}; # without fqFullpath
712 }else{
713 print "No Summary file found in [ $path ]\n";
718 sub extractInfoFromHtml{
720 my ($htmlFile,$hLaneInfo) = @_;
721 my $te = HTML::TableExtract->new();
722 $te->parse_file($htmlFile);
723 open OUT,">te.out" or die "$!\n";
724 foreach my $ts ($te->tables) {
725 print OUT "Table (", join(',', $ts->coords), "):\n";
726 foreach my $row ($ts->rows) {
727 print OUT join(',', @$row), "\n";
730 close OUT;
731 open IN,"te.out" or die "$!\n";
732 my $laneIndex = 0;
733 my %laneReadLength = ();
734 my @targetLane = (); # deal with some lane without ELAND alignment
735 while(my $line = <IN>){
736 chomp $line;
738 #Table (0,2):
739 if($line =~ /Table\s\(0,2\)\:/){
740 <IN>;
741 foreach my $index (1..8){
742 my $info = <IN>;
743 #7,unknown,HUMAN_refseq_eland,ELAND_PAIR,26, 26,'((CHASTITY>=0.6))',4,Lane 7
744 my @info = split ',',$info;
745 if($info =~ /ELAND_PAIR/){
746 $laneReadLength{$info[0]} = join "\t",@info[4,5];
747 push @targetLane,$info[0];
751 }elsif($line =~ /Below-median/){
752 my $info = <IN>;
753 chomp $info;
754 my $laneNum = $targetLane[$laneIndex];
755 my ($median,$lowThresh,$highThresh) = (split ',',$info)[0,3,4];
756 $$hLaneInfo{$laneNum} = join "\t",$laneReadLength{$laneNum},$median,$lowThresh,$highThresh;
757 $laneIndex++;
760 close IN;
761 unlink "te.out";
763 sub readConfig{
764 open IN,"$opts{ISconfig}" or die "Error in reading $opts{ISconfig}\n";
765 while(my $line = <IN>){
766 chomp $line;
767 my($fqBasename,$library,$isPE,$read1Length,$read2Length,$insertSize,$min,$max) = split /\s+/,$line;
768 my $lowThresh;
769 my $highThresh;
772 if(defined($min) and defined($max))
774 $lowThresh = $min; #$insertSize - $insertSize * 0.11;
775 $highThresh= $max; #$insertSize + $insertSize * 0.11;
777 else
780 $lowThresh = $insertSize - $insertSize * 0.11;
781 $highThresh= $insertSize + $insertSize * 0.11;
785 $fqInformation{$fqBasename} = join "\t",$library,$isPE,$read1Length,$read2Length,$insertSize,$lowThresh,$highThresh;
787 close IN;
789 sub findLibName{
790 my ($fqFile) = @_;
791 my $libName = '';
792 if($fqFile =~ /_1\.fq/){
793 $libName = (split '_',$fqFile)[-2];
794 }else{
795 $libName = (split '_',$fqFile)[-1];
796 $libName =~ s/\.fq//;
798 return $libName;
800 sub readLibInfo{
801 open IN,"$opts{lib}" or die "$!\n";
802 while(my $line = <IN>){
803 chomp $line;
804 #my ($lib,$ins) = split /\s+/,$line;
805 #my $min = $ins - $ins * 0.08 ;
806 #my $max = $ins + $ins * 0.08 ;
807 my ($lib, $ins, $min, $max) = split /\s+/, $line;
808 unless (defined $max) {
809 $min = $ins - $ins * 0.08 ;
810 $max = $ins + $ins * 0.08 ;
812 $libInfo{$lib} = "$ins\t$min\t$max";
814 close IN;
816 sub CompressFile{
817 my ($a_file,$dir) = @_;
818 my @files = @$a_file;
819 my @bzipComm = ();
820 if(@files>0){
821 mkdir $dir unless(-f $dir);
822 foreach my $file(@files){
823 if( ! -f $file){
824 warn "No $file \n";
825 next;
827 my $fileBasename = (split '/',$file)[-1];
828 my $outFileName = '';
829 my $outDir = '';
831 if($file =~ /\.fq$/){
832 $outDir = "$dir/Fastq";
833 mkdir $outDir unless(-f $outDir);
834 if($file =~ /\_1\.fq/ || $file =~ /\_2\.fq/){
835 $outDir .= '/PE';
836 }else{
837 $outDir .= '/SE';
839 }elsif($file =~ /soap$/ || $file =~ /single$/){
840 $outDir = "$dir/Alignment";
841 mkdir $outDir unless(-f $outDir);
842 if($file =~ /\_1\.fq\.soap/ || $file =~ /single/){
843 $outDir .= '/PE';
844 }else{
845 $outDir .= '/SE';
849 mkdir $outDir unless(-f $outDir);
850 $outFileName = "$outDir/$fileBasename.bz2";
851 next if ( -f $outFileName);
852 my $comm = "$zipBin -c $file > $outFileName";
853 push @bzipComm,$comm;
856 if(@bzipComm>0){
857 foreach my $comm(@bzipComm){
858 print "$comm\n";
859 `$comm`;