4 use lib
'/share/raid010/resequencing/soft/lib';
7 #require "/panfs/GAG/junli/raid010/pipeline/flow_script/TableExtract.pm";
9 use FindBin
qw($Bin $Script);
12 GetOptions(\%opts,"fqDir:s","outDir:s","backDir:s","gsoap:i","sort:i","statistic:i","ISconfig:s","ref1:s","ref2:s","chrOrder:s","soapVersion:i","lib:s","mem:s","queue:s");
14 #by fangxd Fri May 23 16:46:23 HKT 2008
15 #modify by lijun3 Tue Dec 1 15:38:15 CST 2009
16 #modify by HXs @ Tue May 18 17:57:18 CST 2010
18 unless(defined $opts{fqDir} && $opts{outDir} && $opts{ref1} && $opts{chrOrder}){
19 print "\n$0 : auto run soap after solexa sequencing finished and provide basic statistics on mapping result,if want to bzip the reads and alignment result,please provide the directory to deposited them\n";
21 print "\t-fqDir\t\tthe directory contain FASTQ files,usually locate in /share/raid9/solexa-work/Run_seq_dir/Project_solexa_fq/PROJECT/LIBRARY\n";
22 print "\t-outDir\t\tthe directory for soap alignment result files\n";
23 print "\t-backDir\tthe directory to compress the fastq files,if not necessary,you can omit it \n";
24 print "\t-gsoap\t\t switch for gsoap\n";
25 print "\t-sort\t\t switch for sort the soap result\n";
26 print "\t-statistic\t switch basic statistics for the alignment\n";
27 print "\t-ISconfig\t configure file for Insert Size,useful for being without Eland alignment,the config file looks like\n";
28 print "\t\t\t\t\tFastqName\tLibrary\tisPE\tRead1Length\tRead2Length\tInsertSize\tMinIS\tMaxIS\n";
29 print "\t-lib\t configure file for Insert Size\n\t\t\tformat: libraryName insertSize [min max]\n";
30 print "\t-ref1\t references in fasta format\n";
31 print "\t\t/share/raid8/fangxd/data/personGenome/data/human.fa\n";
32 print "\t-ref2\t references in binary format(soapVersion2 need)\n";
33 print "\t\t /share/raid8/fangxd/data/personGenome/data/human.fa.index\n";
34 print "\t-chrOrder\tchromomose sort order: /share/raid8/fangxd/data/personGenome/data/chroder \n";
35 print "\t-soapVersion\t 1 for old version,2 for the new version\n\n";
36 print "\t-mem\t\tMemory request in qusb,default 7g\n";
37 print "\t-queue\t\tdesignation the queue: gag_snp.q (the all.q will be added automatically \n";
41 my $projectDir = $opts{fqDir};
42 my $outputDir = $opts{outDir};
43 my $compressOutDir = $opts{backDir};
44 my $chrOrder = $opts{chrOrder};
45 my $refSeqFileFlat = $opts{ref1} if (defined $opts{ref1});
46 my $refSeqFileBinary = $opts{ref2} if (defined $opts{ref2});
49 if(defined $opts{queue}) {
50 $addque = $opts{queue};
51 $queue=" -q all.q -q $addque";
56 if(defined $opts{soapVersion}){
57 $soapVersion = $opts{soapVersion};
58 if( $soapVersion == 2 && ! defined $opts{ref2}){
59 die "SOAP2 need a pre-formated reference\n";
60 }elsif($soapVersion == 1 && ! defined $opts{ref1}){
61 die "SOAP1 need a reference in fasta format\n";
65 print "fqDir\t$opts{fqDir}\n";
66 print "outDir\t$opts{outDir}\n";
70 ######################################### globle variable
73 my %fqInformation = ();
74 mkdir $outputDir unless (-d $outputDir);
75 my $fqPathListFile = "$outputDir/fastq.list";
76 my $fqPathListFileOld = "$outputDir/fastq.list.old";
77 my $statisticsFile = "$outputDir/Statistics.txt";
78 my $statisticsFileRate = "$outputDir/Statistics.rate";
81 my $soapBinOld = "$Bin/subBin/soap1 -w 5"; # only allow 5 equal best hit to save cpu time
82 my $soapBinNew = "$Bin/subBin/soap2.20 -p 4";
83 my $zipBin = `which bzip2`;
85 my $sortSoapBin = "python $Bin/subBin/SortSoap.py";
86 # python SortSoap.py soapFile chrOrder read1Length read2Length outputFile
88 my %fastqInformation = ();
89 my @fileToCompress = ();
91 ######################################### globle variable
97 #CompressFile(\@fileToCompress,$compressOutDir) if (defined $opts{backDir});
99 sub generateSoapShell{
101 foreach my $line (@newFastq){ # #FastqID Library isPE Read1Length Read2Length Median MinInsertSize MaxInsertSize FastqFullPath
102 my $refSeqFile = $refSeqFileFlat;
104 my @info = split /\s+/,$line;
106 my $readLength = $info[3];
107 my $soapOutFile = '';
108 my $fqFullPath = $info[-1];
109 my $fqBasename = (split '/',$fqFullPath)[-1];
110 my $unmapReadsOutDir = "$outputDir/unmapReads";
111 if ($readLength>=60 && $soapVersion == 1){
112 warn "soap version 1.0 can not handle read longer than 60 bp.. exit\n";
119 my $currentOutDir = "$outputDir/soap/";
120 mkdir $currentOutDir unless (-d $currentOutDir);
122 if($info[2]==1){ # PE
123 $currentOutDir = "$outputDir/soap/PE";
124 mkdir $currentOutDir unless (-d $currentOutDir);
125 $shellFile = "$currentOutDir/soap_$fqBasename.sh";
126 my $fqFullPath2 = $fqFullPath;
127 $fqFullPath2 =~ s/\_1\.fq/\_2\.fq/;
128 $soapOutFile = "$currentOutDir/$fqBasename.soap";
129 if($soapVersion == 1){
130 my $seed = $readLength>30?12:10;
131 $soapComm = "$soapBinOld -a $fqFullPath -b $fqFullPath2 -d $refSeqFileFlat -o $soapOutFile -2 $currentOutDir/$fqBasename.single -t -c 52 -p 8 -s $seed -m $info[6] -x $info[7]";
132 }elsif($soapVersion == 2){
134 my $maxMismatch = $readLength>70?3:1;
136 $soapComm = "$soapBinNew -a $fqFullPath -b $fqFullPath2 -D $refSeqFileBinary -o $soapOutFile -2 $currentOutDir/$fqBasename.single -m $info[6] -x $info[7] $largeLib -t -s $minLength -l $seedLength -v $maxMismatch";
137 #this modify is just for no trim
138 #$soapComm = "$soapBinNew -a $fqFullPath -b $fqFullPath2 -D $refSeqFileBinary -o $soapOutFile -2 $currentOutDir/$fqBasename.single -m $info[6] -x $info[7] $largeLib -l $seedLength -v $maxMismatch";
139 if(defined($opts{gsoap})){
140 $opts{gsoap}=5 if $opts{gsoap}==0;
141 mkdir $unmapReadsOutDir unless(-d $unmapReadsOutDir);
142 $unmapReadsOutDir = "$unmapReadsOutDir/PE";
143 mkdir $unmapReadsOutDir unless(-d $unmapReadsOutDir);
144 my $unmapFile = "$unmapReadsOutDir/$fqBasename.unmap";
145 #$soapComm .= " -u $unmapFile";
146 $soapComm .= " -g $opts{gsoap} -u $unmapFile";
149 push @fileToCompress,$fqFullPath,$fqFullPath2,$soapOutFile,"$currentOutDir/$fqBasename.single";
152 my $currentOutDir = "$outputDir/soap/SE";
153 mkdir $currentOutDir unless (-d $currentOutDir);
155 $shellFile = "$currentOutDir/soap_$fqBasename.sh";
156 $soapOutFile = "$currentOutDir/$fqBasename.soap";
158 if($soapVersion == 1){
159 my $seed = $readLength>30?12:10;
160 $soapComm = "$soapBinOld -a $fqFullPath -d $refSeqFileFlat -o $soapOutFile -t -p 8 -s $seed";
161 }elsif($soapVersion == 2){
163 my $maxMismatch = $readLength>70?3:1;
165 $soapComm = "$soapBinNew -a $fqFullPath -D $refSeqFileBinary -o $soapOutFile -t -s $minLength -l $seedLength -v $maxMismatch";
166 #this modify if just for been
167 #$soapComm = "$soapBinNew -a $fqFullPath -D $refSeqFileBinary -o $soapOutFile -l $seedLength -v $maxMismatch";
168 if(defined($opts{gsoap})){
169 $opts{gsoap}=5 if $opts{gsoap}==0;
170 mkdir $unmapReadsOutDir unless(-d $unmapReadsOutDir);
171 $unmapReadsOutDir = "$unmapReadsOutDir/SE";
172 mkdir $unmapReadsOutDir unless(-d $unmapReadsOutDir);
173 my $unmapFile = "$unmapReadsOutDir/$fqBasename.unmap";
174 #$soapComm .= " -u $unmapFile";
175 $soapComm .= " -g $opts{gsoap} -u $unmapFile";
178 push @fileToCompress,$fqFullPath,$soapOutFile;
180 my $memeryRequest = '';
181 #if(defined $opts{mem}){
182 # $memeryRequest = $opts{mem};
183 # $memeryRequest =~ s/g//i;
185 $memeryRequest = $soapVersion==1?"14g":"7g";
187 if(defined $opts{mem}){
188 $memeryRequest = $opts{mem};
189 #$memeryRequest =~ s/g//i;
192 $soapCommand{$shellFile} = join "\t",$memeryRequest,$soapComm;
193 $fastqInformation{$shellFile} = $line . "\t" . $soapOutFile;
201 foreach my $shellFile (keys %soapCommand){
202 my $soapOutFile = (split "\t",$fastqInformation{$shellFile})[-1];
205 $jobIDs{"psudoJobID.$psudoJobID"} = $fastqInformation{$shellFile};
207 print "$shellFile\n";
208 my ($memeryRequest,$soapComm) = split "\t",$soapCommand{$shellFile};
209 open OUT,">$shellFile" or die "Error in writing [ $shellFile ]:$!\n";
210 print OUT "$soapComm\n";
213 my $qsub = "qsub$queue -cwd -l vf=$memeryRequest $shellFile";
215 print "$qsub jobID $jobID\n";
216 $jobID = (split /\s+/,$jobID)[2];
217 while($jobID !~ /\d+/){
219 $jobID = (split /\s+/,$jobID)[2];
220 print "\tresub $qsub\n";
223 $jobIDs{$jobID} = $fastqInformation{$shellFile};
227 waitJobsDoneSortSoap2(%jobIDs);
230 sub waitJobsDoneSortSoap2{
234 foreach my $jobid (keys %jobsID){
235 my $qstat = `qstat -j $jobid`;
238 my @qstat_lines = split /\n/, $qstat;
241 if(defined($qstat_lines[1]))
243 # print "test : $qstat_lines[1]\n";
244 my $job = (split /\s+/,$qstat_lines[1])[1];
246 # print "job: $job\n";
252 print "Finish jobid $jobid\n";
253 sortSoap($jobsID{$jobid}) if(defined $opts{sort});
254 mapStatics($jobsID{$jobid}) if (defined $opts{statistic});
255 delete $jobsID{$jobid};
278 sub waitJobsDoneSortSoap{
281 while( keys %jobsID > 0){
284 my @qstatLines = split /\n/,$qstat;
289 my %runningJobs = ();
290 foreach my $line (@qstatLines){
291 my $job = (split /\s+/,$line)[0];
294 my $qresub = `qresub $job`;
296 my $qresubJobID = (split /\s+/,$qresub)[2];
297 $jobsID{$qresubJobID} = $jobsID{$job};
298 delete $jobsID{$job};
300 $runningJobs{$qresubJobID} = 1;
302 if(exists $jobsID{$job}){
303 $runningJobs{$job} = 1;
307 foreach my $jobID(keys %jobsID){
308 if(! exists $runningJobs{$jobID}){
309 print "Finish jobid $jobID\n";
310 if ($jobID !~ /^\d+$/ && $jobID !~ /psudoJobID/){
311 delete $jobsID{$jobID};
314 sortSoap($jobsID{$jobID}) if(defined $opts{sort});
315 mapStatics($jobsID{$jobID}) if (defined $opts{statistic});
316 delete $jobsID{$jobID};
330 #die "exit in mapStatics\n";
332 my @info = split /\s+/,$line;
333 my $fastqID = $info[0];
334 my $library = $info[1];
336 my %finishedStatistics = ();
337 if(-f $statisticsFile){
338 open IN,"$statisticsFile" or die "Error in reading [ $statisticsFile ] $!\n";
339 while(my $line = <IN>){
340 if ($line !~ /^\d+/){
344 $finishedStatistics{(split /\s+/,$line)[0]} = 1;
348 return 0 if (exists $finishedStatistics{$fastqID});
350 print "$fastqID\tStatistics...\n";
351 my ($fastqFile,$soapOutFile) = @info[-2,-1];
352 my @soapOutFiles = ();
353 push @soapOutFiles,$soapOutFile;
355 my $soapSingleFile = $soapOutFile;
356 $soapSingleFile =~ s/\.soap/\.single/;
357 push @soapOutFiles,$soapSingleFile;
361 $readLength{a} = $info[3];
362 $readLength{b} = $info[4];
364 my $tmpCount = `wc -l $fastqFile`;
366 my $readCount = (split /\s+/,$tmpCount)[0];
368 my $baseCount = $readCount * $readLength{a};
370 $baseCount = $readCount * ($readLength{a}+$readLength{b});
374 my $mapReadsCount = 0;
375 my $mapBasesCount = 0;
376 my $uniqMapReadsCount = 0;
377 my $uniqMapBasesCount = 0;
378 my $trimReadsCount = 0;
379 my $trimBasesCount = 0;
380 my %mismatchCount = ();
382 $mismatchCount{0} = 0;
383 $mismatchCount{1} = 0;
384 $mismatchCount{2} = 0;
386 my $peMapReadsCount = 0;
387 my $peMapReadsRate = 0;
389 my $peMapBasesCount = 0;
391 foreach my $soapOutFile(@soapOutFiles){
392 print "\treading $soapOutFile\n";
393 open IN,"$soapOutFile" or die "Error in reading file [ $soapOutFile ] $!\n";
396 if($isPE && $soapOutFile =~ /\.soap/){
399 my @checkDuplicate = ();
400 while(my $line = <IN>){
401 @lines = split /\s+/,$line;
402 next if (@info <10 || @info > 12);
403 $mismatchCount{$lines[9]}++;
405 $mapBasesCount += $lines[5];
407 $peMapBasesCount += $lines[5];
410 push @checkDuplicate,"$info[0],$info[7],$info[8]";
414 $uniqMapReadsCount++;
415 $uniqMapBasesCount+=$lines[5];
417 if($lines[5] != $readLength{$lines[4]}){
419 $trimBasesCount+=$readLength{$lines[4]} - $lines[5];
426 foreach my $chr(keys %pcrDup){
428 for(my $i=0;$i<@checkDuplicate;$i+=2){
429 my($id1,$chr1,$pos1) = split ',',$checkDuplicate[$i];
430 my($id2,$chr2,$pos2) = split ',',$checkDuplicate[$i+1];
431 if($id1 eq $id2 && $chr1 eq $chr2){
434 $key = "$pos2,$pos1";
436 $key = "$pos1,$pos2";
438 $pcrDup{$chr1}{$key}++;
441 @checkDuplicate = (); # release memory
442 foreach my $chr(keys %pcrDup){
443 foreach my $pair(keys %{$pcrDup{$chr}}){
445 if( $pcrDup{$chr}{$pair} > 1 ){
446 $dup += $pcrDup{$chr}{$pair} - 1;
454 $dupRate = sprintf("%.4f",$dup / $total * 100);
458 if(-f $statisticsFile){
459 open STAT,">>$statisticsFile" or die "Error in writting [ $statisticsFile ] $!\n";
460 open RATE,">>$statisticsFileRate" or die "Error in writting [ $statisticsFileRate ] $!\n";
462 open STAT,">$statisticsFile" or die "Error in writting [ $statisticsFile ] $!\n";
463 open RATE,">$statisticsFileRate" or die "Error in writting [ $statisticsFileRate ] $!\n";
464 my $title1 = "Fastq\tLibrary\tIsPE\tLength1\tLength2\t#Reads\t#Bases\t#MapableReads\t#MapableBases\t#UniqMapReads\t#PEMapableBases\t#UniqMapableBases\t#TrimedBases\t#U0Reads\t#U1Reads\t#U2Reads\n";
465 print STAT "$title1\n";
466 my $title2 = "Fastq\tLibrary\tIsPE\tLength1\tLength2\t#Reads\t#Bases\tMapableReads(%)\tMapableBases(%)\tpeMapBasesRate(%)\tUniqMapRead\tUniqMapableBases(%)\tTrimedBases(%)\tU0Reads(%)\tU1Reads(%)\tU2Reads(%)\tPCRdupRate\n";
467 print RATE "$title2\n";
470 my $mapReadsRate = sprintf("%.2f",$mapReadsCount / $readCount * 100);
471 my $mapBasesRate = sprintf("%.2f",$mapBasesCount / $baseCount * 100);
472 my $uniqBasesRate = sprintf("%.2f",$uniqMapBasesCount / $mapBasesCount * 100);
473 my $trimBasesRate = sprintf("%.2f",$trimBasesCount / $mapBasesCount * 100);
474 my $peMapBasesRate = sprintf("%.2f",$peMapBasesCount / $mapBasesCount * 100);
475 my $u0Rate = sprintf("%.2f",$mismatchCount{0} / $readCount * 100);
476 my $u1Rate = sprintf("%.2f",$mismatchCount{1} / $readCount * 100);
477 my $u2Rate = sprintf("%.2f",$mismatchCount{2} / $readCount * 100);
478 my $uniqMapReadRate = sprintf("%.2f",$uniqMapReadsCount/$mapReadsCount);
480 print STAT "$fastqID\t$library\t$isPE\t$readLength{a}\t$readLength{b}\t$readCount\t$baseCount\t$mapReadsCount\t$mapBasesCount\t$peMapBasesCount\t$uniqMapReadsCount\t$uniqMapBasesCount\t$trimBasesCount\t$mismatchCount{0}\t$mismatchCount{1}\t$mismatchCount{2}\n";
481 print RATE "$fastqID\t$library\t$isPE\t$readLength{a}\t$readLength{b}\t$readCount\t$baseCount\t$mapReadsRate\t$mapBasesRate\t$peMapBasesRate\t$uniqMapReadRate\t$uniqBasesRate\t$trimBasesRate\t$u0Rate\t$u1Rate\t$u2Rate\t$dupRate\n";
489 # 081103_I328_FC30JLNAAXX_L5_HSKnomeR4AADCABPE_1.fq HSKnomeR4AADCABPE 1 75 75 222 198 249 /share/raid9/solexa-work/Run_seq_dir/Project_solexa_fq/SZC08002_HSKnomeR4_resequencing_DNA/HSKnomeR4AADCABPE/081103_I328_0002_FC30JLNAAXX/081103_I328_FC30JLNAAXX_L5_HSKnomeR4AADCABPE/081103_I328_FC30JLNAAXX_L5_HSKnomeR4AADCABPE_1.fq /share/raid11/resequencing/Knome4/NS12665//PE/081103_I328_FC30JLNAAXX_L5_HSKnomeR4AADCABPE_1.fq.soap
490 my @info = split /\s+/,$info;
492 my ($read1Len,$read2Len) = @info[3,4];
493 my $soapOutFile = $info[-1];
494 my $soapOutFileBasename = (split '/',$soapOutFile)[-1];
495 my $type = $isPE>0?"PE":"SE";
496 my $sortOutDir = "$outputDir/SoapSort";
497 mkdir $sortOutDir unless (-d $sortOutDir);
498 $sortOutDir = "$outputDir/SoapSort/$type";
499 mkdir $sortOutDir unless (-d $sortOutDir);
500 my $shellFile = "$sortOutDir/sort_$soapOutFileBasename.sh";
501 my $soapSortOutFile = "$sortOutDir/$soapOutFileBasename.sort";
502 #my $sortComm1 = "$sortSoapBin -in $soapOutFile -out $soapSortOutFile -order $chrOrder -a $read1Len -b $read2Len";
503 my $sortComm1 = "$sortSoapBin $soapOutFile $chrOrder $read1Len $read2Len $soapSortOutFile";
506 push @commands,$sortComm1 if(! -f $soapSortOutFile);
508 my $soapSortOutFile2 = $soapSortOutFile;
509 my $soapOutFile2 = $soapOutFile;
510 $soapSortOutFile2 =~ s/\.soap/\.single/g;
511 $soapOutFile2 =~ s/\.soap/\.single/g;
512 $sortComm2 = "$sortSoapBin $soapOutFile2 $chrOrder $read1Len $read2Len $soapSortOutFile2";
513 if(! -f $soapSortOutFile2){
514 push @commands,$sortComm2;
518 open OUT,">$shellFile" or die "Error in writting file: [ $shellFile ] $!\n";
519 print OUT "$sortComm1\n";
520 print OUT "$sortComm2\n";
522 my $memeryRequest = "5g";
524 $memeryRequest = "6g";
526 my $qsub = "qsub$queue -cwd -l vf=$memeryRequest $shellFile";
527 print "Sorting SoapOut\n$qsub\n";
536 my %oldFqListFileLibCount = ();
537 my %newFqListFileLibCount = ();
539 my @newFqFiles = `find $projectDir -name "*.fq"`;
541 print "#########################################\n";
544 map{if($_!~/_2\.fq/){$fqIndex++;print "\t$fqIndex\t$_\n"}}@newFqFiles;
545 print "#########################################\n\n";
546 foreach my $fqFile(@newFqFiles){
547 #next if ($fqFile =~ /\/maq/); # tmep add
548 #next if ($fqFile =~ /FC20B3BAAXX/); # tmep add
549 my $fqBasename = (split '/',$fqFile)[-1];
550 $newFq{$fqBasename} = $fqFile;
551 my $libName = findLibName($fqBasename);
552 $newFqListFileLibCount{$libName}++;
555 if(-f $fqPathListFile){
556 open IN,"$fqPathListFile" or die "Error in Reading [ $fqPathListFile ] $! $.\n";
557 while(my $line = <IN>){
559 my @info = split /\s+/,$line;
560 $oldFq{$info[0]} = $info[1];
561 my $libName = findLibName($info[0]);
562 $oldFqListFileLibCount{$libName}++;
566 my $old = keys %oldFq;
567 my $new = keys %newFq;
570 system("cp $fqPathListFile $fqPathListFile.old") if (-f $fqPathListFile);
571 open FQLIST,">>$fqPathListFile" or die "Error in appending [ $fqPathListFile ] $!\n";
572 foreach my $fq(sort keys %newFq){
574 my $fqBasename = (split '/',$fq)[-1];
575 next if ($fqBasename =~ /\_2.fq/);
576 if( ! exists $oldFq{$fqBasename}){
578 if(! exists $fqInformation{$fqBasename}){
579 getFqInformation($newFq{$fq});
582 if(! exists $fqInformation{$fqBasename}){
583 warn "\t!!!!!Lacking information of [ $fqBasename ] !!!!!!\n";
586 $fqInformation{$fqBasename} = $fqInformation{$fqBasename} . "\t". "$newFq{$fq}";
587 my @elementCount = split /\s+/,$fqInformation{$fqBasename};
588 if(@elementCount < 8){
589 warn "\t!!!!!Lacking information of $fqBasename,please check it !!!!!\n";
592 print FQLIST "$fqBasename\t$fqInformation{$fqBasename}\n";
593 push @newFastq,"$fqBasename\t$fqInformation{$fqBasename}"; # find new Read to run soap
601 sub getFqInformation{
604 if($fqFile =~ /\_1.fq/){
608 my $fqBasename = (split '/',$fqFile)[-1];
609 #my $library = (split '/',$fqFile)[-4];
610 my $library = (split /\_/, $fqFile)[-2];
612 open IN,"$fqFile" or die "Error in reading [ $fqFile ] $!\n";
616 my $readLen = length $seq;
617 $info = join "\t",$library,$isPE,$readLen,0,0,0,0;
618 $fqInformation{$fqBasename} = $info;
620 my @path = (split '/',$fqFile);
622 my $reportPath = join '/',@path;
624 my $path = join '/',@path;
625 my @html = `find $path -name Summary.htm`;
627 my $htmlFile = $html[0];
629 my @reportFile = glob("$reportPath/*.report");
632 $reportFile = $reportFile[0];
634 if(defined $opts{ISconfig} && -f $opts{ISconfig}){
637 if(defined $opts{lib} && -f $opts{lib}){
641 if(! exists $fqInformation{$fqBasename}){
642 if(defined $opts{lib} && !exists $libInfo{$library}){
643 print "no $library\n";
646 if(exists $libInfo{$library}){
647 #modify 091019 open T,"$fqFile" or die "$!\n";
649 open T,"$fqFile" or print "$!\n";
654 my $readLen = length $tmp;
655 $fqInformation{$fqBasename} = join "\t",$library,$isPE,$readLen,$readLen,$libInfo{$library};
656 print "$fqBasename\t$fqInformation{$fqBasename}\n";
657 }elsif(-f $reportFile){
658 print "ReportFile $reportFile\n";
659 open REPORT,"$reportFile" or die "Error in reading [ $reportFile ] $!\n";
663 while(my $line = <REPORT>){
665 if($line =~ /Lane\s+(\d+)/){
668 if($line =~ /Length\s+(\d+)\;(\d+)/){
669 my $readLength1 = $1;
670 my $readLength2 = $2;
671 push @infoArray,$readLength1,$readLength2;
673 if($line =~ /InsertSize\b/){
674 if($line =~ /unknown/i){
675 if (defined $opts{ISconfig} && -f $opts{ISconfig}){
677 print "Find Insert information in $opts{ISconfig}\n";
679 if(! exists $fqInformation{$fqBasename}){
680 warn "\n!!!! No insert size information in $reportFile !!!!\n\n";
685 $insertSize = (split /\s+/,$line)[-1];
686 push @infoArray,$insertSize;
688 if($line =~ /InsertSizeSD/){
689 my $sd = (split /\s+/,$line)[-1];
690 my ($blow,$above) = ($sd =~ /\-(\d+)\/\+(\d+)/);
693 my $lowThresh = $insertSize - $blow * 3;
694 if($lowThresh<0){ $lowThresh= 3*$blow - $insertSize;}
695 my $highThresh = $insertSize + $above * 3;
696 push @infoArray,$lowThresh,$highThresh;
700 $laneInfo{$laneIndex} = join "\t",@infoArray;
701 $fqInformation{$fqBasename} = join "\t",$library,$isPE,$laneInfo{$laneIndex}; # without fqFullpath
703 }elsif(-f $htmlFile){
704 print "html $htmlFile\n";
705 extractInfoFromHtml($htmlFile,\%laneInfo);
706 my ($currentLaneIndex) = ($fqBasename =~ /_L(\d+)_/);
707 foreach my $laneIndex(sort{$a<=>$b} keys %laneInfo){
708 my $fqNameCopy = $fqBasename;
709 $fqNameCopy =~ s/_L${currentLaneIndex}_/_L${laneIndex}_/;
710 $fqInformation{$fqNameCopy} = join "\t",$library,$isPE,$laneInfo{$laneIndex}; # without fqFullpath
713 print "No Summary file found in [ $path ]\n";
718 sub extractInfoFromHtml{
720 my ($htmlFile,$hLaneInfo) = @_;
721 my $te = HTML::TableExtract->new();
722 $te->parse_file($htmlFile);
723 open OUT,">te.out" or die "$!\n";
724 foreach my $ts ($te->tables) {
725 print OUT "Table (", join(',', $ts->coords), "):\n";
726 foreach my $row ($ts->rows) {
727 print OUT join(',', @$row), "\n";
731 open IN,"te.out" or die "$!\n";
733 my %laneReadLength = ();
734 my @targetLane = (); # deal with some lane without ELAND alignment
735 while(my $line = <IN>){
739 if($line =~ /Table\s\(0,2\)\:/){
741 foreach my $index (1..8){
743 #7,unknown,HUMAN_refseq_eland,ELAND_PAIR,26, 26,'((CHASTITY>=0.6))',4,Lane 7
744 my @info = split ',',$info;
745 if($info =~ /ELAND_PAIR/){
746 $laneReadLength{$info[0]} = join "\t",@info[4,5];
747 push @targetLane,$info[0];
751 }elsif($line =~ /Below-median/){
754 my $laneNum = $targetLane[$laneIndex];
755 my ($median,$lowThresh,$highThresh) = (split ',',$info)[0,3,4];
756 $$hLaneInfo{$laneNum} = join "\t",$laneReadLength{$laneNum},$median,$lowThresh,$highThresh;
764 open IN,"$opts{ISconfig}" or die "Error in reading $opts{ISconfig}\n";
765 while(my $line = <IN>){
767 my($fqBasename,$library,$isPE,$read1Length,$read2Length,$insertSize,$min,$max) = split /\s+/,$line;
772 if(defined($min) and defined($max))
774 $lowThresh = $min; #$insertSize - $insertSize * 0.11;
775 $highThresh= $max; #$insertSize + $insertSize * 0.11;
780 $lowThresh = $insertSize - $insertSize * 0.11;
781 $highThresh= $insertSize + $insertSize * 0.11;
785 $fqInformation{$fqBasename} = join "\t",$library,$isPE,$read1Length,$read2Length,$insertSize,$lowThresh,$highThresh;
792 if($fqFile =~ /_1\.fq/){
793 $libName = (split '_',$fqFile)[-2];
795 $libName = (split '_',$fqFile)[-1];
796 $libName =~ s/\.fq//;
801 open IN,"$opts{lib}" or die "$!\n";
802 while(my $line = <IN>){
804 #my ($lib,$ins) = split /\s+/,$line;
805 #my $min = $ins - $ins * 0.08 ;
806 #my $max = $ins + $ins * 0.08 ;
807 my ($lib, $ins, $min, $max) = split /\s+/, $line;
808 unless (defined $max) {
809 $min = $ins - $ins * 0.08 ;
810 $max = $ins + $ins * 0.08 ;
812 $libInfo{$lib} = "$ins\t$min\t$max";
817 my ($a_file,$dir) = @_;
818 my @files = @$a_file;
821 mkdir $dir unless(-f $dir);
822 foreach my $file(@files){
827 my $fileBasename = (split '/',$file)[-1];
828 my $outFileName = '';
831 if($file =~ /\.fq$/){
832 $outDir = "$dir/Fastq";
833 mkdir $outDir unless(-f $outDir);
834 if($file =~ /\_1\.fq/ || $file =~ /\_2\.fq/){
839 }elsif($file =~ /soap$/ || $file =~ /single$/){
840 $outDir = "$dir/Alignment";
841 mkdir $outDir unless(-f $outDir);
842 if($file =~ /\_1\.fq\.soap/ || $file =~ /single/){
849 mkdir $outDir unless(-f $outDir);
850 $outFileName = "$outDir/$fileBasename.bz2";
851 next if ( -f $outFileName);
852 my $comm = "$zipBin -c $file > $outFileName";
853 push @bzipComm,$comm;
857 foreach my $comm(@bzipComm){