limit fstBC to 30bp in Python3 ver.
[GalaxyCodeBases.git] / perl / bsvf / lib / BSuitLib.pm
blob556a91d275cc82e6391ad8fd961633d9169641ba
1 package main;
2 use strict;
3 use warnings;
4 require BSuitInc;
5 use File::Path 2.08; # http://search.cpan.org/~riche/File-Path-2.11/lib/File/Path.pm#API_CHANGES
6 use File::Basename;
7 use Galaxy;
8 use Galaxy::IO;
9 use Galaxy::IO::FASTA;
10 #use JSON;
12 sub do_pre() {
13 #my $Config = $_[0];
14 #ddx \$Config;
15 File::Path::make_path("$main::RootPath/Ref",{verbose => 0,mode => 0755});
16 my $Refprefix = getRef2char($main::HostRefName,$main::VirusRefName);
17 my $Refile = "$main::RootPath/Ref/$main::RefFilesSHA/$Refprefix.fa";
18 #warn "[$main::HostRefName,$main::VirusRefName] -> $Refprefix [$main::RefFilesSHA]\n";
19 my $found = 0;
20 if ( -f "$main::RootPath/Ref/Ref.ini" ) {
21 $main::RefConfig->read("$main::RootPath/Ref/Ref.ini");
22 $found=1 if exists $main::RefConfig->{$main::RefFilesSHA};
24 if ($found==0) {
25 File::Path::make_path("$main::RootPath/Ref/$main::RefFilesSHA",{verbose => 0,mode => 0755});
26 $main::RefConfig->{$main::RefFilesSHA} = { Refilename => $Refile, RefChrIDs => '', VirusChrIDs => '' };
27 open O,'>',$Refile or die $!;
28 my $FH=openfile($main::Config->{'RefFiles'}->{'HostRef'});
29 my @ChrIDs;
30 while (my $ret = FastaReadNext($FH)) {
31 if ($$ret[0] =~ /(^chrEBV$)|(Un[_-])|(random$)/) {
32 warn "[!] skip Ref[$$ret[0]].\n";
33 next;
35 my $len = length $$ret[1];
36 warn "[!] read Ref[$$ret[0]], $len bp.\n";
37 print O ">$$ret[0]\n$$ret[1]\n";
38 $main::RefConfig->{$main::RefFilesSHA}->{$$ret[0]} = $len;
39 push @ChrIDs,$$ret[0];
41 $main::RefConfig->{$main::RefFilesSHA}->{RefChrIDs} = join(',',@ChrIDs);
42 close $FH;
43 @ChrIDs=();
44 $FH=openfile($main::Config->{'RefFiles'}->{'VirusRef'});
45 while (my $ret = FastaReadNext($FH)) {
46 my $len = length $$ret[1];
47 warn "[!] read Virus[$$ret[0]], $len bp.\n";
48 print O ">$$ret[0]\n$$ret[1]\n";
49 $main::RefConfig->{$main::RefFilesSHA}->{$$ret[0]} = $len;
50 push @ChrIDs,$$ret[0];
52 $main::RefConfig->{$main::RefFilesSHA}->{VirusChrIDs} = join(',',@ChrIDs);
53 close $FH;
54 $main::RefConfig->write("$main::RootPath/Ref/Ref.ini");
55 } else {
56 warn "[!] Already Read References Pairs:[$main::HostRefName,$main::VirusRefName].\n";
58 #ddx \$main::RefConfig;
59 my @cmd;
60 open O,'>',"$main::RootPath/${main::ProjectID}_index.sh" or die $!;
61 print O "#!/bin/sh\n\nexport $main::PathPrefix\n\n";
62 push @cmd,"samtools faidx $Refile";
63 push @cmd,"$RealBin/bin/bwameth.py index $Refile";
64 push @cmd,"#python2 $RealBin/bin/BSseeker2/bs_seeker2-build.py --aligner bowtie2 -f $Refile -d ${Refile}2";
65 push @cmd,"bwa index $Refile";
66 print O join("\n",@cmd),"\n";
67 close O;
68 chmod 0755,"$main::RootPath/${main::ProjectID}_index.sh";
69 warn "[!] Please run [$main::RootPath/${main::ProjectID}_index.sh] to build index.\n";
72 sub do_aln() {
73 my $Refilename = warnFileExist($main::RefConfig->{$main::RefFilesSHA}->{'Refilename'});
74 #warn "$Refilename\n";
75 my (%tID,%FQc,%maxReadNum);
76 for (@{$main::Config->{$main::FileData}->{'='}}) {
77 /([^.]+)\.(\d)/ or die;
78 $tID{$1}{$2} = $_;
79 @{$FQc{$1}{$2}} = split /\s*,\s*/,$main::Config->{$main::FileData}->{$_};
80 $maxReadNum{$1} = 0 unless exists $maxReadNum{$1};
81 $maxReadNum{$1} = $2 if $maxReadNum{$1} < $2;
83 #ddx \%tID;
84 # if ($main::FileData eq 'Base4Files') {
85 # ;
86 # }
87 File::Path::make_path("$main::RootPath/${main::ProjectID}_aln",{verbose => 0,mode => 0755});
88 open O,'>',"$main::RootPath/${main::ProjectID}_aln.sh" or die $!;
89 print O "#!/bin/sh\n\nexport THREADSCNT=24\nexport $main::PathPrefix\n\n";
91 for my $k (keys %tID) {
92 if ($maxReadNum{$k} == 1) { # SE
93 my @FQ2c;
94 for my $f1 (@{$FQc{$k}{1}}) {
95 #my $fID = basename($f1);
96 my $f2 = " ";
97 push @FQ2c,$f2;
99 @{$FQc{$k}{2}} = @FQ2c;
102 #ddx \%FQc;
103 for my $k (keys %tID) {
104 #my @FQ1c = split /\s*,\s*/,$main::Config->{$main::FileData}->{$tID{$k}{1}};
105 #my @FQ2c = split /\s*,\s*/,$main::Config->{$main::FileData}->{$tID{$k}{2}};
106 my @FQ1c = @{$FQc{$k}{1}}; my @FQ2c = @{$FQc{$k}{2}};
107 die "[x] $main::FileData not paired ! [@FQ1c],[@FQ2c]\n" unless $#FQ1c == $#FQ1c;
108 my ($cmd,$cmd2,$cmd3);
109 if (@FQ1c == 1) {
110 $cmd = <<"CMD";
111 $RealBin/bin/bwameth.py --reference $Refilename -t \${THREADSCNT} --read-group $k @{[warnFileExist($FQ1c[0],$FQ2c[0])]} 2>$main::RootPath/${main::ProjectID}_aln/$k.log | samtools view -bS - | samtools sort -n -m 2415919104 - -T $main::RootPath/${main::ProjectID}_aln/$k -o $main::RootPath/${main::ProjectID}_aln/$k.bam
113 $cmd2 = <<"CMD";
114 python2 $RealBin/bin/BSseeker2/bs_seeker2-align.py --aligner bowtie2 -d ${Refilename}2 -g $Refilename --bt2--rg-id $k -1 $FQ1c[0] -2 $FQ2c[0] -o $main::RootPath/${main::ProjectID}_aln/$k.bam --temp_dir=$main::RootPath/${main::ProjectID}_aln >/dev/null
116 $cmd3 = <<"CMD";
117 bwa mem -MY $Refilename -t \${THREADSCNT} -R '\@RG\\tID:$k\\tSM:$k' @{[warnFileExist($FQ1c[0],$FQ2c[0])]} 2>$main::RootPath/${main::ProjectID}_aln/$k.log | samtools view -bS - | samtools sort -n -m 2415919104 - -T $main::RootPath/${main::ProjectID}_aln/$k -o $main::RootPath/${main::ProjectID}_aln/$k.bam
119 if ($main::Aligner eq 'bwa-meth') {
120 print O $cmd;
121 } elsif ($main::Aligner eq 'BSseeker2') {
122 print O $cmd2;
123 } elsif ($main::Aligner eq 'bwa') {
124 print O $cmd3;
125 } else {die;}
126 } else {
127 my @theBams;
128 for my $i (0 .. $#FQ1c) {
129 my $fID = basename($FQ1c[$i]);
130 $fID =~ s/\.fq(\.gz)?$//i;
131 $cmd = <<"CMD";
132 $RealBin/bin/bwameth.py --reference $Refilename -t \${THREADSCNT} --read-group '\@RG\\tID:${k}_${i}_${fID}\\tSM:$k' @{[warnFileExist($FQ1c[$i],$FQ2c[$i])]} 2>$main::RootPath/${main::ProjectID}_aln/${k}_${i}_${fID}.log | samtools view -bS - | samtools sort -n -m 2415919104 - -T $main::RootPath/${main::ProjectID}_aln/${k}_${i}_${fID} -o $main::RootPath/${main::ProjectID}_aln/${k}_${i}_${fID}.bam
134 $cmd2 = <<"CMD";
135 python2 $RealBin/bin/BSseeker2/bs_seeker2-align.py --aligner bowtie2 -d ${Refilename}2 -g $Refilename --bt2--rg-id '\@RG\\tID:${k}_${i}_${fID}\\tSM:$k' -1 $FQ1c[$i] -2 $FQ2c[$i] -o $main::RootPath/${main::ProjectID}_aln/${k}_${i}_${fID}.bam --temp_dir=$main::RootPath/${main::ProjectID}_aln >/dev/null
137 $cmd3 = <<"CMD";
138 bwa mem -MY $Refilename -t \${THREADSCNT} -R '\@RG\\tID:${k}_${i}_${fID}\\tSM:$k' @{[warnFileExist($FQ1c[$i],$FQ2c[$i])]} 2>$main::RootPath/${main::ProjectID}_aln/${k}_${i}_${fID}.log | samtools view -bS - | samtools sort -n -m 2415919104 - -T $main::RootPath/${main::ProjectID}_aln/${k}_${i}_${fID} -o $main::RootPath/${main::ProjectID}_aln/${k}_${i}_${fID}.bam
140 push @theBams,"$main::RootPath/${main::ProjectID}_aln/${k}_${i}_${fID}.bam";
141 if ($main::Aligner eq 'bwa-meth') {
142 print O $cmd;
143 } elsif ($main::Aligner eq 'BSseeker2') {
144 print O $cmd2;
145 } elsif ($main::Aligner eq 'bwa') {
146 print O $cmd3;
147 } else {die;}
149 my $theBamsJ = join(' ',@theBams);
150 $cmd = <<"CMD";
151 samtools merge -n -l 9 $main::RootPath/${main::ProjectID}_aln/$k.bam $theBamsJ
153 print O $cmd;
155 $cmd = <<"CMD";
156 $RealBin/bin/bamfilter.pl 30 5 $main::RootPath/${main::ProjectID}_aln/$k.bam $main::RootPath/${main::ProjectID}_aln/S_${k}.bam
157 samtools sort -l 0 -m 2G $main::RootPath/${main::ProjectID}_aln/S_${k}.bam -T $main::RootPath/${main::ProjectID}_aln/P_${k} -o $main::RootPath/${main::ProjectID}_aln/P_${k}.bam
158 samtools index $main::RootPath/${main::ProjectID}_aln/P_${k}.bam
160 =pod
161 $cmd = <<"CMD";
162 samtools sort -m 2415919104 -n $main::RootPath/${main::ProjectID}_aln/$k.bam -O bam -T $main::RootPath/${main::ProjectID}_aln/$k.sn >$main::RootPath/${main::ProjectID}_aln/$k.sn.bam 2>>$main::RootPath/${main::ProjectID}_aln/$k.log
164 samtools view -b -F256 $main::RootPath/${main::ProjectID}_aln/$k.sn.bam >$main::RootPath/${main::ProjectID}_aln/$k.snPstat.bam 2>>$main::RootPath/${main::ProjectID}_aln/$k.log
166 sync
167 sleep 60
170 =cut
171 print O $cmd;
173 # Grep step0 Begin
174 File::Path::make_path("$main::RootPath/${main::ProjectID}_grep",{verbose => 0,mode => 0755});
175 my $WorkINI = Galaxy::IO::INI->new();
176 $WorkINI->{'Output'} = $main::Config->{'Output'};
177 $WorkINI->{'Ref'} = $main::RefConfig->{$main::RefFilesSHA};
178 $WorkINI->{'InsertSizes'} = $main::Config->{'InsertSizes'};
179 my %BamFiles;
180 for my $k (keys %tID) {
181 my $myBamf = "$main::RootPath/${main::ProjectID}_aln/P_$k.bam";
182 $BamFiles{$k} = $myBamf;
184 $WorkINI->{'BamFiles'} = \%BamFiles;
185 $WorkINI->write("$main::RootPath/${main::ProjectID}_grep/ToGrep.ini");
186 my $cli = "$RealBin/bin/bsanalyser -p grep $main::RootPath/${main::ProjectID}_grep/ToGrep.ini";
187 print O "\n$cli\n";
188 print O "\n$RealBin/bsuit grep $main::fullcfgfile\n$RealBin/bsuit analyse $main::fullcfgfile\n";
189 # Grep step0 End
190 close O;
191 chmod 0755,"$main::RootPath/${main::ProjectID}_aln.sh";
192 warn "[!] Please run [$main::RootPath/${main::ProjectID}_aln.sh] to do the aln.\n"
195 sub do_grep($) {
196 my $cfgfile = $_[0];
197 my (%tID,%tFH);
198 for (@{$main::Config->{$main::FileData}->{'='}}) {
199 /([^.]+)\.(\d)/ or die;
200 $tID{$1}{$2} = $_;
202 # "780_T" => { 1 => "780_T.1", 2 => "780_T.2" },
203 # "s01_P" => { 1 => "s01_P.1", 2 => "s01_P.2" },
204 for my $k (keys %tID) {
205 my $myBamf = "$main::RootPath/${main::ProjectID}_grep/$k.bam";
206 print "[$myBamf] "; # 内部有错误示意。换行后移
207 open OUT,'>',"${myBamf}.grep" or die "Error opening ${myBamf}.grep: $!\n";
208 open( IN,"-|","$main::PathPrefix samtools view $myBamf") or die "Error opening $myBamf: $!\n";
209 my ($lastgid,@hReads,@vReads,%Vchr);
210 my ($fhReads,$rhReads,$fvReads,$rvReads,$flagHV,$strandOdd,$strandEven)=(0,0,0,0,0,0,0); # /\bYD:Z:f\b/
211 while (<IN>) {
212 chomp;
213 my @dat = split /\t/;
214 /\tZc:i:(\d+)\b/ or die "[x]TAG:Zc:i not found.\n";
215 my $thisGroup = $1;
216 #print "$lastgid <- $thisGroup\n";
217 #print $thisGroup,"\t",join("][",@dat),"\n";
218 if ($dat[6] ne '=') { # 假设染色体不同就算一边人一边病毒(故也统计了两遍)。严格来说也要分染色体统计,但目前只是临时补充的 用sam文件推断病毒正负链插入 功能,正式版将恢复通过后面的动态规划比对来精确判断。
219 my $flagr = ($dat[1] & 16)>>4;
220 my $flagR = ($dat[1] & 32)>>5;
221 if ($flagr == $flagR) {
222 ++$strandEven; # 相同就在负链上
223 } else {
224 ++$strandOdd;
227 if ($lastgid and ($lastgid != $thisGroup)) {
228 my $skipflag = 0;
229 if ($main::GrepMergeBetter and $main::Aligner eq 'bwa-meth') {
230 $skipflag = 1 if ($fhReads < 1 or $rhReads < 1);
231 } else {
232 $skipflag = 1 if @hReads < 2;
234 #print "$skipflag $lastgid <- $thisGroup\n";
235 if ($flagHV == 3) {
236 $flagHV = 0;
237 #unless ($skipflag)
238 my ($MergedHds,$mostHchr,$mostVchr) = grepmerge(\@hReads,$main::Aligner);
239 my ($MergedVds,$mostRchr,$mostMchr) = grepmerge(\@vReads,$main::Aligner);
240 print STDERR ',' if $mostHchr ne $mostMchr;
241 print STDERR ';' if $mostVchr ne $mostRchr;
242 #ddx [$MergedHds,$MergedVds];
243 # BSuitLib.pm:240: [
245 # 180587608 => [
246 # 19,
247 # "TTGCGAAAGCCCAAGATGATGGGATGGGAATACAGGTGCAATTTCCATCCGTAGGTTTTGTACAGCAACATGAGGGAAACATAGAGTTGCCTTGRRCRRRRRTCRTR",
248 # ],
249 # },
251 # 640 => [
252 # 6,
253 # "CCAGATGCAATCTAATTAAACCCACCAGGTGTCTCCCCTCARRTTRRRRTRCCCCRTR",
254 # ],
255 # },
257 my $tmp = '.';
258 if ($strandEven > $strandOdd) {
259 $tmp = '-';
260 } elsif ($strandEven < $strandOdd) {
261 $tmp = '+';
263 ($strandOdd,$strandEven)=(0,0);
264 my @Keys = sort {$b <=> $a} keys %{$MergedHds};
265 my @VKeys = sort {$b <=> $a} keys %{$MergedVds};
266 my @Vchrs = sort { $Vchr{$b} <=> $Vchr{$a} } keys %Vchr;
267 %Vchr = ();
268 my @Vposes=(-1,-1);
269 if (@VKeys == 1) {
270 if ($VKeys[0] > 0) {
271 @Vposes = ($VKeys[0],-1);
272 } else {
273 @Vposes = (-1,-$VKeys[0]);
275 } elsif (@VKeys == 2) {
276 @Vposes = ($VKeys[0],-$VKeys[1]);
278 if (@Keys == 1) {
279 if ($Keys[0] > 0) {
280 print OUT join("\t",$lastgid,$hReads[0]->[2],$Keys[0],-1,@{$MergedHds->{$Keys[0]}},0,'N',$tmp,$Vchrs[0],@Vposes),"\n";
281 } else {
282 print OUT join("\t",$lastgid,$hReads[0]->[2],-1,-$Keys[0],0,'N',@{$MergedHds->{$Keys[0]}},$tmp,$Vchrs[0],@Vposes),"\n";
284 } elsif (@Keys == 2) {
285 print OUT join("\t",$lastgid,$hReads[0]->[2],$Keys[0],-$Keys[1],@{$MergedHds->{$Keys[0]}},@{$MergedHds->{$Keys[1]}},$tmp,$Vchrs[0],@Vposes),"\n";
287 #print OUT join("\t",$lastgid,$hReads[0]->[2],$_,@{$MergedHds->{$_}}),"\n" for sort { $a <=> $b } keys %{$MergedHds};
288 #die;
290 @hReads=();
291 @vReads=();
292 $lastgid = $thisGroup;
293 ($fhReads,$rhReads,$fvReads,$rvReads)=(0,0,0,0);
294 } else {
295 $lastgid = $thisGroup;
297 if (/\bZd:Z:H\b/) {
298 $flagHV |= 1;
299 if ($dat[5] !~ /[IDH]/) { # ignore [IDH] until 我有空写好 getDeriv()。
300 push @hReads,\@dat;
301 if (/\bYD:Z:f\b/) {++$fhReads}
302 else {++$rhReads;}
304 } elsif (/\bZd:Z:V\b/) {
305 $flagHV |= 2;
306 ++$Vchr{$dat[2]};
307 if ($dat[5] !~ /[IDH]/) {
308 push @vReads,\@dat;
309 if (/\bYD:Z:f\b/) {++$fvReads}
310 else {++$rvReads;}
314 close IN;
315 close OUT;
316 print "\n";
320 sub do_grep0($) {
321 my $cfgfile = $_[0];
322 my (%tID,%tFH);
323 for (@{$main::Config->{$main::FileData}->{'='}}) {
324 /([^.]+)\.(\d)/ or die;
325 $tID{$1}{$2} = $_;
327 # "780_T" => { 1 => "780_T.1", 2 => "780_T.2" },
328 # "s01_P" => { 1 => "s01_P.1", 2 => "s01_P.2" },
329 File::Path::make_path("$main::RootPath/${main::ProjectID}_grep",{verbose => 0,mode => 0755});
330 my $GrepResult = Galaxy::IO::INI->new();
331 my %ReadsIndex;
332 for my $k (keys %tID) {
333 my $myBamf = "$main::RootPath/${main::ProjectID}_aln/$k.bam";
334 my $InsMean = $main::Config->{'InsertSizes'}->{$k} or die;
335 my $InsSD = $main::Config->{'InsertSizes'}->{"$k.SD"} or die; # SD cannot be 0, so no need to test with defined.
336 #warn "$myBamf,$InsMean,$InsSD";
337 $GrepResult->{$k} = {
338 'InBam' => $myBamf,
339 InsMean => $InsMean,
340 InsSD => $InsSD,
342 $GrepResult->{$k} = { DatFile => "$main::RootPath/${main::ProjectID}_grep/$k.sam" };
343 open( IN,"-|","$main::PathPrefix samtools view $myBamf") or die "Error opening $myBamf: $!\n"; # `-F768` later
344 system( "$main::PathPrefix samtools view -H $myBamf >".$GrepResult->{$k}{'DatFile'} );
345 open GOUT,'>>',$GrepResult->{$k}{'DatFile'} or die "$!";
346 print GOUT join("\t",'@PG','ID:bsuit',"CL:\"grep $cfgfile\""),"\n";
347 print STDERR "[!] Reading [$myBamf] ...";
348 while (my $line = <IN>) {
349 my @Dat1 = split /\t/,$line;
350 my $flag = 0;
351 my $maxSC=0;
352 while ($Dat1[5] =~ /(\d+)S/g) {
353 $maxSC = $1 if $maxSC < $1;
355 $flag |= 1 if $maxSC > $main::minSoftClip; # 加上SC的话,内存不乐观
356 if ($Dat1[6] eq '=') {
357 #$flag |= 1 if abs(abs($Dat1[8])-$InsMean) > 3*$InsSD;
358 $flag |= 2 if exists($main::VirusChrIDs{$Dat1[2]}) or exists($main::VirusChrIDs{$Dat1[6]});
359 } else {
360 $flag |= 4 if exists($main::VirusChrIDs{$Dat1[2]}) or exists($main::VirusChrIDs{$Dat1[6]});
361 $flag &= ~1 if $flag & 4; # 人的PE,不同染色体的hit直接扔掉不管。
363 next unless $flag;
364 $flag |= 8 if $Dat1[5] !~ /^\d+M$/; # 数据只是占位置,所以可以去掉次行
365 my $id = join("\t",$k,$Dat1[0]);
366 $ReadsIndex{$id} = [[0,$Dat1[2],$flag]]; # 初始化,数据只是占位置。
368 close IN;
369 open IN,'-|',"$main::PathPrefix samtools view $myBamf" or die "Error opening $myBamf: $!\n"; # `-F768` later
370 while (my $line = <IN>) {
371 #my ($id, $flag, $ref, $pos, $mapq, $CIGAR, $mref, $mpos, $isize, $seq, $qual, @OPT) = split /\t/,$line;
372 #print "$id, $flag, $ref, $pos, $mapq, $CIGAR, $mref, $mpos, $isize\n";
373 my @Dat1 = split /\t/,$line;
374 my $r12R1;
375 if ($Dat1[1] & 0x40) {
376 $r12R1 = 1;
377 } elsif ($Dat1[1] & 0x80) {
378 $r12R1 = 2;
379 } else {$r12R1 = 1;}
380 #--$r12R1;
381 #my $line2 = <IN>;
382 #die '[x]SAM/BAM file not paired !' unless defined($line2);
383 #my @Dat2 = split /\t/,$line2;
384 #next if $Dat1[4]<$CFGminMAPQ;
385 my $flag = 0;
386 my $maxSC=0;
387 while ($Dat1[5] =~ /(\d+)S/g) {
388 $maxSC = $1 if $maxSC < $1;
390 $flag |= 1 if $maxSC > $main::minSoftClip;
391 if ($Dat1[6] eq '=') {
392 #$flag |= 1 if abs(abs($Dat1[8])-$InsMean) > 3*$InsSD;
393 $flag |= 2 if exists($main::VirusChrIDs{$Dat1[2]}) or exists($main::VirusChrIDs{$Dat1[6]});
394 } else {
395 $flag |= 4 if exists($main::VirusChrIDs{$Dat1[2]}) or exists($main::VirusChrIDs{$Dat1[6]});
396 $flag &= ~1 if $flag & 4;
398 next unless $flag;
399 $flag |= 8 if $Dat1[5] !~ /^\d+M$/; # soft-clip
400 my $curpos = tell(GOUT);
401 my $id = join("\t",$k,$Dat1[0]);
402 #warn "$flag $InsSD\t$curpos\n[${line}]\n" if $flag>1;
403 next unless defined $ReadsIndex{$id};
404 push @{$ReadsIndex{$id}},[$curpos,$r12R1,@Dat1[2,3,4,5],$flag]; # Chr, Pos, MapQ, CIGAR
405 #print "($Dat1[2],$Dat1[3],$Dat1[5],$curpos)\n";
406 print GOUT $line;
407 #ddx $ReadsIndex{$id} if scalar @{$ReadsIndex{$id}} > 2;
408 #last;
410 close IN;
411 close GOUT;
412 print STDERR "\b\b\bdone.\n";
413 if ($main::DEBUG) {
414 open $tFH{$k},'<',$GrepResult->{$k}{'DatFile'} or die "$!";
417 for my $tk (keys %ReadsIndex) {
418 my ($flag,%tmpH,%tmpV)=(0);
419 for my $i (1 .. $#{$ReadsIndex{$tk}}) {
420 #my $x = join("\t",$ReadsIndex{$tk}->[$i][2],$ReadsIndex{$tk}->[$i][3]);
421 my $x = $ReadsIndex{$tk}->[$i][4];
422 if (exists $main::VirusChrIDs{$ReadsIndex{$tk}->[$i][2]}) {
423 $tmpV{$x} = $i; # 先认为最高质量的唯一。
424 } else {
425 $tmpH{$x} = $i
427 $flag |= $ReadsIndex{$tk}->[$i][6];
429 if ((keys %tmpH) > 0) { # Select max MapQ as 1st & favors Human to Virus.
430 my @s = sort { $b <=> $a } keys %tmpH;
431 $ReadsIndex{$tk}->[0][0] = $tmpH{$s[0]};
432 $ReadsIndex{$tk}->[0][1] = join("\t",$ReadsIndex{$tk}->[$tmpH{$s[0]}][2],$ReadsIndex{$tk}->[$tmpH{$s[0]}][3]);
433 } else {
434 my @s = sort { $b <=> $a } keys %tmpV;
435 $ReadsIndex{$tk}->[0][0] = $tmpV{$s[0]};
436 $ReadsIndex{$tk}->[0][1] = join("\t",$ReadsIndex{$tk}->[$tmpV{$s[0]}][2],$ReadsIndex{$tk}->[$tmpV{$s[0]}][3]);
438 #my @s = sort sortChrPos(keys %tmp);
439 $ReadsIndex{$tk}->[0][2] = $flag;
440 #ddx $ReadsIndex{$tk};
441 # BSuitLib.pm:213: [
442 # [1, "chr15\t20858350", 15], # index, ChrPos, 总flag
443 # [1477321, 1, "chr15", 20858350, 0, "48M42S", 12],
444 # [33704587, 1, "gi|59585|emb|X04615.1|", 652, 0, "45S45M", 11],
445 # [34264944, 2, "gi|59585|emb|X04615.1|", 717, 60, "56M34S", 12],
448 warn "[!] Bam Reading done.\n";
449 #ddx \$GrepResult;
450 #ddx (\%RefChrIDs,\%main::VirusChrIDs);
451 open BOUT,'>',"$main::RootPath/${main::ProjectID}_grep/blocks.ini" or die "$!";
452 if ($main::DEBUG) {
453 open BOUTDBG,'>',"$main::RootPath/${main::ProjectID}_grep/blocks.txt" or die "$!";
455 my @IDsorted = sort { sortChrPos($ReadsIndex{$a}->[0][1],$ReadsIndex{$b}->[0][1]) } keys %ReadsIndex;
456 my ($Cnt,%hChrRange,%vChrRange,@Store,@PureVirReads)=(0);
457 for my $cid (@IDsorted) {
458 my @minCPR = split /\t/,$ReadsIndex{$cid}->[0][1];
459 push @minCPR,$ReadsIndex{$cid}->[0][0];
460 if (exists $main::VirusChrIDs{$minCPR[0]}) {
461 push @PureVirReads,$cid;
462 next;
464 my ($thisehPos,$thisvChr,$thissvPos,$thisevPos);
465 for my $i (1 .. $#{$ReadsIndex{$cid}}) {
466 my ($DatRef,$isHost);
467 if (exists $main::VirusChrIDs{$ReadsIndex{$cid}->[$i]->[2]}) {
468 $DatRef = \%vChrRange;
469 $isHost = 0;
470 } else {
471 $DatRef = \%hChrRange;
472 $isHost = 1;
474 while ( mergeIn($isHost,$DatRef,$ReadsIndex{$cid}->[$i],\@Store,$cid,$i) ) { # 目前只有桥墩,没有中间部分
475 ++$Cnt;
476 print BOUT "[B$Cnt]\nHostRange=",formatChrRange(\%hChrRange),
477 "\nVirusRange=",formatChrRange(\%vChrRange),"\nSamFS=",
478 join(',',map { my @t = split /\n/;my @f=split /\t/,$t[0];join(':',$f[0],$ReadsIndex{$t[0]}->[$t[1]]->[0]); } @Store),"\n\n";
479 if ($main::DEBUG) {
480 print BOUTDBG "[B$Cnt]\nHostRange=",formatChrRange(\%hChrRange),
481 "\nVirusRange=",formatChrRange(\%vChrRange),"\nSamFS=",
482 join(',',map { my @t = split /\n/;my @f=split /\t/,$t[0];join(':',$f[0],$ReadsIndex{$t[0]}->[$t[1]]->[0]); } @Store),"\n";
483 for (@Store) {
484 my @t = split /\n/;
485 my @f=split /\t/,$t[0];
486 seek($tFH{$f[0]},$ReadsIndex{$t[0]}->[$t[1]]->[0],0);
487 my $str = readline $tFH{$f[0]};
488 print BOUTDBG $str;
490 print BOUTDBG "\n";
492 %hChrRange = %vChrRange = @Store = ();
496 close BOUT;
497 if ($main::DEBUG) {
498 close $tFH{$_} for keys %tFH;
499 close BOUTDBG;
501 #warn $GrepResult->write_string;
502 #ddx $GrepResult;
503 $GrepResult->write("$main::RootPath/${main::ProjectID}_grep.ini");
504 warn "[!] grep done with [$Cnt] items.\n";
507 sub do_analyse {
508 my $Refilename = warnFileExist($main::RefConfig->{$main::RefFilesSHA}->{'Refilename'});
509 my (%tID,%tFH);
510 for (@{$main::Config->{$main::FileData}->{'='}}) {
511 /([^.]+)\.(\d)/ or die;
512 $tID{$1}{$2} = $_;
514 File::Path::make_path("$main::RootPath/${main::ProjectID}_analyse",{verbose => 0,mode => 0755});
515 my @retVirus;
516 my $VirusFN = "$main::RootPath/${main::ProjectID}_analyse/Virus.fa";
517 open VIRUS,'>',$VirusFN or die;
518 my $FH = openfile($main::Config->{'RefFiles'}->{'VirusRef'});
519 while (my $ret = FastaReadNext($FH)) {
520 push @retVirus,$ret;
521 print VIRUS '>',$ret->[0],"\n",$ret->[1],"\n";
523 close $FH;
524 close VIRUS;
526 my $tmptmpf = "$main::RootPath/${main::ProjectID}_grep/Greped.ini";
527 open TMPTMP,'<',$tmptmpf or die;
528 my %TMPtmp;
529 while (<TMPTMP>) {
530 /^\[(\d+)\]$/ or die;
531 my $id = $1;
532 <TMPTMP>;
533 <TMPTMP>;
534 $_=<TMPTMP>;
535 next if /^VirRange=NA$/;
536 chomp;
537 my $tmp = (split /:/,$_)[-1];
538 $TMPtmp{$id}=$tmp;
540 close TMPTMP;
542 for my $k (keys %tID) {
543 my $myGrepf = "$main::RootPath/${main::ProjectID}_grep/$k.bam.grep";
544 print "[$myGrepf]\n";
545 my $outf = "$main::RootPath/${main::ProjectID}_analyse/.$k.analyse";
546 open IN,'<',$myGrepf or die;
547 open OUT,'>',$outf or die;
548 my (@OutCnt,%OutDat)=(0,0,0);
549 while (<IN>) {
550 chomp;
551 my @LineDat = split /\t/,$_;
552 #ddx \@LineDat;
553 #my $query = @LineDat[5,7];
554 #ddx \@retVirus;
555 my ($left,$right,$strand);
556 if ($LineDat[5] ne 'N') { # 时间有限,病毒只支持第一条染色体
557 my $ta = doAln($VirusFN,$LineDat[5],1);
558 if (defined $ta) {
559 $strand = $ta->[0];
560 if ($strand eq '+') {
561 $left = $ta->[1];
562 $right = $ta->[1] + $ta->[2];
563 } else {
564 $right = $ta->[1];
565 $left = $ta->[1] - $ta->[2];
569 if ($LineDat[7] ne 'N') {
570 my $tb = doAln($VirusFN,$LineDat[7],-1);
571 if (defined $tb) {
572 $strand = $tb->[0];
573 if ($strand eq '+') {
574 $left = $tb->[1] unless defined $left;
575 $right = $tb->[1] + $tb->[2];
576 } else {
577 $right = $tb->[1] unless defined $right;;
578 $left = $tb->[1] - $tb->[2]; # 负链还是有问题
582 if ($LineDat[2] == -1) {
583 $LineDat[2] = $LineDat[3];
584 $LineDat[3] = -1;
586 $LineDat[3] = -1 if $LineDat[3] == $LineDat[2] + 1; # 貌似正负链加减一没统一?
587 #next unless defined $strand;
588 my $tVr=[-1];
589 for (@LineDat[10,11]) {
590 push @{$tVr},$_ if $_ != -1;
592 unless (defined $strand) { # Well, we need more poistive.
593 #print OUT join("\t",@LineDat[0..3],'Virus','NA','0','0'),"\n";
594 my @range = split /-/,$TMPtmp{$LineDat[0]};
595 $OutDat{$LineDat[1]}{$LineDat[2]} = [$LineDat[0],$LineDat[3],$LineDat[9],$LineDat[8],@range,$tVr]; # 临时补丁
596 ++$OutCnt[1];
597 next;
599 #print OUT join("\t",@LineDat[0..3],'Virus',$strand,$left,$right),"\n";
600 $OutDat{$LineDat[1]}{$LineDat[2]} = [$LineDat[0],$LineDat[3],$LineDat[9],$strand,$left,$right,$tVr];
601 ++$OutCnt[0];
603 for my $chr (sort {alphanum($a,$b)} keys %OutDat) {
604 my ($lastL,$lastR) = (-1,-1);
605 for my $pos (sort {$a <=> $b} keys %{$OutDat{$chr}}) {
606 if ($lastL == -1) {
607 $lastL = $pos;
608 $lastR = $pos;
609 #print STDERR ">>> $pos\n";
610 } elsif (($pos - $lastR) <= $main::ResultMergeRange) {
611 #print STDERR "-> $pos - ($lastL,$lastR)\n";
612 $lastR = $pos; # 暂时只考虑左端点的合并
613 } else {
614 if ($lastL != -1) {
615 my @Dat = @{$OutDat{$chr}{$lastL}}; # 假设第一条的病毒结果最准确(其实应该是中间某个;最好前期打分)
616 $Dat[1] = -1 if $Dat[1] == $lastL + 1;
617 print OUT join("\t",$Dat[0],$chr,$lastL,@Dat[1..5],join(',',@{$Dat[6]})),"\n";
618 ++$OutCnt[2];
619 #print STDERR join("\t",'---',$Dat[0],$chr,$lastL,@Dat[1..$#Dat]),"\n";
621 ($lastL,$lastR) = ($pos,$pos); # 下一轮初值。
624 if ($lastL != -1) {
625 my @Dat = @{$OutDat{$chr}{$lastL}}; # 假设第一条的病毒结果最准确(其实应该是中间某个;最好前期打分)
626 $Dat[1] = -1 if $Dat[1] == $lastL + 1;
627 print OUT join("\t",$Dat[0],$chr,$lastL,@Dat[1..5],join(',',@{$Dat[6]})),"\n";
628 ++$OutCnt[2];
631 close IN; close OUT;
632 warn "[!]O: $OutCnt[0]+$OutCnt[1] => $OutCnt[2] in [$k], merged=",$OutCnt[0]+$OutCnt[1]-$OutCnt[2],".\n";
634 do_patch();
637 sub do_check {
638 my ($bias) = @_;
639 my $hchrid = 'chr1';
640 unless (exists $main::Config->{'Simed'}) {
641 warn "[x]config_file NOT generated by `simVirusInserts.pl` !\n";
642 return;
644 my (%VirFrag,@Refticks,@Virticks,%VirFragSE);
645 for (@{$main::Config->{'Simed'}->{'='}}) {
646 if (/^([^.]+)\.VirFrag$/) {
647 $VirFrag{$1} = $main::Config->{'Simed'}->{$_} or die;
648 } elsif ($_ eq 'Refticks') {
649 @Refticks = split /,/,$main::Config->{'Simed'}->{$_} or die;
650 } elsif ($_ eq 'Virticks') {
651 @Virticks = split /,/,$main::Config->{'Simed'}->{$_} or die;
654 my %Refticksh = map { $Refticks[$_] => $_ } 0..$#Refticks; # http://stackoverflow.com/a/2957903/159695
655 open OUT,'>',"$main::RootPath/${main::ProjectID}_check.log" or die;
656 for my $k (keys %VirFrag) {
657 my (@VirFragStartEnd);
658 for my $pVir (@Virticks) {
659 my $st = $pVir-int(0.5*$VirFrag{$k});
660 my $ed = $st + $VirFrag{$k};
661 ++$st;
662 push @VirFragStartEnd,[$st,$ed];
664 $VirFragSE{$k} = \@VirFragStartEnd;
665 if ( (! defined $bias) or $bias<0 ) {
666 for my $i (0 .. $#Refticks) {
667 print OUT "$k $i:\t$Refticks[$i], ",join('-',@{$VirFragStartEnd[$i]}),"\n";
671 $bias = 10 unless (defined $bias and $bias >= 0);
672 #ddx \%VirFrag;
673 warn 'Load @Refticks:',scalar(@Refticks),', @Virticks',scalar(@Virticks),". Bias_Allowed=$bias\n";
674 my %VirChrP;
675 for my $p (@Virticks) {
676 $VirChrP{$_}=1 for ($p-$bias)..($p+$bias);
679 my (%tID,%Result,%FragLength);
680 for (@{$main::Config->{$main::FileData}->{'='}}) {
681 /([^.]+)\.(\d)/ or die;
682 $tID{$1}{$2} = $_;
684 for my $k (keys %tID) {
685 my $myAnaf = "$main::RootPath/${main::ProjectID}_analyse/$k.analyse";
686 my $myGrepf = "$main::RootPath/${main::ProjectID}_grep/$k.bam.grep";
687 print "[$myAnaf]\n";
688 print OUT "[$myAnaf]\n";
689 open IN,'<',$myGrepf or die;
690 while (<IN>) {
691 chomp;
692 my @LineDat = split /\t/,$_;
693 if ($LineDat[5] ne 'N') {
694 my $flen1 = length $LineDat[5];
695 ++$FragLength{$k}{$flen1};
696 ++$FragLength{'=Sum='}{$flen1};
698 if ($LineDat[7] ne 'N') {
699 my $flen2 = length $LineDat[7];
700 ++$FragLength{$k}{$flen2};
701 ++$FragLength{'=Sum='}{$flen2};
704 close IN;
705 open ANA,'<',$myAnaf or die;
706 while (<ANA>) {
707 my $flag = 0;
708 chomp;
709 my @dat = split /\t/;
710 my (undef,$chr,$pos1,$pos2,undef,$strand,$vp1,$vp2) = @dat;
711 $pos2 = $pos1 if $pos2 == -1;
712 if ($chr eq $hchrid) {
713 my ($va,$vb);
714 for my $p ( ($pos1-$bias)..($pos2+$bias) ) {
715 if (exists $Refticksh{$p}) {
716 $flag |= 1;
717 my $idx = $Refticksh{$p};
718 print OUT 'h',$Refticks[$idx],",";
719 ($va,$vb) = @{$VirFragSE{$k}->[$idx]};
720 if ($strand eq 'NA') {
721 $flag |= 6;
722 goto NOVIR;
724 if (($vp1<=$vb) and ($vp2>=$va)) {
725 $flag |= 2;
726 $flag &= ~4;
727 $flag &= ~8;
728 print OUT "v$va-$vb,";
729 } elsif (($vp1 <= $vb+$bias) and ($vp2 >= $va-$bias)) {
730 $flag |= 4;
731 $flag &= ~8;
732 print OUT "m$va-$vb,";
733 } else {
734 for my $p ($vp1 .. $vp2) {
735 if (exists $VirChrP{$p}) {
736 $flag |= 8;
737 last;
741 last if $flag == 3;
744 NOVIR:
745 if ($flag==1) {
746 print OUT "x$va-$vb,";
749 print OUT "$flag\t$_\n";
750 ++$Result{$k}{$flag};
751 ++$Result{'=Sum='}{$flag};
753 close ANA;
755 ddx \%FragLength;
756 ddx \%Result;
757 open P,'>',"$main::RootPath/${main::ProjectID}_plot.dat" or die;
758 open PH,'>',"$main::RootPath/${main::ProjectID}_plot.sh" or die;
759 print PH "#!/bin/bash\ngnuplot -persist <<-'EOFP1'
760 set xlabel \"Length\"
761 set ylabel 'Count'
762 set title 'Histgram of Identified Fragments'
763 set term svg
764 set output \"$main::RootPath/${main::ProjectID}_plot.svg\"
766 infile = '$main::RootPath/${main::ProjectID}_plot.dat'
767 tempfile = '$main::RootPath/${main::ProjectID}_plot.tmp'\n",
768 'stats infile u 1:2 nooutput
769 blocks = STATS_blocks
771 set print tempfile
772 first_y = ""
773 first_x = ""
774 do for[i=0:blocks-1] {
775 stats infile index i u (first_x=($0==1)?sprintf("%s %f",first_x,$1):first_x,first_y=($0==1)?sprintf("%s %f",first_y,$2):first_y,$1):2 nooutput
776 print sprintf("%f %f",STATS_pos_max_y,STATS_max_y)
778 print ""
779 print ""
780 do for[i=1:blocks] {
781 print sprintf("%s %s",word(first_x,i),word(first_y,i))
783 set print
784 set key outside;
785 set key right top;
786 set logscale y
787 plot for[i=0:blocks-1] infile i i u 1:2 w lines title columnheader(1),\
788 for[i=0:1] tempfile i i u 1:2:($0+1) w points pt (i==0?7:9) lc variable not
790 my @IDs = sort sortWsum keys %FragLength;
791 for my $k (0 .. $#IDs) {
792 my $id = $IDs[$k];
793 $id =~ s/=//g;
794 print P "'$id'\n";
795 for my $i (sort {$a <=> $b} keys %{$FragLength{$IDs[$k]}}) {
796 print P "$i\t$FragLength{$IDs[$k]}{$i}\n";
798 if ($k != $#IDs) {
799 print P "\n\n";
802 close P;
803 print PH "\nEOFP1\n";
804 close PH;
805 chmod 0755,"$main::RootPath/${main::ProjectID}_plot.sh";
806 system "$main::RootPath/${main::ProjectID}_plot.sh";
809 sub do_analyse0 {
810 my $Refilename = warnFileExist($main::RefConfig->{$main::RefFilesSHA}->{'Refilename'});
811 my (%tID,%tFH);
812 for (@{$main::Config->{$main::FileData}->{'='}}) {
813 /([^.]+)\.(\d)/ or die;
814 $tID{$1}{$2} = $_;
816 my %ReadsIndex;
817 for my $k (keys %tID) {
818 my $GrepResult = "$main::RootPath/${main::ProjectID}_grep/$k.sam";
819 my $InsMean = $main::Config->{'InsertSizes'}->{$k} or die;
820 my $InsSD = $main::Config->{'InsertSizes'}->{"$k.SD"} or die; # SD cannot be 0, so no need to test with defined.
821 open $tFH{$k},'<',$GrepResult or die "$!";
823 File::Path::make_path("$main::RootPath/${main::ProjectID}_analyse",{verbose => 0,mode => 0755});
824 my $BlockINI = Galaxy::IO::INI->new();
825 my $BlockINIFN = "$main::RootPath/${main::ProjectID}_grep/blocks.ini";
826 if ( -f $BlockINIFN ) {
827 $BlockINI->read($BlockINIFN);
828 } else {die "[x] Grep INI not found ! [$BlockINIFN]\n";}
829 warn "[!] minHostDepth = $main::minHostDepth\n";
830 warn "[!] Running in DEVELOP(chr18 only) mode !\n" if ($main::DEVELOP);
831 open OA,'>',"$main::RootPath/${main::ProjectID}_analyse.txt" or die;
832 for my $Bid (@{$BlockINI->{']'}}) {
833 my @HostRange = split /,/,$BlockINI->{$Bid}->{'HostRange'};
834 my @VirusRange = split /,/,$BlockINI->{$Bid}->{'VirusRange'};
835 my @SamFS = split /,/,$BlockINI->{$Bid}->{'SamFS'};
836 next if (@HostRange<1 or @VirusRange<1); # Need to do assembly
837 my ($maxHostDepth,$maxItem) = (0,-1);
838 for my $i (0 .. $#HostRange) {
839 my ($chr,$range,$depth) = split /:/,$HostRange[$i];
840 if ($maxHostDepth < $depth) {
841 $maxHostDepth = $depth;
842 $maxItem = $i;
844 $HostRange[$i] = [$chr,(split /-/,$range),$depth];
846 if ($main::DEVELOP) {
847 next if $HostRange[$maxItem][0] ne 'chr18';
849 next if $maxHostDepth < $main::minHostDepth;
850 #ddx $maxItem,\@HostRange;
851 File::Path::make_path("$main::RootPath/${main::ProjectID}_analyse/idba/$Bid",{verbose => 0,mode => 0755});
852 system("$main::PathPrefix samtools faidx $Refilename $HostRange[$maxItem][0]:$HostRange[$maxItem][1]-$HostRange[$maxItem][2] >$main::RootPath/${main::ProjectID}_analyse/idba/Ref.fa");
853 my $FH;
854 open $FH,'<',"$main::RootPath/${main::ProjectID}_analyse/idba/Ref.fa" or die $!;
855 my $retHost = FastaReadNext($FH);
856 close $FH;
857 my @retVirus;
858 $FH=openfile($main::Config->{'RefFiles'}->{'VirusRef'});
859 while (my $ret = FastaReadNext($FH)) {
860 push @retVirus,$ret;
862 close $FH;
863 open FHo,'>',"$main::RootPath/${main::ProjectID}_analyse/idba/$Bid/Refo.fa" or die $!;
864 open FHf,'>',"$main::RootPath/${main::ProjectID}_analyse/idba/$Bid/Reff.fa" or die $!;
865 open FHr,'>',"$main::RootPath/${main::ProjectID}_analyse/idba/$Bid/Refr.fa" or die $!;
866 for ($retHost,@retVirus) {
867 my ($id,$seq) = @$_;
868 print FHo ">$id\n$seq\n";
869 my $seqF = $seq;
870 $seqF =~ tr /Cc/Tt/;
871 print FHf ">${id}_F\n$seqF\n";
872 my $seqR = $seq;
873 $seqF =~ tr /Gg/Aa/;
874 print FHr ">${id}_R\n$seqR\n";
876 close FHo; close FHf; close FHr;
877 my %ReadsbyID;
878 my %FHO = (
879 o => [undef,0],
880 f => [undef,0],
881 r => [undef,0],
883 open $FHO{'o'}->[0],'>',"$main::RootPath/${main::ProjectID}_analyse/idba/$Bid/Readso.fa" or die $!;
884 open $FHO{'f'}->[0],'>',"$main::RootPath/${main::ProjectID}_analyse/idba/$Bid/Readsf.fa" or die $!;
885 open $FHO{'r'}->[0],'>',"$main::RootPath/${main::ProjectID}_analyse/idba/$Bid/Readsr.fa" or die $!;
886 my $maxReadLen = 0;
887 for (@SamFS) {
888 my ($fid,$pos) = split /:/,$_;
889 seek($tFH{$fid},$pos,0);
890 my $str = readline $tFH{$fid};
891 chomp $str;
892 my $strlen = length $str;
893 $maxReadLen = $strlen if $maxReadLen < $strlen;
894 my @dat = split /\t/,$str;
895 next if $dat[1] & 256; # FLAG, s | 0x0100 | the alignment is not primary
896 next if $dat[4] < 30; # MAPQ
897 my $seq = $dat[9];
898 $seq = revcom($seq) if $dat[1] & 16;
899 if ($dat[1] & 64) {
900 $ReadsbyID{$dat[0]}->[1]=$seq;
901 } elsif ($dat[1] & 128) {
902 $ReadsbyID{$dat[0]}->[2]=$seq;
904 if (($dat[2] eq $HostRange[$maxItem][0]) or (exists $main::VirusChrIDs{$dat[2]})) {
905 ++$ReadsbyID{$dat[0]}->[0];
908 for my $id (keys %ReadsbyID) {
909 next unless defined($ReadsbyID{$id}->[1]) or defined($ReadsbyID{$id}->[2]); # and->or to include SE.
910 $ReadsbyID{$id}->[1] = '' unless defined($ReadsbyID{$id}->[1]);
911 $ReadsbyID{$id}->[2] = '' unless defined($ReadsbyID{$id}->[2]);
912 next unless defined($ReadsbyID{$id}->[0]);
913 if ($main::FORCE_UNMETH) {
914 if ($id =~ /^sf/) {
915 $ReadsbyID{$id}->[1] =~ tr/gG/aA/;
916 $ReadsbyID{$id}->[2] =~ tr/cC/tT/;
917 } else {
918 $ReadsbyID{$id}->[1] =~ tr/cC/tT/;
919 $ReadsbyID{$id}->[2] =~ tr/gG/aA/;
922 my $type = guessMethyl( $ReadsbyID{$id}->[1] . revcom($ReadsbyID{$id}->[2]) );
923 my $str = join("\n",">$id/1",$ReadsbyID{$id}->[1],">$id/2",$ReadsbyID{$id}->[2],'');
924 my $FH;
925 if ($type eq '1CT') {
926 $FH = $FHO{'f'}->[0];
927 ++$FHO{'f'}->[1];
928 } elsif ($type eq '2GA') {
929 $FH = $FHO{'r'}->[0];
930 ++$FHO{'r'}->[1];
931 } else {
932 $FH = $FHO{'o'}->[0];
933 ++$FHO{'o'}->[1];
935 print $FH $str;
937 #close $FHO{$_}->[0] for keys %FHO;
938 my %Assem;
939 my $idbaRead = '-r';
940 $idbaRead = '-l' if $maxReadLen > 128;
941 for my $fro (keys %FHO) {
942 close $FHO{$fro}->[0];
943 next if $FHO{$fro}->[1] <= 2;
944 my $reff = "$main::RootPath/${main::ProjectID}_analyse/idba/$Bid/Ref$fro.fa";
945 my $readsf = "$main::RootPath/${main::ProjectID}_analyse/idba/$Bid/Reads$fro.fa";
946 next unless -s $readsf;
947 my $outp = "$main::RootPath/${main::ProjectID}_analyse/idba/$Bid/$fro";
948 system("$RealBin/bin/idba_hybrid $main::idbacmd $idbaRead $readsf --reference $reff -o $outp >/dev/null");
949 next unless -s "$outp/scaffold.fa";
950 my ($tFH,@asm);
951 open $tFH,'<',"$outp/scaffold.fa" or die $!;
952 while (my $ret = FastaReadNext($tFH)) {
953 push @asm,$ret;
955 $Assem{$fro} = \@asm;
957 my $ret = doAlign(\%Assem,[$retHost],\@retVirus);
958 #die;
959 print OA "[$Bid]\nRefCut: $ret->[0] $ret->[1]\nVirusCut: $ret->[2] $ret->[3] +$ret->[4]\n\n" if $ret->[1] > 0;
960 if ($main::DEBUG) {
961 open DBG,'>',"$main::RootPath/${main::ProjectID}_analyse/idba/$Bid/idba.fa" or die $!;
962 print DBG ">Host\n$$retHost[1]\n\n";
963 #print DBG ">Virus$_->[0]\n$_->[1]\n" for @retVirus;
964 print DBG "\n";
965 for my $fro (sort keys %Assem) {
966 print DBG ">Asm_${fro}$_ $Assem{$fro}->[$_]->[0]\n$Assem{$fro}->[$_]->[1]\n" for 0 .. $#{$Assem{$fro}};
968 print DBG "\nResult: ",join(",",@$ret),"\n";
970 warn "[$Bid]\n";
972 close OA;
973 close $tFH{$_} for keys %tFH;
978 __END__
979 samtools view -h /share/users/huxs/work/bsvir/bsI/SZ0010_aln/780_T.bam '*' | samtools bam2fq -O - | gzip -9 > /share/users/huxs/work/bsvir/bsI/SZ0010_aln/780_T.unmap.fq.gz
980 samtools view -h /share/users/huxs/work/bsvir/bsI/SZ0010_aln/s01_P.bam '*'|samtools bam2fq -O -|gzip -9 > /share/users/huxs/work/bsvir/bsI/SZ0010_aln/s01_P.unmap.fq.gz &
982 ./bin/bwameth.py --reference /share/users/huxs/work/bsvir/HBV.AJ507799.2.fa -t \${THREADSCNT} --read-group 780_T -p ~/work/bsvir/bsI/SZ0010_aln/780_T.unmap ~/work/bsvir/bsI/SZ0010_aln/780_T.unmap.fq.gz 2>~/work/bsvir/bsI/SZ0010_aln/780_T.unmap.log & #/
983 ./bin/bwameth.py --reference /share/users/huxs/work/bsvir/HBV.AJ507799.2.fa -t \${THREADSCNT} --read-group s01_P -p /share/users/huxs/work/bsvir/bsI/SZ0010_aln/s01_P.unmap /share/users/huxs/work/bsvir/bsI/SZ0010_aln/s01_P.unmap.fq.gz 2>/share/users/huxs/work/bsvir/bsI/SZ0010_aln/s01_P.unmap.log &
985 samtools view -h /share/users/huxs/work/bsvir/bsI/SZ0010_aln/780_T.bam 'gi|86261677|emb|AJ507799.2|'