limit fstBC to 30bp in Python3 ver.
[GalaxyCodeBases.git] / perl / bsvf / lib / BSuitInc.pm
blobedcffe27720052996f3ecd0943d85d7d631f9bec
1 package main;
2 use strict;
3 use warnings;
4 use Digest::SHA;
5 use IPC::Open2;
6 use Galaxy::Data;
8 sub getFilesHash(@) {
9 my $fileStr = join(',',@_);
10 return Digest::SHA::sha1_base64($fileStr);
13 sub getRef2char($$) {
14 my ($HostRefName,$VirusRefName)=@_;
15 my $HostChar = substr $HostRefName,0,1;
16 my $VirusChar = substr $VirusRefName,0,1;
17 my $tlen = length($VirusRefName) -1;
18 my $i=0;
19 while ( ($VirusChar eq $HostChar) and (++$i<=$tlen) ) {
20 my $tmpChar = substr $VirusRefName,$i,1;
21 $VirusChar = $tmpChar if $tmpChar=~/\w/;
22 warn "$i - $tmpChar $VirusChar -\n";
24 $VirusChar = 'V' if $VirusChar eq $HostChar;
25 return $HostChar.$VirusChar;
28 sub sortChrPos($$) {
29 my ($ChrA,$PosA) = split /\t/,$_[0];
30 my ($ChrB,$PosB) = split /\t/,$_[1];
31 if ($ChrA eq $ChrB) {
32 return $PosA <=> $PosB;
34 return 1 if exists $main::VirusChrIDs{$ChrA};
35 return -1 if exists $main::VirusChrIDs{$ChrB};
36 $ChrA cmp $ChrB ||
37 $PosA <=> $PosB;
40 sub cigar2poses($) {
41 my ($cigar) = @_;
42 my @cigar = $cigar =~ /(\d+)(\w)/g;
43 my ($reflen,$maxM,$readlen)=(0,0,0);
44 while (@cigar) {
45 my ($len,$op) = splice(@cigar,0,2);
46 if ($op eq 'M') {
47 $reflen += $len;
48 $readlen += $len;
49 $maxM = $len if $maxM < $len;
50 next;
52 $reflen += $len if $op eq 'D';
53 $readlen += $len if $op eq 'I';
55 return ($reflen,$readlen);
58 sub mergeIn($$$$$$) {
59 my ($isHost,$rChrRange,$aRead,$rStore,$cid,$i) = @_;
60 my ($reflen,$readlen) = cigar2poses($aRead->[5]);
61 my $thisehPos = $aRead->[3]+$reflen;
62 my $ret; # 1 -> again, 0 -> merged
63 if ($i > 1 or $isHost == 0 or scalar(keys %{$rChrRange})==0) {
64 # same PE or assume overlap on Virus
65 $ret = 0;
66 if (keys %{$rChrRange} and exists $rChrRange->{$aRead->[2]}) {
67 if ($aRead->[3] <= $rChrRange->{$aRead->[2]}->[0]) {
68 $rChrRange->{$aRead->[2]}->[0] = $aRead->[3];
70 if ($thisehPos >= $rChrRange->{$aRead->[2]}->[1]) {
71 $rChrRange->{$aRead->[2]}->[1] = $thisehPos;
73 } else {
74 $rChrRange->{$aRead->[2]} = [ $aRead->[3],$thisehPos,0 ];
76 } elsif (exists $rChrRange->{$aRead->[2]}) {
77 if ($aRead->[3] <= $rChrRange->{$aRead->[2]}->[1]) { # overlap
78 #die unless $thisehPos >= $rChrRange->{$aRead->[2]}->[1];
79 $rChrRange->{$aRead->[2]}->[1] = $thisehPos if $thisehPos >= $rChrRange->{$aRead->[2]}->[1];
80 $ret = 0;
81 } else {
82 $ret = 1;
84 } else { # different HostChr
85 $ret = 1;
87 unless ($ret) {
88 my $tid = join("\n",$cid,$i);
89 push @{$rStore},$tid;
90 ++$rChrRange->{$aRead->[2]}->[2];
92 return $ret;
94 sub formatChrRange($) {
95 my ($rChrRange) = @_;
96 my @ret;
97 for my $c (sort keys %{$rChrRange}) {
98 push @ret, "${c}:".$rChrRange->{$c}->[0].'-'.$rChrRange->{$c}->[1].':'.$rChrRange->{$c}->[2];
100 return join(',',@ret);
103 sub revcom($) {
104 my $str = $_[0];
105 $str =~ tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/;
106 my $rev = reverse $str;
107 $rev =~ tr/[](){}<>/][)(}{></;
108 return $rev;
111 sub guessMethyl($) {
112 my ($seq) = @_;
113 my %BaseCnt=(
114 A => 0, G => 0, C => 0, T => 0,
115 N => 0,
117 for (split //,$seq) {
118 ++$BaseCnt{uc $_};
120 my $seqlen = length($seq) - $BaseCnt{'N'};
121 return 'N' if $seqlen == 0;
122 my @Cnts = sort { $BaseCnt{uc $b} <=> $BaseCnt{uc $a} } keys %BaseCnt;
123 #ddx [\@Cnts,\%BaseCnt];
124 if ($BaseCnt{'C'}<=$seqlen*$main::methly3BaseErrRate and $BaseCnt{'T'}>0) {
125 return '1CT';
126 } elsif ($BaseCnt{'G'}<=$seqlen*$main::methly3BaseErrRate and $BaseCnt{'A'}>0) {
127 return '2GA';
128 } else {
129 return '0Raw';
133 # 由于include在main前面,所以可以在此统一计算打分矩阵。
134 my ($match,$methylmatch,$mismatch,$INDEL) = (2,1,-3,-4);
135 our (%MatrixO,%MatrixFct,%MatrixRga);
136 my @Bases = sort qw(A C G T);
137 # $ perl -e '@Bases = sort qw(A T C G); print join("|",@Bases),"\n"'
138 # A|C|G|T
139 for my $i (0 .. $#Bases) {
140 for my $j ($i .. $#Bases) {
141 my $str = $Bases[$i] . $Bases[$j];
142 if ($i == $j) {
143 $MatrixO{$str} = $MatrixFct{$str} = $MatrixRga{$str} = $match;
144 } else {
145 $MatrixO{$str} = $mismatch;
146 if ($str =~ /^[CT]+$/) {
147 $MatrixFct{$str} = $methylmatch;
148 } else {
149 $MatrixFct{$str} = $mismatch;
151 if ($str =~ /^[GA]+$/) {
152 $MatrixRga{$str} = $methylmatch;
153 } else {
154 $MatrixRga{$str} = $mismatch;
159 #ddx (\%MatrixO,\%MatrixFct,\%MatrixRga);
160 # { AA => 2, AC => -3, AG => -3, AT => -3, CC => 2, CG => -3, CT => -3, GG => 2, GT => -3, TT => 2 },
161 # { AA => 2, AC => -3, AG => -3, AT => -3, CC => 2, CG => -3, CT => 1, GG => 2, GT => -3, TT => 2 },
162 # { AA => 2, AC => -3, AG => 1, AT => -3, CC => 2, CG => -3, CT => -3, GG => 2, GT => -3, TT => 2 },
164 sub doAlign($$$) {
165 my ($AssemHRef,$retHostARef,$retVirusARef) = @_;
166 # 暂时只考虑第一条
167 my $retHost = $$retHostARef[0];
168 my $retVirus = $$retVirusARef[0];
169 my @froDat = sort keys %{$AssemHRef};
170 #ddx $AssemHRef;
171 my $fro0 = shift @froDat;
172 my $result = [$AssemHRef->{$fro0}->[0]->[1]];
173 for my $fro (@froDat) {
174 #$fro0 = mergeAln( $result,$AssemHRef->{$fro}->[0],$fro0,$fro );
175 $result = mergeAln( $result->[0],$AssemHRef->{$fro}->[0]->[1] );
177 #ddx $retHost,$result;
178 my $HostResult = mergeAln( $retHost->[1],$result->[0] );
179 #ddx $HostResult;
180 my @RefInfo = split /[:-]/,$retHost->[0];
181 my ($RefCut,$Left,$Insert,@Rcuts,@Vcuts)=(0);
182 if ($HostResult->[1] =~ /^(D*)([MmR]+)(I+)/) {
183 (undef,$Left,$Insert) = ($1,$2,$3);
184 push @Rcuts,($RefInfo[1] + length($Left)+1);
186 if ($HostResult->[1] =~ /^(I*)([MmR]+)(D+)/) {
187 ($Left,$Insert,undef) = ($1,$2,$3);
188 push @Rcuts,($RefInfo[1] + length($Left) + length($Insert)+1);
190 $RefCut = join(';',@Rcuts);
191 my $VirusResult = mergeAln( $retVirus->[1],$result->[0] );
192 my @VirusInfo = split /[:-]/,$retVirus->[0];
193 my ($VirCut,$VirLeft,$VirInsert)=(0,0,0);
194 if ($VirusResult->[1] =~ /^(D*)([MmR]+)(I+)/) {
195 (undef,$VirLeft,$VirInsert) = ($1,$2,$3);
196 push @Vcuts,($VirusInfo[1] + length($VirLeft)+1);
198 if ($VirusResult->[1] =~ /^(I*)([MmR]+)(D+)/) {
199 ($Left,$Insert,undef) = ($1,$2,$3);
200 push @Vcuts,($VirusInfo[1] + length($Left) + length($Insert)+1);
202 #ddx $VirusResult;
203 $VirCut = join(';',@Vcuts);
204 return [$RefInfo[0],$RefCut,$VirusInfo[0],$VirCut,length($VirInsert)];
206 sub dynAln($$$) { # 废弃 {
207 my ($ref,$query,$MatrixR) = @_;
208 my @a=('',split //,$ref);
209 my @b=('',split //,$query);
210 my (@matrix,@path);
211 my ($i,$j,$sc);
212 $matrix[$_][0]=0 for (0 .. $#a);
213 $matrix[0][$_]=0 for (0 .. $#b);
214 $path[$#a][$#b] = 0;
215 # dynamic recursion
216 for ($i=1;$i<=$#a;$i++) {
217 for ($j=1;$j<=$#b;$j++) {
218 my $str = join('',sort ($a[$i],$b[$j]));
219 $matrix[$i][$j] = $MatrixR->{$str} + $matrix[$i-1][$j-1];
220 $path[$i][$j] = 1; # case 0: i,j are aligned
221 $sc=$matrix[$i-1][$j] + $INDEL; # case 1: i aligned to -
222 if ($sc > $matrix[$i][$j]) {
223 $matrix[$i][$j] = $sc;
224 $path[$i][$j] = 2;
226 $sc=$matrix[$i][$j-1] + $INDEL; # case 2: j aligned to -
227 if ($sc > $matrix[$i][$j]) {
228 $matrix[$i][$j] = $sc;
229 $path[$i][$j] = 3;
231 if ($matrix[$i][$j] < 0) { # Local
232 $matrix[$i][$j] = 0;
233 $path[$i][$j] = 0;
237 # Traceback (with shadow matrix)
238 my ($count,@ax,@ay,@as)=(0); # first is 0.
239 --$i;--$j; # outside for, $i will bigger than $#a by 1 step
240 my ($len,$mv,$mi,$mj)=(0);
241 while ($i>=0 and $j>=0) {
242 # Set Start Point
243 #$mi=$i;$mj=$j;
244 ($mv,$mi,$mj)=@{&getmax(\@matrix,$i,$j)}; # Local
245 # Traceback cycle for single str
246 while ($mi>=0 and $mj>=0) {
247 $as[$count] = $matrix[$mi][$mj] unless $as[$count];
248 ++$len;
249 #&traceback($a[$mi],$b[$mj],$path[$mi][$mj],$ax[$count],$ay[$count]);
250 my $path = $path[$mi][$mj];
251 if (! defined $path) { # $path == 0
252 --$len;
253 if ($len>0) {
254 @{$ax[$count]} = reverse @{$ax[$count]};
255 @{$ay[$count]} = reverse @{$ay[$count]};
256 ++$count;
257 $len=0;
259 --$mi;--$mj;
260 $i=$mi;$j=$mj;
261 ($mv,$mi,$mj)=@{&getmax(\@matrix,$i,$j)}; # Local
262 } elsif ($path == 1) {
263 push @{$ax[$count]},$a[$mi];push @{$ay[$count]},$b[$mj];
264 --$mi;--$mj;
265 } elsif ($path == 2) {
266 push @{$ax[$count]},$a[$mi];push @{$ay[$count]},"-";
267 --$mi;
268 } elsif ($path == 3) {
269 push @{$ax[$count]},"-";push @{$ay[$count]},$b[$mj];
270 --$mj;
271 } else {
272 die;
276 for ($i=1;$i<=$count;$i++) {
277 print "No. $i:\n";
278 print 'X:[',@{$ax[$i-1]},"]\n",'Y:[',@{$ay[$i-1]},"]\n",'Score:',$as[$i-1],"\n";
281 sub getmax($$$) { # 废弃 {
282 my ($arref,$ii,$ij)=@_;
283 my ($mv,$mi,$mj,$i,$j)=(-1,-1,-1);
284 for ($i=1;$i<=$ii;$i++) {
285 for ($j=1;$j<=$ij;$j++) {
286 if ($mv == $$arref[$i][$j]) { # keep ($mi+$mj) max, to achieve max(mi^2+mj^2)
287 if (($mi+$mj) < ($i+$j)) {
288 #$mv=$$arref[$i][$j];
289 $mi=$i;$mj=$j;
291 } elsif ($mv < $$arref[$i][$j]) {
292 $mv=$$arref[$i][$j];
293 $mi=$i;$mj=$j;
297 return [$mv,$mi,$mj];
300 sub getMaxM(@) {
301 my %Dat;
302 for my $c (@_) {
303 my @cigar = $c =~ /(\d+)(\w)/g;
304 my $maxM = 0;
305 while (@cigar) {
306 my ($len,$op) = splice(@cigar,0,2);
307 if ($op eq 'M') {
308 $maxM = $len if $maxM < $len;
311 push @{$Dat{$maxM}},$c;
313 my @keys = sort { $b <=> $a } keys %Dat;
314 # 可以再加上 totalM 最大,然后是 Gap-Open 最少。
315 return $Dat{$keys[0]};
317 sub doAln($$$) {
318 my ($refile,$query,$dir) = @_;
319 #my $pid = open2( \*READER, \*WRITER, "$RealBin/bin/needle" );
320 #WRITER->autoflush(); # default here, actually
321 my $str = "$RealBin/bin/water $refile <(echo $query) -scircular2 -gapopen 10 -gapextend 0.5 stdout -aformat3 sam";
322 my $revstr = "$RealBin/bin/water $refile <(echo $query) -sreverse1 -scircular2 -gapopen 10 -gapextend 0.5 stdout -aformat3 sam";
323 #print "[$str]\n";
324 #print WRITER $toAln,"\n";
325 my %Result;
326 my @Ret = `bash -c \'$str\' 2>/dev/null`;
327 my @tmp = split /\t/,$Ret[2]; # 没空,认为第一个结果最好。
328 $tmp[0] = '+';
329 push @{$Result{$tmp[5]}},[@tmp];
330 @Ret = `bash -c \'$revstr\' 2>/dev/null`;
331 @tmp = split /\t/,$Ret[2];
332 $tmp[0] = '-';
333 push @{$Result{$tmp[5]}},[@tmp];
334 #ddx \%Result;
335 my $t = getMaxM(keys %Result);
336 my $k = $t->[0]; # 先就按一个来了。
337 my $ret;
338 if ($dir == 1) {
339 if ((($Result{$k}->[0][0] eq '+' and $k =~ /^(\d+)M/) or ($Result{$k}->[0][0] eq '-' and $k =~ /(\d+)M$/)) and $1 >= $main::minVirMapLen) { # water的sam格式使用 H 表示前面比不上的,这里先不管这种情况。故 $Result{$k}->[0][0] 也就不用管了。
340 my $p = $Result{$k}->[0][3];
341 # if (0 and $Result{$k}->[0][0] eq '-') {
342 # my $rlen = bam_cigar2rlen($k);
343 # $p += $rlen; die;
345 $ret = [$Result{$k}->[0][0],$p,$1]; # 假设病毒只有一根,而且比对没hard-clip。读通时右端的人的序列无视。
347 } else {
348 if ((($Result{$k}->[0][0] eq '+' and $k =~ /(\d+)M$/) or ($Result{$k}->[0][0] eq '-' and $k =~ /^(\d+)M/)) and $1 >= $main::minVirMapLen) { # water的sam格式使用 H 表示前面比不上的,这里先不管这种情况。故 $Result{$k}->[0][0] 也就不用管了。
349 my $p = $Result{$k}->[0][3];
350 if ($Result{$k}->[0][0] eq '+') {
351 my $rlen = bam_cigar2rlen($k);
352 $p += $rlen;
353 } elsif ($Result{$k}->[0][0] eq '-') { # 负链`water -sreverse1`返回的`POS`是较大坐标,即反向后的左端点。
354 my $rlen = bam_cigar2rlen($k);
355 $p -= $rlen;
357 $ret = [$Result{$k}->[0][0],$p,$1]; # 假设病毒只有一根,而且比对没hard-clip。读通时右端的人的序列无视。
359 } # 假设病毒左端点正确;右端无通读的人的序列(这个应该已经处理对了)。
360 return $ret;
363 sub mergeAln($$) {
364 my ($ref,$query) = @_;
365 my $pid = open2( \*READER, \*WRITER, "$RealBin/bin/alnmethly" );
366 WRITER->autoflush(); # default here, actually
367 my @Dat = ([$query,undef],[revcom($query),undef]);
368 $_->[1] = guessMethyl($_->[0]) for @Dat;
369 @Dat = sort { $a->[1] cmp $b->[1] } @Dat;
370 unshift @Dat,[$ref,guessMethyl($ref)];
371 for (@Dat) {
372 if ($_->[1] eq '1CT') {
373 $_->[0] =~ s/[CT]/Y/ig;
374 } elsif ($_->[1] eq '2GA') {
375 $_->[0] =~ s/[GA]/R/ig;
377 $_->[0]='N' if $_->[1] eq 'N'; # 'N' for empty seq.
379 #ddx \@Dat;
380 print WRITER join("\n",map {$_->[0]} @Dat),"\n";
381 my %Result;
382 while(<READER>) {
383 chomp;
384 if (/^Path(\d): ([IDMmR]+),(\d+)$/) {
385 #print "[$_] [$1] [$2] [$3]\n";
386 $Result{$1} = [$2,$3];
389 waitpid( $pid, 0 );
390 close READER;
391 my @Resu = sort { $Result{$b}->[1] <=> $Result{$a}->[1] } keys %Result;
392 #print join("\n",map {$_->[0]} @Dat),"\n";
393 #ddx $Result{$Resu[0]},$Resu[0];
394 my @ResDat = split //,$Result{$Resu[0]}->[0];
395 my @QuaryDat = split //,$Dat[$Resu[0]]->[0];
396 my @Refdat = split //,$ref;
397 #print "R:@Refdat\nQ:@QuaryDat\nA:@ResDat\n";
398 my @AlnDat;
399 my ($Refp,$Querp) = (0,0);
400 for my $p (0 .. $#ResDat) {
401 if ($ResDat[$p] =~ /^[DMmR]$/) {
402 ++$Querp if $ResDat[$p] eq 'D';
403 my $theBases = $IUB{$Refdat[$p-$Refp]} or die;
404 push @$theBases,@$theBases if $#$theBases==0;
405 ++$AlnDat[$p]->{$_} for @$theBases;
407 if ($ResDat[$p] =~ /^[MmR]$/) {
408 my $theBases = $IUB{$QuaryDat[$p-$Querp]};
409 push @$theBases,@$theBases if $#$theBases==0;
410 ++$AlnDat[$p]->{$_} for @$theBases;
411 } elsif ($ResDat[$p] eq 'I') {
412 ++$Refp;
413 my $theBases = $IUB{$QuaryDat[$p-$Querp]};
414 push @$theBases,@$theBases if $#$theBases==0;
415 ++$AlnDat[$p]->{$_} for @$theBases;
418 #ddx \@AlnDat;
419 my $reseq;
420 for (@AlnDat) {
421 my %t = %{$_};
422 my @k = sort { $t{$a} <=> $t{$b} } keys %t;
423 my $iub;
424 if (@k>1 and $t{$k[0]} == $t{$k[1]}) {
425 $iub = join('',sort @k[0,1]);
426 } else {
427 $iub = $k[0];
429 $reseq .= $REV_IUB{$iub};
431 return [$reseq,$Result{$Resu[0]}->[0]];
434 sub warnFileExist(@) {
435 my %NotFound;
436 for (@_) {
437 next if /^\s*$/;
438 ++$NotFound{$_} unless -f $_;
440 my @NF = sort keys %NotFound;
441 if (@NF > 0) {
442 warn "[!!!] File NOT Found:[",join('],[',@NF),"]\n";
444 #warn "[Debug] @_\n";
445 return join(' ',@_);
447 # http://perldoc.perl.org/perlfaq4.html#How-do-I-expand-function-calls-in-a-string%3f
449 =pod
450 #define BAM_CIGAR_STR "MIDNSHP=XB"
451 /* bam_cigar_type returns a bit flag with:
452 * bit 1 set if the cigar operation consumes the query
453 * bit 2 set if the cigar operation consumes the reference
455 * For reference, the unobfuscated truth table for this function is:
456 * BAM_CIGAR_TYPE QUERY REFERENCE
457 * --------------------------------
458 * BAM_CMATCH 1 1
459 * BAM_CINS 1 0
460 * BAM_CDEL 0 1
461 * BAM_CREF_SKIP 0 1
462 * BAM_CSOFT_CLIP 1 0
463 * BAM_CHARD_CLIP 0 0
464 * BAM_CPAD 0 0
465 * BAM_CEQUAL 1 1
466 * BAM_CDIFF 1 1
467 * BAM_CBACK 0 0
468 * --------------------------------
470 =cut
471 sub bam_cigar2rlen($) {
472 my @cigar = $_[0] =~ /(\d+\w)/g;
473 my $pos = 0;
474 for (@cigar) {
475 if (/(\d+)([MDN=X])/) {
476 $pos += $1;
479 return $pos;
481 sub bam_cigar2qlen($) {
482 my @cigar = $_[0] =~ /(\d+\w)/g;
483 #ddx \@cigar;
484 my $pos = 0;
485 for (@cigar) {
486 #/(\d+)([MIS=X])/ or die $_[0],",$_";
487 if (/(\d+)([MIS=X])/) {
488 $pos += $1;
491 return $pos;
493 sub getSeqCIGAR($$) {
494 my ($minLeft,$i) = @_;
495 my $firstSC=0;
496 my $deltraPos = $i->[3] - $minLeft;
497 my @cigar = $i->[5] =~ /(\d+[MIDNSHP=XB])/g;
498 my $pos = $i->[3];
499 my $offset = $pos - $minLeft;
500 #my $cigar2qlen = bam_cigar2qlen($i->[5]);
501 my ($cursorQ,$cursorR) = (0,0);
502 my $seqCIGAR = 'B' x $offset;
503 #die $offset;
504 for (@cigar) {
505 /(\d+)([MIDNSHP=XB])/ or die;
506 if ($2 eq 'S') {
507 if ($cursorQ == 0) {
508 substr $seqCIGAR,-$1,$1,$2 x $1;
509 $firstSC = $1;
510 } else {
511 $cursorQ += $1;
512 $seqCIGAR .= $2 x $1;
514 } elsif ($2 eq 'M') {
515 $cursorQ += $1;
516 $seqCIGAR .= $2 x $1;
517 for my $p ( ($cursorQ-$1) .. ($cursorQ-1) ) {
518 my $refpos = $pos + $cursorR + $p;
519 #$ReadsMPos2Ref{$p} = $refpos;
521 $cursorR += $1;
522 } elsif ($2 eq 'I') {
523 $cursorQ += $1;
524 $seqCIGAR .= $2 x $1;
525 } elsif ($2 eq 'D') {
526 $cursorR += $1;
527 } else {die;}
529 return ($firstSC,$seqCIGAR);
531 sub grepmerge($$) {
534 # "sf167_E_Ref_727851_728000_728150_Vir_+_214_333_R_150_90",
535 # 145,
536 # "chr1",
537 # 137890,
538 # 0,
539 # "40S50M",
540 # "gi|59585|emb|X04615.1|",
541 # 231,
542 # 0,
543 # "TTATGTGTTTTGGTTAAAATTTGTAGTTTTTAATTTTTAATTAATTAGGAAGAAGAGTTGGGTTTGGAGAGAATGTTTGGAGGGTGTAAG",
544 # "BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH",
545 # "AS:i:85",
546 # "XS:i:24",
547 # "RG:Z:Fn90_m13FG",
548 # "YC:Z:CT",
549 # "YD:Z:r",
550 # "Zd:Z:H",
551 # "Zc:i:117",
552 # ],
553 # [同上,另一条Read], ...
555 my $DEBGUHERE = 0;
556 my ($minLeft,$maxLS,@clipReads,%relPoses,%relPosesFR,$chr)=(5000000000,0);
557 my (%Results,%ChrMRNMs,%ChrREf);
558 for my $i (@{$_[0]}) {
559 my $MRNM = $i->[6];
560 ++$ChrMRNMs{$MRNM}; # use the most abondunce one.
561 ++$ChrREf{$i->[2]}
563 my $mostVchr = (sort {$ChrMRNMs{$b}<=>$ChrMRNMs{$a}} keys %ChrMRNMs)[0];
564 my $mostHchr = (sort {$ChrREf{$b}<=>$ChrREf{$a}} keys %ChrREf)[0];
565 for my $i (@{$_[0]}) {
566 next if ( $i->[6] ne $mostVchr ) or ($i->[6] ne $i->[2]) or ($i->[2] ne $mostHchr); # skip other MRNMs
567 my @cigar = $i->[5] =~ /(\d+[MIDNSHP=XB])/g;
568 my $flag = 0;
569 #ddx \@cigar; die $i->[5];
570 for (@cigar) {
571 if (/(\d+)S/) {
572 $flag = 1 if $1 >= $main::minSoftClip;
575 if ($flag) {
576 unless (defined $chr) {
577 $chr = $i->[2];
578 } else {
579 next if $chr ne $i->[2];
581 #print " $i->[2]:$i->[3]:$i->[-1]\n" if $DEBGUHERE;
582 push @clipReads,$i;
583 my $left = $i->[3];
584 if ($cigar[0] =~ /(\d+)S/) {
585 $maxLS = $1 if $maxLS < $1;
586 $left -= $1;
588 $minLeft = $left if $minLeft > $left;
591 return (\%Results,'','') unless @clipReads;
592 #ddx \@clipReads;
593 #my (@b2pCIGAR,@b2pDeriv);
594 my @ReadsCIGAR;
595 print "$minLeft\n" if $DEBGUHERE;
596 for my $i (@clipReads) {
597 #my %ReadsMPos2Ref;
598 my ($firstSC,$seqCIGAR) = getSeqCIGAR($minLeft,$i);
599 my $offset = $i->[3] - $minLeft;
600 my @Poses = getDeriv("M","B",$seqCIGAR);
601 for (@Poses) {
602 ++$relPosesFR{$_};
603 if ($_ >= 0) {
604 ++$relPoses{$_+0.5};
605 } else {
606 ++$relPoses{-$_-0.5};
609 if ($DEBGUHERE) {
610 my @thePoses = sort {$relPoses{$b} <=> $relPoses{$a}} keys %relPoses;
611 my @absPoses;
612 for (@Poses,@thePoses) {
613 if ($_ < 0) {
614 push @absPoses,-($minLeft - $_);
615 } else {
616 push @absPoses,($minLeft + $_);
619 print "$i->[5]\t$offset\t@{$i}[0..4]; @Poses, @thePoses -> @absPoses\n$seqCIGAR\n";
620 print 'B' x ($offset-$firstSC),$i->[9],"\n";
621 my $tmpstr = ' ' x length($seqCIGAR);
622 for my $p (@Poses) {
623 if ($p>=0) {
624 substr $tmpstr,$p,1,']';
625 } else {
626 substr $tmpstr,-$p,1,'[';
629 print $tmpstr,"\n";
631 push @ReadsCIGAR,$seqCIGAR;
633 my @thePosesA = sort {$relPoses{$b} <=> $relPoses{$a}} keys %relPoses;
634 my $thePos = int($thePosesA[0]); # $thePos = $thePosesA[0] - 0.5, 目前只考虑最多的clip位置。
635 my @usingPoses;
636 push @usingPoses,$thePos if exists $relPosesFR{$thePos};
637 push @usingPoses,-1-$thePos if exists $relPosesFR{-1-$thePos};
638 if (@usingPoses == 1) { # 先找最值处的']'&'[',若没有成对,再在对侧找成对的。只提最近的取一对。
639 if ($usingPoses[0] < 0) {
640 for my $p ( ($thePos-$main::posAround) .. $thePos ) {
641 if (exists $relPosesFR{$p}) {
642 push @usingPoses,$p;
645 #@usingPoses = @usingPoses[0,-1] if @usingPoses > 2;
646 if (@usingPoses > 2) {
647 my @t = @usingPoses;
648 shift @t;
649 @t = sort {$relPosesFR{$b} <=> $relPosesFR{$a} || $b <=> $a } @t;
650 @usingPoses = ($usingPoses[0],$t[0]);
652 } elsif ($usingPoses[0] > 0) {
653 for my $p ( ($thePos+1) .. ($thePos+$main::posAround+1) ) {
654 if (exists $relPosesFR{-$p}) {
655 push @usingPoses,-$p;
658 #@usingPoses = @usingPoses[0,1] if @usingPoses > 2;
659 if (@usingPoses > 2) {
660 my @t = @usingPoses;
661 shift @t;
662 @t = sort {$relPosesFR{$b} <=> $relPosesFR{$a} || $a <=> $b } @t;
663 @usingPoses = ($usingPoses[0],$t[0]);
665 } # 坐标需要核实下。done.
667 my %Bases;
668 for my $i (@clipReads) {
669 my $mtype;
670 if ($_[1] eq 'bwa-meth') {
671 my ($YC) = grep /^YC:Z:/,@$i;
672 if ($i->[1] & 16) { # r, 用`YD:Z:f`似乎更好。目前没管PBAT的。
673 if ($YC eq 'YC:Z:CT') {
674 $mtype = 'GA'; # R
675 } elsif ($YC eq 'YC:Z:GA') {
676 $mtype = 'CT'; # Y
677 } else {die;}
678 } else { # f
679 if ($YC eq 'YC:Z:CT') {
680 $mtype = 'CT';
681 } elsif ($YC eq 'YC:Z:GA') {
682 $mtype = 'GA';
683 } else {die;}
685 } elsif ($_[1] eq 'BSseeker2') {
686 my ($XO) = grep /^XO:Z:/,@$i;
687 if ($XO eq 'XO:Z:+FR') {
688 $mtype = 'CT'; # R
689 } elsif ($XO eq 'XO:Z:-FR') {
690 $mtype = 'GA'; # Y
691 } elsif ($XO eq 'XO:Z:+RF') {
692 $mtype = 'GA'; # Y
693 } elsif ($XO eq 'XO:Z:-RF') {
694 $mtype = 'CT'; # Y
695 } else {die;}
696 } elsif ($_[1] eq 'bwa') {
697 $mtype = '-';
699 #print "$mtype $i->[9] $YC\n";
700 my ($firstSC,$seqCIGAR) = getSeqCIGAR($minLeft,$i);
701 my $offset = $i->[3] - $minLeft - $firstSC;
702 for my $p (@usingPoses) {
703 my ($tlen,$tmp,$vseq,$vqual,$t)=(0);
704 my $readlen = length $i->[9];
705 if ($_[1] eq 'BSseeker2' and $i->[10] eq '*') {
706 $i->[10] = '@' x $readlen;
708 if ($p > 0 and $p < length($seqCIGAR)) { # M]S
709 $tmp = substr($seqCIGAR,$p+1) or next;
710 # unless ($tmp) {
711 # ddx $i;
713 =pod
714 11860 sf340_A_Ref_61780248_61780667_61781087_Vir_+_701_820_R_420_90 99 chr1 61780588 60 83M7S = 61780798 300
715 11861 sf450_F_Ref_61780248_61780667_61781087_Vir_+_701_820_R_420_90 609 chr1 61780588 0 53S37M = 61780908 410 Fs90_m458AE.bam,原因不明。MapQ=0.
716 11862 sf10_1_Ref_61780248_61780667_61781087_Vir_+_701_820_R_420_90 147 chr1 61780588 60 83M7S = 61780258 -413
717 =cut
718 $tmp =~ /^(S+)/;
719 if (defined $1) {
720 $tlen = length $1;
721 $t = $p+1 -$offset;
722 my $tl = $readlen - $t;
723 if (abs($t) >= $readlen) {
724 #ddx $i; print "-+->Off:$offset, Start:$t, Len:$tlen\n";
725 #$tlen = $tl;
726 print STDERR ']';
727 next; # CNS部分,不容有失,这个偶发问题暂时跳过
728 #$t = -$readlen; # 修改后Fk150_m2D的TP下降,是故,冒烟吧。
730 $vseq = substr $i->[9],$t,$tlen;
731 $vqual = substr $i->[10],$t,$tlen;
733 } elsif ($p < 0 and (-$p-1) < length($seqCIGAR)) { # S[M
734 $tmp = substr($seqCIGAR,0,-$p-1) or next;
735 $tmp =~ /(S+)$/;
736 if (defined $1) {
737 $tlen = length $1;
738 $t = -$p-1 - $tlen -$offset;
739 if (abs($t) >= $readlen) {
740 #ddx $i; print "--->Off:$offset, Start:$t, Len:$tlen\n";
741 print STDERR '[';
742 next; # 不想倒着算了
743 #$t = $readlen-1;
745 $vseq = substr $i->[9],$t,$tlen;
746 $vqual = substr $i->[10],$t,$tlen;
749 if ($vseq and length($vseq) >= $main::minVirLen) {
750 push @{$Bases{$p}},[$vseq,$vqual,$mtype];
752 # print ">>>$tlen, $tmp, $p\n$vseq\n$vqual\n";
753 # print substr($seqCIGAR,$p,8),"\n" if $p>0;
754 # print substr($seqCIGAR,-$p-2,8),"\n" if $p<0;
758 ddx \%relPoses,\%relPosesFR,\@usingPoses,\%Bases if $DEBGUHERE;
759 #my %Results;
760 for my $p (@usingPoses) {
761 my (@theReads,$absPos);
762 if ($p<0) {
763 $absPos = $p - $minLeft;
764 for (@{$Bases{$p}}) {
765 my $x = reverse $_->[0];
766 my $y = reverse $_->[1];
767 push @theReads,[$x,$y,$_->[2]];
769 } else {
770 $absPos = $p + $minLeft;
771 if ($_[1] eq 'BSseeker2') {
772 next unless exists $Bases{$p}; # '78S5M1D30M1D17M20S' caused trouble
774 @theReads = @{$Bases{$p}};
776 my $mergstr = mergeStr(\@theReads);
777 $mergstr = reverse $mergstr if $p < 0;
778 #ddx \@theReads;
779 #ddx $Bases{$p};
780 my $depth = @theReads;
781 #print "$p,$depth\t$mergstr\n";
782 $Results{$absPos} = [$depth,$mergstr];
784 print "@usingPoses\t",'-' x 25,"\n" if $DEBGUHERE;
785 # my (%absPoses,%absPosesFR);
786 # for (keys %relPoses) {
787 # $absPoses{$minLeft + $_} = $relPoses{$_};
789 # for (keys %relPosesFR) {
790 # if ($_ > 0) {
791 # $absPosesFR{$_ + $minLeft} = $relPosesFR{$_};
792 # } else {
793 # $absPosesFR{$_ - $minLeft} = $relPosesFR{$_};
796 return (\%Results,$mostHchr,$mostVchr);
798 sub mergeStr($) {
800 # ["TTTGTTGATAA", "BBBBBBBBBBB", "CT"],
801 # ["CTTATTAACAAAAATC", "BBBBBBBBBBBBBBBB", "GA"],
802 # ["TTTGTTGATAAGAATTTTTATAATATTATAGAGTTTAGA", ("B" x 39), "CT"],
803 # ["TTTGTTGATAAGAATTTTTATAATATTATAGAGTTTAGATTT", ("B" x 42), "CT"],
805 my @Strs = @{$_[0]};
806 my ($maxLen,$merged)=(0);
807 for (@Strs) {
808 my $l = length $_->[0];
809 $maxLen = $l if $maxLen < $l;
811 for my $p (0 .. ($maxLen-1)) {
812 my (%Col,%ColBp) = ();
813 for (@Strs) {
814 my $str = $_->[0];
815 my $qual = $_->[1];
816 if ($p < length($str)) {
817 my $c = substr $str,$p,1;
818 my $q = substr $qual,$p,1;
819 if ($_->[2] eq 'CT' and $c eq 'T') {
820 $c = 'Y';
821 } elsif ($_->[2] eq 'GA' and $c eq 'A') {
822 $c = 'R';
823 } # $_->[2] eq '-' means leaving the same.
824 ++$ColBp{$c};
825 if ($c eq 'Y') { # CT
826 $Col{$_} += $main::Qual2LgP{$q}->[4] for qw(G A);
827 $Col{$_} += $main::Qual2LgP{$q}->[3] for qw(C T);
828 if ($main::EnableWretchedSequel) {
829 $Col{$_} += -10*log(1/8);
831 } elsif ($c eq 'R') { # GA
832 $Col{$_} += $main::Qual2LgP{$q}->[4] for qw(C T);
833 $Col{$_} += $main::Qual2LgP{$q}->[3] for qw(G A);
834 if ($main::EnableWretchedSequel) {
835 $Col{$_} += -10*log(1/8);
837 } else {
838 $Col{$_} += $main::Qual2LgP{$q}->[2] for qw(A T C G);
839 $Col{$c} += $main::Qual2LgP{$q}->[1];
840 if ($main::EnableWretchedSequel) {
841 $Col{$_} += -10*log(1/4);
846 my ($res,@Bps);
847 if (keys(%ColBp)==1) {
848 $res = (keys(%ColBp))[0];
849 } else {
850 @Bps = sort { $Col{$a} <=> $Col{$b} } keys %Col; # choose the smaller one.
851 if ( $Col{$Bps[1]} - $Col{$Bps[0]} < 0.02 ) { # q=3时,-10*($lgbp-$lgbq)=0.020624399283,最小。
852 my @t = sort { $a cmp $b } @Bps[0,1];
853 if (@t > 1) {
854 $res = $REV_IUB{$t[0].$t[1]};
855 } else {
856 $res = $REV_IUB{$t[0]}; # Same bases がな?
858 } else {
859 $res = $Bps[0];
862 $merged .= $res;
863 # ddx \%Col,\%ColBp,$res,\@Bps;
865 return $merged;
868 sub getDeriv($$$) {
869 my ($interest,$bypass,$str) = @_;
870 my $len = length $str;
871 my $Inserted = 0; # I 与 D 会造成 QSEQ 与 REF 的不同步,必须改成返回两套坐标才行。时间紧迫,先无视。
872 my $Deleted = 0; # 现在的 $seqCIGAR 不含‘D’,在遇到D时会左偏。
873 my @interestedPoses; # define as gap after index, which is from 0. Thus cmp between this & next.
874 for my $i (0 .. $len-2) {
875 my $thischar = substr $str,$i,1;
876 my $nextchar = substr $str,$i+1,1;
877 #++$Inserted if $thischar eq 'I';
878 #++$Deleted if $thischar eq 'D';
879 if ($thischar eq $nextchar or $thischar=~/[$bypass]/) {
880 next;
881 } elsif ($thischar eq $interest) { # MS -> HV
882 push @interestedPoses,$i;
883 } elsif ($nextchar eq $interest) { # SM -> VH
884 push @interestedPoses,-($i+1);
885 } else {
886 next;
889 return @interestedPoses;
892 sub mergehash($$) {
893 my ($sink,$in) = @_;
894 return unless ref($in);
895 return unless ref($in) eq "HASH";
896 for my $k (keys %{$in}) {
897 if (exists $sink->{$k}) {
898 $sink->{$k} += $in->{$k};
899 } else {
900 $sink->{$k} = $in->{$k};
905 sub sortWsum {
906 if ($a eq '=Sum=') { return -1; }
907 elsif ($b eq '=Sum=') { return 1; }
908 else { return $a cmp $b; }
911 sub do_patch {
912 my $Refilename = warnFileExist($main::RefConfig->{$main::RefFilesSHA}->{'Refilename'});
913 my (%tID,%tFH);
914 for (@{$main::Config->{$main::FileData}->{'='}}) {
915 /([^.]+)\.(\d)/ or die;
916 $tID{$1}{$2} = $_;
918 for my $k (keys %tID) {
919 my $outf = "$main::RootPath/${main::ProjectID}_analyse/.$k.analyse";
920 # Well, this is the f*cking reality. You know it.
921 my $outf2 = "$main::RootPath/${main::ProjectID}_analyse/$k.analyse";
922 open IN,'<',$outf or die;
923 open OUT,'>',$outf2 or die;
924 #print OUT join("\t",qw[ID Host_Chr H_Start H_End Virus_Chr Strand V_Start V_End]),"\n";
925 print OUT join("\t",qw[ID Chr BreakPoint1 BreakPoint2 Virus Strand Start End]),"\n"; # Well, he prefer this.
926 my %Results;
927 while (<IN>) {
928 chomp;
929 my @dat = split /\t/;
930 #$dat[4] = $retVirus[0]->[0]; # Well, just do it.
931 $Results{$dat[1]}{$dat[2]} = \@dat;
933 for my $chr (sort keys %Results) {
934 my @Poses = sort {$a<=>$b} keys %{$Results{$chr}};
935 if (@Poses == 1) {
936 my @tmp = @{$Results{$chr}{$Poses[0]}};
937 my $last1 = pop @tmp;
938 my $last2 = pop @tmp;
939 my $last3 = pop @tmp;
940 my @Virus;
941 my @tVr = split /,/,$last1;
942 for (@tVr,$last2,$last3) {
943 push @Virus,$_ if $_ != -1;
945 print OUT join("\t",@tmp,$Virus[0],$Virus[-1]),"\n";
946 next;
949 my @TTT;
950 for my $i (0 .. $#Poses) {
951 if (($i == 0) or ($Poses[$i] - $Poses[$i-1] <= 20)) {
952 push @TTT,$Results{$chr}{$Poses[$i]};
953 } else {
954 if (@TTT) {
955 my (@Virus,@Hum);
956 for my $tt (@TTT) {
957 push @Hum,$tt->[2];
958 push @Hum,$tt->[3] if $tt->[3] != -1;
959 push @Virus,$tt->[6] if $tt->[6] != -1;
960 push @Virus,$tt->[7] if $tt->[7] != -1;
961 my @tVr = split /,/,$tt->[8];
962 for (@tVr) {
963 push @Virus,$_ if $_ != -1;
966 @Hum = sort {$a<=>$b} @Hum;
967 @Virus = sort {$a<=>$b} @Virus;
968 push @Hum,-1 if scalar @Hum == 1;
969 if (scalar @Virus == 1) {
970 push @Virus,-1;
971 } else {
972 my $vlen = $Virus[1] - $Virus[0];
973 if ($vlen < $main::MinVirusLength) {
974 next;
977 print OUT join("\t",$Results{$chr}{$Hum[0]}->[0],$chr,$Hum[0],$Hum[-1],$Results{$chr}{$Hum[0]}->[4],$Results{$chr}{$Hum[0]}->[5],$Virus[0],$Virus[-1]),"\n";
979 @TTT = ($Results{$chr}{$Poses[$i]});
982 if (@TTT) {
983 my @tmp = @{$TTT[0]};
984 my $last1 = pop @tmp;
985 my $last2 = pop @tmp;
986 my $last3 = pop @tmp;
987 my @Virus;
988 my @tVr = split /,/,$last1;
989 for (@tVr,$last2,$last3) {
990 push @Virus,$_ if $_ != -1;
992 print OUT join("\t",@tmp,$Virus[0],$Virus[-1]),"\n";
993 #print OUT join("\t",@{$TTT[0]}),"\n";
996 close IN; close OUT;
997 # EOF this silly thing, which is favored by the mankind.