9 my $fileStr = join(',',@_);
10 return Digest
::SHA
::sha1_base64
($fileStr);
14 my ($HostRefName,$VirusRefName)=@_;
15 my $HostChar = substr $HostRefName,0,1;
16 my $VirusChar = substr $VirusRefName,0,1;
17 my $tlen = length($VirusRefName) -1;
19 while ( ($VirusChar eq $HostChar) and (++$i<=$tlen) ) {
20 my $tmpChar = substr $VirusRefName,$i,1;
21 $VirusChar = $tmpChar if $tmpChar=~/\w/;
22 warn "$i - $tmpChar $VirusChar -\n";
24 $VirusChar = 'V' if $VirusChar eq $HostChar;
25 return $HostChar.$VirusChar;
29 my ($ChrA,$PosA) = split /\t/,$_[0];
30 my ($ChrB,$PosB) = split /\t/,$_[1];
32 return $PosA <=> $PosB;
34 return 1 if exists $main::VirusChrIDs
{$ChrA};
35 return -1 if exists $main::VirusChrIDs
{$ChrB};
42 my @cigar = $cigar =~ /(\d+)(\w)/g;
43 my ($reflen,$maxM,$readlen)=(0,0,0);
45 my ($len,$op) = splice(@cigar,0,2);
49 $maxM = $len if $maxM < $len;
52 $reflen += $len if $op eq 'D';
53 $readlen += $len if $op eq 'I';
55 return ($reflen,$readlen);
59 my ($isHost,$rChrRange,$aRead,$rStore,$cid,$i) = @_;
60 my ($reflen,$readlen) = cigar2poses
($aRead->[5]);
61 my $thisehPos = $aRead->[3]+$reflen;
62 my $ret; # 1 -> again, 0 -> merged
63 if ($i > 1 or $isHost == 0 or scalar(keys %{$rChrRange})==0) {
64 # same PE or assume overlap on Virus
66 if (keys %{$rChrRange} and exists $rChrRange->{$aRead->[2]}) {
67 if ($aRead->[3] <= $rChrRange->{$aRead->[2]}->[0]) {
68 $rChrRange->{$aRead->[2]}->[0] = $aRead->[3];
70 if ($thisehPos >= $rChrRange->{$aRead->[2]}->[1]) {
71 $rChrRange->{$aRead->[2]}->[1] = $thisehPos;
74 $rChrRange->{$aRead->[2]} = [ $aRead->[3],$thisehPos,0 ];
76 } elsif (exists $rChrRange->{$aRead->[2]}) {
77 if ($aRead->[3] <= $rChrRange->{$aRead->[2]}->[1]) { # overlap
78 #die unless $thisehPos >= $rChrRange->{$aRead->[2]}->[1];
79 $rChrRange->{$aRead->[2]}->[1] = $thisehPos if $thisehPos >= $rChrRange->{$aRead->[2]}->[1];
84 } else { # different HostChr
88 my $tid = join("\n",$cid,$i);
90 ++$rChrRange->{$aRead->[2]}->[2];
94 sub formatChrRange
($) {
97 for my $c (sort keys %{$rChrRange}) {
98 push @ret, "${c}:".$rChrRange->{$c}->[0].'-'.$rChrRange->{$c}->[1].':'.$rChrRange->{$c}->[2];
100 return join(',',@ret);
105 $str =~ tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/;
106 my $rev = reverse $str;
107 $rev =~ tr/[](){}<>/][)(}{></;
114 A
=> 0, G
=> 0, C
=> 0, T
=> 0,
117 for (split //,$seq) {
120 my $seqlen = length($seq) - $BaseCnt{'N'};
121 return 'N' if $seqlen == 0;
122 my @Cnts = sort { $BaseCnt{uc $b} <=> $BaseCnt{uc $a} } keys %BaseCnt;
123 #ddx [\@Cnts,\%BaseCnt];
124 if ($BaseCnt{'C'}<=$seqlen*$main::methly3BaseErrRate
and $BaseCnt{'T'}>0) {
126 } elsif ($BaseCnt{'G'}<=$seqlen*$main::methly3BaseErrRate
and $BaseCnt{'A'}>0) {
133 # 由于include在main前面,所以可以在此统一计算打分矩阵。
134 my ($match,$methylmatch,$mismatch,$INDEL) = (2,1,-3,-4);
135 our (%MatrixO,%MatrixFct,%MatrixRga);
136 my @Bases = sort qw(A C G T);
137 # $ perl -e '@Bases = sort qw(A T C G); print join("|",@Bases),"\n"'
139 for my $i (0 .. $#Bases) {
140 for my $j ($i .. $#Bases) {
141 my $str = $Bases[$i] . $Bases[$j];
143 $MatrixO{$str} = $MatrixFct{$str} = $MatrixRga{$str} = $match;
145 $MatrixO{$str} = $mismatch;
146 if ($str =~ /^[CT]+$/) {
147 $MatrixFct{$str} = $methylmatch;
149 $MatrixFct{$str} = $mismatch;
151 if ($str =~ /^[GA]+$/) {
152 $MatrixRga{$str} = $methylmatch;
154 $MatrixRga{$str} = $mismatch;
159 #ddx (\%MatrixO,\%MatrixFct,\%MatrixRga);
160 # { AA => 2, AC => -3, AG => -3, AT => -3, CC => 2, CG => -3, CT => -3, GG => 2, GT => -3, TT => 2 },
161 # { AA => 2, AC => -3, AG => -3, AT => -3, CC => 2, CG => -3, CT => 1, GG => 2, GT => -3, TT => 2 },
162 # { AA => 2, AC => -3, AG => 1, AT => -3, CC => 2, CG => -3, CT => -3, GG => 2, GT => -3, TT => 2 },
165 my ($AssemHRef,$retHostARef,$retVirusARef) = @_;
167 my $retHost = $$retHostARef[0];
168 my $retVirus = $$retVirusARef[0];
169 my @froDat = sort keys %{$AssemHRef};
171 my $fro0 = shift @froDat;
172 my $result = [$AssemHRef->{$fro0}->[0]->[1]];
173 for my $fro (@froDat) {
174 #$fro0 = mergeAln( $result,$AssemHRef->{$fro}->[0],$fro0,$fro );
175 $result = mergeAln
( $result->[0],$AssemHRef->{$fro}->[0]->[1] );
177 #ddx $retHost,$result;
178 my $HostResult = mergeAln
( $retHost->[1],$result->[0] );
180 my @RefInfo = split /[:-]/,$retHost->[0];
181 my ($RefCut,$Left,$Insert,@Rcuts,@Vcuts)=(0);
182 if ($HostResult->[1] =~ /^(D*)([MmR]+)(I+)/) {
183 (undef,$Left,$Insert) = ($1,$2,$3);
184 push @Rcuts,($RefInfo[1] + length($Left)+1);
186 if ($HostResult->[1] =~ /^(I*)([MmR]+)(D+)/) {
187 ($Left,$Insert,undef) = ($1,$2,$3);
188 push @Rcuts,($RefInfo[1] + length($Left) + length($Insert)+1);
190 $RefCut = join(';',@Rcuts);
191 my $VirusResult = mergeAln
( $retVirus->[1],$result->[0] );
192 my @VirusInfo = split /[:-]/,$retVirus->[0];
193 my ($VirCut,$VirLeft,$VirInsert)=(0,0,0);
194 if ($VirusResult->[1] =~ /^(D*)([MmR]+)(I+)/) {
195 (undef,$VirLeft,$VirInsert) = ($1,$2,$3);
196 push @Vcuts,($VirusInfo[1] + length($VirLeft)+1);
198 if ($VirusResult->[1] =~ /^(I*)([MmR]+)(D+)/) {
199 ($Left,$Insert,undef) = ($1,$2,$3);
200 push @Vcuts,($VirusInfo[1] + length($Left) + length($Insert)+1);
203 $VirCut = join(';',@Vcuts);
204 return [$RefInfo[0],$RefCut,$VirusInfo[0],$VirCut,length($VirInsert)];
206 sub dynAln
($$$) { # 废弃 {
207 my ($ref,$query,$MatrixR) = @_;
208 my @a=('',split //,$ref);
209 my @b=('',split //,$query);
212 $matrix[$_][0]=0 for (0 .. $#a);
213 $matrix[0][$_]=0 for (0 .. $#b);
216 for ($i=1;$i<=$#a;$i++) {
217 for ($j=1;$j<=$#b;$j++) {
218 my $str = join('',sort ($a[$i],$b[$j]));
219 $matrix[$i][$j] = $MatrixR->{$str} + $matrix[$i-1][$j-1];
220 $path[$i][$j] = 1; # case 0: i,j are aligned
221 $sc=$matrix[$i-1][$j] + $INDEL; # case 1: i aligned to -
222 if ($sc > $matrix[$i][$j]) {
223 $matrix[$i][$j] = $sc;
226 $sc=$matrix[$i][$j-1] + $INDEL; # case 2: j aligned to -
227 if ($sc > $matrix[$i][$j]) {
228 $matrix[$i][$j] = $sc;
231 if ($matrix[$i][$j] < 0) { # Local
237 # Traceback (with shadow matrix)
238 my ($count,@ax,@ay,@as)=(0); # first is 0.
239 --$i;--$j; # outside for, $i will bigger than $#a by 1 step
240 my ($len,$mv,$mi,$mj)=(0);
241 while ($i>=0 and $j>=0) {
244 ($mv,$mi,$mj)=@
{&getmax
(\
@matrix,$i,$j)}; # Local
245 # Traceback cycle for single str
246 while ($mi>=0 and $mj>=0) {
247 $as[$count] = $matrix[$mi][$mj] unless $as[$count];
249 #&traceback($a[$mi],$b[$mj],$path[$mi][$mj],$ax[$count],$ay[$count]);
250 my $path = $path[$mi][$mj];
251 if (! defined $path) { # $path == 0
254 @
{$ax[$count]} = reverse @
{$ax[$count]};
255 @
{$ay[$count]} = reverse @
{$ay[$count]};
261 ($mv,$mi,$mj)=@
{&getmax
(\
@matrix,$i,$j)}; # Local
262 } elsif ($path == 1) {
263 push @
{$ax[$count]},$a[$mi];push @
{$ay[$count]},$b[$mj];
265 } elsif ($path == 2) {
266 push @
{$ax[$count]},$a[$mi];push @
{$ay[$count]},"-";
268 } elsif ($path == 3) {
269 push @
{$ax[$count]},"-";push @
{$ay[$count]},$b[$mj];
276 for ($i=1;$i<=$count;$i++) {
278 print 'X:[',@
{$ax[$i-1]},"]\n",'Y:[',@
{$ay[$i-1]},"]\n",'Score:',$as[$i-1],"\n";
281 sub getmax
($$$) { # 废弃 {
282 my ($arref,$ii,$ij)=@_;
283 my ($mv,$mi,$mj,$i,$j)=(-1,-1,-1);
284 for ($i=1;$i<=$ii;$i++) {
285 for ($j=1;$j<=$ij;$j++) {
286 if ($mv == $$arref[$i][$j]) { # keep ($mi+$mj) max, to achieve max(mi^2+mj^2)
287 if (($mi+$mj) < ($i+$j)) {
288 #$mv=$$arref[$i][$j];
291 } elsif ($mv < $$arref[$i][$j]) {
297 return [$mv,$mi,$mj];
303 my @cigar = $c =~ /(\d+)(\w)/g;
306 my ($len,$op) = splice(@cigar,0,2);
308 $maxM = $len if $maxM < $len;
311 push @
{$Dat{$maxM}},$c;
313 my @keys = sort { $b <=> $a } keys %Dat;
314 # 可以再加上 totalM 最大,然后是 Gap-Open 最少。
315 return $Dat{$keys[0]};
318 my ($refile,$query,$dir) = @_;
319 #my $pid = open2( \*READER, \*WRITER, "$RealBin/bin/needle" );
320 #WRITER->autoflush(); # default here, actually
321 my $str = "$RealBin/bin/water $refile <(echo $query) -scircular2 -gapopen 10 -gapextend 0.5 stdout -aformat3 sam";
322 my $revstr = "$RealBin/bin/water $refile <(echo $query) -sreverse1 -scircular2 -gapopen 10 -gapextend 0.5 stdout -aformat3 sam";
324 #print WRITER $toAln,"\n";
326 my @Ret = `bash -c \'$str\' 2>/dev/null`;
327 my @tmp = split /\t/,$Ret[2]; # 没空,认为第一个结果最好。
329 push @
{$Result{$tmp[5]}},[@tmp];
330 @Ret = `bash -c \'$revstr\' 2>/dev/null`;
331 @tmp = split /\t/,$Ret[2];
333 push @
{$Result{$tmp[5]}},[@tmp];
335 my $t = getMaxM
(keys %Result);
336 my $k = $t->[0]; # 先就按一个来了。
339 if ((($Result{$k}->[0][0] eq '+' and $k =~ /^(\d+)M/) or ($Result{$k}->[0][0] eq '-' and $k =~ /(\d+)M$/)) and $1 >= $main::minVirMapLen
) { # water的sam格式使用 H 表示前面比不上的,这里先不管这种情况。故 $Result{$k}->[0][0] 也就不用管了。
340 my $p = $Result{$k}->[0][3];
341 # if (0 and $Result{$k}->[0][0] eq '-') {
342 # my $rlen = bam_cigar2rlen($k);
345 $ret = [$Result{$k}->[0][0],$p,$1]; # 假设病毒只有一根,而且比对没hard-clip。读通时右端的人的序列无视。
348 if ((($Result{$k}->[0][0] eq '+' and $k =~ /(\d+)M$/) or ($Result{$k}->[0][0] eq '-' and $k =~ /^(\d+)M/)) and $1 >= $main::minVirMapLen
) { # water的sam格式使用 H 表示前面比不上的,这里先不管这种情况。故 $Result{$k}->[0][0] 也就不用管了。
349 my $p = $Result{$k}->[0][3];
350 if ($Result{$k}->[0][0] eq '+') {
351 my $rlen = bam_cigar2rlen
($k);
353 } elsif ($Result{$k}->[0][0] eq '-') { # 负链`water -sreverse1`返回的`POS`是较大坐标,即反向后的左端点。
354 my $rlen = bam_cigar2rlen
($k);
357 $ret = [$Result{$k}->[0][0],$p,$1]; # 假设病毒只有一根,而且比对没hard-clip。读通时右端的人的序列无视。
359 } # 假设病毒左端点正确;右端无通读的人的序列(这个应该已经处理对了)。
364 my ($ref,$query) = @_;
365 my $pid = open2
( \
*READER
, \
*WRITER
, "$RealBin/bin/alnmethly" );
366 WRITER
->autoflush(); # default here, actually
367 my @Dat = ([$query,undef],[revcom
($query),undef]);
368 $_->[1] = guessMethyl
($_->[0]) for @Dat;
369 @Dat = sort { $a->[1] cmp $b->[1] } @Dat;
370 unshift @Dat,[$ref,guessMethyl
($ref)];
372 if ($_->[1] eq '1CT') {
373 $_->[0] =~ s/[CT]/Y/ig;
374 } elsif ($_->[1] eq '2GA') {
375 $_->[0] =~ s/[GA]/R/ig;
377 $_->[0]='N' if $_->[1] eq 'N'; # 'N' for empty seq.
380 print WRITER
join("\n",map {$_->[0]} @Dat),"\n";
384 if (/^Path(\d): ([IDMmR]+),(\d+)$/) {
385 #print "[$_] [$1] [$2] [$3]\n";
386 $Result{$1} = [$2,$3];
391 my @Resu = sort { $Result{$b}->[1] <=> $Result{$a}->[1] } keys %Result;
392 #print join("\n",map {$_->[0]} @Dat),"\n";
393 #ddx $Result{$Resu[0]},$Resu[0];
394 my @ResDat = split //,$Result{$Resu[0]}->[0];
395 my @QuaryDat = split //,$Dat[$Resu[0]]->[0];
396 my @Refdat = split //,$ref;
397 #print "R:@Refdat\nQ:@QuaryDat\nA:@ResDat\n";
399 my ($Refp,$Querp) = (0,0);
400 for my $p (0 .. $#ResDat) {
401 if ($ResDat[$p] =~ /^[DMmR]$/) {
402 ++$Querp if $ResDat[$p] eq 'D';
403 my $theBases = $IUB{$Refdat[$p-$Refp]} or die;
404 push @
$theBases,@
$theBases if $#$theBases==0;
405 ++$AlnDat[$p]->{$_} for @
$theBases;
407 if ($ResDat[$p] =~ /^[MmR]$/) {
408 my $theBases = $IUB{$QuaryDat[$p-$Querp]};
409 push @
$theBases,@
$theBases if $#$theBases==0;
410 ++$AlnDat[$p]->{$_} for @
$theBases;
411 } elsif ($ResDat[$p] eq 'I') {
413 my $theBases = $IUB{$QuaryDat[$p-$Querp]};
414 push @
$theBases,@
$theBases if $#$theBases==0;
415 ++$AlnDat[$p]->{$_} for @
$theBases;
422 my @k = sort { $t{$a} <=> $t{$b} } keys %t;
424 if (@k>1 and $t{$k[0]} == $t{$k[1]}) {
425 $iub = join('',sort @k[0,1]);
429 $reseq .= $REV_IUB{$iub};
431 return [$reseq,$Result{$Resu[0]}->[0]];
434 sub warnFileExist
(@
) {
438 ++$NotFound{$_} unless -f
$_;
440 my @NF = sort keys %NotFound;
442 warn "[!!!] File NOT Found:[",join('],[',@NF),"]\n";
444 #warn "[Debug] @_\n";
447 # http://perldoc.perl.org/perlfaq4.html#How-do-I-expand-function-calls-in-a-string%3f
450 #define BAM_CIGAR_STR "MIDNSHP=XB"
451 /* bam_cigar_type returns a bit flag with:
452 * bit 1 set if the cigar operation consumes the query
453 * bit 2 set if the cigar operation consumes the reference
455 * For reference, the unobfuscated truth table for this function is:
456 * BAM_CIGAR_TYPE QUERY REFERENCE
457 * --------------------------------
468 * --------------------------------
471 sub bam_cigar2rlen
($) {
472 my @cigar = $_[0] =~ /(\d+\w)/g;
475 if (/(\d+)([MDN=X])/) {
481 sub bam_cigar2qlen
($) {
482 my @cigar = $_[0] =~ /(\d+\w)/g;
486 #/(\d+)([MIS=X])/ or die $_[0],",$_";
487 if (/(\d+)([MIS=X])/) {
493 sub getSeqCIGAR
($$) {
494 my ($minLeft,$i) = @_;
496 my $deltraPos = $i->[3] - $minLeft;
497 my @cigar = $i->[5] =~ /(\d+[MIDNSHP=XB])/g;
499 my $offset = $pos - $minLeft;
500 #my $cigar2qlen = bam_cigar2qlen($i->[5]);
501 my ($cursorQ,$cursorR) = (0,0);
502 my $seqCIGAR = 'B' x
$offset;
505 /(\d+)([MIDNSHP=XB])/ or die;
508 substr $seqCIGAR,-$1,$1,$2 x
$1;
512 $seqCIGAR .= $2 x
$1;
514 } elsif ($2 eq 'M') {
516 $seqCIGAR .= $2 x
$1;
517 for my $p ( ($cursorQ-$1) .. ($cursorQ-1) ) {
518 my $refpos = $pos + $cursorR + $p;
519 #$ReadsMPos2Ref{$p} = $refpos;
522 } elsif ($2 eq 'I') {
524 $seqCIGAR .= $2 x
$1;
525 } elsif ($2 eq 'D') {
529 return ($firstSC,$seqCIGAR);
534 # "sf167_E_Ref_727851_728000_728150_Vir_+_214_333_R_150_90",
540 # "gi|59585|emb|X04615.1|",
543 # "TTATGTGTTTTGGTTAAAATTTGTAGTTTTTAATTTTTAATTAATTAGGAAGAAGAGTTGGGTTTGGAGAGAATGTTTGGAGGGTGTAAG",
544 # "BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH",
556 my ($minLeft,$maxLS,@clipReads,%relPoses,%relPosesFR,$chr)=(5000000000,0);
557 my (%Results,%ChrMRNMs,%ChrREf);
558 for my $i (@
{$_[0]}) {
560 ++$ChrMRNMs{$MRNM}; # use the most abondunce one.
563 my $mostVchr = (sort {$ChrMRNMs{$b}<=>$ChrMRNMs{$a}} keys %ChrMRNMs)[0];
564 my $mostHchr = (sort {$ChrREf{$b}<=>$ChrREf{$a}} keys %ChrREf)[0];
565 for my $i (@
{$_[0]}) {
566 next if ( $i->[6] ne $mostVchr ) or ($i->[6] ne $i->[2]) or ($i->[2] ne $mostHchr); # skip other MRNMs
567 my @cigar = $i->[5] =~ /(\d+[MIDNSHP=XB])/g;
569 #ddx \@cigar; die $i->[5];
572 $flag = 1 if $1 >= $main::minSoftClip
;
576 unless (defined $chr) {
579 next if $chr ne $i->[2];
581 #print " $i->[2]:$i->[3]:$i->[-1]\n" if $DEBGUHERE;
584 if ($cigar[0] =~ /(\d+)S/) {
585 $maxLS = $1 if $maxLS < $1;
588 $minLeft = $left if $minLeft > $left;
591 return (\
%Results,'','') unless @clipReads;
593 #my (@b2pCIGAR,@b2pDeriv);
595 print "$minLeft\n" if $DEBGUHERE;
596 for my $i (@clipReads) {
598 my ($firstSC,$seqCIGAR) = getSeqCIGAR
($minLeft,$i);
599 my $offset = $i->[3] - $minLeft;
600 my @Poses = getDeriv
("M","B",$seqCIGAR);
606 ++$relPoses{-$_-0.5};
610 my @thePoses = sort {$relPoses{$b} <=> $relPoses{$a}} keys %relPoses;
612 for (@Poses,@thePoses) {
614 push @absPoses,-($minLeft - $_);
616 push @absPoses,($minLeft + $_);
619 print "$i->[5]\t$offset\t@{$i}[0..4]; @Poses, @thePoses -> @absPoses\n$seqCIGAR\n";
620 print 'B' x
($offset-$firstSC),$i->[9],"\n";
621 my $tmpstr = ' ' x
length($seqCIGAR);
624 substr $tmpstr,$p,1,']';
626 substr $tmpstr,-$p,1,'[';
631 push @ReadsCIGAR,$seqCIGAR;
633 my @thePosesA = sort {$relPoses{$b} <=> $relPoses{$a}} keys %relPoses;
634 my $thePos = int($thePosesA[0]); # $thePos = $thePosesA[0] - 0.5, 目前只考虑最多的clip位置。
636 push @usingPoses,$thePos if exists $relPosesFR{$thePos};
637 push @usingPoses,-1-$thePos if exists $relPosesFR{-1-$thePos};
638 if (@usingPoses == 1) { # 先找最值处的']'&'[',若没有成对,再在对侧找成对的。只提最近的取一对。
639 if ($usingPoses[0] < 0) {
640 for my $p ( ($thePos-$main::posAround
) .. $thePos ) {
641 if (exists $relPosesFR{$p}) {
645 #@usingPoses = @usingPoses[0,-1] if @usingPoses > 2;
646 if (@usingPoses > 2) {
649 @t = sort {$relPosesFR{$b} <=> $relPosesFR{$a} || $b <=> $a } @t;
650 @usingPoses = ($usingPoses[0],$t[0]);
652 } elsif ($usingPoses[0] > 0) {
653 for my $p ( ($thePos+1) .. ($thePos+$main::posAround
+1) ) {
654 if (exists $relPosesFR{-$p}) {
655 push @usingPoses,-$p;
658 #@usingPoses = @usingPoses[0,1] if @usingPoses > 2;
659 if (@usingPoses > 2) {
662 @t = sort {$relPosesFR{$b} <=> $relPosesFR{$a} || $a <=> $b } @t;
663 @usingPoses = ($usingPoses[0],$t[0]);
668 for my $i (@clipReads) {
670 if ($_[1] eq 'bwa-meth') {
671 my ($YC) = grep /^YC:Z:/,@
$i;
672 if ($i->[1] & 16) { # r, 用`YD:Z:f`似乎更好。目前没管PBAT的。
673 if ($YC eq 'YC:Z:CT') {
675 } elsif ($YC eq 'YC:Z:GA') {
679 if ($YC eq 'YC:Z:CT') {
681 } elsif ($YC eq 'YC:Z:GA') {
685 } elsif ($_[1] eq 'BSseeker2') {
686 my ($XO) = grep /^XO:Z:/,@
$i;
687 if ($XO eq 'XO:Z:+FR') {
689 } elsif ($XO eq 'XO:Z:-FR') {
691 } elsif ($XO eq 'XO:Z:+RF') {
693 } elsif ($XO eq 'XO:Z:-RF') {
696 } elsif ($_[1] eq 'bwa') {
699 #print "$mtype $i->[9] $YC\n";
700 my ($firstSC,$seqCIGAR) = getSeqCIGAR
($minLeft,$i);
701 my $offset = $i->[3] - $minLeft - $firstSC;
702 for my $p (@usingPoses) {
703 my ($tlen,$tmp,$vseq,$vqual,$t)=(0);
704 my $readlen = length $i->[9];
705 if ($_[1] eq 'BSseeker2' and $i->[10] eq '*') {
706 $i->[10] = '@' x
$readlen;
708 if ($p > 0 and $p < length($seqCIGAR)) { # M]S
709 $tmp = substr($seqCIGAR,$p+1) or next;
714 11860 sf340_A_Ref_61780248_61780667_61781087_Vir_+_701_820_R_420_90 99 chr1 61780588 60 83M7S = 61780798 300
715 11861 sf450_F_Ref_61780248_61780667_61781087_Vir_+_701_820_R_420_90 609 chr1 61780588 0 53S37M = 61780908 410 Fs90_m458AE.bam,原因不明。MapQ=0.
716 11862 sf10_1_Ref_61780248_61780667_61781087_Vir_+_701_820_R_420_90 147 chr1 61780588 60 83M7S = 61780258 -413
722 my $tl = $readlen - $t;
723 if (abs($t) >= $readlen) {
724 #ddx $i; print "-+->Off:$offset, Start:$t, Len:$tlen\n";
727 next; # CNS部分,不容有失,这个偶发问题暂时跳过
728 #$t = -$readlen; # 修改后Fk150_m2D的TP下降,是故,冒烟吧。
730 $vseq = substr $i->[9],$t,$tlen;
731 $vqual = substr $i->[10],$t,$tlen;
733 } elsif ($p < 0 and (-$p-1) < length($seqCIGAR)) { # S[M
734 $tmp = substr($seqCIGAR,0,-$p-1) or next;
738 $t = -$p-1 - $tlen -$offset;
739 if (abs($t) >= $readlen) {
740 #ddx $i; print "--->Off:$offset, Start:$t, Len:$tlen\n";
745 $vseq = substr $i->[9],$t,$tlen;
746 $vqual = substr $i->[10],$t,$tlen;
749 if ($vseq and length($vseq) >= $main::minVirLen
) {
750 push @
{$Bases{$p}},[$vseq,$vqual,$mtype];
752 # print ">>>$tlen, $tmp, $p\n$vseq\n$vqual\n";
753 # print substr($seqCIGAR,$p,8),"\n" if $p>0;
754 # print substr($seqCIGAR,-$p-2,8),"\n" if $p<0;
758 ddx \
%relPoses,\
%relPosesFR,\
@usingPoses,\
%Bases if $DEBGUHERE;
760 for my $p (@usingPoses) {
761 my (@theReads,$absPos);
763 $absPos = $p - $minLeft;
764 for (@
{$Bases{$p}}) {
765 my $x = reverse $_->[0];
766 my $y = reverse $_->[1];
767 push @theReads,[$x,$y,$_->[2]];
770 $absPos = $p + $minLeft;
771 if ($_[1] eq 'BSseeker2') {
772 next unless exists $Bases{$p}; # '78S5M1D30M1D17M20S' caused trouble
774 @theReads = @
{$Bases{$p}};
776 my $mergstr = mergeStr
(\
@theReads);
777 $mergstr = reverse $mergstr if $p < 0;
780 my $depth = @theReads;
781 #print "$p,$depth\t$mergstr\n";
782 $Results{$absPos} = [$depth,$mergstr];
784 print "@usingPoses\t",'-' x
25,"\n" if $DEBGUHERE;
785 # my (%absPoses,%absPosesFR);
786 # for (keys %relPoses) {
787 # $absPoses{$minLeft + $_} = $relPoses{$_};
789 # for (keys %relPosesFR) {
791 # $absPosesFR{$_ + $minLeft} = $relPosesFR{$_};
793 # $absPosesFR{$_ - $minLeft} = $relPosesFR{$_};
796 return (\
%Results,$mostHchr,$mostVchr);
800 # ["TTTGTTGATAA", "BBBBBBBBBBB", "CT"],
801 # ["CTTATTAACAAAAATC", "BBBBBBBBBBBBBBBB", "GA"],
802 # ["TTTGTTGATAAGAATTTTTATAATATTATAGAGTTTAGA", ("B" x 39), "CT"],
803 # ["TTTGTTGATAAGAATTTTTATAATATTATAGAGTTTAGATTT", ("B" x 42), "CT"],
806 my ($maxLen,$merged)=(0);
808 my $l = length $_->[0];
809 $maxLen = $l if $maxLen < $l;
811 for my $p (0 .. ($maxLen-1)) {
812 my (%Col,%ColBp) = ();
816 if ($p < length($str)) {
817 my $c = substr $str,$p,1;
818 my $q = substr $qual,$p,1;
819 if ($_->[2] eq 'CT' and $c eq 'T') {
821 } elsif ($_->[2] eq 'GA' and $c eq 'A') {
823 } # $_->[2] eq '-' means leaving the same.
825 if ($c eq 'Y') { # CT
826 $Col{$_} += $main::Qual2LgP
{$q}->[4] for qw(G A);
827 $Col{$_} += $main::Qual2LgP
{$q}->[3] for qw(C T);
828 if ($main::EnableWretchedSequel
) {
829 $Col{$_} += -10*log(1/8);
831 } elsif ($c eq 'R') { # GA
832 $Col{$_} += $main::Qual2LgP
{$q}->[4] for qw(C T);
833 $Col{$_} += $main::Qual2LgP
{$q}->[3] for qw(G A);
834 if ($main::EnableWretchedSequel
) {
835 $Col{$_} += -10*log(1/8);
838 $Col{$_} += $main::Qual2LgP
{$q}->[2] for qw(A T C G);
839 $Col{$c} += $main::Qual2LgP
{$q}->[1];
840 if ($main::EnableWretchedSequel
) {
841 $Col{$_} += -10*log(1/4);
847 if (keys(%ColBp)==1) {
848 $res = (keys(%ColBp))[0];
850 @Bps = sort { $Col{$a} <=> $Col{$b} } keys %Col; # choose the smaller one.
851 if ( $Col{$Bps[1]} - $Col{$Bps[0]} < 0.02 ) { # q=3时,-10*($lgbp-$lgbq)=0.020624399283,最小。
852 my @t = sort { $a cmp $b } @Bps[0,1];
854 $res = $REV_IUB{$t[0].$t[1]};
856 $res = $REV_IUB{$t[0]}; # Same bases がな?
863 # ddx \%Col,\%ColBp,$res,\@Bps;
869 my ($interest,$bypass,$str) = @_;
870 my $len = length $str;
871 my $Inserted = 0; # I 与 D 会造成 QSEQ 与 REF 的不同步,必须改成返回两套坐标才行。时间紧迫,先无视。
872 my $Deleted = 0; # 现在的 $seqCIGAR 不含‘D’,在遇到D时会左偏。
873 my @interestedPoses; # define as gap after index, which is from 0. Thus cmp between this & next.
874 for my $i (0 .. $len-2) {
875 my $thischar = substr $str,$i,1;
876 my $nextchar = substr $str,$i+1,1;
877 #++$Inserted if $thischar eq 'I';
878 #++$Deleted if $thischar eq 'D';
879 if ($thischar eq $nextchar or $thischar=~/[$bypass]/) {
881 } elsif ($thischar eq $interest) { # MS -> HV
882 push @interestedPoses,$i;
883 } elsif ($nextchar eq $interest) { # SM -> VH
884 push @interestedPoses,-($i+1);
889 return @interestedPoses;
894 return unless ref($in);
895 return unless ref($in) eq "HASH";
896 for my $k (keys %{$in}) {
897 if (exists $sink->{$k}) {
898 $sink->{$k} += $in->{$k};
900 $sink->{$k} = $in->{$k};
906 if ($a eq '=Sum=') { return -1; }
907 elsif ($b eq '=Sum=') { return 1; }
908 else { return $a cmp $b; }
912 my $Refilename = warnFileExist
($main::RefConfig
->{$main::RefFilesSHA
}->{'Refilename'});
914 for (@
{$main::Config
->{$main::FileData
}->{'='}}) {
915 /([^.]+)\.(\d)/ or die;
918 for my $k (keys %tID) {
919 my $outf = "$main::RootPath/${main::ProjectID}_analyse/.$k.analyse";
920 # Well, this is the f*cking reality. You know it.
921 my $outf2 = "$main::RootPath/${main::ProjectID}_analyse/$k.analyse";
922 open IN
,'<',$outf or die;
923 open OUT
,'>',$outf2 or die;
924 #print OUT join("\t",qw[ID Host_Chr H_Start H_End Virus_Chr Strand V_Start V_End]),"\n";
925 print OUT
join("\t",qw
[ID Chr BreakPoint1 BreakPoint2 Virus Strand Start End
]),"\n"; # Well, he prefer this.
929 my @dat = split /\t/;
930 #$dat[4] = $retVirus[0]->[0]; # Well, just do it.
931 $Results{$dat[1]}{$dat[2]} = \
@dat;
933 for my $chr (sort keys %Results) {
934 my @Poses = sort {$a<=>$b} keys %{$Results{$chr}};
936 my @tmp = @
{$Results{$chr}{$Poses[0]}};
937 my $last1 = pop @tmp;
938 my $last2 = pop @tmp;
939 my $last3 = pop @tmp;
941 my @tVr = split /,/,$last1;
942 for (@tVr,$last2,$last3) {
943 push @Virus,$_ if $_ != -1;
945 print OUT
join("\t",@tmp,$Virus[0],$Virus[-1]),"\n";
950 for my $i (0 .. $#Poses) {
951 if (($i == 0) or ($Poses[$i] - $Poses[$i-1] <= 20)) {
952 push @TTT,$Results{$chr}{$Poses[$i]};
958 push @Hum,$tt->[3] if $tt->[3] != -1;
959 push @Virus,$tt->[6] if $tt->[6] != -1;
960 push @Virus,$tt->[7] if $tt->[7] != -1;
961 my @tVr = split /,/,$tt->[8];
963 push @Virus,$_ if $_ != -1;
966 @Hum = sort {$a<=>$b} @Hum;
967 @Virus = sort {$a<=>$b} @Virus;
968 push @Hum,-1 if scalar @Hum == 1;
969 if (scalar @Virus == 1) {
972 my $vlen = $Virus[1] - $Virus[0];
973 if ($vlen < $main::MinVirusLength
) {
977 print OUT
join("\t",$Results{$chr}{$Hum[0]}->[0],$chr,$Hum[0],$Hum[-1],$Results{$chr}{$Hum[0]}->[4],$Results{$chr}{$Hum[0]}->[5],$Virus[0],$Virus[-1]),"\n";
979 @TTT = ($Results{$chr}{$Poses[$i]});
983 my @tmp = @
{$TTT[0]};
984 my $last1 = pop @tmp;
985 my $last2 = pop @tmp;
986 my $last3 = pop @tmp;
988 my @tVr = split /,/,$last1;
989 for (@tVr,$last2,$last3) {
990 push @Virus,$_ if $_ != -1;
992 print OUT
join("\t",@tmp,$Virus[0],$Virus[-1]),"\n";
993 #print OUT join("\t",@{$TTT[0]}),"\n";
997 # EOF this silly thing, which is favored by the mankind.