5 use File
::Path
2.08; # http://search.cpan.org/~riche/File-Path-2.11/lib/File/Path.pm#API_CHANGES
15 File
::Path
::make_path
("$main::RootPath/Ref",{verbose
=> 0,mode
=> 0755});
16 my $Refprefix = getRef2char
($main::HostRefName
,$main::VirusRefName
);
17 my $Refile = "$main::RootPath/Ref/$main::RefFilesSHA/$Refprefix.fa";
18 #warn "[$main::HostRefName,$main::VirusRefName] -> $Refprefix [$main::RefFilesSHA]\n";
20 if ( -f
"$main::RootPath/Ref/Ref.ini" ) {
21 $main::RefConfig
->read("$main::RootPath/Ref/Ref.ini");
22 $found=1 if exists $main::RefConfig
->{$main::RefFilesSHA
};
25 File
::Path
::make_path
("$main::RootPath/Ref/$main::RefFilesSHA",{verbose
=> 0,mode
=> 0755});
26 $main::RefConfig
->{$main::RefFilesSHA
} = { Refilename
=> $Refile, RefChrIDs
=> '', VirusChrIDs
=> '' };
27 open O
,'>',$Refile or die $!;
28 my $FH=openfile
($main::Config
->{'RefFiles'}->{'HostRef'});
30 while (my $ret = FastaReadNext
($FH)) {
31 if ($$ret[0] =~ /(^chrEBV$)|(Un[_-])|(random$)/) {
32 warn "[!] skip Ref[$$ret[0]].\n";
35 my $len = length $$ret[1];
36 warn "[!] read Ref[$$ret[0]], $len bp.\n";
37 print O
">$$ret[0]\n$$ret[1]\n";
38 $main::RefConfig
->{$main::RefFilesSHA
}->{$$ret[0]} = $len;
39 push @ChrIDs,$$ret[0];
41 $main::RefConfig
->{$main::RefFilesSHA
}->{RefChrIDs
} = join(',',@ChrIDs);
44 $FH=openfile
($main::Config
->{'RefFiles'}->{'VirusRef'});
45 while (my $ret = FastaReadNext
($FH)) {
46 my $len = length $$ret[1];
47 warn "[!] read Virus[$$ret[0]], $len bp.\n";
48 print O
">$$ret[0]\n$$ret[1]\n";
49 $main::RefConfig
->{$main::RefFilesSHA
}->{$$ret[0]} = $len;
50 push @ChrIDs,$$ret[0];
52 $main::RefConfig
->{$main::RefFilesSHA
}->{VirusChrIDs
} = join(',',@ChrIDs);
54 $main::RefConfig
->write("$main::RootPath/Ref/Ref.ini");
56 warn "[!] Already Read References Pairs:[$main::HostRefName,$main::VirusRefName].\n";
58 #ddx \$main::RefConfig;
60 open O
,'>',"$main::RootPath/${main::ProjectID}_index.sh" or die $!;
61 print O
"#!/bin/sh\n\nexport $main::PathPrefix\n\n";
62 push @cmd,"samtools faidx $Refile";
63 push @cmd,"$RealBin/bin/bwameth.py index $Refile";
64 push @cmd,"#python2 $RealBin/bin/BSseeker2/bs_seeker2-build.py --aligner bowtie2 -f $Refile -d ${Refile}2";
65 push @cmd,"bwa index $Refile";
66 print O
join("\n",@cmd),"\n";
68 chmod 0755,"$main::RootPath/${main::ProjectID}_index.sh";
69 warn "[!] Please run [$main::RootPath/${main::ProjectID}_index.sh] to build index.\n";
73 my $Refilename = warnFileExist
($main::RefConfig
->{$main::RefFilesSHA
}->{'Refilename'});
74 #warn "$Refilename\n";
75 my (%tID,%FQc,%maxReadNum);
76 for (@
{$main::Config
->{$main::FileData
}->{'='}}) {
77 /([^.]+)\.(\d)/ or die;
79 @
{$FQc{$1}{$2}} = split /\s*,\s*/,$main::Config
->{$main::FileData
}->{$_};
80 $maxReadNum{$1} = 0 unless exists $maxReadNum{$1};
81 $maxReadNum{$1} = $2 if $maxReadNum{$1} < $2;
84 # if ($main::FileData eq 'Base4Files') {
87 File
::Path
::make_path
("$main::RootPath/${main::ProjectID}_aln",{verbose
=> 0,mode
=> 0755});
88 open O
,'>',"$main::RootPath/${main::ProjectID}_aln.sh" or die $!;
89 print O
"#!/bin/sh\n\nexport THREADSCNT=24\nexport $main::PathPrefix\n\n";
91 for my $k (keys %tID) {
92 if ($maxReadNum{$k} == 1) { # SE
94 for my $f1 (@
{$FQc{$k}{1}}) {
95 #my $fID = basename($f1);
99 @
{$FQc{$k}{2}} = @FQ2c;
103 for my $k (keys %tID) {
104 #my @FQ1c = split /\s*,\s*/,$main::Config->{$main::FileData}->{$tID{$k}{1}};
105 #my @FQ2c = split /\s*,\s*/,$main::Config->{$main::FileData}->{$tID{$k}{2}};
106 my @FQ1c = @
{$FQc{$k}{1}}; my @FQ2c = @
{$FQc{$k}{2}};
107 die "[x] $main::FileData not paired ! [@FQ1c],[@FQ2c]\n" unless $#FQ1c == $#FQ1c;
108 my ($cmd,$cmd2,$cmd3);
111 $RealBin/bin/bwameth.py --reference $Refilename -t \${THREADSCNT} --read-group $k @{[warnFileExist($FQ1c[0],$FQ2c[0])]} 2>$main::RootPath/${main::ProjectID}_aln/$k.log | samtools view -bS - | samtools sort -n -m 2415919104 - -T $main::RootPath/${main::ProjectID}_aln/$k -o $main::RootPath/${main::ProjectID}_aln/$k.bam
114 python2 $RealBin/bin/BSseeker2/bs_seeker2-align.py --aligner bowtie2 -d ${Refilename}2 -g $Refilename --bt2--rg-id $k -1 $FQ1c[0] -2 $FQ2c[0] -o $main::RootPath/${main::ProjectID}_aln/$k.bam --temp_dir=$main::RootPath/${main::ProjectID}_aln >/dev/null
117 bwa mem -MY $Refilename -t \${THREADSCNT} -R '\@RG\\tID:$k\\tSM:$k' @{[warnFileExist($FQ1c[0],$FQ2c[0])]} 2>$main::RootPath/${main::ProjectID}_aln/$k.log | samtools view -bS - | samtools sort -n -m 2415919104 - -T $main::RootPath/${main::ProjectID}_aln/$k -o $main::RootPath/${main::ProjectID}_aln/$k.bam
119 if ($main::Aligner
eq 'bwa-meth') {
121 } elsif ($main::Aligner
eq 'BSseeker2') {
123 } elsif ($main::Aligner
eq 'bwa') {
128 for my $i (0 .. $#FQ1c) {
129 my $fID = basename
($FQ1c[$i]);
130 $fID =~ s/\.fq(\.gz)?$//i;
132 $RealBin/bin/bwameth.py --reference $Refilename -t \${THREADSCNT} --read-group '\@RG\\tID:${k}_${i}_${fID}\\tSM:$k' @{[warnFileExist($FQ1c[$i],$FQ2c[$i])]} 2>$main::RootPath/${main::ProjectID}_aln/${k}_${i}_${fID}.log | samtools view -bS - | samtools sort -n -m 2415919104 - -T $main::RootPath/${main::ProjectID}_aln/${k}_${i}_${fID} -o $main::RootPath/${main::ProjectID}_aln/${k}_${i}_${fID}.bam
135 python2 $RealBin/bin/BSseeker2/bs_seeker2-align.py --aligner bowtie2 -d ${Refilename}2 -g $Refilename --bt2--rg-id '\@RG\\tID:${k}_${i}_${fID}\\tSM:$k' -1 $FQ1c[$i] -2 $FQ2c[$i] -o $main::RootPath/${main::ProjectID}_aln/${k}_${i}_${fID}.bam --temp_dir=$main::RootPath/${main::ProjectID}_aln >/dev/null
138 bwa mem -MY $Refilename -t \${THREADSCNT} -R '\@RG\\tID:${k}_${i}_${fID}\\tSM:$k' @{[warnFileExist($FQ1c[$i],$FQ2c[$i])]} 2>$main::RootPath/${main::ProjectID}_aln/${k}_${i}_${fID}.log | samtools view -bS - | samtools sort -n -m 2415919104 - -T $main::RootPath/${main::ProjectID}_aln/${k}_${i}_${fID} -o $main::RootPath/${main::ProjectID}_aln/${k}_${i}_${fID}.bam
140 push @theBams,"$main::RootPath/${main::ProjectID}_aln/${k}_${i}_${fID}.bam";
141 if ($main::Aligner
eq 'bwa-meth') {
143 } elsif ($main::Aligner
eq 'BSseeker2') {
145 } elsif ($main::Aligner
eq 'bwa') {
149 my $theBamsJ = join(' ',@theBams);
151 samtools merge -n -l 9 $main::RootPath/${main::ProjectID}_aln/$k.bam $theBamsJ
156 $RealBin/bin/bamfilter.pl 30 5 $main::RootPath/${main::ProjectID}_aln/$k.bam $main::RootPath/${main::ProjectID}_aln/S_${k}.bam
157 samtools sort -l 0 -m 2G $main::RootPath/${main::ProjectID}_aln/S_${k}.bam -T $main::RootPath/${main::ProjectID}_aln/P_${k} -o $main::RootPath/${main::ProjectID}_aln/P_${k}.bam
158 samtools index $main::RootPath/${main::ProjectID}_aln/P_${k}.bam
162 samtools sort -m 2415919104 -n $main::RootPath/${main::ProjectID}_aln/$k.bam -O bam -T $main::RootPath/${main::ProjectID}_aln/$k.sn >$main::RootPath/${main::ProjectID}_aln/$k.sn.bam 2>>$main::RootPath/${main::ProjectID}_aln/$k.log
164 samtools view -b -F256 $main::RootPath/${main::ProjectID}_aln/$k.sn.bam >$main::RootPath/${main::ProjectID}_aln/$k.snPstat.bam 2>>$main::RootPath/${main::ProjectID}_aln/$k.log
174 File
::Path
::make_path
("$main::RootPath/${main::ProjectID}_grep",{verbose
=> 0,mode
=> 0755});
175 my $WorkINI = Galaxy
::IO
::INI
->new();
176 $WorkINI->{'Output'} = $main::Config
->{'Output'};
177 $WorkINI->{'Ref'} = $main::RefConfig
->{$main::RefFilesSHA
};
178 $WorkINI->{'InsertSizes'} = $main::Config
->{'InsertSizes'};
180 for my $k (keys %tID) {
181 my $myBamf = "$main::RootPath/${main::ProjectID}_aln/P_$k.bam";
182 $BamFiles{$k} = $myBamf;
184 $WorkINI->{'BamFiles'} = \
%BamFiles;
185 $WorkINI->write("$main::RootPath/${main::ProjectID}_grep/ToGrep.ini");
186 my $cli = "$RealBin/bin/bsanalyser -p grep $main::RootPath/${main::ProjectID}_grep/ToGrep.ini";
188 print O
"\n$RealBin/bsuit grep $main::fullcfgfile\n$RealBin/bsuit analyse $main::fullcfgfile\n";
191 chmod 0755,"$main::RootPath/${main::ProjectID}_aln.sh";
192 warn "[!] Please run [$main::RootPath/${main::ProjectID}_aln.sh] to do the aln.\n"
198 for (@
{$main::Config
->{$main::FileData
}->{'='}}) {
199 /([^.]+)\.(\d)/ or die;
202 # "780_T" => { 1 => "780_T.1", 2 => "780_T.2" },
203 # "s01_P" => { 1 => "s01_P.1", 2 => "s01_P.2" },
204 for my $k (keys %tID) {
205 my $myBamf = "$main::RootPath/${main::ProjectID}_grep/$k.bam";
206 print "[$myBamf] "; # 内部有错误示意。换行后移
207 open OUT
,'>',"${myBamf}.grep" or die "Error opening ${myBamf}.grep: $!\n";
208 open( IN
,"-|","$main::PathPrefix samtools view $myBamf") or die "Error opening $myBamf: $!\n";
209 my ($lastgid,@hReads,@vReads,%Vchr);
210 my ($fhReads,$rhReads,$fvReads,$rvReads,$flagHV,$strandOdd,$strandEven)=(0,0,0,0,0,0,0); # /\bYD:Z:f\b/
213 my @dat = split /\t/;
214 /\tZc:i:(\d+)\b/ or die "[x]TAG:Zc:i not found.\n";
216 #print "$lastgid <- $thisGroup\n";
217 #print $thisGroup,"\t",join("][",@dat),"\n";
218 if ($dat[6] ne '=') { # 假设染色体不同就算一边人一边病毒(故也统计了两遍)。严格来说也要分染色体统计,但目前只是临时补充的 用sam文件推断病毒正负链插入 功能,正式版将恢复通过后面的动态规划比对来精确判断。
219 my $flagr = ($dat[1] & 16)>>4;
220 my $flagR = ($dat[1] & 32)>>5;
221 if ($flagr == $flagR) {
222 ++$strandEven; # 相同就在负链上
227 if ($lastgid and ($lastgid != $thisGroup)) {
229 if ($main::GrepMergeBetter
and $main::Aligner
eq 'bwa-meth') {
230 $skipflag = 1 if ($fhReads < 1 or $rhReads < 1);
232 $skipflag = 1 if @hReads < 2;
234 #print "$skipflag $lastgid <- $thisGroup\n";
238 my ($MergedHds,$mostHchr,$mostVchr) = grepmerge
(\
@hReads,$main::Aligner
);
239 my ($MergedVds,$mostRchr,$mostMchr) = grepmerge
(\
@vReads,$main::Aligner
);
240 print STDERR
',' if $mostHchr ne $mostMchr;
241 print STDERR
';' if $mostVchr ne $mostRchr;
242 #ddx [$MergedHds,$MergedVds];
247 # "TTGCGAAAGCCCAAGATGATGGGATGGGAATACAGGTGCAATTTCCATCCGTAGGTTTTGTACAGCAACATGAGGGAAACATAGAGTTGCCTTGRRCRRRRRTCRTR",
253 # "CCAGATGCAATCTAATTAAACCCACCAGGTGTCTCCCCTCARRTTRRRRTRCCCCRTR",
258 if ($strandEven > $strandOdd) {
260 } elsif ($strandEven < $strandOdd) {
263 ($strandOdd,$strandEven)=(0,0);
264 my @Keys = sort {$b <=> $a} keys %{$MergedHds};
265 my @VKeys = sort {$b <=> $a} keys %{$MergedVds};
266 my @Vchrs = sort { $Vchr{$b} <=> $Vchr{$a} } keys %Vchr;
271 @Vposes = ($VKeys[0],-1);
273 @Vposes = (-1,-$VKeys[0]);
275 } elsif (@VKeys == 2) {
276 @Vposes = ($VKeys[0],-$VKeys[1]);
280 print OUT
join("\t",$lastgid,$hReads[0]->[2],$Keys[0],-1,@
{$MergedHds->{$Keys[0]}},0,'N',$tmp,$Vchrs[0],@Vposes),"\n";
282 print OUT
join("\t",$lastgid,$hReads[0]->[2],-1,-$Keys[0],0,'N',@
{$MergedHds->{$Keys[0]}},$tmp,$Vchrs[0],@Vposes),"\n";
284 } elsif (@Keys == 2) {
285 print OUT
join("\t",$lastgid,$hReads[0]->[2],$Keys[0],-$Keys[1],@
{$MergedHds->{$Keys[0]}},@
{$MergedHds->{$Keys[1]}},$tmp,$Vchrs[0],@Vposes),"\n";
287 #print OUT join("\t",$lastgid,$hReads[0]->[2],$_,@{$MergedHds->{$_}}),"\n" for sort { $a <=> $b } keys %{$MergedHds};
292 $lastgid = $thisGroup;
293 ($fhReads,$rhReads,$fvReads,$rvReads)=(0,0,0,0);
295 $lastgid = $thisGroup;
299 if ($dat[5] !~ /[IDH]/) { # ignore [IDH] until 我有空写好 getDeriv()。
301 if (/\bYD:Z:f\b/) {++$fhReads}
304 } elsif (/\bZd:Z:V\b/) {
307 if ($dat[5] !~ /[IDH]/) {
309 if (/\bYD:Z:f\b/) {++$fvReads}
323 for (@
{$main::Config
->{$main::FileData
}->{'='}}) {
324 /([^.]+)\.(\d)/ or die;
327 # "780_T" => { 1 => "780_T.1", 2 => "780_T.2" },
328 # "s01_P" => { 1 => "s01_P.1", 2 => "s01_P.2" },
329 File
::Path
::make_path
("$main::RootPath/${main::ProjectID}_grep",{verbose
=> 0,mode
=> 0755});
330 my $GrepResult = Galaxy
::IO
::INI
->new();
332 for my $k (keys %tID) {
333 my $myBamf = "$main::RootPath/${main::ProjectID}_aln/$k.bam";
334 my $InsMean = $main::Config
->{'InsertSizes'}->{$k} or die;
335 my $InsSD = $main::Config
->{'InsertSizes'}->{"$k.SD"} or die; # SD cannot be 0, so no need to test with defined.
336 #warn "$myBamf,$InsMean,$InsSD";
337 $GrepResult->{$k} = {
342 $GrepResult->{$k} = { DatFile
=> "$main::RootPath/${main::ProjectID}_grep/$k.sam" };
343 open( IN
,"-|","$main::PathPrefix samtools view $myBamf") or die "Error opening $myBamf: $!\n"; # `-F768` later
344 system( "$main::PathPrefix samtools view -H $myBamf >".$GrepResult->{$k}{'DatFile'} );
345 open GOUT
,'>>',$GrepResult->{$k}{'DatFile'} or die "$!";
346 print GOUT
join("\t",'@PG','ID:bsuit',"CL:\"grep $cfgfile\""),"\n";
347 print STDERR
"[!] Reading [$myBamf] ...";
348 while (my $line = <IN
>) {
349 my @Dat1 = split /\t/,$line;
352 while ($Dat1[5] =~ /(\d+)S/g) {
353 $maxSC = $1 if $maxSC < $1;
355 $flag |= 1 if $maxSC > $main::minSoftClip
; # 加上SC的话,内存不乐观
356 if ($Dat1[6] eq '=') {
357 #$flag |= 1 if abs(abs($Dat1[8])-$InsMean) > 3*$InsSD;
358 $flag |= 2 if exists($main::VirusChrIDs
{$Dat1[2]}) or exists($main::VirusChrIDs
{$Dat1[6]});
360 $flag |= 4 if exists($main::VirusChrIDs
{$Dat1[2]}) or exists($main::VirusChrIDs
{$Dat1[6]});
361 $flag &= ~1 if $flag & 4; # 人的PE,不同染色体的hit直接扔掉不管。
364 $flag |= 8 if $Dat1[5] !~ /^\d+M$/; # 数据只是占位置,所以可以去掉次行
365 my $id = join("\t",$k,$Dat1[0]);
366 $ReadsIndex{$id} = [[0,$Dat1[2],$flag]]; # 初始化,数据只是占位置。
369 open IN
,'-|',"$main::PathPrefix samtools view $myBamf" or die "Error opening $myBamf: $!\n"; # `-F768` later
370 while (my $line = <IN
>) {
371 #my ($id, $flag, $ref, $pos, $mapq, $CIGAR, $mref, $mpos, $isize, $seq, $qual, @OPT) = split /\t/,$line;
372 #print "$id, $flag, $ref, $pos, $mapq, $CIGAR, $mref, $mpos, $isize\n";
373 my @Dat1 = split /\t/,$line;
375 if ($Dat1[1] & 0x40) {
377 } elsif ($Dat1[1] & 0x80) {
382 #die '[x]SAM/BAM file not paired !' unless defined($line2);
383 #my @Dat2 = split /\t/,$line2;
384 #next if $Dat1[4]<$CFGminMAPQ;
387 while ($Dat1[5] =~ /(\d+)S/g) {
388 $maxSC = $1 if $maxSC < $1;
390 $flag |= 1 if $maxSC > $main::minSoftClip
;
391 if ($Dat1[6] eq '=') {
392 #$flag |= 1 if abs(abs($Dat1[8])-$InsMean) > 3*$InsSD;
393 $flag |= 2 if exists($main::VirusChrIDs
{$Dat1[2]}) or exists($main::VirusChrIDs
{$Dat1[6]});
395 $flag |= 4 if exists($main::VirusChrIDs
{$Dat1[2]}) or exists($main::VirusChrIDs
{$Dat1[6]});
396 $flag &= ~1 if $flag & 4;
399 $flag |= 8 if $Dat1[5] !~ /^\d+M$/; # soft-clip
400 my $curpos = tell(GOUT
);
401 my $id = join("\t",$k,$Dat1[0]);
402 #warn "$flag $InsSD\t$curpos\n[${line}]\n" if $flag>1;
403 next unless defined $ReadsIndex{$id};
404 push @
{$ReadsIndex{$id}},[$curpos,$r12R1,@Dat1[2,3,4,5],$flag]; # Chr, Pos, MapQ, CIGAR
405 #print "($Dat1[2],$Dat1[3],$Dat1[5],$curpos)\n";
407 #ddx $ReadsIndex{$id} if scalar @{$ReadsIndex{$id}} > 2;
412 print STDERR
"\b\b\bdone.\n";
414 open $tFH{$k},'<',$GrepResult->{$k}{'DatFile'} or die "$!";
417 for my $tk (keys %ReadsIndex) {
418 my ($flag,%tmpH,%tmpV)=(0);
419 for my $i (1 .. $#{$ReadsIndex{$tk}}) {
420 #my $x = join("\t",$ReadsIndex{$tk}->[$i][2],$ReadsIndex{$tk}->[$i][3]);
421 my $x = $ReadsIndex{$tk}->[$i][4];
422 if (exists $main::VirusChrIDs
{$ReadsIndex{$tk}->[$i][2]}) {
423 $tmpV{$x} = $i; # 先认为最高质量的唯一。
427 $flag |= $ReadsIndex{$tk}->[$i][6];
429 if ((keys %tmpH) > 0) { # Select max MapQ as 1st & favors Human to Virus.
430 my @s = sort { $b <=> $a } keys %tmpH;
431 $ReadsIndex{$tk}->[0][0] = $tmpH{$s[0]};
432 $ReadsIndex{$tk}->[0][1] = join("\t",$ReadsIndex{$tk}->[$tmpH{$s[0]}][2],$ReadsIndex{$tk}->[$tmpH{$s[0]}][3]);
434 my @s = sort { $b <=> $a } keys %tmpV;
435 $ReadsIndex{$tk}->[0][0] = $tmpV{$s[0]};
436 $ReadsIndex{$tk}->[0][1] = join("\t",$ReadsIndex{$tk}->[$tmpV{$s[0]}][2],$ReadsIndex{$tk}->[$tmpV{$s[0]}][3]);
438 #my @s = sort sortChrPos(keys %tmp);
439 $ReadsIndex{$tk}->[0][2] = $flag;
440 #ddx $ReadsIndex{$tk};
442 # [1, "chr15\t20858350", 15], # index, ChrPos, 总flag
443 # [1477321, 1, "chr15", 20858350, 0, "48M42S", 12],
444 # [33704587, 1, "gi|59585|emb|X04615.1|", 652, 0, "45S45M", 11],
445 # [34264944, 2, "gi|59585|emb|X04615.1|", 717, 60, "56M34S", 12],
448 warn "[!] Bam Reading done.\n";
450 #ddx (\%RefChrIDs,\%main::VirusChrIDs);
451 open BOUT
,'>',"$main::RootPath/${main::ProjectID}_grep/blocks.ini" or die "$!";
453 open BOUTDBG
,'>',"$main::RootPath/${main::ProjectID}_grep/blocks.txt" or die "$!";
455 my @IDsorted = sort { sortChrPos
($ReadsIndex{$a}->[0][1],$ReadsIndex{$b}->[0][1]) } keys %ReadsIndex;
456 my ($Cnt,%hChrRange,%vChrRange,@Store,@PureVirReads)=(0);
457 for my $cid (@IDsorted) {
458 my @minCPR = split /\t/,$ReadsIndex{$cid}->[0][1];
459 push @minCPR,$ReadsIndex{$cid}->[0][0];
460 if (exists $main::VirusChrIDs
{$minCPR[0]}) {
461 push @PureVirReads,$cid;
464 my ($thisehPos,$thisvChr,$thissvPos,$thisevPos);
465 for my $i (1 .. $#{$ReadsIndex{$cid}}) {
466 my ($DatRef,$isHost);
467 if (exists $main::VirusChrIDs
{$ReadsIndex{$cid}->[$i]->[2]}) {
468 $DatRef = \
%vChrRange;
471 $DatRef = \
%hChrRange;
474 while ( mergeIn
($isHost,$DatRef,$ReadsIndex{$cid}->[$i],\
@Store,$cid,$i) ) { # 目前只有桥墩,没有中间部分
476 print BOUT
"[B$Cnt]\nHostRange=",formatChrRange
(\
%hChrRange),
477 "\nVirusRange=",formatChrRange
(\
%vChrRange),"\nSamFS=",
478 join(',',map { my @t = split /\n/;my @f=split /\t/,$t[0];join(':',$f[0],$ReadsIndex{$t[0]}->[$t[1]]->[0]); } @Store),"\n\n";
480 print BOUTDBG
"[B$Cnt]\nHostRange=",formatChrRange
(\
%hChrRange),
481 "\nVirusRange=",formatChrRange
(\
%vChrRange),"\nSamFS=",
482 join(',',map { my @t = split /\n/;my @f=split /\t/,$t[0];join(':',$f[0],$ReadsIndex{$t[0]}->[$t[1]]->[0]); } @Store),"\n";
485 my @f=split /\t/,$t[0];
486 seek($tFH{$f[0]},$ReadsIndex{$t[0]}->[$t[1]]->[0],0);
487 my $str = readline $tFH{$f[0]};
492 %hChrRange = %vChrRange = @Store = ();
498 close $tFH{$_} for keys %tFH;
501 #warn $GrepResult->write_string;
503 $GrepResult->write("$main::RootPath/${main::ProjectID}_grep.ini");
504 warn "[!] grep done with [$Cnt] items.\n";
508 my $Refilename = warnFileExist
($main::RefConfig
->{$main::RefFilesSHA
}->{'Refilename'});
510 for (@
{$main::Config
->{$main::FileData
}->{'='}}) {
511 /([^.]+)\.(\d)/ or die;
514 File
::Path
::make_path
("$main::RootPath/${main::ProjectID}_analyse",{verbose
=> 0,mode
=> 0755});
516 my $VirusFN = "$main::RootPath/${main::ProjectID}_analyse/Virus.fa";
517 open VIRUS
,'>',$VirusFN or die;
518 my $FH = openfile
($main::Config
->{'RefFiles'}->{'VirusRef'});
519 while (my $ret = FastaReadNext
($FH)) {
521 print VIRUS
'>',$ret->[0],"\n",$ret->[1],"\n";
526 my $tmptmpf = "$main::RootPath/${main::ProjectID}_grep/Greped.ini";
527 open TMPTMP
,'<',$tmptmpf or die;
530 /^\[(\d+)\]$/ or die;
535 next if /^VirRange=NA$/;
537 my $tmp = (split /:/,$_)[-1];
542 for my $k (keys %tID) {
543 my $myGrepf = "$main::RootPath/${main::ProjectID}_grep/$k.bam.grep";
544 print "[$myGrepf]\n";
545 my $outf = "$main::RootPath/${main::ProjectID}_analyse/.$k.analyse";
546 open IN
,'<',$myGrepf or die;
547 open OUT
,'>',$outf or die;
548 my (@OutCnt,%OutDat)=(0,0,0);
551 my @LineDat = split /\t/,$_;
553 #my $query = @LineDat[5,7];
555 my ($left,$right,$strand);
556 if ($LineDat[5] ne 'N') { # 时间有限,病毒只支持第一条染色体
557 my $ta = doAln
($VirusFN,$LineDat[5],1);
560 if ($strand eq '+') {
562 $right = $ta->[1] + $ta->[2];
565 $left = $ta->[1] - $ta->[2];
569 if ($LineDat[7] ne 'N') {
570 my $tb = doAln
($VirusFN,$LineDat[7],-1);
573 if ($strand eq '+') {
574 $left = $tb->[1] unless defined $left;
575 $right = $tb->[1] + $tb->[2];
577 $right = $tb->[1] unless defined $right;;
578 $left = $tb->[1] - $tb->[2]; # 负链还是有问题
582 if ($LineDat[2] == -1) {
583 $LineDat[2] = $LineDat[3];
586 $LineDat[3] = -1 if $LineDat[3] == $LineDat[2] + 1; # 貌似正负链加减一没统一?
587 #next unless defined $strand;
589 for (@LineDat[10,11]) {
590 push @
{$tVr},$_ if $_ != -1;
592 unless (defined $strand) { # Well, we need more poistive.
593 #print OUT join("\t",@LineDat[0..3],'Virus','NA','0','0'),"\n";
594 my @range = split /-/,$TMPtmp{$LineDat[0]};
595 $OutDat{$LineDat[1]}{$LineDat[2]} = [$LineDat[0],$LineDat[3],$LineDat[9],$LineDat[8],@range,$tVr]; # 临时补丁
599 #print OUT join("\t",@LineDat[0..3],'Virus',$strand,$left,$right),"\n";
600 $OutDat{$LineDat[1]}{$LineDat[2]} = [$LineDat[0],$LineDat[3],$LineDat[9],$strand,$left,$right,$tVr];
603 for my $chr (sort {alphanum
($a,$b)} keys %OutDat) {
604 my ($lastL,$lastR) = (-1,-1);
605 for my $pos (sort {$a <=> $b} keys %{$OutDat{$chr}}) {
609 #print STDERR ">>> $pos\n";
610 } elsif (($pos - $lastR) <= $main::ResultMergeRange
) {
611 #print STDERR "-> $pos - ($lastL,$lastR)\n";
612 $lastR = $pos; # 暂时只考虑左端点的合并
615 my @Dat = @
{$OutDat{$chr}{$lastL}}; # 假设第一条的病毒结果最准确(其实应该是中间某个;最好前期打分)
616 $Dat[1] = -1 if $Dat[1] == $lastL + 1;
617 print OUT
join("\t",$Dat[0],$chr,$lastL,@Dat[1..5],join(',',@
{$Dat[6]})),"\n";
619 #print STDERR join("\t",'---',$Dat[0],$chr,$lastL,@Dat[1..$#Dat]),"\n";
621 ($lastL,$lastR) = ($pos,$pos); # 下一轮初值。
625 my @Dat = @
{$OutDat{$chr}{$lastL}}; # 假设第一条的病毒结果最准确(其实应该是中间某个;最好前期打分)
626 $Dat[1] = -1 if $Dat[1] == $lastL + 1;
627 print OUT
join("\t",$Dat[0],$chr,$lastL,@Dat[1..5],join(',',@
{$Dat[6]})),"\n";
632 warn "[!]O: $OutCnt[0]+$OutCnt[1] => $OutCnt[2] in [$k], merged=",$OutCnt[0]+$OutCnt[1]-$OutCnt[2],".\n";
640 unless (exists $main::Config
->{'Simed'}) {
641 warn "[x]config_file NOT generated by `simVirusInserts.pl` !\n";
644 my (%VirFrag,@Refticks,@Virticks,%VirFragSE);
645 for (@
{$main::Config
->{'Simed'}->{'='}}) {
646 if (/^([^.]+)\.VirFrag$/) {
647 $VirFrag{$1} = $main::Config
->{'Simed'}->{$_} or die;
648 } elsif ($_ eq 'Refticks') {
649 @Refticks = split /,/,$main::Config
->{'Simed'}->{$_} or die;
650 } elsif ($_ eq 'Virticks') {
651 @Virticks = split /,/,$main::Config
->{'Simed'}->{$_} or die;
654 my %Refticksh = map { $Refticks[$_] => $_ } 0..$#Refticks; # http://stackoverflow.com/a/2957903/159695
655 open OUT
,'>',"$main::RootPath/${main::ProjectID}_check.log" or die;
656 for my $k (keys %VirFrag) {
657 my (@VirFragStartEnd);
658 for my $pVir (@Virticks) {
659 my $st = $pVir-int(0.5*$VirFrag{$k});
660 my $ed = $st + $VirFrag{$k};
662 push @VirFragStartEnd,[$st,$ed];
664 $VirFragSE{$k} = \
@VirFragStartEnd;
665 if ( (! defined $bias) or $bias<0 ) {
666 for my $i (0 .. $#Refticks) {
667 print OUT
"$k $i:\t$Refticks[$i], ",join('-',@
{$VirFragStartEnd[$i]}),"\n";
671 $bias = 10 unless (defined $bias and $bias >= 0);
673 warn 'Load @Refticks:',scalar(@Refticks),', @Virticks',scalar(@Virticks),". Bias_Allowed=$bias\n";
675 for my $p (@Virticks) {
676 $VirChrP{$_}=1 for ($p-$bias)..($p+$bias);
679 my (%tID,%Result,%FragLength);
680 for (@
{$main::Config
->{$main::FileData
}->{'='}}) {
681 /([^.]+)\.(\d)/ or die;
684 for my $k (keys %tID) {
685 my $myAnaf = "$main::RootPath/${main::ProjectID}_analyse/$k.analyse";
686 my $myGrepf = "$main::RootPath/${main::ProjectID}_grep/$k.bam.grep";
688 print OUT
"[$myAnaf]\n";
689 open IN
,'<',$myGrepf or die;
692 my @LineDat = split /\t/,$_;
693 if ($LineDat[5] ne 'N') {
694 my $flen1 = length $LineDat[5];
695 ++$FragLength{$k}{$flen1};
696 ++$FragLength{'=Sum='}{$flen1};
698 if ($LineDat[7] ne 'N') {
699 my $flen2 = length $LineDat[7];
700 ++$FragLength{$k}{$flen2};
701 ++$FragLength{'=Sum='}{$flen2};
705 open ANA
,'<',$myAnaf or die;
709 my @dat = split /\t/;
710 my (undef,$chr,$pos1,$pos2,undef,$strand,$vp1,$vp2) = @dat;
711 $pos2 = $pos1 if $pos2 == -1;
712 if ($chr eq $hchrid) {
714 for my $p ( ($pos1-$bias)..($pos2+$bias) ) {
715 if (exists $Refticksh{$p}) {
717 my $idx = $Refticksh{$p};
718 print OUT
'h',$Refticks[$idx],",";
719 ($va,$vb) = @
{$VirFragSE{$k}->[$idx]};
720 if ($strand eq 'NA') {
724 if (($vp1<=$vb) and ($vp2>=$va)) {
728 print OUT
"v$va-$vb,";
729 } elsif (($vp1 <= $vb+$bias) and ($vp2 >= $va-$bias)) {
732 print OUT
"m$va-$vb,";
734 for my $p ($vp1 .. $vp2) {
735 if (exists $VirChrP{$p}) {
746 print OUT
"x$va-$vb,";
749 print OUT
"$flag\t$_\n";
750 ++$Result{$k}{$flag};
751 ++$Result{'=Sum='}{$flag};
757 open P
,'>',"$main::RootPath/${main::ProjectID}_plot.dat" or die;
758 open PH
,'>',"$main::RootPath/${main::ProjectID}_plot.sh" or die;
759 print PH
"#!/bin/bash\ngnuplot -persist <<-'EOFP1'
760 set xlabel \"Length\"
762 set title 'Histgram of Identified Fragments'
764 set output \"$main::RootPath/${main::ProjectID}_plot.svg\"
766 infile = '$main::RootPath/${main::ProjectID}_plot.dat'
767 tempfile = '$main::RootPath/${main::ProjectID}_plot.tmp'\n",
768 'stats infile u 1:2 nooutput
769 blocks = STATS_blocks
774 do for[i=0:blocks-1] {
775 stats infile index i u (first_x=($0==1)?sprintf("%s %f",first_x,$1):first_x,first_y=($0==1)?sprintf("%s %f",first_y,$2):first_y,$1):2 nooutput
776 print sprintf("%f %f",STATS_pos_max_y,STATS_max_y)
781 print sprintf("%s %s",word(first_x,i),word(first_y,i))
787 plot for[i=0:blocks-1] infile i i u 1:2 w lines title columnheader(1),\
788 for[i=0:1] tempfile i i u 1:2:($0+1) w points pt (i==0?7:9) lc variable not
790 my @IDs = sort sortWsum
keys %FragLength;
791 for my $k (0 .. $#IDs) {
795 for my $i (sort {$a <=> $b} keys %{$FragLength{$IDs[$k]}}) {
796 print P
"$i\t$FragLength{$IDs[$k]}{$i}\n";
803 print PH
"\nEOFP1\n";
805 chmod 0755,"$main::RootPath/${main::ProjectID}_plot.sh";
806 system "$main::RootPath/${main::ProjectID}_plot.sh";
810 my $Refilename = warnFileExist
($main::RefConfig
->{$main::RefFilesSHA
}->{'Refilename'});
812 for (@
{$main::Config
->{$main::FileData
}->{'='}}) {
813 /([^.]+)\.(\d)/ or die;
817 for my $k (keys %tID) {
818 my $GrepResult = "$main::RootPath/${main::ProjectID}_grep/$k.sam";
819 my $InsMean = $main::Config
->{'InsertSizes'}->{$k} or die;
820 my $InsSD = $main::Config
->{'InsertSizes'}->{"$k.SD"} or die; # SD cannot be 0, so no need to test with defined.
821 open $tFH{$k},'<',$GrepResult or die "$!";
823 File
::Path
::make_path
("$main::RootPath/${main::ProjectID}_analyse",{verbose
=> 0,mode
=> 0755});
824 my $BlockINI = Galaxy
::IO
::INI
->new();
825 my $BlockINIFN = "$main::RootPath/${main::ProjectID}_grep/blocks.ini";
826 if ( -f
$BlockINIFN ) {
827 $BlockINI->read($BlockINIFN);
828 } else {die "[x] Grep INI not found ! [$BlockINIFN]\n";}
829 warn "[!] minHostDepth = $main::minHostDepth\n";
830 warn "[!] Running in DEVELOP(chr18 only) mode !\n" if ($main::DEVELOP
);
831 open OA
,'>',"$main::RootPath/${main::ProjectID}_analyse.txt" or die;
832 for my $Bid (@
{$BlockINI->{']'}}) {
833 my @HostRange = split /,/,$BlockINI->{$Bid}->{'HostRange'};
834 my @VirusRange = split /,/,$BlockINI->{$Bid}->{'VirusRange'};
835 my @SamFS = split /,/,$BlockINI->{$Bid}->{'SamFS'};
836 next if (@HostRange<1 or @VirusRange<1); # Need to do assembly
837 my ($maxHostDepth,$maxItem) = (0,-1);
838 for my $i (0 .. $#HostRange) {
839 my ($chr,$range,$depth) = split /:/,$HostRange[$i];
840 if ($maxHostDepth < $depth) {
841 $maxHostDepth = $depth;
844 $HostRange[$i] = [$chr,(split /-/,$range),$depth];
846 if ($main::DEVELOP
) {
847 next if $HostRange[$maxItem][0] ne 'chr18';
849 next if $maxHostDepth < $main::minHostDepth
;
850 #ddx $maxItem,\@HostRange;
851 File
::Path
::make_path
("$main::RootPath/${main::ProjectID}_analyse/idba/$Bid",{verbose
=> 0,mode
=> 0755});
852 system("$main::PathPrefix samtools faidx $Refilename $HostRange[$maxItem][0]:$HostRange[$maxItem][1]-$HostRange[$maxItem][2] >$main::RootPath/${main::ProjectID}_analyse/idba/Ref.fa");
854 open $FH,'<',"$main::RootPath/${main::ProjectID}_analyse/idba/Ref.fa" or die $!;
855 my $retHost = FastaReadNext
($FH);
858 $FH=openfile
($main::Config
->{'RefFiles'}->{'VirusRef'});
859 while (my $ret = FastaReadNext
($FH)) {
863 open FHo
,'>',"$main::RootPath/${main::ProjectID}_analyse/idba/$Bid/Refo.fa" or die $!;
864 open FHf
,'>',"$main::RootPath/${main::ProjectID}_analyse/idba/$Bid/Reff.fa" or die $!;
865 open FHr
,'>',"$main::RootPath/${main::ProjectID}_analyse/idba/$Bid/Refr.fa" or die $!;
866 for ($retHost,@retVirus) {
868 print FHo
">$id\n$seq\n";
871 print FHf
">${id}_F\n$seqF\n";
874 print FHr
">${id}_R\n$seqR\n";
876 close FHo
; close FHf
; close FHr
;
883 open $FHO{'o'}->[0],'>',"$main::RootPath/${main::ProjectID}_analyse/idba/$Bid/Readso.fa" or die $!;
884 open $FHO{'f'}->[0],'>',"$main::RootPath/${main::ProjectID}_analyse/idba/$Bid/Readsf.fa" or die $!;
885 open $FHO{'r'}->[0],'>',"$main::RootPath/${main::ProjectID}_analyse/idba/$Bid/Readsr.fa" or die $!;
888 my ($fid,$pos) = split /:/,$_;
889 seek($tFH{$fid},$pos,0);
890 my $str = readline $tFH{$fid};
892 my $strlen = length $str;
893 $maxReadLen = $strlen if $maxReadLen < $strlen;
894 my @dat = split /\t/,$str;
895 next if $dat[1] & 256; # FLAG, s | 0x0100 | the alignment is not primary
896 next if $dat[4] < 30; # MAPQ
898 $seq = revcom
($seq) if $dat[1] & 16;
900 $ReadsbyID{$dat[0]}->[1]=$seq;
901 } elsif ($dat[1] & 128) {
902 $ReadsbyID{$dat[0]}->[2]=$seq;
904 if (($dat[2] eq $HostRange[$maxItem][0]) or (exists $main::VirusChrIDs
{$dat[2]})) {
905 ++$ReadsbyID{$dat[0]}->[0];
908 for my $id (keys %ReadsbyID) {
909 next unless defined($ReadsbyID{$id}->[1]) or defined($ReadsbyID{$id}->[2]); # and->or to include SE.
910 $ReadsbyID{$id}->[1] = '' unless defined($ReadsbyID{$id}->[1]);
911 $ReadsbyID{$id}->[2] = '' unless defined($ReadsbyID{$id}->[2]);
912 next unless defined($ReadsbyID{$id}->[0]);
913 if ($main::FORCE_UNMETH
) {
915 $ReadsbyID{$id}->[1] =~ tr/gG/aA/;
916 $ReadsbyID{$id}->[2] =~ tr/cC/tT/;
918 $ReadsbyID{$id}->[1] =~ tr/cC/tT/;
919 $ReadsbyID{$id}->[2] =~ tr/gG/aA/;
922 my $type = guessMethyl
( $ReadsbyID{$id}->[1] . revcom
($ReadsbyID{$id}->[2]) );
923 my $str = join("\n",">$id/1",$ReadsbyID{$id}->[1],">$id/2",$ReadsbyID{$id}->[2],'');
925 if ($type eq '1CT') {
926 $FH = $FHO{'f'}->[0];
928 } elsif ($type eq '2GA') {
929 $FH = $FHO{'r'}->[0];
932 $FH = $FHO{'o'}->[0];
937 #close $FHO{$_}->[0] for keys %FHO;
940 $idbaRead = '-l' if $maxReadLen > 128;
941 for my $fro (keys %FHO) {
942 close $FHO{$fro}->[0];
943 next if $FHO{$fro}->[1] <= 2;
944 my $reff = "$main::RootPath/${main::ProjectID}_analyse/idba/$Bid/Ref$fro.fa";
945 my $readsf = "$main::RootPath/${main::ProjectID}_analyse/idba/$Bid/Reads$fro.fa";
946 next unless -s
$readsf;
947 my $outp = "$main::RootPath/${main::ProjectID}_analyse/idba/$Bid/$fro";
948 system("$RealBin/bin/idba_hybrid $main::idbacmd $idbaRead $readsf --reference $reff -o $outp >/dev/null");
949 next unless -s
"$outp/scaffold.fa";
951 open $tFH,'<',"$outp/scaffold.fa" or die $!;
952 while (my $ret = FastaReadNext
($tFH)) {
955 $Assem{$fro} = \
@asm;
957 my $ret = doAlign
(\
%Assem,[$retHost],\
@retVirus);
959 print OA
"[$Bid]\nRefCut: $ret->[0] $ret->[1]\nVirusCut: $ret->[2] $ret->[3] +$ret->[4]\n\n" if $ret->[1] > 0;
961 open DBG
,'>',"$main::RootPath/${main::ProjectID}_analyse/idba/$Bid/idba.fa" or die $!;
962 print DBG
">Host\n$$retHost[1]\n\n";
963 #print DBG ">Virus$_->[0]\n$_->[1]\n" for @retVirus;
965 for my $fro (sort keys %Assem) {
966 print DBG
">Asm_${fro}$_ $Assem{$fro}->[$_]->[0]\n$Assem{$fro}->[$_]->[1]\n" for 0 .. $#{$Assem{$fro}};
968 print DBG
"\nResult: ",join(",",@
$ret),"\n";
973 close $tFH{$_} for keys %tFH;
979 samtools view
-h
/share/user
s/huxs/work/bsvir
/bsI/SZ0010_aln
/780_T.bam '*' | samtools bam2fq -O - | gzip -9 > /share
/users/huxs
/work
/bsvir
/bsI
/SZ0010_aln/780_T
.unmap
.fq
.gz
980 samtools view
-h
/share/user
s/huxs/work/bsvir
/bsI/SZ0010_aln
/s01_P.bam '*'|samtools bam2fq -O -|gzip -9 > /share
/users/huxs
/work
/bsvir
/bsI
/SZ0010_aln/s01_P
.unmap
.fq
.gz
&
982 ./bin/bwameth
.py
--reference
/share/user
s/huxs/work/bsvir
/HBV.AJ507799.2.fa -t \${THREADSCNT} --read-group 780_T -p ~/work
/bsvir/bsI
/SZ0010_aln/780_T
.unmap
~/work/bsvir
/bsI/SZ0010_aln
/780_T.unmap.fq.gz 2>~/work
/bsvir/bsI
/SZ0010_aln/780_T
.unmap
.log & #/
983 ./bin/bwameth
.py
--reference
/share/user
s/huxs/work/bsvir
/HBV.AJ507799.2.fa -t \${THREADSCNT} --read-group s01_P -p /share
/users/huxs
/work
/bsvir
/bsI
/SZ0010_aln/s01_P
.unmap
/share/user
s/huxs/work/bsvir
/bsI/SZ0010_aln
/s01_P.unmap.fq.gz 2>/share
/users/huxs
/work
/bsvir
/bsI
/SZ0010_aln/s01_P
.unmap
.log &
985 samtools view
-h
/share/user
s/huxs/work/bsvir
/bsI/SZ0010_aln
/780_T
.bam
'gi|86261677|emb|AJ507799.2|'