3 Author: Hu Xuesong @ BIOPIC <galaxy001@gmail.com>
4 Version: 1.0.0 @ 20120330
8 #use Data::Dump qw(ddx);
10 die "Usage: $0 <Cut sites> <sorted bam> <out prefix>\n" if @ARGV<3;
15 my $ReadLen = 100-5; # 95
17 my $minAlignLen = int($ReadLen * 0.6);
18 $minAlignLen = $minOKalnLen if $minAlignLen < $minOKalnLen;
19 die if $ReadLen < $minOKalnLen;
23 my $EseqL="CTGCA";#substr $Eseq,0,$EcutAt;
26 my $EfwdTerm5=1-$EcutAt+1; # -3
27 my $EfwdTerm3=1-$EcutAt+$ReadLen; # 91
29 my $ErevTerm5=$ErevTerm3-$ReadLen+1; # -94
30 # 12008 -> [12005,12099],[11914,12008]
31 my $Rfwd2Ec = -$EfwdTerm5; # 3
32 my $Rrev2Ec = -$ErevTerm3; # 0
37 if ($filename=~/.bz2$/) {
38 open( $infile,"-|","bzip2 -dc $filename") or die "Error opening $filename: $!\n";
39 } elsif ($filename=~/.gz$/) {
40 open( $infile,"-|","gzip -dc $filename") or die "Error opening $filename: $!\n";
41 } else {open( $infile,"<",$filename) or die "Error opening $filename: $!\n";}
47 if ($filename=~/.bam$/) {
48 open( $infile,"-|","samtools view -h -F 132 $filename") or die "Error opening $filename: $!\n";
49 } elsif ($filename=~/.sam.gz$/) {
50 open( $infile,"-|","gzip -dc $filename") or die "Error opening $filename: $!\n";
51 } elsif ($filename=~/.sam$/) {
52 open( $infile,"<",$filename) or die "Error opening $filename: $!\n";
54 #die "[x]Only support .sam(.gz) & .bam [$filename]\n";
55 open( $infile,"-|","samtools view -h -F 132 $filename") or die "Error opening $filename: $!\n. Only support .sam(.gz) & .bam !";
60 my $ecin = openfile
($inec);
61 my $samin = opensam
($insam);
63 open $O,'|-',"gzip -9c >$outp.elen.gz" or die "Error opening $outp.edep.gz with gzip: $!\n";
64 open $N,'|-',"gzip -9c >$outp.nlen.gz" or die "Error opening $outp.ndep.gz with gzip: $!\n";
65 open L
,'>',$outp.'.eclog' or die "Error opening $outp.edep.log: $!\n";
69 my $tmpstr="From [$insam],[$inec] to [$outp.{e,n}dep.gz]\n";
73 $tmpstr = "#".join("\t",qw
/ChrID eCut_pos isMutSite Reads_Count Reads_Len_mean Reads_Cnt(fwd_U,rev_U,fwd_R,rev_R) Reads_Len_mean(same)/)."\n";
78 my (%nDat,%eDat,@ChrOrder);
79 my ($eCntAll,%eCnt)=(0);
83 my ($chr,$pos) = split /\t/;
84 unless (exists $eDat{$chr}) {
90 $eDat{$chr}{$pos}=[0,0,0,0,0,0,0,0,0]; # [SumF,CountF, SumR,CountR] for average read length (len on ref). The last(9th) is flag for mutated eCut_site
91 # first 4, Unique(XT:A:U); second 4, Repeat(XT:A:R).
92 # XT:A:N is fragmental like "61S12M2D3M2D10M14S".
93 # Mate-sw(XT:A:M) is either short or with many(like 7) mismatches(including 'N' on reads)
99 my @cigar = $cigar =~ /(\d+)(\w)/g;
100 my ($reflen,$maxM)=(0,0);
102 my ($len,$op) = splice(@cigar,0,2);
105 $maxM = $len if $maxM < $len;
108 $reflen += $len if $op eq 'D';
110 return [$reflen,$maxM];
113 my ($ReadsCnt,$ReadsSkipped,%eCutSiteCnt,%ChrLen)=(0,0);
114 print L
"[ChrNFO]\nbyChr = <<TABLE\n#ChrID\tLen\teCut_Count\n";
116 if (/^@\w\w\t\w\w:/) {
117 if (/^\@SQ\tSN:(\S+)\tLN:(\d+)$/) {
118 if (exists $eDat{$1}) {
120 #print STDERR "Chr:[$1]\tLen:[$2], Cut:[",scalar keys %{$eDat{$1}},"]\n";
121 print L
"$1\t$2\t",$eCnt{$1},"\n";
123 warn "Chr:[$1], Len:[$2] not cut.\n";
124 print L
"$1\t$2\t0\n";
129 my @read1=split /\t/,$_,12;
132 if ($read1[11] =~ /\bXT:A:[RN]\b/) {
135 my ($reflen,$maxM)=@
{cigar2reflen
($read1[5])};
136 if ($maxM<$minOKalnLen or $reflen<$minAlignLen) {
139 #print join("\t",@read1[0..8]),"\n**>\t$reflen,$maxM\n";
143 my ($strandShift,$thePos)=(0);
144 if ($read1[1] & 16) { # reverse
145 $thePos = $read1[3] + $reflen-1 + $Rrev2Ec;
148 $thePos = $read1[3] + $Rfwd2Ec;
150 if (exists $eDat{$read1[2]}{$thePos}) {
151 $eDat{$read1[2]}{$thePos}->[0+$strandShift+$AmbShift] += $reflen;
152 ++$eDat{$read1[2]}{$thePos}->[1+$strandShift+$AmbShift];
154 $nDat{$read1[2]}{$thePos} = [0,0,0,0,0,0,0,0,0] unless (exists $nDat{$read1[2]}{$thePos});
155 $nDat{$read1[2]}{$thePos}->[0+$strandShift+$AmbShift] += $reflen;
156 ++$nDat{$read1[2]}{$thePos}->[1+$strandShift+$AmbShift];
157 if (($strandShift == 0 and $read1[5] =~ /^(\d+)M/ and $1>=$EcutAt and $read1[9] =~ /^$EseqR/) or
158 ($strandShift == 2 and $read1[5] =~ /(\d+)M$/ and $1>=$EcutAt and $read1[9] =~ /$EseqL$/))
160 $nDat{$read1[2]}{$thePos}->[8]=1;
161 $eDat{$read1[2]}{$thePos} = $nDat{$read1[2]}{$thePos};
162 delete $nDat{$read1[2]}{$thePos};
166 #print join("\t",@read1[0..8]),"\n-->$thePos\t$reflen,$maxM\t$strandShift+$AmbShift\n";
171 my ($chr,$theDatHp,$fh,$isEcutSite)=@_;
173 for my $pos (sort {$a<=>$b} keys %{$theDatHp}) {
174 my ($sA,$sB,@tA,@tB)=(0,0);
175 for my $id (1,3,5,7) {
176 my $tmp = $theDatHp->{$pos}->[$id];
180 my $tmpval = $theDatHp->{$pos}->[$id-1];
181 push @tB,int(0.5+100*$tmpval/$tmp)/100; # avgLen
188 $sB = int(0.5+100*$sB/$sA)/100;
189 if ($theDatHp->{$pos}->[8]) {
190 if ($theDatHp->{$pos}->[1]+$theDatHp->{$pos}->[3]) {
191 ++$eCutSiteCnt{"TP_Mut"};
193 ++$eCutSiteCnt{"TP_Mut_p"};
194 $theDatHp->{$pos}->[8] = 2;
196 ++$FPsites; # Well, it WAS FP ...
198 ++$eCutSiteCnt{"${isEcutSite}P"};
199 ++$FPsites if ${isEcutSite
} eq 'F';
201 } elsif (${isEcutSite
} eq 'T') {
202 ++$eCutSiteCnt{"FN"};
204 print $fh join("\t",$chr,$pos,$theDatHp->{$pos}->[8],$sA,$sB,join(",",@tA),join(",",@tB)),"\n";
206 $eCutSiteCnt{"TN"} -= $FPsites;
211 for my $chr (@ChrOrder) {
212 &printDat($chr,$eDat{$chr},$O,'T') if exists $eDat{$chr};
213 &printDat($chr,$nDat{$chr},$N,'F') if exists $nDat{$chr};
214 $eCutSiteCnt{"TN
"} += $ChrLen{$chr} - $eCnt{$chr};
223 Skipped = $ReadsSkipped
228 for my $t (sort {$b cmp $a} keys %eCutSiteCnt) {
229 print L
$t,' = ',$eCutSiteCnt{$t},"\n";