new file: cell2loc.py
[GalaxyCodeBases.git] / perl / etc / radseq / radstat.pl
blob0d819369d2130fa1a27fcbbcca931dbc09c35d20
1 #!/usr/bin/env perl
2 =pod
3 Author: Hu Xuesong @ BIOPIC <galaxy001@gmail.com>
4 Version: 1.0.0 @ 20120330
5 =cut
6 use strict;
7 use warnings;
8 #use Data::Dump qw(ddx);
10 die "Usage: $0 <Cut sites> <sorted bam> <out prefix>\n" if @ARGV<3;
11 my $inec=shift;
12 my $insam=shift;
13 my $outp=shift;
15 my $ReadLen = 100-5; # 95
16 my $minOKalnLen = 30;
17 my $minAlignLen = int($ReadLen * 0.6);
18 $minAlignLen = $minOKalnLen if $minAlignLen < $minOKalnLen;
19 die if $ReadLen < $minOKalnLen;
21 my $Eseq="CTGCAG";
22 my $EcutAt=5;
23 my $EseqL="CTGCA";#substr $Eseq,0,$EcutAt;
24 my $EseqR="TGCAG";
26 my $EfwdTerm5=1-$EcutAt+1; # -3
27 my $EfwdTerm3=1-$EcutAt+$ReadLen; # 91
28 my $ErevTerm3=0; # 0
29 my $ErevTerm5=$ErevTerm3-$ReadLen+1; # -94
30 # 12008 -> [12005,12099],[11914,12008]
31 my $Rfwd2Ec = -$EfwdTerm5; # 3
32 my $Rrev2Ec = -$ErevTerm3; # 0
34 sub openfile($) {
35 my ($filename)=@_;
36 my $infile;
37 if ($filename=~/.bz2$/) {
38 open( $infile,"-|","bzip2 -dc $filename") or die "Error opening $filename: $!\n";
39 } elsif ($filename=~/.gz$/) {
40 open( $infile,"-|","gzip -dc $filename") or die "Error opening $filename: $!\n";
41 } else {open( $infile,"<",$filename) or die "Error opening $filename: $!\n";}
42 return $infile;
44 sub opensam($) {
45 my ($filename)=@_;
46 my $infile;
47 if ($filename=~/.bam$/) {
48 open( $infile,"-|","samtools view -h -F 132 $filename") or die "Error opening $filename: $!\n";
49 } elsif ($filename=~/.sam.gz$/) {
50 open( $infile,"-|","gzip -dc $filename") or die "Error opening $filename: $!\n";
51 } elsif ($filename=~/.sam$/) {
52 open( $infile,"<",$filename) or die "Error opening $filename: $!\n";
53 } else {
54 #die "[x]Only support .sam(.gz) & .bam [$filename]\n";
55 open( $infile,"-|","samtools view -h -F 132 $filename") or die "Error opening $filename: $!\n. Only support .sam(.gz) & .bam !";
57 return $infile;
60 my $ecin = openfile($inec);
61 my $samin = opensam($insam);
62 my ($O,$N);
63 open $O,'|-',"gzip -9c >$outp.elen.gz" or die "Error opening $outp.edep.gz with gzip: $!\n";
64 open $N,'|-',"gzip -9c >$outp.nlen.gz" or die "Error opening $outp.ndep.gz with gzip: $!\n";
65 open L,'>',$outp.'.eclog' or die "Error opening $outp.edep.log: $!\n";
66 select(L);
67 $|=1;
68 select(STDOUT);
69 my $tmpstr="From [$insam],[$inec] to [$outp.{e,n}dep.gz]\n";
70 print L $tmpstr;
71 warn $tmpstr;
72 no warnings;
73 $tmpstr = "#".join("\t",qw/ChrID eCut_pos isMutSite Reads_Count Reads_Len_mean Reads_Cnt(fwd_U,rev_U,fwd_R,rev_R) Reads_Len_mean(same)/)."\n";
74 use warnings;
75 print $O $tmpstr;
76 print $N $tmpstr;
78 my (%nDat,%eDat,@ChrOrder);
79 my ($eCntAll,%eCnt)=(0);
80 while (<$ecin>) {
81 next if /^(#|$)/;
82 chomp;
83 my ($chr,$pos) = split /\t/;
84 unless (exists $eDat{$chr}) {
85 push @ChrOrder,$chr;
86 $nDat{$chr} = {};
88 ++$eCntAll;
89 ++$eCnt{$chr};
90 $eDat{$chr}{$pos}=[0,0,0,0,0,0,0,0,0]; # [SumF,CountF, SumR,CountR] for average read length (len on ref). The last(9th) is flag for mutated eCut_site
91 # first 4, Unique(XT:A:U); second 4, Repeat(XT:A:R).
92 # XT:A:N is fragmental like "61S12M2D3M2D10M14S".
93 # Mate-sw(XT:A:M) is either short or with many(like 7) mismatches(including 'N' on reads)
95 close $ecin;
97 sub cigar2reflen($) {
98 my $cigar = $_[0];
99 my @cigar = $cigar =~ /(\d+)(\w)/g;
100 my ($reflen,$maxM)=(0,0);
101 while (@cigar) {
102 my ($len,$op) = splice(@cigar,0,2);
103 if ($op eq 'M') {
104 $reflen += $len;
105 $maxM = $len if $maxM < $len;
106 next;
108 $reflen += $len if $op eq 'D';
110 return [$reflen,$maxM];
113 my ($ReadsCnt,$ReadsSkipped,%eCutSiteCnt,%ChrLen)=(0,0);
114 print L "[ChrNFO]\nbyChr = <<TABLE\n#ChrID\tLen\teCut_Count\n";
115 while (<$samin>) {
116 if (/^@\w\w\t\w\w:/) {
117 if (/^\@SQ\tSN:(\S+)\tLN:(\d+)$/) {
118 if (exists $eDat{$1}) {
119 $ChrLen{$1} = $2;
120 #print STDERR "Chr:[$1]\tLen:[$2], Cut:[",scalar keys %{$eDat{$1}},"]\n";
121 print L "$1\t$2\t",$eCnt{$1},"\n";
122 } else {
123 warn "Chr:[$1], Len:[$2] not cut.\n";
124 print L "$1\t$2\t0\n";
127 next;
129 my @read1=split /\t/,$_,12;
130 ++$ReadsCnt;
131 my $AmbShift=0;
132 if ($read1[11] =~ /\bXT:A:[RN]\b/) {
133 $AmbShift = 4;
135 my ($reflen,$maxM)=@{cigar2reflen($read1[5])};
136 if ($maxM<$minOKalnLen or $reflen<$minAlignLen) {
137 ++$ReadsSkipped;
138 #########
139 #print join("\t",@read1[0..8]),"\n**>\t$reflen,$maxM\n";
140 #########
141 next;
143 my ($strandShift,$thePos)=(0);
144 if ($read1[1] & 16) { # reverse
145 $thePos = $read1[3] + $reflen-1 + $Rrev2Ec;
146 $strandShift = 2;
147 } else {
148 $thePos = $read1[3] + $Rfwd2Ec;
150 if (exists $eDat{$read1[2]}{$thePos}) {
151 $eDat{$read1[2]}{$thePos}->[0+$strandShift+$AmbShift] += $reflen;
152 ++$eDat{$read1[2]}{$thePos}->[1+$strandShift+$AmbShift];
153 } else {
154 $nDat{$read1[2]}{$thePos} = [0,0,0,0,0,0,0,0,0] unless (exists $nDat{$read1[2]}{$thePos});
155 $nDat{$read1[2]}{$thePos}->[0+$strandShift+$AmbShift] += $reflen;
156 ++$nDat{$read1[2]}{$thePos}->[1+$strandShift+$AmbShift];
157 if (($strandShift == 0 and $read1[5] =~ /^(\d+)M/ and $1>=$EcutAt and $read1[9] =~ /^$EseqR/) or
158 ($strandShift == 2 and $read1[5] =~ /(\d+)M$/ and $1>=$EcutAt and $read1[9] =~ /$EseqL$/))
160 $nDat{$read1[2]}{$thePos}->[8]=1;
161 $eDat{$read1[2]}{$thePos} = $nDat{$read1[2]}{$thePos};
162 delete $nDat{$read1[2]}{$thePos};
165 #########
166 #print join("\t",@read1[0..8]),"\n-->$thePos\t$reflen,$maxM\t$strandShift+$AmbShift\n";
167 #########
170 sub printDat($$$$) {
171 my ($chr,$theDatHp,$fh,$isEcutSite)=@_;
172 my $FPsites=0;
173 for my $pos (sort {$a<=>$b} keys %{$theDatHp}) {
174 my ($sA,$sB,@tA,@tB)=(0,0);
175 for my $id (1,3,5,7) {
176 my $tmp = $theDatHp->{$pos}->[$id];
177 push @tA,$tmp; # n
178 $sA += $tmp;
179 if ($tmp) {
180 my $tmpval = $theDatHp->{$pos}->[$id-1];
181 push @tB,int(0.5+100*$tmpval/$tmp)/100; # avgLen
182 $sB += $tmpval;
183 } else {
184 push @tB,'.';
187 if ($sA) {
188 $sB = int(0.5+100*$sB/$sA)/100;
189 if ($theDatHp->{$pos}->[8]) {
190 if ($theDatHp->{$pos}->[1]+$theDatHp->{$pos}->[3]) {
191 ++$eCutSiteCnt{"TP_Mut"};
192 } else {
193 ++$eCutSiteCnt{"TP_Mut_p"};
194 $theDatHp->{$pos}->[8] = 2;
196 ++$FPsites; # Well, it WAS FP ...
197 } else {
198 ++$eCutSiteCnt{"${isEcutSite}P"};
199 ++$FPsites if ${isEcutSite} eq 'F';
201 } elsif (${isEcutSite} eq 'T') {
202 ++$eCutSiteCnt{"FN"};
204 print $fh join("\t",$chr,$pos,$theDatHp->{$pos}->[8],$sA,$sB,join(",",@tA),join(",",@tB)),"\n";
206 $eCutSiteCnt{"TN"} -= $FPsites;
209 print L "TABLE\n\n";
211 for my $chr (@ChrOrder) {
212 &printDat($chr,$eDat{$chr},$O,'T') if exists $eDat{$chr};
213 &printDat($chr,$nDat{$chr},$N,'F') if exists $nDat{$chr};
214 $eCutSiteCnt{"TN"} += $ChrLen{$chr} - $eCnt{$chr};
217 close $samin;
218 close $O;
219 close $N;
220 print L <<CONTENT;
221 [Read1_stat]
222 Total = $ReadsCnt
223 Skipped = $ReadsSkipped
225 [eCut_stat]
226 Total = $eCntAll
227 CONTENT
228 for my $t (sort {$b cmp $a} keys %eCutSiteCnt) {
229 print L $t,' = ',$eCutSiteCnt{$t},"\n";
231 close L;