new file: cell2loc.py
[GalaxyCodeBases.git] / perl / soap / draw / getsoap2nfo.pl
blobb0384ae4c9f4b98cb1c0eeca4a0b578590a061c8
1 #!/bin/env perl
2 use strict;
3 use warnings;
5 unless (@ARGV>0){
6 print "perl $0 base\n";
7 exit;
9 my ($soapbase)=@ARGV;
11 ### Erf < ###
12 sub gam1($) {
13 my $x1 = $_[0];
14 my $t = 0;
15 my @a = (0.0000677106,-0.0003442342,0.0015397681,-0.0024467480,
16 0.0109736958,-0.0002109075,0.0742379071,0.0815782188,
17 0.4118402518,0.4227843370,1.0);
18 unless($x1 > 0) {
19 print "x must greater than 0!\n";
20 exit 0;
22 if($x1 < 1) {
23 $t = 1/($x1*($x1+1));
24 $x1 += 2;
25 } elsif($x1 <= 2) {
26 $t = 1/$x1;
27 $x1 += 1;
28 } elsif($x1 <= 3) {
29 $t = 1;
30 } else {
31 $t = 1;
32 while($x1 > 3) {
33 $x1 -= 1;
34 $t *= $x1;
37 my $s1 = $a[0];
38 my $u = $x1-2;
39 foreach (@a) {
40 $s1 = $s1*$u+$_;
42 $s1 *= $t;
45 sub gam2($$) {
46 my ($a_gam,$x_gam) = ($_[0],$_[1]);
47 my ($n_gam,$p_gam,$q_gam,$d_gam,$s_gam,$s1_gam,$p0_gam,$q0_gam,$p1_gam,$q1_gam,$qq_gam);
48 if(($a_gam <= 0)||($x_gam < 0)) {
49 if($a_gam <= 0) {
50 print "err**a<=0!\n";
52 if($x_gam < 0) {
53 print "err**x<0!\n";
55 exit 0;
57 if($x_gam == 0) {
58 return 0;
60 if($x_gam > 1e35) {
61 return 1;
63 $q_gam = $a_gam*log($x_gam);
64 $qq_gam = exp($q_gam);
65 if($x_gam < 1+$a_gam) {
66 $p_gam = $a_gam;
67 $s_gam = $d_gam = 1/$a_gam;
68 for($n_gam = 1; $n_gam <= 100; $n_gam++) {
69 $p_gam++;
70 $d_gam = $d_gam*$x_gam/$p_gam;
71 $s_gam += $d_gam;
72 if(abs($d_gam) < abs($s_gam)*1e-7) {
73 $s_gam=$s_gam*exp(-$x_gam)*$qq_gam/&gam1($a_gam);
74 return $s_gam;
77 } else {
78 $s_gam = 1/$x_gam;
79 $p0_gam = 0;
80 $q0_gam = $p1_gam=1;
81 $q1_gam = $x_gam;
82 for($n_gam = 1; $n_gam <= 100; $n_gam++) {
83 $p0_gam = $p1_gam+($n_gam-$a_gam)*$p0_gam;
84 $q0_gam = $q1_gam+($n_gam-$a_gam)*$q0_gam;
85 $p_gam = $x_gam*$p0_gam+$n_gam*$p1_gam;
86 $q_gam = $x_gam*$q0_gam+$n_gam*$q1_gam;
87 if(abs($q_gam) != 0) {
88 $s1_gam = $p_gam/$q_gam;
89 $p1_gam = $p_gam;
90 $q1_gam = $q_gam;
91 if(abs(($s1_gam-$s_gam)/$s1_gam) < 1e-7) {
92 $s_gam = $s1_gam*exp(-$x_gam)*$qq_gam/&gam1($a_gam);
93 return (1-$s_gam);
95 $s_gam = $s1_gam;
97 $p1_gam = $p_gam;
98 $q1_gam = $q_gam;
101 print "a too large !\n";
102 $s_gam = 1-$s_gam*exp(-$x_gam)*$qq_gam/&gam1($a_gam);
105 sub erf($) {
106 my $x_erf = $_[0];
107 my $y_erf;
108 if($x_erf >= 0) {
109 $y_erf=&gam2(0.5,$x_erf**2);
110 } else {
111 $y_erf=-&gam2(0.5,$x_erf);
114 ### Erf > ###
117 sub combineC($) {
118 my $href=$_[0];
119 if ($href and %$href) {
120 my (@str,$m);
121 $m = (sort {$a<=>$b} keys %$href)[-1];
122 for (1..$m) {
123 #$$href{$_} += 0;
124 push @str,join(':',$_,$$href{$_}||0);
126 return \join(',',@str);
127 } else {return \'.';}
130 sub combineJ($) {
131 my $href=$_[0];
132 if ($href and %$href) {
133 my @str;
134 for (sort {$a<=>$b} keys %$href) {
135 push @str,join(':',$_,$$href{$_});
137 return \join(',',@str);
138 } else {return \'.';}
141 sub getRealpos($$$$) {
142 my ($len,$strand,$realpos,$trim)=@_;
143 if ($strand eq '-') { # Negative
144 $realpos += $len; # should be $len-1. So, starting 0. (+ & -, never meets.)
145 if ($trim =~ /(\d+)S$/) {
146 $realpos += $1; # $1 only reset after next /()/
148 } elsif ($strand eq '+') { # Positive
149 if ($trim =~ /^(\d+)S/) {
150 $realpos -= $1;
152 } else {
153 $realpos=-1;
155 return $realpos;
158 my ($Pairs,$Paired,$Singled,$Reads,$Alignment);
159 open LOG,'<',"$soapbase.log" or die "[x]Error opening $soapbase.log: $!\n";
160 while (<LOG>) {
161 $Pairs = (split)[-2] if /^Total Pairs:/;
162 $Paired = (split)[1] if /^Paired:/;
163 $Singled = (split)[1] if /^Singled:/;
164 $Reads = (split)[-1] if /^Total Reads/;
165 $Alignment = (split)[1] if /^Alignment:/;
167 close LOG;
169 open NFO,'>',"$soapbase.nfo" or die "[x]Error opening $soapbase.nfo: $!\n";
171 print NFO "#fmtS\tTotal_Pairs\tPaired\tSingled\n";
172 print NFO "Summary\t",join("\t",$Pairs,$Paired,$Singled),"\n";
173 #open(OUTS,"-|","gzip -dc $soapbase.soap.gz $soapbase.single.gz") or die "Error: $!\n";
174 open(TP,"-|","gzip -dc $soapbase.soap.gz") or die "Error: $!\n";
175 open(TS,"-|","gzip -dc $soapbase.single.gz") or die "Error: $!\n";
177 my ($BadLines,$BPOut,$ReadsOut,$MisSum,$TrimedBP,$TrimedReads,%Hit9r,%Hit9bp,%misMatch,%Indel)=(0,0,0,0,0,0);
178 my (%chrBPOut,%chrReadsOut,%chrMisSum,%chrTrimedBP,%chrTrimedReads,%chrHit9r,%chrHit9bp,%chrmisMatch,%chrIndel);
179 # for 46999 ChrIDs, one hash took 12m RES for {}=$ and 125m VIRT for {}{10}=$, thus fine to work with scaffolds. ( 1 gb VIRT max ? )
180 #my (@lines,$hit,$len,$chr,$types,$trim,$mistr,$missed);
182 sub getStatNfo($) {
183 my ($hit,$len,$chr,$types,$trim,$mistr) = @{$_[0]}[3,5,7,9,-2,-1];
184 my $missed=$mistr=~tr/ATCGatcg//;
185 $BPOut += $len; $chrBPOut{$chr} += $len;
186 ++$ReadsOut; ++$chrReadsOut{$chr};
187 ++$misMatch{$missed};
188 ++$chrmisMatch{$chr}{$missed};
189 $MisSum += $missed;
190 $chrMisSum{$chr} += $missed;
192 $hit=4 if $hit>4; # max to count 3, then >=4. Ancient Chinese wisdom, and a bit more ...
193 ++$Hit9r{$hit};
194 ++$chrHit9r{$chr}{$hit};
195 $Hit9bp{$hit} += $len;
196 $chrHit9bp{$chr}{$hit} += $len;
197 if ($types > 200) {
198 ++$Indel{200-$types};
199 ++$chrIndel{$chr}{200-$types};
200 } elsif ($types > 100) {
201 ++$Indel{$types-100};
202 ++$chrIndel{$chr}{$types-100};
205 my @Trimed = $trim =~ /(\d+)S/;
206 if (@Trimed) {
207 ++$TrimedReads;
208 ++$chrTrimedReads{$chr};
209 for ( @Trimed ) {
210 $TrimedBP += $_;
211 $chrTrimedBP{$chr} += $_;
217 my ($pairs,$lastpos,$line1,$line2,$pp,$pn,$calins,%insD,@lines)=(0);
218 while ($line1=<TP>) {
219 last if eof TP;
220 $lastpos=tell TP;
221 @lines = split /\t/,$line1;
222 if (@lines < 11) {
223 ++$BadLines;
224 next;
226 &getStatNfo(\@lines);
227 my ($id1, $n1, $len1, $f1, $chr1, $x1, $m1) = @lines[0,3,5,6,7,8,-2];
228 $line2=<TP>;
229 @lines = split /\t/,$line2;
230 if (@lines < 11) {
231 ++$BadLines;
232 next;
234 my ($id2, $n2, $len2, $f2, $chr2, $x2, $m2) = @lines[0,3,5,6,7,8,-2];
235 #($soapid,$hit,$len,$strand,$chr,$pos,$trim) = (split(/\t/))[0,3,5,6,7,8,-2];
236 $id1 =~ s/\/[12]$//;
237 $id2 =~ s/\/[12]$//;
238 if ($id1 ne $id2){ # single
239 seek (TP, $lastpos, 0);
240 next;
242 &getStatNfo(\@lines);
243 next if $n1+$n2>2 or $chr1 ne $chr2;
244 if ($f1 eq '+') {
245 ($pp,$pn)=($x1,$x2);
246 } else {
247 ($pp,$pn)=($x2,$x1);
249 =pod
250 if ($ins > 1500) { # FR => +.pos < -.pos; RF => -.pos < +.pos
251 next if $pp < $pn;
252 } else { next if $pp > $pn; }
253 =cut
254 next if $pp > $pn;
255 ++$pairs;
256 $line1=&getRealpos($len1, $f1, $x1, $m1); # $len,$strand,$realpos,$trim
257 $line2=&getRealpos($len2, $f2, $x2, $m2); # Well, $line{1,2} is recycled.
258 $calins=abs($line1-$line2); # -.starting=0
259 ++$insD{$calins};
261 close TP;
262 while (<TS>) {
263 @lines = split /\t/;
264 if (@lines > 10) { # soap2 output always more than 10 columes.
265 &getStatNfo(\@lines);
266 } else { # rare event ...
267 ++$BadLines;
268 next;
271 close TS;
273 print NFO "\n#fmtC\tReadsOut\tBPOut\tMisSum\tTrimedReads\tTrimedBP\tmisMatchReads\tReads\@Hit\tBP\@Hit\tIndelReads\tBadLines\n";
274 print NFO join("\t",'ALL',$ReadsOut,$BPOut,$MisSum,$TrimedReads,$TrimedBP,
275 ${&combineC(\%misMatch)},${&combineJ(\%Hit9r)},${&combineJ(\%Hit9bp)},${&combineJ(\%Indel)},$BadLines),"\n\n";
276 print NFO "#fmtP\tReadsOut\tBPOut\tMisSum\tTrimedReads\tTrimedBP\tmisMatchReads\tReads\@Hit\tBP\@Hit\tIndelReads\n";
277 print NFO join("\t",";$_",$chrReadsOut{$_},$chrBPOut{$_},$chrMisSum{$_}||0,$chrTrimedReads{$_}||0,$chrTrimedBP{$_}||0,
278 ${&combineC(\%{$chrmisMatch{$_}})},${&combineJ(\%{$chrHit9r{$_}})},${&combineJ(\%{$chrHit9bp{$_}})},${&combineJ(\%{$chrIndel{$_}})}),"\n" for sort keys %chrReadsOut;
279 close NFO;
281 my ($n,$fqcmd,$tmpcmd,$avg,$std,$Lsd,$Rsd,$max_y,$max_x)=(0);
282 my ($sum,$sum2,$v)=(0,0);
283 open O,'>',"$soapbase.insD";
284 for my $k (sort {$a <=> $b} keys %insD) {
285 $v=$insD{$k};
286 print O "$k\t$v\n";
287 $sum += $k * $v;
288 $n += $v;
289 $sum2 += $k*$k * $v;
291 $avg = $sum/$n;
292 $std = sqrt($sum2/$n-$avg*$avg);
293 print O "# $avg ± $std\n";
294 ($max_y,$max_x)=(-1,0);
295 for my $k ($avg-$std .. $avg+$std) { # 68.27%
296 next unless $v=$insD{$k};
297 if ($max_y < $v) {
298 $max_x = $k;
299 $max_y = $v;
302 my $cutoff = $max_y / 1000;
303 $cutoff = 3 if $cutoff<3;
304 my ($diff, $Lc, $Rc);
305 for my $k (keys %insD) {
306 $v=$insD{$k};
307 next if $v < $cutoff;
308 $diff = $k - $max_x;
309 if ($diff < 0) {
310 $Lsd += $v * $diff * $diff;
311 $Lc += $v;
313 elsif ($diff > 0) {
314 $Rsd += $v * $diff * $diff;
315 $Rc += $v;
318 $Lsd = sqrt($Lsd/$Lc);
319 $Rsd = sqrt($Rsd/$Rc);
320 print O "# +$Lsd -$Rsd\n";
321 close O;
323 END:
324 #system("bzip2 -9 $opath$fqnames[0].unmap");