new file: mfig1.py
[GalaxyCodeBases.git] / perl / soap / insizing / calculate_mx.pl
blob537dbd6cec36931138e0fba020dcb48268031c77
1 #!/usr/bin/perl -w
2 use strict;
4 unless(@ARGV>0)
6 print "perl $0 <soap file name> <insert size result> <frequence statistics> <drop rate(0~1)>\n";
7 exit 0;
10 my ($sum,$count,$sum_2,@insertdata,%f);
11 my $soapfile = $ARGV[0];
12 my $insert = $ARGV[1];
13 my $frequence = $ARGV[2];
14 my $drop = $ARGV[3];
15 #open SOAP, "<", "$soapfile";
16 open( SOAP,"-|","gzip -dc $soapfile");
17 open INSERT, ">", "$insert";
18 open FREQUENCE, ">", "$frequence";
20 my $prograss;
21 my $filetotal = `ls -l $soapfile`;
22 my @tmp = split/\s+/,$filetotal;
23 $filetotal = $tmp[4];
25 print "\nProcessing......\n\n";
26 while(my $line1 = <SOAP>)
28 my @arry1 = split/\s+/,$line1;
29 my $line2 = <SOAP>;
30 my @arry2 = split/\s+/,$line2;
31 unless($arry1[0] eq $arry2[0] && $arry1[7] eq $arry2[7])
33 $line2 = <SOAP>;
34 @arry2 = split/\s+/,$line2;
36 if($arry1[6] eq '-')
38 my $result1 = $arry1[8]+$arry1[5]-$arry2[8];
39 print INSERT $result1,"\n";
40 $sum += $result1;
41 $sum_2 += ($result1**2);
42 $f{$result1}++;
44 else
46 my $result2 = $arry2[8]+$arry2[5]-$arry1[8];
47 print INSERT $result2,"\n";
48 $sum += $result2;
49 $sum_2 += ($result2**2);
50 $f{$result2}++;
52 $count++;
53 if($count%640 == 0)
55 &prograss_bar((tell SOAP),$filetotal);
58 close INSERT;
60 sub prograss_bar
62 my ($i,$total) = ($_[0],$_[1]);
63 local $| = 1;
64 print "\r [".("=" x int(($i/$total)*25)).">".(" " x (24 - int(($i/$total)*25)))."]";
65 printf("%2.1f %%",$i/$total*100);
66 local $| = 0;
69 my @maxfr = (0,0);
70 foreach (sort keys %f)
72 if($f{$_} > $maxfr[1])
74 $maxfr[0] = $_;
75 $maxfr[1] = $f{$_};
77 print FREQUENCE $_,"\t";
78 print FREQUENCE $f{$_},"\n";
81 print "\n\n";
82 print "Process Done!\n\n";
83 my $average = $sum/$count;
84 print "The average value is : ", $average,".\n";
85 print "The number of sample is : ", $count,".\n";
86 my $sd = sqrt(($sum_2-$count*($average**2))/($count-1));
87 print "The SD value is : ", $sd,".\n\n";
89 print "The Max Frequence Value is : ", $maxfr[0], " ", $maxfr[1], "\n";
90 #print "Now Please Input the presentage you want to drop (0~1): ";
91 #chomp(my $drop = <STDIN>);
92 my $k = 0;
93 $sum = $sum_2 = $count = 0;
94 open INSERT, "<", "$insert";
95 while(<INSERT>)
97 chomp;
98 $insertdata[$k] = $_;
99 $k++;
101 foreach (sort keys %f)
103 if($f{$_} <= $maxfr[1]*$drop)
105 delete $f{$_};
108 foreach (@insertdata)
110 if(exists $f{$_})
112 $sum += $_;
113 $sum_2 += ($_**2);
114 $count++;
118 sub gam1{
119 my $x1 = $_[0];
120 my $t = 0;
121 my @a = (0.0000677106,-0.0003442342,0.0015397681,-0.0024467480,
122 0.0109736958,-0.0002109075,0.0742379071,0.0815782188,
123 0.4118402518,0.4227843370,1.0);
124 unless($x1 > 0)
126 print "x must greater than 0!\n";
127 exit 0;
129 if($x1 < 1)
131 $t = 1/($x1*($x1+1));
132 $x1 += 2;
134 elsif($x1 <= 2)
136 $t = 1/$x1;
137 $x1 += 1;
139 elsif($x1 <= 3)
141 $t = 1;
143 else
145 $t = 1;
146 while($x1 > 3)
148 $x1 -= 1;
149 $t *= $x1;
152 my $s1 = $a[0];
153 my $u = $x1-2;
154 foreach (@a)
156 $s1 = $s1*$u+$_;
158 $s1 *= $t;
161 sub gam2
163 my ($a_gam,$x_gam) = ($_[0],$_[1]);
164 my ($n_gam,$p_gam,$q_gam,$d_gam,$s_gam,$s1_gam,$p0_gam,$q0_gam,$p1_gam,$q1_gam,$qq_gam);
165 if(($a_gam <= 0)||($x_gam < 0))
167 if($a_gam <= 0)
169 print "err**a<=0!\n";
171 if($x_gam < 0)
173 print "err**x<0!\n";
175 exit 0;
177 if($x_gam == 0)
179 return 0;
181 if($x_gam > 1e35)
183 return 1;
185 $q_gam = $a_gam*log($x_gam);
186 $qq_gam = exp($q_gam);
187 if($x_gam < 1+$a_gam)
189 $p_gam = $a_gam;
190 $s_gam = $d_gam = 1/$a_gam;
191 for($n_gam = 1; $n_gam <= 100; $n_gam++)
193 $p_gam++;
194 $d_gam = $d_gam*$x_gam/$p_gam;
195 $s_gam += $d_gam;
196 if(abs($d_gam) < abs($s_gam)*1e-7)
198 $s_gam=$s_gam*exp(-$x_gam)*$qq_gam/&gam1($a_gam);
199 return $s_gam;
203 else
205 $s_gam = 1/$x_gam;
206 $p0_gam = 0;
207 $q0_gam = $p1_gam=1;
208 $q1_gam = $x_gam;
209 for($n_gam = 1; $n_gam <= 100; $n_gam++)
211 $p0_gam = $p1_gam+($n_gam-$a_gam)*$p0_gam;
212 $q0_gam = $q1_gam+($n_gam-$a_gam)*$q0_gam;
213 $p_gam = $x_gam*$p0_gam+$n_gam*$p1_gam;
214 $q_gam = $x_gam*$q0_gam+$n_gam*$q1_gam;
215 if(abs($q_gam) != 0)
217 $s1_gam = $p_gam/$q_gam;
218 $p1_gam = $p_gam;
219 $q1_gam = $q_gam;
220 if(abs(($s1_gam-$s_gam)/$s1_gam) < 1e-7)
222 $s_gam = $s1_gam*exp(-$x_gam)*$qq_gam/&gam1($a_gam);
223 return (1-$s_gam);
225 $s_gam = $s1_gam;
227 $p1_gam = $p_gam;
228 $q1_gam = $q_gam;
231 print "a too large !\n";
232 $s_gam = 1-$s_gam*exp(-$x_gam)*$qq_gam/&gam1($a_gam);
235 sub erf
237 my $x_erf = $_[0];
238 my $y_erf;
239 if($x_erf >= 0)
241 $y_erf=&gam2(0.5,$x_erf**2);
243 else
245 $y_erf=-&gam2(0.5,$x_erf);
249 my $average_drop = $sum/$count;
250 print "The average value after modification is : ", $average_drop,".\n";
251 print "The number of sample after modification is : ", $count,".\n";
252 my $pa = 3.1415926536;
253 my $variance = ($sum_2-$count*($average_drop**2))/($count-1);
254 print "The SD value after modification is : ",sqrt($variance), "\n";
255 my $sd_drop = sqrt($variance*&erf(sqrt((-1)*log($drop)))/(&erf(sqrt((-1)*log($drop)))-2*$drop*sqrt((-1)*log($drop))/sqrt($pa)));
256 print "The estimated SD value based on the modified sample is : ", $sd_drop,".\n";
257 close SOAP;
258 close INSERT;