modified: myjupyterlab.sh
[GalaxyCodeBases.git] / perl / etc / WoodyMiaoLin / DistMCMC / DistMCMC.pl
blob22109efe7ffe67607e782e31fc7f44ec79f36302
1 #!/usr/bin/perl
2 use strict;
3 use warnings;
5 die("
6 Program: DistMCMC
7 DistMCMC does maximum likelihood extimation of pairwise distance using Markov chain Monte Carlo algorithm.
8 Only JC69, JC69+G, K80 and K80+G are available now.\n
9 Version: 1.0
10 Release: Aug. 3, 2014
11 Auther: Woody\n
12 Usage: $0 <in.fasta> <out.txt>
13 Please edit the Perl script directly if you want to make changes of parameters.
14 \n") if (@ARGV<2);
16 my $NoItera = 100000; # Number of iterations
17 my $Burnin = 10000; # Burn in
18 my $Savfreq = 100; # Frequency of saving states after burn in
19 my $Prifreq = 50000; # Frequency of printing states on screen
20 my $dSlidWin = 0.1; # Sliding window size of distance
21 my $kSlidWin = 10; # Sliding window size of kappa
22 my @dRange = (0, 1); # Range of distance
23 my @kRange = (0, 100); # Range of kappa
24 my $alpha1 = 1; # Gamma shape parameter
25 my $alpha2 = 0.6;
26 my $alpha3 = 0.2;
28 my $i = shift;
29 my $o = shift;
30 open my $i1, "<", "$i";
31 open O, ">", "$o";
33 my $nchar;
34 my %dna = %{&readfasta($i1)};
35 close $i1;
37 my %nseg; # Hash of pairwise number of segregation sites
38 foreach my $s1 (sort keys %dna) {
39 foreach my $s2 (sort keys %dna) {
40 if ($nseg{"$s1 $s2"} or $nseg{"$s2 $s1"}) {
41 next;
42 } elsif ($s1 eq $s2) {
43 next;
44 } else {
45 $nseg{"$s1 $s2"} = &calnseg($dna{$s1}, $dna{$s2});
50 print O "#Pair\tDistance\tDistance_mean\tDistance_median\tDistance_%95_HPD_lower\tDistance_%95_HPD_upper\tKappa_mean\tKappa_median\tKappa_%95_HPD_lower\tKappa_%95_HPD_upper\n";
52 foreach (sort keys %nseg) {
53 print "\n$_";
54 print "\nJC69 MCMC simulating...\nNo. Iteration\tDistance\n";
55 my @a = &JC69MCMC(@{$nseg{$_}});
56 print O $_, "\tJC69\t", join("\t", @a), "\n";
57 print "\nJC69+G MCMC simulating...\nNo. Iteration\tDistance\n";
58 @a = &JC69gammaMCMC(@{$nseg{$_}}, $alpha1);
59 print O $_, "\tJC69+G a=$alpha1\t", join("\t", @a), "\n";
60 @a = &JC69gammaMCMC(@{$nseg{$_}}, $alpha2);
61 print O $_, "\tJC69+G a=$alpha2\t", join("\t", @a), "\n";
62 @a = &JC69gammaMCMC(@{$nseg{$_}}, $alpha3);
63 print O $_, "\tJC69+G a=$alpha3\t", join("\t", @a), "\n";
64 print "\nK80 MCMC simulating...\nNo. Iteration\tDistance\tKappa\n";
65 @a = &K80MCMC(@{$nseg{$_}});
66 print O $_, "\tK80\t", join("\t", @a), "\n";
67 print "\nK80+G MCMC simulating...\nNo. Iteration\tDistance\tKappa\n";
68 @a = &K80gammaMCMC(@{$nseg{$_}}, $alpha1);
69 print O $_, "\tK80+G a=$alpha1\t", join("\t", @a), "\n";
70 @a = &K80gammaMCMC(@{$nseg{$_}}, $alpha2);
71 print O $_, "\tK80+G a=$alpha2\t", join("\t", @a), "\n";
72 @a = &K80gammaMCMC(@{$nseg{$_}}, $alpha3);
73 print O $_, "\tK80+G a=$alpha3\t", join("\t", @a), "\n";
75 close O;
77 sub readfasta { # Read fasta file
78 my $in = $_[0];
79 my %fa;
80 while (<$in>) {
81 $/ = ">";
82 my $seq = <$in>;
83 chomp $seq;
84 $seq =~ s/\s//g;
85 $/ = "\n";
86 $nchar = length $seq unless $nchar;
87 s/>//;
88 /^(\S+)\s/;
89 $fa{$1} = $seq;
91 return \%fa;
94 sub calnseg {
95 my ($s1, $s2) = @_;
96 my $total;
97 my $ti = 0;
98 my $tv = 0;
99 foreach my $i (0 .. $nchar-1) {
100 next unless (substr($s1, $i, 1) =~ /[ATCG]/) and (substr($s2, $i, 1) =~ /[ATCG]/);
101 ++$total;
102 if (substr($s1, $i, 1) eq "A") {
103 if (substr($s2, $i, 1) eq "G") {
104 ++$ti;
105 } elsif (substr($s2, $i, 1) eq "A") {
106 next;
107 } else {
108 ++$tv;
110 } elsif (substr($s1, $i, 1) eq "G") {
111 if (substr($s2, $i, 1) eq "G") {
112 next;
113 } elsif (substr($s2, $i, 1) eq "A") {
114 ++$ti;
115 } else {
116 ++$tv;
118 } elsif (substr($s1, $i, 1) eq "C") {
119 if (substr($s2, $i, 1) eq "T") {
120 ++$ti;
121 } elsif (substr($s2, $i, 1) eq "C") {
122 next;
123 } else {
124 ++$tv;
126 } elsif (substr($s1, $i, 1) eq "T") {
127 if (substr($s2, $i, 1) eq "T") {
128 next;
129 } elsif (substr($s2, $i, 1) eq "C") {
130 ++$ti;
131 } else {
132 ++$tv;
136 return [$ti, $tv, $total];
140 sub NewSta { # Propose new state
141 my ($OldStat, $SlidWin, $LoBound, $UpBound) = @_;
142 my $r1 = rand;
143 my $TemStat = $OldStat + ($r1-0.5)*$SlidWin; # Temp state
144 my $NewStat; # New state
145 if ($TemStat < $LoBound) {
146 $NewStat = 2*$LoBound - $TemStat;
147 } elsif ($TemStat > $UpBound) {
148 $NewStat = 2*$UpBound - $TemStat;
149 } else {
150 $NewStat = $TemStat;
152 return $NewStat;
155 sub statis { # Calculate mean, median and 95% HPD of numbers
156 my @d_sort = sort {$a <=> $b} @_;
157 my $num_d = @_; # Number of samples in @_
158 my $num_mid = int($num_d/2); # Middle number of @d_sort;
159 my $d_mid = $d_sort[$num_mid];
160 my $sum;
161 foreach (@_) {
162 $sum += $_;
164 my $d_mean = $sum/@_;
165 my $num95_d = 0.95*$num_d; # %95 HPD number of samples
166 my $intHPD95 = $d_sort[$num95_d-1] - $d_sort[0]; # %95 HPD interval
167 my $intHPD95_l = $d_sort[0]; # Left boundary of %95 HPD interval
168 my $intHPD95_r = $d_sort[$num95_d-1]; # Right boundary of %95 HPD interval
169 for (my $i = 1; $num95_d+$i <= $num_d; ++$i) {
170 my $int95 = $d_sort[$num95_d+$i-1] - $d_sort[$i]; # %95 interval
171 if ($int95 < $intHPD95) {
172 $intHPD95 = $int95;
173 $intHPD95_l = $d_sort[$i];
174 $intHPD95_r = $d_sort[$num95_d+$i-1];
177 return ($d_mean, $d_mid, $intHPD95_l, $intHPD95_r);
180 sub JC69ML {
181 my ($ti, $tv, $n) = @_;
182 my $p = ($ti+$tv)/$n;
183 my $d_mean = -0.75*log(1-4/3*$p);
184 my $d_stde = sqrt($p*(1-$p)/(1-4/3*$p)**2/$n);
185 my $ci_l = $d_mean - 1.96*$d_stde;
186 my $ci_r = $d_mean + 1.96*$d_stde;
187 return ($d_mean, $d_stde, $ci_l, $ci_r);
190 sub JC69MCMC {
191 my ($ti, $tv, $n) = @_;
192 my $d = ($dRange[1]-$dRange[0])*(rand) + $dRange[0]; # Random initial state
193 my @d; # An array of saved states
194 foreach (1 .. $NoItera) {
195 my $d_old = $d; # Old state
196 my $d_new = &NewSta($d_old, $dSlidWin, @dRange);
197 my $logOld = ($ti+$tv)*log(0.75-0.75*exp(-4/3*$d_old)) + ($n-$ti-$tv)*log(0.25+0.75*exp(-4/3*$d_old)); # Log likelihood of old state
198 my $logNew = ($ti+$tv)*log(0.75-0.75*exp(-4/3*$d_new)) + ($n-$ti-$tv)*log(0.25+0.75*exp(-4/3*$d_new)); # Log likelihood of new state
199 my $accRat; # Acceptance ratio
200 if ($logNew > $logOld) {
201 $accRat = 1;
202 } else {
203 $accRat = exp($logNew-$logOld);
205 my $r2 = rand;
206 if ($r2 < $accRat) {
207 $d = $d_new;
209 my $pri = $_/$Prifreq;
210 print "$_\t$d\n" if $pri == int($pri);
211 if ($_ > $Burnin) {
212 my $sav = $_/$Savfreq;
213 push @d, $d if $sav == int($sav);
216 my @d_stat = &statis(@d);
217 return (@d_stat);
220 sub K80ML {
221 my ($ti, $tv, $n) = @_;
222 my $S = $ti/$n;
223 my $V = $tv/$n;
224 my $d_mean = -0.5*log(1-2*$S-$V) - 0.25*log(1-2*$V);
225 my $a = 1/(1-2*$S-$V);
226 my $b = 0.5*(1/(1-2*$S-$V)+1/(1-2*$V));
227 my $d_stde = sqrt(($a**2*$S+$b**2*$V-($a*$S+$b*$V)**2)/$n);
228 my $ci_l = $d_mean - 1.96*$d_stde;
229 my $ci_r = $d_mean + 1.96*$d_stde;
230 # my $k_mean = 2*log(1-2*$S-$V)/log(1-2*$V)-1;
231 return ($d_mean, $d_stde, $ci_l, $ci_r);
234 sub K80MCMC {
235 my ($ti, $tv, $n) = @_;
236 my $d = ($dRange[1]-$dRange[0])*(rand) + $dRange[0]; # Random initial state of distance
237 my $k = ($kRange[1]-$kRange[0])*(rand) + $kRange[0]; # Random initial state of kappa
238 my @d; # An array of saved states of distance
239 my @k; # An array of saved states of kappa
240 foreach (1 .. $NoItera) {
241 my $d_old = $d; # Old state of distance
242 my $d_new = &NewSta($d_old, $dSlidWin, @dRange);
243 my $k_old = $k; # Old state of kappa
244 my $k_new = &NewSta($k_old, $kSlidWin, @kRange);
245 my $old1 = exp(-4*$d_old/($k_old+2));
246 my $old2 = exp(-2*$d_old*($k_old+1)/($k_old+2));
247 my $logOld = $ti*log(0.25+0.25*$old1-0.5*$old2) + $tv*log(0.5-0.5*$old1) + ($n-$ti-$tv)*log(0.25+0.25*$old1+0.5*$old2); # Log likelihood of old state
248 my $new1 = exp(-4*$d_new/($k_new+2));
249 my $new2 = exp(-2*$d_new*($k_new+1)/($k_new+2));
250 my $logNew = $ti*log(0.25+0.25*$new1-0.5*$new2) + $tv*log(0.5-0.5*$new1) + ($n-$ti-$tv)*log(0.25+0.25*$new1+0.5*$new2); # Log likelihood of new state
251 my $accRat; # Acceptance ratio
252 if ($logNew > $logOld) {
253 $accRat = 1;
254 } else {
255 $accRat = exp($logNew-$logOld);
257 my $r2 = rand;
258 if ($r2 < $accRat) {
259 $d = $d_new;
260 $k = $k_new;
262 my $pri = $_/$Prifreq;
263 print "$_\t$d\t$k\n" if $pri == int($pri);
264 if ($_ > $Burnin) {
265 my $sav = $_/$Savfreq;
266 if ($sav == int($sav)) {
267 push @d, $d;
268 push @k, $k;
272 my @d_stat = &statis(@d);
273 my @k_stat = &statis(@k);
274 return (@d_stat, @k_stat);
277 sub JC69gammaML {
278 my ($ti, $tv, $n, $alpha) = @_;
279 my $p = ($ti+$tv)/$n;
280 my $d_mean = 0.75*$alpha*((1-4/3*$p)**(-1/$alpha)-1);
281 my $d_stde = sqrt($p*(1-$p)/$n*(1-4/3*$p)**(-2/($alpha+1))); # There is something wrong.
282 my $ci_l = $d_mean - 1.96*$d_stde;
283 my $ci_r = $d_mean + 1.96*$d_stde;
284 return ($d_mean, $d_stde, $ci_l, $ci_r);
287 sub JC69gammaMCMC {
288 my ($ti, $tv, $n, $alpha) = @_;
289 my $d = ($dRange[1]-$dRange[0])*(rand) + $dRange[0]; # Random initial state
290 my @d; # An array of saved states
291 foreach (1 .. $NoItera) {
292 my $d_old = $d; # Old state
293 my $d_new = &NewSta($d_old, $dSlidWin, @dRange);
294 my $old1 = 1/(1+4*$d_old/3/$alpha)**$alpha;
295 my $logOld = ($ti+$tv)*log(0.75-0.75*$old1) + ($n-$ti-$tv)*log(0.25+0.75*$old1); # Log likelihood of old state
296 my $new1 = 1/(1+4*$d_new/3/$alpha)**$alpha;
297 my $logNew = ($ti+$tv)*log(0.75-0.75*$new1) + ($n-$ti-$tv)*log(0.25+0.75*$new1); # Log likelihood of new state
298 my $accRat; # Acceptance ratio
299 if ($logNew > $logOld) {
300 $accRat = 1;
301 } else {
302 $accRat = exp($logNew-$logOld);
304 my $r2 = rand;
305 if ($r2 < $accRat) {
306 $d = $d_new;
308 my $pri = $_/$Prifreq;
309 print "$_\t$d\n" if $pri == int($pri);
310 if ($_ > $Burnin) {
311 my $sav = $_/$Savfreq;
312 push @d, $d if $sav == int($sav);
315 my @d_stat = &statis(@d);
316 return (@d_stat);
319 sub K80gammaML {
320 my ($ti, $tv, $n, $alpha) = @_;
321 my $S = $ti/$n;
322 my $V = $tv/$n;
323 my $d_mean = $alpha/2*((1-2*$S-$V)**(-1/$alpha)-1) + $alpha/4*((1-2*$V)**(-1/$alpha)-1);
324 my $a = (1-2*$S-$V)**(-1/(1+$alpha));
325 my $b = 0.5*((1-2*$S-$V)**(-1/(1+$alpha))+(1-2*$V)**(-1/(1+$alpha)));
326 my $d_stde = sqrt(($a**2*$S+$b**2*$V-($a*$S+$b*$V)**2)/$n); # There is something wrong.
327 my $ci_l = $d_mean - 1.96*$d_stde;
328 my $ci_r = $d_mean + 1.96*$d_stde;
329 return ($d_mean, $d_stde, $ci_l, $ci_r);
332 sub K80gammaMCMC {
333 my ($ti, $tv, $n, $alpha) = @_;
334 my $d = ($dRange[1]-$dRange[0])*(rand) + $dRange[0]; # Random initial state of distance
335 my $k = ($kRange[1]-$kRange[0])*(rand) + $kRange[0]; # Random initial state of kappa
336 my @d; # An array of saved states of distance
337 my @k; # An array of saved states of kappa
338 foreach (1 .. $NoItera) {
339 my $d_old = $d; # Old state of distance
340 my $d_new = &NewSta($d_old, $dSlidWin, @dRange);
341 my $k_old = $k; # Old state of kappa
342 my $k_new = &NewSta($k_old, $kSlidWin, @kRange);
343 my $old1 = 1/(1+4*$d_old/($k_old+2)/$alpha)**$alpha;
344 my $old2 = 1/(1+2*$d_old*($k_old+1)/($k_old+2)/$alpha)**$alpha;
345 my $logOld = $ti*log(0.25+0.25*$old1-0.5*$old2) + $tv*log(0.5-0.5*$old1) + ($n-$ti-$tv)*log(0.25+0.25*$old1+0.5*$old2); # Log likelihood of old state
346 my $new1 = 1/(1+4*$d_new/($k_new+2)/$alpha)**$alpha;
347 my $new2 = 1/(1+2*$d_new*($k_new+1)/($k_new+2)/$alpha)**$alpha;
348 my $logNew = $ti*log(0.25+0.25*$new1-0.5*$new2) + $tv*log(0.5-0.5*$new1) + ($n-$ti-$tv)*log(0.25+0.25*$new1+0.5*$new2); # Log likelihood of new state
349 my $accRat; # Acceptance ratio
350 if ($logNew > $logOld) {
351 $accRat = 1;
352 } else {
353 $accRat = exp($logNew-$logOld);
355 my $r2 = rand;
356 if ($r2 < $accRat) {
357 $d = $d_new;
358 $k = $k_new;
360 my $pri = $_/$Prifreq;
361 print "$_\t$d\t$k\n" if $pri == int($pri);
362 if ($_ > $Burnin) {
363 my $sav = $_/$Savfreq;
364 if ($sav == int($sav)) {
365 push @d, $d;
366 push @k, $k;
370 my @d_stat = &statis(@d);
371 my @k_stat = &statis(@k);
372 return (@d_stat, @k_stat);