7 DistMCMC does maximum likelihood extimation of pairwise distance using Markov chain Monte Carlo algorithm.
8 Only JC69, JC69+G, K80 and K80+G are available now.\n
12 Usage: $0 <in.fasta> <out.txt>
13 Please edit the Perl script directly if you want to make changes of parameters.
16 my $NoItera = 100000; # Number of iterations
17 my $Burnin = 10000; # Burn in
18 my $Savfreq = 100; # Frequency of saving states after burn in
19 my $Prifreq = 50000; # Frequency of printing states on screen
20 my $dSlidWin = 0.1; # Sliding window size of distance
21 my $kSlidWin = 10; # Sliding window size of kappa
22 my @dRange = (0, 1); # Range of distance
23 my @kRange = (0, 100); # Range of kappa
24 my $alpha1 = 1; # Gamma shape parameter
30 open my $i1, "<", "$i";
34 my %dna = %{&readfasta
($i1)};
37 my %nseg; # Hash of pairwise number of segregation sites
38 foreach my $s1 (sort keys %dna) {
39 foreach my $s2 (sort keys %dna) {
40 if ($nseg{"$s1 $s2"} or $nseg{"$s2 $s1"}) {
42 } elsif ($s1 eq $s2) {
45 $nseg{"$s1 $s2"} = &calnseg
($dna{$s1}, $dna{$s2});
50 print O
"#Pair\tDistance\tDistance_mean\tDistance_median\tDistance_%95_HPD_lower\tDistance_%95_HPD_upper\tKappa_mean\tKappa_median\tKappa_%95_HPD_lower\tKappa_%95_HPD_upper\n";
52 foreach (sort keys %nseg) {
54 print "\nJC69 MCMC simulating...\nNo. Iteration\tDistance\n";
55 my @a = &JC69MCMC
(@
{$nseg{$_}});
56 print O
$_, "\tJC69\t", join("\t", @a), "\n";
57 print "\nJC69+G MCMC simulating...\nNo. Iteration\tDistance\n";
58 @a = &JC69gammaMCMC
(@
{$nseg{$_}}, $alpha1);
59 print O
$_, "\tJC69+G a=$alpha1\t", join("\t", @a), "\n";
60 @a = &JC69gammaMCMC
(@
{$nseg{$_}}, $alpha2);
61 print O
$_, "\tJC69+G a=$alpha2\t", join("\t", @a), "\n";
62 @a = &JC69gammaMCMC
(@
{$nseg{$_}}, $alpha3);
63 print O
$_, "\tJC69+G a=$alpha3\t", join("\t", @a), "\n";
64 print "\nK80 MCMC simulating...\nNo. Iteration\tDistance\tKappa\n";
65 @a = &K80MCMC
(@
{$nseg{$_}});
66 print O
$_, "\tK80\t", join("\t", @a), "\n";
67 print "\nK80+G MCMC simulating...\nNo. Iteration\tDistance\tKappa\n";
68 @a = &K80gammaMCMC
(@
{$nseg{$_}}, $alpha1);
69 print O
$_, "\tK80+G a=$alpha1\t", join("\t", @a), "\n";
70 @a = &K80gammaMCMC
(@
{$nseg{$_}}, $alpha2);
71 print O
$_, "\tK80+G a=$alpha2\t", join("\t", @a), "\n";
72 @a = &K80gammaMCMC
(@
{$nseg{$_}}, $alpha3);
73 print O
$_, "\tK80+G a=$alpha3\t", join("\t", @a), "\n";
77 sub readfasta
{ # Read fasta file
86 $nchar = length $seq unless $nchar;
99 foreach my $i (0 .. $nchar-1) {
100 next unless (substr($s1, $i, 1) =~ /[ATCG]/) and (substr($s2, $i, 1) =~ /[ATCG]/);
102 if (substr($s1, $i, 1) eq "A") {
103 if (substr($s2, $i, 1) eq "G") {
105 } elsif (substr($s2, $i, 1) eq "A") {
110 } elsif (substr($s1, $i, 1) eq "G") {
111 if (substr($s2, $i, 1) eq "G") {
113 } elsif (substr($s2, $i, 1) eq "A") {
118 } elsif (substr($s1, $i, 1) eq "C") {
119 if (substr($s2, $i, 1) eq "T") {
121 } elsif (substr($s2, $i, 1) eq "C") {
126 } elsif (substr($s1, $i, 1) eq "T") {
127 if (substr($s2, $i, 1) eq "T") {
129 } elsif (substr($s2, $i, 1) eq "C") {
136 return [$ti, $tv, $total];
140 sub NewSta
{ # Propose new state
141 my ($OldStat, $SlidWin, $LoBound, $UpBound) = @_;
143 my $TemStat = $OldStat + ($r1-0.5)*$SlidWin; # Temp state
144 my $NewStat; # New state
145 if ($TemStat < $LoBound) {
146 $NewStat = 2*$LoBound - $TemStat;
147 } elsif ($TemStat > $UpBound) {
148 $NewStat = 2*$UpBound - $TemStat;
155 sub statis
{ # Calculate mean, median and 95% HPD of numbers
156 my @d_sort = sort {$a <=> $b} @_;
157 my $num_d = @_; # Number of samples in @_
158 my $num_mid = int($num_d/2); # Middle number of @d_sort;
159 my $d_mid = $d_sort[$num_mid];
164 my $d_mean = $sum/@_;
165 my $num95_d = 0.95*$num_d; # %95 HPD number of samples
166 my $intHPD95 = $d_sort[$num95_d-1] - $d_sort[0]; # %95 HPD interval
167 my $intHPD95_l = $d_sort[0]; # Left boundary of %95 HPD interval
168 my $intHPD95_r = $d_sort[$num95_d-1]; # Right boundary of %95 HPD interval
169 for (my $i = 1; $num95_d+$i <= $num_d; ++$i) {
170 my $int95 = $d_sort[$num95_d+$i-1] - $d_sort[$i]; # %95 interval
171 if ($int95 < $intHPD95) {
173 $intHPD95_l = $d_sort[$i];
174 $intHPD95_r = $d_sort[$num95_d+$i-1];
177 return ($d_mean, $d_mid, $intHPD95_l, $intHPD95_r);
181 my ($ti, $tv, $n) = @_;
182 my $p = ($ti+$tv)/$n;
183 my $d_mean = -0.75*log(1-4/3*$p);
184 my $d_stde = sqrt($p*(1-$p)/(1-4/3*$p)**2/$n);
185 my $ci_l = $d_mean - 1.96*$d_stde;
186 my $ci_r = $d_mean + 1.96*$d_stde;
187 return ($d_mean, $d_stde, $ci_l, $ci_r);
191 my ($ti, $tv, $n) = @_;
192 my $d = ($dRange[1]-$dRange[0])*(rand) + $dRange[0]; # Random initial state
193 my @d; # An array of saved states
194 foreach (1 .. $NoItera) {
195 my $d_old = $d; # Old state
196 my $d_new = &NewSta
($d_old, $dSlidWin, @dRange);
197 my $logOld = ($ti+$tv)*log(0.75-0.75*exp(-4/3*$d_old)) + ($n-$ti-$tv)*log(0.25+0.75*exp(-4/3*$d_old)); # Log likelihood of old state
198 my $logNew = ($ti+$tv)*log(0.75-0.75*exp(-4/3*$d_new)) + ($n-$ti-$tv)*log(0.25+0.75*exp(-4/3*$d_new)); # Log likelihood of new state
199 my $accRat; # Acceptance ratio
200 if ($logNew > $logOld) {
203 $accRat = exp($logNew-$logOld);
209 my $pri = $_/$Prifreq;
210 print "$_\t$d\n" if $pri == int($pri);
212 my $sav = $_/$Savfreq;
213 push @d, $d if $sav == int($sav);
216 my @d_stat = &statis
(@d);
221 my ($ti, $tv, $n) = @_;
224 my $d_mean = -0.5*log(1-2*$S-$V) - 0.25*log(1-2*$V);
225 my $a = 1/(1-2*$S-$V);
226 my $b = 0.5*(1/(1-2*$S-$V)+1/(1-2*$V));
227 my $d_stde = sqrt(($a**2*$S+$b**2*$V-($a*$S+$b*$V)**2)/$n);
228 my $ci_l = $d_mean - 1.96*$d_stde;
229 my $ci_r = $d_mean + 1.96*$d_stde;
230 # my $k_mean = 2*log(1-2*$S-$V)/log(1-2*$V)-1;
231 return ($d_mean, $d_stde, $ci_l, $ci_r);
235 my ($ti, $tv, $n) = @_;
236 my $d = ($dRange[1]-$dRange[0])*(rand) + $dRange[0]; # Random initial state of distance
237 my $k = ($kRange[1]-$kRange[0])*(rand) + $kRange[0]; # Random initial state of kappa
238 my @d; # An array of saved states of distance
239 my @k; # An array of saved states of kappa
240 foreach (1 .. $NoItera) {
241 my $d_old = $d; # Old state of distance
242 my $d_new = &NewSta
($d_old, $dSlidWin, @dRange);
243 my $k_old = $k; # Old state of kappa
244 my $k_new = &NewSta
($k_old, $kSlidWin, @kRange);
245 my $old1 = exp(-4*$d_old/($k_old+2));
246 my $old2 = exp(-2*$d_old*($k_old+1)/($k_old+2));
247 my $logOld = $ti*log(0.25+0.25*$old1-0.5*$old2) + $tv*log(0.5-0.5*$old1) + ($n-$ti-$tv)*log(0.25+0.25*$old1+0.5*$old2); # Log likelihood of old state
248 my $new1 = exp(-4*$d_new/($k_new+2));
249 my $new2 = exp(-2*$d_new*($k_new+1)/($k_new+2));
250 my $logNew = $ti*log(0.25+0.25*$new1-0.5*$new2) + $tv*log(0.5-0.5*$new1) + ($n-$ti-$tv)*log(0.25+0.25*$new1+0.5*$new2); # Log likelihood of new state
251 my $accRat; # Acceptance ratio
252 if ($logNew > $logOld) {
255 $accRat = exp($logNew-$logOld);
262 my $pri = $_/$Prifreq;
263 print "$_\t$d\t$k\n" if $pri == int($pri);
265 my $sav = $_/$Savfreq;
266 if ($sav == int($sav)) {
272 my @d_stat = &statis
(@d);
273 my @k_stat = &statis
(@k);
274 return (@d_stat, @k_stat);
278 my ($ti, $tv, $n, $alpha) = @_;
279 my $p = ($ti+$tv)/$n;
280 my $d_mean = 0.75*$alpha*((1-4/3*$p)**(-1/$alpha)-1);
281 my $d_stde = sqrt($p*(1-$p)/$n*(1-4/3*$p)**(-2/($alpha+1))); # There is something wrong.
282 my $ci_l = $d_mean - 1.96*$d_stde;
283 my $ci_r = $d_mean + 1.96*$d_stde;
284 return ($d_mean, $d_stde, $ci_l, $ci_r);
288 my ($ti, $tv, $n, $alpha) = @_;
289 my $d = ($dRange[1]-$dRange[0])*(rand) + $dRange[0]; # Random initial state
290 my @d; # An array of saved states
291 foreach (1 .. $NoItera) {
292 my $d_old = $d; # Old state
293 my $d_new = &NewSta
($d_old, $dSlidWin, @dRange);
294 my $old1 = 1/(1+4*$d_old/3/$alpha)**$alpha;
295 my $logOld = ($ti+$tv)*log(0.75-0.75*$old1) + ($n-$ti-$tv)*log(0.25+0.75*$old1); # Log likelihood of old state
296 my $new1 = 1/(1+4*$d_new/3/$alpha)**$alpha;
297 my $logNew = ($ti+$tv)*log(0.75-0.75*$new1) + ($n-$ti-$tv)*log(0.25+0.75*$new1); # Log likelihood of new state
298 my $accRat; # Acceptance ratio
299 if ($logNew > $logOld) {
302 $accRat = exp($logNew-$logOld);
308 my $pri = $_/$Prifreq;
309 print "$_\t$d\n" if $pri == int($pri);
311 my $sav = $_/$Savfreq;
312 push @d, $d if $sav == int($sav);
315 my @d_stat = &statis
(@d);
320 my ($ti, $tv, $n, $alpha) = @_;
323 my $d_mean = $alpha/2*((1-2*$S-$V)**(-1/$alpha)-1) + $alpha/4*((1-2*$V)**(-1/$alpha)-1);
324 my $a = (1-2*$S-$V)**(-1/(1+$alpha));
325 my $b = 0.5*((1-2*$S-$V)**(-1/(1+$alpha))+(1-2*$V)**(-1/(1+$alpha)));
326 my $d_stde = sqrt(($a**2*$S+$b**2*$V-($a*$S+$b*$V)**2)/$n); # There is something wrong.
327 my $ci_l = $d_mean - 1.96*$d_stde;
328 my $ci_r = $d_mean + 1.96*$d_stde;
329 return ($d_mean, $d_stde, $ci_l, $ci_r);
333 my ($ti, $tv, $n, $alpha) = @_;
334 my $d = ($dRange[1]-$dRange[0])*(rand) + $dRange[0]; # Random initial state of distance
335 my $k = ($kRange[1]-$kRange[0])*(rand) + $kRange[0]; # Random initial state of kappa
336 my @d; # An array of saved states of distance
337 my @k; # An array of saved states of kappa
338 foreach (1 .. $NoItera) {
339 my $d_old = $d; # Old state of distance
340 my $d_new = &NewSta
($d_old, $dSlidWin, @dRange);
341 my $k_old = $k; # Old state of kappa
342 my $k_new = &NewSta
($k_old, $kSlidWin, @kRange);
343 my $old1 = 1/(1+4*$d_old/($k_old+2)/$alpha)**$alpha;
344 my $old2 = 1/(1+2*$d_old*($k_old+1)/($k_old+2)/$alpha)**$alpha;
345 my $logOld = $ti*log(0.25+0.25*$old1-0.5*$old2) + $tv*log(0.5-0.5*$old1) + ($n-$ti-$tv)*log(0.25+0.25*$old1+0.5*$old2); # Log likelihood of old state
346 my $new1 = 1/(1+4*$d_new/($k_new+2)/$alpha)**$alpha;
347 my $new2 = 1/(1+2*$d_new*($k_new+1)/($k_new+2)/$alpha)**$alpha;
348 my $logNew = $ti*log(0.25+0.25*$new1-0.5*$new2) + $tv*log(0.5-0.5*$new1) + ($n-$ti-$tv)*log(0.25+0.25*$new1+0.5*$new2); # Log likelihood of new state
349 my $accRat; # Acceptance ratio
350 if ($logNew > $logOld) {
353 $accRat = exp($logNew-$logOld);
360 my $pri = $_/$Prifreq;
361 print "$_\t$d\t$k\n" if $pri == int($pri);
363 my $sav = $_/$Savfreq;
364 if ($sav == int($sav)) {
370 my @d_stat = &statis
(@d);
371 my @k_stat = &statis
(@k);
372 return (@d_stat, @k_stat);