scripts/tree/bp_blast2tree.pl: moved to bioperl-run
[bioperl-live.git] / Bio / PopGen / Simulation / Coalescent.pm
blob4e1dfaa90962fbe8298841817af957fe3d47be6f
2 # BioPerl module for Bio::PopGen::Simulation::Coalescent
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Jason Stajich <jason-at-bioperl-dot-org>
8 # Copyright Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::PopGen::Simulation::Coalescent - A Coalescent simulation factory
18 =head1 SYNOPSIS
20 use Bio::PopGen::Simulation::Coalescent;
21 my @taxonnames = qw(SpeciesA SpeciesB SpeciesC SpeciesD);
22 my $sim1 = Bio::PopGen::Simulation::Coalescent->new(-samples => \@taxonnames);
24 my $tree = $sim1->next_tree;
26 # add 20 mutations randomly to the tree
27 $sim1->add_Mutations($tree,20);
29 # or for anonymous samples
31 my $sim2 = Bio::PopGen::Simulation::Coalescent->new( -sample_size => 6,
32 -maxcount => 50);
33 my $tree2 = $sim2->next_tree;
34 # add 20 mutations randomly to the tree
35 $sim2->add_Mutations($tree2,20);
37 =head1 DESCRIPTION
39 Builds a random tree every time next_tree is called or up to -maxcount
40 times with branch lengths and provides the ability to randomly add
41 mutations onto the tree with a probability proportional to the branch
42 lengths.
44 This algorithm is based on the make_tree algorithm from Richard Hudson 1990.
46 Hudson, R. R. 1990. Gene genealogies and the coalescent
47 process. Pp. 1-44 in D. Futuyma and J. Antonovics, eds. Oxford
48 surveys in evolutionary biology. Vol. 7. Oxford University
49 Press, New York.
51 This module was previously named Bio::Tree::RandomTree
53 =head1 FEEDBACK
55 =head2 Mailing Lists
57 User feedback is an integral part of the evolution of this and other
58 Bioperl modules. Send your comments and suggestions preferably to
59 the Bioperl mailing list. Your participation is much appreciated.
61 bioperl-l@bioperl.org - General discussion
62 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
64 =head2 Support
66 Please direct usage questions or support issues to the mailing list:
68 I<bioperl-l@bioperl.org>
70 rather than to the module maintainer directly. Many experienced and
71 responsive experts will be able to look at the problem and quickly
72 address it. Please include a thorough description of the problem
73 with code and data examples if at all possible.
75 =head2 Reporting Bugs
77 Report bugs to the Bioperl bug tracking system to help us keep track
78 of the bugs and their resolution. Bug reports can be submitted via
79 the web:
81 https://github.com/bioperl/bioperl-live/issues
83 =head1 AUTHOR - Jason Stajich, Matthew Hahn
85 Email jason-at-bioperl-dot-org
86 Email matthew-dot-hahn-at-duke-dot-edu
88 =head1 APPENDIX
90 The rest of the documentation details each of the object methods.
91 Internal methods are usually preceded with a _
93 =cut
96 # Let the code begin...
99 package Bio::PopGen::Simulation::Coalescent;
100 use vars qw($PRECISION_DIGITS);
101 use strict;
103 $PRECISION_DIGITS = 3; # Precision for the branchlength
105 use Bio::Tree::AlleleNode;
106 use Bio::PopGen::Genotype;
107 use Bio::Tree::Tree;
109 use base qw(Bio::Root::Root Bio::Factory::TreeFactoryI);
112 =head2 new
114 Title : new
115 Usage : my $obj = Bio::PopGen::Simulation::Coalescent->new();
116 Function: Builds a new Bio::PopGen::Simulation::Coalescent object
117 Returns : an instance of Bio::PopGen::Simulation::Coalescent
118 Args : -samples => arrayref of sample names
120 -sample_size=> number of samples (samps will get a systematic name)
121 -maxcount => [optional] maximum number of trees to provide
123 =cut
# Constructor.
#
# Recognised named arguments (parsed with _rearrange):
#   -samples     => array ref of sample names
#   -sample_size => number of anonymous samples; they are named Samp1..SampN
#                   (only consulted when -samples is not given)
#   -maxcount    => optional limit handed to maxcount()
#
# Throws when neither a usable -samples array ref nor a positive
# -sample_size is supplied.
sub new {
    my ($class, @args) = @_;
    my $self = $class->SUPER::new(@args);

    # Internal bookkeeping used by next_tree / maxcount.
    $self->{'_treecounter'} = 0;
    $self->{'_maxcount'}    = 0;

    my ($maxcount, $samps, $samplesize) =
        $self->_rearrange([qw(MAXCOUNT SAMPLES SAMPLE_SIZE)], @args);

    my @samples;
    if ( defined $samps ) {
        # Explicit sample names: must arrive as an array reference.
        if ( ref($samps) !~ /ARRAY/i ) {
            $self->throw("Must specify a valid ARRAY reference to the parameter -SAMPLES, did you forget a leading '\\'?");
        }
        @samples = @$samps;
    }
    else {
        # Anonymous samples: generate systematic names from the size.
        if ( !defined $samplesize || $samplesize <= 0 ) {
            $self->throw("Must specify a valid samplesize if parameter -SAMPLE is not specified (sampsize is $samplesize)");
        }
        @samples = map { "Samp$_" } 1 .. $samplesize;
    }

    $self->samples(\@samples);
    $self->sample_size(scalar @samples);
    $self->maxcount($maxcount) if defined $maxcount;

    return $self;
}
155 =head2 next_tree
157 Title : next_tree
158 Usage : my $tree = $factory->next_tree
159 Function: Returns a random tree based on the initialized number of nodes
160 NOTE: if maxcount is not specified on initialization or
161 set to a valid integer, subsequent calls to next_tree will
162 continue to return random trees and never return undef
163 Returns : Bio::Tree::TreeI object
164 Args : none
166 =cut
# Build and return one random coalescent tree for sample_size() leaves.
#
# Returns nothing once a non-zero maxcount() has been reached; with a
# false maxcount it keeps producing trees indefinitely.
#
# Every random draw goes through $self->random so that an overridden RNG
# controls the whole simulation (times and topology alike).
sub next_tree {
    my ($self) = @_;

    # The tree counter is only advanced while a limit is in force.
    return if ( $self->maxcount
        && $self->{'_treecounter'}++ >= $self->maxcount );

    my $size = $self->sample_size;

    # @tree holds one hash per node (2*size - 1 nodes in total: the leaves
    # first, then the internal nodes in order of creation).
    # @list holds indexes into @tree of the lineages that have not yet
    # coalesced; shuffling these indexes generates the topology.
    my @tree;
    my @list;

    for ( my $in = 0; $in < 2 * $size - 1; $in++ ) {
        push @tree, { 'nodenum' => "Node$in" };
    }

    # Leaves sit at time 0, have no descendants and are all still active.
    for ( my $in = 0; $in < $size; $in++ ) {
        $tree[$in]->{'time'}  = 0;
        $tree[$in]->{'desc1'} = undef;
        $tree[$in]->{'desc2'} = undef;
        push @list, $in;
    }

    # Coalescence times: while $in lineages remain, the waiting time is an
    # inverse-CDF draw with mean 2 / ($in * ($in - 1)); times accumulate.
    my $t = 0;
    for ( my $in = $size; $in > 1; $in-- ) {
        $t += -2.0 * log( 1 - $self->random(1) ) / ( $in * ( $in - 1 ) );
        $tree[ 2 * $size - $in ]->{'time'} = $t;
    }

    # Topology: pick two distinct active lineages and join them under the
    # next internal node, which then replaces them in @list.
    for ( my $in = $size; $in > 1; $in-- ) {
        my $pick      = int( $self->random($in) );
        my $nodeindex = $list[$pick];
        my $swap      = 2 * $size - $in;
        $tree[$swap]->{'desc1'} = $nodeindex;

        # Remove the first pick by moving the last active entry into its slot.
        $list[$pick] = $list[ $in - 1 ];

        # Second pick among the remaining $in - 1 lineages.  This used to
        # call the builtin rand() directly, bypassing the overridable RNG.
        $pick      = int( $self->random( $in - 1 ) );
        $nodeindex = $list[$pick];
        $tree[$swap]->{'desc2'} = $nodeindex;
        $list[$pick] = $swap;
    }

    # Convert the hashes into node objects.  Note that -branch_length is
    # given the node's time, not the length of the branch above it.
    my @nodes;
    foreach my $n (@tree) {
        push @nodes,
            Bio::Tree::AlleleNode->new(
                -id            => $n->{'nodenum'},
                -branch_length => $n->{'time'}
            );
    }

    # Wire each internal node to its two descendants.
    my $ct = 0;
    foreach my $node (@nodes) {
        my $n = $tree[ $ct++ ];
        if ( defined $n->{'desc1'} ) {
            $node->add_Descendent( $nodes[ $n->{'desc1'} ] );
        }
        if ( defined $n->{'desc2'} ) {
            $node->add_Descendent( $nodes[ $n->{'desc2'} ] );
        }
    }

    # The last node created is the final coalescence, i.e. the root.
    my $T = Bio::Tree::Tree->new( -root => pop @nodes );
    return $T;
}
239 =head2 add_Mutations
241 Title : add_Mutations
242 Usage : $factory->add_Mutations($tree, $mutcount);
243 Function: Adds mutations to a tree via a random process weighted by
244 branch length (it is a poisson distribution
245 as part of a coalescent process)
246 Returns : none
247 Args : $tree - Bio::Tree::TreeI
248 $nummut - number of mutations
249 $precision - optional # of digits for precision
252 =cut
# Drop $nummut mutations onto $tree, choosing the branch for each one with
# probability proportional to its length.
#
# Args:
#   $tree      - tree object providing get_nodes()
#   $nummut    - number of mutations to place
#   $precision - optional number of decimal digits used when turning branch
#                lengths into integer weights (defaults to $PRECISION_DIGITS)
#
# Returns nothing.  Every node is reblessed to Bio::Tree::AlleleNode if
# necessary and has its genotypes reset before mutations are added, so the
# same tree can be reused for repeated mutation rounds.
#
# Throws if the weight bookkeeping is inconsistent, or if mutations were
# requested but no branch has a positive length at the given precision.
sub add_Mutations {
    my ($self, $tree, $nummut, $precision) = @_;
    $precision ||= $PRECISION_DIGITS;
    $precision = 10**$precision;

    my @branches;
    my $branchlen = 0;
    my $last      = 0;
    my @nodes     = $tree->get_nodes();
    my $i         = 0;

    # A simplistic way of placing a fixed number of mutations: build an
    # array in which each branch (identified by the index of the node below
    # it) occupies a number of slots proportional to its length
    # (ancestor's time - node's time), then sample slots uniformly.
    foreach my $node (@nodes) {
        if ( $node->ancestor ) {
            my $len = int( ( $node->ancestor->branch_length
                             - $node->branch_length ) * $precision );
            if ( $len > 0 ) {
                push @branches, ($i) x $len;
                $last += $len;
            }
            $branchlen += $len;
        }
        if ( !$node->isa('Bio::Tree::AlleleNode') ) {
            bless $node, 'Bio::Tree::AlleleNode';  # rebless it to the right node
        }

        # This lets us reset the stored genotypes so we can keep reusing
        # the same tree topology, but throw down mutations multiple times.
        $node->reset_Genotypes;
        $i++;
    }

    # Sanity check: fails if any branch contributed a negative length.
    $self->throw("branch len is $branchlen arraylen is $last")
        unless ( $branchlen == $last );

    # With no weighted slots there is nowhere to put a mutation.  The
    # previous code drew rand(0) here and silently mutated $nodes[0].
    if ( !@branches && defined $nummut && $nummut > 0 ) {
        $self->throw("Cannot add mutations: no branch has a positive length at this precision");
    }

    my @mutations;
    for ( my $j = 0; $j < $nummut; $j++ ) {
        # Draw through $self->random so an overridden RNG is honoured.
        my $index  = int( $self->random($branchlen) );
        my $branch = $branches[$index];

        # We're using an infinite sites model so every new
        # mutation is a new site.
        my $g = Bio::PopGen::Genotype->new(
            -marker_name => "Mutation$j",
            -alleles     => [1]
        );
        $nodes[$branch]->add_Genotype($g);
        push @mutations, "Mutation$j";

        # Push the mutation down the branches to the tips.
        foreach my $child ( $nodes[$branch]->get_all_Descendents ) {
            $child->add_Genotype($g);
        }
    }

    # Ensure that everyone who doesn't have the mutation
    # has the ancestral state, which is '0'.
    foreach my $node (@nodes) {
        foreach my $m (@mutations) {
            if ( !$node->has_Marker($m) ) {
                my $emptyg = Bio::PopGen::Genotype->new(
                    -marker_name => $m,
                    -alleles     => [0]
                );
                $node->add_Genotype($emptyg);
            }
        }
    }
    return;
}
326 =head2 maxcount
328 Title : maxcount
329 Usage : $obj->maxcount($newval)
330 Function:
331 Returns : Maxcount value
332 Args : newvalue (optional)
335 =cut
# Get/set the maximum number of trees to hand out.
#
# Args   : optional new value; its leading run of digits is stored.
#          A value without leading digits triggers a warning and stores 0.
# Returns: the current value.
#
# The value lives under the '_maxcount' key, the same key the constructor
# initialises.  The setter used to write to 'maxcount' instead, so the
# stored limit was never returned.
sub maxcount {
    my ($self, $value) = @_;
    if ( defined $value ) {
        if ( $value =~ /^(\d+)/ ) {
            $self->{'_maxcount'} = $1;
        }
        else {
            $self->warn("Must specify a valid Positive integer to maxcount");
            $self->{'_maxcount'} = 0;
        }
    }
    return $self->{'_maxcount'};
}
350 =head2 samples
352 Title : samples
353 Usage : $obj->samples($newval)
354 Function:
355 Example :
356 Returns : value of samples
357 Args : newvalue (optional)
360 =cut
# Get/set the array reference of sample names.
#
# Args   : optional new value; anything that is not an array reference
#          triggers a warning and is replaced by an empty array ref.
# Returns: the stored array reference (undef if never set).
sub samples {
    my ($self, $value) = @_;

    # Pure getter when no new value is supplied.
    return $self->{'samples'} unless defined $value;

    if ( ref($value) !~ /ARRAY/i ) {
        $self->warn("Must specify a valid array ref to the method 'samples'");
        $value = [];
    }
    $self->{'samples'} = $value;
    return $self->{'samples'};
}
375 =head2 sample_size
377 Title : sample_size
378 Usage : $obj->sample_size($newval)
379 Function:
380 Example :
381 Returns : value of sample_size
382 Args : newvalue (optional)
385 =cut
# Get/set the number of samples.
#
# Args   : optional new value (stored as given, without validation)
# Returns: the stored value (undef if never set).
sub sample_size {
    my ($self, $value) = @_;
    $self->{'sample_size'} = $value if defined $value;
    return $self->{'sample_size'};
}
396 =head2 random
398 Title : random
399 Usage : my $rfloat = $node->random($size)
400 Function: Generates a random number between 0 and $size
401 This is abstracted so that someone can override and provide their
402 own special RNG. This is expected to be a uniform RNG.
403 Returns : Floating point random
404 Args : $maximum size for random number (defaults to 1)
407 =cut
# Source of randomness for the simulation; override in a subclass to plug
# in a different RNG.
#
# Args   : $max - upper bound, passed straight to the builtin rand()
# Returns: whatever rand($max) returns.
sub random {
    my $self = shift;
    my $max  = shift;
    return rand($max);
}