scripts/tree/bp_blast2tree.pl: moved to bioperl-run
[bioperl-live.git] / Bio / PopGen / IO / phase.pm
blobade0cfe98cb9a2ca391f537e39e8b1b47948669d
2 # BioPerl module for Bio::PopGen::IO::phase
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Rich Dobson <r.j.dobson-at-qmul.ac.uk>
8 # Copyright Rich Dobson
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::PopGen::IO::phase - A parser for Phase format data
18 =head1 SYNOPSIS
20 # Do not use directly, use through the Bio::PopGen::IO driver
22 use Bio::PopGen::IO;
23 my $io = Bio::PopGen::IO->new(-format => 'phase',
24 -file => 'data.phase');
26 # Some IO might support reading in a population at a time
28 my @population;
29 while( my $ind = $io->next_individual ) {
30 push @population, $ind;
34 =head1 DESCRIPTION
36 A driver module for Bio::PopGen::IO for parsing phase data.
38 PHASE is defined in http://www.stat.washington.edu/stephens/instruct2.1.pdf
40 =head1 FEEDBACK
42 =head2 Mailing Lists
44 User feedback is an integral part of the evolution of this and other
45 Bioperl modules. Send your comments and suggestions preferably to
46 the Bioperl mailing list. Your participation is much appreciated.
48 bioperl-l@bioperl.org - General discussion
49 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
51 =head2 Support
53 Please direct usage questions or support issues to the mailing list:
55 I<bioperl-l@bioperl.org>
57 rather than to the module maintainer directly. Many experienced and
58 reponsive experts will be able look at the problem and quickly
59 address it. Please include a thorough description of the problem
60 with code and data examples if at all possible.
62 =head2 Reporting Bugs
64 Report bugs to the Bioperl bug tracking system to help us keep track
65 of the bugs and their resolution. Bug reports can be submitted via
66 the web:
68 https://github.com/bioperl/bioperl-live/issues
70 =head1 AUTHOR - Rich Dobson
72 Email r.j.dobson-at-qmul.ac.uk
74 =head1 CONTRIBUTORS
76 Jason Stajich, jason-at-bioperl.org
78 =head1 APPENDIX
80 The rest of the documentation details each of the object methods.
81 Internal methods are usually preceded with a _
83 =cut
86 # Let the code begin...
89 package Bio::PopGen::IO::phase;
90 use vars qw($FieldDelim $AlleleDelim $NoHeader);
91 use strict;
93 ($FieldDelim, $AlleleDelim, $NoHeader) = (' ', '\s+', 1);
98 use Bio::PopGen::Individual;
99 use Bio::PopGen::Population;
100 use Bio::PopGen::Genotype;
102 use base qw(Bio::PopGen::IO);
104 =head2 new
106 Title : new
107 Usage : my $obj = Bio::PopGen::IO::hapmap->new();
108 Function: Builds a new Bio::PopGen::IO::hapmap object
109 Returns : an instance of Bio::PopGen::IO::hapmap
110 Args : [optional, these are the current defaults]
111 -field_delimiter => ' '
112 -allele_delimiter=> '\s+'
113 -no_header => 0,
116 =cut
119 sub _initialize {
121 my($self, @args) = @_;
123 $Bio::PopGen::Genotype::BlankAlleles='';
125 my ($fieldsep,$all_sep,
126 $noheader) = $self->_rearrange([qw(FIELD_DELIMITER
127 ALLELE_DELIMITER
128 NO_HEADER)],@args);
130 $self->flag('no_header', defined $noheader ? $noheader : $NoHeader);
131 $self->flag('field_delimiter',defined $fieldsep ? $fieldsep : $FieldDelim);
132 $self->flag('allele_delimiter',defined $all_sep ? $all_sep : $AlleleDelim);
134 $self->{'_header'} = undef;
135 return 1;
139 =head2 flag
141 Title : flag
142 Usage : $obj->flag($flagname,$newval)
143 Function: Get/Set the flag value
144 Returns : value of a flag (a boolean)
145 Args : A flag name, currently we expect
146 'no_header', 'field_delimiter', or 'allele_delimiter'
147 on set, new value (a boolean or undef, optional)
150 =cut
152 sub flag {
154 my $self = shift;
155 my $fieldname = shift;
156 return unless defined $fieldname;
158 return $self->{'_flag'}->{$fieldname} = shift if @_;
159 return $self->{'_flag'}->{$fieldname};
163 =head2 next_individual
165 Title : next_individual
166 Usage : my $ind = $popgenio->next_individual;
167 Function: Retrieve the next individual from a dataset
168 Returns : L<Bio::PopGen::IndividualI> object
169 Args : none
172 =cut
174 sub next_individual {
175 my ($self) = @_;
177 my ($sam,@marker_results,$number_of_ids,$number_of_markers,
178 $marker_positions,$micro_snp);
180 while( defined( $_ = $self->_readline) ) {
181 chomp;
182 next if( /^\s+$/ || ! length($_) );
183 last;
186 return unless defined $_;
187 if( $self->flag('no_header') || defined $self->{'_header'} ) {
189 ####### sometimes there is some marker info at the start of a phase input file
190 ####### we collect it in the next few lines if there is. Should this info be held in a marker object?
192 if(!$self->{'_count'} && /^\s*\d+$/){
193 $self->flag('number_of_ids',$_);
194 #print "number_of_ids : $number_of_ids\n";
195 $self->{'_count'}++;
196 return $self->next_individual;
197 } elsif($self->{'_count'} == 1 && /^\s*\d+$/){
198 $self->flag('number_of_markers',$_);
199 #print "number_of_markers : $number_of_markers\n";
200 $self->{'_count'}++;
201 return $self->next_individual;
202 } elsif($self->{'_count'} == 2 && /^\s*P\s\d/){
203 $self->flag('marker_positions',$_);
204 #print "marker_position : $marker_positions\n";
205 $self->{'_count'}++;
206 return $self->next_individual;
207 } elsif($self->{'_count'} == 3 && /^\s*(M|S)+\s*$/i){
208 $self->flag('micro_snp',$_);
209 #print "microsat or snp : $micro_snp\n";
210 $self->{'_count'}++;
211 return $self->next_individual;
212 } elsif(/^\s*\#/){
213 ($self->{'_sam'}) = /^\s*\#(.+)/;
214 #print "sample : $self->{'_sam'}\n";
215 $self->{'_count'}++;
216 return $self->next_individual;
217 } else {
218 if( $self->{'_row1'} ) {
219 # if we are looking at the 2nd row of alleles for this id
221 @{$self->{'_second_row'}} =
222 split($self->flag('field_delimiter'),$_);
224 for my $i(0 .. $#{$self->{'_first_row'}}){
226 push(@{$self->{'_marker_results'}},
227 $self->{'_first_row'}->[$i].
228 $self->flag('field_delimiter').
229 $self->{'_second_row'}->[$i]);
231 $self->{'_row1'} = 0;
232 } else {
233 # if we are looking at the first row of alleles for this id
234 @{$self->{'_marker_results'}} = ();
235 @{$self->{'_first_row'}} = split($self->flag('field_delimiter'),$_);
236 $self->{'_row1'} = 1;
237 return $self->next_individual;
241 my $i = 1;
242 foreach my $m ( @{$self->{'_marker_results'}} ) {
243 $m =~ s/^\s+//;
244 $m =~ s/\s+$//;
245 my $markername;
246 if( defined($self->flag('marker_positions')) ) {
247 $markername = (split($self->flag('field_delimiter'), $self->flag('marker_positions')))[$i];
248 } elsif( defined $self->{'_header'} ) {
249 $markername = $self->{'_header'}->[$i] || "$i";
250 } else {
251 $markername = "$i";
254 my $markertype;
255 if( defined($self->flag('marker_positions')) ) {
256 $markertype = (split('', $self->flag('micro_snp')))[$i-1];
257 } else {
258 $markertype = "S";
261 $self->debug( "markername is $markername alleles are $m\n");
262 my @alleles = split($self->flag('allele_delimiter'), $m);
264 $m = Bio::PopGen::Genotype->new(-alleles =>\@alleles,
265 -marker_name => $markername,
266 -marker_type => $markertype,
267 -individual_id => $self->{'_sam'});
268 $i++;
270 return Bio::PopGen::Individual->new(-unique_id => $self->{'_sam'},
271 -genotypes =>\@{$self->{'_marker_results'}},
274 } else {
275 $self->{'_header'} = [split($self->flag('field_delimiter'),$_)];
276 return $self->next_individual; # rerun loop again
278 return;
281 =head2 next_population
283 Title : next_population
284 Usage : my $ind = $popgenio->next_population;
285 Function: Retrieve the next population from a dataset
286 Returns : L<Bio::PopGen::PopulationI> object
287 Args : none
288 Note : Many implementation will not implement this
290 =cut
292 sub next_population{
293 my ($self) = @_;
294 my @inds;
295 while( my $ind = $self->next_individual ) {
296 push @inds, $ind;
298 Bio::PopGen::Population->new(-individuals => \@inds);
301 =head2 write_individual
303 Title : write_individual
304 Usage : $popgenio->write_individual($ind);
305 Function: Write an individual out in the file format
306 Returns : none
307 Args : L<Bio::PopGen::PopulationI> object(s)
309 =cut
312 sub write_individual {
313 my ($self,@inds) = @_;
314 my $fielddelim = $self->flag('field_delimiter');
315 my $alleledelim = $self->flag('allele_delimiter');
317 # For now capture print_header flag from @inds
318 my $header = 1;
319 $header = pop(@inds) if($inds[-1] =~ m/^[01]$/);
321 foreach my $ind ( @inds ) {
322 if (! ref($ind) || ! $ind->isa('Bio::PopGen::IndividualI') ) {
323 $self->warn("Cannot write an object that is not a Bio::PopGen::IndividualI object ($ind)");
324 next;
327 # sort lexically until we have a better way to insure a consistent order
328 my @marker_names = sort $ind->get_marker_names;
330 if ($header) {
331 my $n_markers = scalar(@marker_names);
332 $self->_print( "1\n");
333 $self->_print( $n_markers, "\n");
334 if( $self->flag('no_header') &&
335 ! $self->flag('header_written') ) {
336 $self->_print(join($fielddelim, ('P', @marker_names)), "\n");
337 $self->flag('header_written',1);
339 foreach my $geno ($ind->get_Genotypes()) {
340 $self->_print($geno->marker_type);
342 $self->_print("\n");
345 my(@row1,@row2);
346 for (@marker_names){
347 my $geno = $ind->get_Genotypes(-marker => $_);
348 my @alleles = $geno->get_Alleles(1);
349 push(@row1,$alleles[0]);
350 push(@row2,$alleles[1]);
352 $self->_print("#",$ind->unique_id,"\n",
353 join($fielddelim,@row1),"\n",
354 join($fielddelim,@row2),"\n");
358 =head2 write_population
360 Title : write_population
361 Usage : $popgenio->write_population($pop);
362 Function: Write a population out in the file format
363 Returns : none
364 Args : L<Bio::PopGen::PopulationI> object(s)
365 Note : Many implementation will not implement this
367 =cut
370 sub write_population {
371 my ($self,@pops) = @_;
372 my $fielddelim = $self->flag('field_delimiter');
373 my $alleledelim = $self->flag('allele_delimiter');
375 foreach my $pop ( @pops ) {
376 if (! ref($pop) || ! $pop->isa('Bio::PopGen::PopulationI') ) {
377 $self->warn("Cannot write an object that is not a Bio::PopGen::PopulationI object");
378 next;
380 # sort lexically until we have a better way to insure a consistent order
381 my @marker_names = sort $pop->get_marker_names;
382 my $n_markers = scalar(@marker_names);
383 $self->_print( $pop->get_number_individuals, "\n");
384 $self->_print( $n_markers, "\n");
385 if( $self->flag('no_header') &&
386 ! $self->flag('header_written') ) {
387 $self->_print( join($fielddelim, ('P', @marker_names)), "\n");
388 $self->flag('header_written',1);
391 foreach (@marker_names) {
392 $self->_print(($pop->get_Genotypes($_))[0]->marker_type);
394 $self->_print("\n");
396 $self->write_individual( $pop->get_Individuals, 0 );