2 # BioPerl module for Bio::PopGen::IO::phase
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Rich Dobson <r.j.dobson-at-qmul.ac.uk>
8 # Copyright Rich Dobson
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::PopGen::IO::phase - A parser for Phase format data
20 # Do not use directly, use through the Bio::PopGen::IO driver
23 my $io = Bio::PopGen::IO->new(-format => 'phase',
24 -file => 'data.phase');
26 # Some IO might support reading in a population at a time
29 while( my $ind = $io->next_individual ) {
30 push @population, $ind;
36 A driver module for Bio::PopGen::IO for parsing phase data.
38 PHASE is defined in http://www.stat.washington.edu/stephens/instruct2.1.pdf
44 User feedback is an integral part of the evolution of this and other
45 Bioperl modules. Send your comments and suggestions preferably to
46 the Bioperl mailing list. Your participation is much appreciated.
48 bioperl-l@bioperl.org - General discussion
49 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
53 Please direct usage questions or support issues to the mailing list:
55 I<bioperl-l@bioperl.org>
57 rather than to the module maintainer directly. Many experienced and
58 reponsive experts will be able look at the problem and quickly
59 address it. Please include a thorough description of the problem
60 with code and data examples if at all possible.
64 Report bugs to the Bioperl bug tracking system to help us keep track
65 of the bugs and their resolution. Bug reports can be submitted via
68 https://github.com/bioperl/bioperl-live/issues
70 =head1 AUTHOR - Rich Dobson
72 Email r.j.dobson-at-qmul.ac.uk
76 Jason Stajich, jason-at-bioperl.org
80 The rest of the documentation details each of the object methods.
81 Internal methods are usually preceded with a _
86 # Let the code begin...
89 package Bio
::PopGen
::IO
::phase
;
90 use vars
qw($FieldDelim $AlleleDelim $NoHeader);
93 ($FieldDelim, $AlleleDelim, $NoHeader) = (' ', '\s+', 1);
98 use Bio::PopGen::Individual;
99 use Bio::PopGen::Population;
100 use Bio::PopGen::Genotype;
102 use base qw(Bio::PopGen::IO);
107 Usage : my $obj = Bio::PopGen::IO::hapmap->new();
108 Function: Builds a new Bio::PopGen::IO::hapmap object
109 Returns : an instance of Bio::PopGen::IO::hapmap
110 Args : [optional, these are the current defaults]
111 -field_delimiter => ' '
112 -allele_delimiter=> '\s+'
121 my($self, @args) = @_;
123 $Bio::PopGen
::Genotype
::BlankAlleles
='';
125 my ($fieldsep,$all_sep,
126 $noheader) = $self->_rearrange([qw(FIELD_DELIMITER
130 $self->flag('no_header', defined $noheader ?
$noheader : $NoHeader);
131 $self->flag('field_delimiter',defined $fieldsep ?
$fieldsep : $FieldDelim);
132 $self->flag('allele_delimiter',defined $all_sep ?
$all_sep : $AlleleDelim);
134 $self->{'_header'} = undef;
142 Usage : $obj->flag($flagname,$newval)
143 Function: Get/Set the flag value
144 Returns : value of a flag (a boolean)
145 Args : A flag name, currently we expect
146 'no_header', 'field_delimiter', or 'allele_delimiter'
147 on set, new value (a boolean or undef, optional)
155 my $fieldname = shift;
156 return unless defined $fieldname;
158 return $self->{'_flag'}->{$fieldname} = shift if @_;
159 return $self->{'_flag'}->{$fieldname};
163 =head2 next_individual
165 Title : next_individual
166 Usage : my $ind = $popgenio->next_individual;
167 Function: Retrieve the next individual from a dataset
168 Returns : L<Bio::PopGen::IndividualI> object
174 sub next_individual
{
177 my ($sam,@marker_results,$number_of_ids,$number_of_markers,
178 $marker_positions,$micro_snp);
180 while( defined( $_ = $self->_readline) ) {
182 next if( /^\s+$/ || ! length($_) );
186 return unless defined $_;
187 if( $self->flag('no_header') || defined $self->{'_header'} ) {
189 ####### sometimes there is some marker info at the start of a phase input file
190 ####### we collect it in the next few lines if there is. Should this info be held in a marker object?
192 if(!$self->{'_count'} && /^\s*\d+$/){
193 $self->flag('number_of_ids',$_);
194 #print "number_of_ids : $number_of_ids\n";
196 return $self->next_individual;
197 } elsif($self->{'_count'} == 1 && /^\s*\d+$/){
198 $self->flag('number_of_markers',$_);
199 #print "number_of_markers : $number_of_markers\n";
201 return $self->next_individual;
202 } elsif($self->{'_count'} == 2 && /^\s*P\s\d/){
203 $self->flag('marker_positions',$_);
204 #print "marker_position : $marker_positions\n";
206 return $self->next_individual;
207 } elsif($self->{'_count'} == 3 && /^\s*(M|S)+\s*$/i){
208 $self->flag('micro_snp',$_);
209 #print "microsat or snp : $micro_snp\n";
211 return $self->next_individual;
213 ($self->{'_sam'}) = /^\s*\#(.+)/;
214 #print "sample : $self->{'_sam'}\n";
216 return $self->next_individual;
218 if( $self->{'_row1'} ) {
219 # if we are looking at the 2nd row of alleles for this id
221 @
{$self->{'_second_row'}} =
222 split($self->flag('field_delimiter'),$_);
224 for my $i(0 .. $#{$self->{'_first_row'}}){
226 push(@
{$self->{'_marker_results'}},
227 $self->{'_first_row'}->[$i].
228 $self->flag('field_delimiter').
229 $self->{'_second_row'}->[$i]);
231 $self->{'_row1'} = 0;
233 # if we are looking at the first row of alleles for this id
234 @
{$self->{'_marker_results'}} = ();
235 @
{$self->{'_first_row'}} = split($self->flag('field_delimiter'),$_);
236 $self->{'_row1'} = 1;
237 return $self->next_individual;
242 foreach my $m ( @
{$self->{'_marker_results'}} ) {
246 if( defined($self->flag('marker_positions')) ) {
247 $markername = (split($self->flag('field_delimiter'), $self->flag('marker_positions')))[$i];
248 } elsif( defined $self->{'_header'} ) {
249 $markername = $self->{'_header'}->[$i] || "$i";
255 if( defined($self->flag('marker_positions')) ) {
256 $markertype = (split('', $self->flag('micro_snp')))[$i-1];
261 $self->debug( "markername is $markername alleles are $m\n");
262 my @alleles = split($self->flag('allele_delimiter'), $m);
264 $m = Bio
::PopGen
::Genotype
->new(-alleles
=>\
@alleles,
265 -marker_name
=> $markername,
266 -marker_type
=> $markertype,
267 -individual_id
=> $self->{'_sam'});
270 return Bio
::PopGen
::Individual
->new(-unique_id
=> $self->{'_sam'},
271 -genotypes
=>\@
{$self->{'_marker_results'}},
275 $self->{'_header'} = [split($self->flag('field_delimiter'),$_)];
276 return $self->next_individual; # rerun loop again
281 =head2 next_population
283 Title : next_population
284 Usage : my $ind = $popgenio->next_population;
285 Function: Retrieve the next population from a dataset
286 Returns : L<Bio::PopGen::PopulationI> object
288 Note : Many implementation will not implement this
295 while( my $ind = $self->next_individual ) {
298 Bio
::PopGen
::Population
->new(-individuals
=> \
@inds);
301 =head2 write_individual
303 Title : write_individual
304 Usage : $popgenio->write_individual($ind);
305 Function: Write an individual out in the file format
307 Args : L<Bio::PopGen::PopulationI> object(s)
312 sub write_individual
{
313 my ($self,@inds) = @_;
314 my $fielddelim = $self->flag('field_delimiter');
315 my $alleledelim = $self->flag('allele_delimiter');
317 # For now capture print_header flag from @inds
319 $header = pop(@inds) if($inds[-1] =~ m/^[01]$/);
321 foreach my $ind ( @inds ) {
322 if (! ref($ind) || ! $ind->isa('Bio::PopGen::IndividualI') ) {
323 $self->warn("Cannot write an object that is not a Bio::PopGen::IndividualI object ($ind)");
327 # sort lexically until we have a better way to insure a consistent order
328 my @marker_names = sort $ind->get_marker_names;
331 my $n_markers = scalar(@marker_names);
332 $self->_print( "1\n");
333 $self->_print( $n_markers, "\n");
334 if( $self->flag('no_header') &&
335 ! $self->flag('header_written') ) {
336 $self->_print(join($fielddelim, ('P', @marker_names)), "\n");
337 $self->flag('header_written',1);
339 foreach my $geno ($ind->get_Genotypes()) {
340 $self->_print($geno->marker_type);
347 my $geno = $ind->get_Genotypes(-marker
=> $_);
348 my @alleles = $geno->get_Alleles(1);
349 push(@row1,$alleles[0]);
350 push(@row2,$alleles[1]);
352 $self->_print("#",$ind->unique_id,"\n",
353 join($fielddelim,@row1),"\n",
354 join($fielddelim,@row2),"\n");
358 =head2 write_population
360 Title : write_population
361 Usage : $popgenio->write_population($pop);
362 Function: Write a population out in the file format
364 Args : L<Bio::PopGen::PopulationI> object(s)
365 Note : Many implementation will not implement this
370 sub write_population
{
371 my ($self,@pops) = @_;
372 my $fielddelim = $self->flag('field_delimiter');
373 my $alleledelim = $self->flag('allele_delimiter');
375 foreach my $pop ( @pops ) {
376 if (! ref($pop) || ! $pop->isa('Bio::PopGen::PopulationI') ) {
377 $self->warn("Cannot write an object that is not a Bio::PopGen::PopulationI object");
380 # sort lexically until we have a better way to insure a consistent order
381 my @marker_names = sort $pop->get_marker_names;
382 my $n_markers = scalar(@marker_names);
383 $self->_print( $pop->get_number_individuals, "\n");
384 $self->_print( $n_markers, "\n");
385 if( $self->flag('no_header') &&
386 ! $self->flag('header_written') ) {
387 $self->_print( join($fielddelim, ('P', @marker_names)), "\n");
388 $self->flag('header_written',1);
391 foreach (@marker_names) {
392 $self->_print(($pop->get_Genotypes($_))[0]->marker_type);
396 $self->write_individual( $pop->get_Individuals, 0 );