1 # POD documentation - main docs before the code
5 Bio::SeqIO::msout - input stream for output by Hudson's ms
9 Do not use this module directly. Use it via the Bio::SeqIO class.
13 ms ( Hudson, R. R. (2002) Generating samples under a Wright-Fisher neutral
14 model. Bioinformatics 18:337-8 ) can be found at
15 http://home.uchicago.edu/~rhudson1/source/mksamples.html.
17 Currently, this object can be used to read output from ms into seq objects.
18 However, because bioperl has no support for haplotypes created using an infinite
19 sites model (where '1' identifies a derived allele and '0' identifies an
20 ancestral allele), the sequences returned by msout are coded using A, T, C and
21 G. To decode the bases, use the sequence conversion table (a hash) returned by
22 get_base_conversion_table(). In the table, 4 and 5 are used when the ancestry is
23 unclear. This should not ever happen when creating files with ms, but it will be
24 used when creating msOUT files from a collection of seq objects ( To be added
25 later ). Alternatively, use get_next_hap() to get a string with 1's and 0's
26 instead of a seq object.
28 =head2 Mapping to Finite Sites
30 This object can now also be used to map haplotypes created using an infinite sites
31 model to sequences of arbitrary finite length. See set_n_sites() for more detail.
32 Thanks to Filipe G. Vieira <fgvieira@berkeley.edu> for the idea and code.
38 User feedback is an integral part of the evolution of this and other
39 Bioperl modules. Send your comments and suggestions preferably to the
40 Bioperl mailing list. Your participation is much appreciated.
42 bioperl-l@bioperl.org - General discussion
43 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
47 Report bugs to the Bioperl bug tracking system to help us keep track
48 of the bugs and their resolution. Bug reports can be submitted via the
51 https://github.com/bioperl/bioperl-live/issues
53 =head1 AUTHOR - Warren Kretzschmar
55 This module was written by Warren Kretzschmar
57 email: wkretzsch@gmail.com
59 This module grew out of a parser written by Aida Andres.
63 =head2 Public Domain Notice
65 This software/database is ``United States Government Work'' under the
66 terms of the United States Copyright Act. It was written as part of
67 the authors' official duties for the United States Government and thus
68 cannot be copyrighted. This software/database is freely available to
69 the public for use without a copyright notice. Restrictions cannot
70 be placed on its present or future use.
72 Although all reasonable efforts have been taken to ensure the accuracy
73 and reliability of the software and data, the National Human Genome
74 Research Institute (NHGRI) and the U.S. Government does not and cannot
75 warrant the performance or results that may be obtained by using this
76 software or data. NHGRI and the U.S. Government disclaims all
77 warranties as to performance, merchantability or fitness for any
84 package Bio
::SeqIO
::msout
;
87 our $API_VERSION = qv
('1.1.8');
90 use base
qw(Bio::SeqIO); # This ISA Bio::SeqIO object
91 use Bio
::Seq
::SeqFactory
;
93 =head2 Methods for Internal Use
98 Usage : $stream = Bio::SeqIO::msOUT->new($infile)
99 Function: extracts basic information about the file.
100 Returns : Bio::SeqIO object
101 Args : no_og, gunzip, gzip, n_sites
103 - include 'no_og' flag if the last population of an msout file contains
104 only one haplotype and you don't want the last haplotype to be
105 treated as the outgroup ( suggested when reading data created by ms ).
106 - including 'n_sites' (positive integer) causes all output haplotypes to be
107 mapped to a sequence of length 'n_sites'. See set_n_sites() for more details.
112 my ( $self, @args ) = @_;
113 $self->SUPER::_initialize
(@args);
115 unless ( defined $self->sequence_factory ) {
116 $self->sequence_factory( Bio
::Seq
::SeqFactory
->new() );
118 my ($no_og) = $self->_rearrange( [qw(NO_OG)], @args );
119 my ($n_sites) = $self->_rearrange( [qw(N_SITES)], @args );
121 my %initial_values = (
126 MS_INFO_LINE
=> undef,
127 TOT_RUN_HAPS
=> undef,
129 NEXT_RUN_NUM
=> undef, # What run is the next hap from? undef = EOF
130 LAST_READ_HAP_NUM
=> undef, # What did we just read from
131 LAST_HAPS_RUN_NUM
=> undef,
132 LAST_READ_POSITIONS
=> [],
133 LAST_READ_SEGSITES
=> undef,
135 NO_OUTGROUP
=> $no_og,
136 BASE_CONVERSION_TABLE_HASH_REF
=> {
143 foreach my $key ( keys %initial_values ) {
144 $self->{$key} = $initial_values{$key};
146 $self->set_n_sites($n_sites);
148 # If the filehandle is defined open it and read a few lines
149 if ( ref( $self->{_filehandle
} ) eq 'GLOB' ) {
150 $self->_read_start();
154 # Otherwise throw a warning
157 "No filehandle defined. Please define a file handle through -file when calling msout with Bio::SeqIO"
165 Usage : $stream->_read_start()
166 Function: reads from the filehandle $stream->{_filehandle} all information up to the first haplotype (sequence). Closes the filehandle if all lines have been read.
175 my $fh_IN = $self->{_filehandle
};
177 # get the first five lines and parse for important info
178 my ( $ms_info_line, $seeds ) = $self->_get_next_clean_hap( $fh_IN, 2, 1 );
180 my @ms_info_line = split( /\s+/, $ms_info_line );
182 my ( $tot_pops, @pop_haplos );
184 # Parsing the ms header line
186 my $tot_run_haps = shift @ms_info_line;
187 my $runs = shift @ms_info_line;
190 foreach my $word ( 0 .. $#ms_info_line ) {
191 if ( $ms_info_line[$word] eq '-I' ) {
192 $tot_pops = $ms_info_line[ $word + 1 ];
193 for my $pop_num ( 1 .. $tot_pops ) {
194 push @pop_haplos, $ms_info_line[ $word + 1 + $pop_num ];
197 # if @pop_haplos contains a non-digit, then there is an error in the msinfo line.
198 if ( !defined $pop_haplos[-1] || $pop_haplos[-1] =~ /\D/ ) {
200 "Incorrect number of populations in the ms info line (after the -I specifier)"
204 elsif ( $ms_info_line[$word] eq '-s' ) {
205 $segsites = $ms_info_line[ $word + 1 ];
210 unless (@pop_haplos) { @pop_haplos = ($tot_run_haps) }
212 my @seeds = split( /\s+/, $seeds );
215 $self->{RUNS
} = $runs;
216 $self->{SEGSITES
} = $segsites;
217 $self->{SEEDS
} = \
@seeds;
218 $self->{MS_INFO_LINE
} = $ms_info_line;
219 $self->{TOT_RUN_HAPS
} = $tot_run_haps;
220 $self->{POPS
} = [@pop_haplos];
225 =head2 Methods to Access Data
230 Usage : $segsites = $stream->get_segsites()
231 Function: returns the number of segsites in the msOUT file (according to the msOUT header line's -s option), or the current run's segsites if -s was not specified in the command line (in this case the number of segsites varies from run to run).
239 if ( defined $self->{SEGSITES
} ) {
240 return $self->{SEGSITES
};
243 return $self->get_current_run_segsites;
247 =head3 get_current_run_segsites
249 Title : get_current_run_segsites
250 Usage : $segsites = $stream->get_current_run_segsites()
251 Function: returns the number of segsites in the run of the last read
252 haplotype (sequence).
258 sub get_current_run_segsites
{
260 return $self->{LAST_READ_SEGSITES
};
266 Usage : $n_sites = $stream->get_n_sites()
267 Function: Gets the number of total sites (variable or not) to be output.
268 Returns : scalar if n_sites option is defined at call time of new()
271 WARNING: Final sequence length might not be equal to n_sites if n_sites is
272 too close to number of segregating sites in the msout file.
279 return $self->{N_SITES
};
285 Usage : $n_sites = $stream->set_n_sites($value)
286 Function: Sets the number of total sites (variable or not) to be output.
287 Returns : 1 on success; throws an error if $value is not a positive integer or undef
288 Args : positive integer
290 WARNING: Final sequence length might not be equal to n_sites if it is
291 too close to number of segregating sites.
292 - n_sites needs to be at least as large as the number of segsites of
293 the next haplotype returned
294 - n_sites may also be set to undef, in which case haplotypes are returned
295 under the infinite sites model assumptions.
300 my ( $self, $value ) = @_;
302 # make sure $value is a positive integer if it is defined
303 if ( defined $value ) {
305 "first argument needs to be a positive integer. argument supplied: $value"
306 ) unless ( $value =~ m/^\d+$/ && $value > 0 );
308 $self->{N_SITES
} = $value;
316 Usage : $runs = $stream->get_runs()
317 Function: returns the number of runs in the msOUT file (according to the
326 return $self->{RUNS
};
332 Usage : @seeds = $stream->get_Seeds()
333 Function: returns an array of the seeds used in the creation of the msOUT file.
336 Details : In older versions, ms used three seeds. Newer versions of ms seem to
337 use only one (longer) seed. This function will return all the seeds
344 return @
{ $self->{SEEDS
} };
349 Title : get_Positions
350 Usage : @positions = $stream->get_Positions()
351 Function: returns an array of the names of each segsite of the run of the last
355 Details : The Positions may or may not vary from run to run depending on the
356 options used with ms.
362 return @
{ $self->{LAST_READ_POSITIONS
} };
365 =head3 get_tot_run_haps
367 Title : get_tot_run_haps
368 Usage : $number_of_haps_per_run = $stream->get_tot_run_haps()
369 Function: returns the number of haplotypes (sequences) in each run of the msOUT
370 file ( according to the msinfo line ).
371 Returns : scalar >= 0
373 Details : This number should not vary from run to run.
377 sub get_tot_run_haps
{
379 return $self->{TOT_RUN_HAPS
};
382 =head3 get_ms_info_line
384 Title : get_ms_info_line
385 Usage : $ms_info_line = $stream->get_ms_info_line()
386 Function: returns the header line of the msOUT file.
392 sub get_ms_info_line
{
394 return $self->{MS_INFO_LINE
};
400 Usage : $number_of_haplotypes_in_file = $stream->tot_haps()
401 Function: returns the number of haplotypes (sequences) in the msOUT file.
402 Information gathered from msOUT header line.
410 return ( $self->{TOT_RUN_HAPS
} * $self->{RUNS
} );
416 Usage : @pops = $stream->pops()
417 Function: returns an array of population sizes (order taken from the -I flag in
418 the msOUT header line). This array will include the last hap even if
419 it looks like an outgroup.
420 Returns : array of scalars > 0
427 return @
{ $self->{POPS
} };
430 =head3 get_next_run_num
432 Title : get_next_run_num
433 Usage : $next_run_number = $stream->next_run_num()
434 Function: returns the number of the ms run that the next haplotype (sequence)
435 will be taken from (starting at 1). Returns undef if the complete
437 Returns : scalar > 0 or undef
442 sub get_next_run_num
{
444 return $self->{NEXT_RUN_NUM
};
447 =head3 get_last_haps_run_num
449 Title : get_last_haps_run_num
450 Usage : $last_haps_run_number = $stream->get_last_haps_run_num()
451 Function: returns the number of the ms run that the last haplotype (sequence)
452 was taken from (starting at 1). Returns undef if no hap has been
454 Returns : scalar > 0 or undef
459 sub get_last_haps_run_num
{
461 return $self->{LAST_HAPS_RUN_NUM
};
464 =head3 get_last_read_hap_num
466 Title : get_last_read_hap_num
467 Usage : $last_read_hap_num = $stream->get_last_read_hap_num()
468 Function: returns the number (starting with 1) of the last haplotype read from
470 Returns : scalar >= 0
472 Details : 0 means that no haplotype has been read yet. Is reset to 0 every run.
476 sub get_last_read_hap_num
{
478 return $self->{LAST_READ_HAP_NUM
};
484 Usage : $outgroup = $stream->outgroup()
485 Function: returns '1' if the msOUT stream has an outgroup. Returns '0'
489 Details : This method will return '1' only if the last population in the msOUT
490 file contains only one haplotype. If the last population is not an
491 outgroup then create the msOUT object using 'no_og' as input flag.
492 Also, return 0, if the run has only one population.
498 my @pops = $self->get_Pops;
499 if ( $pops[$#pops] == 1 && !defined $self->{NO_OUTGROUP
} && @pops > 1 ) {
505 =head3 get_next_haps_pop_num
507 Title : get_next_haps_pop_num
508 Usage : ($next_haps_pop_num, $num_haps_left_in_pop) = $stream->get_next_haps_pop_num()
509 Function: First return value is the population number (starting with 1) the
510 next hap will come from. The second return value is the number of haps
511 left to read in the population from which the next hap will come.
512 Returns : (scalar > 0, scalar > 0)
517 sub get_next_haps_pop_num
{
519 my $last_read_hap = $self->get_last_read_hap_num;
520 my @pops = $self->get_Pops;
522 foreach my $pop_num ( 0 .. $#pops ) {
523 if ( $last_read_hap < $pops[$pop_num] ) {
524 return ( $pop_num + 1, $pops[$pop_num] - $last_read_hap );
526 else { $last_read_hap -= $pops[$pop_num] }
529 # In this case we're at the beginning of the next run
530 return ( 1, $pops[0] );
536 Usage : $seq = $stream->get_next_seq()
537 Function: reads and returns the next sequence (haplotype) in the stream
538 Returns : Bio::Seq object or void if end of file
540 Note : This function is included only to conform to convention. The
541 returned Bio::Seq object holds a halpotype in coded form. Use the hash
542 returned by get_base_conversion_table() to convert 'A', 'T', 'C', 'G'
543 back into 1,2,4 and 5. Use get_next_hap() to retrieve the halptoype as
544 a string of 1,2,4 and 5s instead.
550 my $seqstring = $self->get_next_hap;
552 return unless ($seqstring);
554 # Used to create unique ID;
555 my $run = $self->get_last_haps_run_num;
557 # Converting numbers to letters so that the haplotypes can be stored as a
559 my $rh_base_conversion_table = $self->get_base_conversion_table;
560 foreach my $base ( keys %{$rh_base_conversion_table} ) {
561 $seqstring =~ s/($rh_base_conversion_table->{$base})/$base/g;
564 # Fill in non-variable positions
565 my $segsites = $self->get_current_run_segsites;
566 my $n_sites = $self->get_n_sites;
567 if ( defined($n_sites) ) {
569 # make sure that n_sites is at least as large
570 # as segsites for each run. Throw an exception otherwise.
571 $self->throw( "n_sites:\t$n_sites"
572 . "\nsegsites:\t$segsites"
574 . "\nn_sites needs to be at least the number of segsites of every run"
575 ) unless $segsites <= $n_sites;
579 my @pos = $self->get_Positions;
580 for ( my $i = 0 ; $i <= $#pos ; $i++ ) {
581 $pos[$i] *= $n_sites;
582 my $rpt = $pos[$i] - 1 - $seq_len;
583 $rpt = $rpt >= 1 ?
$rpt : 0;
584 push( @seq, "A" x
( $rpt ) );
585 $seq_len += length( $seq[-1] );
586 push( @seq, substr( $seqstring, $i, 1 ) );
587 $seq_len += length( $seq[-1] );
589 push( @seq, "A" x
( $n_sites - $seq_len ) );
590 $seqstring = join( "", @seq );
593 my $last_read_hap = $self->get_last_read_hap_num;
594 my $id = 'Hap_' . $last_read_hap . '_Run_' . $run;
596 "Segsites $segsites;"
598 . ( defined $n_sites ?
$n_sites : $segsites ) . ";"
599 . " Haplotype $last_read_hap;"
601 my $seq = $self->sequence_factory->create(
604 -desc
=> $description,
616 Usage : $seq = $stream->next_seq()
617 Function: Alias to get_next_seq()
618 Returns : Bio::Seq object or void if end of file
620 Note : This function is only included for convention. It calls get_next_seq().
621 See get_next_seq() for details.
627 return $self->get_next_seq();
633 Usage : $hap = $stream->next_hap()
634 Function: reads and returns the next sequence (haplotype) in the stream.
635 Returns undef if all sequences in stream have been read.
636 Returns : Haplotype string (e.g. '110110000101101045454000101'
638 Note : Use get_next_seq() if you want the halpotype returned as a
646 # Let's figure out how many haps to read from the input file so that
647 # we get back to the beginning of the next run.
650 if ( $self->{TOT_RUN_HAPS
} == $self->{LAST_READ_HAP_NUM
} + 1 ) {
654 # Setting last_haps_run_num
655 $self->{LAST_HAPS_RUN_NUM
} = $self->get_next_run_num;
657 my $last_read_hap = $self->get_last_read_hap_num;
659 $self->_get_next_clean_hap( $self->{_filehandle
}, 1, $end_run );
660 if ( !defined $seqstring && $last_read_hap < $self->get_tot_haps ) {
662 "msout file has only $last_read_hap hap(s), which is less than indicated in msinfo line ( "
663 . $self->get_tot_haps
673 Usage : @seqs = $stream->next_pop()
674 Function: reads and returns all the remaining sequences (haplotypes) in the
675 population of the next sequence. Returns an empty list if no more
676 haps remain to be read in the stream
677 Returns : array of Bio::Seq objects
685 # Let's figure out how many haps to read from the input file so that
686 # we get back to the beginning of the next run.
688 my @pops = $self->get_Pops;
689 my @seqs; # holds Bio::Seq objects to return
691 # Determine number of the pop that the next hap will be taken from
692 my ( $next_haps_pop_num, $haps_to_pull ) = $self->get_next_haps_pop_num;
694 # If $haps_to_pull == 0, then we need to pull the whole population
695 if ( $haps_to_pull == 0 ) {
696 $haps_to_pull = $pops[ $next_haps_pop_num - 1 ];
699 for ( 1 .. $haps_to_pull ) {
700 my $seq = $self->get_next_seq;
701 next unless defined $seq;
703 # Add Population number information to description
704 $seq->display_id(" Population number $next_haps_pop_num;");
714 Usage : @seqs = $stream->next_run()
715 Function: reads and returns all the remaining sequences (haplotypes) in the ms
716 run of the next sequence. Returns an empty list if all haps have been
717 read from the stream.
718 Returns : array of Bio::Seq objects
726 # Let's figure out how many haps to read from the input file so that
727 # we get back to the beginning of the next run.
729 my ( $next_haps_pop_num, $haps_to_pull ) = $self->get_next_haps_pop_num;
732 my @pops = $self->get_Pops;
734 foreach ( $next_haps_pop_num .. $#pops ) {
735 $haps_to_pull += $pops[$_];
738 # Read those haps from the input file
739 # Next hap read will be the first hap of the first pop of the next run.
741 for ( 1 .. $haps_to_pull ) {
742 my $seq = $self->get_next_seq;
743 next unless defined $seq;
751 =head2 Methods to Retrieve Constants
753 =head3 base_conversion_table
755 Title : get_base_conversion_table
756 Usage : $table_hash_ref = $stream->get_base_conversion_table()
757 Function: returns a reference to a hash. The keys of the hash are the letters '
758 A','T','G','C'. The values associated with each key are the value that
759 each letter in the sequence of a seq object returned by a
760 Bio::SeqIO::msout stream should be translated to.
761 Returns : reference to a hash
765 # retrieve the Bio::Seq object's sequence
766 my $haplotype = $seq->seq;
768 # need to convert all letters to their corresponding numbers.
769 foreach my $base (keys %{$rh_base_conversion_table}){
770 $haplotype =~ s/($base)/$rh_base_conversion_table->{$base}/g;
773 # $haplotype is now an ms style haplotype. (e.g. '100101101455')
777 sub get_base_conversion_table
{
779 return $self->{BASE_CONVERSION_TABLE_HASH_REF
};
782 ##############################################################################
783 ## subs for internal use only
784 ##############################################################################
786 sub _get_next_clean_hap
{
788 #By Warren Kretzschmar
790 # return the next non-empty line from file handle (chomped line)
791 # skipps to the next run if '//' is encountered
792 my ( $self, $fh, $times, $end_run ) = @_;
795 unless ( ref($fh) eq q
(GLOB
) ) { return; }
797 unless ( defined $times && $times > 0 ) {
801 if ( defined $self->{BUFFER_HAP
} ) {
802 push @data, $self->{BUFFER_HAP
};
803 $self->{BUFFER_HAP
} = undef;
804 $self->{LAST_READ_HAP_NUM
}++;
808 while ( 1 <= $times-- ) {
810 # Find next clean line
812 last if !defined($data);
814 while ( $data !~ /./ ) {
819 # If the next run is encountered here, then we have a programming
821 if ( $data eq '//' ) { $self->throw("'//' found when not expected\n") }
823 $self->{LAST_READ_HAP_NUM
}++;
828 $self->_load_run_info($fh);
836 my ( $self, $fh ) = @_;
840 # getting rid of excess newlines
841 while ( defined($data) && $data !~ /./ ) {
845 # In this case we are at EOF
846 if ( !defined($data) ) { $self->{NEXT_RUN_NUM
} = undef; return; }
848 while ( $data !~ /./ ) {
855 # If the next run is encountered, then skip to the next hap and save it in
857 if ( $data eq '//' ) {
858 $self->{NEXT_RUN_NUM
}++;
859 $self->{LAST_READ_HAP_NUM
} = 0;
862 while ( $data !~ /./ ) {
869 my @sites = split( /\s+/, $data );
870 $self->{LAST_READ_SEGSITES
} = $sites[1];
872 elsif ( $_ eq '2' ) {
873 my @positions = split( /\s+/, $data );
875 $self->{LAST_READ_POSITIONS
} = \
@positions;
878 if ( !defined($data) ) {
879 $self->throw("run $self->{NEXT_RUN_NUM} has no haps./n");
881 $self->{BUFFER_HAP
} = $data;
887 "'//' not encountered when expected. There are more haplos in one of the msOUT runs than advertised in the msinfo line."