1 # POD documentation - main docs before the code
5 Bio::SeqIO::msout - input stream for output by Hudson's ms
9 Do not use this module directly. Use it via the Bio::SeqIO class.
13 ms ( Hudson, R. R. (2002) Generating samples under a Wright-Fisher neutral
14 model. Bioinformatics 18:337-8 ) can be found at
15 http://home.uchicago.edu/~rhudson1/source/mksamples.html.
17 Currently, this object can be used to read output from ms into seq objects.
18 However, because bioperl has no support for haplotypes created using an infinite
19 sites model (where '1' identifies a derived allele and '0' identifies an
20 ancestral allele), the sequences returned by msout are coded using A, T, C and
21 G. To decode the bases, use the sequence conversion table (a hash) returned by
22 get_base_conversion_table(). In the table, 4 and 5 are used when the ancestry is
23 unclear. This should not ever happen when creating files with ms, but it will be
24 used when creating msOUT files from a collection of seq objects ( To be added
25 later ). Alternatively, use get_next_hap() to get a string with 1's and 0's
26 instead of a seq object.
28 =head2 Mapping to Finite Sites
30 This object can now also be used to map haplotypes created using an infinite sites
31 model to sequences of arbitrary finite length. See set_n_sites() for more detail.
32 Thanks to Filipe G. Vieira <fgvieira@berkeley.edu> for the idea and code.
38 User feedback is an integral part of the evolution of this and other
39 Bioperl modules. Send your comments and suggestions preferably to the
40 Bioperl mailing list. Your participation is much appreciated.
42 bioperl-l@bioperl.org - General discussion
43 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
47 Report bugs to the Bioperl bug tracking system to help us keep track
48 of the bugs and their resolution. Bug reports can be submitted via the
51 https://github.com/bioperl/bioperl-live/issues
53 =head1 AUTHOR - Warren Kretzschmar
55 This module was written by Warren Kretzschmar
57 email: wkretzsch@gmail.com
59 This module grew out of a parser written by Aida Andres.
63 =head2 Public Domain Notice
65 This software/database is ``United States Government Work'' under the
66 terms of the United States Copyright Act. It was written as part of
67 the authors' official duties for the United States Government and thus
68 cannot be copyrighted. This software/database is freely available to
69 the public for use without a copyright notice. Restrictions cannot
70 be placed on its present or future use.
72 Although all reasonable efforts have been taken to ensure the accuracy
73 and reliability of the software and data, the National Human Genome
74 Research Institute (NHGRI) and the U.S. Government does not and cannot
75 warrant the performance or results that may be obtained by using this
76 software or data. NHGRI and the U.S. Government disclaims all
77 warranties as to performance, merchantability or fitness for any
84 package Bio
::SeqIO
::msout
;
86 our $API_VERSION = qv
('1.1.8');
89 use base
qw(Bio::SeqIO); # This ISA Bio::SeqIO object
90 use Bio
::Seq
::SeqFactory
;
92 =head2 Methods for Internal Use
97 Usage : $stream = Bio::SeqIO::msOUT->new($infile)
98 Function: extracts basic information about the file.
99 Returns : Bio::SeqIO object
100 Args : no_og, gunzip, gzip, n_sites
102 - include 'no_og' flag if the last population of an msout file contains
103 only one haplotype and you don't want the last haplotype to be
104 treated as the outgroup ( suggested when reading data created by ms ).
105 - including 'n_sites' (positive integer) causes all output haplotypes to be
106 mapped to a sequence of length 'n_sites'. See set_n_sites() for more details.
111 my ( $self, @args ) = @_;
112 $self->SUPER::_initialize
(@args);
114 unless ( defined $self->sequence_factory ) {
115 $self->sequence_factory( Bio
::Seq
::SeqFactory
->new() );
117 my ($no_og) = $self->_rearrange( [qw(NO_OG)], @args );
118 my ($n_sites) = $self->_rearrange( [qw(N_SITES)], @args );
120 my %initial_values = (
125 MS_INFO_LINE
=> undef,
126 TOT_RUN_HAPS
=> undef,
128 NEXT_RUN_NUM
=> undef, # What run is the next hap from? undef = EOF
129 LAST_READ_HAP_NUM
=> undef, # What did we just read from
130 LAST_HAPS_RUN_NUM
=> undef,
131 LAST_READ_POSITIONS
=> [],
132 LAST_READ_SEGSITES
=> undef,
134 NO_OUTGROUP
=> $no_og,
135 BASE_CONVERSION_TABLE_HASH_REF
=> {
142 foreach my $key ( keys %initial_values ) {
143 $self->{$key} = $initial_values{$key};
145 $self->set_n_sites($n_sites);
147 # If the filehandle is defined open it and read a few lines
148 if ( ref( $self->{_filehandle
} ) eq 'GLOB' ) {
149 $self->_read_start();
153 # Otherwise throw a warning
156 "No filehandle defined. Please define a file handle through -file when calling msout with Bio::SeqIO"
164 Usage : $stream->_read_start()
165 Function: reads from the filehandle $stream->{_filehandle} all information up to the first haplotype (sequence). Closes the filehandle if all lines have been read.
174 my $fh_IN = $self->{_filehandle
};
176 # get the first five lines and parse for important info
177 my ( $ms_info_line, $seeds ) = $self->_get_next_clean_hap( $fh_IN, 2, 1 );
179 my @ms_info_line = split( /\s+/, $ms_info_line );
181 my ( $tot_pops, @pop_haplos );
183 # Parsing the ms header line
185 my $tot_run_haps = shift @ms_info_line;
186 my $runs = shift @ms_info_line;
189 foreach my $word ( 0 .. $#ms_info_line ) {
190 if ( $ms_info_line[$word] eq '-I' ) {
191 $tot_pops = $ms_info_line[ $word + 1 ];
192 for my $pop_num ( 1 .. $tot_pops ) {
193 push @pop_haplos, $ms_info_line[ $word + 1 + $pop_num ];
196 # if @pop_haplos contains a non-digit, then there is an error in the msinfo line.
197 if ( !defined $pop_haplos[-1] || $pop_haplos[-1] =~ /\D/ ) {
199 "Incorrect number of populations in the ms info line (after the -I specifier)"
203 elsif ( $ms_info_line[$word] eq '-s' ) {
204 $segsites = $ms_info_line[ $word + 1 ];
209 unless (@pop_haplos) { @pop_haplos = ($tot_run_haps) }
211 my @seeds = split( /\s+/, $seeds );
214 $self->{RUNS
} = $runs;
215 $self->{SEGSITES
} = $segsites;
216 $self->{SEEDS
} = \
@seeds;
217 $self->{MS_INFO_LINE
} = $ms_info_line;
218 $self->{TOT_RUN_HAPS
} = $tot_run_haps;
219 $self->{POPS
} = [@pop_haplos];
224 =head2 Methods to Access Data
229 Usage : $segsites = $stream->get_segsites()
230 Function: returns the number of segsites in the msOUT file (according to the msOUT header line's -s option), or the current run's segsites if -s was not specified in the command line (in this case the number of segsites varies from run to run).
238 if ( defined $self->{SEGSITES
} ) {
239 return $self->{SEGSITES
};
242 return $self->get_current_run_segsites;
246 =head3 get_current_run_segsites
248 Title : get_current_run_segsites
249 Usage : $segsites = $stream->get_current_run_segsites()
250 Function: returns the number of segsites in the run of the last read
251 haplotype (sequence).
257 sub get_current_run_segsites
{
259 return $self->{LAST_READ_SEGSITES
};
265 Usage : $n_sites = $stream->get_n_sites()
266 Function: Gets the number of total sites (variable or not) to be output.
267 Returns : scalar if n_sites option is defined at call time of new()
270 WARNING: Final sequence length might not be equal to n_sites if n_sites is
271 too close to number of segregating sites in the msout file.
278 return $self->{N_SITES
};
284 Usage : $n_sites = $stream->set_n_sites($value)
285 Function: Sets the number of total sites (variable or not) to be output.
286 Returns : 1 on success; throws an error if $value is not a positive integer or undef
287 Args : positive integer
289 WARNING: Final sequence length might not be equal to n_sites if it is
290 too close to number of segregating sites.
291 - n_sites needs to be at least as large as the number of segsites of
292 the next haplotype returned
293 - n_sites may also be set to undef, in which case haplotypes are returned
294 under the infinite sites model assumptions.
299 my ( $self, $value ) = @_;
301 # make sure $value is a positive integer if it is defined
302 if ( defined $value ) {
304 "first argument needs to be a positive integer. argument supplied: $value"
305 ) unless ( $value =~ m/^\d+$/ && $value > 0 );
307 $self->{N_SITES
} = $value;
315 Usage : $runs = $stream->get_runs()
316 Function: returns the number of runs in the msOUT file (according to the
325 return $self->{RUNS
};
331 Usage : @seeds = $stream->get_Seeds()
332 Function: returns an array of the seeds used in the creation of the msOUT file.
335 Details : In older versions, ms used three seeds. Newer versions of ms seem to
336 use only one (longer) seed. This function will return all the seeds
343 return @
{ $self->{SEEDS
} };
348 Title : get_Positions
349 Usage : @positions = $stream->get_Positions()
350 Function: returns an array of the names of each segsite of the run of the last
354 Details : The Positions may or may not vary from run to run depending on the
355 options used with ms.
361 return @
{ $self->{LAST_READ_POSITIONS
} };
364 =head3 get_tot_run_haps
366 Title : get_tot_run_haps
367 Usage : $number_of_haps_per_run = $stream->get_tot_run_haps()
368 Function: returns the number of haplotypes (sequences) in each run of the msOUT
369 file ( according to the msinfo line ).
370 Returns : scalar >= 0
372 Details : This number should not vary from run to run.
376 sub get_tot_run_haps
{
378 return $self->{TOT_RUN_HAPS
};
381 =head3 get_ms_info_line
383 Title : get_ms_info_line
384 Usage : $ms_info_line = $stream->get_ms_info_line()
385 Function: returns the header line of the msOUT file.
391 sub get_ms_info_line
{
393 return $self->{MS_INFO_LINE
};
399 Usage : $number_of_haplotypes_in_file = $stream->tot_haps()
400 Function: returns the number of haplotypes (sequences) in the msOUT file.
401 Information gathered from msOUT header line.
409 return ( $self->{TOT_RUN_HAPS
} * $self->{RUNS
} );
415 Usage : @pops = $stream->pops()
416 Function: returns an array of population sizes (order taken from the -I flag in
417 the msOUT header line). This array will include the last hap even if
418 it looks like an outgroup.
419 Returns : array of scalars > 0
426 return @
{ $self->{POPS
} };
429 =head3 get_next_run_num
431 Title : get_next_run_num
432 Usage : $next_run_number = $stream->next_run_num()
433 Function: returns the number of the ms run that the next haplotype (sequence)
434 will be taken from (starting at 1). Returns undef if the complete
436 Returns : scalar > 0 or undef
441 sub get_next_run_num
{
443 return $self->{NEXT_RUN_NUM
};
446 =head3 get_last_haps_run_num
448 Title : get_last_haps_run_num
449 Usage : $last_haps_run_number = $stream->get_last_haps_run_num()
450 Function: returns the number of the ms run that the last haplotype (sequence)
451 was taken from (starting at 1). Returns undef if no hap has been
453 Returns : scalar > 0 or undef
458 sub get_last_haps_run_num
{
460 return $self->{LAST_HAPS_RUN_NUM
};
463 =head3 get_last_read_hap_num
465 Title : get_last_read_hap_num
466 Usage : $last_read_hap_num = $stream->get_last_read_hap_num()
467 Function: returns the number (starting with 1) of the last haplotype read from
469 Returns : scalar >= 0
471 Details : 0 means that no haplotype has been read yet. Is reset to 0 every run.
475 sub get_last_read_hap_num
{
477 return $self->{LAST_READ_HAP_NUM
};
483 Usage : $outgroup = $stream->outgroup()
484 Function: returns '1' if the msOUT stream has an outgroup. Returns '0'
488 Details : This method will return '1' only if the last population in the msOUT
489 file contains only one haplotype. If the last population is not an
490 outgroup then create the msOUT object using 'no_og' as input flag.
491 Also, return 0, if the run has only one population.
497 my @pops = $self->get_Pops;
498 if ( $pops[$#pops] == 1 && !defined $self->{NO_OUTGROUP
} && @pops > 1 ) {
504 =head3 get_next_haps_pop_num
506 Title : get_next_haps_pop_num
507 Usage : ($next_haps_pop_num, $num_haps_left_in_pop) = $stream->get_next_haps_pop_num()
508 Function: First return value is the population number (starting with 1) the
509 next hap will come from. The second return value is the number of haps
510 left to read in the population from which the next hap will come.
511 Returns : (scalar > 0, scalar > 0)
516 sub get_next_haps_pop_num
{
518 my $last_read_hap = $self->get_last_read_hap_num;
519 my @pops = $self->get_Pops;
521 foreach my $pop_num ( 0 .. $#pops ) {
522 if ( $last_read_hap < $pops[$pop_num] ) {
523 return ( $pop_num + 1, $pops[$pop_num] - $last_read_hap );
525 else { $last_read_hap -= $pops[$pop_num] }
528 # In this case we're at the beginning of the next run
529 return ( 1, $pops[0] );
535 Usage : $seq = $stream->get_next_seq()
536 Function: reads and returns the next sequence (haplotype) in the stream
537 Returns : Bio::Seq object or void if end of file
539 Note : This function is included only to conform to convention. The
540 returned Bio::Seq object holds a halpotype in coded form. Use the hash
541 returned by get_base_conversion_table() to convert 'A', 'T', 'C', 'G'
542 back into 1,2,4 and 5. Use get_next_hap() to retrieve the halptoype as
543 a string of 1,2,4 and 5s instead.
549 my $seqstring = $self->get_next_hap;
551 return unless ($seqstring);
553 # Used to create unique ID;
554 my $run = $self->get_last_haps_run_num;
556 # Converting numbers to letters so that the haplotypes can be stored as a
558 my $rh_base_conversion_table = $self->get_base_conversion_table;
559 foreach my $base ( keys %{$rh_base_conversion_table} ) {
560 $seqstring =~ s/($rh_base_conversion_table->{$base})/$base/g;
563 # Fill in non-variable positions
564 my $segsites = $self->get_current_run_segsites;
565 my $n_sites = $self->get_n_sites;
566 if ( defined($n_sites) ) {
568 # make sure that n_sites is at least as large
569 # as segsites for each run. Throw an exception otherwise.
570 $self->throw( "n_sites:\t$n_sites"
571 . "\nsegsites:\t$segsites"
573 . "\nn_sites needs to be at least the number of segsites of every run"
574 ) unless $segsites <= $n_sites;
578 my @pos = $self->get_Positions;
579 for ( my $i = 0 ; $i <= $#pos ; $i++ ) {
580 $pos[$i] *= $n_sites;
581 my $rpt = $pos[$i] - 1 - $seq_len;
582 $rpt = $rpt >= 1 ?
$rpt : 0;
583 push( @seq, "A" x
( $rpt ) );
584 $seq_len += length( $seq[-1] );
585 push( @seq, substr( $seqstring, $i, 1 ) );
586 $seq_len += length( $seq[-1] );
588 push( @seq, "A" x
( $n_sites - $seq_len ) );
589 $seqstring = join( "", @seq );
592 my $last_read_hap = $self->get_last_read_hap_num;
593 my $id = 'Hap_' . $last_read_hap . '_Run_' . $run;
595 "Segsites $segsites;"
597 . ( defined $n_sites ?
$n_sites : $segsites ) . ";"
598 . " Haplotype $last_read_hap;"
600 my $seq = $self->sequence_factory->create(
603 -desc
=> $description,
615 Usage : $seq = $stream->next_seq()
616 Function: Alias to get_next_seq()
617 Returns : Bio::Seq object or void if end of file
619 Note : This function is only included for convention. It calls get_next_seq().
620 See get_next_seq() for details.
626 return $self->get_next_seq();
632 Usage : $hap = $stream->next_hap()
633 Function: reads and returns the next sequence (haplotype) in the stream.
634 Returns undef if all sequences in stream have been read.
635 Returns : Haplotype string (e.g. '110110000101101045454000101'
637 Note : Use get_next_seq() if you want the halpotype returned as a
645 # Let's figure out how many haps to read from the input file so that
646 # we get back to the beginning of the next run.
649 if ( $self->{TOT_RUN_HAPS
} == $self->{LAST_READ_HAP_NUM
} + 1 ) {
653 # Setting last_haps_run_num
654 $self->{LAST_HAPS_RUN_NUM
} = $self->get_next_run_num;
656 my $last_read_hap = $self->get_last_read_hap_num;
658 $self->_get_next_clean_hap( $self->{_filehandle
}, 1, $end_run );
659 if ( !defined $seqstring && $last_read_hap < $self->get_tot_haps ) {
661 "msout file has only $last_read_hap hap(s), which is less than indicated in msinfo line ( "
662 . $self->get_tot_haps
672 Usage : @seqs = $stream->next_pop()
673 Function: reads and returns all the remaining sequences (haplotypes) in the
674 population of the next sequence. Returns an empty list if no more
675 haps remain to be read in the stream
676 Returns : array of Bio::Seq objects
684 # Let's figure out how many haps to read from the input file so that
685 # we get back to the beginning of the next run.
687 my @pops = $self->get_Pops;
688 my @seqs; # holds Bio::Seq objects to return
690 # Determine number of the pop that the next hap will be taken from
691 my ( $next_haps_pop_num, $haps_to_pull ) = $self->get_next_haps_pop_num;
693 # If $haps_to_pull == 0, then we need to pull the whole population
694 if ( $haps_to_pull == 0 ) {
695 $haps_to_pull = $pops[ $next_haps_pop_num - 1 ];
698 for ( 1 .. $haps_to_pull ) {
699 my $seq = $self->get_next_seq;
700 next unless defined $seq;
702 # Add Population number information to description
703 $seq->display_id(" Population number $next_haps_pop_num;");
713 Usage : @seqs = $stream->next_run()
714 Function: reads and returns all the remaining sequences (haplotypes) in the ms
715 run of the next sequence. Returns an empty list if all haps have been
716 read from the stream.
717 Returns : array of Bio::Seq objects
725 # Let's figure out how many haps to read from the input file so that
726 # we get back to the beginning of the next run.
728 my ( $next_haps_pop_num, $haps_to_pull ) = $self->get_next_haps_pop_num;
731 my @pops = $self->get_Pops;
733 foreach ( $next_haps_pop_num .. $#pops ) {
734 $haps_to_pull += $pops[$_];
737 # Read those haps from the input file
738 # Next hap read will be the first hap of the first pop of the next run.
740 for ( 1 .. $haps_to_pull ) {
741 my $seq = $self->get_next_seq;
742 next unless defined $seq;
750 =head2 Methods to Retrieve Constants
752 =head3 base_conversion_table
754 Title : get_base_conversion_table
755 Usage : $table_hash_ref = $stream->get_base_conversion_table()
756 Function: returns a reference to a hash. The keys of the hash are the letters '
757 A','T','G','C'. The values associated with each key are the value that
758 each letter in the sequence of a seq object returned by a
759 Bio::SeqIO::msout stream should be translated to.
760 Returns : reference to a hash
764 # retrieve the Bio::Seq object's sequence
765 my $haplotype = $seq->seq;
767 # need to convert all letters to their corresponding numbers.
768 foreach my $base (keys %{$rh_base_conversion_table}){
769 $haplotype =~ s/($base)/$rh_base_conversion_table->{$base}/g;
772 # $haplotype is now an ms style haplotype. (e.g. '100101101455')
776 sub get_base_conversion_table
{
778 return $self->{BASE_CONVERSION_TABLE_HASH_REF
};
781 ##############################################################################
782 ## subs for internal use only
783 ##############################################################################
785 sub _get_next_clean_hap
{
787 #By Warren Kretzschmar
789 # return the next non-empty line from file handle (chomped line)
790 # skipps to the next run if '//' is encountered
791 my ( $self, $fh, $times, $end_run ) = @_;
794 unless ( ref($fh) eq q
(GLOB
) ) { return; }
796 unless ( defined $times && $times > 0 ) {
800 if ( defined $self->{BUFFER_HAP
} ) {
801 push @data, $self->{BUFFER_HAP
};
802 $self->{BUFFER_HAP
} = undef;
803 $self->{LAST_READ_HAP_NUM
}++;
807 while ( 1 <= $times-- ) {
809 # Find next clean line
811 last if !defined($data);
813 while ( $data !~ /./ ) {
818 # If the next run is encountered here, then we have a programming
820 if ( $data eq '//' ) { $self->throw("'//' found when not expected\n") }
822 $self->{LAST_READ_HAP_NUM
}++;
827 $self->_load_run_info($fh);
835 my ( $self, $fh ) = @_;
839 # getting rid of excess newlines
840 while ( defined($data) && $data !~ /./ ) {
844 # In this case we are at EOF
845 if ( !defined($data) ) { $self->{NEXT_RUN_NUM
} = undef; return; }
847 while ( $data !~ /./ ) {
854 # If the next run is encountered, then skip to the next hap and save it in
856 if ( $data eq '//' ) {
857 $self->{NEXT_RUN_NUM
}++;
858 $self->{LAST_READ_HAP_NUM
} = 0;
861 while ( $data !~ /./ ) {
868 my @sites = split( /\s+/, $data );
869 $self->{LAST_READ_SEGSITES
} = $sites[1];
871 elsif ( $_ eq '2' ) {
872 my @positions = split( /\s+/, $data );
874 $self->{LAST_READ_POSITIONS
} = \
@positions;
877 if ( !defined($data) ) {
878 $self->throw("run $self->{NEXT_RUN_NUM} has no haps./n");
880 $self->{BUFFER_HAP
} = $data;
886 "'//' not encountered when expected. There are more haplos in one of the msOUT runs than advertised in the msinfo line."