Bio::Tools::CodonTable::is_start_codon: check in case of ambiguous codons (#266)
[bioperl-live.git] / lib / Bio / SeqIO / msout.pm
blob18756c36e34d028e1bf090e36afcbe488052ed7a
1 # POD documentation - main docs before the code
3 =head1 NAME
5 Bio::SeqIO::msout - input stream for output by Hudson's ms
7 =head1 SYNOPSIS
9 Do not use this module directly. Use it via the Bio::SeqIO class.
11 =head1 DESCRIPTION
13 ms ( Hudson, R. R. (2002) Generating samples under a Wright-Fisher neutral
14 model. Bioinformatics 18:337-8 ) can be found at
15 http://home.uchicago.edu/~rhudson1/source/mksamples.html.
17 Currently, this object can be used to read output from ms into seq objects.
18 However, because bioperl has no support for haplotypes created using an infinite
19 sites model (where '1' identifies a derived allele and '0' identifies an
20 ancestral allele), the sequences returned by msout are coded using A, T, C and
21 G. To decode the bases, use the sequence conversion table (a hash) returned by
22 get_base_conversion_table(). In the table, 4 and 5 are used when the ancestry is
23 unclear. This should not ever happen when creating files with ms, but it will be
24 used when creating msOUT files from a collection of seq objects ( To be added
25 later ). Alternatively, use get_next_hap() to get a string with 1's and 0's
26 instead of a seq object.
28 =head2 Mapping to Finite Sites
30 This object can now also be used to map haplotypes created using an infinite sites
31 model to sequences of arbitrary finite length. See set_n_sites() for more detail.
32 Thanks to Filipe G. Vieira <fgvieira@berkeley.edu> for the idea and code.
34 =head1 FEEDBACK
36 =head2 Mailing Lists
38 User feedback is an integral part of the evolution of this and other
39 Bioperl modules. Send your comments and suggestions preferably to the
40 Bioperl mailing list. Your participation is much appreciated.
42 bioperl-l@bioperl.org - General discussion
43 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
45 =head2 Reporting Bugs
47 Report bugs to the Bioperl bug tracking system to help us keep track
48 of the bugs and their resolution. Bug reports can be submitted via the
49 web:
51 https://github.com/bioperl/bioperl-live/issues
53 =head1 AUTHOR - Warren Kretzschmar
55 This module was written by Warren Kretzschmar
57 email: wkretzsch@gmail.com
59 This module grew out of a parser written by Aida Andres.
61 =head1 COPYRIGHT
63 =head2 Public Domain Notice
65 This software/database is ``United States Government Work'' under the
66 terms of the United States Copyright Act. It was written as part of
67 the authors' official duties for the United States Government and thus
68 cannot be copyrighted. This software/database is freely available to
69 the public for use without a copyright notice. Restrictions cannot
70 be placed on its present or future use.
72 Although all reasonable efforts have been taken to ensure the accuracy
73 and reliability of the software and data, the National Human Genome
74 Research Institute (NHGRI) and the U.S. Government does not and cannot
75 warrant the performance or results that may be obtained by using this
76 software or data. NHGRI and the U.S. Government disclaims all
77 warranties as to performance, merchantability or fitness for any
78 particular purpose.
80 =head1 METHODS
82 =cut
84 package Bio::SeqIO::msout;
86 use version;
87 our $API_VERSION = qv('1.1.8');
89 use strict;
90 use base qw(Bio::SeqIO); # This ISA Bio::SeqIO object
91 use Bio::Seq::SeqFactory;
93 =head2 Methods for Internal Use
95 =head3 _initialize
97 Title : _initialize
98 Usage : $stream = Bio::SeqIO::msOUT->new($infile)
99 Function: extracts basic information about the file.
100 Returns : Bio::SeqIO object
101 Args : no_og, gunzip, gzip, n_sites
102 Details :
103 - include 'no_og' flag if the last population of an msout file contains
104 only one haplotype and you don't want the last haplotype to be
105 treated as the outgroup ( suggested when reading data created by ms ).
106 - including 'n_sites' (positive integer) causes all output haplotypes to be
107 mapped to a sequence of length 'n_sites'. See set_n_sites() for more details.
109 =cut
111 sub _initialize {
112 my ( $self, @args ) = @_;
113 $self->SUPER::_initialize(@args);
115 unless ( defined $self->sequence_factory ) {
116 $self->sequence_factory( Bio::Seq::SeqFactory->new() );
118 my ($no_og) = $self->_rearrange( [qw(NO_OG)], @args );
119 my ($n_sites) = $self->_rearrange( [qw(N_SITES)], @args );
121 my %initial_values = (
122 RUNS => undef,
123 N_SITES => undef,
124 SEGSITES => undef,
125 SEEDS => [],
126 MS_INFO_LINE => undef,
127 TOT_RUN_HAPS => undef,
128 POPS => [],
129 NEXT_RUN_NUM => undef, # What run is the next hap from? undef = EOF
130 LAST_READ_HAP_NUM => undef, # What did we just read from
131 LAST_HAPS_RUN_NUM => undef,
132 LAST_READ_POSITIONS => [],
133 LAST_READ_SEGSITES => undef,
134 BUFFER_HAP => undef,
135 NO_OUTGROUP => $no_og,
136 BASE_CONVERSION_TABLE_HASH_REF => {
137 'A' => 0,
138 'T' => 1,
139 'C' => 4,
140 'G' => 5,
143 foreach my $key ( keys %initial_values ) {
144 $self->{$key} = $initial_values{$key};
146 $self->set_n_sites($n_sites);
148 # If the filehandle is defined open it and read a few lines
149 if ( ref( $self->{_filehandle} ) eq 'GLOB' ) {
150 $self->_read_start();
151 return $self;
154 # Otherwise throw a warning
155 else {
156 $self->throw(
157 "No filehandle defined. Please define a file handle through -file when calling msout with Bio::SeqIO"
162 =head3 _read_start
164 Title : _read_start
165 Usage : $stream->_read_start()
166 Function: reads from the filehandle $stream->{_filehandle} all information up to the first haplotype (sequence). Closes the filehandle if all lines have been read.
167 Returns : void
168 Args : none
170 =cut
172 sub _read_start {
173 my $self = shift;
175 my $fh_IN = $self->{_filehandle};
177 # get the first five lines and parse for important info
178 my ( $ms_info_line, $seeds ) = $self->_get_next_clean_hap( $fh_IN, 2, 1 );
180 my @ms_info_line = split( /\s+/, $ms_info_line );
182 my ( $tot_pops, @pop_haplos );
184 # Parsing the ms header line
185 shift @ms_info_line;
186 my $tot_run_haps = shift @ms_info_line;
187 my $runs = shift @ms_info_line;
188 my $segsites;
190 foreach my $word ( 0 .. $#ms_info_line ) {
191 if ( $ms_info_line[$word] eq '-I' ) {
192 $tot_pops = $ms_info_line[ $word + 1 ];
193 for my $pop_num ( 1 .. $tot_pops ) {
194 push @pop_haplos, $ms_info_line[ $word + 1 + $pop_num ];
197 # if @pop_haplos contains a non-digit, then there is an error in the msinfo line.
198 if ( !defined $pop_haplos[-1] || $pop_haplos[-1] =~ /\D/ ) {
199 $self->throw(
200 "Incorrect number of populations in the ms info line (after the -I specifier)"
204 elsif ( $ms_info_line[$word] eq '-s' ) {
205 $segsites = $ms_info_line[ $word + 1 ];
207 else { next; }
210 unless (@pop_haplos) { @pop_haplos = ($tot_run_haps) }
212 my @seeds = split( /\s+/, $seeds );
214 # Save ms info data
215 $self->{RUNS} = $runs;
216 $self->{SEGSITES} = $segsites;
217 $self->{SEEDS} = \@seeds;
218 $self->{MS_INFO_LINE} = $ms_info_line;
219 $self->{TOT_RUN_HAPS} = $tot_run_haps;
220 $self->{POPS} = [@pop_haplos];
222 return;
225 =head2 Methods to Access Data
227 =head3 get_segsites
229 Title : get_segsites
230 Usage : $segsites = $stream->get_segsites()
231 Function: returns the number of segsites in the msOUT file (according to the msOUT header line's -s option), or the current run's segsites if -s was not specified in the command line (in this case the number of segsites varies from run to run).
232 Returns : scalar
233 Args : NONE
235 =cut
237 sub get_segsites {
238 my $self = shift;
239 if ( defined $self->{SEGSITES} ) {
240 return $self->{SEGSITES};
242 else {
243 return $self->get_current_run_segsites;
247 =head3 get_current_run_segsites
249 Title : get_current_run_segsites
250 Usage : $segsites = $stream->get_current_run_segsites()
251 Function: returns the number of segsites in the run of the last read
252 haplotype (sequence).
253 Returns : scalar
254 Args : NONE
256 =cut
258 sub get_current_run_segsites {
259 my $self = shift;
260 return $self->{LAST_READ_SEGSITES};
263 =head3 get_n_sites
265 Title : get_n_sites
266 Usage : $n_sites = $stream->get_n_sites()
267 Function: Gets the number of total sites (variable or not) to be output.
268 Returns : scalar if n_sites option is defined at call time of new()
269 Args : NONE
270 Note :
271 WARNING: Final sequence length might not be equal to n_sites if n_sites is
272 too close to number of segregating sites in the msout file.
274 =cut
276 sub get_n_sites {
277 my ($self) = @_;
279 return $self->{N_SITES};
282 =head3 set_n_sites
284 Title : set_n_sites
285 Usage : $n_sites = $stream->set_n_sites($value)
286 Function: Sets the number of total sites (variable or not) to be output.
287 Returns : 1 on success; throws an error if $value is not a positive integer or undef
288 Args : positive integer
289 Note :
290 WARNING: Final sequence length might not be equal to n_sites if it is
291 too close to number of segregating sites.
292 - n_sites needs to be at least as large as the number of segsites of
293 the next haplotype returned
294 - n_sites may also be set to undef, in which case haplotypes are returned
295 under the infinite sites model assumptions.
297 =cut
299 sub set_n_sites {
300 my ( $self, $value ) = @_;
302 # make sure $value is a positive integer if it is defined
303 if ( defined $value ) {
304 $self->throw(
305 "first argument needs to be a positive integer. argument supplied: $value"
306 ) unless ( $value =~ m/^\d+$/ && $value > 0 );
308 $self->{N_SITES} = $value;
310 return 1;
313 =head3 get_runs
315 Title : get_runs
316 Usage : $runs = $stream->get_runs()
317 Function: returns the number of runs in the msOUT file (according to the
318 msinfo line)
319 Returns : scalar
320 Args : NONE
322 =cut
324 sub get_runs {
325 my $self = shift;
326 return $self->{RUNS};
329 =head3 get_Seeds
331 Title : get_Seeds
332 Usage : @seeds = $stream->get_Seeds()
333 Function: returns an array of the seeds used in the creation of the msOUT file.
334 Returns : array
335 Args : NONE
336 Details : In older versions, ms used three seeds. Newer versions of ms seem to
337 use only one (longer) seed. This function will return all the seeds
338 found.
340 =cut
342 sub get_Seeds {
343 my $self = shift;
344 return @{ $self->{SEEDS} };
347 =head3 get_Positions
349 Title : get_Positions
350 Usage : @positions = $stream->get_Positions()
351 Function: returns an array of the names of each segsite of the run of the last
352 read hap.
353 Returns : array
354 Args : NONE
355 Details : The Positions may or may not vary from run to run depending on the
356 options used with ms.
358 =cut
360 sub get_Positions {
361 my $self = shift;
362 return @{ $self->{LAST_READ_POSITIONS} };
365 =head3 get_tot_run_haps
367 Title : get_tot_run_haps
368 Usage : $number_of_haps_per_run = $stream->get_tot_run_haps()
369 Function: returns the number of haplotypes (sequences) in each run of the msOUT
370 file ( according to the msinfo line ).
371 Returns : scalar >= 0
372 Args : NONE
373 Details : This number should not vary from run to run.
375 =cut
377 sub get_tot_run_haps {
378 my $self = shift;
379 return $self->{TOT_RUN_HAPS};
382 =head3 get_ms_info_line
384 Title : get_ms_info_line
385 Usage : $ms_info_line = $stream->get_ms_info_line()
386 Function: returns the header line of the msOUT file.
387 Returns : scalar
388 Args : NONE
390 =cut
392 sub get_ms_info_line {
393 my $self = shift;
394 return $self->{MS_INFO_LINE};
397 =head3 tot_haps
399 Title : tot_haps
400 Usage : $number_of_haplotypes_in_file = $stream->tot_haps()
401 Function: returns the number of haplotypes (sequences) in the msOUT file.
402 Information gathered from msOUT header line.
403 Returns : scalar
404 Args : NONE
406 =cut
408 sub get_tot_haps {
409 my $self = shift;
410 return ( $self->{TOT_RUN_HAPS} * $self->{RUNS} );
413 =head3 get_Pops
415 Title : get_Pops
416 Usage : @pops = $stream->pops()
417 Function: returns an array of population sizes (order taken from the -I flag in
418 the msOUT header line). This array will include the last hap even if
419 it looks like an outgroup.
420 Returns : array of scalars > 0
421 Args : NONE
423 =cut
425 sub get_Pops {
426 my $self = shift;
427 return @{ $self->{POPS} };
430 =head3 get_next_run_num
432 Title : get_next_run_num
433 Usage : $next_run_number = $stream->next_run_num()
434 Function: returns the number of the ms run that the next haplotype (sequence)
435 will be taken from (starting at 1). Returns undef if the complete
436 file has been read.
437 Returns : scalar > 0 or undef
438 Args : NONE
440 =cut
442 sub get_next_run_num {
443 my $self = shift;
444 return $self->{NEXT_RUN_NUM};
447 =head3 get_last_haps_run_num
449 Title : get_last_haps_run_num
450 Usage : $last_haps_run_number = $stream->get_last_haps_run_num()
451 Function: returns the number of the ms run that the last haplotype (sequence)
452 was taken from (starting at 1). Returns undef if no hap has been
453 read yet.
454 Returns : scalar > 0 or undef
455 Args : NONE
457 =cut
459 sub get_last_haps_run_num {
460 my $self = shift;
461 return $self->{LAST_HAPS_RUN_NUM};
464 =head3 get_last_read_hap_num
466 Title : get_last_read_hap_num
467 Usage : $last_read_hap_num = $stream->get_last_read_hap_num()
468 Function: returns the number (starting with 1) of the last haplotype read from
469 the ms file
470 Returns : scalar >= 0
471 Args : NONE
472 Details : 0 means that no haplotype has been read yet. Is reset to 0 every run.
474 =cut
476 sub get_last_read_hap_num {
477 my $self = shift;
478 return $self->{LAST_READ_HAP_NUM};
481 =head3 outgroup
483 Title : outgroup
484 Usage : $outgroup = $stream->outgroup()
485 Function: returns '1' if the msOUT stream has an outgroup. Returns '0'
486 otherwise.
487 Returns : '1' or '0'
488 Args : NONE
489 Details : This method will return '1' only if the last population in the msOUT
490 file contains only one haplotype. If the last population is not an
491 outgroup then create the msOUT object using 'no_og' as input flag.
492 Also, return 0, if the run has only one population.
494 =cut
496 sub outgroup {
497 my $self = shift;
498 my @pops = $self->get_Pops;
499 if ( $pops[$#pops] == 1 && !defined $self->{NO_OUTGROUP} && @pops > 1 ) {
500 return 1;
502 else { return 0; }
505 =head3 get_next_haps_pop_num
507 Title : get_next_haps_pop_num
508 Usage : ($next_haps_pop_num, $num_haps_left_in_pop) = $stream->get_next_haps_pop_num()
509 Function: First return value is the population number (starting with 1) the
510 next hap will come from. The second return value is the number of haps
511 left to read in the population from which the next hap will come.
512 Returns : (scalar > 0, scalar > 0)
513 Args : NONE
515 =cut
517 sub get_next_haps_pop_num {
518 my $self = shift;
519 my $last_read_hap = $self->get_last_read_hap_num;
520 my @pops = $self->get_Pops;
522 foreach my $pop_num ( 0 .. $#pops ) {
523 if ( $last_read_hap < $pops[$pop_num] ) {
524 return ( $pop_num + 1, $pops[$pop_num] - $last_read_hap );
526 else { $last_read_hap -= $pops[$pop_num] }
529 # In this case we're at the beginning of the next run
530 return ( 1, $pops[0] );
533 =head3 get_next_seq
535 Title : get_next_seq
536 Usage : $seq = $stream->get_next_seq()
537 Function: reads and returns the next sequence (haplotype) in the stream
538 Returns : Bio::Seq object or void if end of file
539 Args : NONE
540 Note : This function is included only to conform to convention. The
541 returned Bio::Seq object holds a halpotype in coded form. Use the hash
542 returned by get_base_conversion_table() to convert 'A', 'T', 'C', 'G'
543 back into 1,2,4 and 5. Use get_next_hap() to retrieve the halptoype as
544 a string of 1,2,4 and 5s instead.
546 =cut
548 sub get_next_seq {
549 my $self = shift;
550 my $seqstring = $self->get_next_hap;
552 return unless ($seqstring);
554 # Used to create unique ID;
555 my $run = $self->get_last_haps_run_num;
557 # Converting numbers to letters so that the haplotypes can be stored as a
558 # seq object
559 my $rh_base_conversion_table = $self->get_base_conversion_table;
560 foreach my $base ( keys %{$rh_base_conversion_table} ) {
561 $seqstring =~ s/($rh_base_conversion_table->{$base})/$base/g;
564 # Fill in non-variable positions
565 my $segsites = $self->get_current_run_segsites;
566 my $n_sites = $self->get_n_sites;
567 if ( defined($n_sites) ) {
569 # make sure that n_sites is at least as large
570 # as segsites for each run. Throw an exception otherwise.
571 $self->throw( "n_sites:\t$n_sites"
572 . "\nsegsites:\t$segsites"
573 . "\nrun:\t$run"
574 . "\nn_sites needs to be at least the number of segsites of every run"
575 ) unless $segsites <= $n_sites;
577 my $seq_len = 0;
578 my @seq;
579 my @pos = $self->get_Positions;
580 for ( my $i = 0 ; $i <= $#pos ; $i++ ) {
581 $pos[$i] *= $n_sites;
582 my $rpt = $pos[$i] - 1 - $seq_len;
583 $rpt = $rpt >= 1 ? $rpt : 0;
584 push( @seq, "A" x ( $rpt ) );
585 $seq_len += length( $seq[-1] );
586 push( @seq, substr( $seqstring, $i, 1 ) );
587 $seq_len += length( $seq[-1] );
589 push( @seq, "A" x ( $n_sites - $seq_len ) );
590 $seqstring = join( "", @seq );
593 my $last_read_hap = $self->get_last_read_hap_num;
594 my $id = 'Hap_' . $last_read_hap . '_Run_' . $run;
595 my $description =
596 "Segsites $segsites;"
597 . " Positions "
598 . ( defined $n_sites ? $n_sites : $segsites ) . ";"
599 . " Haplotype $last_read_hap;"
600 . " Run $run;";
601 my $seq = $self->sequence_factory->create(
602 -seq => $seqstring,
603 -id => $id,
604 -desc => $description,
605 -alphabet => q(dna),
606 -direct => 1,
609 return $seq
613 =head3 next_seq
615 Title : next_seq
616 Usage : $seq = $stream->next_seq()
617 Function: Alias to get_next_seq()
618 Returns : Bio::Seq object or void if end of file
619 Args : NONE
620 Note : This function is only included for convention. It calls get_next_seq().
621 See get_next_seq() for details.
623 =cut
625 sub next_seq {
626 my $self = shift;
627 return $self->get_next_seq();
630 =head3 get_next_hap
632 Title : get_next_hap
633 Usage : $hap = $stream->next_hap()
634 Function: reads and returns the next sequence (haplotype) in the stream.
635 Returns undef if all sequences in stream have been read.
636 Returns : Haplotype string (e.g. '110110000101101045454000101'
637 Args : NONE
638 Note : Use get_next_seq() if you want the halpotype returned as a
639 Bio::Seq object.
641 =cut
643 sub get_next_hap {
644 my $self = shift;
646 # Let's figure out how many haps to read from the input file so that
647 # we get back to the beginning of the next run.
649 my $end_run = 0;
650 if ( $self->{TOT_RUN_HAPS} == $self->{LAST_READ_HAP_NUM} + 1 ) {
651 $end_run = 1;
654 # Setting last_haps_run_num
655 $self->{LAST_HAPS_RUN_NUM} = $self->get_next_run_num;
657 my $last_read_hap = $self->get_last_read_hap_num;
658 my ($seqstring) =
659 $self->_get_next_clean_hap( $self->{_filehandle}, 1, $end_run );
660 if ( !defined $seqstring && $last_read_hap < $self->get_tot_haps ) {
661 $self->throw(
662 "msout file has only $last_read_hap hap(s), which is less than indicated in msinfo line ( "
663 . $self->get_tot_haps
664 . " )" );
667 return $seqstring;
670 =head3 get_next_pop
672 Title : get_next_pop
673 Usage : @seqs = $stream->next_pop()
674 Function: reads and returns all the remaining sequences (haplotypes) in the
675 population of the next sequence. Returns an empty list if no more
676 haps remain to be read in the stream
677 Returns : array of Bio::Seq objects
678 Args : NONE
680 =cut
682 sub get_next_pop {
683 my $self = shift;
685 # Let's figure out how many haps to read from the input file so that
686 # we get back to the beginning of the next run.
688 my @pops = $self->get_Pops;
689 my @seqs; # holds Bio::Seq objects to return
691 # Determine number of the pop that the next hap will be taken from
692 my ( $next_haps_pop_num, $haps_to_pull ) = $self->get_next_haps_pop_num;
694 # If $haps_to_pull == 0, then we need to pull the whole population
695 if ( $haps_to_pull == 0 ) {
696 $haps_to_pull = $pops[ $next_haps_pop_num - 1 ];
699 for ( 1 .. $haps_to_pull ) {
700 my $seq = $self->get_next_seq;
701 next unless defined $seq;
703 # Add Population number information to description
704 $seq->display_id(" Population number $next_haps_pop_num;");
705 push @seqs, $seq;
708 return @seqs;
711 =head3 next_run
713 Title : next_run
714 Usage : @seqs = $stream->next_run()
715 Function: reads and returns all the remaining sequences (haplotypes) in the ms
716 run of the next sequence. Returns an empty list if all haps have been
717 read from the stream.
718 Returns : array of Bio::Seq objects
719 Args : NONE
721 =cut
723 sub get_next_run {
724 my $self = shift;
726 # Let's figure out how many haps to read from the input file so that
727 # we get back to the beginning of the next run.
729 my ( $next_haps_pop_num, $haps_to_pull ) = $self->get_next_haps_pop_num;
730 my @seqs;
732 my @pops = $self->get_Pops;
734 foreach ( $next_haps_pop_num .. $#pops ) {
735 $haps_to_pull += $pops[$_];
738 # Read those haps from the input file
739 # Next hap read will be the first hap of the first pop of the next run.
741 for ( 1 .. $haps_to_pull ) {
742 my $seq = $self->get_next_seq;
743 next unless defined $seq;
745 push @seqs, $seq;
748 return @seqs;
751 =head2 Methods to Retrieve Constants
753 =head3 base_conversion_table
755 Title : get_base_conversion_table
756 Usage : $table_hash_ref = $stream->get_base_conversion_table()
757 Function: returns a reference to a hash. The keys of the hash are the letters '
758 A','T','G','C'. The values associated with each key are the value that
759 each letter in the sequence of a seq object returned by a
760 Bio::SeqIO::msout stream should be translated to.
761 Returns : reference to a hash
762 Args : NONE
763 Synopsis:
765 # retrieve the Bio::Seq object's sequence
766 my $haplotype = $seq->seq;
768 # need to convert all letters to their corresponding numbers.
769 foreach my $base (keys %{$rh_base_conversion_table}){
770 $haplotype =~ s/($base)/$rh_base_conversion_table->{$base}/g;
773 # $haplotype is now an ms style haplotype. (e.g. '100101101455')
775 =cut
777 sub get_base_conversion_table {
778 my $self = shift;
779 return $self->{BASE_CONVERSION_TABLE_HASH_REF};
782 ##############################################################################
783 ## subs for internal use only
784 ##############################################################################
786 sub _get_next_clean_hap {
788 #By Warren Kretzschmar
790 # return the next non-empty line from file handle (chomped line)
791 # skipps to the next run if '//' is encountered
792 my ( $self, $fh, $times, $end_run ) = @_;
793 my @data;
795 unless ( ref($fh) eq q(GLOB) ) { return; }
797 unless ( defined $times && $times > 0 ) {
798 $times = 1;
801 if ( defined $self->{BUFFER_HAP} ) {
802 push @data, $self->{BUFFER_HAP};
803 $self->{BUFFER_HAP} = undef;
804 $self->{LAST_READ_HAP_NUM}++;
805 $times--;
808 while ( 1 <= $times-- ) {
810 # Find next clean line
811 my $data = <$fh>;
812 last if !defined($data);
813 chomp $data;
814 while ( $data !~ /./ ) {
815 $data = <$fh>;
816 chomp $data;
819 # If the next run is encountered here, then we have a programming
820 # or format error
821 if ( $data eq '//' ) { $self->throw("'//' found when not expected\n") }
823 $self->{LAST_READ_HAP_NUM}++;
824 push @data, $data;
827 if ($end_run) {
828 $self->_load_run_info($fh);
831 return (@data);
834 sub _load_run_info {
836 my ( $self, $fh ) = @_;
838 my $data = <$fh>;
840 # getting rid of excess newlines
841 while ( defined($data) && $data !~ /./ ) {
842 $data = <$fh>;
845 # In this case we are at EOF
846 if ( !defined($data) ) { $self->{NEXT_RUN_NUM} = undef; return; }
848 while ( $data !~ /./ ) {
849 $data = <$fh>;
850 chomp $data;
853 chomp $data;
855 # If the next run is encountered, then skip to the next hap and save it in
856 # the buffer.
857 if ( $data eq '//' ) {
858 $self->{NEXT_RUN_NUM}++;
859 $self->{LAST_READ_HAP_NUM} = 0;
860 for ( 1 .. 3 ) {
861 $data = <$fh>;
862 while ( $data !~ /./ ) {
863 $data = <$fh>;
864 chomp $data;
866 chomp $data;
868 if ( $_ eq '1' ) {
869 my @sites = split( /\s+/, $data );
870 $self->{LAST_READ_SEGSITES} = $sites[1];
872 elsif ( $_ eq '2' ) {
873 my @positions = split( /\s+/, $data );
874 shift @positions;
875 $self->{LAST_READ_POSITIONS} = \@positions;
877 else {
878 if ( !defined($data) ) {
879 $self->throw("run $self->{NEXT_RUN_NUM} has no haps./n");
881 $self->{BUFFER_HAP} = $data;
885 else {
886 $self->throw(
887 "'//' not encountered when expected. There are more haplos in one of the msOUT runs than advertised in the msinfo line."