Use /usr/bin/perl instead of env even on examples
[bioperl-live.git] / lib / Bio / SeqIO / mbsout.pm
blob41b867cef7b0a7d8054e313f751b6f4329a90774
1 # POD documentation - main docs before the code
3 =head1 NAME
5 Bio::SeqIO::mbsout - input stream for output by Teshima et al.'s mbs.
7 =head1 SYNOPSIS
9 Do not use this module directly. Use it via the Bio::SeqIO class.
11 =head1 DESCRIPTION
13 mbs (Teshima KM, Innan H (2009) mbs: modifying Hudson's ms software to generate
14 samples of DNA sequences with a biallelic site under selection. BMC
15 Bioinformatics 10: 166 ) can be found at
16 http://www.biomedcentral.com/1471-2105/10/166/additional/.
18 Currently this object can be used to read output from mbs into seq objects.
19 However, because bioperl has no support for haplotypes created using an infinite
20 sites model (where '1' identifies a derived allele and '0' identifies an
21 ancestral allele), the sequences returned by mbsout are coded using A, T, C and
22 G. To decode the bases, use the sequence conversion table (a hash) returned by
23 get_base_conversion_table(). In the table, 4 and 5 are used when the ancestry is
24 unclear. This should not ever happen when creating files with mbs, but it will
25 be used when creating mbsOUT files from a collection of seq objects ( To be
26 added later ). Alternatively, use get_next_hap() to get a string with 1's and
27 0's instead of a seq object.
29 =head1 FEEDBACK
31 =head2 Mailing Lists
33 User feedback is an integral part of the evolution of this and other
34 Bioperl modules. Send your comments and suggestions preferably to the
35 Bioperl mailing list. Your participation is much appreciated.
37 bioperl-l@bioperl.org - General discussion
38 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
40 =head2 Reporting Bugs
42 Report bugs to the Bioperl bug tracking system to help us keep track
43 of the bugs and their resolution. Bug reports can be submitted via the
44 web:
46 https://github.com/bioperl/bioperl-live/issues
48 =head1 AUTHOR - Warren Kretzschmar
50 This module was written by Warren Kretzschmar
52 email: wkretzsch@gmail.com
54 This module grew out of a parser written by Aida Andres.
56 =head1 COPYRIGHT
58 =head2 Public Domain Notice
60 This software/database is ``United States Government Work'' under the
61 terms of the United States Copyright Act. It was written as part of
62 the authors' official duties for the United States Government and thus
63 cannot be copyrighted. This software/database is freely available to
64 the public for use without a copyright notice. Restrictions cannot
65 be placed on its present or future use.
67 Although all reasonable efforts have been taken to ensure the accuracy
68 and reliability of the software and data, the National Human Genome
69 Research Institute (NHGRI) and the U.S. Government does not and cannot
70 warrant the performance or results that may be obtained by using this
71 software or data. NHGRI and the U.S. Government disclaims all
72 warranties as to performance, merchantability or fitness for any
73 particular purpose.
75 =head1 METHODS
77 =cut
79 package Bio::SeqIO::mbsout;
81 use version;
82 our $API_VERSION = qv('1.1.3');
84 use strict;
85 use base qw(Bio::SeqIO); # This ISA Bio::SeqIO object
86 use Bio::Seq::SeqFactory;
88 =head2 INTERNAL METHODS
90 =head3 _initialize
92 Title : _initialize
93 Usage : $stream = Bio::SeqIO::mbsout->new($infile)
94 Function: extracts basic information about the file.
95 Returns : Bio::SeqIO object
96 Args : no_og
97 Details : include 'no_og' flag = 0 if the last population of an mbsout file
98 contains only one haplotype and you want the last haplotype to be
99 treated as the outgroup.
100 =cut
102 sub _initialize {
103 my ( $self, @args ) = @_;
104 $self->SUPER::_initialize(@args);
106 unless ( defined $self->sequence_factory ) {
107 $self->sequence_factory( Bio::Seq::SeqFactory->new() );
110 # Don't expect mbs to create an outgroup
111 my ($no_og) = $self->_rearrange( [qw(NO_OG)], @args ) || 1;
113 my %initial_values = (
114 RUNS => undef,
115 SEGSITES => undef,
116 MBS_INFO_LINE => undef,
117 TOT_RUN_HAPS => undef,
118 NEXT_RUN_NUM => undef, # What run is the next hap from? undef = EOF
119 LAST_READ_HAP_NUM => undef, # What did we just read from
120 LAST_READ_POSITIONS => [],
121 LAST_READ_SEGSITES => undef,
122 BUFFER_HAP => undef,
123 NO_OUTGROUP => $no_og,
124 OPTIONS => {},
125 LAST_READ_ALLELES => [],
126 LAST_READ_TRAJECTORY_FILE => undef,
127 LAST_READ_REPLICATION_OF_TRAJECTORY_FILE => undef,
128 BASE_CONVERSION_TABLE_HASH_REF => {
129 'A' => 0,
130 'T' => 1,
131 'C' => 4,
132 'G' => 5,
136 foreach my $key ( keys %initial_values ) {
137 $self->{$key} = $initial_values{$key};
140 # If the filehandle is defined open it and read a few lines
141 if ( ref( $self->{_filehandle} ) eq 'GLOB' ) {
142 $self->_read_start();
143 return $self;
146 # Otherwise throw a warning
147 else {
148 $self->throw(
149 "No filehandle defined. Please define a file handle through -file when calling mbsout with Bio::SeqIO"
154 =head3 _read_start
156 Title : _read_start
157 Usage : $stream->_read_start()
158 Function: reads from the filehandle $stream->{_filehandle} all information up to the first haplotype (sequence).
159 Returns : void
160 Args : none
162 =cut
164 sub _read_start {
165 my $self = shift;
167 my $fh_IN = $self->{_filehandle};
169 # get the first five lines and parse for important info
170 my ($mbs_info_line) = $self->_get_next_clean_hap( $fh_IN, 1, 1 );
172 my @mbs_info_line = split( /\s+/, $mbs_info_line );
174 # Parsing the mbs header line
175 shift @mbs_info_line;
176 shift @mbs_info_line;
177 my $tot_run_haps = shift @mbs_info_line;
178 my $runs;
180 # $pop_mut_param_per_site is the population mutation parameter per site.
181 my $pop_mut_param_per_site;
183 # $pop_recomb_param_per_site is the population recombination parameter per
184 # site.
185 my $pop_recomb_param_per_site;
187 # $nsites is length of the simulated region.
188 # $selpos is position of the target site of selection relative to the first
189 # site of the simulated region.
190 my $nsites;
191 my $selpos;
193 # $nfile is number of trajectory files.
194 # $nrep is number of replications for each trajectory.
195 # $traj_filename is initial part of the name of the trajectory files.
196 my $nfiles;
197 my $nreps;
198 my $traj_filename;
200 foreach my $word ( 0 .. $#mbs_info_line ) {
201 if ( $mbs_info_line[$word] eq '-t' ) {
202 $pop_mut_param_per_site = $mbs_info_line[ $word + 1 ];
204 elsif ( $mbs_info_line[$word] eq '-r' ) {
205 $pop_recomb_param_per_site = $mbs_info_line[ $word + 1 ];
206 $selpos = $mbs_info_line[ $word + 2 ];
208 elsif ( $mbs_info_line[$word] eq '-s' ) {
209 $nsites = $mbs_info_line[ $word + 1 ];
210 $selpos = $mbs_info_line[ $word + 2 ];
212 elsif ( $mbs_info_line[$word] eq '-f' ) {
213 $nfiles = $mbs_info_line[ $word + 1 ];
214 $nreps = $mbs_info_line[ $word + 2 ];
215 $traj_filename = $mbs_info_line[ $word + 3 ];
216 $runs = $nfiles * $nreps;
218 else { next; }
221 # Save mbs info data
222 $self->{RUNS} = $runs;
223 $self->{MBS_INFO_LINE} = $mbs_info_line;
224 $self->{TOT_RUN_HAPS} = $tot_run_haps;
225 $self->{POP_MUT_PARAM_PER_SITE} = $pop_mut_param_per_site;
226 $self->{POP_RECOMB_PARAM_PER_SITE} = $pop_recomb_param_per_site;
227 $self->{NSITES} = $nsites;
228 $self->{SELPOS} = $selpos;
229 $self->{NFILES} = $nfiles;
230 $self->{NREPS} = $nreps;
231 $self->{TRAJ_FILENAME} = $traj_filename;
234 =head2 Methods to retrieve mbsout data
236 =head3 get_segsites
238 Title : get_segsites
239 Usage : $segsites = $stream->get_segsites()
240 Function: returns the number segsites in the mbsout file (according to the mbsout header line).
241 Returns : scalar
242 Args : NONE
244 =cut
246 sub get_segsites {
247 my $self = shift;
248 if ( defined $self->{SEGSITES} ) {
249 return $self->{SEGSITES};
251 else {
252 return $self->get_current_run_segsites;
256 =head3 get_current_run_segsites
258 Title : get_current_run_segsites
259 Usage : $segsites = $stream->get_current_run_segsites()
260 Function: returns the number of segsites in the run of the last read haplotype (sequence).
261 Returns : scalar
262 Args : NONE
264 =cut
266 sub get_current_run_segsites {
267 my $self = shift;
268 return $self->{LAST_READ_SEGSITES};
271 =head3 get_pop_mut_param_per_site
273 Title : get_pop_mut_param_per_site
274 Usage : $pop_mut_param_per_site = $stream->get_pop_mut_param_per_site()
275 Function: returns 4*N0*mu or the "population mutation parameter per site"
276 Returns : scalar
277 Args : NONE
279 =cut
281 sub get_pop_mut_param_per_site {
282 my $self = shift;
283 return $self->{POP_MUT_PARAM_PER_SITE};
286 =head3 get_pop_recomb_param_per_site
288 Title : get_pop_recomb_param_per_site
289 Usage : $pop_recomb_param_per_site = $stream->get_pop_recomb_param_per_site()
290 Function: returns 4*N0*r or the "population recombination parameter per site"
291 Returns : scalar
292 Args : NONE
294 =cut
296 sub get_pop_recomb_param_per_site {
297 my $self = shift;
298 return $self->{POP_RECOMB_PARAM_PER_SITE};
301 =head3 get_nsites
303 Title : get_nsites
304 Usage : $nsites = $stream->get_nsites()
305 Function: returns the number of sites simulated by mbs.
306 Returns : scalar
307 Args : NONE
309 =cut
311 sub get_nsites {
312 my $self = shift;
313 return $self->{NSITES};
316 =head3 get_selpos
318 Title : get_selpos
319 Usage : $selpos = $stream->get_selpos()
320 Function: returns the location on the chromosome where the allele is located that was selected for by mbs.
321 Returns : scalar
322 Args : NONE
324 =cut
326 sub get_selpos {
327 my $self = shift;
328 return $self->{SELPOS};
331 =head3 get_nreps
333 Title : get_nreps
334 Usage : $nreps = $stream->get_nreps()
335 Function: returns the number replications done by mbs on each trajectory file to create the mbsout file.
336 Returns : scalar
337 Args : NONE
339 =cut
341 sub get_nreps {
342 my $self = shift;
343 return $self->{NREPS};
346 =head3 get_nfiles
348 Title : get_nfiles
349 Usage : $nfiles = $stream->get_nfiles()
350 Function: returns the number of trajectory files used by mbs to create the mbsout file
351 Returns : scalar
352 Args : NONE
354 =cut
356 sub get_nfiles {
357 my $self = shift;
358 return $self->{NFILES};
361 =head3 get_traj_filename
363 Title : get_traj_filename
364 Usage : $traj_filename = $stream->get_traj_filename()
365 Function: returns the prefix of the trajectory files used by mbs to create the mbsout file
366 Returns : scalar
367 Args : NONE
369 =cut
371 sub get_traj_filename {
372 my $self = shift;
373 return $self->{TRAJ_FILENAME};
376 =head3 get_runs
378 Title : get_runs
379 Usage : $runs = $stream->get_runs()
380 Function: returns the number of runs in the mbsout file
381 Returns : scalar
382 Args : NONE
384 =cut
386 sub get_runs {
387 my $self = shift;
388 return $self->{RUNS};
391 =head3 get_Positions
393 Title : get_Positions
394 Usage : @positions = $stream->get_Positions()
395 Function: returns an array of the names of each segsite of the run of the last read hap.
396 Returns : array
397 Args : NONE
399 =cut
401 sub get_Positions {
402 my $self = shift;
403 return @{ $self->{LAST_READ_POSITIONS} };
406 =head3 get_tot_run_haps
408 Title : get_tot_run_haps
409 Usage : $number_of_haps_per_run = $stream->get_tot_run_haps()
410 Function: returns the number of haplotypes (sequences) in each run of the mbsout file.
411 Returns : scalar >= 0
412 Args : NONE
414 =cut
416 sub get_tot_run_haps {
417 my $self = shift;
418 return $self->{TOT_RUN_HAPS};
421 =head3 get_mbs_info_line
423 Title : get_mbs_info_line
424 Usage : $mbs_info_line = $stream->get_mbs_info_line()
425 Function: returns the header line of the mbsout file.
426 Returns : scalar
427 Args : NONE
429 =cut
431 sub get_mbs_info_line {
432 my $self = shift;
433 return $self->{MBS_INFO_LINE};
436 =head3 tot_haps
438 Title : tot_haps
439 Usage : $number_of_haplotypes_in_file = $stream->tot_haps()
440 Function: returns the number of haplotypes (sequences) in the mbsout file. Information gathered from mbsout header line.
441 Returns : scalar
442 Args : NONE
444 =cut
446 sub get_tot_haps {
447 my $self = shift;
448 return ( $self->{TOT_RUN_HAPS} * $self->{RUNS} );
451 =head3 next_run_num
453 Title : next_run_num
454 Usage : $next_run_number = $stream->next_run_num()
455 Function: returns the number of the mbs run that the next haplotype (sequence)
456 will be taken from (starting at 1). Returns undef if the complete
457 file has been read.
458 Returns : scalar > 0 or undef
459 Args : NONE
461 =cut
463 sub get_next_run_num {
464 my $self = shift;
465 return $self->{NEXT_RUN_NUM};
468 =head3 get_last_haps_run_num
470 Title : get_last_haps_run_num
471 Usage : $last_haps_run_number = $stream->get_last_haps_run_num()
472 Function: returns the number of the ms run that the last haplotype (sequence)
473 was taken from (starting at 1). Returns undef if no hap has been
474 read yet.
475 Returns : scalar > 0 or undef
476 Args : NONE
478 =cut
480 sub get_last_haps_run_num {
481 my $self = shift;
482 return $self->{LAST_HAPS_RUN_NUM};
485 =head3 get_last_read_hap_num
487 Title : get_last_read_hap_num
488 Usage : $last_read_hap_num = $stream->get_last_read_hap_num()
489 Function: returns the number (starting with 1) of the last haplotype
490 read from the mbs file
491 Returns : scalar >= 0
492 Args : NONE
493 Details : 0 means that no haplotype has been read yet.
495 =cut
497 sub get_last_read_hap_num {
498 my $self = shift;
499 return $self->{LAST_READ_HAP_NUM};
502 =head3 outgroup
504 Title : outgroup
505 Usage : $outgroup = $stream->outgroup()
506 Function: returns '1' if the mbsout object has an outgroup. Returns '0'
507 otherwise.
508 Returns : 1 or 0, currently always 0
509 Args : NONE
510 Details : This method will return '1' only if the last population in the mbsout
511 file contains only one haplotype. If the last population is not an
512 outgroup then create the mbsout object using 'no_outgroup' as input
513 parameter for new() (see mbsout->new()).
515 Currently there exists no way of introducing an outgroup into an mbs
516 file, so this function will always return '0'.
518 =cut
520 sub outgroup {
521 my $self = shift;
522 if ( $self->{NO_OUTGROUP} ) { return 0; }
523 else { return 0; }
526 =head3 get_next_seq
528 Title : get_next_seq
529 Usage : $seq = $stream->get_next_seq()
530 Function: reads and returns the next sequence (haplotype) in the stream
531 Returns : Bio::Seq object
532 Args : NONE
533 Note : This function is included only to conform to convention. It only
534 calls next_hap() and passes on that method's return value. Use
535 next_hap() instead for better performance.
537 =cut
539 sub get_next_seq {
540 my $self = shift;
541 my $seqstring = $self->get_next_hap;
543 return unless defined $seqstring;
545 # Used to create unique ID;
546 my $run = $self->get_last_haps_run_num;
548 # Converting numbers to letters so that the haplotypes can be stored as a
549 # seq object
550 my $rh_base_conversion_table = $self->get_base_conversion_table;
551 foreach my $base ( keys %{$rh_base_conversion_table} ) {
552 $seqstring =~ s/($rh_base_conversion_table->{$base})/$base/g;
555 my $last_read_hap = $self->get_last_read_hap_num;
557 my $id = 'Hap_' . $last_read_hap . '_Run_' . $run;
558 my $description =
559 'Segsites '
560 . $self->get_current_run_segsites
561 . "; Positions $self->positions; Haplotype "
562 . $last_read_hap
563 . '; Run '
564 . $run . ';';
565 my $seq = $self->sequence_factory->create(
566 -seq => $seqstring,
567 -id => $id,
568 -desc => $description,
569 -alphabet => q(dna),
570 -direct => 1,
573 return $seq;
577 =head3 get_next_hap
579 Title : get_next_hap
580 Usage : $seq = $stream->get_next_hap()
581 Function: reads and returns the next sequence (haplotype) in the stream. Returns
582 void if all sequences in stream have been read.
583 Returns : Bio::Seq object
584 Args : NONE
585 Note : Use this instead of get_next_seq().
587 =cut
589 sub get_next_hap {
590 my $self = shift;
592 # Let's figure out how many haps to read from the input file so that
593 # we get back to the beginning of the next run.
595 my $end_run = 0;
596 if ( $self->{TOT_RUN_HAPS} == $self->{LAST_READ_HAP_NUM} + 1 ) {
597 $end_run = 1;
600 # Setting last_haps_run_num
601 $self->{LAST_HAPS_RUN_NUM} = $self->get_next_run_num;
603 my $fh_IN = $self->{_filehandle};
605 my ($seqstring) =
606 $self->_get_next_clean_hap( $self->{_filehandle}, 1, $end_run );
608 return $seqstring;
611 =head3 get_next_run
613 Title : get_next_run
614 Usage : @seqs = $stream->get_next_run()
615 Function: reads and returns all the remaining sequences (haplotypes) in the mbs
616 run of the next sequence.
617 Returns : array of Bio::Seq objects
618 Args : NONE
620 =cut
622 sub get_next_run {
623 my $self = shift;
625 # Let's figure out how many haps to read from the input file so that
626 # we get back to the beginning of the next run.
628 my $haps_to_pull = $self->{TOT_RUN_HAPS} - $self->{LAST_READ_HAP_NUM};
630 # Read those haps from the input file
631 # Next hap read will be the first hap of the next run.
633 my @seqs;
634 for ( 1 .. $haps_to_pull ) {
635 my $seq = $self->get_next_seq;
636 next unless defined $seq;
638 push @seqs, $seq;
641 return @seqs;
644 =head2 METHODS TO RETRIEVE CONSTANTS
646 =head3 base_conversion_table
648 Title : get_base_conversion_table
649 Usage : $table_hash_ref = $stream->get_base_conversion_table()
650 Function: returns a reference to a hash. The keys of the hash are the letters
651 'A','T','G','C'. The values associated with each key are the value
652 that each letter in the sequence of a seq object returned by a
653 Bio::SeqIO::mbsout stream should be translated to.
654 Returns : reference to a hash
655 Args : NONE
656 Synopsis:
658 # retrieve the Bio::Seq object's sequence
659 my $haplotype = $seq->seq;
660 my $rh_base_conversion_table = $stream->get_base_conversion_table();
662 # need to convert all letters to their corresponding numbers.
663 foreach my $base (keys %{$rh_base_conversion_table}){
664 $haplotype =~ s/($base)/$rh_base_conversion_table->{$base}/g;
667 # $haplotype is now an ms style haplotype. (e.g. '100101101455')
669 =cut
671 sub get_base_conversion_table {
672 my $self = shift;
673 return $self->{BASE_CONVERSION_TABLE_HASH_REF};
676 ##############################################################################
677 ## subs for internal use only
678 ##############################################################################
680 sub _get_next_clean_hap {
682 #By Warren Kretzschmar
684 # return the next non-empty line from file handle (chomped line)
685 # skipps to the next run if '//' is encountered
686 my ( $self, $fh, $times, $end_run ) = @_;
687 my @data;
689 unless ( defined $fh ) { return; }
691 unless ( defined $times && $times > 0 ) {
692 $times = 1;
695 if ( defined $self->{BUFFER_HAP} ) {
696 push @data, $self->{BUFFER_HAP};
697 $self->{BUFFER_HAP} = undef;
698 $self->{LAST_READ_HAP_NUM}++;
699 $times--;
702 while ( 1 <= $times-- ) {
704 # Find next clean line
705 my $data = <$fh>;
706 last if !defined($data);
707 chomp $data;
708 while ( $data !~ /./ ) {
709 $data = <$fh>;
710 chomp $data;
713 # If the next run is encountered here, then we have a programming
714 # or format error
715 if ( $data eq '//' ) { $self->throw("'//' found when not expected\n") }
717 $self->{LAST_READ_HAP_NUM}++;
718 push @data, $data;
721 if ($end_run) {
722 $self->_load_run_info($fh);
725 return (@data);
728 sub _load_run_info {
730 my ( $self, $fh ) = @_;
732 my $data = <$fh>;
734 # In this case we are at EOF
735 if ( !defined($data) ) { $self->{NEXT_RUN_NUM} = undef; return; }
737 chomp $data;
739 while ( $data !~ /./ ) {
740 $data = <$fh>;
742 # In this case we are at EOF
743 if ( !defined($data) ) { $self->{NEXT_RUN_NUM} = undef; return; }
744 chomp $data;
747 # If the next run is encountered, then skip to the next hap and save it in
748 # the buffer.
749 if ( $data =~ /^\/\// ) {
750 $self->{NEXT_RUN_NUM}++;
751 $self->{LAST_READ_HAP_NUM} = 0;
752 my @data = split( /\s+/, $data );
753 my @temp = split( /\/\//, $data[0] );
754 @temp = split( /-/, $temp[0] );
755 $self->{LAST_READ_TRAJ_FILE} = $temp[0];
756 $self->{LAST_LEAD_TRAJ_FILE_REPLICATION} = $temp[1];
757 $self->{LAST_READ_ALLELES} = \@data[ 2 .. $#data ];
759 for ( 1 .. 3 ) {
760 $data = <$fh>;
761 while ( $data !~ /./ ) {
762 $data = <$fh>;
764 chomp $data;
766 @data = split( /\s+/, $data );
768 if ( $_ eq '1' ) {
769 $self->{LAST_READ_SEGSITES} = $data[1];
771 elsif ( $_ eq '2' ) {
772 $self->{LAST_READ_POSITIONS} = [ @data[ 1 .. $#data ] ];
774 else {
775 if ( !defined($data) ) {
776 $self->throw("run $self->{NEXT_RUN_NUM} has no haps./n");
778 $self->{BUFFER_HAP} = $data;
782 else { $self->throw("'//' not encountered when expected\n") }