2 # BioPerl module for Bio::AlignIO::mega
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Jason Stajich <jason-at-bioperl.org>
8 # Copyright Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::AlignIO::mega - Parse and Create MEGA format data files
21 my $alignio = Bio::AlignIO->new(-format => 'mega',
22 -file => 't/data/hemoglobinA.meg');
24 while( my $aln = $alignio->next_aln ) {
25 # process each alignment or convert to another format like NEXUS
30 This object handles reading and writing data streams in the MEGA
31 format (Kumar and Nei).
38 User feedback is an integral part of the evolution of this and other
39 Bioperl modules. Send your comments and suggestions preferably to
40 the Bioperl mailing list. Your participation is much appreciated.
42 bioperl-l@bioperl.org - General discussion
43 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
47 Please direct usage questions or support issues to the mailing list:
49 I<bioperl-l@bioperl.org>
51 rather than to the module maintainer directly. Many experienced and
52 responsive experts will be able to look at the problem and quickly
53 address it. Please include a thorough description of the problem
54 with code and data examples if at all possible.
58 Report bugs to the Bioperl bug tracking system to help us keep track
59 of the bugs and their resolution. Bug reports can be submitted via the
62 https://github.com/bioperl/bioperl-live/issues
64 =head1 AUTHOR - Jason Stajich
66 Email jason-at-bioperl.org
70 The rest of the documentation details each of the object methods.
71 Internal methods are usually preceded with a _
76 # Let the code begin...
79 package Bio
::AlignIO
::mega
;
81 use vars
qw($MEGANAMELEN %VALID_TYPES $LINELEN $BLOCKLEN);
85 use Bio::LocatableSeq;
87 # symbols are changed due to MEGA's use of '.' for redundant sequences
93 %VALID_TYPES = map {$_, 1} qw( dna rna protein standard);
95 use base
qw(Bio::AlignIO);
101 Usage : $aln = $stream->next_aln()
102 Function: returns the next alignment in the stream.
103 Supports the following MEGA format features:
104 - The file has to start with '#mega'
105 - Reads in the name of the alignment from a comment
106 (anything after '!TITLE: ').
107 - Reads in the format parameters datatype
109 Returns : L<Bio::Align::AlignI> object - returns 0 on end of file
119 my ($alphabet,%seqs);
120 local $Bio::LocatableSeq
::OTHER_SYMBOLS
= '\*\?\.';
121 local $Bio::LocatableSeq
::GAP_SYMBOLS
= '\-';
122 my $aln = Bio
::SimpleAlign
->new(-source
=> 'mega');
124 while( defined($entry = $self->_readline()) && ($entry =~ /^\s+$/) ) {}
126 $self->throw("Not a valid MEGA file! [#mega] not starting the file!")
127 unless $entry =~ /^#mega/i;
129 while( defined($entry = $self->_readline() ) ) {
131 if(/\!Title:\s*([^\;]+)\s*/i) { $aln->id($1)}
132 elsif( s/\!Format\s+([^\;]+)\s*/$1/ ) {
133 my (@fields) = split(/\s+/,$1);
134 foreach my $f ( @fields ) {
135 my ($name,$value) = split(/\=/,$f);
136 if( $name eq 'datatype' ) {
138 } elsif( $name eq 'identical' ) {
139 $aln->match_char($value);
140 } elsif( $name eq 'indel' ) {
141 $aln->gap_char($value);
149 while( defined($entry) ) {
150 if( $entry !~ /^\s+$/ ) {
151 # this is to skip the leading '#'
152 my $seqname = substr($entry,1,$MEGANAMELEN-1);
153 $seqname =~ s/(\S+)\s+$/$1/g;
154 my $line = substr($entry,$MEGANAMELEN);
156 if( ! defined $seqs{$seqname} ) {push @order, $seqname; }
157 $seqs{$seqname} .= $line;
159 $entry = $self->_readline();
162 foreach my $seqname ( @order ) {
163 my $s = $seqs{$seqname};
164 $s =~ s/[$Bio::LocatableSeq::GAP_SYMBOLS]+//g;
165 my $end = length($s);
166 my $seq = Bio
::LocatableSeq
->new('-alphabet' => $alphabet,
167 '-display_id' => $seqname,
168 '-seq' => $seqs{$seqname},
175 return $aln if $aln->num_sequences;
182 Usage : $stream->write_aln(@aln)
183 Function: writes the $aln object into the stream in MEGA format
184 Returns : 1 for success and 0 for error
185 Args : L<Bio::Align::AlignI> object
190 my ($self,@aln) = @_;
195 foreach my $aln ( @aln ) {
196 if( ! $aln || ! $aln->isa('Bio::Align::AlignI') ) {
197 $self->warn("Must provide a Bio::Align::AlignI object when calling write_aln");
199 } elsif( ! $aln->is_flush($self->verbose) ) {
200 $self->warn("All Sequences in the alignment must be the same length");
204 my $len = $aln->length();
205 my $format = sprintf('datatype=%s identical=%s indel=%s;',
206 $aln->get_seq_by_pos(1)->alphabet(),
207 $aln->match_char, $aln->gap_char);
209 $self->_print(sprintf("#mega\n!Title: %s;\n!Format %s\n\n\n",
212 my ($count, $blockcount,$length) = ( 0,0,$aln->length());
213 $aln->set_displayname_flat();
214 while( $count < $length ) {
215 foreach my $seq ( $aln->each_seq ) {
216 my $seqchars = $seq->seq();
218 my $substring = substr($seqchars, $count, $LINELEN);
220 while( $blockcount < length($substring) ) {
221 push @blocks, substr($substring, $blockcount,$BLOCKLEN);
222 $blockcount += $BLOCKLEN;
224 $self->_print(sprintf("#%-".($MEGANAMELEN-1)."s%s\n",
225 substr($aln->displayname($seq->get_nse()),
227 join(' ', @blocks)));
233 $self->flush if $self->_flush_on_write && defined $self->_fh;