2 # BioPerl module for Bio::AlignIO::nexus
4 # Copyright Heikki Lehvaslaiho
9 Bio::AlignIO::nexus - NEXUS format sequence input/output stream
13 Do not use this module directly. Use it via the L<Bio::AlignIO> class.
17 my $in = Bio::AlignIO->new(-format => 'nexus',
18 -file => 'aln.nexus');
19 while( my $aln = $in->next_aln ) {
20 # do something with the alignment
25 This object can transform L<Bio::Align::AlignI> objects to and from NEXUS
26 data blocks. See method documentation for supported NEXUS features.
28 =head1 ACKNOWLEDGEMENTS
30 Will Fisher has written an excellent standalone NEXUS format parser in
31 Perl, readnexus. A number of tricks were adapted from it.
37 Please direct usage questions or support issues to the mailing list:
39 I<bioperl-l@bioperl.org>
41 rather than to the module maintainer directly. Many experienced and
42 reponsive experts will be able look at the problem and quickly
43 address it. Please include a thorough description of the problem
44 with code and data examples if at all possible.
48 Report bugs to the Bioperl bug tracking system to help us keep track
49 the bugs and their resolution. Bug reports can be submitted via the
52 https://github.com/bioperl/bioperl-live/issues
54 =head1 AUTHORS - Heikki Lehvaslaiho
56 Email: heikki-at-bioperl-dot-org
60 The rest of the documentation details each of the object
61 methods. Internal methods are usually preceded with a _
65 # Let the code begin...
67 package Bio
::AlignIO
::nexus
;
69 use vars
qw(%valid_type);
74 use base qw(Bio::AlignIO);
77 %valid_type = map {$_, 1} qw( dna rna protein standard );
78 # standard throws error: inherited from Bio::PrimarySeq
84 Usage : $alignio = Bio::AlignIO->new(-format => 'nexus', -file => 'filename');
85 Function: returns a new Bio::AlignIO object to handle clustalw files
86 Returns : Bio::AlignIO::clustalw object
87 Args : -verbose => verbosity setting (-1,0,1,2)
88 -file => name of file to read in or with ">" - writeout
89 -fh => alternative to -file param - provide a filehandle
91 -format => type of Alignment Format to process or produce
93 Customization of nexus flavor output
95 -show_symbols => print the symbols="ATGC" in the data definition
96 (MrBayes does not like this)
97 boolean [default is 1]
98 -show_endblock => print an 'endblock;' at the end of the data
99 (MyBayes does not like this)
100 boolean [default is 1]
105 my ($self, @args) = @_;
106 $self->SUPER::_initialize
(@args);
107 my ($show_symbols, $endblock) =
108 $self->_rearrange([qw(SHOW_SYMBOLS SHOW_ENDBLOCK)], @args);
109 my @names = qw(symbols endblock);
110 for my $v ( $show_symbols, $endblock ) {
111 $v = 1 unless defined $v; # default value is 1
112 my $n = shift @names;
121 Usage : $aln = $stream->next_aln()
122 Function: Returns the next alignment in the stream.
124 Supports the following NEXUS format features:
125 - The file has to start with '#NEXUS'
126 - Reads in the name of the alignment from a comment
127 (anything after 'TITLE: ') .
128 - Sequence names can be given in a taxa block, too.
129 - If matchchar notation is used, converts
130 them back to sequence characters.
131 - Does character conversions specified in the
132 NEXUS equate command.
133 - Sequence names of type 'Homo sapiens' and
134 Homo_sapiens are treated identically.
136 Returns : L<Bio::Align::AlignI> object
146 my ($aln_name, $seqcount, $residuecount, %hash, $alphabet,
147 $match, $gap, $missing, $equate, $interleave,
148 $name,$str,@names,$seqname,$start,$end,$count,$seq);
149 local $Bio::LocatableSeq
::OTHER_SYMBOLS
= '\*\?\.';
150 local $Bio::LocatableSeq
::GAP_SYMBOLS
= '\-';
151 my $aln = Bio
::SimpleAlign
->new(-source
=> 'nexus');
153 # file starts with '#NEXUS' but we allow white space only lines before it
154 $entry = $self->_readline;
155 $entry = $self->_readline while defined $entry && $entry =~ /^\s+$/;
157 return unless $entry;
158 $self->throw("Not a valid interleaved NEXUS file! [#NEXUS] not starting the file\n$entry")
159 unless ($entry && $entry =~ /^#NEXUS/i);
161 # skip anything before either the taxa or data block
162 # but read in the optional title in a comment
163 while (defined($entry = $self->_readline)) {
165 /\[TITLE. *([^\]]+)]\s+/i and $aln_name = $1;
166 last if /^begin +data/i || /^begin +taxa/i;
168 $aln_name =~ s/\s/_/g and $aln->id($aln_name) if $aln_name;
170 # data and taxa blocks
172 while (defined ($entry = $self->_readline)) {
174 next if s/\[[^\]]+\]//g; # remove comments
175 if( s/\[[^\]]+$// ) {
177 # skip line if it is now empty or contains only whitespace
179 } elsif($incomment) {
180 if( s/^[^\]]*\]// ) {
185 } elsif( /taxlabels/i ) {
186 # doesn't deal with taxlabels adequately and can mess things up!
187 # @names = $self->_read_taxlabels;
190 /ntax\s*=\s*(\d+)/i and $seqcount = $1;
191 /nchar\s*=\s*(\d+)/i and $residuecount = $1;
192 /matchchar\s*=\s*(.)/i and $match = $1;
193 /gap\s*=\s*(.)/i and $gap = $1;
194 /missing\s*=\s*(.)/i and $missing = $1;
195 /equate\s*=\s*\"([^\"]+)/i and $equate = $1; # "e.g. equate="T=C G=A";
196 /datatype\s*=\s*(\w+)/i and $alphabet = lc $1;
197 /interleave/i and $interleave = 1 ;
201 $self->throw("Not a valid NEXUS sequence file. Datatype not specified.")
203 $self->throw("Not a valid NEXUS sequence file. Datatype should not be [$alphabet]")
204 unless $valid_type{$alphabet};
205 $self->throw("\"$gap\" is not a valid gap character. For compatability, gap char can not be one of: ()[]{}/\,;:=*'`\"<>^")
206 if $gap && $gap =~ /[\(\)\[\]\{\}\/\\\
,\
;\
:\
=\
*\'\
`\<\>\^]/;
207 $self->throw("\"$missing\" is not a valid missing character. For compatability, missing char can not be one of: ()[]{}/\,;:=*'`\"<>^")
208 if $missing && $missing =~ /[\(\)\[\]\{\}\/\\\,\;\:\=\*\'\`\<\>\^]/;
210 $aln->gap_char($gap);
211 $aln->missing_char($missing);
214 # if data is not right after the matrix line
215 # read the empty lines out
217 while ($entry = $self->_readline) {
218 unless ($entry =~ /^\s+$/) {
219 $self->_pushback($entry);
227 # first alignment section
228 if (@names == 0) { # taxa block did not exist
229 while ($entry = $self->_readline) {
231 if( s/\[[^\]]+\]//g ) { #] remove comments
233 # skip line if it is now empty or contains only whitespace
235 if ($interleave && defined$count && ($count <= $seqcount)) {
240 /^\s*;/ and last; # stop if colon at end of matrix is on it's own line
241 #/^\s*;\s*$/ and last;
242 if ( /^\s*([\"\'](.+?)[\"\']|(\S+))\s+(.*)\s*$/ ) {
243 # get single and double quoted names, or all the first
244 # nonwhite word as the name, and remained is seq
245 #if (/^\s*('([^']*?)'|([^']\S*))\s+(.*)$/) { #'
248 # seq is on same line as name
249 # this is the usual NEXUS format
252 # otherwise get seq from following lines. No comments allowed
253 # a less common matrix format, usually used for very long seqs
255 while (local ($_) = $self->_readline) {
257 $str_tmp =~ s/[\s;]//g;
259 last if length$str == $residuecount;
267 $hash{$count} = $str;
269 $self->throw("Not a valid interleaved NEXUS file
! seqcount
[$count] > predeclared
[$seqcount] in the first section
") if $count > $seqcount;
270 /;/ and last; # stop if colon at end of matrix is on the same line as the last seq
274 # interleaved sections
276 if ( $interleave ) { # only read next section if file is interleaved
277 while( $entry = $self->_readline) {
279 if( s/\[[^\]]+\]//g ) { #] remove comments
280 next if /^\s*$/; # skip line if it is now empty or contains only whitespace
282 /^\s*;/ and last; # stop if colon at end of matrix is on it's own line
283 $count = 0, next if $entry =~ /^\s*$/;
284 if (/^\s*(\'([^\']*?)\'|([^\']\S*))\s+(.*)$/) {
288 $hash{$count} .= $str;
290 $self->throw("Not a valid interleaved NEXUS file
!
291 seqcount
[$count] > predeclared
[$seqcount] ") if $count > $seqcount;
292 /;/ and last; # stop if colon at end of matrix is on the same line as the last seq
296 return if @names < 1;
300 foreach $name ( @names ) {
302 if( $name =~ /(\S+)\/(\d+)-(\d+)/ ) {
303 ($seqname,$start,$end) = ($1,$2,$3);
305 ($seqname,$start,$str) = ($name,1,$hash{$count});
306 $str =~ s/[$Bio::LocatableSeq::GAP_SYMBOLS]//g;
311 $self->throw("Length of sequence
[$seqname] is
not [$residuecount]; got
".CORE::length($hash{$count}))
312 unless CORE::length($hash{$count}) == $residuecount;
314 $seq = Bio::LocatableSeq->new('-seq' => $hash{$count},
315 '-display_id' => $seqname,
318 '-alphabet' => $alphabet
323 # if matchchar is used
324 $aln->unmatch($match) if $match;
326 # if equate ( e.g. equate="T
=C G
=A
") is used
328 $aln->map_chars($1, $2) while $equate =~ /(\S)=(\S)/g;
331 while (defined $entry &&
332 $entry !~ /endblock/i) {
333 $entry = $self->_readline;
336 return $aln if $aln->num_sequences;
340 sub _read_taxlabels {
343 while (my $entry = $self->_readline) {
344 last if $entry =~ m/^\s*(END)?;/i;
345 if( $entry =~ m/\s*(\S+)\s+/ ) {
347 $name =~ s/\[[^\[]+\]//g;
358 Usage : $stream->write_aln(@aln)
359 Function: Writes the $aln object into the stream in interleaved NEXUS
360 format. Everything is written into a data block.
361 SimpleAlign methods match_char, missing_char and gap_char must be set
362 if you want to see them in the output.
363 Returns : 1 for success and 0 for error
364 Args : L<Bio::Align::AlignI> object
369 my ($self,@aln) = @_;
373 my ($length,$date,$name,$seq,$miss,$pad,%hash,@arr,$tempcount,$index );
374 my ($match, $missing, $gap,$symbols) = ('', '', '','');
376 foreach my $aln (@aln) {
377 if( ! $aln || ! $aln->isa('Bio::Align::AlignI') ) {
378 $self->warn("Must provide a Bio
::Align
::AlignI object
when calling write_aln
");
381 $self->throw("All sequences
in the alignment must be the same
length")
382 unless $aln->is_flush($self->verbose);
384 $length = $aln->length();
386 $self->_print (sprintf("#NEXUS\n[TITLE: %s]\n\nbegin data;\ndimensions ntax=%s nchar=%s;\n",
387 $aln->id, $aln->num_sequences, $length));
388 $match = "match=". $aln->match_char if $aln->match_char;
389 $missing = "missing=". $aln->missing_char if $aln->missing_char;
390 $gap = "gap=". $aln->gap_char if $aln->gap_char;
392 $symbols = 'symbols="'.join('',$aln->symbol_chars). '"'
393 if( $self->flag('symbols') && $aln->symbol_chars);
395 (sprintf("format interleave datatype=%s %s %s %s %s;\n\nmatrix\n",
396 $aln->get_seq_by_pos(1)->alphabet, $match,
397 $missing, $gap, $symbols));
399 # account for single quotes round names
400 my $indent = $aln->maxdisplayname_length+2;
402 $aln->set_displayname_flat();
403 foreach $seq ( $aln->each_seq() ) {
404 my $nmid = $aln->displayname($seq->get_nse());
405 if( $nmid =~ /[^\w\d\.]/ ) {
406 # put name in single quotes incase it contains any of
407 # the following chars: ()[]{}/\,;:=*'"`+-<> that are not
408 # allowed in PAUP* and possible other software
410 $name = sprintf("%-${indent}s", "\'" . $nmid . "\'");
412 $name = sprintf("%-${indent}s", $nmid);
414 $hash{$name} = $seq->seq;
418 while( $count < $length ) {
419 # there is another block to go!
420 foreach $name ( @arr ) {
421 my $dispname = $name;
422 # $dispname = '' if $wrapped;
423 $self->_print (sprintf("%${indent}s ",$dispname));
426 while( ($tempcount + 10 < $length) && ($index < 5) ) {
427 $self->_print (sprintf("%s ",substr($hash{$name},$tempcount,10)));
434 $self->_print (sprintf("%s ",substr($hash{$name},$tempcount)));
437 $self->_print ("\n");
439 $self->_print ("\n\n");
443 if( $self->flag('endblock') ) {
444 $self->_print (";\n\nendblock;\n");
446 $self->_print (";\n\nend;\n");
449 $self->flush if $self->_flush_on_write && defined $self->_fh;
456 Usage : $obj->flag($name,$value)
457 Function: Get/Set a flag value
458 Returns : value of flag (a scalar)
459 Args : on set, new value (a scalar or undef, optional)
465 my ($self,$name,$val) = @_;
466 return $self->{'flag'}->{$name} = $val if defined $val;
467 return $self->{'flag'}->{$name};