2 # BioPerl module for Bio::AlignIO::nexus
4 # Copyright Heikki Lehvaslaiho
9 Bio::AlignIO::nexus - NEXUS format sequence input/output stream
13 Do not use this module directly. Use it via the L<Bio::AlignIO> class.
17 my $in = Bio::AlignIO->new(-format => 'nexus',
18 -file => 'aln.nexus');
19 while( my $aln = $in->next_aln ) {
20 # do something with the alignment
25 This object can transform L<Bio::Align::AlignI> objects to and from NEXUS
26 data blocks. See method documentation for supported NEXUS features.
28 =head1 ACKNOWLEDGEMENTS
30 Will Fisher has written an excellent standalone NEXUS format parser in
31 Perl, readnexus. A number of tricks were adapted from it.
37 Please direct usage questions or support issues to the mailing list:
39 I<bioperl-l@bioperl.org>
41 rather than to the module maintainer directly. Many experienced and
42 reponsive experts will be able look at the problem and quickly
43 address it. Please include a thorough description of the problem
44 with code and data examples if at all possible.
48 Report bugs to the Bioperl bug tracking system to help us keep track
49 the bugs and their resolution. Bug reports can be submitted via the
52 https://github.com/bioperl/bioperl-live/issues
54 =head1 AUTHORS - Heikki Lehvaslaiho
56 Email: heikki-at-bioperl-dot-org
60 The rest of the documentation details each of the object
61 methods. Internal methods are usually preceded with a _
65 # Let the code begin...
67 package Bio
::AlignIO
::nexus
;
68 use vars
qw(%valid_type);
73 use base qw(Bio::AlignIO);
76 %valid_type = map {$_, 1} qw( dna rna protein standard );
77 # standard throws error: inherited from Bio::PrimarySeq
83 Usage : $alignio = Bio::AlignIO->new(-format => 'nexus', -file => 'filename');
84 Function: returns a new Bio::AlignIO object to handle clustalw files
85 Returns : Bio::AlignIO::clustalw object
86 Args : -verbose => verbosity setting (-1,0,1,2)
87 -file => name of file to read in or with ">" - writeout
88 -fh => alternative to -file param - provide a filehandle
90 -format => type of Alignment Format to process or produce
92 Customization of nexus flavor output
94 -show_symbols => print the symbols="ATGC" in the data definition
95 (MrBayes does not like this)
96 boolean [default is 1]
97 -show_endblock => print an 'endblock;' at the end of the data
98 (MyBayes does not like this)
99 boolean [default is 1]
104 my ($self, @args) = @_;
105 $self->SUPER::_initialize
(@args);
106 my ($show_symbols, $endblock) =
107 $self->_rearrange([qw(SHOW_SYMBOLS SHOW_ENDBLOCK)], @args);
108 my @names = qw(symbols endblock);
109 for my $v ( $show_symbols, $endblock ) {
110 $v = 1 unless defined $v; # default value is 1
111 my $n = shift @names;
120 Usage : $aln = $stream->next_aln()
121 Function: Returns the next alignment in the stream.
123 Supports the following NEXUS format features:
124 - The file has to start with '#NEXUS'
125 - Reads in the name of the alignment from a comment
126 (anything after 'TITLE: ') .
127 - Sequence names can be given in a taxa block, too.
128 - If matchchar notation is used, converts
129 them back to sequence characters.
130 - Does character conversions specified in the
131 NEXUS equate command.
132 - Sequence names of type 'Homo sapiens' and
133 Homo_sapiens are treated identically.
135 Returns : L<Bio::Align::AlignI> object
145 my ($aln_name, $seqcount, $residuecount, %hash, $alphabet,
146 $match, $gap, $missing, $equate, $interleave,
147 $name,$str,@names,$seqname,$start,$end,$count,$seq);
148 local $Bio::LocatableSeq
::OTHER_SYMBOLS
= '\*\?\.';
149 local $Bio::LocatableSeq
::GAP_SYMBOLS
= '\-';
150 my $aln = Bio
::SimpleAlign
->new(-source
=> 'nexus');
152 # file starts with '#NEXUS' but we allow white space only lines before it
153 $entry = $self->_readline;
154 $entry = $self->_readline while defined $entry && $entry =~ /^\s+$/;
156 return unless $entry;
157 $self->throw("Not a valid interleaved NEXUS file! [#NEXUS] not starting the file\n$entry")
158 unless ($entry && $entry =~ /^#NEXUS/i);
160 # skip anything before either the taxa or data block
161 # but read in the optional title in a comment
162 while (defined($entry = $self->_readline)) {
164 /\[TITLE. *([^\]]+)]\s+/i and $aln_name = $1;
165 last if /^begin +data/i || /^begin +taxa/i;
167 $aln_name =~ s/\s/_/g and $aln->id($aln_name) if $aln_name;
169 # data and taxa blocks
171 while (defined ($entry = $self->_readline)) {
173 next if s/\[[^\]]+\]//g; # remove comments
174 if( s/\[[^\]]+$// ) {
176 # skip line if it is now empty or contains only whitespace
178 } elsif($incomment) {
179 if( s/^[^\]]*\]// ) {
184 } elsif( /taxlabels/i ) {
185 # doesn't deal with taxlabels adequately and can mess things up!
186 # @names = $self->_read_taxlabels;
189 /ntax\s*=\s*(\d+)/i and $seqcount = $1;
190 /nchar\s*=\s*(\d+)/i and $residuecount = $1;
191 /matchchar\s*=\s*(.)/i and $match = $1;
192 /gap\s*=\s*(.)/i and $gap = $1;
193 /missing\s*=\s*(.)/i and $missing = $1;
194 /equate\s*=\s*\"([^\"]+)/i and $equate = $1; # "e.g. equate="T=C G=A";
195 /datatype\s*=\s*(\w+)/i and $alphabet = lc $1;
196 /interleave/i and $interleave = 1 ;
200 $self->throw("Not a valid NEXUS sequence file. Datatype not specified.")
202 $self->throw("Not a valid NEXUS sequence file. Datatype should not be [$alphabet]")
203 unless $valid_type{$alphabet};
204 $self->throw("\"$gap\" is not a valid gap character. For compatability, gap char can not be one of: ()[]{}/\,;:=*'`\"<>^")
205 if $gap && $gap =~ /[\(\)\[\]\{\}\/\\\
,\
;\
:\
=\
*\'\
`\<\>\^]/;
206 $self->throw("\"$missing\" is not a valid missing character. For compatability, missing char can not be one of: ()[]{}/\,;:=*'`\"<>^")
207 if $missing && $missing =~ /[\(\)\[\]\{\}\/\\\,\;\:\=\*\'\`\<\>\^]/;
209 $aln->gap_char($gap);
210 $aln->missing_char($missing);
213 # if data is not right after the matrix line
214 # read the empty lines out
216 while ($entry = $self->_readline) {
217 unless ($entry =~ /^\s+$/) {
218 $self->_pushback($entry);
226 # first alignment section
227 if (@names == 0) { # taxa block did not exist
228 while ($entry = $self->_readline) {
230 if( s/\[[^\]]+\]//g ) { #] remove comments
232 # skip line if it is now empty or contains only whitespace
234 if ($interleave && defined$count && ($count <= $seqcount)) {
239 /^\s*;/ and last; # stop if colon at end of matrix is on it's own line
240 #/^\s*;\s*$/ and last;
241 if ( /^\s*([\"\'](.+?)[\"\']|(\S+))\s+(.*)\s*$/ ) {
242 # get single and double quoted names, or all the first
243 # nonwhite word as the name, and remained is seq
244 #if (/^\s*('([^']*?)'|([^']\S*))\s+(.*)$/) { #'
247 # seq is on same line as name
248 # this is the usual NEXUS format
251 # otherwise get seq from following lines. No comments allowed
252 # a less common matrix format, usually used for very long seqs
254 while (local ($_) = $self->_readline) {
256 $str_tmp =~ s/[\s;]//g;
258 last if length$str == $residuecount;
266 $hash{$count} = $str;
268 $self->throw("Not a valid interleaved NEXUS file
! seqcount
[$count] > predeclared
[$seqcount] in the first section
") if $count > $seqcount;
269 /;/ and last; # stop if colon at end of matrix is on the same line as the last seq
273 # interleaved sections
275 if ( $interleave ) { # only read next section if file is interleaved
276 while( $entry = $self->_readline) {
278 if( s/\[[^\]]+\]//g ) { #] remove comments
279 next if /^\s*$/; # skip line if it is now empty or contains only whitespace
281 /^\s*;/ and last; # stop if colon at end of matrix is on it's own line
282 $count = 0, next if $entry =~ /^\s*$/;
283 if (/^\s*(\'([^\']*?)\'|([^\']\S*))\s+(.*)$/) {
287 $hash{$count} .= $str;
289 $self->throw("Not a valid interleaved NEXUS file
!
290 seqcount
[$count] > predeclared
[$seqcount] ") if $count > $seqcount;
291 /;/ and last; # stop if colon at end of matrix is on the same line as the last seq
295 return if @names < 1;
299 foreach $name ( @names ) {
301 if( $name =~ /(\S+)\/(\d+)-(\d+)/ ) {
302 ($seqname,$start,$end) = ($1,$2,$3);
304 ($seqname,$start,$str) = ($name,1,$hash{$count});
305 $str =~ s/[$Bio::LocatableSeq::GAP_SYMBOLS]//g;
310 $self->throw("Length of sequence
[$seqname] is
not [$residuecount]; got
".CORE::length($hash{$count}))
311 unless CORE::length($hash{$count}) == $residuecount;
313 $seq = Bio::LocatableSeq->new('-seq' => $hash{$count},
314 '-display_id' => $seqname,
317 '-alphabet' => $alphabet
322 # if matchchar is used
323 $aln->unmatch($match) if $match;
325 # if equate ( e.g. equate="T
=C G
=A
") is used
327 $aln->map_chars($1, $2) while $equate =~ /(\S)=(\S)/g;
330 while (defined $entry &&
331 $entry !~ /endblock/i) {
332 $entry = $self->_readline;
335 return $aln if $aln->num_sequences;
339 sub _read_taxlabels {
342 while (my $entry = $self->_readline) {
343 last if $entry =~ m/^\s*(END)?;/i;
344 if( $entry =~ m/\s*(\S+)\s+/ ) {
346 $name =~ s/\[[^\[]+\]//g;
357 Usage : $stream->write_aln(@aln)
358 Function: Writes the $aln object into the stream in interleaved NEXUS
359 format. Everything is written into a data block.
360 SimpleAlign methods match_char, missing_char and gap_char must be set
361 if you want to see them in the output.
362 Returns : 1 for success and 0 for error
363 Args : L<Bio::Align::AlignI> object
368 my ($self,@aln) = @_;
372 my ($length,$date,$name,$seq,$miss,$pad,%hash,@arr,$tempcount,$index );
373 my ($match, $missing, $gap,$symbols) = ('', '', '','');
375 foreach my $aln (@aln) {
376 if( ! $aln || ! $aln->isa('Bio::Align::AlignI') ) {
377 $self->warn("Must provide a Bio
::Align
::AlignI object
when calling write_aln
");
380 $self->throw("All sequences
in the alignment must be the same
length")
381 unless $aln->is_flush($self->verbose);
383 $length = $aln->length();
385 $self->_print (sprintf("#NEXUS\n[TITLE: %s]\n\nbegin data;\ndimensions ntax=%s nchar=%s;\n",
386 $aln->id, $aln->num_sequences, $length));
387 $match = "match=". $aln->match_char if $aln->match_char;
388 $missing = "missing=". $aln->missing_char if $aln->missing_char;
389 $gap = "gap=". $aln->gap_char if $aln->gap_char;
391 $symbols = 'symbols="'.join('',$aln->symbol_chars). '"'
392 if( $self->flag('symbols') && $aln->symbol_chars);
394 (sprintf("format interleave datatype=%s %s %s %s %s;\n\nmatrix\n",
395 $aln->get_seq_by_pos(1)->alphabet, $match,
396 $missing, $gap, $symbols));
398 # account for single quotes round names
399 my $indent = $aln->maxdisplayname_length+2;
401 $aln->set_displayname_flat();
402 foreach $seq ( $aln->each_seq() ) {
403 my $nmid = $aln->displayname($seq->get_nse());
404 if( $nmid =~ /[^\w\d\.]/ ) {
405 # put name in single quotes incase it contains any of
406 # the following chars: ()[]{}/\,;:=*'"`+-<> that are not
407 # allowed in PAUP* and possible other software
409 $name = sprintf("%-${indent}s", "\'" . $nmid . "\'");
411 $name = sprintf("%-${indent}s", $nmid);
413 $hash{$name} = $seq->seq;
417 while( $count < $length ) {
418 # there is another block to go!
419 foreach $name ( @arr ) {
420 my $dispname = $name;
421 # $dispname = '' if $wrapped;
422 $self->_print (sprintf("%${indent}s ",$dispname));
425 while( ($tempcount + 10 < $length) && ($index < 5) ) {
426 $self->_print (sprintf("%s ",substr($hash{$name},$tempcount,10)));
433 $self->_print (sprintf("%s ",substr($hash{$name},$tempcount)));
436 $self->_print ("\n");
438 $self->_print ("\n\n");
442 if( $self->flag('endblock') ) {
443 $self->_print (";\n\nendblock;\n");
445 $self->_print (";\n\nend;\n");
448 $self->flush if $self->_flush_on_write && defined $self->_fh;
455 Usage : $obj->flag($name,$value)
456 Function: Get/Set a flag value
457 Returns : value of flag (a scalar)
458 Args : on set, new value (a scalar or undef, optional)
464 my ($self,$name,$val) = @_;
465 return $self->{'flag'}->{$name} = $val if defined $val;
466 return $self->{'flag'}->{$name};