maint: remove Travis stuff which has been replaced with Github actions (#325)
[bioperl-live.git] / lib / Bio / AlignIO / nexus.pm
blob9e1fcfadc7b20e4804c6ed69b1691eaab5fb74eb
2 # BioPerl module for Bio::AlignIO::nexus
4 # Copyright Heikki Lehvaslaiho
7 =head1 NAME
9 Bio::AlignIO::nexus - NEXUS format sequence input/output stream
11 =head1 SYNOPSIS
13 Do not use this module directly. Use it via the L<Bio::AlignIO> class.
15 use Bio::AlignIO;
17 my $in = Bio::AlignIO->new(-format => 'nexus',
18 -file => 'aln.nexus');
19 while( my $aln = $in->next_aln ) {
20 # do something with the alignment
23 =head1 DESCRIPTION
25 This object can transform L<Bio::Align::AlignI> objects to and from NEXUS
26 data blocks. See method documentation for supported NEXUS features.
28 =head1 ACKNOWLEDGEMENTS
30 Will Fisher has written an excellent standalone NEXUS format parser in
31 Perl, readnexus. A number of tricks were adapted from it.
33 =head1 FEEDBACK
35 =head2 Support
37 Please direct usage questions or support issues to the mailing list:
39 I<bioperl-l@bioperl.org>
41 rather than to the module maintainer directly. Many experienced and
42 reponsive experts will be able look at the problem and quickly
43 address it. Please include a thorough description of the problem
44 with code and data examples if at all possible.
46 =head2 Reporting Bugs
48 Report bugs to the Bioperl bug tracking system to help us keep track
49 the bugs and their resolution. Bug reports can be submitted via the
50 web:
52 https://github.com/bioperl/bioperl-live/issues
54 =head1 AUTHORS - Heikki Lehvaslaiho
56 Email: heikki-at-bioperl-dot-org
58 =head1 APPENDIX
60 The rest of the documentation details each of the object
61 methods. Internal methods are usually preceded with a _
63 =cut
65 # Let the code begin...
67 package Bio::AlignIO::nexus;
69 use vars qw(%valid_type);
70 use strict;
71 no strict "refs";
74 use base qw(Bio::AlignIO);
76 BEGIN {
77 %valid_type = map {$_, 1} qw( dna rna protein standard );
78 # standard throws error: inherited from Bio::PrimarySeq
81 =head2 new
83 Title : new
84 Usage : $alignio = Bio::AlignIO->new(-format => 'nexus', -file => 'filename');
85 Function: returns a new Bio::AlignIO object to handle clustalw files
86 Returns : Bio::AlignIO::clustalw object
87 Args : -verbose => verbosity setting (-1,0,1,2)
88 -file => name of file to read in or with ">" - writeout
89 -fh => alternative to -file param - provide a filehandle
90 to read from/write to
91 -format => type of Alignment Format to process or produce
93 Customization of nexus flavor output
95 -show_symbols => print the symbols="ATGC" in the data definition
96 (MrBayes does not like this)
97 boolean [default is 1]
98 -show_endblock => print an 'endblock;' at the end of the data
99 (MyBayes does not like this)
100 boolean [default is 1]
102 =cut
104 sub _initialize {
105 my ($self, @args) = @_;
106 $self->SUPER::_initialize(@args);
107 my ($show_symbols, $endblock) =
108 $self->_rearrange([qw(SHOW_SYMBOLS SHOW_ENDBLOCK)], @args);
109 my @names = qw(symbols endblock);
110 for my $v ( $show_symbols, $endblock ) {
111 $v = 1 unless defined $v; # default value is 1
112 my $n = shift @names;
113 $self->flag($n, $v);
118 =head2 next_aln
120 Title : next_aln
121 Usage : $aln = $stream->next_aln()
122 Function: Returns the next alignment in the stream.
124 Supports the following NEXUS format features:
125 - The file has to start with '#NEXUS'
126 - Reads in the name of the alignment from a comment
127 (anything after 'TITLE: ') .
128 - Sequence names can be given in a taxa block, too.
129 - If matchchar notation is used, converts
130 them back to sequence characters.
131 - Does character conversions specified in the
132 NEXUS equate command.
133 - Sequence names of type 'Homo sapiens' and
134 Homo_sapiens are treated identically.
136 Returns : L<Bio::Align::AlignI> object
137 Args :
140 =cut
143 sub next_aln {
144 my $self = shift;
145 my $entry;
146 my ($aln_name, $seqcount, $residuecount, %hash, $alphabet,
147 $match, $gap, $missing, $equate, $interleave,
148 $name,$str,@names,$seqname,$start,$end,$count,$seq);
149 local $Bio::LocatableSeq::OTHER_SYMBOLS = '\*\?\.';
150 local $Bio::LocatableSeq::GAP_SYMBOLS = '\-';
151 my $aln = Bio::SimpleAlign->new(-source => 'nexus');
153 # file starts with '#NEXUS' but we allow white space only lines before it
154 $entry = $self->_readline;
155 $entry = $self->_readline while defined $entry && $entry =~ /^\s+$/;
157 return unless $entry;
158 $self->throw("Not a valid interleaved NEXUS file! [#NEXUS] not starting the file\n$entry")
159 unless ($entry && $entry =~ /^#NEXUS/i);
161 # skip anything before either the taxa or data block
162 # but read in the optional title in a comment
163 while (defined($entry = $self->_readline)) {
164 local ($_) = $entry;
165 /\[TITLE. *([^\]]+)]\s+/i and $aln_name = $1;
166 last if /^begin +data/i || /^begin +taxa/i;
168 $aln_name =~ s/\s/_/g and $aln->id($aln_name) if $aln_name;
170 # data and taxa blocks
171 my $incomment;
172 while (defined ($entry = $self->_readline)) {
173 local ($_) = $entry;
174 next if s/\[[^\]]+\]//g; # remove comments
175 if( s/\[[^\]]+$// ) {
176 $incomment = 1;
177 # skip line if it is now empty or contains only whitespace
178 next if /^\s*$/;
179 } elsif($incomment) {
180 if( s/^[^\]]*\]// ) {
181 $incomment = 0;
182 } else {
183 next;
185 } elsif( /taxlabels/i ) {
186 # doesn't deal with taxlabels adequately and can mess things up!
187 # @names = $self->_read_taxlabels;
188 } else {
190 /ntax\s*=\s*(\d+)/i and $seqcount = $1;
191 /nchar\s*=\s*(\d+)/i and $residuecount = $1;
192 /matchchar\s*=\s*(.)/i and $match = $1;
193 /gap\s*=\s*(.)/i and $gap = $1;
194 /missing\s*=\s*(.)/i and $missing = $1;
195 /equate\s*=\s*\"([^\"]+)/i and $equate = $1; # "e.g. equate="T=C G=A";
196 /datatype\s*=\s*(\w+)/i and $alphabet = lc $1;
197 /interleave/i and $interleave = 1 ;
198 last if /matrix/io;
201 $self->throw("Not a valid NEXUS sequence file. Datatype not specified.")
202 unless $alphabet;
203 $self->throw("Not a valid NEXUS sequence file. Datatype should not be [$alphabet]")
204 unless $valid_type{$alphabet};
205 $self->throw("\"$gap\" is not a valid gap character. For compatability, gap char can not be one of: ()[]{}/\,;:=*'`\"<>^")
206 if $gap && $gap =~ /[\(\)\[\]\{\}\/\\\,\;\:\=\*\'\`\<\>\^]/;
207 $self->throw("\"$missing\" is not a valid missing character. For compatability, missing char can not be one of: ()[]{}/\,;:=*'`\"<>^")
208 if $missing && $missing =~ /[\(\)\[\]\{\}\/\\\,\;\:\=\*\'\`\<\>\^]/;
210 $aln->gap_char($gap);
211 $aln->missing_char($missing);
214 # if data is not right after the matrix line
215 # read the empty lines out
217 while ($entry = $self->_readline) {
218 unless ($entry =~ /^\s+$/) {
219 $self->_pushback($entry);
220 last;
225 # matrix command
227 # first alignment section
228 if (@names == 0) { # taxa block did not exist
229 while ($entry = $self->_readline) {
230 local ($_) = $entry;
231 if( s/\[[^\]]+\]//g ) { #] remove comments
232 next if /^\s*$/;
233 # skip line if it is now empty or contains only whitespace
235 if ($interleave && defined$count && ($count <= $seqcount)) {
236 /^\s+$/ and last;
237 } else {
238 /^\s+$/ and next;
240 /^\s*;/ and last; # stop if colon at end of matrix is on it's own line
241 #/^\s*;\s*$/ and last;
242 if ( /^\s*([\"\'](.+?)[\"\']|(\S+))\s+(.*)\s*$/ ) {
243 # get single and double quoted names, or all the first
244 # nonwhite word as the name, and remained is seq
245 #if (/^\s*('([^']*?)'|([^']\S*))\s+(.*)$/) { #'
246 $name = ($2 || $3);
247 if ($4) {
248 # seq is on same line as name
249 # this is the usual NEXUS format
250 $str = $4;
251 } else {
252 # otherwise get seq from following lines. No comments allowed
253 # a less common matrix format, usually used for very long seqs
254 $str='';
255 while (local ($_) = $self->_readline) {
256 my $str_tmp = $_;
257 $str_tmp =~ s/[\s;]//g;
258 $str .= $str_tmp;
259 last if length$str == $residuecount;
262 $name =~ s/ /_/g;
263 push @names, $name;
265 $str =~ s/[\s;]//g;
266 $count = @names;
267 $hash{$count} = $str;
269 $self->throw("Not a valid interleaved NEXUS file! seqcount [$count] > predeclared [$seqcount] in the first section") if $count > $seqcount;
270 /;/ and last; # stop if colon at end of matrix is on the same line as the last seq
274 # interleaved sections
275 $count = 0;
276 if ( $interleave ) { # only read next section if file is interleaved
277 while( $entry = $self->_readline) {
278 local ($_) = $entry;
279 if( s/\[[^\]]+\]//g ) { #] remove comments
280 next if /^\s*$/; # skip line if it is now empty or contains only whitespace
282 /^\s*;/ and last; # stop if colon at end of matrix is on it's own line
283 $count = 0, next if $entry =~ /^\s*$/;
284 if (/^\s*(\'([^\']*?)\'|([^\']\S*))\s+(.*)$/) {
285 $str = $4;
286 $str =~ s/[\s;]//g;
287 $count++;
288 $hash{$count} .= $str;
290 $self->throw("Not a valid interleaved NEXUS file!
291 seqcount [$count] > predeclared [$seqcount] ") if $count > $seqcount;
292 /;/ and last; # stop if colon at end of matrix is on the same line as the last seq
296 return if @names < 1;
298 # sequence creation
299 $count = 0;
300 foreach $name ( @names ) {
301 $count++;
302 if( $name =~ /(\S+)\/(\d+)-(\d+)/ ) {
303 ($seqname,$start,$end) = ($1,$2,$3);
304 } else {
305 ($seqname,$start,$str) = ($name,1,$hash{$count});
306 $str =~ s/[$Bio::LocatableSeq::GAP_SYMBOLS]//g;
307 $end = length($str);
310 # consistency test
311 $self->throw("Length of sequence [$seqname] is not [$residuecount]; got".CORE::length($hash{$count}))
312 unless CORE::length($hash{$count}) == $residuecount;
314 $seq = Bio::LocatableSeq->new('-seq' => $hash{$count},
315 '-display_id' => $seqname,
316 '-start' => $start,
317 '-end' => $end,
318 '-alphabet' => $alphabet
320 $aln->add_seq($seq);
323 # if matchchar is used
324 $aln->unmatch($match) if $match;
326 # if equate ( e.g. equate="T=C G=A") is used
327 if ($equate) {
328 $aln->map_chars($1, $2) while $equate =~ /(\S)=(\S)/g;
331 while (defined $entry &&
332 $entry !~ /endblock/i) {
333 $entry = $self->_readline;
336 return $aln if $aln->num_sequences;
337 return;
340 sub _read_taxlabels {
341 my ($self) = @_;
342 my ($name, @names);
343 while (my $entry = $self->_readline) {
344 last if $entry =~ m/^\s*(END)?;/i;
345 if( $entry =~ m/\s*(\S+)\s+/ ) {
346 ($name) = ($1);
347 $name =~ s/\[[^\[]+\]//g;
348 $name =~ s/\W/_/g;
349 push @names, $name;
352 return @names;
355 =head2 write_aln
357 Title : write_aln
358 Usage : $stream->write_aln(@aln)
359 Function: Writes the $aln object into the stream in interleaved NEXUS
360 format. Everything is written into a data block.
361 SimpleAlign methods match_char, missing_char and gap_char must be set
362 if you want to see them in the output.
363 Returns : 1 for success and 0 for error
364 Args : L<Bio::Align::AlignI> object
366 =cut
368 sub write_aln {
369 my ($self,@aln) = @_;
370 my $count = 0;
371 my $wrapped = 0;
372 my $maxname;
373 my ($length,$date,$name,$seq,$miss,$pad,%hash,@arr,$tempcount,$index );
374 my ($match, $missing, $gap,$symbols) = ('', '', '','');
376 foreach my $aln (@aln) {
377 if( ! $aln || ! $aln->isa('Bio::Align::AlignI') ) {
378 $self->warn("Must provide a Bio::Align::AlignI object when calling write_aln");
379 next;
381 $self->throw("All sequences in the alignment must be the same length")
382 unless $aln->is_flush($self->verbose);
384 $length = $aln->length();
386 $self->_print (sprintf("#NEXUS\n[TITLE: %s]\n\nbegin data;\ndimensions ntax=%s nchar=%s;\n",
387 $aln->id, $aln->num_sequences, $length));
388 $match = "match=". $aln->match_char if $aln->match_char;
389 $missing = "missing=". $aln->missing_char if $aln->missing_char;
390 $gap = "gap=". $aln->gap_char if $aln->gap_char;
392 $symbols = 'symbols="'.join('',$aln->symbol_chars). '"'
393 if( $self->flag('symbols') && $aln->symbol_chars);
394 $self->_print
395 (sprintf("format interleave datatype=%s %s %s %s %s;\n\nmatrix\n",
396 $aln->get_seq_by_pos(1)->alphabet, $match,
397 $missing, $gap, $symbols));
399 # account for single quotes round names
400 my $indent = $aln->maxdisplayname_length+2;
402 $aln->set_displayname_flat();
403 foreach $seq ( $aln->each_seq() ) {
404 my $nmid = $aln->displayname($seq->get_nse());
405 if( $nmid =~ /[^\w\d\.]/ ) {
406 # put name in single quotes incase it contains any of
407 # the following chars: ()[]{}/\,;:=*'"`+-<> that are not
408 # allowed in PAUP* and possible other software
410 $name = sprintf("%-${indent}s", "\'" . $nmid . "\'");
411 } else {
412 $name = sprintf("%-${indent}s", $nmid);
414 $hash{$name} = $seq->seq;
415 push(@arr,$name);
418 while( $count < $length ) {
419 # there is another block to go!
420 foreach $name ( @arr ) {
421 my $dispname = $name;
422 # $dispname = '' if $wrapped;
423 $self->_print (sprintf("%${indent}s ",$dispname));
424 $tempcount = $count;
425 $index = 0;
426 while( ($tempcount + 10 < $length) && ($index < 5) ) {
427 $self->_print (sprintf("%s ",substr($hash{$name},$tempcount,10)));
428 $tempcount += 10;
429 $index++;
431 # last
432 if( $index < 5) {
433 # space to print!
434 $self->_print (sprintf("%s ",substr($hash{$name},$tempcount)));
435 $tempcount += 10;
437 $self->_print ("\n");
439 $self->_print ("\n\n");
440 $count = $tempcount;
441 $wrapped = 1;
443 if( $self->flag('endblock') ) {
444 $self->_print (";\n\nendblock;\n");
445 } else {
446 $self->_print (";\n\nend;\n");
449 $self->flush if $self->_flush_on_write && defined $self->_fh;
450 return 1;
453 =head2 flag
455 Title : flag
456 Usage : $obj->flag($name,$value)
457 Function: Get/Set a flag value
458 Returns : value of flag (a scalar)
459 Args : on set, new value (a scalar or undef, optional)
462 =cut
464 sub flag{
465 my ($self,$name,$val) = @_;
466 return $self->{'flag'}->{$name} = $val if defined $val;
467 return $self->{'flag'}->{$name};