2 # BioPerl module for Bio::AlignIO::phylip
4 # Copyright Heikki Lehvaslaiho
9 Bio::AlignIO::phylip - PHYLIP format sequence input/output stream
13 Do not use this module directly. Use it via the Bio::AlignIO class.
15 This example shows how to write to phylip format:
20 # Use -idlength to set the name length to something other than
21 # the default 10 if you need longer ids.
22 my $phylipstream = Bio::AlignIO->new(-format => 'phylip',
25 # Convert data from one format to another
26 my $gcgstream = Bio::AlignIO->new(-format => 'msf',
27 -file => 't/data/cysprot1a.msf');
29 while( my $aln = $gcgstream->next_aln ) {
30 $phylipstream->write_aln($aln);
33 # With phylip sequential format format
34 $phylipstream->interleaved(0);
35 # Or initialize the object like this
36 $phylipstream = Bio::AlignIO->new(-interleaved => 0,
40 $gcgstream = Bio::AlignIO->new(-format => 'msf',
41 -file => 't/data/cysprot1a.msf');
43 while( my $aln = $gcgstream->next_aln ) {
44 $phylipstream->write_aln($aln);
47 This example shows how to read phylip format:
49 my $in = Bio::AlignIO->new(
56 my $out = Bio::AlignIO->new(
61 while ( my $aln = $in->next_aln() ) {
62 $out->write_aln($aln);
65 The -longid argument is required if the input phylip format file
66 has ids with lengths greater then 10 characters.
70 This object can transform Bio::SimpleAlign objects to and from PHYLIP
71 format. By default it works with the interleaved format. By specifying
72 the flag -interleaved =E<gt> 0 in the initialization the module can
73 read or write data in sequential format.
75 Reading phylip format with long IDs up to 50 characters is supported by
76 the flag -longid =E<gt>1. ID strings can be surrounded by single quotes.
77 They are mandatory only if the IDs contain spaces.
83 Please direct usage questions or support issues to the mailing list:
85 I<bioperl-l@bioperl.org>
87 rather than to the module maintainer directly. Many experienced and
88 reponsive experts will be able look at the problem and quickly
89 address it. Please include a thorough description of the problem
90 with code and data examples if at all possible.
94 Report bugs to the Bioperl bug tracking system to help us keep track
95 the bugs and their resolution. Bug reports can be submitted via the
98 https://github.com/bioperl/bioperl-live/issues
100 =head1 AUTHORS - Heikki Lehvaslaiho and Jason Stajich
102 Email: heikki at ebi.ac.uk
103 Email: jason at bioperl.org
107 The rest of the documentation details each of the object
108 methods. Internal methods are usually preceded with a _
112 # Let the code begin...
114 package Bio
::AlignIO
::phylip
;
115 use vars
qw($DEFAULTIDLENGTH $DEFAULTLINELEN $DEFAULTTAGLEN);
118 use Bio::SimpleAlign;
119 use POSIX; # for the rounding call
121 use base qw(Bio::AlignIO);
124 $DEFAULTIDLENGTH = 10;
125 $DEFAULTLINELEN = 60;
132 Usage : my $alignio = Bio::AlignIO->new(-format => 'phylip'
136 Function: Initialize a new L<Bio::AlignIO::phylip> reader or writer
137 Returns : L<Bio::AlignIO> object
138 Args : [specific for writing of phylip format files]
139 -idlength => integer - length of the id (will pad w/
140 spaces if needed) when writing phylip
141 -interleaved => boolean - whether interleaved
142 or sequential format required
143 -line_length => integer of how long a sequence lines should be
144 -idlinebreak => insert a line break after the sequence id
145 so that sequence starts on the next line
146 -flag_SI => whether or not write a "S" or "I" just after
147 the num.seq. and line len., in the first line
148 -tag_length => integer of how long the tags have to be in
149 each line between the space separator. set it
150 to 0 to have 1 tag only.
151 -wrap_sequential => boolean for whether or not sequential
152 format should be broken up or a single line
153 default is false (single line)
154 -longid => boolean to read arbitrary long IDs (default is false)
159 my ( $self, @args ) = @_;
160 $self->SUPER::_initialize
(@args);
162 my ( $interleave, $linelen, $idlinebreak,
163 $idlength, $flag_SI, $tag_length, $ws, $longid )
176 $self->interleaved( $interleave ?
1 : 0 ) if defined $interleave;
177 $self->idlength( $idlength || $DEFAULTIDLENGTH );
178 $self->id_linebreak(1) if ($idlinebreak);
179 $self->line_length($linelen) if defined $linelen && $linelen > 0;
180 $self->flag_SI(1) if ($flag_SI);
181 $self->tag_length($tag_length) if ( $tag_length || $DEFAULTTAGLEN );
182 $self->wrap_sequential( $ws ?
1 : 0 );
183 $self->longid( $longid ?
1 : 0 );
190 Usage : $aln = $stream->next_aln()
191 Function: returns the next alignment in the stream.
192 Throws an exception if trying to read in PHYLIP
194 Returns : L<Bio::SimpleAlign> object
202 my ($seqcount, $residuecount, %hash, $name,
203 $str, @names, $seqname, $start,
207 my $aln = Bio
::SimpleAlign
->new( -source
=> 'phylip' );
209 # First, parse up through the header.
210 # If we see a non-blank line that isn't the seqcount and residuecount line
211 # then bail out of next_aln (return)
212 while ( $entry = $self->_readline ) {
213 if ( $entry =~ /^\s?$/ ) {
215 } elsif ( $entry =~ /\s*(\d+)\s+(\d+)/ ) {
216 ( $seqcount, $residuecount ) = ( $1, $2 );
220 "Failed to parse PHYLIP: Did not see a sequence count and residue count."
225 return unless $seqcount and $residuecount;
227 # First alignment section. We expect to see a name and (part of) a sequence.
228 my $idlen = $self->idlength;
231 while ( $entry = $self->_readline ) {
232 if ( $entry =~ /^\s?$/ ) { # eat the newlines
236 # Names can be in a few different formats:
237 # 1. they can be traditional phylip: 10 chars long, period. If this is the case, that name can have spaces.
238 # 2. they can be hacked with a long ID, as passed in with the flag -longid.
239 # 3. if there is a long ID, the name can have spaces as long as it is wrapped in single quotes.
240 if ( $self->longid() ) { # 2 or 3
241 if ( $entry =~ /^'(.+)'\s+(.+)$/ ) { # 3. name has single quotes.
244 } else { # 2. name does not have single quotes, so should not have spaces.
245 # therefore, the first part of the line is the name and the rest is the seq.
246 # make sure that the line does not lead with extra spaces.
248 ( $name, $str ) = split( /\s+/, $entry, 2 );
250 } else { # 1. traditional phylip.
251 $entry =~ /^(.{1,10})\s(.+)$/;
254 $name =~ s/\s+$//; # eat any trailing spaces
259 #clean sequence of spaces:
262 # are we sequential? If so, we should keep adding to the sequence until we've got all the residues.
263 if ( ( $self->interleaved ) == 0 ) {
264 while ( length($str) < $residuecount ) {
265 $entry = $self->_readline;
268 if ( $entry =~ /^\s*$/ ) { # we ran into a newline before we got a complete sequence: bail!
270 "Failed to parse PHYLIP: Sequence $name was shorter than expected: "
272 . " instead of $residuecount." );
277 $hash{$count} = $str;
281 # if we've read as many seqs as we're supposed to, move on.
282 if ( $count == $seqcount ) {
287 # if we are interleaved, we're going to keep seeing chunks of sequence until we get all of it.
288 if ( $self->interleaved ) {
289 while ( length( $hash{ $seqcount - 1 } ) < $residuecount ) {
291 while ( $entry = $self->_readline ) {
292 if ( $entry =~ /^\s*$/ ) { # eat newlines
293 if ( $count != 0 ) { # there was a newline at an unexpected place!
295 "Failed to parse PHYLIP: Interleaved file is missing a segment: saw $count, expected $seqcount."
300 } else { # start taking in chunks
302 $hash{$count} .= $entry;
305 if ( $count >= $seqcount ) { # we've read all of the sequences for this chunk, so move on.
311 if ( ( scalar @names ) != $seqcount ) {
313 "Failed to parse PHYLIP: Did not see the correct number of seqs: saw "
315 . ", expected $seqcount." );
318 for ( $count = 0; $count < $seqcount; $count++ ) {
319 $str = $hash{$count};
320 my $seqname = $names[$count];
321 if ( length($str) != $residuecount ) {
323 "Failed to parse PHYLIP: Sequence $seqname was the wrong length: "
325 . " instead of $residuecount." );
327 $seq = Bio
::LocatableSeq
->new(
328 '-seq' => $hash{$count},
329 '-display_id' => $seqname
339 Usage : $stream->write_aln(@aln)
340 Function: writes the $aln object into the stream in phylip format
341 Returns : 1 for success and 0 for error
342 Args : L<Bio::Align::AlignI> object
347 my ( $self, @aln ) = @_;
351 my $width = $self->line_length();
352 my ($length, $date, $name, $seq, $miss,
353 $pad, %hash, @arr, $tempcount, $index,
354 $idlength, $flag_SI, $line_length, $tag_length
357 foreach my $aln (@aln) {
358 if ( ! $aln || ! $aln->isa('Bio::Align::AlignI') ) {
360 "Must provide a Bio::Align::AlignI object when calling write_aln"
364 $self->throw("All sequences in the alignment must be the same length")
365 unless $aln->is_flush(1);
367 $flag_SI = $self->flag_SI();
368 $aln->set_displayname_flat(); # plain
369 $length = $aln->length();
371 if ( $self->interleaved() ) {
374 " %s %s I\n", $aln->num_sequences, $aln->length
380 " %s %s S\n", $aln->num_sequences, $aln->length
386 sprintf( " %s %s\n", $aln->num_sequences, $aln->length ) );
389 $idlength = $self->idlength();
390 $line_length = $self->line_length();
391 $tag_length = $self->tag_length();
392 foreach $seq ( $aln->each_seq() ) {
393 $name = $aln->displayname( $seq->get_nse );
394 if ( $self->longid ) {
396 "The length of the name is over 50 chars long [$name]")
397 if length($name) > 50;
400 $name = substr( $name, 0, $idlength )
401 if length($name) > $idlength;
402 $name = sprintf( "%-" . $idlength . "s", $name );
403 if ( $self->interleaved() ) {
405 } elsif ( $self->id_linebreak ) {
410 #phylip needs dashes not dots
411 my $seq = $seq->seq();
417 if ( $self->interleaved() ) {
419 if ( $tag_length <= $line_length ) {
420 $numtags = floor
( $line_length / $tag_length );
421 $line_length = $tag_length * $numtags;
425 while ( $count < $length ) {
427 # there is another block to go!
428 foreach $name (@arr) {
429 my $dispname = $name;
430 $dispname = '' if $wrapped;
432 sprintf( "%" . ( $idlength + 3 ) . "s", $dispname ) );
435 $self->debug("residue count: $count\n")
436 if ( $count % 100000 == 0 );
437 while (( $tempcount + $tag_length < $length )
438 && ( $index < $numtags ) ) {
443 $hash{$name}, $tempcount, $tag_length
447 $tempcount += $tag_length;
452 if ( $index < $numtags ) {
457 substr( $hash{$name}, $tempcount ) )
459 $tempcount += $tag_length;
468 foreach $name (@arr) {
469 my $dispname = $name;
470 my $line = sprintf( "%s%s\n", $dispname, $hash{$name} );
471 if ( $self->wrap_sequential ) {
472 $line =~ s/(.{1,$width})/$1\n/g;
474 $self->_print($line);
478 $self->flush if $self->_flush_on_write && defined $self->_fh;
485 Usage : my $interleaved = $obj->interleaved
486 Function: Get/Set Interleaved status
494 my ( $self, $value ) = @_;
495 if ( defined $value ) {
497 $self->{'_interleaved'} = 1
499 $self->{'_interleaved'} = 0
502 return 1 unless defined $self->{'_interleaved'};
503 return $self->{'_interleaved'};
509 Usage : my $flag = $obj->flag_SI
510 Function: Get/Set if the Sequential/Interleaved flag has to be shown
511 after the number of sequences and sequence length
520 my ( $self, $value ) = @_;
521 my $previous = $self->{'_flag_SI'};
522 if ( defined $value ) {
523 $self->{'_flag_SI'} = $value;
531 Usage : my $idlength = $obj->idlength
532 Function: Get/Set value of id length
540 my ( $self, $value ) = @_;
541 if ( defined $value ) {
542 $self->{'_idlength'} = $value;
544 return $self->{'_idlength'};
550 Usage : $obj->line_length($newval)
552 Returns : value of line_length
553 Args : newvalue (optional)
559 my ( $self, $value ) = @_;
560 if ( defined $value ) {
561 $self->{'_line_length'} = $value;
563 return $self->{'_line_length'} || $DEFAULTLINELEN;
570 Usage : $obj->tag_length($newval)
572 Example : my $tag_length = $obj->tag_length
573 Returns : value of the length for each space-separated tag in a line
574 Args : newvalue (optional) - set to zero to have one tag per line
580 my ( $self, $value ) = @_;
581 if ( defined $value ) {
582 $self->{'_tag_length'} = $value;
584 return $self->{'_tag_length'} || $DEFAULTTAGLEN;
590 Usage : $obj->id_linebreak($newval)
592 Returns : value of id_linebreak
593 Args : newvalue (optional)
599 my ( $self, $value ) = @_;
600 if ( defined $value ) {
601 $self->{'_id_linebreak'} = $value;
603 return $self->{'_id_linebreak'} || 0;
606 =head2 wrap_sequential
608 Title : wrap_sequential
609 Usage : $obj->wrap_sequential($newval)
611 Returns : value of wrap_sequential
612 Args : newvalue (optional)
617 sub wrap_sequential
{
618 my ( $self, $value ) = @_;
619 if ( defined $value ) {
620 $self->{'_wrap_sequential'} = $value;
622 return $self->{'_wrap_sequential'} || 0;
628 Usage : $obj->longid($newval)
630 Returns : value of longid
631 Args : newvalue (optional)
637 my ( $self, $value ) = @_;
638 if ( defined $value ) {
639 $self->{'_longid'} = $value;
641 return $self->{'_longid'} || 0;