2 # BioPerl module for Bio::AlignIO::msf
3 # based on the Bio::SeqIO::msf module
4 # by Ewan Birney <birney@ebi.ac.uk>
5 # and Lincoln Stein <lstein@cshl.org>
7 # and the SimpleAlign.pm module of Ewan Birney
9 # Copyright Peter Schattner
11 # You may distribute this module under the same terms as perl itself
14 # POD documentation - main docs before the code
18 Bio::AlignIO::msf - msf sequence input/output stream
22 Do not use this module directly. Use it via the L<Bio::AlignIO> class.
26 This object can transform L<Bio::Align::AlignI> objects to and from msf format.
33 Please direct usage questions or support issues to the mailing list:
35 I<bioperl-l@bioperl.org>
37 rather than to the module maintainer directly. Many experienced and
38 responsive experts will be able to look at the problem and quickly
39 address it. Please include a thorough description of the problem
40 with code and data examples if at all possible.
44 Report bugs to the Bioperl bug tracking system to help us keep track
45 of the bugs and their resolution. Bug reports can be submitted via the web:
48 https://github.com/bioperl/bioperl-live/issues
50 =head1 AUTHORS - Peter Schattner
52 Email: schattner@alum.mit.edu
57 The rest of the documentation details each of the object
58 methods. Internal methods are usually preceded with a _
62 # Let the code begin...
64 package Bio
::AlignIO
::msf
;
66 use vars
qw(%valid_type);
69 use Bio::SeqIO::gcg; # for GCG_checksum()
72 use base qw(Bio::AlignIO);
75 %valid_type = qw( dna N rna N protein P );
81 Usage : $aln = $stream->next_aln()
82 Function: returns the next alignment in the stream. Tries to read *all* MSF input.
83 It reads all non-whitespace characters in the alignment
84 area. For MSFs with weird gaps (eg ~~~) map them by using
85 $aln->map_chars('~','-')
86 Returns : Bio::Align::AlignI object
94 my (%hash,$name,$str,@names,$seqname,$start,$end,$count,$seq);
96 my $aln = Bio
::SimpleAlign
->new(-source
=> 'gcg' );
98 while( $entry = $self->_readline) {
99 $entry =~ m{//} && last; # move to alignment section
100 $entry =~ /Name:\s+(\S+)/ && do { $name = $1;
101 $hash{$name} = ""; # blank line
102 push(@names,$name); # we need it ordered!
109 while( $entry = $self->_readline) {
110 next if ( $entry =~ /^\s+(\d+)/ ) ;
111 $entry =~ /^\s*(\S+)\s+(.*)$/ && do {
114 if( ! exists $hash{$name} ) {
115 $self->throw("$name exists as an alignment line but not in the header. Not confident of what is going on!");
119 $hash{$name} .= $str;
123 return if @names < 1;
125 # now got this as a name - sequence hash. Let's make some sequences!
127 for $name ( @names ) {
128 if( $name =~ m{(\S+)/(\d+)-(\d+)} ) {
136 $str =~ s/[^0-9A-Za-z$Bio::LocatableSeq::OTHER_SYMBOLS]//g;
141 $seq = Bio
::LocatableSeq
->new('-seq' => $hash{$name},
142 '-display_id' => $seqname,
145 '-alphabet' => $self->alphabet,
149 # If $end <= 0, we have either reached the end of
150 # file in <> or we have encountered some other error
153 return $aln if $aln->num_sequences;
161 Usage : $stream->write_aln(@aln)
162 Function: writes the $aln object into the stream in MSF format
163 Sequence type of the alignment is determined by the first sequence.
164 Returns : 1 for success and 0 for error
165 Args : Bio::Align::AlignI object
171 my ($self,@aln) = @_;
176 my ($length,$date,$name,$seq,$miss,$pad,%hash,@arr,$tempcount,$index);
177 foreach my $aln (@aln) {
178 if( ! $aln || ! $aln->isa('Bio::Align::AlignI') ) {
179 $self->warn("Must provide a Bio::Align::AlignI object when calling write_aln");
182 $date = localtime(time);
184 $type = $valid_type{$aln->get_seq_by_pos(1)->alphabet};
185 $maxname = $aln->maxdisplayname_length();
186 $length = $aln->length();
188 if( !defined $name ) {
192 $self->_print (sprintf("\n%s MSF: %d Type: %s %s Check: 00 ..\n\n",
193 $name, $aln->num_sequences, $type, $date));
195 my $seqCountFormat = "%".($maxname > 20 ?
$maxname + 2: 22)."s%-27d%27d\n";
196 my $seqNameFormat = "%-".($maxname > 20 ?
$maxname : 20)."s ";
198 foreach $seq ( $aln->each_seq() ) {
199 $name = $aln->displayname($seq->get_nse());
200 $miss = $maxname - length ($name);
204 $self->_print (sprintf(" Name: %s%sLen: %d Check: %d Weight: 1.00\n",$name,$pad,length $seq->seq(), Bio
::SeqIO
::gcg
->GCG_checksum($seq)));
206 $hash{$name} = $seq->seq();
209 # ok - heavy handed, but there you go.
211 $self->_print ("\n//\n\n\n");
213 while( $count < $length ) {
214 # there is another block to go!
215 $self->_print (sprintf($seqCountFormat,' ',$count+1,$count+50));
216 foreach $name ( @arr ) {
217 $self->_print (sprintf($seqNameFormat,$name));
221 while( ($tempcount + 10 < $length) && ($index < 5) ) {
223 $self->_print (sprintf("%s ",substr($hash{$name},
229 # ok, could be the very last guy ;)
234 $self->_print (sprintf("%s ",substr($hash{$name},$tempcount)));
237 $self->_print ("\n");
239 $self->_print ("\n\n");
243 $self->flush if $self->_flush_on_write && defined $self->_fh;