Bio::DB::TFBS namespace has been moved to its own distribution named after itself
[bioperl-live.git] / Bio / AlignIO / clustalw.pm
blob052a50e147af9b603a020103c0b2b842bf13338c
2 # BioPerl module for Bio::AlignIO::clustalw
4 # based on the Bio::SeqIO modules
5 # by Ewan Birney <birney@ebi.ac.uk>
6 # and Lincoln Stein <lstein@cshl.org>
7 # and the Bio::SimpleAlign module of Ewan Birney
9 # Copyright Peter Schattner
11 # You may distribute this module under the same terms as perl itself
12 # History
13 # September 5, 2000
14 # POD documentation - main docs before the code
16 =head1 NAME
18 Bio::AlignIO::clustalw - clustalw sequence input/output stream
20 =head1 SYNOPSIS
22 Do not use this module directly. Use it via the Bio::AlignIO class.
24 =head1 DESCRIPTION
26 This object can transform Bio::Align::AlignI objects to and from clustalw
27 files.
29 =head1 FEEDBACK
31 =head2 Mailing Lists
33 User feedback is an integral part of the evolution of this and other
34 Bioperl modules. Send your comments and suggestions preferably to one
35 of the Bioperl mailing lists. Your participation is much appreciated.
37 bioperl-l@bioperl.org - General discussion
38 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
40 =head2 Support
42 Please direct usage questions or support issues to the mailing list:
44 I<bioperl-l@bioperl.org>
46 rather than to the module maintainer directly. Many experienced and
47 reponsive experts will be able look at the problem and quickly
48 address it. Please include a thorough description of the problem
49 with code and data examples if at all possible.
51 =head2 Reporting Bugs
53 Report bugs to the Bioperl bug tracking system to help us keep track
54 the bugs and their resolution. Bug reports can be submitted via the
55 web:
57 https://github.com/bioperl/bioperl-live/issues
59 =head1 AUTHORS - Peter Schattner
61 Email: schattner@alum.mit.edu
64 =head1 APPENDIX
66 The rest of the documentation details each of the object
67 methods. Internal methods are usually preceded with a _
69 =cut
71 # Let the code begin...
73 package Bio::AlignIO::clustalw;
74 use vars qw($LINELENGTH $CLUSTALPRINTVERSION);
75 use strict;
78 $LINELENGTH = 60;
79 $CLUSTALPRINTVERSION = '1.81';
80 use base qw(Bio::AlignIO);
82 =head2 new
84 Title : new
85 Usage : $alignio = Bio::AlignIO->new(-format => 'clustalw',
86 -file => 'filename');
87 Function: returns a new Bio::AlignIO object to handle clustalw files
88 Returns : Bio::AlignIO::clustalw object
89 Args : -verbose => verbosity setting (-1, 0, 1, 2)
90 -file => name of file to read in or to write, with ">"
91 -fh => alternative to -file param - provide a filehandle
92 to read from or write to
93 -format => alignment format to process or produce
94 -percentages => display a percentage of identity
95 in each line of the alignment (clustalw only)
96 -linelength=> alignment output line length (default 60)
98 =cut
100 sub _initialize {
101 my ( $self, @args ) = @_;
102 $self->SUPER::_initialize(@args);
103 my ( $percentages, $ll ) =
104 $self->_rearrange( [qw(PERCENTAGES LINELENGTH)], @args );
105 defined $percentages && $self->percentages($percentages);
106 $self->line_length( $ll || $LINELENGTH );
109 =head2 next_aln
111 Title : next_aln
112 Usage : $aln = $stream->next_aln()
113 Function: returns the next alignment in the stream
114 Returns : Bio::Align::AlignI object
115 Args : NONE
117 See L<Bio::Align::AlignI> for details
119 =cut
121 sub next_aln {
122 my ($self) = @_;
123 my $first_line;
125 while ( $first_line = $self->_readline ) {
126 last if $first_line !~ /^$/;
128 $self->_pushback($first_line);
129 if ( defined( $first_line = $self->_readline )
130 && $first_line !~ /CLUSTAL/ )
132 $self->throw(
133 "trying to parse a file which does not start with a CLUSTAL header"
136 my %alignments;
137 my $aln = Bio::SimpleAlign->new(
138 -source => 'clustalw',
139 -verbose => $self->verbose
141 my $order = 0;
142 my %order;
143 $self->{_lastline} = '';
144 my ($first_block, $seen_block) = (0,0);
145 while ( defined( $_ = $self->_readline ) ) {
146 next if (/^\s+$/ && !$first_block);
147 if (/^\s$/) { # line contains no description
148 $seen_block = 1;
149 next;
151 $first_block = 1;
152 # break the loop if we come to the end of the current alignment
153 # and push back the CLUSTAL header
154 if (/CLUSTAL/) {
155 $self->_pushback($_);
156 last;
159 my ( $seqname, $aln_line ) = ( '', '' );
160 if (/^\s*(\S+)\s*\/\s*(\d+)-(\d+)\s+(\S+)\s*$/ox) {
162 # clustal 1.4 format
163 ( $seqname, $aln_line ) = ( "$1:$2-$3", $4 );
165 # } elsif( /^\s*(\S+)\s+(\S+)\s*$/ox ) { without trailing numbers
167 elsif (/^\s*(\S+)\s+(\S+)\s*\d*\s*$/ox) { # with numbers
168 ( $seqname, $aln_line ) = ( $1, $2 );
169 if ( $seqname =~ /^[\*\.\+\:]+$/ ) {
170 $self->{_lastline} = $_;
171 next;
174 else {
175 $self->{_lastline} = $_;
176 next;
179 if ( !$seen_block ) {
180 if (exists $order{$seqname}) {
181 $self->warn("Duplicate sequence : $seqname\n".
182 "Can't guarantee alignment quality");
184 else {
185 $order{$seqname} = $order++;
189 $alignments{$seqname} .= $aln_line;
192 my ( $sname, $start, $end );
193 foreach my $name ( sort { $order{$a} <=> $order{$b} } keys %alignments ) {
194 if ( $name =~ /(\S+):(\d+)-(\d+)/ ) {
195 ( $sname, $start, $end ) = ( $1, $2, $3 );
197 else {
198 ( $sname, $start ) = ( $name, 1 );
199 my $str = $alignments{$name};
200 $str =~ s/[^A-Za-z]//g;
201 $end = length($str);
203 my $seq = Bio::LocatableSeq->new
205 '-seq' => $alignments{$name},
206 '-display_id' => $sname,
207 '-start' => $start,
208 '-end' => $end,
209 '-alphabet' => $self->alphabet,
211 $aln->add_seq($seq);
214 # not sure if this should be a default option - or we can pass in
215 # an option to do this in the future? --jason stajich
216 # $aln->map_chars('\.','-');
218 # no sequences added, so just return
219 return $aln if $aln->num_sequences;
220 return;
223 =head2 write_aln
225 Title : write_aln
226 Usage : $stream->write_aln(@aln)
227 Function: writes the clustalw-format object (.aln) into the stream
228 Returns : 1 for success and 0 for error
229 Args : Bio::Align::AlignI object
231 =cut
233 sub write_aln {
234 my ( $self, @aln ) = @_;
235 my ( $count, $length, $seq, @seq, $tempcount, $line_len );
236 $line_len = $self->line_length || $LINELENGTH;
237 foreach my $aln (@aln) {
238 if ( !$aln || !$aln->isa('Bio::Align::AlignI') ) {
239 $self->warn(
240 "Must provide a Bio::Align::AlignI object when calling write_aln"
242 next;
244 my $matchline = $aln->match_line;
245 if ( $self->force_displayname_flat ) {
246 $aln->set_displayname_flat(1);
248 $self->_print(
249 sprintf( "CLUSTAL W (%s) multiple sequence alignment\n\n\n",
250 $CLUSTALPRINTVERSION )
251 ) or return;
252 $length = $aln->length();
253 $count = $tempcount = 0;
254 @seq = $aln->each_seq();
255 my $max = 22;
256 foreach $seq (@seq) {
257 $max = length( $aln->displayname( $seq->get_nse() ) )
258 if ( length( $aln->displayname( $seq->get_nse() ) ) > $max );
261 while ( $count < $length ) {
262 my ( $linesubstr, $first ) = ( '', 1 );
263 foreach $seq (@seq) {
266 # Following lines are to suppress warnings
267 # if some sequences in the alignment are much longer than others.
269 my ($substring);
270 my $seqchars = $seq->seq();
271 SWITCH: {
272 if ( length($seqchars) >= ( $count + $line_len ) ) {
273 $substring = substr( $seqchars, $count, $line_len );
274 if ($first) {
275 $linesubstr =
276 substr( $matchline, $count, $line_len );
277 $first = 0;
279 last SWITCH;
281 elsif ( length($seqchars) >= $count ) {
282 $substring = substr( $seqchars, $count );
283 if ($first) {
284 $linesubstr = substr( $matchline, $count );
285 $first = 0;
287 last SWITCH;
289 $substring = "";
291 $self->_print(
292 sprintf(
293 "%-" . $max . "s %s\n",
294 $aln->displayname( $seq->get_nse() ), $substring
296 ) or return;
299 my $percentages = '';
300 if ( $self->percentages ) {
301 my ($strcpy) = ($linesubstr);
302 my $count = ( $strcpy =~ tr/\*// );
303 $percentages =
304 sprintf( "\t%d%%", 100 * ( $count / length($linesubstr) ) );
306 $self->_print(
307 sprintf(
308 "%-" . $max . "s %s%s\n",
309 '', $linesubstr, $percentages
312 $self->_print( sprintf("\n\n") ) or return;
313 $count += $line_len;
316 $self->flush if $self->_flush_on_write && defined $self->_fh;
317 return 1;
320 =head2 percentages
322 Title : percentages
323 Usage : $obj->percentages($newval)
324 Function: Set the percentages flag - whether or not to show percentages in
325 each output line
326 Returns : value of percentages
327 Args : newvalue (optional)
330 =cut
332 sub percentages {
333 my ( $self, $value ) = @_;
334 if ( defined $value ) {
335 $self->{'_percentages'} = $value;
337 return $self->{'_percentages'};
340 =head2 line_length
342 Title : line_length
343 Usage : $obj->line_length($newval)
344 Function: Set the alignment output line length
345 Returns : value of line_length
346 Args : newvalue (optional)
349 =cut
351 sub line_length {
352 my ( $self, $value ) = @_;
353 if ( defined $value ) {
354 $self->{'_line_length'} = $value;
356 return $self->{'_line_length'};