Bio::DB::Universal: move into its own distribution
[bioperl-live.git] / Bio / Tools / pSW.pm
blob5a80b0a0ae448737569da7146c2adf061e557fcd
3 # BioPerl module for Bio::Tools::pSW
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Ewan Birney <birney@sanger.ac.uk>
9 # Copyright Ewan Birney
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
15 =head1 NAME
17 Bio::Tools::pSW - pairwise Smith Waterman object
19 =head1 SYNOPSIS
21 use Bio::Tools::pSW;
22 use Bio::AlignIO;
23 my $factory = Bio::Tools::pSW->new( '-matrix' => 'blosum62.bla',
24 '-gap' => 12,
25 '-ext' => 2,
28 #use the factory to make some output
30 $factory->align_and_show($seq1,$seq2,STDOUT);
32 # make a Bio::SimpleAlign and do something with it
34 my $aln = $factory->pairwise_alignment($seq1,$seq2);
35 my $alnout = Bio::AlignIO->new(-format => 'msf',
36 -fh => \*STDOUT);
38 $alnout->write_aln($aln);
40 =head1 INSTALLATION
42 This module is included with the central Bioperl distribution:
44 http://bio.perl.org/Core/Latest
45 ftp://bio.perl.org/pub/DIST
47 Follow the installation instructions included in the INSTALL file.
49 =head1 DESCRIPTION
51 pSW is an Alignment Factory for protein sequences. It builds pairwise
52 alignments using the Smith-Waterman algorithm. The alignment algorithm is
53 implemented in C and added in using an XS extension. The XS extension basically
54 comes from the Wise2 package, but has been slimmed down to only be the
55 alignment part of that (this is a good thing!). The XS extension comes
56 from the bioperl-ext package which is distributed along with bioperl.
57 I<Warning:> This package will not work if you have not compiled the
58 bioperl-ext package.
60 The mixture of C and Perl is ideal for this sort of
61 problem. Here are some plus points for this strategy:
63 =over 2
65 =item Speed and Memory
67 The algorithm is actually implemented in C, which means it is faster than
68 a pure perl implementation (I have never done one, so I have no idea
69 how faster) and will use considerably less memory, as it efficiently
70 assigns memory for the calculation.
72 =item Algorithm efficiency
74 The algorithm was written using Dynamite, and so contains an automatic
75 switch to the linear space divide-and-conquer method. This means you
76 could effectively align very large sequences without killing your machine
77 (it could take a while though!).
79 =back
81 =head1 FEEDBACK
83 =head2 Mailing Lists
85 User feedback is an integral part of the evolution of this and other
86 Bioperl modules. Send your comments and suggestions preferably to one
87 of the Bioperl mailing lists. Your participation is much appreciated.
89 bioperl-l@bioperl.org - General discussion
90 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
92 =head2 Support
94 Please direct usage questions or support issues to the mailing list:
96 I<bioperl-l@bioperl.org>
98 rather than to the module maintainer directly. Many experienced and
99 reponsive experts will be able look at the problem and quickly
100 address it. Please include a thorough description of the problem
101 with code and data examples if at all possible.
103 =head2 Reporting Bugs
105 Report bugs to the Bioperl bug tracking system to help us keep track
106 the bugs and their resolution. Bug reports can be submitted via the
107 web:
109 https://github.com/bioperl/bioperl-live/issues
111 =head1 AUTHOR
113 Ewan Birney, birney-at-sanger.ac.uk or birney-at-ebi.ac.uk
115 =head1 CONTRIBUTORS
117 Jason Stajich, jason-at-bioperl.org
119 =head1 APPENDIX
121 The rest of the documentation details each of the object
122 methods. Internal methods are usually preceded with an underscore "_".
124 =cut
126 # Let the code begin...
128 package Bio::Tools::pSW;
129 use strict;
130 no strict ( 'refs');
132 BEGIN {
133 eval {
134 require Bio::Ext::Align;
136 if ( $@ ) {
137 die("\nThe C-compiled engine for Smith Waterman alignments (Bio::Ext::Align) has not been installed.\n Please read the install the bioperl-ext package\n\n");
138 exit(1);
142 use Bio::SimpleAlign;
145 use base qw(Bio::Tools::AlignFactory);
149 sub new {
150 my($class,@args) = @_;
152 my $self = $class->SUPER::new(@args);
154 my($matrix,$gap,$ext) = $self->_rearrange([qw(MATRIX
157 )],@args);
159 #default values - we have to load matrix into memory, so
160 # we need to check it out now
161 if( ! defined $matrix || !($matrix =~ /\w/) ) {
162 $matrix = 'blosum62.bla';
165 $self->matrix($matrix); # will throw exception if it can't load it
166 $self->gap(12) unless defined $gap;
167 $self->ext(2) unless defined $ext;
169 # I'm pretty sure I am not doing this right... ho hum...
170 # This was not roght ($gap and $ext could not be 0) It is fixed now /AE
171 if( defined $gap ) {
172 if( $gap =~ /^\d+$/ ) {
173 $self->gap($gap);
174 } else {
175 $self->throw("Gap penalty must be a number, not [$gap]");
178 if( defined $ext ) {
179 if( $ext =~ /^\d+$/ ) {
180 $self->ext($ext);
181 } else {
182 $self->throw("Extension penalty must be a number, not [$ext]");
186 return $self;
190 =head2 pairwise_alignment
192 Title : pairwise_alignment
193 Usage : $aln = $factory->pairwise_alignment($seq1,$seq2)
194 Function: Makes a SimpleAlign object from two sequences
195 Returns : A SimpleAlign object
196 Args :
199 =cut
201 sub pairwise_alignment{
202 my ($self,$seq1,$seq2) = @_;
203 my($t1,$t2,$aln,$out,@str1,@str2,@ostr1,@ostr2,$alc,$tstr,$tid,$start1,$end1,$start2,$end2,$alctemp);
205 if( ! defined $seq1 || ! $seq1->isa('Bio::PrimarySeqI') ||
206 ! defined $seq2 || ! $seq2->isa('Bio::PrimarySeqI') ) {
207 $self->warn("Cannot call pairwise_alignment without specifing 2 sequences (Bio::PrimarySeqI objects)");
208 return;
210 # fix Jitterbug #1044
211 if( $seq1->length() < 2 ||
212 $seq2->length() < 2 ) {
213 $self->warn("cannot align sequences with length less than 2");
214 return;
216 $self->set_memory_and_report();
217 # create engine objects
218 $seq1->display_id('seq1') unless ( defined $seq1->id() );
219 $seq2->display_id('seq2') unless ( defined $seq2->id() );
221 $t1 = &Bio::Ext::Align::new_Sequence_from_strings($seq1->id(),
222 $seq1->seq());
223 $t2 = &Bio::Ext::Align::new_Sequence_from_strings($seq2->id(),
224 $seq2->seq());
225 $aln = &Bio::Ext::Align::Align_Sequences_ProteinSmithWaterman($t1,$t2,$self->{'matrix'},-$self->gap,-$self->ext);
226 if( ! defined $aln || $aln == 0 ) {
227 $self->throw("Unable to build an alignment");
230 # free sequence engine objects
232 $t1 = $t2 = 0;
234 # now we have to get into the AlnBlock structure and
235 # figure out what is aligned to what...
237 # we are going to need the sequences as arrays for convience
239 @str1 = split(//, $seq1->seq());
240 @str2 = split(//, $seq2->seq());
242 # get out start points
244 # The alignment is in alignment coordinates - ie the first
245 # residues starts at -1 and ends at 0. (weird I know).
246 # bio-coordinates are +2 from this...
248 $start1 = $aln->start()->alu(0)->start +2;
249 $start2 = $aln->start()->alu(1)->start +2;
251 # step along the linked list of alc units...
253 for($alc = $aln->start();$alc->at_end() != 1;$alc = $alc->next()) {
254 if( $alc->alu(0)->text_label eq 'SEQUENCE' ) {
255 push(@ostr1,$str1[$alc->alu(0)->start+1]);
256 } else {
257 # assume it is in insert!
258 push(@ostr1,'-');
261 if( $alc->alu(1)->text_label eq 'SEQUENCE' ) {
262 push(@ostr2,$str2[$alc->alu(1)->start+1]);
263 } else {
264 # assume it is in insert!
265 push(@ostr2,'-');
267 $alctemp = $alc;
271 # get out end points
274 # end points = real residue end in 'C' coordinates = residue
275 # end in biocoordinates. Oh... the wonder of coordinate systems!
277 $end1 = $alctemp->alu(0)->end+1;
278 $end2 = $alctemp->alu(1)->end+1;
280 # get rid of the alnblock
281 $alc = 0;
282 $aln = 0;
284 # new SimpleAlignment
285 $out = Bio::SimpleAlign->new(); # new SimpleAlignment
287 $tstr = join('',@ostr1);
288 $tid = $seq1->id();
289 $out->add_seq(Bio::LocatableSeq->new( -seq=> $tstr,
290 -start => $start1,
291 -end => $end1,
292 -id=>$tid ));
294 $tstr = join('',@ostr2);
295 $tid = $seq2->id();
296 $out->add_seq(Bio::LocatableSeq->new( -seq=> $tstr,
297 -start => $start2,
298 -end => $end2,
299 -id=> $tid ));
301 # give'm back the alignment
303 return $out;
306 =head2 align_and_show
308 Title : align_and_show
309 Usage : $factory->align_and_show($seq1,$seq2,STDOUT)
311 =cut
313 sub align_and_show {
314 my($self,$seq1,$seq2,$fh) = @_;
315 my($t1,$t2,$aln,$id,$str);
317 if( ! defined $seq1 || ! $seq1->isa('Bio::PrimarySeqI') ||
318 ! defined $seq2 || ! $seq2->isa('Bio::PrimarySeqI') ) {
319 $self->warn("Cannot call align_and_show without specifing 2 sequences (Bio::PrimarySeqI objects)");
320 return;
322 # fix Jitterbug #1044
323 if( $seq1->length() < 2 ||
324 $seq2->length() < 2 ) {
325 $self->warn("cannot align sequences with length less than 2");
326 return;
328 if( ! defined $fh ) {
329 $fh = \*STDOUT;
331 $self->set_memory_and_report();
332 $seq1->display_id('seq1') unless ( defined $seq1->id() );
333 $seq2->display_id('seq2') unless ( defined $seq2->id() );
335 $t1 = &Bio::Ext::Align::new_Sequence_from_strings($seq1->id(),$seq1->seq());
337 $t2 = &Bio::Ext::Align::new_Sequence_from_strings($seq2->id(),$seq2->seq());
338 $aln = &Bio::Ext::Align::Align_Sequences_ProteinSmithWaterman($t1,$t2,$self->{'matrix'},-$self->gap,-$self->ext);
339 if( ! defined $aln || $aln == 0 ) {
340 $self->throw("Unable to build an alignment");
343 &Bio::Ext::Align::write_pretty_seq_align($aln,$t1,$t2,12,50,$fh);
347 =head2 matrix
349 Title : matrix()
350 Usage : $factory->matrix('blosum62.bla');
351 Function : Reads in comparison matrix based on name
353 Returns :
354 Argument : comparison matrix
356 =cut
358 sub matrix {
359 my($self,$comp) = @_;
360 my $temp;
362 if( !defined $comp ) {
363 $self->throw("You must have a comparison matrix to set!");
366 # talking to the engine here...
368 $temp = &Bio::Ext::Align::CompMat::read_Blast_file_CompMat($comp);
370 if( !(defined $temp) || $temp == 0 ) {
371 $self->throw("$comp cannot be read as a BLAST comparison matrix file");
374 $self->{'matrix'} = $temp;
379 =head2 gap
381 Title : gap
382 Usage : $gap = $factory->gap() #get
383 : $factory->gap($value) #set
384 Function : the set get for the gap penalty
385 Example :
386 Returns : gap value
387 Arguments : new value
389 =cut
391 sub gap {
392 my ($self,$val) = @_;
395 if( defined $val ) {
396 if( $val < 0 ) { # Fixed so that gap==0 is allowed /AE
397 $self->throw("Can't have a gap penalty less than 0");
399 $self->{'gap'} = $val;
401 return $self->{'gap'};
405 =head2 ext
407 Title : ext
408 Usage : $ext = $factory->ext() #get
409 : $factory->ext($value) #set
410 Function : the set get for the ext penalty
411 Example :
412 Returns : ext value
413 Arguments : new value
415 =cut
417 sub ext {
418 my ($self,$val) = @_;
420 if( defined $val ) {
421 if( $val < 0 ) { # Fixed so that gap==0 is allowed /AE
422 $self->throw("Can't have a gap penalty less than 0");
424 $self->{'ext'} = $val;
426 return $self->{'ext'};