3 # BioPerl module for Bio::Tools::pSW
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Ewan Birney <birney@sanger.ac.uk>
9 # Copyright Ewan Birney
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
17 Bio::Tools::pSW - pairwise Smith Waterman object
23 my $factory = Bio::Tools::pSW->new( '-matrix' => 'blosum62.bla',
28 #use the factory to make some output
30 $factory->align_and_show($seq1,$seq2,STDOUT);
32 # make a Bio::SimpleAlign and do something with it
34 my $aln = $factory->pairwise_alignment($seq1,$seq2);
35 my $alnout = Bio::AlignIO->new(-format => 'msf',
38 $alnout->write_aln($aln);
42 This module is included with the central Bioperl distribution:
44 http://bio.perl.org/Core/Latest
45 ftp://bio.perl.org/pub/DIST
47 Follow the installation instructions included in the INSTALL file.
51 pSW is an Alignment Factory for protein sequences. It builds pairwise
52 alignments using the Smith-Waterman algorithm. The alignment algorithm is
53 implemented in C and added in using an XS extension. The XS extension basically
54 comes from the Wise2 package, but has been slimmed down to only be the
55 alignment part of that (this is a good thing!). The XS extension comes
56 from the bioperl-ext package which is distributed along with bioperl.
57 I<Warning:> This package will not work if you have not compiled the
60 The mixture of C and Perl is ideal for this sort of
61 problem. Here are some plus points for this strategy:
65 =item Speed and Memory
67 The algorithm is actually implemented in C, which means it is faster than
68 a pure perl implementation (I have never done one, so I have no idea
69 how faster) and will use considerably less memory, as it efficiently
70 assigns memory for the calculation.
72 =item Algorithm efficiency
74 The algorithm was written using Dynamite, and so contains an automatic
75 switch to the linear space divide-and-conquer method. This means you
76 could effectively align very large sequences without killing your machine
77 (it could take a while though!).
85 User feedback is an integral part of the evolution of this and other
86 Bioperl modules. Send your comments and suggestions preferably to one
87 of the Bioperl mailing lists. Your participation is much appreciated.
89 bioperl-l@bioperl.org - General discussion
90 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
94 Please direct usage questions or support issues to the mailing list:
96 I<bioperl-l@bioperl.org>
98 rather than to the module maintainer directly. Many experienced and
99 reponsive experts will be able look at the problem and quickly
100 address it. Please include a thorough description of the problem
101 with code and data examples if at all possible.
103 =head2 Reporting Bugs
105 Report bugs to the Bioperl bug tracking system to help us keep track
106 the bugs and their resolution. Bug reports can be submitted via the
109 https://github.com/bioperl/bioperl-live/issues
113 Ewan Birney, birney-at-sanger.ac.uk or birney-at-ebi.ac.uk
117 Jason Stajich, jason-at-bioperl.org
121 The rest of the documentation details each of the object
122 methods. Internal methods are usually preceded with an underscore "_".
126 # Let the code begin...
128 package Bio
::Tools
::pSW
;
134 require Bio
::Ext
::Align
;
137 die("\nThe C-compiled engine for Smith Waterman alignments (Bio::Ext::Align) has not been installed.\n Please read the install the bioperl-ext package\n\n");
142 use Bio
::SimpleAlign
;
145 use base
qw(Bio::Tools::AlignFactory);
150 my($class,@args) = @_;
152 my $self = $class->SUPER::new
(@args);
154 my($matrix,$gap,$ext) = $self->_rearrange([qw(MATRIX
159 #default values - we have to load matrix into memory, so
160 # we need to check it out now
161 if( ! defined $matrix || !($matrix =~ /\w/) ) {
162 $matrix = 'blosum62.bla';
165 $self->matrix($matrix); # will throw exception if it can't load it
166 $self->gap(12) unless defined $gap;
167 $self->ext(2) unless defined $ext;
169 # I'm pretty sure I am not doing this right... ho hum...
170 # This was not roght ($gap and $ext could not be 0) It is fixed now /AE
172 if( $gap =~ /^\d+$/ ) {
175 $self->throw("Gap penalty must be a number, not [$gap]");
179 if( $ext =~ /^\d+$/ ) {
182 $self->throw("Extension penalty must be a number, not [$ext]");
190 =head2 pairwise_alignment
192 Title : pairwise_alignment
193 Usage : $aln = $factory->pairwise_alignment($seq1,$seq2)
194 Function: Makes a SimpleAlign object from two sequences
195 Returns : A SimpleAlign object
201 sub pairwise_alignment
{
202 my ($self,$seq1,$seq2) = @_;
203 my($t1,$t2,$aln,$out,@str1,@str2,@ostr1,@ostr2,$alc,$tstr,$tid,$start1,$end1,$start2,$end2,$alctemp);
205 if( ! defined $seq1 || ! $seq1->isa('Bio::PrimarySeqI') ||
206 ! defined $seq2 || ! $seq2->isa('Bio::PrimarySeqI') ) {
207 $self->warn("Cannot call pairwise_alignment without specifing 2 sequences (Bio::PrimarySeqI objects)");
210 # fix Jitterbug #1044
211 if( $seq1->length() < 2 ||
212 $seq2->length() < 2 ) {
213 $self->warn("cannot align sequences with length less than 2");
216 $self->set_memory_and_report();
217 # create engine objects
218 $seq1->display_id('seq1') unless ( defined $seq1->id() );
219 $seq2->display_id('seq2') unless ( defined $seq2->id() );
221 $t1 = &Bio
::Ext
::Align
::new_Sequence_from_strings
($seq1->id(),
223 $t2 = &Bio
::Ext
::Align
::new_Sequence_from_strings
($seq2->id(),
225 $aln = &Bio
::Ext
::Align
::Align_Sequences_ProteinSmithWaterman
($t1,$t2,$self->{'matrix'},-$self->gap,-$self->ext);
226 if( ! defined $aln || $aln == 0 ) {
227 $self->throw("Unable to build an alignment");
230 # free sequence engine objects
234 # now we have to get into the AlnBlock structure and
235 # figure out what is aligned to what...
237 # we are going to need the sequences as arrays for convience
239 @str1 = split(//, $seq1->seq());
240 @str2 = split(//, $seq2->seq());
242 # get out start points
244 # The alignment is in alignment coordinates - ie the first
245 # residues starts at -1 and ends at 0. (weird I know).
246 # bio-coordinates are +2 from this...
248 $start1 = $aln->start()->alu(0)->start +2;
249 $start2 = $aln->start()->alu(1)->start +2;
251 # step along the linked list of alc units...
253 for($alc = $aln->start();$alc->at_end() != 1;$alc = $alc->next()) {
254 if( $alc->alu(0)->text_label eq 'SEQUENCE' ) {
255 push(@ostr1,$str1[$alc->alu(0)->start+1]);
257 # assume it is in insert!
261 if( $alc->alu(1)->text_label eq 'SEQUENCE' ) {
262 push(@ostr2,$str2[$alc->alu(1)->start+1]);
264 # assume it is in insert!
274 # end points = real residue end in 'C' coordinates = residue
275 # end in biocoordinates. Oh... the wonder of coordinate systems!
277 $end1 = $alctemp->alu(0)->end+1;
278 $end2 = $alctemp->alu(1)->end+1;
280 # get rid of the alnblock
284 # new SimpleAlignment
285 $out = Bio
::SimpleAlign
->new(); # new SimpleAlignment
287 $tstr = join('',@ostr1);
289 $out->add_seq(Bio
::LocatableSeq
->new( -seq
=> $tstr,
294 $tstr = join('',@ostr2);
296 $out->add_seq(Bio
::LocatableSeq
->new( -seq
=> $tstr,
301 # give'm back the alignment
306 =head2 align_and_show
308 Title : align_and_show
309 Usage : $factory->align_and_show($seq1,$seq2,STDOUT)
314 my($self,$seq1,$seq2,$fh) = @_;
315 my($t1,$t2,$aln,$id,$str);
317 if( ! defined $seq1 || ! $seq1->isa('Bio::PrimarySeqI') ||
318 ! defined $seq2 || ! $seq2->isa('Bio::PrimarySeqI') ) {
319 $self->warn("Cannot call align_and_show without specifing 2 sequences (Bio::PrimarySeqI objects)");
322 # fix Jitterbug #1044
323 if( $seq1->length() < 2 ||
324 $seq2->length() < 2 ) {
325 $self->warn("cannot align sequences with length less than 2");
328 if( ! defined $fh ) {
331 $self->set_memory_and_report();
332 $seq1->display_id('seq1') unless ( defined $seq1->id() );
333 $seq2->display_id('seq2') unless ( defined $seq2->id() );
335 $t1 = &Bio
::Ext
::Align
::new_Sequence_from_strings
($seq1->id(),$seq1->seq());
337 $t2 = &Bio
::Ext
::Align
::new_Sequence_from_strings
($seq2->id(),$seq2->seq());
338 $aln = &Bio
::Ext
::Align
::Align_Sequences_ProteinSmithWaterman
($t1,$t2,$self->{'matrix'},-$self->gap,-$self->ext);
339 if( ! defined $aln || $aln == 0 ) {
340 $self->throw("Unable to build an alignment");
343 &Bio
::Ext
::Align
::write_pretty_seq_align
($aln,$t1,$t2,12,50,$fh);
350 Usage : $factory->matrix('blosum62.bla');
351 Function : Reads in comparison matrix based on name
354 Argument : comparison matrix
359 my($self,$comp) = @_;
362 if( !defined $comp ) {
363 $self->throw("You must have a comparison matrix to set!");
366 # talking to the engine here...
368 $temp = &Bio
::Ext
::Align
::CompMat
::read_Blast_file_CompMat
($comp);
370 if( !(defined $temp) || $temp == 0 ) {
371 $self->throw("$comp cannot be read as a BLAST comparison matrix file");
374 $self->{'matrix'} = $temp;
382 Usage : $gap = $factory->gap() #get
383 : $factory->gap($value) #set
384 Function : the set get for the gap penalty
387 Arguments : new value
392 my ($self,$val) = @_;
396 if( $val < 0 ) { # Fixed so that gap==0 is allowed /AE
397 $self->throw("Can't have a gap penalty less than 0");
399 $self->{'gap'} = $val;
401 return $self->{'gap'};
408 Usage : $ext = $factory->ext() #get
409 : $factory->ext($value) #set
410 Function : the set get for the ext penalty
413 Arguments : new value
418 my ($self,$val) = @_;
421 if( $val < 0 ) { # Fixed so that gap==0 is allowed /AE
422 $self->throw("Can't have a gap penalty less than 0");
424 $self->{'ext'} = $val;
426 return $self->{'ext'};