Bio::DB::TFBS namespace has been moved to its own distribution named after itself
[bioperl-live.git] / Bio / Search / HSP / HSPI.pm
bloba28b0c991fec1e52694cd21bd3904ef14967e099
1 #-----------------------------------------------------------------
3 # BioPerl module for Bio::Search::HSP::HSPI
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Steve Chervitz <sac@bioperl.org>
8 # and Jason Stajich <jason@bioperl.org>
10 # You may distribute this module under the same terms as perl itself
11 #-----------------------------------------------------------------
13 # POD documentation - main docs before the code
15 =head1 NAME
17 Bio::Search::HSP::HSPI - Interface for a High Scoring Pair in a similarity search result
19 =head1 SYNOPSIS
21 # Bio::Search::HSP::HSPI objects cannot be instantiated since this
22 # module defines a pure interface.
24 # Given an object that implements the Bio::Search::HSP::HSPI interface,
25 # you can do the following things with it:
27 $r_type = $hsp->algorithm;
29 $pvalue = $hsp->pvalue();
31 $evalue = $hsp->evalue();
33 $frac_id = $hsp->frac_identical( ['query'|'hit'|'total'] );
35 $frac_cons = $hsp->frac_conserved( ['query'|'hit'|'total'] );
37 $gaps = $hsp->gaps( ['query'|'hit'|'total'] );
39 $qseq = $hsp->query_string;
41 $hseq = $hsp->hit_string;
43 $homology_string = $hsp->homology_string;
45 $len = $hsp->length( ['query'|'hit'|'total'] );
47 $rank = $hsp->rank;
49 =head1 DESCRIPTION
51 Bio::Search::HSP::HSPI objects cannot be instantiated since this
52 module defines a pure interface.
54 Given an object that implements the L<Bio::Search::HSP::HSPI> interface,
55 you can do the following things with it:
57 =head1 SEE ALSO
59 This interface inherits methods from these other modules:
61 L<Bio::SeqFeatureI>,
62 L<Bio::SeqFeature::FeaturePair>
63 L<Bio::SeqFeature::SimilarityPair>
65 Please refer to these modules for documentation of the
66 many additional inherited methods.
68 =head1 FEEDBACK
70 =head2 Mailing Lists
72 User feedback is an integral part of the evolution of this and other
73 Bioperl modules. Send your comments and suggestions preferably to
74 the Bioperl mailing list. Your participation is much appreciated.
76 bioperl-l@bioperl.org - General discussion
77 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
79 =head2 Support
81 Please direct usage questions or support issues to the mailing list:
83 I<bioperl-l@bioperl.org>
85 rather than to the module maintainer directly. Many experienced and
86 reponsive experts will be able look at the problem and quickly
87 address it. Please include a thorough description of the problem
88 with code and data examples if at all possible.
90 =head2 Reporting Bugs
92 Report bugs to the Bioperl bug tracking system to help us keep track
93 of the bugs and their resolution. Bug reports can be submitted via the
94 web:
96 https://github.com/bioperl/bioperl-live/issues
98 =head1 AUTHOR - Steve Chervitz, Jason Stajich
100 Email sac-at-bioperl.org
101 Email jason-at-bioperl.org
103 =head1 COPYRIGHT
105 Copyright (c) 2001 Steve Chervitz, Jason Stajich. All Rights Reserved.
107 =head1 DISCLAIMER
109 This software is provided "as is" without warranty of any kind.
111 =head1 APPENDIX
113 The rest of the documentation details each of the object methods.
114 Internal methods are usually preceded with a _
116 =cut
119 # Let the code begin...
122 package Bio::Search::HSP::HSPI;
125 use strict;
126 use Carp;
128 use base qw(Bio::SeqFeature::SimilarityPair Bio::Root::RootI);
131 =head2 algorithm
133 Title : algorithm
134 Usage : my $r_type = $hsp->algorithm
135 Function: Obtain the name of the algorithm used to obtain the HSP
136 Returns : string (e.g., BLASTP)
137 Args : none
139 =cut
141 sub algorithm{
142 my ($self,@args) = @_;
143 $self->throw_not_implemented;
146 =head2 pvalue
148 Title : pvalue
149 Usage : my $pvalue = $hsp->pvalue();
150 Function: Returns the P-value for this HSP or undef
151 Returns : float or exponential (2e-10)
152 P-value is not defined with NCBI Blast2 reports.
153 Args : none
155 =cut
157 sub pvalue {
158 my ($self) = @_;
159 $self->throw_not_implemented;
162 =head2 evalue
164 Title : evalue
165 Usage : my $evalue = $hsp->evalue();
166 Function: Returns the e-value for this HSP
167 Returns : float or exponential (2e-10)
168 Args : none
170 =cut
172 sub evalue {
173 my ($self) = @_;
174 $self->throw_not_implemented;
178 =head2 frac_identical
180 Title : frac_identical
181 Usage : my $frac_id = $hsp->frac_identical( ['query'|'hit'|'total'] );
182 Function: Returns the fraction of identitical positions for this HSP
183 Returns : Float in range 0.0 -> 1.0
184 Args : 'query' = num identical / length of query seq (without gaps)
185 'hit' = num identical / length of hit seq (without gaps)
186 'total' = num identical / length of alignment (with gaps)
187 default = 'total'
189 =cut
191 sub frac_identical {
192 my ($self, $type) = @_;
193 $self->throw_not_implemented;
196 =head2 frac_conserved
198 Title : frac_conserved
199 Usage : my $frac_cons = $hsp->frac_conserved( ['query'|'hit'|'total'] );
200 Function : Returns the fraction of conserved positions for this HSP.
201 This is the fraction of symbols in the alignment with a
202 positive score.
203 Returns : Float in range 0.0 -> 1.0
204 Args : 'query' = num conserved / length of query seq (without gaps)
205 'hit' = num conserved / length of hit seq (without gaps)
206 'total' = num conserved / length of alignment (with gaps)
207 default = 'total'
209 =cut
211 sub frac_conserved {
212 my ($self, $type) = @_;
213 $self->throw_not_implemented;
216 =head2 num_identical
218 Title : num_identical
219 Usage : $obj->num_identical($newval)
220 Function: returns the number of identical residues in the alignment
221 Returns : integer
222 Args : integer (optional)
225 =cut
227 sub num_identical{
228 shift->throw_not_implemented;
231 =head2 num_conserved
233 Title : num_conserved
234 Usage : $obj->num_conserved($newval)
235 Function: returns the number of conserved residues in the alignment
236 Returns : inetger
237 Args : integer (optional)
240 =cut
242 sub num_conserved{
243 shift->throw_not_implemented();
246 =head2 gaps
248 Title : gaps
249 Usage : my $gaps = $hsp->gaps( ['query'|'hit'|'total'] );
250 Function : Get the number of gap characters in the query, hit, or total alignment.
251 Returns : Integer, number of gap characters or 0 if none
252 Args : 'query' = num conserved / length of query seq (without gaps)
253 'hit' = num conserved / length of hit seq (without gaps)
254 'total' = num conserved / length of alignment (with gaps)
255 default = 'total'
257 =cut
259 sub gaps {
260 my ($self, $type) = @_;
261 $self->throw_not_implemented;
264 =head2 query_string
266 Title : query_string
267 Usage : my $qseq = $hsp->query_string;
268 Function: Retrieves the query sequence of this HSP as a string
269 Returns : string
270 Args : none
273 =cut
275 sub query_string{
276 my ($self) = @_;
277 $self->throw_not_implemented;
280 =head2 hit_string
282 Title : hit_string
283 Usage : my $hseq = $hsp->hit_string;
284 Function: Retrieves the hit sequence of this HSP as a string
285 Returns : string
286 Args : none
289 =cut
291 sub hit_string{
292 my ($self) = @_;
293 $self->throw_not_implemented;
296 =head2 homology_string
298 Title : homology_string
299 Usage : my $homo_string = $hsp->homology_string;
300 Function: Retrieves the homology sequence for this HSP as a string.
301 : The homology sequence is the string of symbols in between the
302 : query and hit sequences in the alignment indicating the degree
303 : of conservation (e.g., identical, similar, not similar).
304 Returns : string
305 Args : none
307 =cut
309 sub homology_string{
310 my ($self) = @_;
311 $self->throw_not_implemented;
314 =head2 length
316 Title : length
317 Usage : my $len = $hsp->length( ['query'|'hit'|'total'] );
318 Function : Returns the length of the query or hit in the alignment (without gaps)
319 or the aggregate length of the HSP (including gaps;
320 this may be greater than either hit or query )
321 Returns : integer
322 Args : 'query' = length of query seq (without gaps)
323 'hit' = length of hit seq (without gaps)
324 'total' = length of alignment (with gaps)
325 default = 'total'
326 Args : none
328 =cut
330 sub length{
331 shift->throw_not_implemented();
334 =head2 percent_identity
336 Title : percent_identity
337 Usage : my $percentid = $hsp->percent_identity()
338 Function: Returns the calculated percent identity for an HSP
339 Returns : floating point between 0 and 100
340 Args : none
343 =cut
345 sub percent_identity{
346 my ($self) = @_;
347 return $self->frac_identical('hsp') * 100;
350 =head2 get_aln
352 Title : get_aln
353 Usage : my $aln = $hsp->get_aln
354 Function: Returns a Bio::SimpleAlign representing the HSP alignment
355 Returns : Bio::SimpleAlign
356 Args : none
358 =cut
360 sub get_aln {
361 my ($self) = @_;
362 $self->throw_not_implemented;
366 =head2 seq_inds
368 Title : seq_inds
369 Purpose : Get a list of residue positions (indices) for all identical
370 : or conserved residues in the query or sbjct sequence.
371 Example : @s_ind = $hsp->seq_inds('query', 'identical');
372 : @h_ind = $hsp->seq_inds('hit', 'conserved');
373 : @h_ind = $hsp->seq_inds('hit', 'conserved', 1);
374 Returns : List of integers
375 : May include ranges if collapse is true.
376 Argument : seq_type = 'query' or 'hit' or 'sbjct' (default = query)
377 ('sbjct' is synonymous with 'hit')
378 class = 'identical' or 'conserved' or 'nomatch' or 'gap'
379 (default = identical)
380 (can be shortened to 'id' or 'cons')
382 collapse = boolean, if true, consecutive positions are merged
383 using a range notation, e.g., "1 2 3 4 5 7 9 10 11"
384 collapses to "1-5 7 9-11". This is useful for
385 consolidating long lists. Default = no collapse.
386 Throws : n/a.
387 Comments :
389 See Also : L<Bio::Search::BlastUtils::collapse_nums()|Bio::Search::BlastUtils>, L<Bio::Search::Hit::HitI::seq_inds()|Bio::Search::Hit::HitI>
391 =cut
393 sub seq_inds {
394 shift->throw_not_implemented();
397 =head2 Inherited from L<Bio::SeqFeature::SimilarityPair>
399 These methods come from L<Bio::SeqFeature::SimilarityPair>
401 =head2 query
403 Title : query
404 Usage : my $query = $hsp->query
405 Function: Returns a SeqFeature representing the query in the HSP
406 Returns : Bio::SeqFeature::Similarity
407 Args : [optional] new value to set
410 =head2 hit
412 Title : hit
413 Usage : my $hit = $hsp->hit
414 Function: Returns a SeqFeature representing the hit in the HSP
415 Returns : Bio::SeqFeature::Similarity
416 Args : [optional] new value to set
419 =head2 significance
421 Title : significance
422 Usage : $evalue = $obj->significance();
423 $obj->significance($evalue);
424 Function: Get/Set the significance value (see Bio::SeqFeature::SimilarityPair)
425 Returns : significance value (scientific notation string)
426 Args : significance value (sci notation string)
429 =head2 score
431 Title : score
432 Usage : my $score = $hsp->score();
433 Function: Returns the score for this HSP or undef
434 Returns : numeric
435 Args : [optional] numeric to set value
437 =head2 bits
439 Title : bits
440 Usage : my $bits = $hsp->bits();
441 Function: Returns the bit value for this HSP or undef
442 Returns : numeric
443 Args : none
445 =cut
447 # override
449 =head2 strand
451 Title : strand
452 Usage : $hsp->strand('query')
453 Function: Retrieves the strand for the HSP component requested
454 Returns : +1 or -1 (0 if unknown)
455 Args : 'hit' or 'subject' or 'sbjct' to retrieve the strand of the subject
456 'query' to retrieve the query strand (default)
457 'list' or 'array' to retrieve both query and hit together
459 =cut
461 sub strand {
462 my $self = shift;
463 my $val = shift;
464 $val = 'query' unless defined $val;
465 $val =~ s/^\s+//;
467 if ( $val =~ /^q/i ) {
468 return $self->query->strand(@_);
470 elsif ( $val =~ /^hi|^s/i ) {
471 return $self->hit->strand(@_);
473 elsif ( $val =~ /^list|array/i ) {
475 # Do we really need to pass on additional arguments here? HL
476 # (formerly this was strand(shift) which is really bad coding because
477 # it breaks if the callee allows setting to undef)
478 return ( $self->query->strand(@_), $self->hit->strand(@_) );
480 else {
481 $self->warn("unrecognized component '$val' requested\n");
483 return 0;
486 =head2 start
488 Title : start
489 Usage : $hsp->start('query')
490 Function: Retrieves the start for the HSP component requested
491 Returns : integer
492 Args : 'hit' or 'subject' or 'sbjct' to retrieve the start of the subject
493 'query' to retrieve the query start (default)
495 =cut
497 sub start {
498 my $self = shift;
499 my $val = shift;
500 $val = 'query' unless defined $val;
501 $val =~ s/^\s+//;
503 if( $val =~ /^q/i ) {
504 return $self->query->start(@_);
505 } elsif( $val =~ /^(hi|s)/i ) {
506 return $self->hit->start(@_);
507 } elsif ( $val =~ /^list|array/i ) {
508 # do we really need to pass on additional arguments here? HL
509 # (formerly this was strand(shift) which is really bad coding because
510 # it breaks if the callee allows setting to undef)
511 return ($self->query->start(@_),
512 $self->hit->start(@_) );
513 } else {
514 $self->warn("unrecognized component '$val' requested\n");
516 return 0;
519 =head2 end
521 Title : end
522 Usage : $hsp->end('query')
523 Function: Retrieves the end for the HSP component requested
524 Returns : integer
525 Args : 'hit' or 'subject' or 'sbjct' to retrieve the end of the subject
526 'query' to retrieve the query end (default)
528 =cut
530 sub end {
531 my $self = shift;
532 my $val = shift;
533 $val = 'query' unless defined $val;
534 $val =~ s/^\s+//;
536 if( $val =~ /^q/i ) {
537 return $self->query->end(@_);
538 } elsif( $val =~ /^(hi|s)/i ) {
539 return $self->hit->end(@_);
540 } elsif ( $val =~ /^list|array/i ) {
541 # do we really need to pass on additional arguments here? HL
542 # (formerly this was strand(shift) which is really bad coding because
543 # it breaks if the callee allows setting to undef)
544 return ($self->query->end(@_),
545 $self->hit->end(@_) );
546 } else {
547 $self->warn("unrecognized end component '$val' requested\n");
549 return 0;
552 =head2 seq
554 Usage : $hsp->seq( [seq_type] );
555 Purpose : Get the query or sbjct sequence as a Bio::Seq.pm object.
556 Example : $seqObj = $hsp->seq('query');
557 Returns : Object reference for a Bio::Seq.pm object.
558 Argument : seq_type = 'query' or 'hit' or 'sbjct' (default = 'query').
559 : ('sbjct' is synonymous with 'hit')
560 : default is 'query'
561 Throws : Propagates any exception that occurs during construction
562 : of the Bio::Seq.pm object.
563 Comments : The sequence is returned in an array of strings corresponding
564 : to the strings in the original format of the Blast alignment.
565 : (i.e., same spacing).
567 See Also : L</seq_str>, L</seq_inds>, L<Bio::Seq>
569 =cut
571 sub seq {
572 my($self,$seqType) = @_;
573 $seqType ||= 'query';
574 $seqType = 'sbjct' if $seqType eq 'hit';
575 my $str = $self->seq_str($seqType);
576 if( $seqType =~ /^(m|ho)/i ) {
577 $self->throw("cannot call seq on the homology match string, it isn't really a sequence, use get_aln to convert the HSP to a Bio::AlignIO and generate a consensus from that.");
579 require Bio::LocatableSeq;
580 my $id = $seqType =~ /^q/i ? $self->query->seq_id : $self->hit->seq_id;
581 return Bio::LocatableSeq->new( -ID => $id,
582 -SEQ => $str,
583 -START => $self->start($seqType),
584 -END => $self->end($seqType),
585 -STRAND => $self->strand($seqType),
586 -FORCE_NSE => $id ? 0 : 1,
587 -DESC => "$seqType sequence " );
590 =head2 seq_str
592 Usage : $hsp->seq_str( seq_type );
593 Purpose : Get the full query, sbjct, or 'match' sequence as a string.
594 : The 'match' sequence is the string of symbols in between the
595 : query and sbjct sequences.
596 Example : $str = $hsp->seq_str('query');
597 Returns : String
598 Argument : seq_Type = 'query' or 'hit' or 'sbjct' or 'match'
599 : ('sbjct' is synonymous with 'hit')
600 : default is 'query'
601 Throws : Exception if the argument does not match an accepted seq_type.
602 Comments :
604 See Also : L</seq>, L</seq_inds>, C<_set_match_seq>
606 =cut
608 sub seq_str {
609 my $self = shift;
610 my $type = shift || 'query';
612 if( $type =~ /^q/i ) { return $self->query_string(@_) }
613 elsif( $type =~ /^(s)|(hi)/i ) { return $self->hit_string(@_)}
614 elsif ( $type =~ /^(ho)|(ma)/i ) { return $self->homology_string(@_) }
615 else {
616 $self->warn("unknown sequence type $type");
618 return '';
622 =head2 rank
624 Usage : $hsp->rank( [string] );
625 Purpose : Get the rank of the HSP within a given Blast hit.
626 Example : $rank = $hsp->rank;
627 Returns : Integer (1..n) corresponding to the order in which the HSP
628 appears in the BLAST report.
630 =cut
632 sub rank { shift->throw_not_implemented }
634 =head2 matches
636 Usage : $hsp->matches(-seq => 'hit'|'query',
637 -start => $start,
638 -stop => $stop);
639 Purpose : Get the total number of identical and conservative matches
640 : in the query or sbjct sequence for the given HSP. Optionally can
641 : report data within a defined interval along the seq.
642 : (Note: 'conservative' matches are called 'positives' in the
643 : Blast report.)
644 Example : ($id,$cons) = $hsp_object->matches(-seq => 'hit');
645 : ($id,$cons) = $hsp_object->matches(-seq => 'query',
646 -start => 300,
647 -stop => 400);
648 Returns : 2-element array of integers
649 Argument : (1) seq_type = 'query' or 'hit' or 'sbjct' (default = query)
650 : ('sbjct' is synonymous with 'hit')
651 : (2) start = Starting coordinate (optional)
652 : (3) stop = Ending coordinate (optional)
653 Throws : Exception if the supplied coordinates are out of range.
654 Comments : Relies on seq_str('match') to get the string of alignment symbols
655 : between the query and sbjct lines which are used for determining
656 : the number of identical and conservative matches.
658 See Also : L</length>, L</gaps>, L</seq_str>, L<Bio::Search::Hit::BlastHit::_adjust_contigs()|Bio::Search::Hit::BlastHit>
660 =cut
662 #-----------
663 sub matches {
664 #-----------
665 my( $self, %param ) = @_;
666 my(@data);
667 my($seqType, $beg, $end) = ($param{-SEQ},
668 $param{-START},
669 $param{-STOP});
670 $seqType ||= 'query';
671 $seqType = 'sbjct' if $seqType eq 'hit';
673 if( (!defined $beg && !defined $end) || ! $self->seq_str('match') ) {
674 ## Get data for the whole alignment.
675 push @data, ($self->num_identical, $self->num_conserved);
676 } else {
678 ## Get the substring representing the desired sub-section of aln.
679 $beg ||= 0;
680 $end ||= 0;
681 my($start,$stop) = $self->range($seqType);
682 if($beg == 0) { $beg = $start; $end = $beg+$end; } # sane?
683 elsif($end == 0) { $end = $stop; $beg = $end-$beg; } # sane?
685 if($end > $stop) { $end = $stop; }
686 if($beg < $start) { $beg = $start; }
688 # now with gap handling! /maj
689 my $match_str = $self->seq_str('match');
690 if ($self->gaps) {
691 # strip the homology string of gap positions relative
692 # to the target type
693 $match_str = $self->seq_str('match');
694 my $tgt = $self->seq_str($seqType);
695 my $encode = $match_str ^ $tgt;
696 my $zap = '-'^' ';
697 $encode =~ s/$zap//g;
698 $tgt =~ s/-//g;
699 $match_str = $tgt ^ $encode;
702 ## ML: START fix for substr out of range error ------------------
703 my $seq = "";
704 if (($self->algorithm =~ /TBLAST[NX]/) && ($seqType eq 'sbjct'))
706 $seq = substr($match_str,
707 int(($beg-$start)/3),
708 int(($end-$beg+1)/3));
710 } elsif (($self->algorithm =~ /T?BLASTX/) && ($seqType eq 'query')) {
711 $seq = substr($match_str,
712 int(($beg-$start)/3), int(($end-$beg+1)/3));
713 } else {
714 $seq = substr($match_str,
715 $beg-$start, ($end-$beg+1));
717 ## ML: End of fix for substr out of range error -----------------
719 if(!CORE::length $seq) {
720 $self->throw("Undefined sub-sequence ($beg,$end). Valid range = $start - $stop");
723 $seq =~ s/ //g; # remove space (no info).
724 my $len_cons = CORE::length $seq;
725 $seq =~ s/\+//g; # remove '+' characters (conservative substitutions)
726 my $len_id = CORE::length $seq;
727 push @data, ($len_id, $len_cons);
729 @data;
732 =head2 n
734 Usage : $hsp_obj->n()
735 Purpose : Get the N value (num HSPs on which P/Expect is based).
736 : This value is not defined with NCBI Blast2 with gapping.
737 Returns : Integer or null string if not defined.
738 Argument : n/a
739 Throws : n/a
740 Comments : The 'N' value is listed in parenthesis with P/Expect value:
741 : e.g., P(3) = 1.2e-30 ---> (N = 3).
742 : Not defined in NCBI Blast2 with gaps.
743 : This typically is equal to the number of HSPs but not always.
744 : To obtain the number of HSPs, use Bio::Search::Hit::HitI::num_hsps().
746 See Also : L<Bio::SeqFeature::SimilarityPair::score()|Bio::SeqFeature::SimilarityPair>
748 =cut
750 sub n { shift->throw_not_implemented }
752 =head2 range
754 Usage : $hsp->range( [seq_type] );
755 Purpose : Gets the (start, end) coordinates for the query or sbjct sequence
756 : in the HSP alignment.
757 Example : ($query_beg, $query_end) = $hsp->range('query');
758 : ($hit_beg, $hit_end) = $hsp->range('hit');
759 Returns : Two-element array of integers
760 Argument : seq_type = string, 'query' or 'hit' or 'sbjct' (default = 'query')
761 : ('sbjct' is synonymous with 'hit')
762 Throws : n/a
763 Comments : This is a convenience method for constructions such as
764 ($hsp->query->start, $hsp->query->end)
766 =cut
768 sub range { shift->throw_not_implemented }
770 sub expect { shift->evalue(@_) }