Bio::DB::TFBS namespace has been moved to its own distribution named after itself
[bioperl-live.git] / Bio / Search / Hit / HitI.pm
blob89e65c88d3a40fd4724752b76517dca2e9a52839
1 #-----------------------------------------------------------------
3 # BioPerl module Bio::Search::Hit::HitI
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Steve Chervitz <sac@bioperl.org>
9 # Originally created by Aaron Mackey <amackey@virginia.edu>
11 # You may distribute this module under the same terms as perl itself
12 #-----------------------------------------------------------------
14 # POD documentation - main docs before the code
16 =head1 NAME
18 Bio::Search::Hit::HitI - Interface for a hit in a similarity search result
20 =head1 SYNOPSIS
22 # Bio::Search::Hit::HitI objects should not be instantiated since this
23 # module defines a pure interface.
25 # Given an object that implements the Bio::Search::Hit::HitI interface,
26 # you can do the following things with it:
28 # Get a HitI object from a SearchIO stream:
29 use Bio::SearchIO;
30 my $searchio = Bio::SearchIO->new(-format => 'blast', -file => 'result.bls');
31 my $result = $searchio->next_result;
32 my $hit = $result->next_hit;
34 $hit_name = $hit->name();
36 $desc = $hit->description();
38 $len = $hit->length();
40 $alg = $hit->algorithm();
42 $score = $hit->raw_score();
44 $significance = $hit->significance();
46 $rank = $hit->rank(); # the Nth hit for a specific query
48 while( $hsp = $hit->next_hsp()) { ... } # process in iterator fashion
50 for my $hsp ( $hit->hsps()) { ... } # process in list fashion
52 =head1 DESCRIPTION
54 Bio::Search::Hit::* objects are data structures that contain information
55 about specific hits obtained during a library search. Some information will
56 be algorithm-specific, but others will be generally defined.
58 =head1 FEEDBACK
60 =head2 Mailing Lists
62 User feedback is an integral part of the evolution of this and other
63 Bioperl modules. Send your comments and suggestions preferably to one
64 of the Bioperl mailing lists. Your participation is much appreciated.
66 bioperl-l@bioperl.org - General discussion
67 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
69 =head2 Support
71 Please direct usage questions or support issues to the mailing list:
73 I<bioperl-l@bioperl.org>
75 rather than to the module maintainer directly. Many experienced and
76 responsive experts will be able to look at the problem and quickly
77 address it. Please include a thorough description of the problem
78 with code and data examples if at all possible.
80 =head2 Reporting Bugs
82 Report bugs to the Bioperl bug tracking system to help us keep track
83 of the bugs and their resolution. Bug reports can be submitted via the
84 web:
86 https://github.com/bioperl/bioperl-live/issues
88 =head1 AUTHOR - Aaron Mackey, Steve Chervitz
90 Email amackey@virginia.edu (original author)
91 Email sac@bioperl.org
93 =head1 COPYRIGHT
95 Copyright (c) 1999-2001 Aaron Mackey, Steve Chervitz. All Rights Reserved.
97 =head1 DISCLAIMER
99 This software is provided "as is" without warranty of any kind.
101 =head1 APPENDIX
103 The rest of the documentation details each of the object
104 methods. Internal methods are usually preceded with a _
106 =cut
108 # Let the code begin...
110 package Bio::Search::Hit::HitI;
113 use strict;
115 use base qw(Bio::Root::RootI);
118 =head2 name
120 Title : name
121 Usage : $hit_name = $hit->name();
122 Function: returns the name of the Hit sequence
123 Returns : a scalar string
124 Args : none
126 The B<name> of a hit is unique within a Result or within an Iteration.
128 =cut
sub name {
    # Abstract interface method: this base version only delegates to the
    # inherited throw_not_implemented (presumably raising a "not implemented"
    # error -- confirm in Bio::Root::RootI). Implementing classes override it
    # to return the hit name as described in the POD above.
    my $self = shift;
    return $self->throw_not_implemented;
}
135 =head2 description
137 Title : description
138 Usage : $desc = $hit->description();
139 Function: Retrieve the description for the hit
140 Returns : a scalar string
141 Args : none
143 =cut
sub description {
    # Abstract interface method: delegates to the inherited
    # throw_not_implemented on the invocant; extra arguments are ignored.
    # Implementations return the hit description (see POD above).
    my $self = shift;
    return $self->throw_not_implemented;
}
151 =head2 accession
153 Title : accession
154 Usage : $acc = $hit->accession();
155 Function: Retrieve the accession (if available) for the hit
156 Returns : a scalar string (empty string if not set)
157 Args : none
159 =cut
sub accession {
    # Abstract interface method: delegates to the inherited
    # throw_not_implemented on the invocant; extra arguments are ignored.
    # Implementations return the accession string (see POD above).
    my $self = shift;
    return $self->throw_not_implemented;
}
166 =head2 locus
168 Title : locus
169 Usage : $locus = $hit->locus();
170 Function: Retrieve the locus (if available) for the hit
171 Returns : a scalar string (empty string if not set)
172 Args : none
174 =cut
sub locus {
    # Abstract interface method: delegates to the inherited
    # throw_not_implemented on the invocant; extra arguments are ignored.
    # Implementations return the locus string (see POD above).
    my $self = shift;
    return $self->throw_not_implemented;
}
181 =head2 length
183 Title : length
184 Usage : my $len = $hit->length
185 Function: Returns the length of the hit
186 Returns : integer
187 Args : none
189 =cut
sub length {
    # Abstract interface method: delegates to the inherited
    # throw_not_implemented on the invocant; extra arguments are ignored.
    # Note this is a method named like the length() builtin, so it is only
    # reachable through method-call syntax ($hit->length).
    my $self = shift;
    return $self->throw_not_implemented;
}
197 =head2 algorithm
199 Title : algorithm
200 Usage : $alg = $hit->algorithm();
201 Function: Gets the algorithm specification that was used to obtain the hit
202 For BLAST, the algorithm denotes what type of sequence was aligned
203 against what (BLASTN: dna-dna, BLASTP prt-prt, BLASTX translated
204 dna-prt, TBLASTN prt-translated dna, TBLASTX translated
205 dna-translated dna).
206 Returns : a scalar string
207 Args : none
209 =cut
sub algorithm {
    # Abstract interface method: delegates to the inherited
    # throw_not_implemented on the invocant; extra arguments are ignored.
    # Implementations return the algorithm name (see POD above).
    my $self = shift;
    return $self->throw_not_implemented;
}
216 =head2 raw_score
218 Title : raw_score
219 Usage : $score = $hit->raw_score();
220 Function: Gets the "raw score" generated by the algorithm. What
221 this score is exactly will vary from algorithm to algorithm,
222 returning undef if unavailable.
223 Returns : a scalar value
224 Args : none
226 =cut
sub raw_score {
    # Abstract interface method: delegates to the inherited
    # throw_not_implemented on the invocant. Implementations return the
    # algorithm-specific raw score (see POD above).
    my ($self) = @_;
    return $self->throw_not_implemented;
}
232 =head2 score
234 Equivalent to L<raw_score()|raw_score>
236 =cut
sub score {
    # Convenience alias: forwards the invocant and any arguments to
    # raw_score() and hands back whatever that returns, in the caller's
    # context.
    my ($self, @args) = @_;
    return $self->raw_score(@args);
}
240 =head2 significance
242 Title : significance
243 Usage : $significance = $hit->significance();
244 Function: Used to obtain the E or P value of a hit, i.e. the probability that
245 this particular hit was obtained purely by random chance. If
246 information is not available (nor calculable from other
247 information sources), return undef.
248 Returns : a scalar value or undef if unavailable
249 Args : none
251 =cut
sub significance {
    # Abstract interface method: delegates to the inherited
    # throw_not_implemented on the invocant. Implementations return the
    # E or P value, or undef when unavailable (see POD above).
    my ($self) = @_;
    return $self->throw_not_implemented;
}
257 =head2 bits
259 Usage : $hit_object->bits();
260 Purpose : Gets the bit score of the best HSP for the current hit.
261 Example : $bits = $hit_object->bits();
262 Returns : Integer or double for FASTA reports
263 Argument : n/a
264 Comments : For BLAST1, the non-bit score is listed in the summary line.
266 See Also : L<score()|score>
268 =cut
270 #---------
sub bits {
    # Abstract interface method: delegates to the inherited
    # throw_not_implemented on the invocant. Implementations return the
    # bit score of the best HSP (see POD above).
    my ($self) = @_;
    return $self->throw_not_implemented();
}
276 =head2 next_hsp
278 Title : next_hsp
279 Usage : while( $hsp = $obj->next_hsp()) { ... }
280 Function : Returns the next available High Scoring Pair
281 Example :
282 Returns : L<Bio::Search::HSP::HSPI> object or null if finished
283 Args : none
285 =cut
sub next_hsp {
    # Abstract interface method: delegates to the inherited
    # throw_not_implemented on the invocant; extra arguments are ignored.
    # Implementations act as the HSP iterator (see POD above).
    my $self = shift;
    return $self->throw_not_implemented;
}
293 =head2 hsps
295 Usage : $hit_object->hsps();
296 Purpose : Get a list containing all HSP objects.
297 : Get the numbers of HSPs for the current hit.
298 Example : @hsps = $hit_object->hsps();
299 : $num = $hit_object->hsps(); # alternatively, use num_hsps()
300 Returns : Array context : list of L<Bio::Search::HSP::BlastHSP> objects.
301 : Scalar context: integer (number of HSPs).
302 : (Equivalent to num_hsps()).
303 Argument : n/a. Relies on wantarray
304 Throws : Exception if the HSPs have not been collected.
306 See Also : L<hsp()|hsp>, L<num_hsps()|num_hsps>
308 =cut
310 #---------
sub hsps {
    # Abstract interface method: delegates to the inherited
    # throw_not_implemented on the invocant. Implementations return the
    # HSP list, or its size in scalar context (see POD above).
    my ($self) = @_;
    return $self->throw_not_implemented();
}
320 =head2 num_hsps
322 Usage : $hit_object->num_hsps();
323 Purpose : Get the number of HSPs for the present Blast hit.
324 Example : $nhsps = $hit_object->num_hsps();
325 Returns : Integer
326 Argument : n/a
327 Throws : Exception if the HSPs have not been collected.
329 See Also : L<hsps()|hsps>
331 =cut
333 #-------------
sub num_hsps {
    # Abstract interface method: delegates to the inherited
    # throw_not_implemented on the invocant. Implementations return the
    # number of HSPs (see POD above).
    my ($self) = @_;
    return $self->throw_not_implemented();
}
340 =head2 seq_inds
342 Usage : $hit->seq_inds( seq_type, class, collapse );
343 Purpose : Get a list of residue positions (indices) across all HSPs
344 : for identical or conserved residues in the query or sbjct sequence.
345 Example : @s_ind = $hit->seq_inds('query', 'identical');
346 : @h_ind = $hit->seq_inds('hit', 'conserved');
347 : @h_ind = $hit->seq_inds('hit', 'conserved', 1);
348 Returns : Array of integers
349 : May include ranges if collapse is non-zero.
350 Argument : [0] seq_type = 'query' or 'hit' or 'sbjct' (default = 'query')
351 : ('sbjct' is synonymous with 'hit')
352 : [1] class = 'identical' or 'conserved' (default = 'identical')
353 : (can be shortened to 'id' or 'cons')
354 : (actually, anything not 'id' will evaluate to 'conserved').
355 : [2] collapse = boolean, if non-zero, consecutive positions are merged
356 : using a range notation, e.g., "1 2 3 4 5 7 9 10 11"
357 : collapses to "1-5 7 9-11". This is useful for
358 : consolidating long lists. Default = no collapse.
359 Throws : n/a.
361 See Also : L<Bio::Search::HSP::HSPI::seq_inds()|Bio::Search::HSP::HSPI>
363 =cut
365 #-------------
sub seq_inds {
    # Merge the residue positions reported by every HSP of this hit into a
    # single ascending, duplicate-free list (argument semantics are given in
    # the POD above).
    my ($self, $seqType, $class, $collapse) = @_;

    # Any false or missing argument falls back to its documented default.
    $seqType  = 'query'     unless $seqType;
    $class    = 'identical' unless $class;
    $collapse = 0           unless $collapse;

    # 'hit' is accepted as a synonym; the HSP objects are queried as 'sbjct'.
    $seqType = 'sbjct' if $seqType eq 'hit';

    # Pool the positions from all HSPs (hsps() is called in list context).
    my @inds = map { $_->seq_inds($seqType, $class) } $self->hsps;

    # Remove duplicates through hash keys, then restore numeric order.
    if (@inds) {
        my %seen;
        @seen{@inds} = ();
        @inds = sort { $a <=> $b } keys %seen;
    }

    # Optionally condense consecutive positions into range notation. The
    # caller's context applies to whichever branch is returned, so a scalar
    # caller of the uncollapsed branch receives the element count.
    return $collapse
        ? &Bio::Search::BlastUtils::collapse_nums(@inds)
        : @inds;
}
391 =head2 rewind
393 Title : rewind
394 Usage : $hit->rewind;
395 Function: Allow one to reset the HSP iterator to the beginning
396 if possible
397 Returns : none
398 Args : none
400 =cut
sub rewind {
    # Abstract interface method: delegates to the inherited
    # throw_not_implemented on the invocant. Implementations reset the
    # HSP iterator (see POD above).
    my $self = shift;
    return $self->throw_not_implemented();
}
408 =head2 overlap
410 Usage : $hit_object->overlap( [integer] );
411 Purpose : Gets/Sets the allowable amount of overlap between different HSP sequences.
412 Example : $hit_object->overlap(5);
413 : $overlap = $hit_object->overlap;
414 Returns : Integer.
415 Argument : integer.
416 Throws : n/a
417 Status : Experimental
418 Comments : Any two HSPs whose sequences overlap by less than or equal
419 : to the overlap() number of residues will be considered separate HSPs
420 : and will not get tiled by L<Bio::Search::BlastUtils::_adjust_contigs()>.
422 See Also : L<Bio::Search::BlastUtils::_adjust_contigs()|Bio::Search::BlastUtils>, L<BUGS | BUGS>
424 =cut
426 #-------------
sub overlap {
    # Abstract interface method: delegates to the inherited
    # throw_not_implemented on the invocant. Implementations get/set the
    # allowed HSP overlap (see POD above).
    my ($self) = @_;
    return $self->throw_not_implemented;
}
430 =head2 n
432 Usage : $hit_object->n();
433 Purpose : Gets the N number for the current Blast hit.
434 : This is the number of HSPs in the set which was ascribed
435 : the lowest P-value (listed on the description line).
436 : This number is not the same as the total number of HSPs.
437 : To get the total number of HSPs, use num_hsps().
438 Example : $n = $hit_object->n();
439 Returns : Integer
440 Argument : n/a
441 Throws : Exception if HSPs have not been set (BLAST2 reports).
442 Comments : Note that the N parameter is not reported in gapped BLAST2.
443 : Calling n() on such reports will result in a call to num_hsps().
444 : The num_hsps() method will count the actual number of
445 : HSPs in the alignment listing, which may exceed N in
446 : some cases.
448 See Also : L<num_hsps()|num_hsps>
450 =cut
452 #-----
sub n {
    # Abstract interface method: delegates to the inherited
    # throw_not_implemented on the invocant. Implementations return the
    # BLAST "N" value (see POD above).
    my ($self) = @_;
    return $self->throw_not_implemented;
}
455 =head2 p
457 Usage : $hit_object->p( [format] );
458 Purpose : Get the P-value for the best HSP of the given BLAST hit.
459 : (Note that P-values are not provided with NCBI Blast2 reports).
460 Example : $p = $sbjct->p;
461 : $p = $sbjct->p('exp'); # get exponent only.
462 : ($num, $exp) = $sbjct->p('parts'); # split sci notation into parts
463 Returns : Float or scientific notation number (the raw P-value, DEFAULT).
464 : Integer if format == 'exp' (the magnitude of the base 10 exponent).
465 : 2-element list (float, int) if format == 'parts' and P-value
466 : is in scientific notation (See Comments).
467 Argument : format: string of 'raw' | 'exp' | 'parts'
468 : 'raw' returns value given in report. Default. (1.2e-34)
469 : 'exp' returns exponent value only (34)
470 : 'parts' returns the decimal and exponent as a
471 : 2-element list (1.2, -34) (See Comments).
472 Throws : Warns if no P-value is defined. Uses expect instead.
473 Comments : Using the 'parts' argument is not recommended since it will not
474 : work as expected if the P-value is not in scientific notation.
475 : That is, floats are not converted into sci notation before
476 : splitting into parts.
478 See Also : L<expect()|expect>, L<signif()|signif>, L<Bio::Search::BlastUtils::get_exponent()|Bio::Search::BlastUtils>
480 =cut
482 #--------
sub p {
    # Abstract interface method: delegates to the inherited
    # throw_not_implemented on the invocant; the optional format argument
    # is not consulted here. Implementations return the P-value (see POD
    # above).
    my ($self) = @_;
    return $self->throw_not_implemented();
}
485 =head2 hsp
487 Usage : $hit_object->hsp( [string] );
488 Purpose : Get a single HSPI object for the present HitI object.
489 Example : $hspObj = $hit_object->hsp; # same as 'best'
490 : $hspObj = $hit_object->hsp('best');
491 : $hspObj = $hit_object->hsp('worst');
492 Returns : Object reference for a L<Bio::Search::HSP::HSPI> object.
493 Argument : String (or no argument).
494 : No argument (default) = highest scoring HSP (same as 'best').
495 : 'best' or 'first' = highest scoring HSP.
496 : 'worst' or 'last' = lowest scoring HSP.
497 Throws : Exception if the HSPs have not been collected.
498 : Exception if an unrecognized argument is used.
500 See Also : L<hsps()|hsps>, L<num_hsps()|num_hsps>
502 =cut
504 #----------
sub hsp {
    # Abstract interface method: delegates to the inherited
    # throw_not_implemented on the invocant; the optional selector argument
    # is not consulted here. Implementations return a single HSP object
    # (see POD above).
    my ($self) = @_;
    return $self->throw_not_implemented;
}
507 =head2 logical_length
509 Usage : $hit_object->logical_length( [seq_type] );
510 : (mostly intended for internal use).
511 Purpose : Get the logical length of the hit sequence.
512 : If the Blast is a TBLASTN or TBLASTX, the returned length
513 : is the length of the would-be amino acid sequence (length/3).
514 : For all other BLAST flavors, this function is the same as length().
515 Example : $len = $hit_object->logical_length();
516 Returns : Integer
517 Argument : seq_type = 'query' or 'hit' or 'sbjct' (default = 'query')
518 ('sbjct' is synonymous with 'hit')
519 Throws : n/a
520 Comments : This is important for functions like frac_aligned_query()
521 : which need to operate in amino acid coordinate space when dealing
522 : with [T]BLAST[NX] type reports.
524 See Also : L<length()|length>, L<frac_aligned_query()|frac_aligned_query>, L<frac_aligned_hit()|frac_aligned_hit>
526 =cut
528 #--------------------
sub logical_length {
    # Abstract interface method: delegates to the inherited
    # throw_not_implemented on the invocant; the optional seq_type argument
    # is not consulted here. Implementations return the logical sequence
    # length (see POD above).
    my ($self) = @_;
    return $self->throw_not_implemented();
}
532 =head2 rank
534 Title : rank
535 Usage : $obj->rank($newval)
536 Function: Get/Set the rank of this Hit in the Query search list
537 i.e. this is the Nth hit for a specific query
538 Returns : value of rank
539 Args : newvalue (optional)
542 =cut
sub rank {
    # Abstract interface method: delegates to the inherited
    # throw_not_implemented on the invocant; the optional new value is
    # ignored here. Implementations get/set the hit rank (see POD above).
    my $self = shift;
    return $self->throw_not_implemented();
}
549 =head2 each_accession_number
551 Title : each_accession_number
552 Usage : $obj->each_accession_number
553 Function: Get each accession number listed in the description of the hit.
554 If there are no alternatives, then only the primary accession will
555 be given
556 Returns : list of all accession numbers in the description
557 Args : none
560 =cut
sub each_accession_number {
    # Abstract interface method: delegates to the inherited
    # throw_not_implemented on the invocant; any argument is ignored here.
    # Implementations return the list of accession numbers (see POD above).
    my $self = shift;
    return $self->throw_not_implemented();
}
568 =head2 tiled_hsps
570 Usage : $hit_object->tiled_hsps( [integer] );
571 Purpose : Gets/Sets an indicator for whether or not the HSPs in this Hit
572 : have been tiled.
573 : Methods that rely on HSPs being tiled should check this
574 : and then call SearchUtils::tile_hsps() if not.
575 Example : $hit_object->tiled_hsps(1);
576 : if( $hit_object->tiled_hsps ) { # do something }
577 Returns : Boolean (1 or 0)
578 Argument : integer (optional)
579 Throws : n/a
581 =cut
sub tiled_hsps {
    # Abstract interface method: delegates to the inherited
    # throw_not_implemented on the invocant. Implementations get/set the
    # "HSPs have been tiled" flag (see POD above).
    my ($self) = @_;
    return $self->throw_not_implemented;
}
586 =head2 strand
588 Usage : $sbjct->strand( [seq_type] );
589 Purpose : Gets the strand(s) for the query, sbjct, or both sequences
590 : in the best HSP of the BlastHit object after HSP tiling.
591 : Only valid for BLASTN, TBLASTX, BLASTX-query, TBLASTN-hit.
592 Example : $qstrand = $sbjct->strand('query');
593 : $sstrand = $sbjct->strand('hit');
594 : ($qstrand, $sstrand) = $sbjct->strand();
595 Returns : scalar context: integer '1', '-1', or '0'
596 : array context without args: list of two strings (queryStrand, sbjctStrand)
597 : Array context can be "induced" by providing an argument of 'list' or 'array'.
598 Argument : In scalar context: seq_type = 'query' or 'hit' or 'sbjct' (default = 'query')
599 ('sbjct' is synonymous with 'hit')
600 Throws : n/a
601 Comments : This method requires that all HSPs be tiled. If they have not
602 : already been tiled, they will be tiled first automatically.
603 : If you don't want the tiled data, iterate through each HSP
604 : calling strand() on each (use hsps() to get all HSPs).
606 : Formerly (prior to 10/21/02), this method would return the
607 : string "-1/1" for hits with HSPs on both strands.
608 : However, now that strand and frame is properly being accounted
609 : for during HSP tiling, it makes more sense for strand()
610 : to return the strand data for the best HSP after tiling.
612 : If you really want to know about hits on opposite strands,
613 : you should be iterating through the HSPs using methods on the
614 : HSP objects.
616 : A possible use case where knowing whether a hit has HSPs
617 : on both strands would be when filtering via SearchIO for hits with
618 : this property. However, in this case it would be better to have a
619 : dedicated method such as $hit->hsps_on_both_strands(). Similarly
620 : for frame. This could be provided if there is interest.
622 See Also : L<Bio::Search::HSP::HSPI::strand()|Bio::Search::HSP::HSPI>
624 =cut
626 #---------'
sub strand {
    # Abstract interface method: delegates to the inherited
    # throw_not_implemented on the invocant; the optional seq_type argument
    # is not consulted here. Implementations return strand information
    # (see POD above).
    my ($self) = @_;
    return $self->throw_not_implemented;
}
630 =head2 frame
632 Usage : $hit_object->frame();
633 Purpose : Gets the reading frame for the best HSP after HSP tiling.
634 : This is only valid for BLASTX and TBLASTN/X type reports.
635 Example : $frame = $hit_object->frame();
636 Returns : Integer (-2 .. +2)
637 Argument : n/a
638 Throws : Exception if HSPs have not been set.
639 Comments : This method requires that all HSPs be tiled. If they have not
640 : already been tiled, they will be tiled first automatically.
641 : If you don't want the tiled data, iterate through each HSP
642 : calling frame() on each (use hsps() to get all HSPs).
644 See Also : L<hsps()|hsps>
646 =cut
648 #---------'
sub frame {
    # Abstract interface method: delegates to the inherited
    # throw_not_implemented on the invocant. Implementations return the
    # reading frame of the best HSP (see POD above).
    my ($self) = @_;
    return $self->throw_not_implemented;
}
652 =head2 matches
654 Usage : $hit_object->matches( [class] );
655 Purpose : Get the total number of identical or conserved matches
656 : (or both) across all HSPs.
657 : (Note: 'conservative' matches are indicated as 'positives'
658 : in BLAST reports.)
659 Example : ($id,$cons) = $hit_object->matches(); # no argument
660 : $id = $hit_object->matches('id');
661 : $cons = $hit_object->matches('cons');
662 Returns : Integer or a 2-element array of integers
663 Argument : class = 'id' | 'cons' OR none.
664 : If no argument is provided, both identical and conservative
665 : numbers are returned in a two element list.
666 : (Other terms can be used to refer to the conservative
667 : matches, e.g., 'positive'. All that is checked is whether or
668 : not the supplied string starts with 'id'. If not, the
669 : conservative matches are returned.)
670 Throws : Exception if the requested data cannot be obtained.
671 Comments : This method requires that all HSPs be tiled. If there is more than one
672 : HSP and they have not already been tiled, they will be tiled first automatically.
674 : If you need data for each HSP, use hsps() and then iterate
675 : through the HSP objects.
676 : Does not rely on wantarray to return a list. Only checks for
677 : the presence of an argument (no arg = return list).
679 See Also : L<Bio::Search::HSP::GenericHSP::matches()|Bio::Search::HSP::GenericHSP>, L<hsps()|hsps>
681 =cut
sub matches {
    # Abstract interface method: delegates to the inherited
    # throw_not_implemented on the invocant; the optional class argument is
    # not consulted here. Implementations return match counts (see POD
    # above).
    my ($self) = @_;
    return $self->throw_not_implemented;
}
686 # aliasing for Steve's method names
sub hit_description {
    # Backward-compatible alias: forwards the invocant and any arguments to
    # description() and returns its result in the caller's context.
    my ($self, @args) = @_;
    return $self->description(@args);
}
688 # aliasing for Steve's method names
sub hit_length {
    # Backward-compatible alias: forwards the invocant and any arguments to
    # the length() method and returns its result in the caller's context.
    my ($self, @args) = @_;
    return $self->length(@args);
}
692 # sort method for HSPs
694 =head2 sort_hsps
696 Title : sort_hsps
697 Usage : $hit->sort_hsps(\&sort_function)
698 Function : Sorts the available HSP objects by a user-supplied function. Defaults to sort
699 by descending score.
700 Returns : n/a
701 Args : A coderef for the sort function. See the documentation on the Perl sort()
702 function for guidelines on writing sort functions.
703 Note : To access the special variables $a and $b used by the Perl sort() function
704 the user function must access Bio::Search::Hit::HitI namespace.
705 For example, use :
706 $hit->sort_hsps( sub{$Bio::Search::Hit::HitI::a->length <=>
707 $Bio::Search::Hit::HitI::b->length});
708 NOT $hit->sort_hsps($a->length <=> $b->length);
710 =cut
sub sort_hsps {
    # Abstract interface method: delegates to the inherited
    # throw_not_implemented on the invocant; the optional sort coderef is
    # not consulted here. Implementations reorder the HSPs (see POD above).
    my ($self) = @_;
    return $self->throw_not_implemented;
}
714 =head2 _default_sort_hsps
716 Title : _default_sort_hsps
717 Usage : Do not call directly.
718 Function : Sort hsps in ascending order by evalue
719 Args : None
720 Returns: -1, 0 or 1 (the result of the numeric <=> comparison, as expected of a sort comparator)
721 Note : Used by $hit->sort_hsps()
723 =cut
sub _default_sort_hsps {
    # Default comparator for sort(): orders two HSP objects by ascending
    # evalue. It takes no arguments; the pair being compared is read from
    # the package variables $a and $b of Bio::Search::Hit::HitI, which is
    # where sort() places them for a comparator compiled in this package.
    #
    # Returns -1, 0 or 1 (the result of the numeric <=> comparison).
    #
    # Fix: the right-hand operand previously read $a again, so the
    # comparison was always 0 and the default sort never reordered anything.
    return $Bio::Search::Hit::HitI::a->evalue <=>
           $Bio::Search::Hit::HitI::b->evalue;
}