Bio::DB::TFBS namespace has been moved to its own distribution named after itself
[bioperl-live.git] / Bio / Search / Hit / ModelHit.pm
blobe6b8c21cf8be362062e8a6af9e10500e5c7302ad
2 # BioPerl module for Bio::Search::Hit::ModelHit
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Chris Fields <cjfields at bioperl dot org>
8 # Copyright Chris Fields
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::Search::Hit::ModelHit - A model-based implementation of the Bio::Search::Hit::HitI interface
18 =head1 SYNOPSIS
20 use Bio::Search::Hit::ModelHit;
21 my $hit = Bio::Search::Hit::ModelHit->new(-algorithm => 'rnamotif');
23 # typically one gets HitI objects from a SearchIO stream via a ResultI
24 use Bio::SearchIO;
25 my $parser = Bio::SearchIO->new(-format => 'infernal', -file => 'trap.inf');
27 my $result = $parser->next_result;
28 my $hit = $result->next_hit;
30 =head1 DESCRIPTION
32 This object handles the hit data from a database search using models or
33 descriptors instead of sequences, such as Infernal, HMMER, RNAMotif, etc.
35 Unless you're writing a parser, you won't ever need to create a ModelHit or
36 any other HitI-implementing object. If you use the SearchIO system, HitI objects
37 are created automatically from a SearchIO stream which returns
38 Bio::Search::Hit::HitI objects.
40 Note that several HitI-based methods have been overridden from ModelHit due to
41 their unreliability when dealing with queries that aren't sequence-based. It may
42 be possible to reimplement these at a later point, but for the time being they
43 will throw warnings and return w/o results.
45 For documentation on what you can do with ModelHit (and other HitI objects),
46 please see the API documentation in
47 L<Bio::Search::Hit::HitI|Bio::Search::Hit::HitI>.
49 =head1 FEEDBACK
51 =head2 Mailing Lists
53 User feedback is an integral part of the evolution of this and other
54 Bioperl modules. Send your comments and suggestions preferably to
55 the Bioperl mailing list. Your participation is much appreciated.
57 bioperl-l@bioperl.org - General discussion
58 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
60 =head2 Support
62 Please direct usage questions or support issues to the mailing list:
64 I<bioperl-l@bioperl.org>
66 rather than to the module maintainer directly. Many experienced and
67 reponsive experts will be able look at the problem and quickly
68 address it. Please include a thorough description of the problem
69 with code and data examples if at all possible.
71 =head2 Reporting Bugs
73 Report bugs to the Bioperl bug tracking system to help us keep track
74 of the bugs and their resolution. Bug reports can be submitted via the
75 web:
77 https://github.com/bioperl/bioperl-live/issues
79 =head1 AUTHOR - Chris Fields
81 Email cjfields at bioperl dot org
83 =head1 APPENDIX
85 The rest of the documentation details each of the object methods.
86 Internal methods are usually preceded with a _
88 =cut
90 # Let the code begin...
92 package Bio::Search::Hit::ModelHit;
94 use strict;
96 use base qw(Bio::Search::Hit::GenericHit);
98 =head1 HitI methods implemented in parent class Bio::Search::Hit::ModelHit
100 =head2 new
102 Title : new
103 Usage : my $obj = Bio::Search::Hit::ModelHit->new();
104 Function: Builds a new Bio::Search::Hit::ModelHit object
105 Returns : Bio::Search::Hit::ModelHit
106 Args : -name => Name of Hit (required)
107 -description => Description (optional)
108 -accession => Accession number (optional)
109 -ncbi_gi => NCBI GI UID (optional)
110 -length => Length of the Hit (optional)
111 -score => Raw Score for the Hit (optional)
112 -bits => Bit Score for the Hit (optional)
113 -significance => Significance value for the Hit (optional)
114 -algorithm => Algorithm used (BLASTP, FASTX, etc...)
115 -hsps => Array ref of HSPs for this Hit.
116 -found_again => boolean, true if hit appears in a
117 "previously found" section of a PSI-Blast report.
118 -hsp_factory => Bio::Factory::ObjectFactoryI able to create HSPI
119 objects.
121 =cut
123 =head2 add_hsp
125 Title : add_hsp
126 Usage : $hit->add_hsp($hsp)
127 Function: Add a HSP to the collection of HSPs for a Hit
128 Returns : number of HSPs in the Hit
129 Args : Bio::Search::HSP::HSPI object, OR hash ref containing data suitable
130 for creating a HSPI object (&hsp_factory must be set to get it back)
132 =cut
134 =head2 hsp_factory
136 Title : hsp_factory
137 Usage : $hit->hsp_factory($hsp_factory)
138 Function: Get/set the factory used to build HSPI objects if necessary.
139 Returns : Bio::Factory::ObjectFactoryI
140 Args : Bio::Factory::ObjectFactoryI
142 =cut
144 =head2 Bio::Search::Hit::HitI methods
146 Implementation of Bio::Search::Hit::HitI methods
148 =head2 name
150 Title : name
151 Usage : $hit_name = $hit->name();
152 Function: returns the name of the Hit sequence
153 Returns : a scalar string
154 Args : [optional] scalar string to set the name
156 =cut
158 =head2 accession
160 Title : accession
161 Usage : $acc = $hit->accession();
162 Function: Retrieve the accession (if available) for the hit
163 Returns : a scalar string (empty string if not set)
164 Args : none
166 =cut
168 =head2 description
170 Title : description
171 Usage : $desc = $hit->description();
172 Function: Retrieve the description for the hit
173 Returns : a scalar string
174 Args : [optional] scalar string to set the description
176 =cut
178 =head2 length
180 Title : length
181 Usage : my $len = $hit->length
182 Function: Returns the length of the hit
183 Returns : integer
184 Args : [optional] integer to set the length
186 =cut
188 =head2 algorithm
190 Title : algorithm
191 Usage : $alg = $hit->algorithm();
192 Function: Gets the algorithm specification that was used to obtain the hit
193 For BLAST, the algorithm denotes what type of sequence was aligned
194 against what (BLASTN: dna-dna, BLASTP prt-prt, BLASTX translated
195 dna-prt, TBLASTN prt-translated dna, TBLASTX translated
196 dna-translated dna).
197 Returns : a scalar string
198 Args : [optional] scalar string to set the algorithm
200 =cut
202 =head2 raw_score
204 Title : raw_score
205 Usage : $score = $hit->raw_score();
206 Function: Gets the "raw score" generated by the algorithm. What
207 this score is exactly will vary from algorithm to algorithm,
208 returning undef if unavailable.
209 Returns : a scalar value
210 Args : [optional] scalar value to set the raw score
212 =cut
214 =head2 score
216 Equivalent to L<raw_score()|raw_score>
218 =cut
220 =head2 significance
222 Title : significance
223 Usage : $significance = $hit->significance();
224 Function: Used to obtain the E or P value of a hit, i.e. the probability that
225 this particular hit was obtained purely by random chance. If
226 information is not available (nor calculatable from other
227 information sources), return undef.
228 Returns : a scalar value or undef if unavailable
229 Args : [optional] scalar value to set the significance
231 =cut
233 =head2 bits
235 Usage : $hit_object->bits();
236 Purpose : Gets the bit score of the best HSP for the current hit.
237 Example : $bits = $hit_object->bits();
238 Returns : Integer or undef if bit score is not set
239 Argument : n/a
240 Comments : For BLAST1, the non-bit score is listed in the summary line.
242 See Also : L<score()|score>
244 =cut
246 =head2 next_hsp
248 Title : next_hsp
249 Usage : while( $hsp = $obj->next_hsp()) { ... }
250 Function : Returns the next available High Scoring Pair
251 Example :
252 Returns : Bio::Search::HSP::HSPI object or null if finished
253 Args : none
255 =cut
257 =head2 hsps
259 Usage : $hit_object->hsps();
260 Purpose : Get a list containing all HSP objects.
261 : Get the numbers of HSPs for the current hit.
262 Example : @hsps = $hit_object->hsps();
263 : $num = $hit_object->hsps(); # alternatively, use num_hsps()
264 Returns : Array context : list of Bio::Search::HSP::BlastHSP.pm objects.
265 : Scalar context: integer (number of HSPs).
266 : (Equivalent to num_hsps()).
267 Argument : n/a. Relies on wantarray
268 Throws : Exception if the HSPs have not been collected.
270 See Also : L<hsp()|hsp>, L<num_hsps()|num_hsps>
272 =cut
274 =head2 num_hsps
276 Usage : $hit_object->num_hsps();
277 Purpose : Get the number of HSPs for the present hit.
278 Example : $nhsps = $hit_object->num_hsps();
279 Returns : Integer or '-' if HSPs have not been callected
280 Argument : n/a
282 See Also : L<hsps()|hsps>
284 =cut
286 =head2 rewind
288 Title : rewind
289 Usage : $hit->rewind;
290 Function: Allow one to reset the HSP iterator to the beginning
291 Since this is an in-memory implementation
292 Returns : none
293 Args : none
295 =cut
297 =head2 ambiguous_aln
299 Usage : $ambig_code = $hit_object->ambiguous_aln();
300 Purpose : Sets/Gets ambiguity code data member.
301 Example : (see usage)
302 Returns : String = 'q', 's', 'qs', '-'
303 : 'q' = query sequence contains overlapping sub-sequences
304 : while sbjct does not.
305 : 's' = sbjct sequence contains overlapping sub-sequences
306 : while query does not.
307 : 'qs' = query and sbjct sequence contains overlapping sub-sequences
308 : relative to each other.
309 : '-' = query and sbjct sequence do not contains multiple domains
310 : relative to each other OR both contain the same distribution
311 : of similar domains.
312 Argument : n/a
313 Throws : n/a
314 Comment : Note: "sbjct" is synonymous with "hit"
316 =cut
318 =head2 overlap
320 See documentation in L<Bio::Search::Hit::HitI::overlap()|Bio::Search::Hit::HitI>
322 =cut
324 sub overlap {
325 my $self = shift;
326 $self->{'_overlap'} = shift if @_;
327 return exists $self->{'_overlap'} ? $self->{'_overlap'} : 0;
331 =head2 n
333 Usage : $hit_object->n();
334 Purpose : Gets the N number for the current hit.
335 : This is the number of HSPs in the set which was ascribed
336 : the lowest P-value (listed on the description line).
337 : This number is not the same as the total number of HSPs.
338 : To get the total number of HSPs, use num_hsps().
339 Example : $n = $hit_object->n();
340 Returns : Integer
341 Argument : n/a
342 Throws : Exception if HSPs have not been set.
343 Comments : Calling n() on such reports will result in a call to num_hsps().
344 : The num_hsps() method will count the actual number of
345 : HSPs in the alignment listing, which may exceed N in
346 : some cases.
348 See Also : L<num_hsps()|num_hsps>
350 =cut
352 sub n {
353 my $self = shift;
354 $self->{'_n'} = shift if @_;
355 return exists $self->{'_n'} ? $self->{'_n'} : $self->num_hsps;
359 =head2 p
361 Usage : $hit_object->p( [format] );
362 Purpose : Get the P-value for the best HSP
363 Example : $p = $sbjct->p;
364 : $p = $sbjct->p('exp'); # get exponent only.
365 : ($num, $exp) = $sbjct->p('parts'); # split sci notation into parts
366 Returns : Float or scientific notation number (the raw P-value, DEFAULT).
367 : Integer if format == 'exp' (the magnitude of the base 10 exponent).
368 : 2-element list (float, int) if format == 'parts' and P-value
369 : is in scientific notation (See Comments).
370 Argument : format: string of 'raw' | 'exp' | 'parts'
371 : 'raw' returns value given in report. Default. (1.2e-34)
372 : 'exp' returns exponent value only (34)
373 : 'parts' returns the decimal and exponent as a
374 : 2-element list (1.2, -34) (See Comments).
375 Throws : Warns if no P-value is defined. Uses expect instead.
376 Comments : Using the 'parts' argument is not recommended since it will not
377 : work as expected if the P-value is not in scientific notation.
378 : That is, floats are not converted into sci notation before
379 : splitting into parts.
381 See Also : L<expect()|expect>, L<signif()|signif>, L<Bio::Search::SearchUtils::get_exponent()|Bio::Search::SearchUtils>
383 =cut
385 =head2 hsp
387 Usage : $hit_object->hsp( [string] );
388 Purpose : Get a single HSPI object for the present HitI object.
389 Example : $hspObj = $hit_object->hsp; # same as 'best'
390 : $hspObj = $hit_object->hsp('best');
391 : $hspObj = $hit_object->hsp('worst');
392 Returns : Object reference for a Bio::Search::HSP::BlastHSP.pm object.
393 Argument : String (or no argument).
394 : No argument (default) = highest scoring HSP (same as 'best').
395 : 'best' or 'first' = highest scoring HSP.
396 : 'worst' or 'last' = lowest scoring HSP.
397 Throws : Exception if the HSPs have not been collected.
398 : Exception if an unrecognized argument is used.
400 See Also : L<hsps()|hsps>, L<num_hsps>()
402 =cut
404 sub hsp {
405 my( $self, $option ) = @_;
406 $option ||= 'best';
408 if (not ref $self->{'_hsps'}) {
409 $self->throw("Can't get HSPs: data not collected.");
412 my @hsps = $self->hsps;
414 return $hsps[0] if $option =~ /best|first|1/i;
415 return $hsps[$#hsps] if $option =~ /worst|last/i;
417 $self->throw("Can't get HSP for: $option\n" .
418 "Valid arguments: 'best', 'worst'");
421 =head2 rank
423 Title : rank
424 Usage : $obj->rank($newval)
425 Function: Get/Set the rank of this Hit in the Query search list
426 i.e. this is the Nth hit for a specific query
427 Returns : value of rank
428 Args : newvalue (optional)
431 =cut
433 sub rank {
434 my $self = shift;
435 return $self->{'_rank'} = shift if @_;
436 return $self->{'_rank'} || 1;
439 =head2 locus
441 Title : locus
442 Usage : $locus = $hit->locus();
443 Function: Retrieve the locus (if available) for the hit
444 Returns : a scalar string (empty string if not set)
445 Args : none
447 =cut
449 sub locus {
450 my ($self,$value) = @_;
451 my $previous = $self->{'_locus'};
452 if( defined $value || ! defined $previous ) {
453 unless (defined $value) {
454 if ($self->{'_name'} =~/(gb|emb|dbj|ref)\|(.*)\|(.*)/) {
455 $value = $previous = $3;
456 } else {
457 $value = $previous = '';
460 $self->{'_locus'} = $value;
462 return $previous;
465 =head2 each_accession_number
467 Title : each_accession_number
468 Usage : @each_accession_number = $hit->each_accession_number();
469 Function: Get each accession number listed in the description of the hit.
470 If there are no alternatives, then only the primary accession will
471 be given
472 Returns : list of all accession numbers in the description
473 Args : none
475 =cut
477 sub each_accession_number {
478 my ($self,$value) = @_;
479 my $desc = $self->{'_description'};
480 #put primary accnum on the list
481 my @accnums;
482 push (@accnums,$self->{'_accession'});
483 if( defined $desc ) {
484 while ($desc =~ /(\b\S+\|\S*\|\S*\s?)/g) {
485 my $id = $1;
486 my ($acc, $version);
487 if ($id =~ /(gb|emb|dbj|sp|pdb|bbs|ref|tp[gde])\|(.*)\|(.*)/) {
488 ($acc, $version) = split /\./, $2;
489 } elsif ($id =~ /(pir|prf|pat|gnl)\|(.*)\|(.*)/) {
490 ($acc, $version) = split /\./, $3;
491 } elsif( $id =~ /(gim|gi|bbm|bbs|lcl)\|(\d*)/) {
492 $acc = $id;
493 } elsif( $id =~ /(oth)\|(.*)\|(.*)\|(.*)/ ) { # discontinued...
494 ($acc,$version) = ($2);
495 } else {
496 #punt, not matching the db's at ftp://ftp.ncbi.nih.gov/blast/db/README
497 #Database Name Identifier Syntax
498 #============================ ========================
499 #GenBank gb|accession|locus
500 #EMBL Data Library emb|accession|locus
501 #DDBJ, DNA Database of Japan dbj|accession|locus
502 #NBRF PIR pir||entry
503 #Protein Research Foundation prf||name
504 #SWISS-PROT sp|accession|entry name
505 #Brookhaven Protein Data Bank pdb|entry|chain
506 #Patents pat|country|number
507 #GenInfo Backbone Id bbs|number
508 #General database identifier gnl|database|identifier
509 #NCBI Reference Sequence ref|accession|locus
510 #Local Sequence identifier lcl|identifier
511 $acc=$id;
513 push(@accnums, $acc);
516 return @accnums;
519 =head2 tiled_hsps
521 See documentation in L<Bio::Search::SearchUtils::tile_hsps()|Bio::Search::SearchUtils>
523 =cut
525 =head2 query_length
527 Title : query_length
528 Usage : $obj->query_length($newval)
529 Function: Get/Set the query_length
530 Returns : value of query_length (a scalar)
531 Args : on set, new value (a scalar or undef, optional)
534 =cut
536 sub query_length {
537 my $self = shift;
539 return $self->{'_query_length'} = shift if @_;
540 return $self->{'_query_length'};
543 =head2 ncbi_gi
545 Title : ncbi_gi
546 Usage : $acc = $hit->ncbi_gi();
547 Function: Retrieve the NCBI Unique ID (aka the GI #),
548 if available, for the hit
549 Returns : a scalar string (empty string if not set)
550 Args : none
551 Note : As of Sept. 2016 NCBI records will no longer have a
552 GI; this attributue will remain in place for older
553 records
555 =cut
557 sub ncbi_gi {
558 my ($self,$value) = @_;
559 my $previous = $self->{'_ncbi_gi'};
560 if( defined $value || ! defined $previous ) {
561 $value = $previous = '' unless defined $value;
562 $self->{'_ncbi_gi'} = $value;
564 return $previous;
567 =head1 ModelHit methods overridden in ModelHit
569 The following methods have been overridden due to their current reliance on
570 sequence-based queries. They may be implemented in future versions of this class.
572 =head2 length_aln
574 =cut
576 sub length_aln {
577 my $self = shift;
578 $self->warn('$hit->length_aln not implemented for Model-based searches');
579 return;
582 =head2 gaps
584 =cut
586 sub gaps {
587 my $self = shift;
588 $self->warn('$hit->gaps not implemented for Model-based searches');
589 return;
592 =head2 matches
594 =cut
596 sub matches {
597 my $self = shift;
598 $self->warn('$hit->matches not implemented for Model-based searches');
599 return;
602 =head2 start
604 =cut
606 sub start {
607 my $self = shift;
608 $self->warn('$hit->start not implemented for Model-based searches');
609 return;
613 =head2 end
615 =cut
617 sub end {
618 my $self = shift;
619 $self->warn('$hit->end not implemented for Model-based searches');
620 return;
623 =head2 range
625 =cut
627 sub range {
628 my $self = shift;
629 $self->warn('$hit->range not implemented for Model-based searches');
630 return;
633 =head2 frac_identical
635 =cut
637 sub frac_identical {
638 my $self = shift;
639 $self->warn('$hit->frac_identical not implemented for Model-based searches');
640 return;
643 =head2 frac_conserved
645 =cut
647 sub frac_conserved {
648 my $self = shift;
649 $self->warn('$hit->frac_conserved not implemented for Model-based searches');
650 return;
653 =head2 frac_aligned_query
655 =cut
657 sub frac_aligned_query {
658 my $self = shift;
659 $self->warn('$hit->frac_aligned_query not implemented for Model-based searches');
660 return;
663 =head2 frac_aligned_hit
665 =cut
667 sub frac_aligned_hit {
668 my $self = shift;
669 $self->warn('$hit->frac_aligned_hit not implemented for Model-based searches');
670 return;
673 =head2 num_unaligned_hit
675 =cut
677 *num_unaligned_sbjct = \&num_unaligned_hit;
679 sub num_unaligned_hit {
680 my $self = shift;
681 $self->warn('$hit->num_unaligned_hit/num_unaligned_sbjct not implemented for Model-based searches');
682 return;
685 =head2 num_unaligned_query
687 =cut
689 sub num_unaligned_query {
690 my $self = shift;
691 $self->warn('$hit->num_unaligned_query not implemented for Model-based searches');
692 return;
695 =head2 seq_inds
697 =cut
699 sub seq_inds {
700 my $self = shift;
701 $self->warn('$hit->seq_inds not implemented for Model-based searches');
702 return;
705 =head2 strand
707 =cut
709 sub strand {
710 my $self = shift;
711 $self->warn('$hit->strand not implemented for Model-based searches');
712 return;
715 =head2 frame
717 =cut
719 sub frame {
720 my $self = shift;
721 $self->warn('$hit->frame not implemented for Model-based searches');
722 return;
725 =head2 logical_length
727 =cut
729 sub logical_length {
730 my $self = shift;
731 $self->warn('$hit->logical_length not implemented for Model-based searches');
732 return;