Bio::DB::TFBS namespace has been moved to its own distribution named after itself
[bioperl-live.git] / Bio / Search / HSP / ModelHSP.pm
blobc83905bb3daea15ccbab650d489c2d52b536f66c
2 # BioPerl module for Bio::Search::HSP::ModelHSP
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Chris Fields <cjfields at bioperl dot org>
8 # Copyright Chris Fields
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::Search::HSP::ModelHSP - A HSP object for model-based searches
18 =head1 SYNOPSIS
20 use Bio::Search::HSP::ModelHSP;
21 # us it just like a Bio::Search::HSP::ModelHSP object
23 =head1 DESCRIPTION
25 This object is a specialization of L<Bio::Search::HSP::ModelHSP> and is used
26 for searches which involve a query model, such as a Hidden Markov Model (HMM),
27 covariance model (CM), descriptor, or anything else besides a sequence. Note
28 that results from any HSPI class methods which rely on the query being a
29 sequence are unreliable and have thus been overridden with warnings indicating
30 they have not been implemented at this time.
32 =head1 FEEDBACK
34 =head2 Mailing Lists
36 User feedback is an integral part of the evolution of this and other
37 Bioperl modules. Send your comments and suggestions preferably to
38 the Bioperl mailing list. Your participation is much appreciated.
40 bioperl-l@bioperl.org - General discussion
41 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
43 =head2 Support
45 Please direct usage questions or support issues to the mailing list:
47 I<bioperl-l@bioperl.org>
49 rather than to the module maintainer directly. Many experienced and
50 reponsive experts will be able look at the problem and quickly
51 address it. Please include a thorough description of the problem
52 with code and data examples if at all possible.
54 =head2 Reporting Bugs
56 Report bugs to the Bioperl bug tracking system to help us keep track
57 of the bugs and their resolution. Bug reports can be submitted via the
58 web:
60 https://github.com/bioperl/bioperl-live/issues
62 =head1 AUTHOR - Chris Fields
64 Email cjfields at bioperl dot org
66 =head1 APPENDIX
68 The rest of the documentation details each of the object methods.
69 Internal methods are usually preceded with a _
71 =cut
73 # Let the code begin...
75 package Bio::Search::HSP::ModelHSP;
76 use strict;
77 use Bio::Seq::Meta;
79 use base qw(Bio::Search::HSP::GenericHSP);
81 =head2 new
83 Title : new
84 Usage : my $obj = Bio::Search::HSP::ModelHSP->new();
85 Function: Builds a new Bio::Search::HSP::ModelHSP object
86 Returns : Bio::Search::HSP::ModelHSP
87 Args :
89 Plus Bio::Seach::HSP::ModelHSP methods
91 -algorithm => algorithm used (Infernal, RNAMotif, ERPIN, etc)
92 -evalue => evalue
93 -pvalue => pvalue
94 -bits => bit value for HSP
95 -score => score value for HSP (typically z-score but depends on
96 analysis)
97 -hsp_length=> Length of the HSP (including gaps)
98 -identical => # of residues that that matched identically
99 -conserved => # of residues that matched conservatively
100 (only protein comparisons;
101 conserved == identical in nucleotide comparisons)
102 -hsp_gaps => # of gaps in the HSP
103 -query_gaps => # of gaps in the query in the alignment
104 -hit_gaps => # of gaps in the subject in the alignment
105 -query_name => HSP Query sequence name (if available)
106 -query_start => HSP Query start (in original query sequence coords)
107 -query_end => HSP Query end (in original query sequence coords)
108 -hit_name => HSP Hit sequence name (if available)
109 -hit_start => HSP Hit start (in original hit sequence coords)
110 -hit_end => HSP Hit end (in original hit sequence coords)
111 -hit_length => total length of the hit sequence
112 -query_length=> total length of the query sequence
113 -query_seq => query sequence portion of the HSP
114 -hit_seq => hit sequence portion of the HSP
115 -homology_seq=> homology sequence for the HSP
116 -hit_frame => hit frame (only if hit is translated protein)
117 -query_frame => query frame (only if query is translated protein)
118 -meta => optional meta data (sec structure, markup, etc)
119 -custom_score=> custom score data
121 =cut
123 =head2 meta
125 Title : meta
126 Usage : my $meta = $hsp->meta();
127 Function: Returns meta data for this HSP or undef
128 Returns : string of meta data or undef
129 Args : [optional] string to set value
130 Note : At some point very soon this will likely be a Bio::AnnotationI.
131 Don't get used to a simple string!
133 =cut
135 sub meta {
136 my ($self,$value) = @_;
137 my $previous = $self->{'META'};
138 if( defined $value ) {
139 $self->{'META'} = $value;
141 return $previous;
144 =head2 noncanonical_string
146 Title : noncanonical_string
147 Usage : my $nc_seq = $hsp->noncanonical_string();
148 Function: Returns noncanonical string (NC) data for this HSP or undef
149 Returns : string of noncanonical data or undef
150 Args : [optional] string to set value
152 =cut
154 sub noncanonical_string {
155 my ($self,$value) = @_;
156 my $previous = $self->{'NC_SEQ'};
157 if( defined $value ) {
158 $self->{'NC_SEQ'} = $value;
160 return $previous;
163 =head2 custom_score
165 Title : custom_score
166 Usage : my $data = $hsp->custom_score();
167 Function: Returns custom_score data for this HSP, or undef
168 Returns : custom_score data or undef
169 Args : [optional] custom_score
170 Note : This is a Get/Set used to deal with odd score-like data generated
171 from RNAMotif (and other programs) where the score section
172 can be customized to include non-standard data, including sequence
173 data, user-based scores, and other values.
175 =cut
177 sub custom_score {
178 my ($self,$value) = @_;
179 my $previous = $self->{'CUSTOMSCORE'};
180 if( defined $value ) {
181 $self->{'CUSTOMSCORE'} = $value;
183 return $previous;
186 =head2 Bio::Search::HSP::HSPI methods
188 Implementation of Bio::Search::HSP::HSPI methods follow
190 =head2 algorithm
192 Title : algorithm
193 Usage : my $r_type = $hsp->algorithm
194 Function: Obtain the name of the algorithm used to obtain the HSP
195 Returns : string (e.g., BLASTP)
196 Args : [optional] scalar string to set value
198 =cut
200 =head2 strand
202 Title : strand
203 Usage : $hsp->strand('hit')
204 Function: Retrieves the strand for the HSP component requested
205 Returns : +1 or -1 (0 if unknown)
206 Args : 'hit' or 'subject' or 'sbjct' to retrieve the strand of the subject.
207 There is no strand available for 'query', as the query is a model
208 and not a true sequence.
210 =cut
212 # overrides HSPI::seq()
214 =head2 seq
216 Usage : $hsp->seq( [seq_type] );
217 Purpose : Get the query or sbjct sequence as a Bio::Seq.pm object.
218 Example : $seqObj = $hsp->seq('sbjct');
219 Returns : Object reference for a Bio::Seq.pm object.
220 Argument : seq_type = 'query' or 'hit' or 'sbjct' (default = 'sbjct').
221 : ('sbjct' is synonymous with 'hit')
222 : default is 'sbjct'
223 : Note: if there is no sequence available (eg for a model-based
224 : search), this returns a LocatableSeq object w/o a sequence
225 Throws : Propagates any exception that occurs during construction
226 : of the Bio::Seq.pm object.
227 Comments : The sequence is returned in an array of strings corresponding
228 : to the strings in the original format of the Blast alignment.
229 : (i.e., same spacing).
231 See Also : L<seq_str()|seq_str>, L<Bio::Seq>
233 =cut
235 #-------
236 sub seq {
237 #-------
238 my($self,$seqType) = @_;
239 $seqType ||= 'sbjct';
240 $seqType = 'sbjct' if $seqType eq 'hit';
241 my $str = $self->seq_str($seqType);
242 if( $seqType =~ /^(m|ho)/i ) {
243 $self->throw("cannot call seq on the homology match string, it isn't really a sequence, use get_aln to convert the HSP to a Bio::AlignIO and generate a consensus from that.");
245 require Bio::LocatableSeq;
246 my $id = $seqType =~ /^q/i ? $self->query->seq_id : $self->hit->seq_id;
247 $str =~ s{\*\[\s*(\d+)\s*\]\*}{'N' x $1}ge;
248 $str =~ s{\s+}{}g;
249 my $seq = Bio::LocatableSeq->new (-ID => $id,
250 -START => $self->start($seqType),
251 -END => $self->end($seqType),
252 -STRAND=> $self->strand($seqType),
253 -DESC => "$seqType sequence ",
255 $seq->seq($str) if $str;
256 $seq;
259 =head2 pvalue
261 Title : pvalue
262 Usage : my $pvalue = $hsp->pvalue();
263 Function: Returns the P-value for this HSP or undef
264 Returns : float or exponential (2e-10)
265 P-value is not defined with NCBI Blast2 reports.
266 Args : [optional] numeric to set value
268 =cut
270 =head2 evalue
272 Title : evalue
273 Usage : my $evalue = $hsp->evalue();
274 Function: Returns the e-value for this HSP
275 Returns : float or exponential (2e-10)
276 Args : [optional] numeric to set value
278 =cut
280 =head2 gaps
282 Title : gaps
283 Usage : my $gaps = $hsp->gaps( ['query'|'hit'|'total'] );
284 Function : Get the number of gaps in the query, hit, or total alignment.
285 Returns : Integer, number of gaps or 0 if none
286 Args : arg 1: 'query' = num gaps in query seq
287 'hit' = num gaps in hit seq
288 'total' = num gaps in whole alignment
289 default = 'total'
290 arg 2: [optional] integer gap value to set for the type requested
292 =cut
294 =head2 query_string
296 Title : query_string
297 Usage : my $qseq = $hsp->query_string;
298 Function: Retrieves the query sequence of this HSP as a string
299 Returns : string
300 Args : [optional] string to set for query sequence
302 =cut
304 =head2 hit_string
306 Title : hit_string
307 Usage : my $hseq = $hsp->hit_string;
308 Function: Retrieves the hit sequence of this HSP as a string
309 Returns : string
310 Args : [optional] string to set for hit sequence
312 =cut
314 =head2 homology_string
316 Title : homology_string
317 Usage : my $homo_string = $hsp->homology_string;
318 Function: Retrieves the homology sequence for this HSP as a string.
319 : The homology sequence is the string of symbols in between the
320 : query and hit sequences in the alignment indicating the degree
321 : of conservation (e.g., identical, similar, not similar).
322 Returns : string
323 Args : [optional] string to set for homology sequence
325 =cut
327 =head2 length
329 Title : length
330 Usage : my $len = $hsp->length( ['query'|'hit'|'total'] );
331 Function : Returns the length of the query or hit in the alignment
332 (without gaps)
333 or the aggregate length of the HSP (including gaps;
334 this may be greater than either hit or query )
335 Returns : integer
336 Args : arg 1: 'query' = length of query seq (without gaps)
337 'hit' = length of hit seq (without gaps)
338 'total' = length of alignment (with gaps)
339 default = 'total'
340 arg 2: [optional] integer length value to set for specific type
342 =cut
344 =head2 frame
346 Title : frame
347 Usage : my ($qframe, $hframe) = $hsp->frame('list',$queryframe,$subjectframe)
348 Function: Set the Frame for both query and subject and insure that
349 they agree.
350 This overrides the frame() method implementation in
351 FeaturePair.
352 Returns : array of query and subject frame if return type wants an array
353 or query frame if defined or subject frame if not defined
354 Args : 'hit' or 'subject' or 'sbjct' to retrieve the frame of the subject (default)
355 'query' to retrieve the query frame
356 'list' or 'array' to retrieve both query and hit frames together
357 Note : Frames are stored in the GFF way (0-2) not 1-3
358 as they are in BLAST (negative frames are deduced by checking
359 the strand of the query or hit)
361 =cut
363 =head2 get_aln
365 Title : get_aln
366 Usage : my $aln = $hsp->gel_aln
367 Function: Returns a Bio::SimpleAlign representing the HSP alignment
368 Returns : Bio::SimpleAlign
369 Args : none
371 =cut
373 sub get_aln {
374 my ($self) = @_;
375 require Bio::LocatableSeq;
376 require Bio::SimpleAlign;
377 my $aln = Bio::SimpleAlign->new;
378 my %hsp = (hit => $self->hit_string,
379 midline => $self->homology_string,
380 query => $self->query_string,
381 meta => $self->meta);
383 # this takes care of infernal issues
384 if ($hsp{meta} && $hsp{meta} =~ m{~+}) {
385 $self->_postprocess_hsp(\%hsp);
388 if (!$hsp{query}) {
389 $self->warn("Missing query string, can't build alignment");
390 return;
393 my $seqonly = $hsp{query};
394 $seqonly =~ s/[\-\s]//g;
395 my ($q_nm,$s_nm) = ($self->query->seq_id(),
396 $self->hit->seq_id());
397 unless( defined $q_nm && CORE::length ($q_nm) ) {
398 $q_nm = 'query';
400 unless( defined $s_nm && CORE::length ($s_nm) ) {
401 $s_nm = 'hit';
403 my $query = Bio::LocatableSeq->new('-seq' => $hsp{query},
404 '-id' => $q_nm,
405 '-start' => $self->query->start,
406 '-end' => $self->query->end,
408 $seqonly = $hsp{hit};
409 $seqonly =~ s/[\-\s]//g;
410 my $hit = Bio::LocatableSeq->new('-seq' => $hsp{hit},
411 '-id' => $s_nm,
412 '-start' => $self->hit->start,
413 '-end' => $self->hit->end,
415 $aln->add_seq($query);
416 $aln->add_seq($hit);
417 if ($hsp{meta}) {
418 my $meta_obj = Bio::Seq::Meta->new();
419 $meta_obj->named_meta('ss_cons', $hsp{meta});
420 $aln->consensus_meta($meta_obj);
422 return $aln;
425 =head2 Inherited from Bio::SeqFeature::SimilarityPair
427 These methods come from Bio::SeqFeature::SimilarityPair
429 =head2 query
431 Title : query
432 Usage : my $query = $hsp->query
433 Function: Returns a SeqFeature representing the query in the HSP
434 Returns : Bio::SeqFeature::Similarity
435 Args : [optional] new value to set
438 =head2 hit
440 Title : hit
441 Usage : my $hit = $hsp->hit
442 Function: Returns a SeqFeature representing the hit in the HSP
443 Returns : Bio::SeqFeature::Similarity
444 Args : [optional] new value to set
447 =head2 significance
449 Title : significance
450 Usage : $evalue = $obj->significance();
451 $obj->significance($evalue);
452 Function: Get/Set the significance value
453 Returns : numeric
454 Args : [optional] new value to set
457 =head2 score
459 Title : score
460 Usage : my $score = $hsp->score();
461 Function: Returns the score for this HSP or undef
462 Returns : numeric
463 Args : [optional] numeric to set value
465 =cut
467 =head2 bits
469 Title : bits
470 Usage : my $bits = $hsp->bits();
471 Function: Returns the bit value for this HSP or undef
472 Returns : numeric
473 Args : none
475 =cut
477 =head1 ModelHSP methods overridden in ModelHSP
479 The following methods have been overridden due to their current reliance on
480 sequence-based queries. They may be implemented in future versions of this class.
482 =head2 seq_inds
484 =cut
486 sub seq_inds {
487 my $self = shift;
488 $self->warn('$hsp->seq_inds not implemented for Model-based searches');
489 return;
492 =head2 frac_identical
494 =cut
496 sub frac_identical {
497 my $self = shift;
498 $self->warn('$hsp->frac_identical not implemented for Model-based searches');
499 return;
502 =head2 frac_conserved
504 =cut
506 sub frac_conserved {
507 my $self = shift;
508 $self->warn('$hsp->frac_conserved not implemented for Model-based searches');
509 return;
512 =head2 matches
514 =cut
516 sub matches {
517 my $self = shift;
518 $self->warn('$hsp->matches not implemented for Model-based searches');
519 return;
522 =head2 num_conserved
524 =cut
526 sub num_conserved {
527 my $self = shift;
528 $self->warn('$hsp->num_conserved not implemented for Model-based searches');
529 return;
532 =head2 num_identical
534 =cut
536 sub num_identical {
537 my $self = shift;
538 $self->warn('$hsp->num_identical not implemented for Model-based searches');
539 return;
542 =head2 cigar_string
544 =cut
547 sub cigar_string {
548 my $self = shift;
549 $self->warn('$hsp->cigar_string not implemented for Model-based searches');
550 return;
553 =head2 generate_cigar_string
555 =cut
557 sub generate_cigar_string {
558 my $self = shift;
559 $self->warn('$hsp->generate_cigar_string not implemented for Model-based searches');
560 return;
563 =head2 percent_identity
565 =cut
567 sub percent_identity {
568 my $self = shift;
569 $self->warn('$hsp->percent_identity not implemented for Model-based searches');
570 return;
573 ############## PRIVATE ##############
575 # the following method postprocesses HSP data in cases where the sequences
576 # aren't complete (which can trigger a validation error)
579 my $SEQ_REGEX = qr/\*\[\s*(\d+)\s*\]\*/;
580 my $META_REGEX = qr/(~+)/;
582 sub _postprocess_hsp {
583 my ($self, $hsp) = @_;
584 $self->throw('Must pass a hash ref for HSP processing') unless ref($hsp) eq 'HASH';
585 my @ins;
586 for my $type (qw(query hit meta)) {
587 $hsp->{$type} =~ s{\s+$}{};
588 my $str = $hsp->{$type};
589 my $regex = $type eq 'meta' ? $META_REGEX : $SEQ_REGEX;
590 my $ind = 0;
591 while ($str =~ m{$regex}g) {
592 $ins[$ind]->{$type} = {pos => pos($str) - length($1), str => $1};
593 $ind++;
596 for my $chunk (reverse @ins) {
597 my ($max, $min) = ($chunk->{hit}->{str} >= $chunk->{query}->{str}) ?
598 ('hit', 'query') : ('query', 'hit');
599 my %rep;
600 $rep{$max} = 'N' x $chunk->{$max}->{str};
601 $rep{$min} = 'N' x $chunk->{$min}->{str}.
602 ('-'x($chunk->{$max}->{str}-$chunk->{$min}->{str}));
603 $rep{'meta'} = '~' x $chunk->{$max}->{str};
604 $rep{'midline'} = ' ' x $chunk->{$max}->{str};
605 for my $t (qw(hit query meta midline)) {
606 substr($hsp->{$t}, $chunk->{meta}->{pos}, length($chunk->{meta}->{str}) , $rep{$t});