2 # BioPerl module for Bio::Search::HSP::ModelHSP
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Chris Fields <cjfields at bioperl dot org>
8 # Copyright Chris Fields
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::Search::HSP::ModelHSP - A HSP object for model-based searches
20 use Bio::Search::HSP::ModelHSP;
21 # us it just like a Bio::Search::HSP::ModelHSP object
25 This object is a specialization of L<Bio::Search::HSP::ModelHSP> and is used
26 for searches which involve a query model, such as a Hidden Markov Model (HMM),
27 covariance model (CM), descriptor, or anything else besides a sequence. Note
28 that results from any HSPI class methods which rely on the query being a
29 sequence are unreliable and have thus been overridden with warnings indicating
30 they have not been implemented at this time.
36 User feedback is an integral part of the evolution of this and other
37 Bioperl modules. Send your comments and suggestions preferably to
38 the Bioperl mailing list. Your participation is much appreciated.
40 bioperl-l@bioperl.org - General discussion
41 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
45 Please direct usage questions or support issues to the mailing list:
47 I<bioperl-l@bioperl.org>
49 rather than to the module maintainer directly. Many experienced and
50 reponsive experts will be able look at the problem and quickly
51 address it. Please include a thorough description of the problem
52 with code and data examples if at all possible.
56 Report bugs to the Bioperl bug tracking system to help us keep track
57 of the bugs and their resolution. Bug reports can be submitted via the
60 https://github.com/bioperl/bioperl-live/issues
62 =head1 AUTHOR - Chris Fields
64 Email cjfields at bioperl dot org
68 The rest of the documentation details each of the object methods.
69 Internal methods are usually preceded with a _
73 # Let the code begin...
75 package Bio
::Search
::HSP
::ModelHSP
;
79 use base
qw(Bio::Search::HSP::GenericHSP);
84 Usage : my $obj = Bio::Search::HSP::ModelHSP->new();
85 Function: Builds a new Bio::Search::HSP::ModelHSP object
86 Returns : Bio::Search::HSP::ModelHSP
89 Plus Bio::Seach::HSP::ModelHSP methods
91 -algorithm => algorithm used (Infernal, RNAMotif, ERPIN, etc)
94 -bits => bit value for HSP
95 -score => score value for HSP (typically z-score but depends on
97 -hsp_length=> Length of the HSP (including gaps)
98 -identical => # of residues that that matched identically
99 -conserved => # of residues that matched conservatively
100 (only protein comparisons;
101 conserved == identical in nucleotide comparisons)
102 -hsp_gaps => # of gaps in the HSP
103 -query_gaps => # of gaps in the query in the alignment
104 -hit_gaps => # of gaps in the subject in the alignment
105 -query_name => HSP Query sequence name (if available)
106 -query_start => HSP Query start (in original query sequence coords)
107 -query_end => HSP Query end (in original query sequence coords)
108 -hit_name => HSP Hit sequence name (if available)
109 -hit_start => HSP Hit start (in original hit sequence coords)
110 -hit_end => HSP Hit end (in original hit sequence coords)
111 -hit_length => total length of the hit sequence
112 -query_length=> total length of the query sequence
113 -query_seq => query sequence portion of the HSP
114 -hit_seq => hit sequence portion of the HSP
115 -homology_seq=> homology sequence for the HSP
116 -hit_frame => hit frame (only if hit is translated protein)
117 -query_frame => query frame (only if query is translated protein)
118 -meta => optional meta data (sec structure, markup, etc)
119 -custom_score=> custom score data
126 Usage : my $meta = $hsp->meta();
127 Function: Returns meta data for this HSP or undef
128 Returns : string of meta data or undef
129 Args : [optional] string to set value
130 Note : At some point very soon this will likely be a Bio::AnnotationI.
131 Don't get used to a simple string!
136 my ($self,$value) = @_;
137 my $previous = $self->{'META'};
138 if( defined $value ) {
139 $self->{'META'} = $value;
144 =head2 noncanonical_string
146 Title : noncanonical_string
147 Usage : my $nc_seq = $hsp->noncanonical_string();
148 Function: Returns noncanonical string (NC) data for this HSP or undef
149 Returns : string of noncanonical data or undef
150 Args : [optional] string to set value
154 sub noncanonical_string
{
155 my ($self,$value) = @_;
156 my $previous = $self->{'NC_SEQ'};
157 if( defined $value ) {
158 $self->{'NC_SEQ'} = $value;
166 Usage : my $data = $hsp->custom_score();
167 Function: Returns custom_score data for this HSP, or undef
168 Returns : custom_score data or undef
169 Args : [optional] custom_score
170 Note : This is a Get/Set used to deal with odd score-like data generated
171 from RNAMotif (and other programs) where the score section
172 can be customized to include non-standard data, including sequence
173 data, user-based scores, and other values.
178 my ($self,$value) = @_;
179 my $previous = $self->{'CUSTOMSCORE'};
180 if( defined $value ) {
181 $self->{'CUSTOMSCORE'} = $value;
186 =head2 Bio::Search::HSP::HSPI methods
188 Implementation of Bio::Search::HSP::HSPI methods follow
193 Usage : my $r_type = $hsp->algorithm
194 Function: Obtain the name of the algorithm used to obtain the HSP
195 Returns : string (e.g., BLASTP)
196 Args : [optional] scalar string to set value
203 Usage : $hsp->strand('hit')
204 Function: Retrieves the strand for the HSP component requested
205 Returns : +1 or -1 (0 if unknown)
206 Args : 'hit' or 'subject' or 'sbjct' to retrieve the strand of the subject.
207 There is no strand available for 'query', as the query is a model
208 and not a true sequence.
212 # overrides HSPI::seq()
216 Usage : $hsp->seq( [seq_type] );
217 Purpose : Get the query or sbjct sequence as a Bio::Seq.pm object.
218 Example : $seqObj = $hsp->seq('sbjct');
219 Returns : Object reference for a Bio::Seq.pm object.
220 Argument : seq_type = 'query' or 'hit' or 'sbjct' (default = 'sbjct').
221 : ('sbjct' is synonymous with 'hit')
223 : Note: if there is no sequence available (eg for a model-based
224 : search), this returns a LocatableSeq object w/o a sequence
225 Throws : Propagates any exception that occurs during construction
226 : of the Bio::Seq.pm object.
227 Comments : The sequence is returned in an array of strings corresponding
228 : to the strings in the original format of the Blast alignment.
229 : (i.e., same spacing).
231 See Also : L<seq_str()|seq_str>, L<Bio::Seq>
238 my($self,$seqType) = @_;
239 $seqType ||= 'sbjct';
240 $seqType = 'sbjct' if $seqType eq 'hit';
241 my $str = $self->seq_str($seqType);
242 if( $seqType =~ /^(m|ho)/i ) {
243 $self->throw("cannot call seq on the homology match string, it isn't really a sequence, use get_aln to convert the HSP to a Bio::AlignIO and generate a consensus from that.");
245 require Bio
::LocatableSeq
;
246 my $id = $seqType =~ /^q/i ?
$self->query->seq_id : $self->hit->seq_id;
247 $str =~ s{\*\[\s*(\d+)\s*\]\*}{'N' x $1}ge;
249 my $seq = Bio
::LocatableSeq
->new (-ID
=> $id,
250 -START
=> $self->start($seqType),
251 -END => $self->end($seqType),
252 -STRAND
=> $self->strand($seqType),
253 -DESC
=> "$seqType sequence ",
255 $seq->seq($str) if $str;
262 Usage : my $pvalue = $hsp->pvalue();
263 Function: Returns the P-value for this HSP or undef
264 Returns : float or exponential (2e-10)
265 P-value is not defined with NCBI Blast2 reports.
266 Args : [optional] numeric to set value
273 Usage : my $evalue = $hsp->evalue();
274 Function: Returns the e-value for this HSP
275 Returns : float or exponential (2e-10)
276 Args : [optional] numeric to set value
283 Usage : my $gaps = $hsp->gaps( ['query'|'hit'|'total'] );
284 Function : Get the number of gaps in the query, hit, or total alignment.
285 Returns : Integer, number of gaps or 0 if none
286 Args : arg 1: 'query' = num gaps in query seq
287 'hit' = num gaps in hit seq
288 'total' = num gaps in whole alignment
290 arg 2: [optional] integer gap value to set for the type requested
297 Usage : my $qseq = $hsp->query_string;
298 Function: Retrieves the query sequence of this HSP as a string
300 Args : [optional] string to set for query sequence
307 Usage : my $hseq = $hsp->hit_string;
308 Function: Retrieves the hit sequence of this HSP as a string
310 Args : [optional] string to set for hit sequence
314 =head2 homology_string
316 Title : homology_string
317 Usage : my $homo_string = $hsp->homology_string;
318 Function: Retrieves the homology sequence for this HSP as a string.
319 : The homology sequence is the string of symbols in between the
320 : query and hit sequences in the alignment indicating the degree
321 : of conservation (e.g., identical, similar, not similar).
323 Args : [optional] string to set for homology sequence
330 Usage : my $len = $hsp->length( ['query'|'hit'|'total'] );
331 Function : Returns the length of the query or hit in the alignment
333 or the aggregate length of the HSP (including gaps;
334 this may be greater than either hit or query )
336 Args : arg 1: 'query' = length of query seq (without gaps)
337 'hit' = length of hit seq (without gaps)
338 'total' = length of alignment (with gaps)
340 arg 2: [optional] integer length value to set for specific type
347 Usage : my ($qframe, $hframe) = $hsp->frame('list',$queryframe,$subjectframe)
348 Function: Set the Frame for both query and subject and insure that
350 This overrides the frame() method implementation in
352 Returns : array of query and subject frame if return type wants an array
353 or query frame if defined or subject frame if not defined
354 Args : 'hit' or 'subject' or 'sbjct' to retrieve the frame of the subject (default)
355 'query' to retrieve the query frame
356 'list' or 'array' to retrieve both query and hit frames together
357 Note : Frames are stored in the GFF way (0-2) not 1-3
358 as they are in BLAST (negative frames are deduced by checking
359 the strand of the query or hit)
366 Usage : my $aln = $hsp->gel_aln
367 Function: Returns a Bio::SimpleAlign representing the HSP alignment
368 Returns : Bio::SimpleAlign
375 require Bio
::LocatableSeq
;
376 require Bio
::SimpleAlign
;
377 my $aln = Bio
::SimpleAlign
->new;
378 my %hsp = (hit
=> $self->hit_string,
379 midline
=> $self->homology_string,
380 query
=> $self->query_string,
381 meta
=> $self->meta);
383 # this takes care of infernal issues
384 if ($hsp{meta
} && $hsp{meta
} =~ m{~+}) {
385 $self->_postprocess_hsp(\
%hsp);
389 $self->warn("Missing query string, can't build alignment");
393 my $seqonly = $hsp{query
};
394 $seqonly =~ s/[\-\s]//g;
395 my ($q_nm,$s_nm) = ($self->query->seq_id(),
396 $self->hit->seq_id());
397 unless( defined $q_nm && CORE
::length ($q_nm) ) {
400 unless( defined $s_nm && CORE
::length ($s_nm) ) {
403 my $query = Bio
::LocatableSeq
->new('-seq' => $hsp{query
},
405 '-start' => $self->query->start,
406 '-end' => $self->query->end,
408 $seqonly = $hsp{hit
};
409 $seqonly =~ s/[\-\s]//g;
410 my $hit = Bio
::LocatableSeq
->new('-seq' => $hsp{hit
},
412 '-start' => $self->hit->start,
413 '-end' => $self->hit->end,
415 $aln->add_seq($query);
418 my $meta_obj = Bio
::Seq
::Meta
->new();
419 $meta_obj->named_meta('ss_cons', $hsp{meta
});
420 $aln->consensus_meta($meta_obj);
425 =head2 Inherited from Bio::SeqFeature::SimilarityPair
427 These methods come from Bio::SeqFeature::SimilarityPair
432 Usage : my $query = $hsp->query
433 Function: Returns a SeqFeature representing the query in the HSP
434 Returns : Bio::SeqFeature::Similarity
435 Args : [optional] new value to set
441 Usage : my $hit = $hsp->hit
442 Function: Returns a SeqFeature representing the hit in the HSP
443 Returns : Bio::SeqFeature::Similarity
444 Args : [optional] new value to set
450 Usage : $evalue = $obj->significance();
451 $obj->significance($evalue);
452 Function: Get/Set the significance value
454 Args : [optional] new value to set
460 Usage : my $score = $hsp->score();
461 Function: Returns the score for this HSP or undef
463 Args : [optional] numeric to set value
470 Usage : my $bits = $hsp->bits();
471 Function: Returns the bit value for this HSP or undef
477 =head1 ModelHSP methods overridden in ModelHSP
479 The following methods have been overridden due to their current reliance on
480 sequence-based queries. They may be implemented in future versions of this class.
488 $self->warn('$hsp->seq_inds not implemented for Model-based searches');
492 =head2 frac_identical
498 $self->warn('$hsp->frac_identical not implemented for Model-based searches');
502 =head2 frac_conserved
508 $self->warn('$hsp->frac_conserved not implemented for Model-based searches');
518 $self->warn('$hsp->matches not implemented for Model-based searches');
528 $self->warn('$hsp->num_conserved not implemented for Model-based searches');
538 $self->warn('$hsp->num_identical not implemented for Model-based searches');
549 $self->warn('$hsp->cigar_string not implemented for Model-based searches');
553 =head2 generate_cigar_string
557 sub generate_cigar_string
{
559 $self->warn('$hsp->generate_cigar_string not implemented for Model-based searches');
563 =head2 percent_identity
567 sub percent_identity
{
569 $self->warn('$hsp->percent_identity not implemented for Model-based searches');
573 ############## PRIVATE ##############
575 # the following method postprocesses HSP data in cases where the sequences
576 # aren't complete (which can trigger a validation error)
579 my $SEQ_REGEX = qr/\*\[\s*(\d+)\s*\]\*/;
580 my $META_REGEX = qr/(~+)/;
582 sub _postprocess_hsp
{
583 my ($self, $hsp) = @_;
584 $self->throw('Must pass a hash ref for HSP processing') unless ref($hsp) eq 'HASH';
586 for my $type (qw(query hit meta)) {
587 $hsp->{$type} =~ s{\s+$}{};
588 my $str = $hsp->{$type};
589 my $regex = $type eq 'meta' ?
$META_REGEX : $SEQ_REGEX;
591 while ($str =~ m{$regex}g) {
592 $ins[$ind]->{$type} = {pos => pos($str) - length($1), str
=> $1};
596 for my $chunk (reverse @ins) {
597 my ($max, $min) = ($chunk->{hit
}->{str
} >= $chunk->{query
}->{str
}) ?
598 ('hit', 'query') : ('query', 'hit');
600 $rep{$max} = 'N' x
$chunk->{$max}->{str
};
601 $rep{$min} = 'N' x
$chunk->{$min}->{str
}.
602 ('-'x
($chunk->{$max}->{str
}-$chunk->{$min}->{str
}));
603 $rep{'meta'} = '~' x
$chunk->{$max}->{str
};
604 $rep{'midline'} = ' ' x
$chunk->{$max}->{str
};
605 for my $t (qw(hit query meta midline)) {
606 substr($hsp->{$t}, $chunk->{meta
}->{pos}, length($chunk->{meta
}->{str
}) , $rep{$t});