2 # BioPerl module for Bio::Search::HSP::GenericHSP
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Jason Stajich <jason@bioperl.org>
8 # Copyright Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::Search::HSP::GenericHSP - A "Generic" implementation of a High Scoring Pair
20 my $hsp = Bio::Search::HSP::GenericHSP->new( -algorithm => 'blastp',
24 $r_type = $hsp->algorithm;
28 $evalue = $hsp->evalue();
30 $frac_id = $hsp->frac_identical( ['query'|'hit'|'total'] );
32 $frac_cons = $hsp->frac_conserved( ['query'|'hit'|'total'] );
34 $gaps = $hsp->gaps( ['query'|'hit'|'total'] );
36 $qseq = $hsp->query_string;
38 $hseq = $hsp->hit_string;
40 $homo_string = $hsp->homology_string;
42 $len = $hsp->length( ['query'|'hit'|'total'] );
44 $len = $hsp->length( ['query'|'hit'|'total'] );
48 # TODO: Describe how to configure a SearchIO stream so that it generates
53 This implementation is "Generic", meaning it is is suitable for
54 holding information about High Scoring pairs from most Search reports
55 such as BLAST and FastA. Specialized objects can be derived from
58 Unless you're writing a parser, you won't ever need to create a
59 GenericHSP or any other HSPI-implementing object. If you use
60 the SearchIO system, HSPI objects are created automatically from
61 a SearchIO stream which returns Bio::Search::Result::ResultI objects
62 and you get the HSPI objects via the ResultI API.
64 For documentation on what you can do with GenericHSP (and other HSPI
65 objects), please see the API documentation in
66 L<Bio::Search::HSP::HSPI|Bio::Search::HSP::HSPI>.
72 User feedback is an integral part of the evolution of this and other
73 Bioperl modules. Send your comments and suggestions preferably to
74 the Bioperl mailing list. Your participation is much appreciated.
76 bioperl-l@bioperl.org - General discussion
77 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
81 Please direct usage questions or support issues to the mailing list:
83 I<bioperl-l@bioperl.org>
85 rather than to the module maintainer directly. Many experienced and
86 reponsive experts will be able look at the problem and quickly
87 address it. Please include a thorough description of the problem
88 with code and data examples if at all possible.
92 Report bugs to the Bioperl bug tracking system to help us keep track
93 of the bugs and their resolution. Bug reports can be submitted via the
96 https://github.com/bioperl/bioperl-live/issues
98 =head1 AUTHOR - Jason Stajich and Steve Chervitz
100 Email jason-at-bioperl.org
101 Email sac-at-bioperl.org
105 Sendu Bala, bix@sendu.me.uk
109 The rest of the documentation details each of the object methods.
110 Internal methods are usually preceded with a _
114 # Let the code begin...
116 package Bio
::Search
::HSP
::GenericHSP
;
121 use Bio
::SeqFeature
::Similarity
;
123 use base
qw(Bio::Search::HSP::HSPI);
128 Usage : my $obj = Bio::Search::HSP::GenericHSP->new();
129 Function: Builds a new Bio::Search::HSP::GenericHSP object
130 Returns : Bio::Search::HSP::GenericHSP
131 Args : -algorithm => algorithm used (BLASTP, TBLASTX, FASTX, etc)
134 -bits => bit value for HSP
135 -score => score value for HSP (typically z-score but depends on
137 -hsp_length=> Length of the HSP (including gaps)
138 -identical => # of residues that that matched identically
139 -percent_identity => (optional) percent identity
140 -conserved => # of residues that matched conservatively
141 (only protein comparisons;
142 conserved == identical in nucleotide comparisons)
143 -hsp_gaps => # of gaps in the HSP
144 -query_gaps => # of gaps in the query in the alignment
145 -hit_gaps => # of gaps in the subject in the alignment
146 -query_name => HSP Query sequence name (if available)
147 -query_start => HSP Query start (in original query sequence coords)
148 -query_end => HSP Query end (in original query sequence coords)
149 -query_length=> total length of the query sequence
150 -query_seq => query sequence portion of the HSP
151 -query_desc => textual description of the query
152 -hit_name => HSP Hit sequence name (if available)
153 -hit_start => HSP Hit start (in original hit sequence coords)
154 -hit_end => HSP Hit end (in original hit sequence coords)
155 -hit_length => total length of the hit sequence
156 -hit_seq => hit sequence portion of the HSP
157 -hit_desc => textual description of the hit
158 -homology_seq=> homology sequence for the HSP
159 -hit_frame => hit frame (only if hit is translated protein)
160 -query_frame => query frame (only if query is translated protein)
162 -links => HSP links information (WU-BLAST only)
163 -hsp_group => HSP Group informat (WU-BLAST only)
164 -gap_symbol => symbol representing a gap (default = '-')
165 -hit_features=> string of features found in or near HSP hit
166 region (reported in some BLAST text output,
168 -stranded => If the algorithm isn't known (i.e. defaults to
169 'generic'), setting this will indicate start/end
170 coordinates are to be used to determine the strand
171 for 'query', 'subject', 'hit', 'both', or 'none'
177 my($class,%args) = @_;
179 # don't pass anything to SUPER; complex hierarchy results in lots of work
182 my $self = $class->SUPER::new
();
184 # for speed, don't use _rearrange and just store all input data directly
185 # with no method calls and no work done. work can be carried
186 # out just-in-time later if desired
187 while (my ($arg, $value) = each %args) {
188 $arg =~ tr/a-z\055/A-Z/d;
189 $self->{$arg} = $value;
191 my $bits = $self->{BITS
};
193 defined $self->{VERBOSE
} && $self->verbose($self->{VERBOSE
});
194 if (exists $self->{GAP_SYMBOL
}) {
195 # not checking anything else but the length (must be 1 as only one gap
196 # symbol allowed currently); can add in support for symbol checks or
197 # multiple symbols later if needed
198 $self->throw("Gap symbol must be of length 1") if
199 CORE
::length($self->{GAP_SYMBOL
}) != 1;
202 $self->{GAP_SYMBOL
} = '-';
204 $self->{ALGORITHM
} ||= 'GENERIC';
205 $self->{STRANDED
} ||= 'NONE';
207 if (! defined $self->{QUERY_LENGTH
} || ! defined $self->{HIT_LENGTH
}) {
208 $self->throw("Must define hit and query length");
211 $self->{'_sequenceschanged'} = 1;
213 $self->{_finished_new
} = 1;
217 sub _logical_length
{
218 my ($self, $type) = @_;
219 if (!defined($self->{_sbjct_offset
}) || !defined($self->{_query_offset
})) {
220 $self->_calculate_seq_offsets();
222 my $key = $type =~ /sbjct|hit|tot/i ?
'sbjct' : 'query';
224 my $offset = $self->{"_${key}_offset"};
225 return $self->length($type) / $offset ;
228 =head2 L<Bio::Search::HSP::HSPI> methods
230 Implementation of L<Bio::Search::HSP::HSPI> methods follow
235 Usage : my $r_type = $hsp->algorithm
236 Function: Obtain the name of the algorithm used to obtain the HSP
237 Returns : string (e.g., BLASTP)
238 Args : [optional] scalar string to set value
243 my ($self,$value) = @_;
244 my $previous = $self->{'ALGORITHM'};
245 if( defined $value || ! defined $previous ) {
246 $value = $previous = '' unless defined $value;
247 $self->{'ALGORITHM'} = $value;
256 Usage : my $pvalue = $hsp->pvalue();
257 Function: Returns the P-value for this HSP or undef
258 Returns : float or exponential (2e-10)
259 P-value is not defined with NCBI Blast2 reports.
260 Args : [optional] numeric to set value
265 my ($self,$value) = @_;
266 my $previous = $self->{'PVALUE'};
267 if( defined $value ) {
268 $self->{'PVALUE'} = $value;
276 Usage : my $evalue = $hsp->evalue();
277 Function: Returns the e-value for this HSP
278 Returns : float or exponential (2e-10)
279 Args : [optional] numeric to set value
284 my ($self,$value) = @_;
285 my $previous = $self->{'EVALUE'};
286 if( defined $value ) {
287 $self->{'EVALUE'} = $value;
292 =head2 frac_identical
294 Title : frac_identical
295 Usage : my $frac_id = $hsp->frac_identical( ['query'|'hit'|'total'] );
296 Function: Returns the fraction of identitical positions for this HSP
297 Returns : Float in range 0.0 -> 1.0
298 Args : arg 1: 'query' = num identical / length of query seq (without gaps)
299 'hit' = num identical / length of hit seq (without gaps)
300 synonyms: 'sbjct', 'subject'
301 'total' = num identical / length of alignment (with gaps)
304 arg 2: [optional] frac identical value to set for the type requested
305 Note : for translated sequences, this adjusts the length accordingly
310 my ($self, $type,$value) = @_;
312 unless ($self->{_did_prefrac
}) {
316 $type = lc $type if defined $type;
317 $type = 'hit' if( defined $type &&
318 $type =~ /subject|sbjct/);
319 $type = 'total' if( ! defined $type || $type eq 'hsp' ||
320 $type !~ /query|hit|subject|sbjct|total/);
321 my $previous = $self->{'_frac_identical'}->{$type};
322 if( defined $value || ! defined $previous ) {
323 $value = $previous = '' unless defined $value;
324 if( $type eq 'hit' || $type eq 'query' ) {
325 $self->$type()->frac_identical( $value);
327 $self->{'_frac_identical'}->{$type} = $value;
333 =head2 frac_conserved
335 Title : frac_conserved
336 Usage : my $frac_cons = $hsp->frac_conserved( ['query'|'hit'|'total'] );
337 Function : Returns the fraction of conserved positions for this HSP.
338 This is the fraction of symbols in the alignment with a
340 Returns : Float in range 0.0 -> 1.0
341 Args : arg 1: 'query' = num conserved / length of query seq (without gaps)
342 'hit' = num conserved / length of hit seq (without gaps)
343 synonyms: 'sbjct', 'subject'
344 'total' = num conserved / length of alignment (with gaps)
347 arg 2: [optional] frac conserved value to set for the type requested
352 my ($self, $type,$value) = @_;
354 unless ($self->{_did_prefrac
}) {
358 $type = lc $type if defined $type;
359 $type = 'hit' if( defined $type && $type =~ /subject|sbjct/);
360 $type = 'total' if( ! defined $type || $type eq 'hsp' ||
361 $type !~ /query|hit|subject|sbjct|total/);
362 my $previous = $self->{'_frac_conserved'}->{$type};
363 if( defined $value || ! defined $previous ) {
364 $value = $previous = '' unless defined $value;
365 $self->{'_frac_conserved'}->{$type} = $value;
373 Usage : my $gaps = $hsp->gaps( ['query'|'hit'|'total'] );
374 Function : Get the number of gap characters in the query, hit, or total alignment.
375 Returns : Integer, number of gaps or 0 if none
376 Args : arg 1: 'query' = num gap characters in query seq
377 'hit' = num gap characters in hit seq; synonyms: 'sbjct', 'subject'
378 'total' = num gap characters in whole alignment; synonyms: 'hsp'
380 arg 2: [optional] integer gap value to set for the type requested
385 my ($self, $type, $value) = @_;
387 unless ($self->{_did_pregaps
}) {
391 $type = lc $type if defined $type;
392 $type = 'total' if( ! defined $type || $type eq 'hsp' ||
393 $type !~ /query|hit|subject|sbjct|total/);
394 $type = 'hit' if $type =~ /sbjct|subject/;
395 my $previous = $self->{'_gaps'}->{$type};
396 if( defined $value || ! defined $previous ) {
397 $value = $previous = '' unless defined $value;
398 $self->{'_gaps'}->{$type} = $value;
400 return $previous || 0;
406 Usage : my $qseq = $hsp->query_string;
407 Function: Retrieves the query sequence of this HSP as a string
409 Args : [optional] string to set for query sequence
415 my ($self,$value) = @_;
416 my $previous = $self->{QUERY_SEQ
};
417 if( defined $value || ! defined $previous ) {
418 $value = $previous = '' unless defined $value;
419 $self->{QUERY_SEQ
} = $value;
420 # do some housekeeping so we know when to
421 # re-run _calculate_seq_positions
422 $self->{'_sequenceschanged'} = 1;
430 Usage : my $hseq = $hsp->hit_string;
431 Function: Retrieves the hit sequence of this HSP as a string
433 Args : [optional] string to set for hit sequence
439 my ($self,$value) = @_;
440 my $previous = $self->{HIT_SEQ
};
441 if( defined $value || ! defined $previous ) {
442 $value = $previous = '' unless defined $value;
443 $self->{HIT_SEQ
} = $value;
444 # do some housekeeping so we know when to
445 # re-run _calculate_seq_positions
446 $self->{'_sequenceschanged'} = 1;
451 =head2 homology_string
453 Title : homology_string
454 Usage : my $homo_string = $hsp->homology_string;
455 Function: Retrieves the homology sequence for this HSP as a string.
456 : The homology sequence is the string of symbols in between the
457 : query and hit sequences in the alignment indicating the degree
458 : of conservation (e.g., identical, similar, not similar).
460 Args : [optional] string to set for homology sequence
465 my ($self,$value) = @_;
466 my $previous = $self->{HOMOLOGY_SEQ
};
467 if( defined $value || ! defined $previous ) {
468 $value = $previous = '' unless defined $value;
469 $self->{HOMOLOGY_SEQ
} = $value;
470 # do some housekeeping so we know when to
471 # re-run _calculate_seq_positions
472 $self->{'_sequenceschanged'} = 1;
477 =head2 consensus_string
479 Title : consensus_string
480 Usage : my $cs_string = $hsp->consensus_string;
481 Function: Retrieves the consensus structure line for this HSP as a string (HMMER).
482 : If the model had any consensus structure or reference line annotation
483 : that it inherited from a multiple alignment (#=GC SS cons,
484 : #=GC RF annotation in Stockholm files), that information is shown
485 : as CS or RF annotation line.
487 Args : [optional] string to set for consensus structure
491 sub consensus_string
{
492 my ($self,$value) = @_;
493 my $previous = $self->{CS_SEQ
};
494 if( defined $value || ! defined $previous ) {
495 $value = $previous = '' unless defined $value;
496 $self->{CS_SEQ
} = $value;
497 # do some housekeeping so we know when to
498 # re-run _calculate_seq_positions
499 $self->{'_sequenceschanged'} = 1;
504 =head2 posterior_string
506 Title : posterior_string
507 Usage : my $pp_string = $hsp->posterior_string;
508 Function: Retrieves the posterior probability line for this HSP as a string (HMMer3).
509 : The posterior probability is the string of symbols at the bottom
510 : of the alignment indicating the expected accuracy of each aligned residue.
511 : A 0 means 0-5%, 1 means 5-15%, and so on; 9 means 85-95%,
512 : and a * means 95-100% posterior probability.
514 Args : [optional] string to set for posterior probability
518 sub posterior_string
{
519 my ($self,$value) = @_;
520 my $previous = $self->{PP_SEQ
};
521 if( defined $value || ! defined $previous ) {
522 $value = $previous = '' unless defined $value;
523 $self->{PP_SEQ
} = $value;
524 # do some housekeeping so we know when to
525 # re-run _calculate_seq_positions
526 $self->{'_sequenceschanged'} = 1;
534 Usage : my $len = $hsp->length( ['query'|'hit'|'total'] );
535 Function : Returns the length of the query or hit in the alignment
537 or the aggregate length of the HSP (including gaps;
538 this may be greater than either hit or query )
540 Args : arg 1: 'query' = length of query seq (without gaps)
541 'hit' = length of hit seq (without gaps) (synonyms: sbjct, subject)
542 'total' = length of alignment (with gaps)
544 arg 2: [optional] integer length value to set for specific type
553 $type = 'total' unless defined $type;
556 if( $type =~ /^q/i ) {
557 return $self->query()->length(shift);
558 } elsif( $type =~ /^(hit|subject|sbjct)/ ) {
559 return $self->hit()->length(shift);
563 $self->{HSP_LENGTH
} = $v;
565 return $self->{HSP_LENGTH
};
567 return 0; # should never get here
573 Usage : my $len = $hsp->hsp_length()
574 Function: shortcut length('hsp')
575 Returns : floating point between 0 and 100
580 sub hsp_length
{ return shift->length('hsp', shift); }
582 =head2 percent_identity
584 Title : percent_identity
585 Usage : my $percentid = $hsp->percent_identity()
586 Function: Returns the calculated percent identity for an HSP
587 Returns : floating point between 0 and 100
593 sub percent_identity
{
596 unless ($self->{_did_prepi
}) {
600 return $self->SUPER::percent_identity
(@_);
606 Usage : my ($qframe, $hframe) = $hsp->frame('list',$queryframe,$subjectframe)
607 Function: Set the Frame for both query and subject and insure that
609 This overrides the frame() method implementation in
611 Returns : array of query and subject frame if return type wants an array
612 or query frame if defined or subject frame if not defined
613 Args : 'hit' or 'subject' or 'sbjct' to retrieve the frame of the subject (default)
614 'query' to retrieve the query frame
615 'list' or 'array' to retrieve both query and hit frames together
616 Note : Frames are stored in the GFF way (0-2) not 1-3
617 as they are in BLAST (negative frames are deduced by checking
618 the strand of the query or hit)
622 # Note: changed 4/19/08 - bug 2485
624 # frame() is supposed to be a getter/setter (as is implied by the Function desc
625 # above; this is also consistent with Bio::SeqFeature::SimilarityPair). Also,
626 # the API is not consistent with the other HSP/SimilarityPair methods such as
627 # strand(), start(), end(), etc.
629 # In order to make this consistent with other methods all work is now done
630 # when the features are instantiated and not delayed. We compromise by
631 # defaulting back 'to hit' w/o passed args. Setting is now allowed.
637 # unfortunately, w/o args we need to warn about API changes
638 $self->warn("API for frame() has changed.\n".
639 "Please refer to documentation for Bio::Search::HSP::GenericHSP;\n".
640 "returning query frame");
645 if( $val =~ /^q/i ) {
646 return $self->query->frame(@_);
647 } elsif( $val =~ /^hi|^s/i ) {
648 return $self->hit->frame(@_);
649 } elsif ( $val =~ /^list|array/i ) {
650 return ($self->query->frame($_[0]),
651 $self->hit->frame($_[1]) );
652 } elsif ( $val =~ /^\d+$/) {
653 # old API i.e. frame($query_frame, $hit_frame). This catches all but one
654 # case, where no arg is passed (so the hit is wanted).
655 $self->warn("API for frame() has changed.\n".
656 "Please refer to documentation for Bio::Search::HSP::GenericHSP");
658 return ($self->query->frame($val),
659 $self->hit->frame(@_) ) :
660 return $self->hit->frame($val,@_);
662 $self->warn("unrecognized component '$val' requested\n");
670 Usage : my $aln = $hsp->gel_aln
671 Function: Returns a L<Bio::SimpleAlign> object representing the HSP alignment
672 Returns : L<Bio::SimpleAlign>
679 require Bio
::LocatableSeq
;
680 require Bio
::SimpleAlign
;
682 my $aln = Bio
::SimpleAlign
->new();
683 my $hs = $self->hit_string();
684 my $qs = $self->query_string();
685 # FASTA specific stuff moved to the FastaHSP object
687 $seqonly =~ s/[\-\s]//g;
688 my ($q_nm,$s_nm) = ($self->query->seq_id(),
689 $self->hit->seq_id());
690 # Should we silently change the name of the query or hit if it isn't
691 # defined? May need revisiting... cjfields 2008-12-3 (commented out below)
693 #unless( defined $q_nm && CORE::length ($q_nm) ) {
696 #unless( defined $s_nm && CORE::length ($s_nm) ) {
700 # mapping: 1 residues for every x coordinate positions
701 my $query = Bio
::LocatableSeq
->new(
704 -start
=> $self->query->start,
705 -end
=> $self->query->end,
706 -strand
=> $self->query->strand,
707 -force_nse
=> $q_nm ?
0 : 1,
708 -mapping
=> [ 1, $self->{_query_mapping
} ]
711 $seqonly =~ s/[\-\s]//g;
712 my $hit = Bio
::LocatableSeq
->new(
715 -start
=> $self->hit->start,
716 -end
=> $self->hit->end,
717 -strand
=> $self->hit->strand,
718 -force_nse
=> $s_nm ?
0 : 1,
719 -mapping
=> [ 1, $self->{_hit_mapping
} ]
721 $aln->add_seq($query);
728 Title : num_conserved
729 Usage : $obj->num_conserved($newval)
730 Function: returns the number of conserved residues in the alignment
732 Args : integer (optional)
738 my ($self,$value) = @_;
740 unless ($self->{_did_presimilar
}) {
741 $self->_pre_similar_stats;
744 if (defined $value) {
745 $self->{CONSERVED
} = $value;
747 return $self->{CONSERVED
};
752 Title : num_identical
753 Usage : $obj->num_identical($newval)
754 Function: returns the number of identical residues in the alignment
756 Args : integer (optional)
762 my ($self,$value) = @_;
764 unless ($self->{_did_presimilar
}) {
765 $self->_pre_similar_stats;
768 if( defined $value) {
769 $self->{IDENTICAL
} = $value;
771 return $self->{IDENTICAL
};
776 Usage : $hsp->rank( [string] );
777 Purpose : Get the rank of the HSP within a given Blast hit.
778 Example : $rank = $hsp->rank;
779 Returns : Integer (1..n) corresponding to the order in which the HSP
780 appears in the BLAST report.
785 my ($self,$value) = @_;
786 if( defined $value) {
787 $self->{RANK
} = $value;
789 return $self->{RANK
};
795 Purpose : Get a list of residue positions (indices) for all identical
796 : or conserved residues in the query or sbjct sequence.
797 Example : @s_ind = $hsp->seq_inds('query', 'identical');
798 : @h_ind = $hsp->seq_inds('hit', 'conserved');
799 : @h_ind = $hsp->seq_inds('hit', 'conserved-not-identical');
800 : @h_ind = $hsp->seq_inds('hit', 'conserved', 1);
801 Returns : List of integers
802 : May include ranges if collapse is true.
803 Argument : seq_type = 'query' or 'hit' or 'sbjct' (default = query)
804 : ('sbjct' is synonymous with 'hit')
805 : class = 'identical' - identical positions
806 : 'conserved' - conserved positions
807 : 'nomatch' - mismatched residue or gap positions
808 : 'mismatch' - mismatched residue positions (no gaps)
809 : 'gap' - gap positions only
810 : 'frameshift'- frameshift positions only
811 : 'conserved-not-identical' - conserved positions w/o
813 : The name can be shortened to 'id' or 'cons' unless
814 : the name is . The default value is
817 : collapse = boolean, if true, consecutive positions are merged
818 : using a range notation, e.g., "1 2 3 4 5 7 9 10 11"
819 : collapses to "1-5 7 9-11". This is useful for
820 : consolidating long lists. Default = no collapse.
823 Comments : For HSPs partially or completely derived from translated sequences
824 : (TBLASTN, BLASTX, TBLASTX, or similar), some positional data
825 : cannot easily be attributed to a single position (i.e. the
826 : positional data is ambiguous). In these cases all three codon
827 : positions are reported. Under these conditions you can check
828 : ambiguous_seq_inds() to determine whether the query, subject,
829 : or both are ambiguous.
831 See Also : L<Bio::Search::SearchUtils::collapse_nums()|Bio::Search::SearchUtils>,
832 L<Bio::Search::Hit::HitI::seq_inds()|Bio::Search::Hit::HitI>
837 my ($self, $seqType, $class, $collapse) = @_;
839 # prepare the internal structures - this is cached so
840 # if the strings have not changed we're okay
841 $self->_calculate_seq_positions();
843 $seqType ||= 'query';
844 $class ||= 'identical';
846 $seqType = 'sbjct' if $seqType eq 'hit';
847 my $t = lc(substr($seqType,0,1));
850 } elsif ( $t eq 's' || $t eq 'h' ) {
853 $self->warn("unknown seqtype $seqType using 'query'");
857 $t = lc(substr($class,0,1));
860 if( $class =~ /conserved\-not\-identical/ ) {
861 $class = 'conserved';
863 $class = 'conservedall';
865 } elsif( $t eq 'i' ) {
866 $class = 'identical';
867 } elsif( $t eq 'n' ) {
869 } elsif( $t eq 'm' ) {
871 } elsif( $t eq 'g' ) {
873 } elsif( $t eq 'f' ) {
874 $class = 'frameshift';
876 $self->warn("unknown sequence class $class using 'identical'");
877 $class = 'identical';
880 ## Sensitive to member name changes.
881 $seqType = "_\L$seqType\E";
882 $class = "_\L$class\E";
885 if( $class eq '_gap' ) {
886 # this means that we are remapping the gap length that is stored
887 # in the hash (for example $self->{'_gapRes_query'} )
888 # so we'll return an array which has the values of the position of the
889 # of the gap (the key in the hash) + the gap length (value in the
890 # hash for this key - 1.
892 # changing this; since the index is the position prior to the insertion,
893 # repeat the position based on the number of gaps inserted
894 @ary = map { my @tmp;
895 # position holds number of gaps inserted
896 for my $g (1..$self->{seqinds
}{"${class}Res$seqType"}->{$_}) {
900 sort { $a <=> $b } keys %{ $self->{seqinds
}{"${class}Res$seqType"}};
901 } elsif( $class eq '_conservedall' ) {
902 @ary = sort { $a <=> $b }
903 keys %{ $self->{seqinds
}{"_conservedRes$seqType"}},
904 keys %{ $self->{seqinds
}{"_identicalRes$seqType"}},
906 @ary = sort { $a <=> $b } keys %{ $self->{seqinds
}{"${class}Res$seqType"}};
908 require Bio
::Search
::BlastUtils
if $collapse;
910 return $collapse ?
&Bio
::Search
::SearchUtils
::collapse_nums
(@ary) : @ary;
913 =head2 ambiguous_seq_inds
915 Title : ambiguous_seq_inds
916 Purpose : returns a string denoting whether sequence indices for query,
917 : subject, or both are ambiguous
918 Returns : String; 'query', 'subject', 'query/subject', or empty string ''
920 Comments : For HSPs partially or completely derived from translated sequences
921 : (TBLASTN, BLASTX, TBLASTX, or similar), some positional data
922 : cannot easily be attributed to a single position (i.e. the
923 : positional data is ambiguous). In these cases all three codon
924 : positions are reported when using seq_inds(). Under these
925 : conditions you can check ambiguous_seq_inds() to determine whether
926 : the query, subject, or both are ambiguous.
927 See Also : L<Bio::Search::Hit::HSPI::seq_inds()>
931 sub ambiguous_seq_inds
{
933 $self->_calculate_seq_positions();
934 my $type = ($self->{_query_offset
} == 3 && $self->{_sbjct_offset
} == 3) ?
936 ($self->{_query_offset
} == 3) ?
'query' :
937 ($self->{_sbjct_offset
} == 3) ?
'subject' : '';
941 =head2 Inherited from L<Bio::SeqFeature::SimilarityPair>
943 These methods come from L<Bio::SeqFeature::SimilarityPair>
948 Usage : my $query = $hsp->query
949 Function: Returns a SeqFeature representing the query in the HSP
950 Returns : L<Bio::SeqFeature::Similarity>
951 Args : [optional] new value to set
957 unless ($self->{_created_qff
}) {
958 $self->_query_seq_feature;
960 return $self->SUPER::query
(@_);
965 if (! $self->{_finished_new
} || $self->{_making_qff
}) {
966 return $self->{_sim1
} if $self->{_sim1
};
967 $self->{_sim1
} = Bio
::SeqFeature
::Similarity
->new();
968 return $self->{_sim1
};
970 unless ($self->{_created_qff
}) {
971 $self->_query_seq_feature;
973 return $self->SUPER::feature1
(@_);
979 Usage : my $hit = $hsp->hit
980 Function: Returns a SeqFeature representing the hit in the HSP
981 Returns : L<Bio::SeqFeature::Similarity>
982 Args : [optional] new value to set
988 unless ($self->{_created_sff
}) {
989 $self->_subject_seq_feature;
991 return $self->SUPER::hit
(@_);
996 if (! $self->{_finished_new
} || $self->{_making_sff
}) {
997 return $self->{_sim2
} if $self->{_sim2
};
998 $self->{_sim2
} = Bio
::SeqFeature
::Similarity
->new();
999 return $self->{_sim2
};
1001 unless ($self->{_created_sff
}) {
1002 $self->_subject_seq_feature;
1004 return $self->SUPER::feature2
(@_);
1009 Title : significance
1010 Usage : $evalue = $obj->significance();
1011 $obj->significance($evalue);
1012 Function: Get/Set the significance value
1014 Args : [optional] new value to set
1018 # Override significance to return the e-value or, if this is
1019 # not defined (WU-BLAST), return the p-value.
1021 my ($self, $val) = @_;
1022 if (!defined $self->{SIGNIFICANCE
} || defined $val) {
1023 $self->{SIGNIFICANCE
} = defined $val ?
$val :
1024 defined $self->evalue ?
$self->evalue :
1025 defined $self->pvalue ?
$$self->pvalue :
1027 $self->query->significance($self->{SIGNIFICANCE
});
1029 return $self->{SIGNIFICANCE
};
1035 Usage : $hsp->strand('query')
1036 Function: Retrieves the strand for the HSP component requested
1038 Args : 'hit' or 'subject' or 'sbjct' to retrieve the strand of the subject,
1039 'query' to retrieve the query strand (default)
1044 my ($self,$type) = @_;
1046 if( $type =~ /^q/i && defined $self->{'QUERY_STRAND'} ) {
1047 return $self->{'QUERY_STRAND'};
1048 } elsif( $type =~ /^(hit|subject|sbjct)/i && defined $self->{'HIT_STRAND'} ) {
1049 return $self->{'HIT_STRAND'};
1052 return $self->SUPER::strand
($type)
1058 Usage : $score = $obj->score();
1059 $obj->score($value);
1060 Function: Get/Set the score value
1062 Args : [optional] new value to set
1067 Usage : $bits = $obj->bits();
1069 Function: Get/Set the bits value
1071 Args : [optional] new value to set
1073 =head1 Private methods
1077 =head2 _calculate_seq_positions
1079 Title : _calculate_seq_positions
1080 Usage : $self->_calculate_seq_positions
1081 Function: Internal function
1088 sub _calculate_seq_positions
{
1089 my ($self,@args) = @_;
1090 return unless ( $self->{'_sequenceschanged'} );
1091 $self->{'_sequenceschanged'} = 0;
1092 my ($seqString, $qseq,$sseq) = ( $self->homology_string(),
1093 $self->query_string(),
1094 $self->hit_string() );
1095 my ($mlen, $qlen, $slen) = (CORE
::length($seqString), CORE
::length($qseq), CORE
::length($sseq));
1096 my $qdir = $self->query->strand || 1;
1097 my $sdir = $self->hit->strand || 1;
1098 my ($resCount_query, $endpoint_query) = ($qdir <=0) ?
($self->query->end, $self->query->start)
1099 : ($self->query->start, $self->query->end);
1100 my ($resCount_sbjct, $endpoint_sbjct) = ($sdir <=0) ?
($self->hit->end, $self->hit->start)
1101 : ($self->hit->start, $self->hit->end);
1103 my $prog = $self->algorithm;
1105 if( $prog =~ /FAST|SSEARCH|SMITH-WATERMAN/i ) {
1107 # we infer the end of the regional sequence where the first and last
1108 # non spaces are in the homology string
1109 # then we use the HSP->length to tell us how far to read
1110 # to cut off the end of the sequence
1112 my ($start, $rest) = (0,0);
1113 if( $seqString =~ /^(\s+)?(.*?)\s*$/ ) {
1114 ($start, $rest) = ($1 ? CORE
::length($1) : 0, CORE
::length($2));
1117 $seqString = substr($seqString, $start, $rest);
1118 $qseq = substr($qseq, $start, $rest);
1119 $sseq = substr($sseq, $start, $rest);
1121 # commented out 10/29/08
1122 # removing frameshift symbols doesn't take into account the following:
1123 # 1) does not remove the same point in the homology string (get
1124 # positional errors)
1125 # 2) adjustments to the overall position in the string due to the
1126 # frameshift must be taken into consideration (get balancing errors)
1128 # Frameshifts will be handled directly in the main loop.
1131 # fasta reports some extra 'regional' sequence information
1132 # we need to clear out first
1133 # this seemed a bit insane to me at first, but it appears to
1136 #$qseq =~ s![\\\/]!!g;
1137 #$sseq =~ s![\\\/]!!g;
1140 if (!defined($self->{_sbjct_offset
}) || !defined($self->{_query_offset
})) {
1141 $self->_calculate_seq_offsets();
1144 my ($qfs, $sfs) = (0,0);
1146 for my $pos (0..CORE
::length($seqString)-1) {
1147 my @qrange = (0 - $qfs)..($self->{_query_offset
} - 1);
1148 my @srange = (0 - $sfs)..($self->{_sbjct_offset
} - 1);
1149 # $self->debug("QRange:@qrange SRange:@srange\n") if ($qfs || $sfs);
1150 ($qfs, $sfs) = (0,0);
1151 my ($mchar, $qchar, $schar) = (
1152 unpack("x$pos A1",$seqString) || ' ',
1153 $pos < CORE
::length($qseq) ?
unpack("x$pos A1",$qseq) : '-',
1154 $pos < CORE
::length($sseq) ?
unpack("x$pos A1",$sseq) : '-'
1157 my ($qgap, $sgap) = (0,0);
1158 if( $mchar eq '+' || $mchar eq '.') { # conserved
1159 $self->{seqinds
}{_conservedRes_query
}{ $resCount_query + ($_ * $qdir) } = 1 for @qrange;
1160 $self->{seqinds
}{_conservedRes_sbjct
}{ $resCount_sbjct + ($_ * $sdir) } = 1 for @srange;
1161 $matchtype = 'conserved';
1162 } elsif( $mchar eq ':' || $mchar ne ' ' ) { # identical
1163 $self->{seqinds
}{_identicalRes_query
}{ $resCount_query + ($_ * $qdir) } = 1 for @qrange;
1164 $self->{seqinds
}{_identicalRes_sbjct
}{ $resCount_sbjct + ($_ * $sdir) } = 1 for @srange;
1165 $matchtype = 'identical';
1166 } elsif( $mchar eq ' ' ) { # possible mismatch/nomatch/frameshift
1167 $qfs = $qchar eq '/' ?
-1 : # base inserted to match frame
1168 $qchar eq '\\' ?
1 : # base deleted to match frame
1170 $sfs = $schar eq '/' ?
-1 :
1171 $schar eq '\\' ?
1 :
1174 # Frameshifts are tricky.
1176 # '/' indicates that the next residue is shifted back one
1177 # (-1) frame position and is a deletion of one base (so to
1178 # correctly align, a base is inserted). That residue should only
1179 # occupy two sequence positions instead of three.
1181 # '\' indicates that the next residue is shifted forward
1182 # one (+1) frame position and is an insertion of one base (so to
1183 # correctly align, a base is removed). That residue should
1184 # occupy four sequence positions instead of three.
1186 # Note that gaps are not counted across from frameshifts.
1187 # Frameshift indices are a range of positions starting in the
1188 # previous sequence position in which the frameshift occurs;
1189 # they are ambiguous by nature.
1190 $self->{seqinds
}{_frameshiftRes_query
}{ $resCount_query - ($_ * $qdir * $qfs) } = $qfs for @qrange;
1191 $matchtype = "$qfs frameshift-query";
1195 $self->{seqinds
}{_frameshiftRes_sbjct
}{ $resCount_sbjct - ($_ * $sdir * $sfs) } = $sfs for @srange;
1196 $matchtype = "$sfs frameshift-sbcjt";
1199 elsif ($qchar eq $self->{GAP_SYMBOL
}) {
1200 # gap are counted as being in the immediately preceeding residue
1201 # position; for translated sequences this is not the start of
1202 # the previous codon, but the third codon position
1203 $self->{seqinds
}{_gapRes_query
}{ $resCount_query - $qdir }++ for @qrange;
1204 $matchtype = 'gap-query';
1207 elsif ($schar eq $self->{GAP_SYMBOL
}) {
1208 $self->{seqinds
}{_gapRes_sbjct
}{ $resCount_sbjct - $sdir }++ for @srange;
1209 $matchtype = 'gap-sbjct';
1213 # if not a gap or frameshift in either seq, must be mismatch
1214 $self->{seqinds
}{_mismatchRes_query
}{ $resCount_query + ($_ * $qdir) } = 1 for @qrange;
1215 $self->{seqinds
}{_mismatchRes_sbjct
}{ $resCount_sbjct + ($_ * $sdir) } = 1 for @srange;
1216 $matchtype = 'mismatch';
1218 # always add a nomatch unless the current position in the seq is a gap
1220 $self->{seqinds
}{_nomatchRes_sbjct
}{ $resCount_sbjct + ($_ * $sdir) } = 1 for @srange;
1223 $self->{seqinds
}{_nomatchRes_query
}{ $resCount_query + ($_ * $qdir) } = 1 for @qrange;
1226 $self->warn("Unknown midline character: [$mchar]");
1228 # leave in and uncomment for future debugging
1229 #$self->debug(sprintf("%7d %1s[%1s]%1s %-7d Type: %-20s QOff:%-2d SOff:%-2d\n",
1236 # ($self->{_query_offset} * $qdir),
1237 # ($self->{_sbjct_offset} * $sdir)));
1238 $resCount_query += ($qdir * (scalar(@qrange) + $qfs)) if !$qgap;
1239 $resCount_sbjct += ($sdir * (scalar(@srange) + $sfs)) if !$sgap;
1244 sub _calculate_seq_offsets
{
1246 my $prog = $self->algorithm;
1247 ($self->{_sbjct_offset
}, $self->{_query_offset
}) = (1,1);
1248 if($prog =~ /^(?:PSI)?T(BLAST|FAST)(N|X|Y)/oi ) {
1249 $self->{_sbjct_offset
} = 3;
1250 if ($1 eq 'BLAST' && $2 eq 'X') { #TBLASTX
1251 $self->{_query_offset
} = 3;
1253 } elsif($prog =~ /^(BLAST|FAST)(X|Y|XY)/oi ) {
1254 $self->{_query_offset
} = 3;
1261 See documentation in L<Bio::Search::HSP::HSPI::n()|Bio::Search::HSP::HSPI>
1267 if(@_) { $self->{'N'} = shift; }
1268 # note that returning 1 is completely an assumption
1269 defined $self->{'N'} ?
$self->{'N'} : 1;
1274 See documentation in L<Bio::Search::HSP::HSPI::range()|Bio::Search::HSP::HSPI>
1279 my ($self, $seqType) = @_;
1281 $seqType ||= 'query';
1282 $seqType = 'sbjct' if $seqType eq 'hit';
1285 if( $seqType eq 'query' ) {
1286 $start = $self->query->start;
1287 $end = $self->query->end;
1290 $start = $self->hit->start;
1291 $end = $self->hit->end;
1293 return ($start, $end);
1300 Usage : $obj->links($newval)
1301 Function: Get/Set the Links value (from WU-BLAST)
1302 Indicates the placement of the alignment in the group of HSPs
1303 Returns : Value of links
1304 Args : On set, new value (a scalar or undef, optional)
1312 return $self->{LINKS
} = shift if @_;
1313 return $self->{LINKS
};
1319 Usage : $obj->hsp_group($newval)
1320 Function: Get/Set the Group value (from WU-BLAST)
1321 Indicates a grouping of HSPs
1322 Returns : Value of group
1323 Args : On set, new value (a scalar or undef, optional)
1331 return $self->{HSP_GROUP
} = shift if @_;
1332 return $self->{HSP_GROUP
};
1337 Title : hit_features
1338 Usage : $obj->hit_features($newval)
1339 Function: Get/Set the HSP hit feature string (from some BLAST 2.2.13 text
1340 output), which is a string of overlapping or nearby features in HSP
1342 Returns : Value of hit features, if present
1343 Args : On set, new value (a scalar or undef, optional)
1351 return $self->{HIT_FEATURES
} = shift if @_;
1352 return $self->{HIT_FEATURES
};
1355 # The cigar string code is written by Juguang Xiao <juguang@fugu-sg.org>
1357 =head1 Brief introduction on cigar string
1359 NOTE: the concept is originally from EnsEMBL docs at
1360 http://may2005.archive.ensembl.org/Docs/wiki/html/EnsemblDocs/CigarFormat.html
1361 Please append to these docs if you have a better definition.
1363 Sequence alignment hits can be stored in a database as ungapped alignments.
1364 This imposes 2 major constraints on alignments:
1366 a) alignments for a single hit record require multiple rows in the database,
1368 b) it is not possible to accurately retrieve the exact original alignment.
1370 Alternatively, sequence alignments can be stored as gapped alignments using
1371 the CIGAR line format (where CIGAR stands for Concise Idiosyncratic Gapped
1374 In the cigar line format alignments are stored as follows:
1380 An example of an alignment for a hypthetical protein match is shown below:
1383 Query: 42 PGPAGLP----GSVGLQGPRGLRGPLP-GPLGPPL...
1387 Sbjct: 1672 PGTP*TPLVPLGPWVPLGPSSPR--LPSGPLGPTD...
1390 protein_align_feature table as the following cigar line:
1397 Usage: $cigar_string = $hsp->cigar_string
1398 Function: Generate and return cigar string for this HSP alignment
1399 Args: No input needed
1400 Return: a cigar string
1406 my ($self, $arg) = @_;
1407 $self->warn("this is not a setter") if(defined $arg);
1409 unless(defined $self->{_cigar_string
}){ # generate cigar string
1410 my $cigar_string = $self->generate_cigar_string($self->query_string, $self->hit_string);
1411 $self->{_cigar_string
} = $cigar_string;
1414 return $self->{_cigar_string
};
1417 =head2 generate_cigar_string
1419 Name: generate_cigar_string
1420 Usage: my $cigar_string = Bio::Search::HSP::GenericHSP::generate_cigar_string ($qstr, $hstr);
1421 Function: generate cigar string from a simple sequence of alignment.
1422 Args: the string of query and subject
1423 Return: cigar string
1427 sub generate_cigar_string
{
1428 my ($self, $qstr, $hstr) = @_;
1429 my @qchars = split //, $qstr;
1430 my @hchars = split //, $hstr;
1432 unless(scalar(@qchars) == scalar(@hchars)){
1433 $self->throw("two sequences are not equal in lengths");
1436 $self->{_count_for_cigar_string
} = 0;
1437 $self->{_state_for_cigar_string
} = 'M';
1439 my $cigar_string = '';
1440 for(my $i=0; $i <= $#qchars; $i++){
1441 my $qchar = $qchars[$i];
1442 my $hchar = $hchars[$i];
1443 if($qchar ne $self->{GAP_SYMBOL
} && $hchar ne $self->{GAP_SYMBOL
}){ # Match
1444 $cigar_string .= $self->_sub_cigar_string('M');
1445 }elsif($qchar eq $self->{GAP_SYMBOL
}){ # Deletion
1446 $cigar_string .= $self->_sub_cigar_string('D');
1447 }elsif($hchar eq $self->{GAP_SYMBOL
}){ # Insertion
1448 $cigar_string .= $self->_sub_cigar_string('I');
1450 $self->throw("Impossible state that 2 gaps on each seq aligned");
1453 $cigar_string .= $self->_sub_cigar_string('X'); # not forget the tail.
1454 return $cigar_string;
1457 # an internal method to help generate cigar string
1459 sub _sub_cigar_string
{
1460 my ($self, $new_state) = @_;
1462 my $sub_cigar_string = '';
1463 if($self->{_state_for_cigar_string
} eq $new_state){
1464 $self->{_count_for_cigar_string
} += 1; # Remain the state and increase the counter
1466 $sub_cigar_string .= $self->{_count_for_cigar_string
}
1467 unless $self->{_count_for_cigar_string
} == 1;
1468 $sub_cigar_string .= $self->{_state_for_cigar_string
};
1469 $self->{_count_for_cigar_string
} = 1;
1470 $self->{_state_for_cigar_string
} = $new_state;
1472 return $sub_cigar_string;
1475 # needed before seqfeatures can be made
1476 sub _pre_seq_feature
{
1478 my $algo = $self->{ALGORITHM
};
1480 my ($queryfactor, $hitfactor) = (0,0);
1481 my ($hitmap, $querymap) = (1,1);
1482 if( $algo =~ /^(?:PSI)?T(?:BLAST|FAST|SW)[NY]/oi ) {
1486 elsif ($algo =~ /^(?:FAST|BLAST)(?:X|Y|XY)/oi || $algo =~ /^P?GENEWISE/oi ) {
1490 elsif ($algo =~ /^T(BLAST|FAST|SW)(X|Y|XY)/oi || $algo =~ /^(BLAST|FAST|SW)N/oi || $algo =~ /^WABA|AXT|BLAT|BLASTZ|PSL|MEGABLAST|EXONERATE|SW|SSEARCH|SMITH\-WATERMAN|SIM4$/){
1492 $hitmap = $querymap = 3;
1497 elsif ($algo =~ /^RPS-BLAST/) {
1498 if ($algo =~ /^RPS-BLAST\(BLASTX\)/) {
1505 my $stranded = lc substr($self->{STRANDED
}, 0,1);
1506 $queryfactor = ($stranded eq 'q' || $stranded eq 'b') ?
1 : 0;
1507 $hitfactor = ($stranded eq 'h' || $stranded eq 's' || $stranded eq 'b') ?
1 : 0;
1509 $self->{_query_factor
} = $queryfactor;
1510 $self->{_hit_factor
} = $hitfactor;
1511 $self->{_hit_mapping
} = $hitmap;
1512 $self->{_query_mapping
} = $querymap;
1515 # make query seq feature
1516 sub _query_seq_feature
{
1518 $self->{_making_qff
} = 1;
1519 my $qs = $self->{QUERY_START
};
1520 my $qe = $self->{QUERY_END
};
1521 unless (defined $self->{_query_factor
}) {
1522 $self->_pre_seq_feature;
1524 my $queryfactor = $self->{_query_factor
};
1526 unless( defined $qe && defined $qs ) { $self->throw("Did not specify a Query End or Query Begin"); }
1529 if ($qe > $qs) { # normal query: start < end
1544 ($qs,$qe) = ($qe,$qs);
1547 # Note: many of these data are not query- and hit-specific.
1548 # Only start, end, name, length are.
1549 # We could be more efficient by only storing this info once.
1550 # steve chervitz --- Sat Apr 5 00:55:07 2003
1552 my $sim1 = $self->{_sim1
} || Bio
::SeqFeature
::Similarity
->new();
1555 $sim1->significance($self->{EVALUE
});
1556 $sim1->bits($self->{BITS
});
1557 $sim1->score($self->{SCORE
});
1558 $sim1->strand($strand);
1559 $sim1->seq_id($self->{QUERY_NAME
});
1560 $sim1->seqlength($self->{QUERY_LENGTH
});
1561 $sim1->source_tag($self->{ALGORITHM
});
1562 $sim1->seqdesc($self->{QUERY_DESC
});
1563 $sim1->add_tag_value('meta', $self->{META
}) if $self->can('meta');
1564 # to determine frame from something like FASTXY which doesn't
1566 my $qframe = $self->{QUERY_FRAME
};
1568 if (defined $strand && !defined $qframe && $queryfactor) {
1569 $qframe = ( $qs % 3 ) * $strand;
1571 elsif (!defined $strand) {
1575 if( $qframe =~ /^([+-])?([0-3])/ ) {
1576 my $dir = $1 || '+';
1577 if($qframe && (($dir eq '-' && $strand >= 0) || ($dir eq '+' && $strand <= 0)) ) {
1578 $self->warn("Query frame ($qframe) did not match strand of query ($strand)");
1580 $qframe = $2 != 0 ?
$2 - 1 : $2;
1582 $self->warn("Unknown query frame ($qframe)");
1586 $sim1->frame($qframe);
1587 $self->SUPER::feature1
($sim1);
1589 $self->{_created_qff
} = 1;
1590 $self->{_making_qff
} = 0;
1593 # make subject seq feature
1594 sub _subject_seq_feature
{
1596 $self->{_making_sff
} = 1;
1597 my $hs = $self->{HIT_START
};
1598 my $he = $self->{HIT_END
};
1599 unless (defined $self->{_hit_factor
}) {
1600 $self->_pre_seq_feature;
1602 my $hitfactor = $self->{_hit_factor
};
1604 unless( defined $he && defined $hs ) { $self->throw("Did not specify a Hit End or Hit Begin"); }
1607 if ($he > $hs) { # normal subject
1622 ($hs,$he) = ( $he,$hs); # reverse subject: start bigger than end
1625 my $sim2 = $self->{_sim2
} || Bio
::SeqFeature
::Similarity
->new();
1628 $sim2->significance($self->{EVALUE
});
1629 $sim2->bits($self->{BITS
});
1630 $sim2->score($self->{SCORE
});
1631 $sim2->strand($strand);
1632 $sim2->seq_id($self->{HIT_NAME
});
1633 $sim2->seqlength($self->{HIT_LENGTH
});
1634 $sim2->source_tag($self->{ALGORITHM
});
1635 $sim2->seqdesc($self->{HIT_DESC
});
1636 $sim2->add_tag_value('meta', $self->{META
}) if $self->can('meta');
1637 my $hframe = $self->{HIT_FRAME
};
1639 if (defined $strand && !defined $hframe && $hitfactor) {
1640 $hframe = ( $hs % 3 ) * $strand;
1642 elsif (!defined $strand) {
1646 if( $hframe =~ /^([+-])?([0-3])/ ) {
1647 my $dir = $1 || '+';
1648 if($hframe && (($dir eq '-' && $strand >= 0) || ($dir eq '+' && $strand <= 0)) ) {
1649 $self->warn("Subject frame ($hframe) did not match strand of subject ($strand)");
1651 $hframe = $2 != 0 ?
$2 - 1 : $2;
1653 $self->warn("Unknown subject frame ($hframe)");
1657 $sim2->frame($hframe);
1658 $self->SUPER::feature2
($sim2);
1660 $self->{_created_sff
} = 1;
1661 $self->{_making_sff
} = 0;
1664 # before calling the num_* methods
1665 sub _pre_similar_stats
{
1667 my $identical = $self->{IDENTICAL
};
1668 my $conserved = $self->{CONSERVED
};
1669 my $percent_id = $self->{PERCENT_IDENTITY
};
1671 if (! defined $identical) {
1672 if (! defined $percent_id) {
1673 $self->warn("Did not defined the number of identical matches or overall percent identity in the HSP; assuming 0");
1677 $identical = sprintf("%.0f",$percent_id * $self->{HSP_LENGTH
});
1681 if (! defined $conserved) {
1682 $self->warn("Did not define the number of conserved matches in the HSP; assuming conserved == identical ($identical)")
1683 if( $self->{ALGORITHM
} !~ /^((FAST|BLAST)N)|EXONERATE|SIM4|AXT|PSL|BLAT|BLASTZ|WABA/oi);
1684 $conserved = $identical;
1686 $self->{IDENTICAL
} = $identical;
1687 $self->{CONSERVED
} = $conserved;
1688 $self->{_did_presimilar
} = 1;
1691 # before calling the frac_* methods
1696 my $hsp_len = $self->{HSP_LENGTH
};
1697 my $hit_len = $self->{HIT_LENGTH
};
1698 my $query_len = $self->{QUERY_LENGTH
};
1700 my $identical = $self->num_identical;
1701 my $conserved = $self->num_conserved;
1703 $self->{_did_prefrac
} = 1;
1706 $self->length('total', $hsp_len);
1707 $logical = $self->_logical_length('total');
1708 $self->frac_identical( 'total', $identical / $hsp_len);
1709 $self->frac_conserved( 'total', $conserved / $hsp_len);
1712 $logical = $self->_logical_length('hit');
1713 $self->frac_identical( 'hit', $identical / $logical);
1714 $self->frac_conserved( 'hit', $conserved / $logical);
1717 $logical = $self->_logical_length('query');
1718 $self->frac_identical( 'query', $identical / $logical) ;
1719 $self->frac_conserved( 'query', $conserved / $logical);
1723 # before calling gaps()
1724 # This relies first on passed parameters (parser-dependent), then on gaps
1725 # calculated by seq_inds() (if set), then falls back to directly checking
1726 # for '-' or '.' as a last resort
1730 my $query_gaps = $self->{QUERY_GAPS
};
1731 my $query_seq = $self->{QUERY_SEQ
};
1732 my $hit_gaps = $self->{HIT_GAPS
};
1733 my $hit_seq = $self->{HIT_SEQ
};
1734 my $gaps = $self->{HSP_GAPS
};
1736 $self->{_did_pregaps
} = 1; # well, we're in the process; avoid recursion
1737 if( defined $query_gaps ) {
1738 $self->gaps('query', $query_gaps);
1739 } elsif( defined $query_seq ) {
1740 my $qg = (defined $self->{'_query_offset'}) ?
$self->seq_inds('query','gaps')
1741 : ($self->algorithm eq 'ERPIN') ?
scalar( $hit_seq =~ tr/\-//)
1742 : scalar( $query_seq =~ tr/\-\.// ); # HMMER3 and Infernal uses '.' and '-'
1743 my $offset = $self->{'_query_offset'} || 1;
1744 $self->gaps('query', $qg/$offset);
1746 if( defined $hit_gaps ) {
1747 $self->gaps('hit', $hit_gaps);
1748 } elsif( defined $hit_seq ) {
1749 my $hg = (defined $self->{'_sbjct_offset'}) ?
$self->seq_inds('hit','gaps')
1750 : ($self->algorithm eq 'ERPIN') ?
scalar( $hit_seq =~ tr/\-//)
1751 : scalar( $hit_seq =~ tr/\-\.// ); # HMMER3 and Infernal uses '.' and '-'
1752 my $offset = $self->{'_sbjct_offset'} || 1;
1753 $self->gaps('hit', $hg/$offset);
1755 if( ! defined $gaps ) {
1756 $gaps = $self->gaps("query") + $self->gaps("hit");
1758 $self->gaps('total', $gaps);
1761 # before percent_identity
1764 $self->{_did_prepi
} = 1;
1765 $self->percent_identity($self->{PERCENT_IDENTITY
} || $self->frac_identical('total')*100) if( $self->{HSP_LENGTH
} > 0 );