2 # BioPerl module for Bio::Search::HSP::GenericHSP
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Jason Stajich <jason@bioperl.org>
8 # Copyright Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::Search::HSP::GenericHSP - A "Generic" implementation of a High Scoring Pair
20 my $hsp = Bio::Search::HSP::GenericHSP->new( -algorithm => 'blastp',
24 $r_type = $hsp->algorithm;
28 $evalue = $hsp->evalue();
30 $frac_id = $hsp->frac_identical( ['query'|'hit'|'total'] );
32 $frac_cons = $hsp->frac_conserved( ['query'|'hit'|'total'] );
34 $gaps = $hsp->gaps( ['query'|'hit'|'total'] );
36 $qseq = $hsp->query_string;
38 $hseq = $hsp->hit_string;
40 $homo_string = $hsp->homology_string;
42 $len = $hsp->length( ['query'|'hit'|'total'] );
44 $len = $hsp->length( ['query'|'hit'|'total'] );
48 # TODO: Describe how to configure a SearchIO stream so that it generates
53 This implementation is "Generic", meaning it is is suitable for
54 holding information about High Scoring pairs from most Search reports
55 such as BLAST and FastA. Specialized objects can be derived from
58 Unless you're writing a parser, you won't ever need to create a
59 GenericHSP or any other HSPI-implementing object. If you use
60 the SearchIO system, HSPI objects are created automatically from
61 a SearchIO stream which returns Bio::Search::Result::ResultI objects
62 and you get the HSPI objects via the ResultI API.
64 For documentation on what you can do with GenericHSP (and other HSPI
65 objects), please see the API documentation in
66 L<Bio::Search::HSP::HSPI|Bio::Search::HSP::HSPI>.
72 User feedback is an integral part of the evolution of this and other
73 Bioperl modules. Send your comments and suggestions preferably to
74 the Bioperl mailing list. Your participation is much appreciated.
76 bioperl-l@bioperl.org - General discussion
77 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
81 Please direct usage questions or support issues to the mailing list:
83 I<bioperl-l@bioperl.org>
85 rather than to the module maintainer directly. Many experienced and
86 reponsive experts will be able look at the problem and quickly
87 address it. Please include a thorough description of the problem
88 with code and data examples if at all possible.
92 Report bugs to the Bioperl bug tracking system to help us keep track
93 of the bugs and their resolution. Bug reports can be submitted via the
96 https://github.com/bioperl/bioperl-live/issues
98 =head1 AUTHOR - Jason Stajich and Steve Chervitz
100 Email jason-at-bioperl.org
101 Email sac-at-bioperl.org
105 Sendu Bala, bix@sendu.me.uk
109 The rest of the documentation details each of the object methods.
110 Internal methods are usually preceded with a _
114 # Let the code begin...
116 package Bio
::Search
::HSP
::GenericHSP
;
120 use Bio
::SeqFeature
::Similarity
;
122 use base
qw(Bio::Search::HSP::HSPI);
127 Usage : my $obj = Bio::Search::HSP::GenericHSP->new();
128 Function: Builds a new Bio::Search::HSP::GenericHSP object
129 Returns : Bio::Search::HSP::GenericHSP
130 Args : -algorithm => algorithm used (BLASTP, TBLASTX, FASTX, etc)
133 -bits => bit value for HSP
134 -score => score value for HSP (typically z-score but depends on
136 -hsp_length=> Length of the HSP (including gaps)
137 -identical => # of residues that that matched identically
138 -percent_identity => (optional) percent identity
139 -conserved => # of residues that matched conservatively
140 (only protein comparisons;
141 conserved == identical in nucleotide comparisons)
142 -hsp_gaps => # of gaps in the HSP
143 -query_gaps => # of gaps in the query in the alignment
144 -hit_gaps => # of gaps in the subject in the alignment
145 -query_name => HSP Query sequence name (if available)
146 -query_start => HSP Query start (in original query sequence coords)
147 -query_end => HSP Query end (in original query sequence coords)
148 -query_length=> total length of the query sequence
149 -query_seq => query sequence portion of the HSP
150 -query_desc => textual description of the query
151 -hit_name => HSP Hit sequence name (if available)
152 -hit_start => HSP Hit start (in original hit sequence coords)
153 -hit_end => HSP Hit end (in original hit sequence coords)
154 -hit_length => total length of the hit sequence
155 -hit_seq => hit sequence portion of the HSP
156 -hit_desc => textual description of the hit
157 -homology_seq=> homology sequence for the HSP
158 -hit_frame => hit frame (only if hit is translated protein)
159 -query_frame => query frame (only if query is translated protein)
161 -links => HSP links information (WU-BLAST only)
162 -hsp_group => HSP Group informat (WU-BLAST only)
163 -gap_symbol => symbol representing a gap (default = '-')
164 -hit_features=> string of features found in or near HSP hit
165 region (reported in some BLAST text output,
167 -stranded => If the algorithm isn't known (i.e. defaults to
168 'generic'), setting this will indicate start/end
169 coordinates are to be used to determine the strand
170 for 'query', 'subject', 'hit', 'both', or 'none'
176 my($class,%args) = @_;
178 # don't pass anything to SUPER; complex hierarchy results in lots of work
181 my $self = $class->SUPER::new
();
183 # for speed, don't use _rearrange and just store all input data directly
184 # with no method calls and no work done. work can be carried
185 # out just-in-time later if desired
186 while (my ($arg, $value) = each %args) {
187 $arg =~ tr/a-z\055/A-Z/d;
188 $self->{$arg} = $value;
190 my $bits = $self->{BITS
};
192 defined $self->{VERBOSE
} && $self->verbose($self->{VERBOSE
});
193 if (exists $self->{GAP_SYMBOL
}) {
194 # not checking anything else but the length (must be 1 as only one gap
195 # symbol allowed currently); can add in support for symbol checks or
196 # multiple symbols later if needed
197 $self->throw("Gap symbol must be of length 1") if
198 CORE
::length($self->{GAP_SYMBOL
}) != 1;
201 $self->{GAP_SYMBOL
} = '-';
203 $self->{ALGORITHM
} ||= 'GENERIC';
204 $self->{STRANDED
} ||= 'NONE';
206 if (! defined $self->{QUERY_LENGTH
} || ! defined $self->{HIT_LENGTH
}) {
207 $self->throw("Must define hit and query length");
210 $self->{'_sequenceschanged'} = 1;
212 $self->{_finished_new
} = 1;
216 sub _logical_length
{
217 my ($self, $type) = @_;
218 if (!defined($self->{_sbjct_offset
}) || !defined($self->{_query_offset
})) {
219 $self->_calculate_seq_offsets();
221 my $key = $type =~ /sbjct|hit|tot/i ?
'sbjct' : 'query';
223 my $offset = $self->{"_${key}_offset"};
224 return $self->length($type) / $offset ;
227 =head2 L<Bio::Search::HSP::HSPI> methods
229 Implementation of L<Bio::Search::HSP::HSPI> methods follow
234 Usage : my $r_type = $hsp->algorithm
235 Function: Obtain the name of the algorithm used to obtain the HSP
236 Returns : string (e.g., BLASTP)
237 Args : [optional] scalar string to set value
242 my ($self,$value) = @_;
243 my $previous = $self->{'ALGORITHM'};
244 if( defined $value || ! defined $previous ) {
245 $value = $previous = '' unless defined $value;
246 $self->{'ALGORITHM'} = $value;
255 Usage : my $pvalue = $hsp->pvalue();
256 Function: Returns the P-value for this HSP or undef
257 Returns : float or exponential (2e-10)
258 P-value is not defined with NCBI Blast2 reports.
259 Args : [optional] numeric to set value
264 my ($self,$value) = @_;
265 my $previous = $self->{'PVALUE'};
266 if( defined $value ) {
267 $self->{'PVALUE'} = $value;
275 Usage : my $evalue = $hsp->evalue();
276 Function: Returns the e-value for this HSP
277 Returns : float or exponential (2e-10)
278 Args : [optional] numeric to set value
283 my ($self,$value) = @_;
284 my $previous = $self->{'EVALUE'};
285 if( defined $value ) {
286 $self->{'EVALUE'} = $value;
291 =head2 frac_identical
293 Title : frac_identical
294 Usage : my $frac_id = $hsp->frac_identical( ['query'|'hit'|'total'] );
295 Function: Returns the fraction of identitical positions for this HSP
296 Returns : Float in range 0.0 -> 1.0
297 Args : arg 1: 'query' = num identical / length of query seq (without gaps)
298 'hit' = num identical / length of hit seq (without gaps)
299 synonyms: 'sbjct', 'subject'
300 'total' = num identical / length of alignment (with gaps)
303 arg 2: [optional] frac identical value to set for the type requested
304 Note : for translated sequences, this adjusts the length accordingly
309 my ($self, $type,$value) = @_;
311 unless ($self->{_did_prefrac
}) {
315 $type = lc $type if defined $type;
316 $type = 'hit' if( defined $type &&
317 $type =~ /subject|sbjct/);
318 $type = 'total' if( ! defined $type || $type eq 'hsp' ||
319 $type !~ /query|hit|subject|sbjct|total/);
320 my $previous = $self->{'_frac_identical'}->{$type};
321 if( defined $value || ! defined $previous ) {
322 $value = $previous = '' unless defined $value;
323 if( $type eq 'hit' || $type eq 'query' ) {
324 $self->$type()->frac_identical( $value);
326 $self->{'_frac_identical'}->{$type} = $value;
332 =head2 frac_conserved
334 Title : frac_conserved
335 Usage : my $frac_cons = $hsp->frac_conserved( ['query'|'hit'|'total'] );
336 Function : Returns the fraction of conserved positions for this HSP.
337 This is the fraction of symbols in the alignment with a
339 Returns : Float in range 0.0 -> 1.0
340 Args : arg 1: 'query' = num conserved / length of query seq (without gaps)
341 'hit' = num conserved / length of hit seq (without gaps)
342 synonyms: 'sbjct', 'subject'
343 'total' = num conserved / length of alignment (with gaps)
346 arg 2: [optional] frac conserved value to set for the type requested
351 my ($self, $type,$value) = @_;
353 unless ($self->{_did_prefrac
}) {
357 $type = lc $type if defined $type;
358 $type = 'hit' if( defined $type && $type =~ /subject|sbjct/);
359 $type = 'total' if( ! defined $type || $type eq 'hsp' ||
360 $type !~ /query|hit|subject|sbjct|total/);
361 my $previous = $self->{'_frac_conserved'}->{$type};
362 if( defined $value || ! defined $previous ) {
363 $value = $previous = '' unless defined $value;
364 $self->{'_frac_conserved'}->{$type} = $value;
372 Usage : my $gaps = $hsp->gaps( ['query'|'hit'|'total'] );
373 Function : Get the number of gap characters in the query, hit, or total alignment.
374 Returns : Integer, number of gaps or 0 if none
375 Args : arg 1: 'query' = num gap characters in query seq
376 'hit' = num gap characters in hit seq; synonyms: 'sbjct', 'subject'
377 'total' = num gap characters in whole alignment; synonyms: 'hsp'
379 arg 2: [optional] integer gap value to set for the type requested
384 my ($self, $type, $value) = @_;
386 unless ($self->{_did_pregaps
}) {
390 $type = lc $type if defined $type;
391 $type = 'total' if( ! defined $type || $type eq 'hsp' ||
392 $type !~ /query|hit|subject|sbjct|total/);
393 $type = 'hit' if $type =~ /sbjct|subject/;
394 my $previous = $self->{'_gaps'}->{$type};
395 if( defined $value || ! defined $previous ) {
396 $value = $previous = '' unless defined $value;
397 $self->{'_gaps'}->{$type} = $value;
399 return $previous || 0;
405 Usage : my $qseq = $hsp->query_string;
406 Function: Retrieves the query sequence of this HSP as a string
408 Args : [optional] string to set for query sequence
414 my ($self,$value) = @_;
415 my $previous = $self->{QUERY_SEQ
};
416 if( defined $value || ! defined $previous ) {
417 $value = $previous = '' unless defined $value;
418 $self->{QUERY_SEQ
} = $value;
419 # do some housekeeping so we know when to
420 # re-run _calculate_seq_positions
421 $self->{'_sequenceschanged'} = 1;
429 Usage : my $hseq = $hsp->hit_string;
430 Function: Retrieves the hit sequence of this HSP as a string
432 Args : [optional] string to set for hit sequence
438 my ($self,$value) = @_;
439 my $previous = $self->{HIT_SEQ
};
440 if( defined $value || ! defined $previous ) {
441 $value = $previous = '' unless defined $value;
442 $self->{HIT_SEQ
} = $value;
443 # do some housekeeping so we know when to
444 # re-run _calculate_seq_positions
445 $self->{'_sequenceschanged'} = 1;
450 =head2 homology_string
452 Title : homology_string
453 Usage : my $homo_string = $hsp->homology_string;
454 Function: Retrieves the homology sequence for this HSP as a string.
455 : The homology sequence is the string of symbols in between the
456 : query and hit sequences in the alignment indicating the degree
457 : of conservation (e.g., identical, similar, not similar).
459 Args : [optional] string to set for homology sequence
464 my ($self,$value) = @_;
465 my $previous = $self->{HOMOLOGY_SEQ
};
466 if( defined $value || ! defined $previous ) {
467 $value = $previous = '' unless defined $value;
468 $self->{HOMOLOGY_SEQ
} = $value;
469 # do some housekeeping so we know when to
470 # re-run _calculate_seq_positions
471 $self->{'_sequenceschanged'} = 1;
476 =head2 consensus_string
478 Title : consensus_string
479 Usage : my $cs_string = $hsp->consensus_string;
480 Function: Retrieves the consensus structure line for this HSP as a string (HMMER).
481 : If the model had any consensus structure or reference line annotation
482 : that it inherited from a multiple alignment (#=GC SS cons,
483 : #=GC RF annotation in Stockholm files), that information is shown
484 : as CS or RF annotation line.
486 Args : [optional] string to set for consensus structure
490 sub consensus_string
{
491 my ($self,$value) = @_;
492 my $previous = $self->{CS_SEQ
};
493 if( defined $value || ! defined $previous ) {
494 $value = $previous = '' unless defined $value;
495 $self->{CS_SEQ
} = $value;
496 # do some housekeeping so we know when to
497 # re-run _calculate_seq_positions
498 $self->{'_sequenceschanged'} = 1;
503 =head2 posterior_string
505 Title : posterior_string
506 Usage : my $pp_string = $hsp->posterior_string;
507 Function: Retrieves the posterior probability line for this HSP as a string (HMMer3).
508 : The posterior probability is the string of symbols at the bottom
509 : of the alignment indicating the expected accuracy of each aligned residue.
510 : A 0 means 0-5%, 1 means 5-15%, and so on; 9 means 85-95%,
511 : and a * means 95-100% posterior probability.
513 Args : [optional] string to set for posterior probability
517 sub posterior_string
{
518 my ($self,$value) = @_;
519 my $previous = $self->{PP_SEQ
};
520 if( defined $value || ! defined $previous ) {
521 $value = $previous = '' unless defined $value;
522 $self->{PP_SEQ
} = $value;
523 # do some housekeeping so we know when to
524 # re-run _calculate_seq_positions
525 $self->{'_sequenceschanged'} = 1;
533 Usage : my $len = $hsp->length( ['query'|'hit'|'total'] );
534 Function : Returns the length of the query or hit in the alignment
536 or the aggregate length of the HSP (including gaps;
537 this may be greater than either hit or query )
539 Args : arg 1: 'query' = length of query seq (without gaps)
540 'hit' = length of hit seq (without gaps) (synonyms: sbjct, subject)
541 'total' = length of alignment (with gaps)
543 arg 2: [optional] integer length value to set for specific type
552 $type = 'total' unless defined $type;
555 if( $type =~ /^q/i ) {
556 return $self->query()->length(shift);
557 } elsif( $type =~ /^(hit|subject|sbjct)/ ) {
558 return $self->hit()->length(shift);
562 $self->{HSP_LENGTH
} = $v;
564 return $self->{HSP_LENGTH
};
566 return 0; # should never get here
572 Usage : my $len = $hsp->hsp_length()
573 Function: shortcut length('hsp')
574 Returns : floating point between 0 and 100
579 sub hsp_length
{ return shift->length('hsp', shift); }
581 =head2 percent_identity
583 Title : percent_identity
584 Usage : my $percentid = $hsp->percent_identity()
585 Function: Returns the calculated percent identity for an HSP
586 Returns : floating point between 0 and 100
592 sub percent_identity
{
595 unless ($self->{_did_prepi
}) {
599 return $self->SUPER::percent_identity
(@_);
605 Usage : my ($qframe, $hframe) = $hsp->frame('list',$queryframe,$subjectframe)
606 Function: Set the Frame for both query and subject and insure that
608 This overrides the frame() method implementation in
610 Returns : array of query and subject frame if return type wants an array
611 or query frame if defined or subject frame if not defined
612 Args : 'hit' or 'subject' or 'sbjct' to retrieve the frame of the subject (default)
613 'query' to retrieve the query frame
614 'list' or 'array' to retrieve both query and hit frames together
615 Note : Frames are stored in the GFF way (0-2) not 1-3
616 as they are in BLAST (negative frames are deduced by checking
617 the strand of the query or hit)
621 # Note: changed 4/19/08 - bug 2485
623 # frame() is supposed to be a getter/setter (as is implied by the Function desc
624 # above; this is also consistent with Bio::SeqFeature::SimilarityPair). Also,
625 # the API is not consistent with the other HSP/SimilarityPair methods such as
626 # strand(), start(), end(), etc.
628 # In order to make this consistent with other methods all work is now done
629 # when the features are instantiated and not delayed. We compromise by
630 # defaulting back 'to hit' w/o passed args. Setting is now allowed.
636 # unfortunately, w/o args we need to warn about API changes
637 $self->warn("API for frame() has changed.\n".
638 "Please refer to documentation for Bio::Search::HSP::GenericHSP;\n".
639 "returning query frame");
644 if( $val =~ /^q/i ) {
645 return $self->query->frame(@_);
646 } elsif( $val =~ /^hi|^s/i ) {
647 return $self->hit->frame(@_);
648 } elsif ( $val =~ /^list|array/i ) {
649 return ($self->query->frame($_[0]),
650 $self->hit->frame($_[1]) );
651 } elsif ( $val =~ /^\d+$/) {
652 # old API i.e. frame($query_frame, $hit_frame). This catches all but one
653 # case, where no arg is passed (so the hit is wanted).
654 $self->warn("API for frame() has changed.\n".
655 "Please refer to documentation for Bio::Search::HSP::GenericHSP");
657 return ($self->query->frame($val),
658 $self->hit->frame(@_) ) :
659 return $self->hit->frame($val,@_);
661 $self->warn("unrecognized component '$val' requested\n");
669 Usage : my $aln = $hsp->gel_aln
670 Function: Returns a L<Bio::SimpleAlign> object representing the HSP alignment
671 Returns : L<Bio::SimpleAlign>
678 require Bio
::LocatableSeq
;
679 require Bio
::SimpleAlign
;
681 my $aln = Bio
::SimpleAlign
->new();
682 my $hs = $self->hit_string();
683 my $qs = $self->query_string();
684 # FASTA specific stuff moved to the FastaHSP object
686 $seqonly =~ s/[\-\s]//g;
687 my ($q_nm,$s_nm) = ($self->query->seq_id(),
688 $self->hit->seq_id());
689 # Should we silently change the name of the query or hit if it isn't
690 # defined? May need revisiting... cjfields 2008-12-3 (commented out below)
692 #unless( defined $q_nm && CORE::length ($q_nm) ) {
695 #unless( defined $s_nm && CORE::length ($s_nm) ) {
699 # mapping: 1 residues for every x coordinate positions
700 my $query = Bio
::LocatableSeq
->new(
703 -start
=> $self->query->start,
704 -end
=> $self->query->end,
705 -strand
=> $self->query->strand,
706 -force_nse
=> $q_nm ?
0 : 1,
707 -mapping
=> [ 1, $self->{_query_mapping
} ]
710 $seqonly =~ s/[\-\s]//g;
711 my $hit = Bio
::LocatableSeq
->new(
714 -start
=> $self->hit->start,
715 -end
=> $self->hit->end,
716 -strand
=> $self->hit->strand,
717 -force_nse
=> $s_nm ?
0 : 1,
718 -mapping
=> [ 1, $self->{_hit_mapping
} ]
720 $aln->add_seq($query);
727 Title : num_conserved
728 Usage : $obj->num_conserved($newval)
729 Function: returns the number of conserved residues in the alignment
731 Args : integer (optional)
737 my ($self,$value) = @_;
739 unless ($self->{_did_presimilar
}) {
740 $self->_pre_similar_stats;
743 if (defined $value) {
744 $self->{CONSERVED
} = $value;
746 return $self->{CONSERVED
};
751 Title : num_identical
752 Usage : $obj->num_identical($newval)
753 Function: returns the number of identical residues in the alignment
755 Args : integer (optional)
761 my ($self,$value) = @_;
763 unless ($self->{_did_presimilar
}) {
764 $self->_pre_similar_stats;
767 if( defined $value) {
768 $self->{IDENTICAL
} = $value;
770 return $self->{IDENTICAL
};
775 Usage : $hsp->rank( [string] );
776 Purpose : Get the rank of the HSP within a given Blast hit.
777 Example : $rank = $hsp->rank;
778 Returns : Integer (1..n) corresponding to the order in which the HSP
779 appears in the BLAST report.
784 my ($self,$value) = @_;
785 if( defined $value) {
786 $self->{RANK
} = $value;
788 return $self->{RANK
};
794 Purpose : Get a list of residue positions (indices) for all identical
795 : or conserved residues in the query or sbjct sequence.
796 Example : @s_ind = $hsp->seq_inds('query', 'identical');
797 : @h_ind = $hsp->seq_inds('hit', 'conserved');
798 : @h_ind = $hsp->seq_inds('hit', 'conserved-not-identical');
799 : @h_ind = $hsp->seq_inds('hit', 'conserved', 1);
800 Returns : List of integers
801 : May include ranges if collapse is true.
802 Argument : seq_type = 'query' or 'hit' or 'sbjct' (default = query)
803 : ('sbjct' is synonymous with 'hit')
804 : class = 'identical' - identical positions
805 : 'conserved' - conserved positions
806 : 'nomatch' - mismatched residue or gap positions
807 : 'mismatch' - mismatched residue positions (no gaps)
808 : 'gap' - gap positions only
809 : 'frameshift'- frameshift positions only
810 : 'conserved-not-identical' - conserved positions w/o
812 : The name can be shortened to 'id' or 'cons' unless
813 : the name is . The default value is
816 : collapse = boolean, if true, consecutive positions are merged
817 : using a range notation, e.g., "1 2 3 4 5 7 9 10 11"
818 : collapses to "1-5 7 9-11". This is useful for
819 : consolidating long lists. Default = no collapse.
822 Comments : For HSPs partially or completely derived from translated sequences
823 : (TBLASTN, BLASTX, TBLASTX, or similar), some positional data
824 : cannot easily be attributed to a single position (i.e. the
825 : positional data is ambiguous). In these cases all three codon
826 : positions are reported. Under these conditions you can check
827 : ambiguous_seq_inds() to determine whether the query, subject,
828 : or both are ambiguous.
830 See Also : L<Bio::Search::SearchUtils::collapse_nums()|Bio::Search::SearchUtils>,
831 L<Bio::Search::Hit::HitI::seq_inds()|Bio::Search::Hit::HitI>
836 my ($self, $seqType, $class, $collapse) = @_;
838 # prepare the internal structures - this is cached so
839 # if the strings have not changed we're okay
840 $self->_calculate_seq_positions();
842 $seqType ||= 'query';
843 $class ||= 'identical';
845 $seqType = 'sbjct' if $seqType eq 'hit';
846 my $t = lc(substr($seqType,0,1));
849 } elsif ( $t eq 's' || $t eq 'h' ) {
852 $self->warn("unknown seqtype $seqType using 'query'");
856 $t = lc(substr($class,0,1));
859 if( $class =~ /conserved\-not\-identical/ ) {
860 $class = 'conserved';
862 $class = 'conservedall';
864 } elsif( $t eq 'i' ) {
865 $class = 'identical';
866 } elsif( $t eq 'n' ) {
868 } elsif( $t eq 'm' ) {
870 } elsif( $t eq 'g' ) {
872 } elsif( $t eq 'f' ) {
873 $class = 'frameshift';
875 $self->warn("unknown sequence class $class using 'identical'");
876 $class = 'identical';
879 ## Sensitive to member name changes.
880 $seqType = "_\L$seqType\E";
881 $class = "_\L$class\E";
884 if( $class eq '_gap' ) {
885 # this means that we are remapping the gap length that is stored
886 # in the hash (for example $self->{'_gapRes_query'} )
887 # so we'll return an array which has the values of the position of the
888 # of the gap (the key in the hash) + the gap length (value in the
889 # hash for this key - 1.
891 # changing this; since the index is the position prior to the insertion,
892 # repeat the position based on the number of gaps inserted
893 @ary = map { my @tmp;
894 # position holds number of gaps inserted
895 for my $g (1..$self->{seqinds
}{"${class}Res$seqType"}->{$_}) {
899 sort { $a <=> $b } keys %{ $self->{seqinds
}{"${class}Res$seqType"}};
900 } elsif( $class eq '_conservedall' ) {
901 @ary = sort { $a <=> $b }
902 keys %{ $self->{seqinds
}{"_conservedRes$seqType"}},
903 keys %{ $self->{seqinds
}{"_identicalRes$seqType"}},
905 @ary = sort { $a <=> $b } keys %{ $self->{seqinds
}{"${class}Res$seqType"}};
907 require Bio
::Search
::BlastUtils
if $collapse;
909 return $collapse ?
&Bio
::Search
::SearchUtils
::collapse_nums
(@ary) : @ary;
912 =head2 ambiguous_seq_inds
914 Title : ambiguous_seq_inds
915 Purpose : returns a string denoting whether sequence indices for query,
916 : subject, or both are ambiguous
917 Returns : String; 'query', 'subject', 'query/subject', or empty string ''
919 Comments : For HSPs partially or completely derived from translated sequences
920 : (TBLASTN, BLASTX, TBLASTX, or similar), some positional data
921 : cannot easily be attributed to a single position (i.e. the
922 : positional data is ambiguous). In these cases all three codon
923 : positions are reported when using seq_inds(). Under these
924 : conditions you can check ambiguous_seq_inds() to determine whether
925 : the query, subject, or both are ambiguous.
926 See Also : L<Bio::Search::Hit::HSPI::seq_inds()>
930 sub ambiguous_seq_inds
{
932 $self->_calculate_seq_positions();
933 my $type = ($self->{_query_offset
} == 3 && $self->{_sbjct_offset
} == 3) ?
935 ($self->{_query_offset
} == 3) ?
'query' :
936 ($self->{_sbjct_offset
} == 3) ?
'subject' : '';
940 =head2 Inherited from L<Bio::SeqFeature::SimilarityPair>
942 These methods come from L<Bio::SeqFeature::SimilarityPair>
947 Usage : my $query = $hsp->query
948 Function: Returns a SeqFeature representing the query in the HSP
949 Returns : L<Bio::SeqFeature::Similarity>
950 Args : [optional] new value to set
956 unless ($self->{_created_qff
}) {
957 $self->_query_seq_feature;
959 return $self->SUPER::query
(@_);
964 if (! $self->{_finished_new
} || $self->{_making_qff
}) {
965 return $self->{_sim1
} if $self->{_sim1
};
966 $self->{_sim1
} = Bio
::SeqFeature
::Similarity
->new();
967 return $self->{_sim1
};
969 unless ($self->{_created_qff
}) {
970 $self->_query_seq_feature;
972 return $self->SUPER::feature1
(@_);
978 Usage : my $hit = $hsp->hit
979 Function: Returns a SeqFeature representing the hit in the HSP
980 Returns : L<Bio::SeqFeature::Similarity>
981 Args : [optional] new value to set
987 unless ($self->{_created_sff
}) {
988 $self->_subject_seq_feature;
990 return $self->SUPER::hit
(@_);
995 if (! $self->{_finished_new
} || $self->{_making_sff
}) {
996 return $self->{_sim2
} if $self->{_sim2
};
997 $self->{_sim2
} = Bio
::SeqFeature
::Similarity
->new();
998 return $self->{_sim2
};
1000 unless ($self->{_created_sff
}) {
1001 $self->_subject_seq_feature;
1003 return $self->SUPER::feature2
(@_);
1008 Title : significance
1009 Usage : $evalue = $obj->significance();
1010 $obj->significance($evalue);
1011 Function: Get/Set the significance value
1013 Args : [optional] new value to set
1017 # Override significance to return the e-value or, if this is
1018 # not defined (WU-BLAST), return the p-value.
1020 my ($self, $val) = @_;
1021 if (!defined $self->{SIGNIFICANCE
} || defined $val) {
1022 $self->{SIGNIFICANCE
} = defined $val ?
$val :
1023 defined $self->evalue ?
$self->evalue :
1024 defined $self->pvalue ?
$$self->pvalue :
1026 $self->query->significance($self->{SIGNIFICANCE
});
1028 return $self->{SIGNIFICANCE
};
1034 Usage : $hsp->strand('query')
1035 Function: Retrieves the strand for the HSP component requested
1037 Args : 'hit' or 'subject' or 'sbjct' to retrieve the strand of the subject,
1038 'query' to retrieve the query strand (default)
1043 my ($self,$type) = @_;
1045 if( $type =~ /^q/i && defined $self->{'QUERY_STRAND'} ) {
1046 return $self->{'QUERY_STRAND'};
1047 } elsif( $type =~ /^(hit|subject|sbjct)/i && defined $self->{'HIT_STRAND'} ) {
1048 return $self->{'HIT_STRAND'};
1051 return $self->SUPER::strand
($type)
1057 Usage : $score = $obj->score();
1058 $obj->score($value);
1059 Function: Get/Set the score value
1061 Args : [optional] new value to set
1066 Usage : $bits = $obj->bits();
1068 Function: Get/Set the bits value
1070 Args : [optional] new value to set
1072 =head1 Private methods
1076 =head2 _calculate_seq_positions
1078 Title : _calculate_seq_positions
1079 Usage : $self->_calculate_seq_positions
1080 Function: Internal function
1087 sub _calculate_seq_positions
{
1088 my ($self,@args) = @_;
1089 return unless ( $self->{'_sequenceschanged'} );
1090 $self->{'_sequenceschanged'} = 0;
1091 my ($seqString, $qseq,$sseq) = ( $self->homology_string(),
1092 $self->query_string(),
1093 $self->hit_string() );
1094 my ($mlen, $qlen, $slen) = (CORE
::length($seqString), CORE
::length($qseq), CORE
::length($sseq));
1095 my $qdir = $self->query->strand || 1;
1096 my $sdir = $self->hit->strand || 1;
1097 my ($resCount_query, $endpoint_query) = ($qdir <=0) ?
($self->query->end, $self->query->start)
1098 : ($self->query->start, $self->query->end);
1099 my ($resCount_sbjct, $endpoint_sbjct) = ($sdir <=0) ?
($self->hit->end, $self->hit->start)
1100 : ($self->hit->start, $self->hit->end);
1102 my $prog = $self->algorithm;
1104 if( $prog =~ /FAST|SSEARCH|SMITH-WATERMAN/i ) {
1106 # we infer the end of the regional sequence where the first and last
1107 # non spaces are in the homology string
1108 # then we use the HSP->length to tell us how far to read
1109 # to cut off the end of the sequence
1111 my ($start, $rest) = (0,0);
1112 if( $seqString =~ /^(\s+)?(.*?)\s*$/ ) {
1113 ($start, $rest) = ($1 ? CORE
::length($1) : 0, CORE
::length($2));
1116 $seqString = substr($seqString, $start, $rest);
1117 $qseq = substr($qseq, $start, $rest);
1118 $sseq = substr($sseq, $start, $rest);
1120 # commented out 10/29/08
1121 # removing frameshift symbols doesn't take into account the following:
1122 # 1) does not remove the same point in the homology string (get
1123 # positional errors)
1124 # 2) adjustments to the overall position in the string due to the
1125 # frameshift must be taken into consideration (get balancing errors)
1127 # Frameshifts will be handled directly in the main loop.
1130 # fasta reports some extra 'regional' sequence information
1131 # we need to clear out first
1132 # this seemed a bit insane to me at first, but it appears to
1135 #$qseq =~ s![\\\/]!!g;
1136 #$sseq =~ s![\\\/]!!g;
1139 if (!defined($self->{_sbjct_offset
}) || !defined($self->{_query_offset
})) {
1140 $self->_calculate_seq_offsets();
1143 my ($qfs, $sfs) = (0,0);
1145 for my $pos (0..CORE
::length($seqString)-1) {
1146 my @qrange = (0 - $qfs)..($self->{_query_offset
} - 1);
1147 my @srange = (0 - $sfs)..($self->{_sbjct_offset
} - 1);
1148 # $self->debug("QRange:@qrange SRange:@srange\n") if ($qfs || $sfs);
1149 ($qfs, $sfs) = (0,0);
1150 my ($mchar, $qchar, $schar) = (
1151 unpack("x$pos A1",$seqString) || ' ',
1152 $pos < CORE
::length($qseq) ?
unpack("x$pos A1",$qseq) : '-',
1153 $pos < CORE
::length($sseq) ?
unpack("x$pos A1",$sseq) : '-'
1156 my ($qgap, $sgap) = (0,0);
1157 if( $mchar eq '+' || $mchar eq '.') { # conserved
1158 $self->{seqinds
}{_conservedRes_query
}{ $resCount_query + ($_ * $qdir) } = 1 for @qrange;
1159 $self->{seqinds
}{_conservedRes_sbjct
}{ $resCount_sbjct + ($_ * $sdir) } = 1 for @srange;
1160 $matchtype = 'conserved';
1161 } elsif( $mchar eq ':' || $mchar ne ' ' ) { # identical
1162 $self->{seqinds
}{_identicalRes_query
}{ $resCount_query + ($_ * $qdir) } = 1 for @qrange;
1163 $self->{seqinds
}{_identicalRes_sbjct
}{ $resCount_sbjct + ($_ * $sdir) } = 1 for @srange;
1164 $matchtype = 'identical';
1165 } elsif( $mchar eq ' ' ) { # possible mismatch/nomatch/frameshift
1166 $qfs = $qchar eq '/' ?
-1 : # base inserted to match frame
1167 $qchar eq '\\' ?
1 : # base deleted to match frame
1169 $sfs = $schar eq '/' ?
-1 :
1170 $schar eq '\\' ?
1 :
1173 # Frameshifts are tricky.
1175 # '/' indicates that the next residue is shifted back one
1176 # (-1) frame position and is a deletion of one base (so to
1177 # correctly align, a base is inserted). That residue should only
1178 # occupy two sequence positions instead of three.
1180 # '\' indicates that the next residue is shifted forward
1181 # one (+1) frame position and is an insertion of one base (so to
1182 # correctly align, a base is removed). That residue should
1183 # occupy four sequence positions instead of three.
1185 # Note that gaps are not counted across from frameshifts.
1186 # Frameshift indices are a range of positions starting in the
1187 # previous sequence position in which the frameshift occurs;
1188 # they are ambiguous by nature.
1189 $self->{seqinds
}{_frameshiftRes_query
}{ $resCount_query - ($_ * $qdir * $qfs) } = $qfs for @qrange;
1190 $matchtype = "$qfs frameshift-query";
1194 $self->{seqinds
}{_frameshiftRes_sbjct
}{ $resCount_sbjct - ($_ * $sdir * $sfs) } = $sfs for @srange;
1195 $matchtype = "$sfs frameshift-sbcjt";
1198 elsif ($qchar eq $self->{GAP_SYMBOL
}) {
1199 # gap are counted as being in the immediately preceeding residue
1200 # position; for translated sequences this is not the start of
1201 # the previous codon, but the third codon position
1202 $self->{seqinds
}{_gapRes_query
}{ $resCount_query - $qdir }++ for @qrange;
1203 $matchtype = 'gap-query';
1206 elsif ($schar eq $self->{GAP_SYMBOL
}) {
1207 $self->{seqinds
}{_gapRes_sbjct
}{ $resCount_sbjct - $sdir }++ for @srange;
1208 $matchtype = 'gap-sbjct';
1212 # if not a gap or frameshift in either seq, must be mismatch
1213 $self->{seqinds
}{_mismatchRes_query
}{ $resCount_query + ($_ * $qdir) } = 1 for @qrange;
1214 $self->{seqinds
}{_mismatchRes_sbjct
}{ $resCount_sbjct + ($_ * $sdir) } = 1 for @srange;
1215 $matchtype = 'mismatch';
1217 # always add a nomatch unless the current position in the seq is a gap
1219 $self->{seqinds
}{_nomatchRes_sbjct
}{ $resCount_sbjct + ($_ * $sdir) } = 1 for @srange;
1222 $self->{seqinds
}{_nomatchRes_query
}{ $resCount_query + ($_ * $qdir) } = 1 for @qrange;
1225 $self->warn("Unknown midline character: [$mchar]");
1227 # leave in and uncomment for future debugging
1228 #$self->debug(sprintf("%7d %1s[%1s]%1s %-7d Type: %-20s QOff:%-2d SOff:%-2d\n",
1235 # ($self->{_query_offset} * $qdir),
1236 # ($self->{_sbjct_offset} * $sdir)));
1237 $resCount_query += ($qdir * (scalar(@qrange) + $qfs)) if !$qgap;
1238 $resCount_sbjct += ($sdir * (scalar(@srange) + $sfs)) if !$sgap;
1243 sub _calculate_seq_offsets
{
1245 my $prog = $self->algorithm;
1246 ($self->{_sbjct_offset
}, $self->{_query_offset
}) = (1,1);
1247 if($prog =~ /^(?:PSI)?T(BLAST|FAST)(N|X|Y)/oi ) {
1248 $self->{_sbjct_offset
} = 3;
1249 if ($1 eq 'BLAST' && $2 eq 'X') { #TBLASTX
1250 $self->{_query_offset
} = 3;
1252 } elsif($prog =~ /^(BLAST|FAST)(X|Y|XY)/oi ) {
1253 $self->{_query_offset
} = 3;
1260 See documentation in L<Bio::Search::HSP::HSPI::n()|Bio::Search::HSP::HSPI>
1266 if(@_) { $self->{'N'} = shift; }
1267 # note that returning 1 is completely an assumption
1268 defined $self->{'N'} ?
$self->{'N'} : 1;
1273 See documentation in L<Bio::Search::HSP::HSPI::range()|Bio::Search::HSP::HSPI>
1278 my ($self, $seqType) = @_;
1280 $seqType ||= 'query';
1281 $seqType = 'sbjct' if $seqType eq 'hit';
1284 if( $seqType eq 'query' ) {
1285 $start = $self->query->start;
1286 $end = $self->query->end;
1289 $start = $self->hit->start;
1290 $end = $self->hit->end;
1292 return ($start, $end);
1299 Usage : $obj->links($newval)
1300 Function: Get/Set the Links value (from WU-BLAST)
1301 Indicates the placement of the alignment in the group of HSPs
1302 Returns : Value of links
1303 Args : On set, new value (a scalar or undef, optional)
1311 return $self->{LINKS
} = shift if @_;
1312 return $self->{LINKS
};
1318 Usage : $obj->hsp_group($newval)
1319 Function: Get/Set the Group value (from WU-BLAST)
1320 Indicates a grouping of HSPs
1321 Returns : Value of group
1322 Args : On set, new value (a scalar or undef, optional)
1330 return $self->{HSP_GROUP
} = shift if @_;
1331 return $self->{HSP_GROUP
};
1336 Title : hit_features
1337 Usage : $obj->hit_features($newval)
1338 Function: Get/Set the HSP hit feature string (from some BLAST 2.2.13 text
1339 output), which is a string of overlapping or nearby features in HSP
1341 Returns : Value of hit features, if present
1342 Args : On set, new value (a scalar or undef, optional)
1350 return $self->{HIT_FEATURES
} = shift if @_;
1351 return $self->{HIT_FEATURES
};
1354 # The cigar string code is written by Juguang Xiao <juguang@fugu-sg.org>
1356 =head1 Brief introduction on cigar string
1358 NOTE: the concept is originally from EnsEMBL docs at
1359 http://may2005.archive.ensembl.org/Docs/wiki/html/EnsemblDocs/CigarFormat.html
1360 Please append to these docs if you have a better definition.
1362 Sequence alignment hits can be stored in a database as ungapped alignments.
1363 This imposes 2 major constraints on alignments:
1365 a) alignments for a single hit record require multiple rows in the database,
1367 b) it is not possible to accurately retrieve the exact original alignment.
1369 Alternatively, sequence alignments can be stored as gapped alignments using
1370 the CIGAR line format (where CIGAR stands for Concise Idiosyncratic Gapped
1373 In the cigar line format alignments are stored as follows:
1379 An example of an alignment for a hypthetical protein match is shown below:
1382 Query: 42 PGPAGLP----GSVGLQGPRGLRGPLP-GPLGPPL...
1386 Sbjct: 1672 PGTP*TPLVPLGPWVPLGPSSPR--LPSGPLGPTD...
1389 protein_align_feature table as the following cigar line:
1396 Usage: $cigar_string = $hsp->cigar_string
1397 Function: Generate and return cigar string for this HSP alignment
1398 Args: No input needed
1399 Return: a cigar string
1405 my ($self, $arg) = @_;
1406 $self->warn("this is not a setter") if(defined $arg);
1408 unless(defined $self->{_cigar_string
}){ # generate cigar string
1409 my $cigar_string = $self->generate_cigar_string($self->query_string, $self->hit_string);
1410 $self->{_cigar_string
} = $cigar_string;
1413 return $self->{_cigar_string
};
1416 =head2 generate_cigar_string
1418 Name: generate_cigar_string
1419 Usage: my $cigar_string = Bio::Search::HSP::GenericHSP::generate_cigar_string ($qstr, $hstr);
1420 Function: generate cigar string from a simple sequence of alignment.
1421 Args: the string of query and subject
1422 Return: cigar string
1426 sub generate_cigar_string
{
1427 my ($self, $qstr, $hstr) = @_;
1428 my @qchars = split //, $qstr;
1429 my @hchars = split //, $hstr;
1431 unless(scalar(@qchars) == scalar(@hchars)){
1432 $self->throw("two sequences are not equal in lengths");
1435 $self->{_count_for_cigar_string
} = 0;
1436 $self->{_state_for_cigar_string
} = 'M';
1438 my $cigar_string = '';
1439 for(my $i=0; $i <= $#qchars; $i++){
1440 my $qchar = $qchars[$i];
1441 my $hchar = $hchars[$i];
1442 if($qchar ne $self->{GAP_SYMBOL
} && $hchar ne $self->{GAP_SYMBOL
}){ # Match
1443 $cigar_string .= $self->_sub_cigar_string('M');
1444 }elsif($qchar eq $self->{GAP_SYMBOL
}){ # Deletion
1445 $cigar_string .= $self->_sub_cigar_string('D');
1446 }elsif($hchar eq $self->{GAP_SYMBOL
}){ # Insertion
1447 $cigar_string .= $self->_sub_cigar_string('I');
1449 $self->throw("Impossible state that 2 gaps on each seq aligned");
1452 $cigar_string .= $self->_sub_cigar_string('X'); # not forget the tail.
1453 return $cigar_string;
1456 # an internal method to help generate cigar string
1458 sub _sub_cigar_string
{
1459 my ($self, $new_state) = @_;
1461 my $sub_cigar_string = '';
1462 if($self->{_state_for_cigar_string
} eq $new_state){
1463 $self->{_count_for_cigar_string
} += 1; # Remain the state and increase the counter
1465 $sub_cigar_string .= $self->{_count_for_cigar_string
}
1466 unless $self->{_count_for_cigar_string
} == 1;
1467 $sub_cigar_string .= $self->{_state_for_cigar_string
};
1468 $self->{_count_for_cigar_string
} = 1;
1469 $self->{_state_for_cigar_string
} = $new_state;
1471 return $sub_cigar_string;
1474 # needed before seqfeatures can be made
1475 sub _pre_seq_feature
{
1477 my $algo = $self->{ALGORITHM
};
1479 my ($queryfactor, $hitfactor) = (0,0);
1480 my ($hitmap, $querymap) = (1,1);
1481 if( $algo =~ /^(?:PSI)?T(?:BLAST|FAST|SW)[NY]/oi ) {
1485 elsif ($algo =~ /^(?:FAST|BLAST)(?:X|Y|XY)/oi || $algo =~ /^P?GENEWISE/oi ) {
1489 elsif ($algo =~ /^T(BLAST|FAST|SW)(X|Y|XY)/oi || $algo =~ /^(BLAST|FAST|SW)N/oi || $algo =~ /^WABA|AXT|BLAT|BLASTZ|PSL|MEGABLAST|EXONERATE|SW|SSEARCH|SMITH\-WATERMAN|SIM4$/){
1491 $hitmap = $querymap = 3;
1496 elsif ($algo =~ /^RPS-BLAST/) {
1497 if ($algo =~ /^RPS-BLAST\(BLASTX\)/) {
1504 my $stranded = lc substr($self->{STRANDED
}, 0,1);
1505 $queryfactor = ($stranded eq 'q' || $stranded eq 'b') ?
1 : 0;
1506 $hitfactor = ($stranded eq 'h' || $stranded eq 's' || $stranded eq 'b') ?
1 : 0;
1508 $self->{_query_factor
} = $queryfactor;
1509 $self->{_hit_factor
} = $hitfactor;
1510 $self->{_hit_mapping
} = $hitmap;
1511 $self->{_query_mapping
} = $querymap;
1514 # make query seq feature
1515 sub _query_seq_feature
{
1517 $self->{_making_qff
} = 1;
1518 my $qs = $self->{QUERY_START
};
1519 my $qe = $self->{QUERY_END
};
1520 unless (defined $self->{_query_factor
}) {
1521 $self->_pre_seq_feature;
1523 my $queryfactor = $self->{_query_factor
};
1525 unless( defined $qe && defined $qs ) { $self->throw("Did not specify a Query End or Query Begin"); }
1528 if ($qe > $qs) { # normal query: start < end
1543 ($qs,$qe) = ($qe,$qs);
1546 # Note: many of these data are not query- and hit-specific.
1547 # Only start, end, name, length are.
1548 # We could be more efficient by only storing this info once.
1549 # steve chervitz --- Sat Apr 5 00:55:07 2003
1551 my $sim1 = $self->{_sim1
} || Bio
::SeqFeature
::Similarity
->new();
1554 $sim1->significance($self->{EVALUE
});
1555 $sim1->bits($self->{BITS
});
1556 $sim1->score($self->{SCORE
});
1557 $sim1->strand($strand);
1558 $sim1->seq_id($self->{QUERY_NAME
});
1559 $sim1->seqlength($self->{QUERY_LENGTH
});
1560 $sim1->source_tag($self->{ALGORITHM
});
1561 $sim1->seqdesc($self->{QUERY_DESC
});
1562 $sim1->add_tag_value('meta', $self->{META
}) if $self->can('meta');
1563 # to determine frame from something like FASTXY which doesn't
1565 my $qframe = $self->{QUERY_FRAME
};
1567 if (defined $strand && !defined $qframe && $queryfactor) {
1568 $qframe = ( $qs % 3 ) * $strand;
1570 elsif (!defined $strand) {
1574 if( $qframe =~ /^([+-])?([0-3])/ ) {
1575 my $dir = $1 || '+';
1576 if($qframe && (($dir eq '-' && $strand >= 0) || ($dir eq '+' && $strand <= 0)) ) {
1577 $self->warn("Query frame ($qframe) did not match strand of query ($strand)");
1579 $qframe = $2 != 0 ?
$2 - 1 : $2;
1581 $self->warn("Unknown query frame ($qframe)");
1585 $sim1->frame($qframe);
1586 $self->SUPER::feature1
($sim1);
1588 $self->{_created_qff
} = 1;
1589 $self->{_making_qff
} = 0;
1592 # make subject seq feature
1593 sub _subject_seq_feature
{
1595 $self->{_making_sff
} = 1;
1596 my $hs = $self->{HIT_START
};
1597 my $he = $self->{HIT_END
};
1598 unless (defined $self->{_hit_factor
}) {
1599 $self->_pre_seq_feature;
1601 my $hitfactor = $self->{_hit_factor
};
1603 unless( defined $he && defined $hs ) { $self->throw("Did not specify a Hit End or Hit Begin"); }
1606 if ($he > $hs) { # normal subject
1621 ($hs,$he) = ( $he,$hs); # reverse subject: start bigger than end
1624 my $sim2 = $self->{_sim2
} || Bio
::SeqFeature
::Similarity
->new();
1627 $sim2->significance($self->{EVALUE
});
1628 $sim2->bits($self->{BITS
});
1629 $sim2->score($self->{SCORE
});
1630 $sim2->strand($strand);
1631 $sim2->seq_id($self->{HIT_NAME
});
1632 $sim2->seqlength($self->{HIT_LENGTH
});
1633 $sim2->source_tag($self->{ALGORITHM
});
1634 $sim2->seqdesc($self->{HIT_DESC
});
1635 $sim2->add_tag_value('meta', $self->{META
}) if $self->can('meta');
1636 my $hframe = $self->{HIT_FRAME
};
1638 if (defined $strand && !defined $hframe && $hitfactor) {
1639 $hframe = ( $hs % 3 ) * $strand;
1641 elsif (!defined $strand) {
1645 if( $hframe =~ /^([+-])?([0-3])/ ) {
1646 my $dir = $1 || '+';
1647 if($hframe && (($dir eq '-' && $strand >= 0) || ($dir eq '+' && $strand <= 0)) ) {
1648 $self->warn("Subject frame ($hframe) did not match strand of subject ($strand)");
1650 $hframe = $2 != 0 ?
$2 - 1 : $2;
1652 $self->warn("Unknown subject frame ($hframe)");
1656 $sim2->frame($hframe);
1657 $self->SUPER::feature2
($sim2);
1659 $self->{_created_sff
} = 1;
1660 $self->{_making_sff
} = 0;
1663 # before calling the num_* methods
1664 sub _pre_similar_stats
{
1666 my $identical = $self->{IDENTICAL
};
1667 my $conserved = $self->{CONSERVED
};
1668 my $percent_id = $self->{PERCENT_IDENTITY
};
1670 if (! defined $identical) {
1671 if (! defined $percent_id) {
1672 $self->warn("Did not defined the number of identical matches or overall percent identity in the HSP; assuming 0");
1676 $identical = sprintf("%.0f",$percent_id * $self->{HSP_LENGTH
});
1680 if (! defined $conserved) {
1681 $self->warn("Did not define the number of conserved matches in the HSP; assuming conserved == identical ($identical)")
1682 if( $self->{ALGORITHM
} !~ /^((FAST|BLAST)N)|EXONERATE|SIM4|AXT|PSL|BLAT|BLASTZ|WABA/oi);
1683 $conserved = $identical;
1685 $self->{IDENTICAL
} = $identical;
1686 $self->{CONSERVED
} = $conserved;
1687 $self->{_did_presimilar
} = 1;
1690 # before calling the frac_* methods
1695 my $hsp_len = $self->{HSP_LENGTH
};
1696 my $hit_len = $self->{HIT_LENGTH
};
1697 my $query_len = $self->{QUERY_LENGTH
};
1699 my $identical = $self->num_identical;
1700 my $conserved = $self->num_conserved;
1702 $self->{_did_prefrac
} = 1;
1705 $self->length('total', $hsp_len);
1706 $logical = $self->_logical_length('total');
1707 $self->frac_identical( 'total', $identical / $hsp_len);
1708 $self->frac_conserved( 'total', $conserved / $hsp_len);
1711 $logical = $self->_logical_length('hit');
1712 $self->frac_identical( 'hit', $identical / $logical);
1713 $self->frac_conserved( 'hit', $conserved / $logical);
1716 $logical = $self->_logical_length('query');
1717 $self->frac_identical( 'query', $identical / $logical) ;
1718 $self->frac_conserved( 'query', $conserved / $logical);
1722 # before calling gaps()
1723 # This relies first on passed parameters (parser-dependent), then on gaps
1724 # calculated by seq_inds() (if set), then falls back to directly checking
1725 # for '-' or '.' as a last resort
1729 my $query_gaps = $self->{QUERY_GAPS
};
1730 my $query_seq = $self->{QUERY_SEQ
};
1731 my $hit_gaps = $self->{HIT_GAPS
};
1732 my $hit_seq = $self->{HIT_SEQ
};
1733 my $gaps = $self->{HSP_GAPS
};
1735 $self->{_did_pregaps
} = 1; # well, we're in the process; avoid recursion
1736 if( defined $query_gaps ) {
1737 $self->gaps('query', $query_gaps);
1738 } elsif( defined $query_seq ) {
1739 my $qg = (defined $self->{'_query_offset'}) ?
$self->seq_inds('query','gaps')
1740 : ($self->algorithm eq 'ERPIN') ?
scalar( $hit_seq =~ tr/\-//)
1741 : scalar( $query_seq =~ tr/\-\.// ); # HMMER3 and Infernal uses '.' and '-'
1742 my $offset = $self->{'_query_offset'} || 1;
1743 $self->gaps('query', $qg/$offset);
1745 if( defined $hit_gaps ) {
1746 $self->gaps('hit', $hit_gaps);
1747 } elsif( defined $hit_seq ) {
1748 my $hg = (defined $self->{'_sbjct_offset'}) ?
$self->seq_inds('hit','gaps')
1749 : ($self->algorithm eq 'ERPIN') ?
scalar( $hit_seq =~ tr/\-//)
1750 : scalar( $hit_seq =~ tr/\-\.// ); # HMMER3 and Infernal uses '.' and '-'
1751 my $offset = $self->{'_sbjct_offset'} || 1;
1752 $self->gaps('hit', $hg/$offset);
1754 if( ! defined $gaps ) {
1755 $gaps = $self->gaps("query") + $self->gaps("hit");
1757 $self->gaps('total', $gaps);
1760 # before percent_identity
1763 $self->{_did_prepi
} = 1;
1764 $self->percent_identity($self->{PERCENT_IDENTITY
} || $self->frac_identical('total')*100) if( $self->{HSP_LENGTH
} > 0 );