Bio::DB::TFBS namespace has been moved to its own distribution named after itself
[bioperl-live.git] / Bio / Search / HSP / GenericHSP.pm
blob2989b0b9f83333418463f1d304a5a0ad90fb58bd
2 # BioPerl module for Bio::Search::HSP::GenericHSP
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Jason Stajich <jason@bioperl.org>
8 # Copyright Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::Search::HSP::GenericHSP - A "Generic" implementation of a High Scoring Pair
18 =head1 SYNOPSIS
20 my $hsp = Bio::Search::HSP::GenericHSP->new( -algorithm => 'blastp',
21 -evalue => '1e-30',
24 $r_type = $hsp->algorithm;
26 $pvalue = $hsp->p();
28 $evalue = $hsp->evalue();
30 $frac_id = $hsp->frac_identical( ['query'|'hit'|'total'] );
32 $frac_cons = $hsp->frac_conserved( ['query'|'hit'|'total'] );
34 $gaps = $hsp->gaps( ['query'|'hit'|'total'] );
36 $qseq = $hsp->query_string;
38 $hseq = $hsp->hit_string;
40 $homo_string = $hsp->homology_string;
42 $len = $hsp->length( ['query'|'hit'|'total'] );
44 $len = $hsp->length( ['query'|'hit'|'total'] );
46 $rank = $hsp->rank;
48 # TODO: Describe how to configure a SearchIO stream so that it generates
49 # GenericHSP objects.
51 =head1 DESCRIPTION
53 This implementation is "Generic", meaning it is is suitable for
54 holding information about High Scoring pairs from most Search reports
55 such as BLAST and FastA. Specialized objects can be derived from
56 this.
58 Unless you're writing a parser, you won't ever need to create a
59 GenericHSP or any other HSPI-implementing object. If you use
60 the SearchIO system, HSPI objects are created automatically from
61 a SearchIO stream which returns Bio::Search::Result::ResultI objects
62 and you get the HSPI objects via the ResultI API.
64 For documentation on what you can do with GenericHSP (and other HSPI
65 objects), please see the API documentation in
66 L<Bio::Search::HSP::HSPI|Bio::Search::HSP::HSPI>.
68 =head1 FEEDBACK
70 =head2 Mailing Lists
72 User feedback is an integral part of the evolution of this and other
73 Bioperl modules. Send your comments and suggestions preferably to
74 the Bioperl mailing list. Your participation is much appreciated.
76 bioperl-l@bioperl.org - General discussion
77 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
79 =head2 Support
81 Please direct usage questions or support issues to the mailing list:
83 I<bioperl-l@bioperl.org>
85 rather than to the module maintainer directly. Many experienced and
86 reponsive experts will be able look at the problem and quickly
87 address it. Please include a thorough description of the problem
88 with code and data examples if at all possible.
90 =head2 Reporting Bugs
92 Report bugs to the Bioperl bug tracking system to help us keep track
93 of the bugs and their resolution. Bug reports can be submitted via the
94 web:
96 https://github.com/bioperl/bioperl-live/issues
98 =head1 AUTHOR - Jason Stajich and Steve Chervitz
100 Email jason-at-bioperl.org
101 Email sac-at-bioperl.org
103 =head1 CONTRIBUTORS
105 Sendu Bala, bix@sendu.me.uk
107 =head1 APPENDIX
109 The rest of the documentation details each of the object methods.
110 Internal methods are usually preceded with a _
112 =cut
114 # Let the code begin...
116 package Bio::Search::HSP::GenericHSP;
117 use strict;
119 use Bio::Root::Root;
120 use Bio::SeqFeature::Similarity;
122 use base qw(Bio::Search::HSP::HSPI);
124 =head2 new
126 Title : new
127 Usage : my $obj = Bio::Search::HSP::GenericHSP->new();
128 Function: Builds a new Bio::Search::HSP::GenericHSP object
129 Returns : Bio::Search::HSP::GenericHSP
130 Args : -algorithm => algorithm used (BLASTP, TBLASTX, FASTX, etc)
131 -evalue => evalue
132 -pvalue => pvalue
133 -bits => bit value for HSP
134 -score => score value for HSP (typically z-score but depends on
135 analysis)
136 -hsp_length=> Length of the HSP (including gaps)
137 -identical => # of residues that that matched identically
138 -percent_identity => (optional) percent identity
139 -conserved => # of residues that matched conservatively
140 (only protein comparisons;
141 conserved == identical in nucleotide comparisons)
142 -hsp_gaps => # of gaps in the HSP
143 -query_gaps => # of gaps in the query in the alignment
144 -hit_gaps => # of gaps in the subject in the alignment
145 -query_name => HSP Query sequence name (if available)
146 -query_start => HSP Query start (in original query sequence coords)
147 -query_end => HSP Query end (in original query sequence coords)
148 -query_length=> total length of the query sequence
149 -query_seq => query sequence portion of the HSP
150 -query_desc => textual description of the query
151 -hit_name => HSP Hit sequence name (if available)
152 -hit_start => HSP Hit start (in original hit sequence coords)
153 -hit_end => HSP Hit end (in original hit sequence coords)
154 -hit_length => total length of the hit sequence
155 -hit_seq => hit sequence portion of the HSP
156 -hit_desc => textual description of the hit
157 -homology_seq=> homology sequence for the HSP
158 -hit_frame => hit frame (only if hit is translated protein)
159 -query_frame => query frame (only if query is translated protein)
160 -rank => HSP rank
161 -links => HSP links information (WU-BLAST only)
162 -hsp_group => HSP Group informat (WU-BLAST only)
163 -gap_symbol => symbol representing a gap (default = '-')
164 -hit_features=> string of features found in or near HSP hit
165 region (reported in some BLAST text output,
166 v. 2.2.13 and up)
167 -stranded => If the algorithm isn't known (i.e. defaults to
168 'generic'), setting this will indicate start/end
169 coordinates are to be used to determine the strand
170 for 'query', 'subject', 'hit', 'both', or 'none'
171 (default = 'none')
173 =cut
175 sub new {
176 my($class,%args) = @_;
178 # don't pass anything to SUPER; complex hierarchy results in lots of work
179 # for nothing
181 my $self = $class->SUPER::new();
183 # for speed, don't use _rearrange and just store all input data directly
184 # with no method calls and no work done. work can be carried
185 # out just-in-time later if desired
186 while (my ($arg, $value) = each %args) {
187 $arg =~ tr/a-z\055/A-Z/d;
188 $self->{$arg} = $value;
190 my $bits = $self->{BITS};
192 defined $self->{VERBOSE} && $self->verbose($self->{VERBOSE});
193 if (exists $self->{GAP_SYMBOL}) {
194 # not checking anything else but the length (must be 1 as only one gap
195 # symbol allowed currently); can add in support for symbol checks or
196 # multiple symbols later if needed
197 $self->throw("Gap symbol must be of length 1") if
198 CORE::length($self->{GAP_SYMBOL}) != 1;
199 } else {
200 # dafault
201 $self->{GAP_SYMBOL} = '-';
203 $self->{ALGORITHM} ||= 'GENERIC';
204 $self->{STRANDED} ||= 'NONE';
206 if (! defined $self->{QUERY_LENGTH} || ! defined $self->{HIT_LENGTH}) {
207 $self->throw("Must define hit and query length");
210 $self->{'_sequenceschanged'} = 1;
212 $self->{_finished_new} = 1;
213 return $self;
216 sub _logical_length {
217 my ($self, $type) = @_;
218 if (!defined($self->{_sbjct_offset}) || !defined($self->{_query_offset})) {
219 $self->_calculate_seq_offsets();
221 my $key = $type =~ /sbjct|hit|tot/i ? 'sbjct' : 'query';
223 my $offset = $self->{"_${key}_offset"};
224 return $self->length($type) / $offset ;
227 =head2 L<Bio::Search::HSP::HSPI> methods
229 Implementation of L<Bio::Search::HSP::HSPI> methods follow
231 =head2 algorithm
233 Title : algorithm
234 Usage : my $r_type = $hsp->algorithm
235 Function: Obtain the name of the algorithm used to obtain the HSP
236 Returns : string (e.g., BLASTP)
237 Args : [optional] scalar string to set value
239 =cut
241 sub algorithm{
242 my ($self,$value) = @_;
243 my $previous = $self->{'ALGORITHM'};
244 if( defined $value || ! defined $previous ) {
245 $value = $previous = '' unless defined $value;
246 $self->{'ALGORITHM'} = $value;
249 return $previous;
252 =head2 pvalue
254 Title : pvalue
255 Usage : my $pvalue = $hsp->pvalue();
256 Function: Returns the P-value for this HSP or undef
257 Returns : float or exponential (2e-10)
258 P-value is not defined with NCBI Blast2 reports.
259 Args : [optional] numeric to set value
261 =cut
263 sub pvalue {
264 my ($self,$value) = @_;
265 my $previous = $self->{'PVALUE'};
266 if( defined $value ) {
267 $self->{'PVALUE'} = $value;
269 return $previous;
272 =head2 evalue
274 Title : evalue
275 Usage : my $evalue = $hsp->evalue();
276 Function: Returns the e-value for this HSP
277 Returns : float or exponential (2e-10)
278 Args : [optional] numeric to set value
280 =cut
282 sub evalue {
283 my ($self,$value) = @_;
284 my $previous = $self->{'EVALUE'};
285 if( defined $value ) {
286 $self->{'EVALUE'} = $value;
288 return $previous;
291 =head2 frac_identical
293 Title : frac_identical
294 Usage : my $frac_id = $hsp->frac_identical( ['query'|'hit'|'total'] );
295 Function: Returns the fraction of identitical positions for this HSP
296 Returns : Float in range 0.0 -> 1.0
297 Args : arg 1: 'query' = num identical / length of query seq (without gaps)
298 'hit' = num identical / length of hit seq (without gaps)
299 synonyms: 'sbjct', 'subject'
300 'total' = num identical / length of alignment (with gaps)
301 synonyms: 'hsp'
302 default = 'total'
303 arg 2: [optional] frac identical value to set for the type requested
304 Note : for translated sequences, this adjusts the length accordingly
306 =cut
308 sub frac_identical {
309 my ($self, $type,$value) = @_;
311 unless ($self->{_did_prefrac}) {
312 $self->_pre_frac;
315 $type = lc $type if defined $type;
316 $type = 'hit' if( defined $type &&
317 $type =~ /subject|sbjct/);
318 $type = 'total' if( ! defined $type || $type eq 'hsp' ||
319 $type !~ /query|hit|subject|sbjct|total/);
320 my $previous = $self->{'_frac_identical'}->{$type};
321 if( defined $value || ! defined $previous ) {
322 $value = $previous = '' unless defined $value;
323 if( $type eq 'hit' || $type eq 'query' ) {
324 $self->$type()->frac_identical( $value);
326 $self->{'_frac_identical'}->{$type} = $value;
328 return $previous;
332 =head2 frac_conserved
334 Title : frac_conserved
335 Usage : my $frac_cons = $hsp->frac_conserved( ['query'|'hit'|'total'] );
336 Function : Returns the fraction of conserved positions for this HSP.
337 This is the fraction of symbols in the alignment with a
338 positive score.
339 Returns : Float in range 0.0 -> 1.0
340 Args : arg 1: 'query' = num conserved / length of query seq (without gaps)
341 'hit' = num conserved / length of hit seq (without gaps)
342 synonyms: 'sbjct', 'subject'
343 'total' = num conserved / length of alignment (with gaps)
344 synonyms: 'hsp'
345 default = 'total'
346 arg 2: [optional] frac conserved value to set for the type requested
348 =cut
350 sub frac_conserved {
351 my ($self, $type,$value) = @_;
353 unless ($self->{_did_prefrac}) {
354 $self->_pre_frac;
357 $type = lc $type if defined $type;
358 $type = 'hit' if( defined $type && $type =~ /subject|sbjct/);
359 $type = 'total' if( ! defined $type || $type eq 'hsp' ||
360 $type !~ /query|hit|subject|sbjct|total/);
361 my $previous = $self->{'_frac_conserved'}->{$type};
362 if( defined $value || ! defined $previous ) {
363 $value = $previous = '' unless defined $value;
364 $self->{'_frac_conserved'}->{$type} = $value;
366 return $previous;
369 =head2 gaps
371 Title : gaps
372 Usage : my $gaps = $hsp->gaps( ['query'|'hit'|'total'] );
373 Function : Get the number of gap characters in the query, hit, or total alignment.
374 Returns : Integer, number of gaps or 0 if none
375 Args : arg 1: 'query' = num gap characters in query seq
376 'hit' = num gap characters in hit seq; synonyms: 'sbjct', 'subject'
377 'total' = num gap characters in whole alignment; synonyms: 'hsp'
378 default = 'total'
379 arg 2: [optional] integer gap value to set for the type requested
381 =cut
383 sub gaps {
384 my ($self, $type, $value) = @_;
386 unless ($self->{_did_pregaps}) {
387 $self->_pre_gaps;
390 $type = lc $type if defined $type;
391 $type = 'total' if( ! defined $type || $type eq 'hsp' ||
392 $type !~ /query|hit|subject|sbjct|total/);
393 $type = 'hit' if $type =~ /sbjct|subject/;
394 my $previous = $self->{'_gaps'}->{$type};
395 if( defined $value || ! defined $previous ) {
396 $value = $previous = '' unless defined $value;
397 $self->{'_gaps'}->{$type} = $value;
399 return $previous || 0;
402 =head2 query_string
404 Title : query_string
405 Usage : my $qseq = $hsp->query_string;
406 Function: Retrieves the query sequence of this HSP as a string
407 Returns : string
408 Args : [optional] string to set for query sequence
411 =cut
413 sub query_string{
414 my ($self,$value) = @_;
415 my $previous = $self->{QUERY_SEQ};
416 if( defined $value || ! defined $previous ) {
417 $value = $previous = '' unless defined $value;
418 $self->{QUERY_SEQ} = $value;
419 # do some housekeeping so we know when to
420 # re-run _calculate_seq_positions
421 $self->{'_sequenceschanged'} = 1;
423 return $previous;
426 =head2 hit_string
428 Title : hit_string
429 Usage : my $hseq = $hsp->hit_string;
430 Function: Retrieves the hit sequence of this HSP as a string
431 Returns : string
432 Args : [optional] string to set for hit sequence
435 =cut
437 sub hit_string{
438 my ($self,$value) = @_;
439 my $previous = $self->{HIT_SEQ};
440 if( defined $value || ! defined $previous ) {
441 $value = $previous = '' unless defined $value;
442 $self->{HIT_SEQ} = $value;
443 # do some housekeeping so we know when to
444 # re-run _calculate_seq_positions
445 $self->{'_sequenceschanged'} = 1;
447 return $previous;
450 =head2 homology_string
452 Title : homology_string
453 Usage : my $homo_string = $hsp->homology_string;
454 Function: Retrieves the homology sequence for this HSP as a string.
455 : The homology sequence is the string of symbols in between the
456 : query and hit sequences in the alignment indicating the degree
457 : of conservation (e.g., identical, similar, not similar).
458 Returns : string
459 Args : [optional] string to set for homology sequence
461 =cut
463 sub homology_string{
464 my ($self,$value) = @_;
465 my $previous = $self->{HOMOLOGY_SEQ};
466 if( defined $value || ! defined $previous ) {
467 $value = $previous = '' unless defined $value;
468 $self->{HOMOLOGY_SEQ} = $value;
469 # do some housekeeping so we know when to
470 # re-run _calculate_seq_positions
471 $self->{'_sequenceschanged'} = 1;
473 return $previous;
476 =head2 consensus_string
478 Title : consensus_string
479 Usage : my $cs_string = $hsp->consensus_string;
480 Function: Retrieves the consensus structure line for this HSP as a string (HMMER).
481 : If the model had any consensus structure or reference line annotation
482 : that it inherited from a multiple alignment (#=GC SS cons,
483 : #=GC RF annotation in Stockholm files), that information is shown
484 : as CS or RF annotation line.
485 Returns : string
486 Args : [optional] string to set for consensus structure
488 =cut
490 sub consensus_string {
491 my ($self,$value) = @_;
492 my $previous = $self->{CS_SEQ};
493 if( defined $value || ! defined $previous ) {
494 $value = $previous = '' unless defined $value;
495 $self->{CS_SEQ} = $value;
496 # do some housekeeping so we know when to
497 # re-run _calculate_seq_positions
498 $self->{'_sequenceschanged'} = 1;
500 return $previous;
503 =head2 posterior_string
505 Title : posterior_string
506 Usage : my $pp_string = $hsp->posterior_string;
507 Function: Retrieves the posterior probability line for this HSP as a string (HMMer3).
508 : The posterior probability is the string of symbols at the bottom
509 : of the alignment indicating the expected accuracy of each aligned residue.
510 : A 0 means 0-5%, 1 means 5-15%, and so on; 9 means 85-95%,
511 : and a * means 95-100% posterior probability.
512 Returns : string
513 Args : [optional] string to set for posterior probability
515 =cut
517 sub posterior_string {
518 my ($self,$value) = @_;
519 my $previous = $self->{PP_SEQ};
520 if( defined $value || ! defined $previous ) {
521 $value = $previous = '' unless defined $value;
522 $self->{PP_SEQ} = $value;
523 # do some housekeeping so we know when to
524 # re-run _calculate_seq_positions
525 $self->{'_sequenceschanged'} = 1;
527 return $previous;
530 =head2 length
532 Title : length
533 Usage : my $len = $hsp->length( ['query'|'hit'|'total'] );
534 Function : Returns the length of the query or hit in the alignment
535 (without gaps)
536 or the aggregate length of the HSP (including gaps;
537 this may be greater than either hit or query )
538 Returns : integer
539 Args : arg 1: 'query' = length of query seq (without gaps)
540 'hit' = length of hit seq (without gaps) (synonyms: sbjct, subject)
541 'total' = length of alignment (with gaps)
542 default = 'total'
543 arg 2: [optional] integer length value to set for specific type
545 =cut
547 sub length {
549 my $self = shift;
550 my $type = shift;
552 $type = 'total' unless defined $type;
553 $type = lc $type;
555 if( $type =~ /^q/i ) {
556 return $self->query()->length(shift);
557 } elsif( $type =~ /^(hit|subject|sbjct)/ ) {
558 return $self->hit()->length(shift);
559 } else {
560 my $v = shift;
561 if( defined $v ) {
562 $self->{HSP_LENGTH} = $v;
564 return $self->{HSP_LENGTH};
566 return 0; # should never get here
569 =head2 hsp_length
571 Title : hsp_length
572 Usage : my $len = $hsp->hsp_length()
573 Function: shortcut length('hsp')
574 Returns : floating point between 0 and 100
575 Args : none
577 =cut
579 sub hsp_length { return shift->length('hsp', shift); }
581 =head2 percent_identity
583 Title : percent_identity
584 Usage : my $percentid = $hsp->percent_identity()
585 Function: Returns the calculated percent identity for an HSP
586 Returns : floating point between 0 and 100
587 Args : none
590 =cut
592 sub percent_identity {
593 my $self = shift;
595 unless ($self->{_did_prepi}) {
596 $self->_pre_pi;
599 return $self->SUPER::percent_identity(@_);
602 =head2 frame
604 Title : frame
605 Usage : my ($qframe, $hframe) = $hsp->frame('list',$queryframe,$subjectframe)
606 Function: Set the Frame for both query and subject and insure that
607 they agree.
608 This overrides the frame() method implementation in
609 FeaturePair.
610 Returns : array of query and subject frame if return type wants an array
611 or query frame if defined or subject frame if not defined
612 Args : 'hit' or 'subject' or 'sbjct' to retrieve the frame of the subject (default)
613 'query' to retrieve the query frame
614 'list' or 'array' to retrieve both query and hit frames together
615 Note : Frames are stored in the GFF way (0-2) not 1-3
616 as they are in BLAST (negative frames are deduced by checking
617 the strand of the query or hit)
619 =cut
621 # Note: changed 4/19/08 - bug 2485
623 # frame() is supposed to be a getter/setter (as is implied by the Function desc
624 # above; this is also consistent with Bio::SeqFeature::SimilarityPair). Also,
625 # the API is not consistent with the other HSP/SimilarityPair methods such as
626 # strand(), start(), end(), etc.
628 # In order to make this consistent with other methods all work is now done
629 # when the features are instantiated and not delayed. We compromise by
630 # defaulting back 'to hit' w/o passed args. Setting is now allowed.
632 sub frame {
633 my $self = shift;
634 my $val = shift;
635 if (!defined $val) {
636 # unfortunately, w/o args we need to warn about API changes
637 $self->warn("API for frame() has changed.\n".
638 "Please refer to documentation for Bio::Search::HSP::GenericHSP;\n".
639 "returning query frame");
640 $val = 'query';
642 $val =~ s/^\s+//;
644 if( $val =~ /^q/i ) {
645 return $self->query->frame(@_);
646 } elsif( $val =~ /^hi|^s/i ) {
647 return $self->hit->frame(@_);
648 } elsif ( $val =~ /^list|array/i ) {
649 return ($self->query->frame($_[0]),
650 $self->hit->frame($_[1]) );
651 } elsif ( $val =~ /^\d+$/) {
652 # old API i.e. frame($query_frame, $hit_frame). This catches all but one
653 # case, where no arg is passed (so the hit is wanted).
654 $self->warn("API for frame() has changed.\n".
655 "Please refer to documentation for Bio::Search::HSP::GenericHSP");
656 wantarray ?
657 return ($self->query->frame($val),
658 $self->hit->frame(@_) ) :
659 return $self->hit->frame($val,@_);
660 } else {
661 $self->warn("unrecognized component '$val' requested\n");
663 return 0;
666 =head2 get_aln
668 Title : get_aln
669 Usage : my $aln = $hsp->gel_aln
670 Function: Returns a L<Bio::SimpleAlign> object representing the HSP alignment
671 Returns : L<Bio::SimpleAlign>
672 Args : none
674 =cut
676 sub get_aln {
677 my ($self) = @_;
678 require Bio::LocatableSeq;
679 require Bio::SimpleAlign;
681 my $aln = Bio::SimpleAlign->new();
682 my $hs = $self->hit_string();
683 my $qs = $self->query_string();
684 # FASTA specific stuff moved to the FastaHSP object
685 my $seqonly = $qs;
686 $seqonly =~ s/[\-\s]//g;
687 my ($q_nm,$s_nm) = ($self->query->seq_id(),
688 $self->hit->seq_id());
689 # Should we silently change the name of the query or hit if it isn't
690 # defined? May need revisiting... cjfields 2008-12-3 (commented out below)
692 #unless( defined $q_nm && CORE::length ($q_nm) ) {
693 # $q_nm = 'query';
695 #unless( defined $s_nm && CORE::length ($s_nm) ) {
696 # $s_nm = 'hit';
699 # mapping: 1 residues for every x coordinate positions
700 my $query = Bio::LocatableSeq->new(
701 -seq => $qs,
702 -id => $q_nm,
703 -start => $self->query->start,
704 -end => $self->query->end,
705 -strand => $self->query->strand,
706 -force_nse => $q_nm ? 0 : 1,
707 -mapping => [ 1, $self->{_query_mapping} ]
709 $seqonly = $hs;
710 $seqonly =~ s/[\-\s]//g;
711 my $hit = Bio::LocatableSeq->new(
712 -seq => $hs,
713 -id => $s_nm,
714 -start => $self->hit->start,
715 -end => $self->hit->end,
716 -strand => $self->hit->strand,
717 -force_nse => $s_nm ? 0 : 1,
718 -mapping => [ 1, $self->{_hit_mapping} ]
720 $aln->add_seq($query);
721 $aln->add_seq($hit);
722 return $aln;
725 =head2 num_conserved
727 Title : num_conserved
728 Usage : $obj->num_conserved($newval)
729 Function: returns the number of conserved residues in the alignment
730 Returns : integer
731 Args : integer (optional)
734 =cut
736 sub num_conserved{
737 my ($self,$value) = @_;
739 unless ($self->{_did_presimilar}) {
740 $self->_pre_similar_stats;
743 if (defined $value) {
744 $self->{CONSERVED} = $value;
746 return $self->{CONSERVED};
749 =head2 num_identical
751 Title : num_identical
752 Usage : $obj->num_identical($newval)
753 Function: returns the number of identical residues in the alignment
754 Returns : integer
755 Args : integer (optional)
758 =cut
760 sub num_identical{
761 my ($self,$value) = @_;
763 unless ($self->{_did_presimilar}) {
764 $self->_pre_similar_stats;
767 if( defined $value) {
768 $self->{IDENTICAL} = $value;
770 return $self->{IDENTICAL};
773 =head2 rank
775 Usage : $hsp->rank( [string] );
776 Purpose : Get the rank of the HSP within a given Blast hit.
777 Example : $rank = $hsp->rank;
778 Returns : Integer (1..n) corresponding to the order in which the HSP
779 appears in the BLAST report.
781 =cut
783 sub rank {
784 my ($self,$value) = @_;
785 if( defined $value) {
786 $self->{RANK} = $value;
788 return $self->{RANK};
791 =head2 seq_inds
793 Title : seq_inds
794 Purpose : Get a list of residue positions (indices) for all identical
795 : or conserved residues in the query or sbjct sequence.
796 Example : @s_ind = $hsp->seq_inds('query', 'identical');
797 : @h_ind = $hsp->seq_inds('hit', 'conserved');
798 : @h_ind = $hsp->seq_inds('hit', 'conserved-not-identical');
799 : @h_ind = $hsp->seq_inds('hit', 'conserved', 1);
800 Returns : List of integers
801 : May include ranges if collapse is true.
802 Argument : seq_type = 'query' or 'hit' or 'sbjct' (default = query)
803 : ('sbjct' is synonymous with 'hit')
804 : class = 'identical' - identical positions
805 : 'conserved' - conserved positions
806 : 'nomatch' - mismatched residue or gap positions
807 : 'mismatch' - mismatched residue positions (no gaps)
808 : 'gap' - gap positions only
809 : 'frameshift'- frameshift positions only
810 : 'conserved-not-identical' - conserved positions w/o
811 : identical residues
812 : The name can be shortened to 'id' or 'cons' unless
813 : the name is . The default value is
814 : 'identical'
816 : collapse = boolean, if true, consecutive positions are merged
817 : using a range notation, e.g., "1 2 3 4 5 7 9 10 11"
818 : collapses to "1-5 7 9-11". This is useful for
819 : consolidating long lists. Default = no collapse.
821 Throws : n/a.
822 Comments : For HSPs partially or completely derived from translated sequences
823 : (TBLASTN, BLASTX, TBLASTX, or similar), some positional data
824 : cannot easily be attributed to a single position (i.e. the
825 : positional data is ambiguous). In these cases all three codon
826 : positions are reported. Under these conditions you can check
827 : ambiguous_seq_inds() to determine whether the query, subject,
828 : or both are ambiguous.
830 See Also : L<Bio::Search::SearchUtils::collapse_nums()|Bio::Search::SearchUtils>,
831 L<Bio::Search::Hit::HitI::seq_inds()|Bio::Search::Hit::HitI>
833 =cut
835 sub seq_inds{
836 my ($self, $seqType, $class, $collapse) = @_;
838 # prepare the internal structures - this is cached so
839 # if the strings have not changed we're okay
840 $self->_calculate_seq_positions();
842 $seqType ||= 'query';
843 $class ||= 'identical';
844 $collapse ||= 0;
845 $seqType = 'sbjct' if $seqType eq 'hit';
846 my $t = lc(substr($seqType,0,1));
847 if( $t eq 'q' ) {
848 $seqType = 'query';
849 } elsif ( $t eq 's' || $t eq 'h' ) {
850 $seqType = 'sbjct';
851 } else {
852 $self->warn("unknown seqtype $seqType using 'query'");
853 $seqType = 'query';
856 $t = lc(substr($class,0,1));
858 if( $t eq 'c' ) {
859 if( $class =~ /conserved\-not\-identical/ ) {
860 $class = 'conserved';
861 } else {
862 $class = 'conservedall';
864 } elsif( $t eq 'i' ) {
865 $class = 'identical';
866 } elsif( $t eq 'n' ) {
867 $class = 'nomatch';
868 } elsif( $t eq 'm' ) {
869 $class = 'mismatch';
870 } elsif( $t eq 'g' ) {
871 $class = 'gap';
872 } elsif( $t eq 'f' ) {
873 $class = 'frameshift';
874 } else {
875 $self->warn("unknown sequence class $class using 'identical'");
876 $class = 'identical';
879 ## Sensitive to member name changes.
880 $seqType = "_\L$seqType\E";
881 $class = "_\L$class\E";
882 my @ary;
884 if( $class eq '_gap' ) {
885 # this means that we are remapping the gap length that is stored
886 # in the hash (for example $self->{'_gapRes_query'} )
887 # so we'll return an array which has the values of the position of the
888 # of the gap (the key in the hash) + the gap length (value in the
889 # hash for this key - 1.
891 # changing this; since the index is the position prior to the insertion,
892 # repeat the position based on the number of gaps inserted
893 @ary = map { my @tmp;
894 # position holds number of gaps inserted
895 for my $g (1..$self->{seqinds}{"${class}Res$seqType"}->{$_}) {
896 push @tmp, $_ ;
898 @tmp}
899 sort { $a <=> $b } keys %{ $self->{seqinds}{"${class}Res$seqType"}};
900 } elsif( $class eq '_conservedall' ) {
901 @ary = sort { $a <=> $b }
902 keys %{ $self->{seqinds}{"_conservedRes$seqType"}},
903 keys %{ $self->{seqinds}{"_identicalRes$seqType"}},
904 } else {
905 @ary = sort { $a <=> $b } keys %{ $self->{seqinds}{"${class}Res$seqType"}};
907 require Bio::Search::BlastUtils if $collapse;
909 return $collapse ? &Bio::Search::SearchUtils::collapse_nums(@ary) : @ary;
912 =head2 ambiguous_seq_inds
914 Title : ambiguous_seq_inds
915 Purpose : returns a string denoting whether sequence indices for query,
916 : subject, or both are ambiguous
917 Returns : String; 'query', 'subject', 'query/subject', or empty string ''
918 Argument : none
919 Comments : For HSPs partially or completely derived from translated sequences
920 : (TBLASTN, BLASTX, TBLASTX, or similar), some positional data
921 : cannot easily be attributed to a single position (i.e. the
922 : positional data is ambiguous). In these cases all three codon
923 : positions are reported when using seq_inds(). Under these
924 : conditions you can check ambiguous_seq_inds() to determine whether
925 : the query, subject, or both are ambiguous.
926 See Also : L<Bio::Search::Hit::HSPI::seq_inds()>
928 =cut
930 sub ambiguous_seq_inds {
931 my $self = shift;
932 $self->_calculate_seq_positions();
933 my $type = ($self->{_query_offset} == 3 && $self->{_sbjct_offset} == 3) ?
934 'query/subject' :
935 ($self->{_query_offset} == 3) ? 'query' :
936 ($self->{_sbjct_offset} == 3) ? 'subject' : '';
937 return $type;
940 =head2 Inherited from L<Bio::SeqFeature::SimilarityPair>
942 These methods come from L<Bio::SeqFeature::SimilarityPair>
944 =head2 query
946 Title : query
947 Usage : my $query = $hsp->query
948 Function: Returns a SeqFeature representing the query in the HSP
949 Returns : L<Bio::SeqFeature::Similarity>
950 Args : [optional] new value to set
952 =cut
954 sub query {
955 my $self = shift;
956 unless ($self->{_created_qff}) {
957 $self->_query_seq_feature;
959 return $self->SUPER::query(@_);
962 sub feature1 {
963 my $self = shift;
964 if (! $self->{_finished_new} || $self->{_making_qff}) {
965 return $self->{_sim1} if $self->{_sim1};
966 $self->{_sim1} = Bio::SeqFeature::Similarity->new();
967 return $self->{_sim1};
969 unless ($self->{_created_qff}) {
970 $self->_query_seq_feature;
972 return $self->SUPER::feature1(@_);
975 =head2 hit
977 Title : hit
978 Usage : my $hit = $hsp->hit
979 Function: Returns a SeqFeature representing the hit in the HSP
980 Returns : L<Bio::SeqFeature::Similarity>
981 Args : [optional] new value to set
983 =cut
985 sub hit {
986 my $self = shift;
987 unless ($self->{_created_sff}) {
988 $self->_subject_seq_feature;
990 return $self->SUPER::hit(@_);
993 sub feature2 {
994 my $self = shift;
995 if (! $self->{_finished_new} || $self->{_making_sff}) {
996 return $self->{_sim2} if $self->{_sim2};
997 $self->{_sim2} = Bio::SeqFeature::Similarity->new();
998 return $self->{_sim2};
1000 unless ($self->{_created_sff}) {
1001 $self->_subject_seq_feature;
1003 return $self->SUPER::feature2(@_);
1006 =head2 significance
1008 Title : significance
1009 Usage : $evalue = $obj->significance();
1010 $obj->significance($evalue);
1011 Function: Get/Set the significance value
1012 Returns : numeric
1013 Args : [optional] new value to set
1015 =cut
1017 # Override significance to return the e-value or, if this is
1018 # not defined (WU-BLAST), return the p-value.
1019 sub significance {
1020 my ($self, $val) = @_;
1021 if (!defined $self->{SIGNIFICANCE} || defined $val) {
1022 $self->{SIGNIFICANCE} = defined $val ? $val :
1023 defined $self->evalue ? $self->evalue :
1024 defined $self->pvalue ? $$self->pvalue :
1025 undef;
1026 $self->query->significance($self->{SIGNIFICANCE});
1028 return $self->{SIGNIFICANCE};
1031 =head2 strand
1033 Title : strand
1034 Usage : $hsp->strand('query')
1035 Function: Retrieves the strand for the HSP component requested
1036 Returns : +1 or -1
1037 Args : 'hit' or 'subject' or 'sbjct' to retrieve the strand of the subject,
1038 'query' to retrieve the query strand (default)
1040 =cut
1042 sub strand {
1043 my ($self,$type) = @_;
1045 if( $type =~ /^q/i && defined $self->{'QUERY_STRAND'} ) {
1046 return $self->{'QUERY_STRAND'};
1047 } elsif( $type =~ /^(hit|subject|sbjct)/i && defined $self->{'HIT_STRAND'} ) {
1048 return $self->{'HIT_STRAND'};
1051 return $self->SUPER::strand($type)
1054 =head2 score
1056 Title : score
1057 Usage : $score = $obj->score();
1058 $obj->score($value);
1059 Function: Get/Set the score value
1060 Returns : numeric
1061 Args : [optional] new value to set
1063 =head2 bits
1065 Title : bits
1066 Usage : $bits = $obj->bits();
1067 $obj->bits($value);
1068 Function: Get/Set the bits value
1069 Returns : numeric
1070 Args : [optional] new value to set
1072 =head1 Private methods
1074 =cut
1076 =head2 _calculate_seq_positions
1078 Title : _calculate_seq_positions
1079 Usage : $self->_calculate_seq_positions
1080 Function: Internal function
1081 Returns :
1082 Args :
1085 =cut
1087 sub _calculate_seq_positions {
1088 my ($self,@args) = @_;
1089 return unless ( $self->{'_sequenceschanged'} );
1090 $self->{'_sequenceschanged'} = 0;
1091 my ($seqString, $qseq,$sseq) = ( $self->homology_string(),
1092 $self->query_string(),
1093 $self->hit_string() );
1094 my ($mlen, $qlen, $slen) = (CORE::length($seqString), CORE::length($qseq), CORE::length($sseq));
1095 my $qdir = $self->query->strand || 1;
1096 my $sdir = $self->hit->strand || 1;
1097 my ($resCount_query, $endpoint_query) = ($qdir <=0) ? ($self->query->end, $self->query->start)
1098 : ($self->query->start, $self->query->end);
1099 my ($resCount_sbjct, $endpoint_sbjct) = ($sdir <=0) ? ($self->hit->end, $self->hit->start)
1100 : ($self->hit->start, $self->hit->end);
1102 my $prog = $self->algorithm;
1104 if( $prog =~ /FAST|SSEARCH|SMITH-WATERMAN/i ) {
1106 # we infer the end of the regional sequence where the first and last
1107 # non spaces are in the homology string
1108 # then we use the HSP->length to tell us how far to read
1109 # to cut off the end of the sequence
1111 my ($start, $rest) = (0,0);
1112 if( $seqString =~ /^(\s+)?(.*?)\s*$/ ) {
1113 ($start, $rest) = ($1 ? CORE::length($1) : 0, CORE::length($2));
1116 $seqString = substr($seqString, $start, $rest);
1117 $qseq = substr($qseq, $start, $rest);
1118 $sseq = substr($sseq, $start, $rest);
1120 # commented out 10/29/08
1121 # removing frameshift symbols doesn't take into account the following:
1122 # 1) does not remove the same point in the homology string (get
1123 # positional errors)
1124 # 2) adjustments to the overall position in the string due to the
1125 # frameshift must be taken into consideration (get balancing errors)
1127 # Frameshifts will be handled directly in the main loop.
1128 # --chris
1130 # fasta reports some extra 'regional' sequence information
1131 # we need to clear out first
1132 # this seemed a bit insane to me at first, but it appears to
1133 # work --jason
1135 #$qseq =~ s![\\\/]!!g;
1136 #$sseq =~ s![\\\/]!!g;
1139 if (!defined($self->{_sbjct_offset}) || !defined($self->{_query_offset})) {
1140 $self->_calculate_seq_offsets();
1143 my ($qfs, $sfs) = (0,0);
1144 CHAR_LOOP:
1145 for my $pos (0..CORE::length($seqString)-1) {
1146 my @qrange = (0 - $qfs)..($self->{_query_offset} - 1);
1147 my @srange = (0 - $sfs)..($self->{_sbjct_offset} - 1);
1148 # $self->debug("QRange:@qrange SRange:@srange\n") if ($qfs || $sfs);
1149 ($qfs, $sfs) = (0,0);
1150 my ($mchar, $qchar, $schar) = (
1151 unpack("x$pos A1",$seqString) || ' ',
1152 $pos < CORE::length($qseq) ? unpack("x$pos A1",$qseq) : '-',
1153 $pos < CORE::length($sseq) ? unpack("x$pos A1",$sseq) : '-'
1155 my $matchtype = '';
1156 my ($qgap, $sgap) = (0,0);
1157 if( $mchar eq '+' || $mchar eq '.') { # conserved
1158 $self->{seqinds}{_conservedRes_query}{ $resCount_query + ($_ * $qdir) } = 1 for @qrange;
1159 $self->{seqinds}{_conservedRes_sbjct}{ $resCount_sbjct + ($_ * $sdir) } = 1 for @srange;
1160 $matchtype = 'conserved';
1161 } elsif( $mchar eq ':' || $mchar ne ' ' ) { # identical
1162 $self->{seqinds}{_identicalRes_query}{ $resCount_query + ($_ * $qdir) } = 1 for @qrange;
1163 $self->{seqinds}{_identicalRes_sbjct}{ $resCount_sbjct + ($_ * $sdir) } = 1 for @srange;
1164 $matchtype = 'identical';
1165 } elsif( $mchar eq ' ' ) { # possible mismatch/nomatch/frameshift
1166 $qfs = $qchar eq '/' ? -1 : # base inserted to match frame
1167 $qchar eq '\\' ? 1 : # base deleted to match frame
1169 $sfs = $schar eq '/' ? -1 :
1170 $schar eq '\\' ? 1 :
1172 if ($qfs) {
1173 # Frameshifts are tricky.
1175 # '/' indicates that the next residue is shifted back one
1176 # (-1) frame position and is a deletion of one base (so to
1177 # correctly align, a base is inserted). That residue should only
1178 # occupy two sequence positions instead of three.
1180 # '\' indicates that the next residue is shifted forward
1181 # one (+1) frame position and is an insertion of one base (so to
1182 # correctly align, a base is removed). That residue should
1183 # occupy four sequence positions instead of three.
1185 # Note that gaps are not counted across from frameshifts.
1186 # Frameshift indices are a range of positions starting in the
1187 # previous sequence position in which the frameshift occurs;
1188 # they are ambiguous by nature.
1189 $self->{seqinds}{_frameshiftRes_query}{ $resCount_query - ($_ * $qdir * $qfs) } = $qfs for @qrange;
1190 $matchtype = "$qfs frameshift-query";
1191 $sgap = $qgap = 1;
1193 elsif ($sfs) {
1194 $self->{seqinds}{_frameshiftRes_sbjct}{ $resCount_sbjct - ($_ * $sdir * $sfs) } = $sfs for @srange;
1195 $matchtype = "$sfs frameshift-sbcjt";
1196 $sgap = $qgap = 1;
1198 elsif ($qchar eq $self->{GAP_SYMBOL}) {
1199 # gap are counted as being in the immediately preceeding residue
1200 # position; for translated sequences this is not the start of
1201 # the previous codon, but the third codon position
1202 $self->{seqinds}{_gapRes_query}{ $resCount_query - $qdir }++ for @qrange;
1203 $matchtype = 'gap-query';
1204 $qgap++;
1206 elsif ($schar eq $self->{GAP_SYMBOL}) {
1207 $self->{seqinds}{_gapRes_sbjct}{ $resCount_sbjct - $sdir }++ for @srange;
1208 $matchtype = 'gap-sbjct';
1209 $sgap++;
1211 else {
1212 # if not a gap or frameshift in either seq, must be mismatch
1213 $self->{seqinds}{_mismatchRes_query}{ $resCount_query + ($_ * $qdir) } = 1 for @qrange;
1214 $self->{seqinds}{_mismatchRes_sbjct}{ $resCount_sbjct + ($_ * $sdir) } = 1 for @srange;
1215 $matchtype = 'mismatch';
1217 # always add a nomatch unless the current position in the seq is a gap
1218 if (!$sgap) {
1219 $self->{seqinds}{_nomatchRes_sbjct}{ $resCount_sbjct + ($_ * $sdir) } = 1 for @srange;
1221 if (!$qgap) {
1222 $self->{seqinds}{_nomatchRes_query}{ $resCount_query + ($_ * $qdir) } = 1 for @qrange;
1224 } else {
1225 $self->warn("Unknown midline character: [$mchar]");
1227 # leave in and uncomment for future debugging
1228 #$self->debug(sprintf("%7d %1s[%1s]%1s %-7d Type: %-20s QOff:%-2d SOff:%-2d\n",
1229 # $resCount_query,
1230 # $qchar,
1231 # $mchar,
1232 # $schar,
1233 # $resCount_sbjct,
1234 # $matchtype,
1235 # ($self->{_query_offset} * $qdir),
1236 # ($self->{_sbjct_offset} * $sdir)));
1237 $resCount_query += ($qdir * (scalar(@qrange) + $qfs)) if !$qgap;
1238 $resCount_sbjct += ($sdir * (scalar(@srange) + $sfs)) if !$sgap;
1240 return 1;
1243 sub _calculate_seq_offsets {
1244 my $self = shift;
1245 my $prog = $self->algorithm;
1246 ($self->{_sbjct_offset}, $self->{_query_offset}) = (1,1);
1247 if($prog =~ /^(?:PSI)?T(BLAST|FAST)(N|X|Y)/oi ) {
1248 $self->{_sbjct_offset} = 3;
1249 if ($1 eq 'BLAST' && $2 eq 'X') { #TBLASTX
1250 $self->{_query_offset} = 3;
1252 } elsif($prog =~ /^(BLAST|FAST)(X|Y|XY)/oi ) {
1253 $self->{_query_offset} = 3;
1258 =head2 n
1260 See documentation in L<Bio::Search::HSP::HSPI::n()|Bio::Search::HSP::HSPI>
1262 =cut
1264 sub n {
1265 my $self = shift;
1266 if(@_) { $self->{'N'} = shift; }
1267 # note that returning 1 is completely an assumption
1268 defined $self->{'N'} ? $self->{'N'} : 1;
1271 =head2 range
1273 See documentation in L<Bio::Search::HSP::HSPI::range()|Bio::Search::HSP::HSPI>
1275 =cut
1277 sub range {
1278 my ($self, $seqType) = @_;
1280 $seqType ||= 'query';
1281 $seqType = 'sbjct' if $seqType eq 'hit';
1283 my ($start, $end);
1284 if( $seqType eq 'query' ) {
1285 $start = $self->query->start;
1286 $end = $self->query->end;
1288 else {
1289 $start = $self->hit->start;
1290 $end = $self->hit->end;
1292 return ($start, $end);
1296 =head2 links
1298 Title : links
1299 Usage : $obj->links($newval)
1300 Function: Get/Set the Links value (from WU-BLAST)
1301 Indicates the placement of the alignment in the group of HSPs
1302 Returns : Value of links
1303 Args : On set, new value (a scalar or undef, optional)
1306 =cut
1308 sub links{
1309 my $self = shift;
1311 return $self->{LINKS} = shift if @_;
1312 return $self->{LINKS};
1315 =head2 hsp_group
1317 Title : hsp_group
1318 Usage : $obj->hsp_group($newval)
1319 Function: Get/Set the Group value (from WU-BLAST)
1320 Indicates a grouping of HSPs
1321 Returns : Value of group
1322 Args : On set, new value (a scalar or undef, optional)
1325 =cut
1327 sub hsp_group {
1328 my $self = shift;
1330 return $self->{HSP_GROUP} = shift if @_;
1331 return $self->{HSP_GROUP};
1334 =head2 hit_features
1336 Title : hit_features
1337 Usage : $obj->hit_features($newval)
1338 Function: Get/Set the HSP hit feature string (from some BLAST 2.2.13 text
1339 output), which is a string of overlapping or nearby features in HSP
1341 Returns : Value of hit features, if present
1342 Args : On set, new value (a scalar or undef, optional)
1345 =cut
1347 sub hit_features {
1348 my $self = shift;
1350 return $self->{HIT_FEATURES} = shift if @_;
1351 return $self->{HIT_FEATURES};
1354 # The cigar string code is written by Juguang Xiao <juguang@fugu-sg.org>
1356 =head1 Brief introduction on cigar string
1358 NOTE: the concept is originally from EnsEMBL docs at
1359 http://may2005.archive.ensembl.org/Docs/wiki/html/EnsemblDocs/CigarFormat.html
1360 Please append to these docs if you have a better definition.
1362 Sequence alignment hits can be stored in a database as ungapped alignments.
1363 This imposes 2 major constraints on alignments:
1365 a) alignments for a single hit record require multiple rows in the database,
1367 b) it is not possible to accurately retrieve the exact original alignment.
1369 Alternatively, sequence alignments can be stored as gapped alignments using
1370 the CIGAR line format (where CIGAR stands for Concise Idiosyncratic Gapped
1371 Alignment Report).
1373 In the cigar line format alignments are stored as follows:
1375 M: Match
1376 D: Deletion
1377 I: Insertion
1379 An example of an alignment for a hypthetical protein match is shown below:
1382 Query: 42 PGPAGLP----GSVGLQGPRGLRGPLP-GPLGPPL...
1384 PG P G GP R PLGP
1386 Sbjct: 1672 PGTP*TPLVPLGPWVPLGPSSPR--LPSGPLGPTD...
1389 protein_align_feature table as the following cigar line:
1391 7M4D12M2I2MD7M
1393 =head2 cigar_string
1395 Name: cigar_string
1396 Usage: $cigar_string = $hsp->cigar_string
1397 Function: Generate and return cigar string for this HSP alignment
1398 Args: No input needed
1399 Return: a cigar string
1401 =cut
1404 sub cigar_string {
1405 my ($self, $arg) = @_;
1406 $self->warn("this is not a setter") if(defined $arg);
1408 unless(defined $self->{_cigar_string}){ # generate cigar string
1409 my $cigar_string = $self->generate_cigar_string($self->query_string, $self->hit_string);
1410 $self->{_cigar_string} = $cigar_string;
1411 } # end of unless
1413 return $self->{_cigar_string};
1416 =head2 generate_cigar_string
1418 Name: generate_cigar_string
1419 Usage: my $cigar_string = Bio::Search::HSP::GenericHSP::generate_cigar_string ($qstr, $hstr);
1420 Function: generate cigar string from a simple sequence of alignment.
1421 Args: the string of query and subject
1422 Return: cigar string
1424 =cut
1426 sub generate_cigar_string {
1427 my ($self, $qstr, $hstr) = @_;
1428 my @qchars = split //, $qstr;
1429 my @hchars = split //, $hstr;
1431 unless(scalar(@qchars) == scalar(@hchars)){
1432 $self->throw("two sequences are not equal in lengths");
1435 $self->{_count_for_cigar_string} = 0;
1436 $self->{_state_for_cigar_string} = 'M';
1438 my $cigar_string = '';
1439 for(my $i=0; $i <= $#qchars; $i++){
1440 my $qchar = $qchars[$i];
1441 my $hchar = $hchars[$i];
1442 if($qchar ne $self->{GAP_SYMBOL} && $hchar ne $self->{GAP_SYMBOL}){ # Match
1443 $cigar_string .= $self->_sub_cigar_string('M');
1444 }elsif($qchar eq $self->{GAP_SYMBOL}){ # Deletion
1445 $cigar_string .= $self->_sub_cigar_string('D');
1446 }elsif($hchar eq $self->{GAP_SYMBOL}){ # Insertion
1447 $cigar_string .= $self->_sub_cigar_string('I');
1448 }else{
1449 $self->throw("Impossible state that 2 gaps on each seq aligned");
1452 $cigar_string .= $self->_sub_cigar_string('X'); # not forget the tail.
1453 return $cigar_string;
1456 # an internal method to help generate cigar string
1458 sub _sub_cigar_string {
1459 my ($self, $new_state) = @_;
1461 my $sub_cigar_string = '';
1462 if($self->{_state_for_cigar_string} eq $new_state){
1463 $self->{_count_for_cigar_string} += 1; # Remain the state and increase the counter
1464 }else{
1465 $sub_cigar_string .= $self->{_count_for_cigar_string}
1466 unless $self->{_count_for_cigar_string} == 1;
1467 $sub_cigar_string .= $self->{_state_for_cigar_string};
1468 $self->{_count_for_cigar_string} = 1;
1469 $self->{_state_for_cigar_string} = $new_state;
1471 return $sub_cigar_string;
1474 # needed before seqfeatures can be made
1475 sub _pre_seq_feature {
1476 my $self = shift;
1477 my $algo = $self->{ALGORITHM};
1479 my ($queryfactor, $hitfactor) = (0,0);
1480 my ($hitmap, $querymap) = (1,1);
1481 if( $algo =~ /^(?:PSI)?T(?:BLAST|FAST|SW)[NY]/oi ) {
1482 $hitfactor = 1;
1483 $hitmap = 3;
1485 elsif ($algo =~ /^(?:FAST|BLAST)(?:X|Y|XY)/oi || $algo =~ /^P?GENEWISE/oi ) {
1486 $queryfactor = 1;
1487 $querymap = 3;
1489 elsif ($algo =~ /^T(BLAST|FAST|SW)(X|Y|XY)/oi || $algo =~ /^(BLAST|FAST|SW)N/oi || $algo =~ /^WABA|AXT|BLAT|BLASTZ|PSL|MEGABLAST|EXONERATE|SW|SSEARCH|SMITH\-WATERMAN|SIM4$/){
1490 if ($2) {
1491 $hitmap = $querymap = 3;
1493 $hitfactor = 1;
1494 $queryfactor = 1;
1496 elsif ($algo =~ /^RPS-BLAST/) {
1497 if ($algo =~ /^RPS-BLAST\(BLASTX\)/) {
1498 $queryfactor = 1;
1499 $querymap = 3;
1501 $hitfactor = 0;
1503 else {
1504 my $stranded = lc substr($self->{STRANDED}, 0,1);
1505 $queryfactor = ($stranded eq 'q' || $stranded eq 'b') ? 1 : 0;
1506 $hitfactor = ($stranded eq 'h' || $stranded eq 's' || $stranded eq 'b') ? 1 : 0;
1508 $self->{_query_factor} = $queryfactor;
1509 $self->{_hit_factor} = $hitfactor;
1510 $self->{_hit_mapping} = $hitmap;
1511 $self->{_query_mapping} = $querymap;
1514 # make query seq feature
1515 sub _query_seq_feature {
1516 my $self = shift;
1517 $self->{_making_qff} = 1;
1518 my $qs = $self->{QUERY_START};
1519 my $qe = $self->{QUERY_END};
1520 unless (defined $self->{_query_factor}) {
1521 $self->_pre_seq_feature;
1523 my $queryfactor = $self->{_query_factor};
1525 unless( defined $qe && defined $qs ) { $self->throw("Did not specify a Query End or Query Begin"); }
1527 my $strand;
1528 if ($qe > $qs) { # normal query: start < end
1529 if ($queryfactor) {
1530 $strand = 1;
1532 else {
1533 $strand = undef;
1536 else {
1537 if ($queryfactor) {
1538 $strand = -1;
1540 else {
1541 $strand = undef;
1543 ($qs,$qe) = ($qe,$qs);
1546 # Note: many of these data are not query- and hit-specific.
1547 # Only start, end, name, length are.
1548 # We could be more efficient by only storing this info once.
1549 # steve chervitz --- Sat Apr 5 00:55:07 2003
1551 my $sim1 = $self->{_sim1} || Bio::SeqFeature::Similarity->new();
1552 $sim1->start($qs);
1553 $sim1->end($qe);
1554 $sim1->significance($self->{EVALUE});
1555 $sim1->bits($self->{BITS});
1556 $sim1->score($self->{SCORE});
1557 $sim1->strand($strand);
1558 $sim1->seq_id($self->{QUERY_NAME});
1559 $sim1->seqlength($self->{QUERY_LENGTH});
1560 $sim1->source_tag($self->{ALGORITHM});
1561 $sim1->seqdesc($self->{QUERY_DESC});
1562 $sim1->add_tag_value('meta', $self->{META}) if $self->can('meta');
1563 # to determine frame from something like FASTXY which doesn't
1564 # report the frame
1565 my $qframe = $self->{QUERY_FRAME};
1567 if (defined $strand && !defined $qframe && $queryfactor) {
1568 $qframe = ( $qs % 3 ) * $strand;
1570 elsif (!defined $strand) {
1571 $qframe = 0;
1574 if( $qframe =~ /^([+-])?([0-3])/ ) {
1575 my $dir = $1 || '+';
1576 if($qframe && (($dir eq '-' && $strand >= 0) || ($dir eq '+' && $strand <= 0)) ) {
1577 $self->warn("Query frame ($qframe) did not match strand of query ($strand)");
1579 $qframe = $2 != 0 ? $2 - 1 : $2;
1580 } else {
1581 $self->warn("Unknown query frame ($qframe)");
1582 $qframe = 0;
1585 $sim1->frame($qframe);
1586 $self->SUPER::feature1($sim1);
1588 $self->{_created_qff} = 1;
1589 $self->{_making_qff} = 0;
1592 # make subject seq feature
1593 sub _subject_seq_feature {
1594 my $self = shift;
1595 $self->{_making_sff} = 1;
1596 my $hs = $self->{HIT_START};
1597 my $he = $self->{HIT_END};
1598 unless (defined $self->{_hit_factor}) {
1599 $self->_pre_seq_feature;
1601 my $hitfactor = $self->{_hit_factor};
1603 unless( defined $he && defined $hs ) { $self->throw("Did not specify a Hit End or Hit Begin"); }
1605 my $strand;
1606 if ($he > $hs) { # normal subject
1607 if ($hitfactor) {
1608 $strand = 1;
1610 else {
1611 $strand = undef;
1614 else {
1615 if ($hitfactor) {
1616 $strand = -1;
1618 else {
1619 $strand = undef;
1621 ($hs,$he) = ( $he,$hs); # reverse subject: start bigger than end
1624 my $sim2 = $self->{_sim2} || Bio::SeqFeature::Similarity->new();
1625 $sim2->start($hs);
1626 $sim2->end($he);
1627 $sim2->significance($self->{EVALUE});
1628 $sim2->bits($self->{BITS});
1629 $sim2->score($self->{SCORE});
1630 $sim2->strand($strand);
1631 $sim2->seq_id($self->{HIT_NAME});
1632 $sim2->seqlength($self->{HIT_LENGTH});
1633 $sim2->source_tag($self->{ALGORITHM});
1634 $sim2->seqdesc($self->{HIT_DESC});
1635 $sim2->add_tag_value('meta', $self->{META}) if $self->can('meta');
1636 my $hframe = $self->{HIT_FRAME};
1638 if (defined $strand && !defined $hframe && $hitfactor) {
1639 $hframe = ( $hs % 3 ) * $strand;
1641 elsif (!defined $strand) {
1642 $hframe = 0;
1645 if( $hframe =~ /^([+-])?([0-3])/ ) {
1646 my $dir = $1 || '+';
1647 if($hframe && (($dir eq '-' && $strand >= 0) || ($dir eq '+' && $strand <= 0)) ) {
1648 $self->warn("Subject frame ($hframe) did not match strand of subject ($strand)");
1650 $hframe = $2 != 0 ? $2 - 1 : $2;
1651 } else {
1652 $self->warn("Unknown subject frame ($hframe)");
1653 $hframe = 0;
1656 $sim2->frame($hframe);
1657 $self->SUPER::feature2($sim2);
1659 $self->{_created_sff} = 1;
1660 $self->{_making_sff} = 0;
1663 # before calling the num_* methods
1664 sub _pre_similar_stats {
1665 my $self = shift;
1666 my $identical = $self->{IDENTICAL};
1667 my $conserved = $self->{CONSERVED};
1668 my $percent_id = $self->{PERCENT_IDENTITY};
1670 if (! defined $identical) {
1671 if (! defined $percent_id) {
1672 $self->warn("Did not defined the number of identical matches or overall percent identity in the HSP; assuming 0");
1673 $identical = 0;
1675 else {
1676 $identical = sprintf("%.0f",$percent_id * $self->{HSP_LENGTH});
1680 if (! defined $conserved) {
1681 $self->warn("Did not define the number of conserved matches in the HSP; assuming conserved == identical ($identical)")
1682 if( $self->{ALGORITHM} !~ /^((FAST|BLAST)N)|EXONERATE|SIM4|AXT|PSL|BLAT|BLASTZ|WABA/oi);
1683 $conserved = $identical;
1685 $self->{IDENTICAL} = $identical;
1686 $self->{CONSERVED} = $conserved;
1687 $self->{_did_presimilar} = 1;
1690 # before calling the frac_* methods
1692 sub _pre_frac {
1693 my $self = shift;
1695 my $hsp_len = $self->{HSP_LENGTH};
1696 my $hit_len = $self->{HIT_LENGTH};
1697 my $query_len = $self->{QUERY_LENGTH};
1699 my $identical = $self->num_identical;
1700 my $conserved = $self->num_conserved;
1702 $self->{_did_prefrac} = 1;
1703 my $logical;
1704 if( $hsp_len ) {
1705 $self->length('total', $hsp_len);
1706 $logical = $self->_logical_length('total');
1707 $self->frac_identical( 'total', $identical / $hsp_len);
1708 $self->frac_conserved( 'total', $conserved / $hsp_len);
1710 if( $hit_len ) {
1711 $logical = $self->_logical_length('hit');
1712 $self->frac_identical( 'hit', $identical / $logical);
1713 $self->frac_conserved( 'hit', $conserved / $logical);
1715 if( $query_len ) {
1716 $logical = $self->_logical_length('query');
1717 $self->frac_identical( 'query', $identical / $logical) ;
1718 $self->frac_conserved( 'query', $conserved / $logical);
1722 # before calling gaps()
1723 # This relies first on passed parameters (parser-dependent), then on gaps
1724 # calculated by seq_inds() (if set), then falls back to directly checking
1725 # for '-' or '.' as a last resort
1727 sub _pre_gaps {
1728 my $self = shift;
1729 my $query_gaps = $self->{QUERY_GAPS};
1730 my $query_seq = $self->{QUERY_SEQ};
1731 my $hit_gaps = $self->{HIT_GAPS};
1732 my $hit_seq = $self->{HIT_SEQ};
1733 my $gaps = $self->{HSP_GAPS};
1735 $self->{_did_pregaps} = 1; # well, we're in the process; avoid recursion
1736 if( defined $query_gaps ) {
1737 $self->gaps('query', $query_gaps);
1738 } elsif( defined $query_seq ) {
1739 my $qg = (defined $self->{'_query_offset'}) ? $self->seq_inds('query','gaps')
1740 : ($self->algorithm eq 'ERPIN') ? scalar( $hit_seq =~ tr/\-//)
1741 : scalar( $query_seq =~ tr/\-\.// ); # HMMER3 and Infernal uses '.' and '-'
1742 my $offset = $self->{'_query_offset'} || 1;
1743 $self->gaps('query', $qg/$offset);
1745 if( defined $hit_gaps ) {
1746 $self->gaps('hit', $hit_gaps);
1747 } elsif( defined $hit_seq ) {
1748 my $hg = (defined $self->{'_sbjct_offset'}) ? $self->seq_inds('hit','gaps')
1749 : ($self->algorithm eq 'ERPIN') ? scalar( $hit_seq =~ tr/\-//)
1750 : scalar( $hit_seq =~ tr/\-\.// ); # HMMER3 and Infernal uses '.' and '-'
1751 my $offset = $self->{'_sbjct_offset'} || 1;
1752 $self->gaps('hit', $hg/$offset);
1754 if( ! defined $gaps ) {
1755 $gaps = $self->gaps("query") + $self->gaps("hit");
1757 $self->gaps('total', $gaps);
1760 # before percent_identity
1761 sub _pre_pi {
1762 my $self = shift;
1763 $self->{_did_prepi} = 1;
1764 $self->percent_identity($self->{PERCENT_IDENTITY} || $self->frac_identical('total')*100) if( $self->{HSP_LENGTH} > 0 );