maint: remove Travis stuff which has been replaced with Github actions (#325)
[bioperl-live.git] / lib / Bio / Search / HSP / GenericHSP.pm
bloba69e1593c46df43f142dac179ea4b385255ff44a
2 # BioPerl module for Bio::Search::HSP::GenericHSP
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Jason Stajich <jason@bioperl.org>
8 # Copyright Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::Search::HSP::GenericHSP - A "Generic" implementation of a High Scoring Pair
18 =head1 SYNOPSIS
20 my $hsp = Bio::Search::HSP::GenericHSP->new( -algorithm => 'blastp',
21 -evalue => '1e-30',
24 $r_type = $hsp->algorithm;
26 $pvalue = $hsp->p();
28 $evalue = $hsp->evalue();
30 $frac_id = $hsp->frac_identical( ['query'|'hit'|'total'] );
32 $frac_cons = $hsp->frac_conserved( ['query'|'hit'|'total'] );
34 $gaps = $hsp->gaps( ['query'|'hit'|'total'] );
36 $qseq = $hsp->query_string;
38 $hseq = $hsp->hit_string;
40 $homo_string = $hsp->homology_string;
42 $len = $hsp->length( ['query'|'hit'|'total'] );
44 $len = $hsp->length( ['query'|'hit'|'total'] );
46 $rank = $hsp->rank;
48 # TODO: Describe how to configure a SearchIO stream so that it generates
49 # GenericHSP objects.
51 =head1 DESCRIPTION
53 This implementation is "Generic", meaning it is is suitable for
54 holding information about High Scoring pairs from most Search reports
55 such as BLAST and FastA. Specialized objects can be derived from
56 this.
58 Unless you're writing a parser, you won't ever need to create a
59 GenericHSP or any other HSPI-implementing object. If you use
60 the SearchIO system, HSPI objects are created automatically from
61 a SearchIO stream which returns Bio::Search::Result::ResultI objects
62 and you get the HSPI objects via the ResultI API.
64 For documentation on what you can do with GenericHSP (and other HSPI
65 objects), please see the API documentation in
66 L<Bio::Search::HSP::HSPI|Bio::Search::HSP::HSPI>.
68 =head1 FEEDBACK
70 =head2 Mailing Lists
72 User feedback is an integral part of the evolution of this and other
73 Bioperl modules. Send your comments and suggestions preferably to
74 the Bioperl mailing list. Your participation is much appreciated.
76 bioperl-l@bioperl.org - General discussion
77 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
79 =head2 Support
81 Please direct usage questions or support issues to the mailing list:
83 I<bioperl-l@bioperl.org>
85 rather than to the module maintainer directly. Many experienced and
86 reponsive experts will be able look at the problem and quickly
87 address it. Please include a thorough description of the problem
88 with code and data examples if at all possible.
90 =head2 Reporting Bugs
92 Report bugs to the Bioperl bug tracking system to help us keep track
93 of the bugs and their resolution. Bug reports can be submitted via the
94 web:
96 https://github.com/bioperl/bioperl-live/issues
98 =head1 AUTHOR - Jason Stajich and Steve Chervitz
100 Email jason-at-bioperl.org
101 Email sac-at-bioperl.org
103 =head1 CONTRIBUTORS
105 Sendu Bala, bix@sendu.me.uk
107 =head1 APPENDIX
109 The rest of the documentation details each of the object methods.
110 Internal methods are usually preceded with a _
112 =cut
114 # Let the code begin...
116 package Bio::Search::HSP::GenericHSP;
118 use strict;
120 use Bio::Root::Root;
121 use Bio::SeqFeature::Similarity;
123 use base qw(Bio::Search::HSP::HSPI);
125 =head2 new
127 Title : new
128 Usage : my $obj = Bio::Search::HSP::GenericHSP->new();
129 Function: Builds a new Bio::Search::HSP::GenericHSP object
130 Returns : Bio::Search::HSP::GenericHSP
131 Args : -algorithm => algorithm used (BLASTP, TBLASTX, FASTX, etc)
132 -evalue => evalue
133 -pvalue => pvalue
134 -bits => bit value for HSP
135 -score => score value for HSP (typically z-score but depends on
136 analysis)
137 -hsp_length=> Length of the HSP (including gaps)
138 -identical => # of residues that that matched identically
139 -percent_identity => (optional) percent identity
140 -conserved => # of residues that matched conservatively
141 (only protein comparisons;
142 conserved == identical in nucleotide comparisons)
143 -hsp_gaps => # of gaps in the HSP
144 -query_gaps => # of gaps in the query in the alignment
145 -hit_gaps => # of gaps in the subject in the alignment
146 -query_name => HSP Query sequence name (if available)
147 -query_start => HSP Query start (in original query sequence coords)
148 -query_end => HSP Query end (in original query sequence coords)
149 -query_length=> total length of the query sequence
150 -query_seq => query sequence portion of the HSP
151 -query_desc => textual description of the query
152 -hit_name => HSP Hit sequence name (if available)
153 -hit_start => HSP Hit start (in original hit sequence coords)
154 -hit_end => HSP Hit end (in original hit sequence coords)
155 -hit_length => total length of the hit sequence
156 -hit_seq => hit sequence portion of the HSP
157 -hit_desc => textual description of the hit
158 -homology_seq=> homology sequence for the HSP
159 -hit_frame => hit frame (only if hit is translated protein)
160 -query_frame => query frame (only if query is translated protein)
161 -rank => HSP rank
162 -links => HSP links information (WU-BLAST only)
163 -hsp_group => HSP Group informat (WU-BLAST only)
164 -gap_symbol => symbol representing a gap (default = '-')
165 -hit_features=> string of features found in or near HSP hit
166 region (reported in some BLAST text output,
167 v. 2.2.13 and up)
168 -stranded => If the algorithm isn't known (i.e. defaults to
169 'generic'), setting this will indicate start/end
170 coordinates are to be used to determine the strand
171 for 'query', 'subject', 'hit', 'both', or 'none'
172 (default = 'none')
174 =cut
176 sub new {
177 my($class,%args) = @_;
179 # don't pass anything to SUPER; complex hierarchy results in lots of work
180 # for nothing
182 my $self = $class->SUPER::new();
184 # for speed, don't use _rearrange and just store all input data directly
185 # with no method calls and no work done. work can be carried
186 # out just-in-time later if desired
187 while (my ($arg, $value) = each %args) {
188 $arg =~ tr/a-z\055/A-Z/d;
189 $self->{$arg} = $value;
191 my $bits = $self->{BITS};
193 defined $self->{VERBOSE} && $self->verbose($self->{VERBOSE});
194 if (exists $self->{GAP_SYMBOL}) {
195 # not checking anything else but the length (must be 1 as only one gap
196 # symbol allowed currently); can add in support for symbol checks or
197 # multiple symbols later if needed
198 $self->throw("Gap symbol must be of length 1") if
199 CORE::length($self->{GAP_SYMBOL}) != 1;
200 } else {
201 # dafault
202 $self->{GAP_SYMBOL} = '-';
204 $self->{ALGORITHM} ||= 'GENERIC';
205 $self->{STRANDED} ||= 'NONE';
207 if (! defined $self->{QUERY_LENGTH} || ! defined $self->{HIT_LENGTH}) {
208 $self->throw("Must define hit and query length");
211 $self->{'_sequenceschanged'} = 1;
213 $self->{_finished_new} = 1;
214 return $self;
217 sub _logical_length {
218 my ($self, $type) = @_;
219 if (!defined($self->{_sbjct_offset}) || !defined($self->{_query_offset})) {
220 $self->_calculate_seq_offsets();
222 my $key = $type =~ /sbjct|hit|tot/i ? 'sbjct' : 'query';
224 my $offset = $self->{"_${key}_offset"};
225 return $self->length($type) / $offset ;
228 =head2 L<Bio::Search::HSP::HSPI> methods
230 Implementation of L<Bio::Search::HSP::HSPI> methods follow
232 =head2 algorithm
234 Title : algorithm
235 Usage : my $r_type = $hsp->algorithm
236 Function: Obtain the name of the algorithm used to obtain the HSP
237 Returns : string (e.g., BLASTP)
238 Args : [optional] scalar string to set value
240 =cut
242 sub algorithm{
243 my ($self,$value) = @_;
244 my $previous = $self->{'ALGORITHM'};
245 if( defined $value || ! defined $previous ) {
246 $value = $previous = '' unless defined $value;
247 $self->{'ALGORITHM'} = $value;
250 return $previous;
253 =head2 pvalue
255 Title : pvalue
256 Usage : my $pvalue = $hsp->pvalue();
257 Function: Returns the P-value for this HSP or undef
258 Returns : float or exponential (2e-10)
259 P-value is not defined with NCBI Blast2 reports.
260 Args : [optional] numeric to set value
262 =cut
264 sub pvalue {
265 my ($self,$value) = @_;
266 my $previous = $self->{'PVALUE'};
267 if( defined $value ) {
268 $self->{'PVALUE'} = $value;
270 return $previous;
273 =head2 evalue
275 Title : evalue
276 Usage : my $evalue = $hsp->evalue();
277 Function: Returns the e-value for this HSP
278 Returns : float or exponential (2e-10)
279 Args : [optional] numeric to set value
281 =cut
283 sub evalue {
284 my ($self,$value) = @_;
285 my $previous = $self->{'EVALUE'};
286 if( defined $value ) {
287 $self->{'EVALUE'} = $value;
289 return $previous;
292 =head2 frac_identical
294 Title : frac_identical
295 Usage : my $frac_id = $hsp->frac_identical( ['query'|'hit'|'total'] );
296 Function: Returns the fraction of identitical positions for this HSP
297 Returns : Float in range 0.0 -> 1.0
298 Args : arg 1: 'query' = num identical / length of query seq (without gaps)
299 'hit' = num identical / length of hit seq (without gaps)
300 synonyms: 'sbjct', 'subject'
301 'total' = num identical / length of alignment (with gaps)
302 synonyms: 'hsp'
303 default = 'total'
304 arg 2: [optional] frac identical value to set for the type requested
305 Note : for translated sequences, this adjusts the length accordingly
307 =cut
309 sub frac_identical {
310 my ($self, $type,$value) = @_;
312 unless ($self->{_did_prefrac}) {
313 $self->_pre_frac;
316 $type = lc $type if defined $type;
317 $type = 'hit' if( defined $type &&
318 $type =~ /subject|sbjct/);
319 $type = 'total' if( ! defined $type || $type eq 'hsp' ||
320 $type !~ /query|hit|subject|sbjct|total/);
321 my $previous = $self->{'_frac_identical'}->{$type};
322 if( defined $value || ! defined $previous ) {
323 $value = $previous = '' unless defined $value;
324 if( $type eq 'hit' || $type eq 'query' ) {
325 $self->$type()->frac_identical( $value);
327 $self->{'_frac_identical'}->{$type} = $value;
329 return $previous;
333 =head2 frac_conserved
335 Title : frac_conserved
336 Usage : my $frac_cons = $hsp->frac_conserved( ['query'|'hit'|'total'] );
337 Function : Returns the fraction of conserved positions for this HSP.
338 This is the fraction of symbols in the alignment with a
339 positive score.
340 Returns : Float in range 0.0 -> 1.0
341 Args : arg 1: 'query' = num conserved / length of query seq (without gaps)
342 'hit' = num conserved / length of hit seq (without gaps)
343 synonyms: 'sbjct', 'subject'
344 'total' = num conserved / length of alignment (with gaps)
345 synonyms: 'hsp'
346 default = 'total'
347 arg 2: [optional] frac conserved value to set for the type requested
349 =cut
351 sub frac_conserved {
352 my ($self, $type,$value) = @_;
354 unless ($self->{_did_prefrac}) {
355 $self->_pre_frac;
358 $type = lc $type if defined $type;
359 $type = 'hit' if( defined $type && $type =~ /subject|sbjct/);
360 $type = 'total' if( ! defined $type || $type eq 'hsp' ||
361 $type !~ /query|hit|subject|sbjct|total/);
362 my $previous = $self->{'_frac_conserved'}->{$type};
363 if( defined $value || ! defined $previous ) {
364 $value = $previous = '' unless defined $value;
365 $self->{'_frac_conserved'}->{$type} = $value;
367 return $previous;
370 =head2 gaps
372 Title : gaps
373 Usage : my $gaps = $hsp->gaps( ['query'|'hit'|'total'] );
374 Function : Get the number of gap characters in the query, hit, or total alignment.
375 Returns : Integer, number of gaps or 0 if none
376 Args : arg 1: 'query' = num gap characters in query seq
377 'hit' = num gap characters in hit seq; synonyms: 'sbjct', 'subject'
378 'total' = num gap characters in whole alignment; synonyms: 'hsp'
379 default = 'total'
380 arg 2: [optional] integer gap value to set for the type requested
382 =cut
384 sub gaps {
385 my ($self, $type, $value) = @_;
387 unless ($self->{_did_pregaps}) {
388 $self->_pre_gaps;
391 $type = lc $type if defined $type;
392 $type = 'total' if( ! defined $type || $type eq 'hsp' ||
393 $type !~ /query|hit|subject|sbjct|total/);
394 $type = 'hit' if $type =~ /sbjct|subject/;
395 my $previous = $self->{'_gaps'}->{$type};
396 if( defined $value || ! defined $previous ) {
397 $value = $previous = '' unless defined $value;
398 $self->{'_gaps'}->{$type} = $value;
400 return $previous || 0;
403 =head2 query_string
405 Title : query_string
406 Usage : my $qseq = $hsp->query_string;
407 Function: Retrieves the query sequence of this HSP as a string
408 Returns : string
409 Args : [optional] string to set for query sequence
412 =cut
414 sub query_string{
415 my ($self,$value) = @_;
416 my $previous = $self->{QUERY_SEQ};
417 if( defined $value || ! defined $previous ) {
418 $value = $previous = '' unless defined $value;
419 $self->{QUERY_SEQ} = $value;
420 # do some housekeeping so we know when to
421 # re-run _calculate_seq_positions
422 $self->{'_sequenceschanged'} = 1;
424 return $previous;
427 =head2 hit_string
429 Title : hit_string
430 Usage : my $hseq = $hsp->hit_string;
431 Function: Retrieves the hit sequence of this HSP as a string
432 Returns : string
433 Args : [optional] string to set for hit sequence
436 =cut
438 sub hit_string{
439 my ($self,$value) = @_;
440 my $previous = $self->{HIT_SEQ};
441 if( defined $value || ! defined $previous ) {
442 $value = $previous = '' unless defined $value;
443 $self->{HIT_SEQ} = $value;
444 # do some housekeeping so we know when to
445 # re-run _calculate_seq_positions
446 $self->{'_sequenceschanged'} = 1;
448 return $previous;
451 =head2 homology_string
453 Title : homology_string
454 Usage : my $homo_string = $hsp->homology_string;
455 Function: Retrieves the homology sequence for this HSP as a string.
456 : The homology sequence is the string of symbols in between the
457 : query and hit sequences in the alignment indicating the degree
458 : of conservation (e.g., identical, similar, not similar).
459 Returns : string
460 Args : [optional] string to set for homology sequence
462 =cut
464 sub homology_string{
465 my ($self,$value) = @_;
466 my $previous = $self->{HOMOLOGY_SEQ};
467 if( defined $value || ! defined $previous ) {
468 $value = $previous = '' unless defined $value;
469 $self->{HOMOLOGY_SEQ} = $value;
470 # do some housekeeping so we know when to
471 # re-run _calculate_seq_positions
472 $self->{'_sequenceschanged'} = 1;
474 return $previous;
477 =head2 consensus_string
479 Title : consensus_string
480 Usage : my $cs_string = $hsp->consensus_string;
481 Function: Retrieves the consensus structure line for this HSP as a string (HMMER).
482 : If the model had any consensus structure or reference line annotation
483 : that it inherited from a multiple alignment (#=GC SS cons,
484 : #=GC RF annotation in Stockholm files), that information is shown
485 : as CS or RF annotation line.
486 Returns : string
487 Args : [optional] string to set for consensus structure
489 =cut
491 sub consensus_string {
492 my ($self,$value) = @_;
493 my $previous = $self->{CS_SEQ};
494 if( defined $value || ! defined $previous ) {
495 $value = $previous = '' unless defined $value;
496 $self->{CS_SEQ} = $value;
497 # do some housekeeping so we know when to
498 # re-run _calculate_seq_positions
499 $self->{'_sequenceschanged'} = 1;
501 return $previous;
504 =head2 posterior_string
506 Title : posterior_string
507 Usage : my $pp_string = $hsp->posterior_string;
508 Function: Retrieves the posterior probability line for this HSP as a string (HMMer3).
509 : The posterior probability is the string of symbols at the bottom
510 : of the alignment indicating the expected accuracy of each aligned residue.
511 : A 0 means 0-5%, 1 means 5-15%, and so on; 9 means 85-95%,
512 : and a * means 95-100% posterior probability.
513 Returns : string
514 Args : [optional] string to set for posterior probability
516 =cut
518 sub posterior_string {
519 my ($self,$value) = @_;
520 my $previous = $self->{PP_SEQ};
521 if( defined $value || ! defined $previous ) {
522 $value = $previous = '' unless defined $value;
523 $self->{PP_SEQ} = $value;
524 # do some housekeeping so we know when to
525 # re-run _calculate_seq_positions
526 $self->{'_sequenceschanged'} = 1;
528 return $previous;
531 =head2 length
533 Title : length
534 Usage : my $len = $hsp->length( ['query'|'hit'|'total'] );
535 Function : Returns the length of the query or hit in the alignment
536 (without gaps)
537 or the aggregate length of the HSP (including gaps;
538 this may be greater than either hit or query )
539 Returns : integer
540 Args : arg 1: 'query' = length of query seq (without gaps)
541 'hit' = length of hit seq (without gaps) (synonyms: sbjct, subject)
542 'total' = length of alignment (with gaps)
543 default = 'total'
544 arg 2: [optional] integer length value to set for specific type
546 =cut
548 sub length {
550 my $self = shift;
551 my $type = shift;
553 $type = 'total' unless defined $type;
554 $type = lc $type;
556 if( $type =~ /^q/i ) {
557 return $self->query()->length(shift);
558 } elsif( $type =~ /^(hit|subject|sbjct)/ ) {
559 return $self->hit()->length(shift);
560 } else {
561 my $v = shift;
562 if( defined $v ) {
563 $self->{HSP_LENGTH} = $v;
565 return $self->{HSP_LENGTH};
567 return 0; # should never get here
570 =head2 hsp_length
572 Title : hsp_length
573 Usage : my $len = $hsp->hsp_length()
574 Function: shortcut length('hsp')
575 Returns : floating point between 0 and 100
576 Args : none
578 =cut
580 sub hsp_length { return shift->length('hsp', shift); }
582 =head2 percent_identity
584 Title : percent_identity
585 Usage : my $percentid = $hsp->percent_identity()
586 Function: Returns the calculated percent identity for an HSP
587 Returns : floating point between 0 and 100
588 Args : none
591 =cut
593 sub percent_identity {
594 my $self = shift;
596 unless ($self->{_did_prepi}) {
597 $self->_pre_pi;
600 return $self->SUPER::percent_identity(@_);
603 =head2 frame
605 Title : frame
606 Usage : my ($qframe, $hframe) = $hsp->frame('list',$queryframe,$subjectframe)
607 Function: Set the Frame for both query and subject and insure that
608 they agree.
609 This overrides the frame() method implementation in
610 FeaturePair.
611 Returns : array of query and subject frame if return type wants an array
612 or query frame if defined or subject frame if not defined
613 Args : 'hit' or 'subject' or 'sbjct' to retrieve the frame of the subject (default)
614 'query' to retrieve the query frame
615 'list' or 'array' to retrieve both query and hit frames together
616 Note : Frames are stored in the GFF way (0-2) not 1-3
617 as they are in BLAST (negative frames are deduced by checking
618 the strand of the query or hit)
620 =cut
622 # Note: changed 4/19/08 - bug 2485
624 # frame() is supposed to be a getter/setter (as is implied by the Function desc
625 # above; this is also consistent with Bio::SeqFeature::SimilarityPair). Also,
626 # the API is not consistent with the other HSP/SimilarityPair methods such as
627 # strand(), start(), end(), etc.
629 # In order to make this consistent with other methods all work is now done
630 # when the features are instantiated and not delayed. We compromise by
631 # defaulting back 'to hit' w/o passed args. Setting is now allowed.
633 sub frame {
634 my $self = shift;
635 my $val = shift;
636 if (!defined $val) {
637 # unfortunately, w/o args we need to warn about API changes
638 $self->warn("API for frame() has changed.\n".
639 "Please refer to documentation for Bio::Search::HSP::GenericHSP;\n".
640 "returning query frame");
641 $val = 'query';
643 $val =~ s/^\s+//;
645 if( $val =~ /^q/i ) {
646 return $self->query->frame(@_);
647 } elsif( $val =~ /^hi|^s/i ) {
648 return $self->hit->frame(@_);
649 } elsif ( $val =~ /^list|array/i ) {
650 return ($self->query->frame($_[0]),
651 $self->hit->frame($_[1]) );
652 } elsif ( $val =~ /^\d+$/) {
653 # old API i.e. frame($query_frame, $hit_frame). This catches all but one
654 # case, where no arg is passed (so the hit is wanted).
655 $self->warn("API for frame() has changed.\n".
656 "Please refer to documentation for Bio::Search::HSP::GenericHSP");
657 wantarray ?
658 return ($self->query->frame($val),
659 $self->hit->frame(@_) ) :
660 return $self->hit->frame($val,@_);
661 } else {
662 $self->warn("unrecognized component '$val' requested\n");
664 return 0;
667 =head2 get_aln
669 Title : get_aln
670 Usage : my $aln = $hsp->gel_aln
671 Function: Returns a L<Bio::SimpleAlign> object representing the HSP alignment
672 Returns : L<Bio::SimpleAlign>
673 Args : none
675 =cut
677 sub get_aln {
678 my ($self) = @_;
679 require Bio::LocatableSeq;
680 require Bio::SimpleAlign;
682 my $aln = Bio::SimpleAlign->new();
683 my $hs = $self->hit_string();
684 my $qs = $self->query_string();
685 # FASTA specific stuff moved to the FastaHSP object
686 my $seqonly = $qs;
687 $seqonly =~ s/[\-\s]//g;
688 my ($q_nm,$s_nm) = ($self->query->seq_id(),
689 $self->hit->seq_id());
690 # Should we silently change the name of the query or hit if it isn't
691 # defined? May need revisiting... cjfields 2008-12-3 (commented out below)
693 #unless( defined $q_nm && CORE::length ($q_nm) ) {
694 # $q_nm = 'query';
696 #unless( defined $s_nm && CORE::length ($s_nm) ) {
697 # $s_nm = 'hit';
700 # mapping: 1 residues for every x coordinate positions
701 my $query = Bio::LocatableSeq->new(
702 -seq => $qs,
703 -id => $q_nm,
704 -start => $self->query->start,
705 -end => $self->query->end,
706 -strand => $self->query->strand,
707 -force_nse => $q_nm ? 0 : 1,
708 -mapping => [ 1, $self->{_query_mapping} ]
710 $seqonly = $hs;
711 $seqonly =~ s/[\-\s]//g;
712 my $hit = Bio::LocatableSeq->new(
713 -seq => $hs,
714 -id => $s_nm,
715 -start => $self->hit->start,
716 -end => $self->hit->end,
717 -strand => $self->hit->strand,
718 -force_nse => $s_nm ? 0 : 1,
719 -mapping => [ 1, $self->{_hit_mapping} ]
721 $aln->add_seq($query);
722 $aln->add_seq($hit);
723 return $aln;
726 =head2 num_conserved
728 Title : num_conserved
729 Usage : $obj->num_conserved($newval)
730 Function: returns the number of conserved residues in the alignment
731 Returns : integer
732 Args : integer (optional)
735 =cut
737 sub num_conserved{
738 my ($self,$value) = @_;
740 unless ($self->{_did_presimilar}) {
741 $self->_pre_similar_stats;
744 if (defined $value) {
745 $self->{CONSERVED} = $value;
747 return $self->{CONSERVED};
750 =head2 num_identical
752 Title : num_identical
753 Usage : $obj->num_identical($newval)
754 Function: returns the number of identical residues in the alignment
755 Returns : integer
756 Args : integer (optional)
759 =cut
761 sub num_identical{
762 my ($self,$value) = @_;
764 unless ($self->{_did_presimilar}) {
765 $self->_pre_similar_stats;
768 if( defined $value) {
769 $self->{IDENTICAL} = $value;
771 return $self->{IDENTICAL};
774 =head2 rank
776 Usage : $hsp->rank( [string] );
777 Purpose : Get the rank of the HSP within a given Blast hit.
778 Example : $rank = $hsp->rank;
779 Returns : Integer (1..n) corresponding to the order in which the HSP
780 appears in the BLAST report.
782 =cut
784 sub rank {
785 my ($self,$value) = @_;
786 if( defined $value) {
787 $self->{RANK} = $value;
789 return $self->{RANK};
792 =head2 seq_inds
794 Title : seq_inds
795 Purpose : Get a list of residue positions (indices) for all identical
796 : or conserved residues in the query or sbjct sequence.
797 Example : @s_ind = $hsp->seq_inds('query', 'identical');
798 : @h_ind = $hsp->seq_inds('hit', 'conserved');
799 : @h_ind = $hsp->seq_inds('hit', 'conserved-not-identical');
800 : @h_ind = $hsp->seq_inds('hit', 'conserved', 1);
801 Returns : List of integers
802 : May include ranges if collapse is true.
803 Argument : seq_type = 'query' or 'hit' or 'sbjct' (default = query)
804 : ('sbjct' is synonymous with 'hit')
805 : class = 'identical' - identical positions
806 : 'conserved' - conserved positions
807 : 'nomatch' - mismatched residue or gap positions
808 : 'mismatch' - mismatched residue positions (no gaps)
809 : 'gap' - gap positions only
810 : 'frameshift'- frameshift positions only
811 : 'conserved-not-identical' - conserved positions w/o
812 : identical residues
813 : The name can be shortened to 'id' or 'cons' unless
814 : the name is . The default value is
815 : 'identical'
817 : collapse = boolean, if true, consecutive positions are merged
818 : using a range notation, e.g., "1 2 3 4 5 7 9 10 11"
819 : collapses to "1-5 7 9-11". This is useful for
820 : consolidating long lists. Default = no collapse.
822 Throws : n/a.
823 Comments : For HSPs partially or completely derived from translated sequences
824 : (TBLASTN, BLASTX, TBLASTX, or similar), some positional data
825 : cannot easily be attributed to a single position (i.e. the
826 : positional data is ambiguous). In these cases all three codon
827 : positions are reported. Under these conditions you can check
828 : ambiguous_seq_inds() to determine whether the query, subject,
829 : or both are ambiguous.
831 See Also : L<Bio::Search::SearchUtils::collapse_nums()|Bio::Search::SearchUtils>,
832 L<Bio::Search::Hit::HitI::seq_inds()|Bio::Search::Hit::HitI>
834 =cut
836 sub seq_inds{
837 my ($self, $seqType, $class, $collapse) = @_;
839 # prepare the internal structures - this is cached so
840 # if the strings have not changed we're okay
841 $self->_calculate_seq_positions();
843 $seqType ||= 'query';
844 $class ||= 'identical';
845 $collapse ||= 0;
846 $seqType = 'sbjct' if $seqType eq 'hit';
847 my $t = lc(substr($seqType,0,1));
848 if( $t eq 'q' ) {
849 $seqType = 'query';
850 } elsif ( $t eq 's' || $t eq 'h' ) {
851 $seqType = 'sbjct';
852 } else {
853 $self->warn("unknown seqtype $seqType using 'query'");
854 $seqType = 'query';
857 $t = lc(substr($class,0,1));
859 if( $t eq 'c' ) {
860 if( $class =~ /conserved\-not\-identical/ ) {
861 $class = 'conserved';
862 } else {
863 $class = 'conservedall';
865 } elsif( $t eq 'i' ) {
866 $class = 'identical';
867 } elsif( $t eq 'n' ) {
868 $class = 'nomatch';
869 } elsif( $t eq 'm' ) {
870 $class = 'mismatch';
871 } elsif( $t eq 'g' ) {
872 $class = 'gap';
873 } elsif( $t eq 'f' ) {
874 $class = 'frameshift';
875 } else {
876 $self->warn("unknown sequence class $class using 'identical'");
877 $class = 'identical';
880 ## Sensitive to member name changes.
881 $seqType = "_\L$seqType\E";
882 $class = "_\L$class\E";
883 my @ary;
885 if( $class eq '_gap' ) {
886 # this means that we are remapping the gap length that is stored
887 # in the hash (for example $self->{'_gapRes_query'} )
888 # so we'll return an array which has the values of the position of the
889 # of the gap (the key in the hash) + the gap length (value in the
890 # hash for this key - 1.
892 # changing this; since the index is the position prior to the insertion,
893 # repeat the position based on the number of gaps inserted
894 @ary = map { my @tmp;
895 # position holds number of gaps inserted
896 for my $g (1..$self->{seqinds}{"${class}Res$seqType"}->{$_}) {
897 push @tmp, $_ ;
899 @tmp}
900 sort { $a <=> $b } keys %{ $self->{seqinds}{"${class}Res$seqType"}};
901 } elsif( $class eq '_conservedall' ) {
902 @ary = sort { $a <=> $b }
903 keys %{ $self->{seqinds}{"_conservedRes$seqType"}},
904 keys %{ $self->{seqinds}{"_identicalRes$seqType"}},
905 } else {
906 @ary = sort { $a <=> $b } keys %{ $self->{seqinds}{"${class}Res$seqType"}};
908 require Bio::Search::BlastUtils if $collapse;
910 return $collapse ? &Bio::Search::SearchUtils::collapse_nums(@ary) : @ary;
913 =head2 ambiguous_seq_inds
915 Title : ambiguous_seq_inds
916 Purpose : returns a string denoting whether sequence indices for query,
917 : subject, or both are ambiguous
918 Returns : String; 'query', 'subject', 'query/subject', or empty string ''
919 Argument : none
920 Comments : For HSPs partially or completely derived from translated sequences
921 : (TBLASTN, BLASTX, TBLASTX, or similar), some positional data
922 : cannot easily be attributed to a single position (i.e. the
923 : positional data is ambiguous). In these cases all three codon
924 : positions are reported when using seq_inds(). Under these
925 : conditions you can check ambiguous_seq_inds() to determine whether
926 : the query, subject, or both are ambiguous.
927 See Also : L<Bio::Search::Hit::HSPI::seq_inds()>
929 =cut
931 sub ambiguous_seq_inds {
932 my $self = shift;
933 $self->_calculate_seq_positions();
934 my $type = ($self->{_query_offset} == 3 && $self->{_sbjct_offset} == 3) ?
935 'query/subject' :
936 ($self->{_query_offset} == 3) ? 'query' :
937 ($self->{_sbjct_offset} == 3) ? 'subject' : '';
938 return $type;
941 =head2 Inherited from L<Bio::SeqFeature::SimilarityPair>
943 These methods come from L<Bio::SeqFeature::SimilarityPair>
945 =head2 query
947 Title : query
948 Usage : my $query = $hsp->query
949 Function: Returns a SeqFeature representing the query in the HSP
950 Returns : L<Bio::SeqFeature::Similarity>
951 Args : [optional] new value to set
953 =cut
955 sub query {
956 my $self = shift;
957 unless ($self->{_created_qff}) {
958 $self->_query_seq_feature;
960 return $self->SUPER::query(@_);
963 sub feature1 {
964 my $self = shift;
965 if (! $self->{_finished_new} || $self->{_making_qff}) {
966 return $self->{_sim1} if $self->{_sim1};
967 $self->{_sim1} = Bio::SeqFeature::Similarity->new();
968 return $self->{_sim1};
970 unless ($self->{_created_qff}) {
971 $self->_query_seq_feature;
973 return $self->SUPER::feature1(@_);
976 =head2 hit
978 Title : hit
979 Usage : my $hit = $hsp->hit
980 Function: Returns a SeqFeature representing the hit in the HSP
981 Returns : L<Bio::SeqFeature::Similarity>
982 Args : [optional] new value to set
984 =cut
986 sub hit {
987 my $self = shift;
988 unless ($self->{_created_sff}) {
989 $self->_subject_seq_feature;
991 return $self->SUPER::hit(@_);
994 sub feature2 {
995 my $self = shift;
996 if (! $self->{_finished_new} || $self->{_making_sff}) {
997 return $self->{_sim2} if $self->{_sim2};
998 $self->{_sim2} = Bio::SeqFeature::Similarity->new();
999 return $self->{_sim2};
1001 unless ($self->{_created_sff}) {
1002 $self->_subject_seq_feature;
1004 return $self->SUPER::feature2(@_);
1007 =head2 significance
1009 Title : significance
1010 Usage : $evalue = $obj->significance();
1011 $obj->significance($evalue);
1012 Function: Get/Set the significance value
1013 Returns : numeric
1014 Args : [optional] new value to set
1016 =cut
1018 # Override significance to return the e-value or, if this is
1019 # not defined (WU-BLAST), return the p-value.
1020 sub significance {
1021 my ($self, $val) = @_;
1022 if (!defined $self->{SIGNIFICANCE} || defined $val) {
1023 $self->{SIGNIFICANCE} = defined $val ? $val :
1024 defined $self->evalue ? $self->evalue :
1025 defined $self->pvalue ? $$self->pvalue :
1026 undef;
1027 $self->query->significance($self->{SIGNIFICANCE});
1029 return $self->{SIGNIFICANCE};
1032 =head2 strand
1034 Title : strand
1035 Usage : $hsp->strand('query')
1036 Function: Retrieves the strand for the HSP component requested
1037 Returns : +1 or -1
1038 Args : 'hit' or 'subject' or 'sbjct' to retrieve the strand of the subject,
1039 'query' to retrieve the query strand (default)
1041 =cut
1043 sub strand {
1044 my ($self,$type) = @_;
1046 if( $type =~ /^q/i && defined $self->{'QUERY_STRAND'} ) {
1047 return $self->{'QUERY_STRAND'};
1048 } elsif( $type =~ /^(hit|subject|sbjct)/i && defined $self->{'HIT_STRAND'} ) {
1049 return $self->{'HIT_STRAND'};
1052 return $self->SUPER::strand($type)
1055 =head2 score
1057 Title : score
1058 Usage : $score = $obj->score();
1059 $obj->score($value);
1060 Function: Get/Set the score value
1061 Returns : numeric
1062 Args : [optional] new value to set
1064 =head2 bits
1066 Title : bits
1067 Usage : $bits = $obj->bits();
1068 $obj->bits($value);
1069 Function: Get/Set the bits value
1070 Returns : numeric
1071 Args : [optional] new value to set
1073 =head1 Private methods
1075 =cut
1077 =head2 _calculate_seq_positions
1079 Title : _calculate_seq_positions
1080 Usage : $self->_calculate_seq_positions
1081 Function: Internal function
1082 Returns :
1083 Args :
1086 =cut
1088 sub _calculate_seq_positions {
1089 my ($self,@args) = @_;
1090 return unless ( $self->{'_sequenceschanged'} );
1091 $self->{'_sequenceschanged'} = 0;
1092 my ($seqString, $qseq,$sseq) = ( $self->homology_string(),
1093 $self->query_string(),
1094 $self->hit_string() );
1095 my ($mlen, $qlen, $slen) = (CORE::length($seqString), CORE::length($qseq), CORE::length($sseq));
1096 my $qdir = $self->query->strand || 1;
1097 my $sdir = $self->hit->strand || 1;
1098 my ($resCount_query, $endpoint_query) = ($qdir <=0) ? ($self->query->end, $self->query->start)
1099 : ($self->query->start, $self->query->end);
1100 my ($resCount_sbjct, $endpoint_sbjct) = ($sdir <=0) ? ($self->hit->end, $self->hit->start)
1101 : ($self->hit->start, $self->hit->end);
1103 my $prog = $self->algorithm;
1105 if( $prog =~ /FAST|SSEARCH|SMITH-WATERMAN/i ) {
1107 # we infer the end of the regional sequence where the first and last
1108 # non spaces are in the homology string
1109 # then we use the HSP->length to tell us how far to read
1110 # to cut off the end of the sequence
1112 my ($start, $rest) = (0,0);
1113 if( $seqString =~ /^(\s+)?(.*?)\s*$/ ) {
1114 ($start, $rest) = ($1 ? CORE::length($1) : 0, CORE::length($2));
1117 $seqString = substr($seqString, $start, $rest);
1118 $qseq = substr($qseq, $start, $rest);
1119 $sseq = substr($sseq, $start, $rest);
1121 # commented out 10/29/08
1122 # removing frameshift symbols doesn't take into account the following:
1123 # 1) does not remove the same point in the homology string (get
1124 # positional errors)
1125 # 2) adjustments to the overall position in the string due to the
1126 # frameshift must be taken into consideration (get balancing errors)
1128 # Frameshifts will be handled directly in the main loop.
1129 # --chris
1131 # fasta reports some extra 'regional' sequence information
1132 # we need to clear out first
1133 # this seemed a bit insane to me at first, but it appears to
1134 # work --jason
1136 #$qseq =~ s![\\\/]!!g;
1137 #$sseq =~ s![\\\/]!!g;
1140 if (!defined($self->{_sbjct_offset}) || !defined($self->{_query_offset})) {
1141 $self->_calculate_seq_offsets();
1144 my ($qfs, $sfs) = (0,0);
1145 CHAR_LOOP:
1146 for my $pos (0..CORE::length($seqString)-1) {
1147 my @qrange = (0 - $qfs)..($self->{_query_offset} - 1);
1148 my @srange = (0 - $sfs)..($self->{_sbjct_offset} - 1);
1149 # $self->debug("QRange:@qrange SRange:@srange\n") if ($qfs || $sfs);
1150 ($qfs, $sfs) = (0,0);
1151 my ($mchar, $qchar, $schar) = (
1152 unpack("x$pos A1",$seqString) || ' ',
1153 $pos < CORE::length($qseq) ? unpack("x$pos A1",$qseq) : '-',
1154 $pos < CORE::length($sseq) ? unpack("x$pos A1",$sseq) : '-'
1156 my $matchtype = '';
1157 my ($qgap, $sgap) = (0,0);
1158 if( $mchar eq '+' || $mchar eq '.') { # conserved
1159 $self->{seqinds}{_conservedRes_query}{ $resCount_query + ($_ * $qdir) } = 1 for @qrange;
1160 $self->{seqinds}{_conservedRes_sbjct}{ $resCount_sbjct + ($_ * $sdir) } = 1 for @srange;
1161 $matchtype = 'conserved';
1162 } elsif( $mchar eq ':' || $mchar ne ' ' ) { # identical
1163 $self->{seqinds}{_identicalRes_query}{ $resCount_query + ($_ * $qdir) } = 1 for @qrange;
1164 $self->{seqinds}{_identicalRes_sbjct}{ $resCount_sbjct + ($_ * $sdir) } = 1 for @srange;
1165 $matchtype = 'identical';
1166 } elsif( $mchar eq ' ' ) { # possible mismatch/nomatch/frameshift
1167 $qfs = $qchar eq '/' ? -1 : # base inserted to match frame
1168 $qchar eq '\\' ? 1 : # base deleted to match frame
1170 $sfs = $schar eq '/' ? -1 :
1171 $schar eq '\\' ? 1 :
1173 if ($qfs) {
1174 # Frameshifts are tricky.
1176 # '/' indicates that the next residue is shifted back one
1177 # (-1) frame position and is a deletion of one base (so to
1178 # correctly align, a base is inserted). That residue should only
1179 # occupy two sequence positions instead of three.
1181 # '\' indicates that the next residue is shifted forward
1182 # one (+1) frame position and is an insertion of one base (so to
1183 # correctly align, a base is removed). That residue should
1184 # occupy four sequence positions instead of three.
1186 # Note that gaps are not counted across from frameshifts.
1187 # Frameshift indices are a range of positions starting in the
1188 # previous sequence position in which the frameshift occurs;
1189 # they are ambiguous by nature.
1190 $self->{seqinds}{_frameshiftRes_query}{ $resCount_query - ($_ * $qdir * $qfs) } = $qfs for @qrange;
1191 $matchtype = "$qfs frameshift-query";
1192 $sgap = $qgap = 1;
1194 elsif ($sfs) {
1195 $self->{seqinds}{_frameshiftRes_sbjct}{ $resCount_sbjct - ($_ * $sdir * $sfs) } = $sfs for @srange;
1196 $matchtype = "$sfs frameshift-sbcjt";
1197 $sgap = $qgap = 1;
1199 elsif ($qchar eq $self->{GAP_SYMBOL}) {
1200 # gap are counted as being in the immediately preceeding residue
1201 # position; for translated sequences this is not the start of
1202 # the previous codon, but the third codon position
1203 $self->{seqinds}{_gapRes_query}{ $resCount_query - $qdir }++ for @qrange;
1204 $matchtype = 'gap-query';
1205 $qgap++;
1207 elsif ($schar eq $self->{GAP_SYMBOL}) {
1208 $self->{seqinds}{_gapRes_sbjct}{ $resCount_sbjct - $sdir }++ for @srange;
1209 $matchtype = 'gap-sbjct';
1210 $sgap++;
1212 else {
1213 # if not a gap or frameshift in either seq, must be mismatch
1214 $self->{seqinds}{_mismatchRes_query}{ $resCount_query + ($_ * $qdir) } = 1 for @qrange;
1215 $self->{seqinds}{_mismatchRes_sbjct}{ $resCount_sbjct + ($_ * $sdir) } = 1 for @srange;
1216 $matchtype = 'mismatch';
1218 # always add a nomatch unless the current position in the seq is a gap
1219 if (!$sgap) {
1220 $self->{seqinds}{_nomatchRes_sbjct}{ $resCount_sbjct + ($_ * $sdir) } = 1 for @srange;
1222 if (!$qgap) {
1223 $self->{seqinds}{_nomatchRes_query}{ $resCount_query + ($_ * $qdir) } = 1 for @qrange;
1225 } else {
1226 $self->warn("Unknown midline character: [$mchar]");
1228 # leave in and uncomment for future debugging
1229 #$self->debug(sprintf("%7d %1s[%1s]%1s %-7d Type: %-20s QOff:%-2d SOff:%-2d\n",
1230 # $resCount_query,
1231 # $qchar,
1232 # $mchar,
1233 # $schar,
1234 # $resCount_sbjct,
1235 # $matchtype,
1236 # ($self->{_query_offset} * $qdir),
1237 # ($self->{_sbjct_offset} * $sdir)));
1238 $resCount_query += ($qdir * (scalar(@qrange) + $qfs)) if !$qgap;
1239 $resCount_sbjct += ($sdir * (scalar(@srange) + $sfs)) if !$sgap;
1241 return 1;
1244 sub _calculate_seq_offsets {
1245 my $self = shift;
1246 my $prog = $self->algorithm;
1247 ($self->{_sbjct_offset}, $self->{_query_offset}) = (1,1);
1248 if($prog =~ /^(?:PSI)?T(BLAST|FAST)(N|X|Y)/oi ) {
1249 $self->{_sbjct_offset} = 3;
1250 if ($1 eq 'BLAST' && $2 eq 'X') { #TBLASTX
1251 $self->{_query_offset} = 3;
1253 } elsif($prog =~ /^(BLAST|FAST)(X|Y|XY)/oi ) {
1254 $self->{_query_offset} = 3;
1259 =head2 n
1261 See documentation in L<Bio::Search::HSP::HSPI::n()|Bio::Search::HSP::HSPI>
1263 =cut
1265 sub n {
1266 my $self = shift;
1267 if(@_) { $self->{'N'} = shift; }
1268 # note that returning 1 is completely an assumption
1269 defined $self->{'N'} ? $self->{'N'} : 1;
1272 =head2 range
1274 See documentation in L<Bio::Search::HSP::HSPI::range()|Bio::Search::HSP::HSPI>
1276 =cut
1278 sub range {
1279 my ($self, $seqType) = @_;
1281 $seqType ||= 'query';
1282 $seqType = 'sbjct' if $seqType eq 'hit';
1284 my ($start, $end);
1285 if( $seqType eq 'query' ) {
1286 $start = $self->query->start;
1287 $end = $self->query->end;
1289 else {
1290 $start = $self->hit->start;
1291 $end = $self->hit->end;
1293 return ($start, $end);
1297 =head2 links
1299 Title : links
1300 Usage : $obj->links($newval)
1301 Function: Get/Set the Links value (from WU-BLAST)
1302 Indicates the placement of the alignment in the group of HSPs
1303 Returns : Value of links
1304 Args : On set, new value (a scalar or undef, optional)
1307 =cut
1309 sub links{
1310 my $self = shift;
1312 return $self->{LINKS} = shift if @_;
1313 return $self->{LINKS};
1316 =head2 hsp_group
1318 Title : hsp_group
1319 Usage : $obj->hsp_group($newval)
1320 Function: Get/Set the Group value (from WU-BLAST)
1321 Indicates a grouping of HSPs
1322 Returns : Value of group
1323 Args : On set, new value (a scalar or undef, optional)
1326 =cut
1328 sub hsp_group {
1329 my $self = shift;
1331 return $self->{HSP_GROUP} = shift if @_;
1332 return $self->{HSP_GROUP};
1335 =head2 hit_features
1337 Title : hit_features
1338 Usage : $obj->hit_features($newval)
1339 Function: Get/Set the HSP hit feature string (from some BLAST 2.2.13 text
1340 output), which is a string of overlapping or nearby features in HSP
1342 Returns : Value of hit features, if present
1343 Args : On set, new value (a scalar or undef, optional)
1346 =cut
1348 sub hit_features {
1349 my $self = shift;
1351 return $self->{HIT_FEATURES} = shift if @_;
1352 return $self->{HIT_FEATURES};
1355 # The cigar string code is written by Juguang Xiao <juguang@fugu-sg.org>
1357 =head1 Brief introduction on cigar string
1359 NOTE: the concept is originally from EnsEMBL docs at
1360 http://may2005.archive.ensembl.org/Docs/wiki/html/EnsemblDocs/CigarFormat.html
1361 Please append to these docs if you have a better definition.
1363 Sequence alignment hits can be stored in a database as ungapped alignments.
1364 This imposes 2 major constraints on alignments:
1366 a) alignments for a single hit record require multiple rows in the database,
1368 b) it is not possible to accurately retrieve the exact original alignment.
1370 Alternatively, sequence alignments can be stored as gapped alignments using
1371 the CIGAR line format (where CIGAR stands for Concise Idiosyncratic Gapped
1372 Alignment Report).
1374 In the cigar line format alignments are stored as follows:
1376 M: Match
1377 D: Deletion
1378 I: Insertion
1380 An example of an alignment for a hypthetical protein match is shown below:
1383 Query: 42 PGPAGLP----GSVGLQGPRGLRGPLP-GPLGPPL...
1385 PG P G GP R PLGP
1387 Sbjct: 1672 PGTP*TPLVPLGPWVPLGPSSPR--LPSGPLGPTD...
1390 protein_align_feature table as the following cigar line:
1392 7M4D12M2I2MD7M
1394 =head2 cigar_string
1396 Name: cigar_string
1397 Usage: $cigar_string = $hsp->cigar_string
1398 Function: Generate and return cigar string for this HSP alignment
1399 Args: No input needed
1400 Return: a cigar string
1402 =cut
1405 sub cigar_string {
1406 my ($self, $arg) = @_;
1407 $self->warn("this is not a setter") if(defined $arg);
1409 unless(defined $self->{_cigar_string}){ # generate cigar string
1410 my $cigar_string = $self->generate_cigar_string($self->query_string, $self->hit_string);
1411 $self->{_cigar_string} = $cigar_string;
1412 } # end of unless
1414 return $self->{_cigar_string};
1417 =head2 generate_cigar_string
1419 Name: generate_cigar_string
1420 Usage: my $cigar_string = Bio::Search::HSP::GenericHSP::generate_cigar_string ($qstr, $hstr);
1421 Function: generate cigar string from a simple sequence of alignment.
1422 Args: the string of query and subject
1423 Return: cigar string
1425 =cut
1427 sub generate_cigar_string {
1428 my ($self, $qstr, $hstr) = @_;
1429 my @qchars = split //, $qstr;
1430 my @hchars = split //, $hstr;
1432 unless(scalar(@qchars) == scalar(@hchars)){
1433 $self->throw("two sequences are not equal in lengths");
1436 $self->{_count_for_cigar_string} = 0;
1437 $self->{_state_for_cigar_string} = 'M';
1439 my $cigar_string = '';
1440 for(my $i=0; $i <= $#qchars; $i++){
1441 my $qchar = $qchars[$i];
1442 my $hchar = $hchars[$i];
1443 if($qchar ne $self->{GAP_SYMBOL} && $hchar ne $self->{GAP_SYMBOL}){ # Match
1444 $cigar_string .= $self->_sub_cigar_string('M');
1445 }elsif($qchar eq $self->{GAP_SYMBOL}){ # Deletion
1446 $cigar_string .= $self->_sub_cigar_string('D');
1447 }elsif($hchar eq $self->{GAP_SYMBOL}){ # Insertion
1448 $cigar_string .= $self->_sub_cigar_string('I');
1449 }else{
1450 $self->throw("Impossible state that 2 gaps on each seq aligned");
1453 $cigar_string .= $self->_sub_cigar_string('X'); # not forget the tail.
1454 return $cigar_string;
1457 # an internal method to help generate cigar string
1459 sub _sub_cigar_string {
1460 my ($self, $new_state) = @_;
1462 my $sub_cigar_string = '';
1463 if($self->{_state_for_cigar_string} eq $new_state){
1464 $self->{_count_for_cigar_string} += 1; # Remain the state and increase the counter
1465 }else{
1466 $sub_cigar_string .= $self->{_count_for_cigar_string}
1467 unless $self->{_count_for_cigar_string} == 1;
1468 $sub_cigar_string .= $self->{_state_for_cigar_string};
1469 $self->{_count_for_cigar_string} = 1;
1470 $self->{_state_for_cigar_string} = $new_state;
1472 return $sub_cigar_string;
1475 # needed before seqfeatures can be made
1476 sub _pre_seq_feature {
1477 my $self = shift;
1478 my $algo = $self->{ALGORITHM};
1480 my ($queryfactor, $hitfactor) = (0,0);
1481 my ($hitmap, $querymap) = (1,1);
1482 if( $algo =~ /^(?:PSI)?T(?:BLAST|FAST|SW)[NY]/oi ) {
1483 $hitfactor = 1;
1484 $hitmap = 3;
1486 elsif ($algo =~ /^(?:FAST|BLAST)(?:X|Y|XY)/oi || $algo =~ /^P?GENEWISE/oi ) {
1487 $queryfactor = 1;
1488 $querymap = 3;
1490 elsif ($algo =~ /^T(BLAST|FAST|SW)(X|Y|XY)/oi || $algo =~ /^(BLAST|FAST|SW)N/oi || $algo =~ /^WABA|AXT|BLAT|BLASTZ|PSL|MEGABLAST|EXONERATE|SW|SSEARCH|SMITH\-WATERMAN|SIM4$/){
1491 if ($2) {
1492 $hitmap = $querymap = 3;
1494 $hitfactor = 1;
1495 $queryfactor = 1;
1497 elsif ($algo =~ /^RPS-BLAST/) {
1498 if ($algo =~ /^RPS-BLAST\(BLASTX\)/) {
1499 $queryfactor = 1;
1500 $querymap = 3;
1502 $hitfactor = 0;
1504 else {
1505 my $stranded = lc substr($self->{STRANDED}, 0,1);
1506 $queryfactor = ($stranded eq 'q' || $stranded eq 'b') ? 1 : 0;
1507 $hitfactor = ($stranded eq 'h' || $stranded eq 's' || $stranded eq 'b') ? 1 : 0;
1509 $self->{_query_factor} = $queryfactor;
1510 $self->{_hit_factor} = $hitfactor;
1511 $self->{_hit_mapping} = $hitmap;
1512 $self->{_query_mapping} = $querymap;
1515 # make query seq feature
1516 sub _query_seq_feature {
1517 my $self = shift;
1518 $self->{_making_qff} = 1;
1519 my $qs = $self->{QUERY_START};
1520 my $qe = $self->{QUERY_END};
1521 unless (defined $self->{_query_factor}) {
1522 $self->_pre_seq_feature;
1524 my $queryfactor = $self->{_query_factor};
1526 unless( defined $qe && defined $qs ) { $self->throw("Did not specify a Query End or Query Begin"); }
1528 my $strand;
1529 if ($qe > $qs) { # normal query: start < end
1530 if ($queryfactor) {
1531 $strand = 1;
1533 else {
1534 $strand = undef;
1537 else {
1538 if ($queryfactor) {
1539 $strand = -1;
1541 else {
1542 $strand = undef;
1544 ($qs,$qe) = ($qe,$qs);
1547 # Note: many of these data are not query- and hit-specific.
1548 # Only start, end, name, length are.
1549 # We could be more efficient by only storing this info once.
1550 # steve chervitz --- Sat Apr 5 00:55:07 2003
1552 my $sim1 = $self->{_sim1} || Bio::SeqFeature::Similarity->new();
1553 $sim1->start($qs);
1554 $sim1->end($qe);
1555 $sim1->significance($self->{EVALUE});
1556 $sim1->bits($self->{BITS});
1557 $sim1->score($self->{SCORE});
1558 $sim1->strand($strand);
1559 $sim1->seq_id($self->{QUERY_NAME});
1560 $sim1->seqlength($self->{QUERY_LENGTH});
1561 $sim1->source_tag($self->{ALGORITHM});
1562 $sim1->seqdesc($self->{QUERY_DESC});
1563 $sim1->add_tag_value('meta', $self->{META}) if $self->can('meta');
1564 # to determine frame from something like FASTXY which doesn't
1565 # report the frame
1566 my $qframe = $self->{QUERY_FRAME};
1568 if (defined $strand && !defined $qframe && $queryfactor) {
1569 $qframe = ( $qs % 3 ) * $strand;
1571 elsif (!defined $strand) {
1572 $qframe = 0;
1575 if( $qframe =~ /^([+-])?([0-3])/ ) {
1576 my $dir = $1 || '+';
1577 if($qframe && (($dir eq '-' && $strand >= 0) || ($dir eq '+' && $strand <= 0)) ) {
1578 $self->warn("Query frame ($qframe) did not match strand of query ($strand)");
1580 $qframe = $2 != 0 ? $2 - 1 : $2;
1581 } else {
1582 $self->warn("Unknown query frame ($qframe)");
1583 $qframe = 0;
1586 $sim1->frame($qframe);
1587 $self->SUPER::feature1($sim1);
1589 $self->{_created_qff} = 1;
1590 $self->{_making_qff} = 0;
1593 # make subject seq feature
1594 sub _subject_seq_feature {
1595 my $self = shift;
1596 $self->{_making_sff} = 1;
1597 my $hs = $self->{HIT_START};
1598 my $he = $self->{HIT_END};
1599 unless (defined $self->{_hit_factor}) {
1600 $self->_pre_seq_feature;
1602 my $hitfactor = $self->{_hit_factor};
1604 unless( defined $he && defined $hs ) { $self->throw("Did not specify a Hit End or Hit Begin"); }
1606 my $strand;
1607 if ($he > $hs) { # normal subject
1608 if ($hitfactor) {
1609 $strand = 1;
1611 else {
1612 $strand = undef;
1615 else {
1616 if ($hitfactor) {
1617 $strand = -1;
1619 else {
1620 $strand = undef;
1622 ($hs,$he) = ( $he,$hs); # reverse subject: start bigger than end
1625 my $sim2 = $self->{_sim2} || Bio::SeqFeature::Similarity->new();
1626 $sim2->start($hs);
1627 $sim2->end($he);
1628 $sim2->significance($self->{EVALUE});
1629 $sim2->bits($self->{BITS});
1630 $sim2->score($self->{SCORE});
1631 $sim2->strand($strand);
1632 $sim2->seq_id($self->{HIT_NAME});
1633 $sim2->seqlength($self->{HIT_LENGTH});
1634 $sim2->source_tag($self->{ALGORITHM});
1635 $sim2->seqdesc($self->{HIT_DESC});
1636 $sim2->add_tag_value('meta', $self->{META}) if $self->can('meta');
1637 my $hframe = $self->{HIT_FRAME};
1639 if (defined $strand && !defined $hframe && $hitfactor) {
1640 $hframe = ( $hs % 3 ) * $strand;
1642 elsif (!defined $strand) {
1643 $hframe = 0;
1646 if( $hframe =~ /^([+-])?([0-3])/ ) {
1647 my $dir = $1 || '+';
1648 if($hframe && (($dir eq '-' && $strand >= 0) || ($dir eq '+' && $strand <= 0)) ) {
1649 $self->warn("Subject frame ($hframe) did not match strand of subject ($strand)");
1651 $hframe = $2 != 0 ? $2 - 1 : $2;
1652 } else {
1653 $self->warn("Unknown subject frame ($hframe)");
1654 $hframe = 0;
1657 $sim2->frame($hframe);
1658 $self->SUPER::feature2($sim2);
1660 $self->{_created_sff} = 1;
1661 $self->{_making_sff} = 0;
1664 # before calling the num_* methods
1665 sub _pre_similar_stats {
1666 my $self = shift;
1667 my $identical = $self->{IDENTICAL};
1668 my $conserved = $self->{CONSERVED};
1669 my $percent_id = $self->{PERCENT_IDENTITY};
1671 if (! defined $identical) {
1672 if (! defined $percent_id) {
1673 $self->warn("Did not defined the number of identical matches or overall percent identity in the HSP; assuming 0");
1674 $identical = 0;
1676 else {
1677 $identical = sprintf("%.0f",$percent_id * $self->{HSP_LENGTH});
1681 if (! defined $conserved) {
1682 $self->warn("Did not define the number of conserved matches in the HSP; assuming conserved == identical ($identical)")
1683 if( $self->{ALGORITHM} !~ /^((FAST|BLAST)N)|EXONERATE|SIM4|AXT|PSL|BLAT|BLASTZ|WABA/oi);
1684 $conserved = $identical;
1686 $self->{IDENTICAL} = $identical;
1687 $self->{CONSERVED} = $conserved;
1688 $self->{_did_presimilar} = 1;
1691 # before calling the frac_* methods
1693 sub _pre_frac {
1694 my $self = shift;
1696 my $hsp_len = $self->{HSP_LENGTH};
1697 my $hit_len = $self->{HIT_LENGTH};
1698 my $query_len = $self->{QUERY_LENGTH};
1700 my $identical = $self->num_identical;
1701 my $conserved = $self->num_conserved;
1703 $self->{_did_prefrac} = 1;
1704 my $logical;
1705 if( $hsp_len ) {
1706 $self->length('total', $hsp_len);
1707 $logical = $self->_logical_length('total');
1708 $self->frac_identical( 'total', $identical / $hsp_len);
1709 $self->frac_conserved( 'total', $conserved / $hsp_len);
1711 if( $hit_len ) {
1712 $logical = $self->_logical_length('hit');
1713 $self->frac_identical( 'hit', $identical / $logical);
1714 $self->frac_conserved( 'hit', $conserved / $logical);
1716 if( $query_len ) {
1717 $logical = $self->_logical_length('query');
1718 $self->frac_identical( 'query', $identical / $logical) ;
1719 $self->frac_conserved( 'query', $conserved / $logical);
1723 # before calling gaps()
1724 # This relies first on passed parameters (parser-dependent), then on gaps
1725 # calculated by seq_inds() (if set), then falls back to directly checking
1726 # for '-' or '.' as a last resort
1728 sub _pre_gaps {
1729 my $self = shift;
1730 my $query_gaps = $self->{QUERY_GAPS};
1731 my $query_seq = $self->{QUERY_SEQ};
1732 my $hit_gaps = $self->{HIT_GAPS};
1733 my $hit_seq = $self->{HIT_SEQ};
1734 my $gaps = $self->{HSP_GAPS};
1736 $self->{_did_pregaps} = 1; # well, we're in the process; avoid recursion
1737 if( defined $query_gaps ) {
1738 $self->gaps('query', $query_gaps);
1739 } elsif( defined $query_seq ) {
1740 my $qg = (defined $self->{'_query_offset'}) ? $self->seq_inds('query','gaps')
1741 : ($self->algorithm eq 'ERPIN') ? scalar( $hit_seq =~ tr/\-//)
1742 : scalar( $query_seq =~ tr/\-\.// ); # HMMER3 and Infernal uses '.' and '-'
1743 my $offset = $self->{'_query_offset'} || 1;
1744 $self->gaps('query', $qg/$offset);
1746 if( defined $hit_gaps ) {
1747 $self->gaps('hit', $hit_gaps);
1748 } elsif( defined $hit_seq ) {
1749 my $hg = (defined $self->{'_sbjct_offset'}) ? $self->seq_inds('hit','gaps')
1750 : ($self->algorithm eq 'ERPIN') ? scalar( $hit_seq =~ tr/\-//)
1751 : scalar( $hit_seq =~ tr/\-\.// ); # HMMER3 and Infernal uses '.' and '-'
1752 my $offset = $self->{'_sbjct_offset'} || 1;
1753 $self->gaps('hit', $hg/$offset);
1755 if( ! defined $gaps ) {
1756 $gaps = $self->gaps("query") + $self->gaps("hit");
1758 $self->gaps('total', $gaps);
1761 # before percent_identity
1762 sub _pre_pi {
1763 my $self = shift;
1764 $self->{_did_prepi} = 1;
1765 $self->percent_identity($self->{PERCENT_IDENTITY} || $self->frac_identical('total')*100) if( $self->{HSP_LENGTH} > 0 );