3 Bio::Search::SearchUtils - Utility functions for Bio::Search:: objects
7 # This module is just a collection of subroutines, not an object.
11 The SearchUtils.pm module is a collection of subroutines used
12 primarily by Bio::Search::Hit::HitI objects for some of the additional
13 functionality, such as HSP tiling. Right now, the SearchUtils is just
14 a collection of methods, not an object.
18 Steve Chervitz E<lt>sac@bioperl.orgE<gt>
22 Sendu Bala, bix@sendu.me.uk
26 package Bio
::Search
::SearchUtils
;
32 Usage : tile_hsps( $sbjct );
33 : This is called automatically by methods in Bio::Search::Hit::GenericHit
34 : that rely on having tiled data.
36 : If you are interested in getting data about the constructed HSP contigs:
37 : my ($qcontigs, $scontigs) = Bio::Search::SearchUtils::tile_hsps($hit);
38 : if (ref $qcontigs) {
39 : print STDERR "Query contigs:\n";
40 : foreach (@{$qcontigs}) {
41 : print "contig start is $_->{'start'}\n";
42 : print "contig stop is $_->{'stop'}\n";
45 : See below for more information about the contig data structure.
47 Purpose : Collect statistics about the aligned sequences in a set of HSPs.
48 : Calculates the following data across all HSPs:
49 : -- total alignment length
50 : -- total identical residues
51 : -- total conserved residues
52 Returns : If there was only a single HSP (so no tiling was necessary)
53 tile_hsps() returns a list of two non-zero integers.
54 If there were multiple HSP,
55 tile_hsps() returns a list of two array references containin HSP contig data.
56 The first array ref contains a list of HSP contigs on the query sequence.
57 The second array ref contains a list of HSP contigs on the subject sequence.
58 Each contig is a hash reference with the following data fields:
59 'start' => start coordinate of the contig
60 'stop' => start coordinate of the contig
61 'iden' => number of identical residues in the contig
62 'cons' => number of conserved residues in the contig
63 'strand'=> strand of the contig
64 'frame' => frame of the contig
65 Argument : A Bio::Search::Hit::HitI object
68 : This method performs more careful summing of data across
69 : all HSPs in the Sbjct object. Only HSPs that are in the same strand
70 : and frame are tiled. Simply summing the data from all HSPs
71 : in the same strand and frame will overestimate the actual
72 : length of the alignment if there is overlap between different HSPs
75 : The strategy is to tile the HSPs and sum over the
76 : contigs, collecting data separately from overlapping and
77 : non-overlapping regions of each HSP. To facilitate this, the
78 : HSP.pm object now permits extraction of data from sub-sections
81 : Additional useful information is collected from the results
82 : of the tiling. It is possible that sub-sequences in
83 : different HSPs will overlap significantly. In this case, it
84 : is impossible to create a single unambiguous alignment by
85 : concatenating the HSPs. The ambiguity may indicate the
86 : presence of multiple, similar domains in one or both of the
87 : aligned sequences. This ambiguity is recorded using the
88 : ambiguous_aln() method.
90 : This method does not attempt to discern biologically
91 : significant vs. insignificant overlaps. The allowable amount of
92 : overlap can be set with the overlap() method or with the -OVERLAP
93 : parameter used when constructing the Hit object.
95 : For a given hit, both the query and the sbjct sequences are
96 : tiled independently.
98 : -- If only query sequence HSPs overlap,
99 : this may suggest multiple domains in the sbjct.
100 : -- If only sbjct sequence HSPs overlap,
101 : this may suggest multiple domains in the query.
102 : -- If both query & sbjct sequence HSPs overlap,
103 : this suggests multiple domains in both.
104 : -- If neither query & sbjct sequence HSPs overlap,
105 : this suggests either no multiple domains in either
106 : sequence OR that both sequences have the same
107 : distribution of multiple similar domains.
109 : This method can deal with the special case of when multiple
110 : HSPs exactly overlap.
112 : Efficiency concerns:
113 : Speed will be an issue for sequences with numerous HSPs.
115 Bugs : Currently, tile_hsps() does not properly account for
116 : the number of non-tiled but overlapping HSPs, which becomes a problem
117 : as overlap() grows. Large values overlap() may thus lead to
118 : incorrect statistics for some hits. For best results, keep overlap()
119 : below 5 (DEFAULT IS 2). For more about this, see the "HSP Tiling and
120 : Ambiguous Alignments" section in L<Bio::Search::Hit::GenericHit>.
122 See Also : L<_adjust_contigs>(), L<Bio::Search::Hit::GenericHit|Bio::Search::Hit::GenericHit>
131 #print STDERR "Calling tile_hsps(): $sbjct\n";
132 #$sbjct->verbose(1); # to activate debugging
133 $sbjct->tiled_hsps(1);
135 # changed to not rely on n() (which is unreliable here) --cjfields 4/6/10
136 if( $sbjct->num_hsps == 0) {
137 #print STDERR "_tile_hsps(): no hsps, nothing to tile! (", $sbjct->num_hsps, ")\n";
138 _warn_about_no_hsps
($sbjct);
139 return (undef, undef);
141 } elsif($sbjct->num_hsps == 1) {
142 ## Simple summation scheme. Valid if there is only one HSP.
143 #print STDERR "_tile_hsps(): single HSP, easy stats.\n";
144 my $hsp = $sbjct->hsp;
145 $sbjct->length_aln('query', $hsp->length('query'));
146 $sbjct->length_aln('hit', $hsp->length('sbjct'));
147 $sbjct->length_aln('total', $hsp->length('total'));
148 $sbjct->matches( $hsp->matches() );
149 $sbjct->gaps('query', $hsp->gaps('query'));
150 $sbjct->gaps('sbjct', $hsp->gaps('sbjct'));
152 _adjust_length_aln
($sbjct);
155 #print STDERR "Sbjct: _tile_hsps: summing multiple HSPs\n";
156 $sbjct->length_aln('query', 0);
157 $sbjct->length_aln('sbjct', 0);
158 $sbjct->length_aln('total', 0);
159 $sbjct->matches( 0, 0);
160 $sbjct->gaps('query', 0);
161 $sbjct->gaps('hit', 0);
164 ## More than one HSP. Must tile HSPs.
165 # print "\nTiling HSPs for $sbjct\n";
166 my($hsp, $qstart, $qstop, $sstart, $sstop);
167 my($frame, $strand, $qstrand, $sstrand);
168 my(@qcontigs, @scontigs);
171 my $max_overlap = $sbjct->overlap;
176 my $v = $sbjct->verbose;
177 foreach $hsp ( $sbjct->hsps() ) {
178 #$sbjct->debug( sprintf(" HSP: %s %d..%d\n",$hsp->query->seq_id, $hsp->query->start, $hsp->hit->end)) if $v > 0; #$hsp->str('query');
179 # printf " Length = %d; Identical = %d; Conserved = %d; Conserved(1-10): %d",$hsp->length, $hsp->length(-TYPE=>'iden'),
180 # $hsp->length(-TYPE=>'cons'),
181 # $hsp->length(-TYPE=>'cons',
182 # -START=>0,-STOP=>10);
184 ($qstart, $qstop) = $hsp->range('query');
185 ($sstart, $sstop) = $hsp->range('sbjct');
186 $frame = $hsp->frame('hit');
187 $frame = -1 unless defined $frame;
189 ($qstrand, $sstrand) = ($hsp->query->strand,
192 # Note: No correction for overlap.
194 my ($qgaps, $sgaps) = ($hsp->gaps('query'), $hsp->gaps('hit'));
195 $hit_qgaps += $qgaps;
196 $hit_sgaps += $sgaps;
197 $hit_len_aln += $hsp->length;
199 ## Collect contigs in the query sequence.
200 $qoverlap += &_adjust_contigs
('query', $hsp, $qstart, $qstop,
201 \
@qcontigs, $max_overlap, $frame,
204 ## Collect contigs in the sbjct sequence
205 # (needed for domain data and gapped Blast).
206 $soverlap += &_adjust_contigs
('sbjct', $hsp, $sstart, $sstop,
207 \
@scontigs, $max_overlap, $frame,
210 ## Collect overall start and stop data for query and
211 # sbjct over all HSPs.
212 unless ( defined $start_stop{'qstart'} ) {
213 $start_stop{'qstart'} = $qstart;
214 $start_stop{'qstop'} = $qstop;
215 $start_stop{'sstart'} = $sstart;
216 $start_stop{'sstop'} = $sstop;
218 $start_stop{'qstart'} = ($qstart < $start_stop{'qstart'} ?
219 $qstart : $start_stop{'qstart'} );
220 $start_stop{'qstop'} = ($qstop > $start_stop{'qstop'} ?
221 $qstop : $start_stop{'qstop'} );
222 $start_stop{'sstart'} = ($sstart < $start_stop{'sstart'} ?
223 $sstart : $start_stop{'sstart'} );
224 $start_stop{'sstop'} = ($sstop > $start_stop{'sstop'} ?
225 $sstop : $start_stop{'sstop'} );
229 # Store the collected data in the Hit object
230 $sbjct->gaps('query', $hit_qgaps);
231 $sbjct->gaps('hit', $hit_sgaps);
232 $sbjct->length_aln('total', $hit_len_aln);
234 $sbjct->start('query',$start_stop{'qstart'});
235 $sbjct->end('query', $start_stop{'qstop'});
236 $sbjct->start('hit', $start_stop{'sstart'});
237 $sbjct->end('hit', $start_stop{'sstop'});
238 ## Collect data across the collected contigs.
240 #$sbjct->debug( "\nQUERY CONTIGS:\n"." gaps = $sbjct->{'_gaps_query'}\n");
242 # Account for strand/frame.
243 # Strategy: collect data on a per strand+frame basis and
244 # save the most significant one.
246 foreach (@qcontigs) {
247 ($frame, $strand) = ($_->{'frame'}, $_->{'strand'});
250 #$sbjct->debug(sprintf( "$frame/$strand len is getting %d for %d..%d\n",
251 # ($_->{'stop'} - $_->{'start'} + 1), $_->{'start'}, $_->{'stop'}));
254 $qctg_dat{ "$frame$strand" }->{'length_aln_query'} += $_->{'stop'} - $_->{'start'} + 1;
255 $qctg_dat{ "$frame$strand" }->{'totalIdentical'} += $_->{'iden'};
256 $qctg_dat{ "$frame$strand" }->{'totalConserved'} += $_->{'cons'};
257 $qctg_dat{ "$frame$strand" }->{'qstrand'} = $strand;
260 # Find longest contig.
261 my @sortedkeys = sort { $qctg_dat{$b}->{'length_aln_query'}
262 <=> $qctg_dat{$a}->{'length_aln_query'} }
265 # Save the largest to the sbjct:
266 my $longest = $sortedkeys[0];
267 #$sbjct->debug( "longest is ". $qctg_dat{ $longest }->{'length_aln_query'}. "\n");
268 $sbjct->length_aln('query', $qctg_dat{ $longest }->{'length_aln_query'});
269 $sbjct->matches($qctg_dat{ $longest }->{'totalIdentical'},
270 $qctg_dat{ $longest }->{'totalConserved'});
271 $sbjct->strand('query', $qctg_dat{ $longest }->{'qstrand'});
273 ## Collect data for sbjct contigs. Important for gapped Blast.
274 ## The totalIdentical and totalConserved numbers will be the same
275 ## as determined for the query contigs.
277 #$sbjct->debug( "\nSBJCT CONTIGS:\n"." gaps = ". $sbjct->gaps('sbjct'). "\n");
280 #$sbjct->debug(" sbjct contig: $_->{'start'} - $_->{'stop'}\n".
281 # " iden = $_->{'iden'}; cons = $_->{'cons'}\n");
282 ($frame, $strand) = ($_->{'frame'}, $_->{'strand'});
283 $sctg_dat{ "$frame$strand" }->{'length_aln_sbjct'} += $_->{'stop'} - $_->{'start'} + 1;
284 $sctg_dat{ "$frame$strand" }->{'frame'} = $frame;
285 $sctg_dat{ "$frame$strand" }->{'sstrand'} = $strand;
288 @sortedkeys = sort { $sctg_dat{ $b }->{'length_aln_sbjct'}
289 <=> $sctg_dat{ $a }->{'length_aln_sbjct'}
292 # Save the largest to the sbjct:
293 $longest = $sortedkeys[0];
295 $sbjct->length_aln('sbjct', $sctg_dat{ $longest }->{'length_aln_sbjct'});
296 $sbjct->frame( $sctg_dat{ $longest }->{'frame'} );
297 $sbjct->strand('hit', $sctg_dat{ $longest }->{'sstrand'});
300 if($soverlap) { $sbjct->ambiguous_aln('qs');
301 #$sbjct->debug("\n*** AMBIGUOUS ALIGNMENT: Query and Sbjct\n\n");
303 else { $sbjct->ambiguous_aln('q');
304 #$sbjct->debug( "\n*** AMBIGUOUS ALIGNMENT: Query\n\n");
307 $sbjct->ambiguous_aln('s');
308 #$sbjct->debug( "\n*** AMBIGUOUS ALIGNMENT: Sbjct\n\n");
311 _adjust_length_aln
($sbjct);
313 return ( [@qcontigs], [@scontigs] );
318 # Title : _adjust_length_aln
319 # Usage : n/a; internal use only; called by tile_hsps.
320 # Purpose : Adjust length of aligment based on BLAST flavor.
321 # Comments : See comments in logica_length()
322 sub _adjust_length_aln
{
324 my $algo = $sbjct->algorithm;
325 my $hlen = $sbjct->length_aln('sbjct');
326 my $qlen = $sbjct->length_aln('query');
328 $sbjct->length_aln('sbjct', logical_length
($algo, 'sbjct', $hlen));
329 $sbjct->length_aln('query', logical_length
($algo, 'query', $qlen));
332 =head2 logical_length
334 Usage : logical_length( $alg_name, $seq_type, $length );
335 Purpose : Determine the logical length of an aligned sequence based on
336 : algorithm name and sequence type.
337 Returns : integer representing the logical aligned length.
338 Argument : $alg_name = name of algorigthm (e.g., blastx, tblastn)
339 : $seq_type = type of sequence (e.g., query or hit)
340 : $length = physical length of the sequence in the alignment.
342 Comments : This function is used to account for the fact that number of identities
343 and conserved residues is reported in peptide space while the query
344 length (in the case of BLASTX and TBLASTX) and/or the hit length
345 (in the case of TBLASTN and TBLASTX) are in nucleotide space.
346 The adjustment affects the values reported by the various frac_XXX
347 methods in GenericHit and GenericHSP.
352 my ($algo, $type, $len) = @_;
354 if($algo =~ /^(?:PSI)?T(?:BLASTN|FAST(?:X|Y|XY))/oi ) {
355 $logical = $len/3 if $type =~ /sbjct
|hit
|tot
/i
;
356 } elsif($algo =~ /^(?:BLASTX|FAST(?:X|Y|XY))/oi ) {
357 $logical = $len/3 if $type =~ /query
|tot
/i
;
358 } elsif($algo =~ /^TBLASTX/oi ) {
365 #=head2 _adjust_contigs
367 # Usage : n/a; internal function called by tile_hsps
368 # Purpose : Builds HSP contigs for a given BLAST hit.
369 # : Utility method called by _tile_hsps()
372 # Throws : Exceptions propagated from Bio::Search::Hit::BlastHSP::matches()
373 # : for invalid sub-sequence ranges.
374 # Status : Experimental
375 # Comments : This method supports gapped alignments through a patch by maj
376 # : to B:S:HSP:HSPI::matches().
377 # : It does not keep track of the number of HSPs that
378 # : overlap within the amount specified by overlap().
379 # : This will lead to significant tracking errors for large
382 #See Also : L<tile_hsps>(), L<Bio::Search::Hit::BlastHSP::matches|Bio::Search::Hit::BlastHSP>
386 sub _adjust_contigs
{
387 my ($seqType, $hsp, $start, $stop, $contigs_ref,
388 $max_overlap, $frame, $strand) = @_;
390 my ($numID, $numCons);
392 foreach (@
$contigs_ref) {
393 # Don't merge things unless they have matching strand/frame.
394 next unless ($_->{'frame'} == $frame && $_->{'strand'} == $strand);
396 # Test special case of a nested HSP. Skip it.
397 if ($start >= $_->{'start'} && $stop <= $_->{'stop'}) {
402 # Test for overlap at beginning of contig, or precedes consecutively
403 if ($start < $_->{'start'} && $stop >= ($_->{'start'} + $max_overlap - 1)) {
405 ($numID, $numCons) = $hsp->matches(-SEQ
=>$seqType,
407 -STOP
=> $_->{'start'} - 1);
409 $hsp->warn("\$hsp->matches() returned '' for number identical; setting to 0");
412 if ($numCons eq '') {
413 $hsp->warn("\$hsp->matches() returned '' for number conserved; setting to 0");
417 if($@
) { warn "\a\n$@\n"; }
419 $_->{'start'} = $start; # Assign a new start coordinate to the contig
420 $_->{'iden'} += $numID; # and add new data to #identical, #conserved.
421 $_->{'cons'} += $numCons;
422 push(@
{$_->{hsps
}}, $hsp);
427 # Test for overlap at end of contig, or follows consecutively
428 if ($stop > $_->{'stop'} and $start <= ($_->{'stop'} - $max_overlap + 1)) {
430 ($numID,$numCons) = $hsp->matches(-SEQ
=>$seqType,
431 -START
=> $_->{'stop'} + 1,
434 $hsp->warn("\$hsp->matches() returned '' for number identical; setting to 0");
437 if ($numCons eq '') {
438 $hsp->warn("\$hsp->matches() returned '' for number conserved; setting to 0");
442 if($@
) { warn "\a\n$@\n"; }
444 $_->{'stop'} = $stop; # Assign a new stop coordinate to the contig
445 $_->{'iden'} += $numID; # and add new data to #identical, #conserved.
446 $_->{'cons'} += $numCons;
447 push(@
{$_->{hsps
}}, $hsp);
455 if ($overlap && @
$contigs_ref > 1) {
456 ## Merge any contigs that now overlap
457 my $max = $#{$contigs_ref};
458 for my $i (0..$max) {
459 ${$contigs_ref}[$i] || next;
460 my ($i_start, $i_stop) = (${$contigs_ref}[$i]->{start
}, ${$contigs_ref}[$i]->{stop
});
462 for my $u ($i+1..$max) {
463 ${$contigs_ref}[$u] || next;
464 my ($u_start, $u_stop) = (${$contigs_ref}[$u]->{start
}, ${$contigs_ref}[$u]->{stop
});
466 if ($u_start < $i_start && $u_stop >= ($i_start + $max_overlap - 1)) {
467 # find the hsps within the contig that have sequence
468 # extending before $i_start
469 my ($ids, $cons) = (0, 0);
470 my $use_start = $i_start;
471 foreach my $hsp (sort { $b->end($seqType) <=> $a->end($seqType) } @
{${$contigs_ref}[$u]->{hsps
}}) {
472 my $hsp_start = $hsp->start($seqType);
473 $hsp_start < $use_start || next;
475 my ($these_ids, $these_cons);
477 ($these_ids, $these_cons) = $hsp->matches(-SEQ
=> $seqType, -START
=> $hsp_start, -STOP
=> $use_start - 1);
478 if ($these_ids eq '') {
479 $hsp->warn("\$hsp->matches() returned '' for number identical; setting to 0");
482 if ($these_cons eq '') {
483 $hsp->warn("\$hsp->matches() returned '' for number conserved; setting to 0");
487 if($@
) { warn "\a\n$@\n"; }
490 $cons += $these_cons;
493 last if $hsp_start == $u_start;
494 $use_start = $hsp_start;
496 ${$contigs_ref}[$i]->{start
} = $u_start;
497 ${$contigs_ref}[$i]->{'iden'} += $ids;
498 ${$contigs_ref}[$i]->{'cons'} += $cons;
499 push(@
{${$contigs_ref}[$i]->{hsps
}}, @
{${$contigs_ref}[$u]->{hsps
}});
501 ${$contigs_ref}[$u] = undef;
503 elsif ($u_stop > $i_stop && $u_start <= ($i_stop - $max_overlap + 1)) {
504 # find the hsps within the contig that have sequence
505 # extending beyond $i_stop
506 my ($ids, $cons) = (0, 0);
507 my $use_stop = $i_stop;
508 foreach my $hsp (sort { $a->start($seqType) <=> $b->start($seqType) } @
{${$contigs_ref}[$u]->{hsps
}}) {
509 my $hsp_end = $hsp->end($seqType);
510 $hsp_end > $use_stop || next;
512 my ($these_ids, $these_cons);
514 ($these_ids, $these_cons) = $hsp->matches(-SEQ
=> $seqType, -START
=> $use_stop + 1, -STOP
=> $hsp_end);
515 if ($these_ids eq '') {
516 $hsp->warn("\$hsp->matches() returned '' for number identical; setting to 0");
519 if ($these_cons eq '') {
520 $hsp->warn("\$hsp->matches() returned '' for number conserved; setting to 0");
524 if($@
) { warn "\a\n$@\n"; }
527 $cons += $these_cons;
530 last if $hsp_end == $u_stop;
531 $use_stop = $hsp_end;
533 ${$contigs_ref}[$i]->{'stop'} = $u_stop;
534 ${$contigs_ref}[$i]->{'iden'} += $ids;
535 ${$contigs_ref}[$i]->{'cons'} += $cons;
536 push(@
{${$contigs_ref}[$i]->{hsps
}}, @
{${$contigs_ref}[$u]->{hsps
}});
538 ${$contigs_ref}[$u] = undef;
540 elsif ($u_start >= $i_start && $u_stop <= $i_stop) {
541 # nested, drop this contig
542 #*** ideally we might do some magic to keep the stats of the
544 ${$contigs_ref}[$u] = undef;
550 foreach (@
$contigs_ref) {
551 push(@merged, $_ || next);
553 @
{$contigs_ref} = @merged;
556 ## If there is no overlap, add the complete HSP data.
557 ($numID,$numCons) = $hsp->matches(-SEQ
=>$seqType);
559 $hsp->warn("\$hsp->matches() returned '' for number identical; setting to 0");
562 if ($numCons eq '') {
563 $hsp->warn("\$hsp->matches() returned '' for number conserved; setting to 0");
567 push @
$contigs_ref, {'start' =>$start, 'stop' =>$stop,
568 'iden' =>$numID, 'cons' =>$numCons,
569 'strand'=>$strand,'frame'=>$frame,'hsps'=>[$hsp]};
577 Usage : &get_exponent( number );
578 Purpose : Determines the power of 10 exponent of an integer, float,
579 : or scientific notation number.
580 Example : &get_exponent("4.0e-206");
581 : &get_exponent("0.00032");
582 : &get_exponent("10.");
583 : &get_exponent("1000.0");
584 : &get_exponent("e+83");
585 Argument : Float, Integer, or scientific notation number
586 Returns : Integer representing the exponent part of the number (+ or -).
587 : If argument == 0 (zero), return value is "-999".
588 Comments : Exponents are rounded up (less negative) if the mantissa is >= 5.
589 : Exponents are rounded down (more negative) if the mantissa is <= -5.
596 my($num, $exp) = split /[eE]/, $data;
599 $num = 1 if not $num;
600 $num >= 5 and $exp++;
601 $num <= -5 and $exp--;
602 } elsif( $num == 0) {
604 } elsif( not $num =~ /\./) {
605 $exp = CORE
::length($num) -1;
608 $num .= '0' if $num =~ /\.$/;
612 $num = reverse($num);
615 do { $c = chop($num);
619 $exp = -$exp if $num == 0 and not $rev;
627 Usage : @cnums = collapse_nums( @numbers );
628 Purpose : Collapses a list of numbers into a set of ranges of consecutive terms:
629 : Useful for condensing long lists of consecutive numbers.
631 : 1 2 3 4 5 6 10 12 13 14 15 17 18 20 21 22 24 26 30 31 32
633 : 1-6 10 12-15 17 18 20-22 24 26 30-32
634 Argument : List of numbers sorted numerically.
635 Returns : List of numbers mixed with ranges of numbers (see above).
638 See Also : L<Bio::Search::Hit::BlastHit::seq_inds()|Bio::Search::Hit::BlastHit>
643 # This is probably not the slickest connectivity algorithm, but will do for now.
645 my ($from, $to, $i, @ca, $consec);
648 for($i=0; $i < @a; $i++) {
649 not $from and do{ $from = $a[$i]; next; };
650 # pass repeated positions (gap inserts)
651 next if $a[$i] == $a[$i-1];
652 if($a[$i] == $a[$i-1]+1) {
656 if($consec == 1) { $from .= ",$to"; }
657 else { $from .= $consec>1 ?
"\-$to" : ""; }
658 push @ca, split(',', $from);
665 if($consec == 1) { $from .= ",$to"; }
666 else { $from .= $consec>1 ?
"\-$to" : ""; }
668 push @ca, split(',', $from) if $from;
674 =head2 strip_blast_html
676 Usage : $boolean = &strip_blast_html( string_ref );
677 : This method is exported.
678 Purpose : Removes HTML formatting from a supplied string.
679 : Attempts to restore the Blast report to enable
680 : parsing by Bio::SearchIO::blast.pm
681 Returns : Boolean: true if string was stripped, false if not.
682 Argument : string_ref = reference to a string containing the whole Blast
683 : report containing HTML formatting.
684 Throws : Croaks if the argument is not a scalar reference.
685 Comments : Based on code originally written by Alex Dong Li
686 : (ali@genet.sickkids.on.ca).
687 : This method does some Blast-specific stripping
688 : (adds back a '>' character in front of each HSP
689 : alignment listing).
691 : THIS METHOD IS VERY SENSITIVE TO BLAST FORMATTING CHANGES!
693 : Removal of the HTML tags and accurate reconstitution of the
694 : non-HTML-formatted report is highly dependent on structure of
695 : the HTML-formatted version. For example, it assumes that first
696 : line of each alignment section (HSP listing) starts with a
697 : <a name=..> anchor tag. This permits the reconstruction of the
698 : original report in which these lines begin with a ">".
699 : This is required for parsing.
701 : If the structure of the Blast report itself is not intended to
702 : be a standard, the structure of the HTML-formatted version
703 : is even less so. Therefore, the use of this method to
704 : reconstitute parsable Blast reports from HTML-format versions
705 : should be considered a temporary solution.
709 sub strip_blast_html
{
710 # This may not best way to remove html tags. However, it is simple.
711 # it won't work under following conditions:
712 # 1) if quoted > appears in a tag (does this ever happen?)
713 # 2) if a tag is split over multiple lines and this method is
714 # used to process one line at a time.
716 my ($string_ref) = shift;
718 ref $string_ref eq 'SCALAR' or
719 croak
("Can't strip HTML: ".
720 "Argument is should be a SCALAR reference not a ${\ref $string_ref}\n");
722 my $str = $$string_ref;
725 # Removing "<a name =...>" and adding the '>' character for
726 # HSP alignment listings.
727 $str =~ s/(\A|\n)<a name ?=[^>]+> ?/>/sgi and $stripped = 1;
729 # Removing all "<>" tags.
730 $str =~ s/<[^>]+>| //sgi and $stripped = 1;
732 # Re-uniting any lone '>' characters.
733 $str =~ s/(\A|\n)>\s+/\n\n>/sgi and $stripped = 1;
742 Usage : my %data = &Bio::Search::SearchUtils($result)
743 Function : converts ResultI data to simple hash
746 Note : used mainly as a utility for running SearchIO tests
753 $hash{'query_name'} = $result->query_name;
756 foreach my $hit ( $result->hits ) {
757 $hash{"hit$hitcount\_name"} = $hit->name;
758 # only going to test order of magnitude
759 # too hard as these don't always match
760 # $hash{"hit$hitcount\_signif"} =
761 # ( sprintf("%.0e",$hit->significance) =~ /e\-?(\d+)/ );
762 $hash{"hit$hitcount\_bits"} = sprintf("%d",$hit->bits);
764 foreach my $hsp ( $hit->hsps ) {
765 $hash{"hsp$hspcount\_bits"} = sprintf("%d",$hsp->bits);
766 # only going to test order of magnitude
767 # too hard as these don't always match
768 # $hash{"hsp$hspcount\_evalue"} =
769 # ( sprintf("%.0e",$hsp->evalue) =~ /e\-?(\d+)/ );
770 $hash{"hsp$hspcount\_qs"} = $hsp->query->start;
771 $hash{"hsp$hspcount\_qe"} = $hsp->query->end;
772 $hash{"hsp$hspcount\_qstr"} = $hsp->query->strand;
773 $hash{"hsp$hspcount\_hs"} = $hsp->hit->start;
774 $hash{"hsp$hspcount\_he"} = $hsp->hit->end;
775 $hash{"hsp$hspcount\_hstr"} = $hsp->hit->strand;
777 #$hash{"hsp$hspcount\_pid"} = sprintf("%d",$hsp->percent_identity);
778 #$hash{"hsp$hspcount\_fid"} = sprintf("%.2f",$hsp->frac_identical);
779 $hash{"hsp$hspcount\_gaps"} = $hsp->gaps('total');
787 sub _warn_about_no_hsps
{
789 my $prev_func=(caller(1))[3];
790 $hit->warn("There is no HSP data for hit '".$hit->name."'.\n".
791 "You have called a method ($prev_func)\n".
792 "that requires HSP data and there was no HSP data for this hit,\n".
793 "most likely because it was absent from the BLAST report.\n".
794 "Note that by default, BLAST lists alignments for the first 250 hits,\n".
795 "but it lists descriptions for 500 hits. If this is the case,\n".
796 "and you care about these hits, you should re-run BLAST using the\n".
797 "-b option (or equivalent if not using blastall) to increase the number\n".