t/SeqFeature/Generic.t: fix typo on required module for testing
[bioperl-live.git] / lib / Bio / Search / Tiling / MapTileUtils.pm
blob96ef82e17a1a3741fce8edba8835ea3cb77b2dad
1 #$Id$
2 package Bio::Search::Tiling::MapTileUtils;
3 use strict;
4 use warnings;
5 use Exporter;
7 BEGIN {
8 our @ISA = qw( Exporter );
9 our @EXPORT = qw( get_intervals_from_hsps
10 interval_tiling
11 decompose_interval
12 containing_hsps
13 covering_groups
14 _allowable_filters
15 _set_attributes
16 _mapping_coeff);
19 # tiling trials
20 # assumed: intervals are [$a0, $a1], with $a0 <= $a1
21 =head1 NAME
23 Bio::Search::Tiling::MapTileUtils - utilities for manipulating closed intervals for an HSP tiling algorithm
25 =head1 SYNOPSIS
27 Not used directly.
29 =head1 DESCRIPTION
31 Not used directly.
33 =head1 NOTE
35 An "interval" in this module is defined as an arrayref C<[$a0, $a1]>, where
36 C<$a0, $a1> are scalar numbers satisfying C<$a0 E<lt>= $a1>.
38 =head1 AUTHOR
40 Mark A. Jensen - maj -at- fortinbras -dot- us
42 =head1 APPENDIX
44 =head2 interval_tiling
46 Title : interval_tiling()
47 Usage : @tiling = interval_tiling( \@array_of_intervals )
48 Function: Find minimal set of intervals covering the input set
49 Returns : array of arrayrefs of the form
50 ( [$interval => [ @indices_of_collapsed_input_intervals ]], ...)
51 Args : arrayref of intervals
53 =cut
55 sub interval_tiling {
56 return unless $_[0]; # no input
57 my $n = scalar @{$_[0]};
58 my %input;
59 @input{(0..$n-1)} = @{$_[0]};
60 my @active = (0..$n-1);
61 my @hold;
62 my @tiled_ints;
63 my @ret;
64 while (@active) {
65 my $tgt = $input{my $tgt_i = shift @active};
66 push @tiled_ints, $tgt_i;
67 my $tgt_not_disjoint = 1;
68 while ($tgt_not_disjoint) {
69 $tgt_not_disjoint = 0;
70 while (my $try_i = shift @active) {
71 my $try = $input{$try_i};
72 if ( !are_disjoint($tgt, $try) ) {
73 $tgt = min_covering_interval($tgt,$try);
74 push @tiled_ints, $try_i;
75 $tgt_not_disjoint = 1;
77 else {
78 push @hold, $try_i;
81 if (!$tgt_not_disjoint) {
82 push @ret, [ $tgt => [@tiled_ints] ];
83 @tiled_ints = ();
85 @active = @hold;
86 @hold = ();
89 return @ret;
92 =head2 decompose_interval
94 Title : decompose_interval
95 Usage : @decomposition = decompose_interval( \@overlappers )
96 Function: Calculate the disjoint decomposition of a set of
97 overlapping intervals, each annotated with a list of
98 covering intervals
99 Returns : array of arrayrefs of the form
100 ( [[@interval] => [@indices_of_coverers]], ... )
101 Args : arrayref of intervals (arrayrefs like [$a0, $a1], with
102 Note : Each returned interval is associated with a list of indices of the
103 original intervals that cover that decomposition component
104 (scalar size of this list could be called the 'coverage coefficient')
105 Note : Coverage: each component of the decomp is completely contained
106 in the input intervals that overlap it, by construction.
107 Caveat : This routine expects the members of @overlappers to overlap,
108 but doesn't check this.
110 =cut
112 ### what if the input intervals don't overlap?? They MUST overlap; that's
113 ### what interval_tiling() is for.
115 sub decompose_interval {
116 return unless $_[0]; # no input
117 my @ints = @{$_[0]};
118 my (%flat,@flat);
119 ### this is ok, but need to handle the case where a lh and rh endpoint
120 ### coincide...
121 # decomposition --
122 # flatten:
123 # every lh endpoint generates (lh-1, lh)
124 # every rh endpoint generates (rh, rh+)
125 foreach (@ints) {
126 $flat{$$_[0]-1}++;
127 $flat{$$_[0]}++;
128 $flat{$$_[1]}++;
129 $flat{$$_[1]+1}++;
131 # sort, create singletons if nec.
132 my @a;
133 @a = sort {$a<=>$b} keys %flat;
134 # throw out first and last (meeting a boundary condition)
135 shift @a; pop @a;
136 # look for singletons
137 @flat = (shift @a, shift @a);
138 if ( $flat[1]-$flat[0] == 1 ) {
139 @flat = ($flat[0],$flat[0], $flat[1]);
141 while (my $a = shift @a) {
142 if ($a-$flat[-2]==2) {
143 push @flat, $flat[-1]; # create singleton interval
145 push @flat, $a;
147 if ($flat[-1]-$flat[-2]==1 and @flat % 2) {
148 push @flat, $flat[-1];
150 # component intervals are consecutive pairs
151 my @decomp;
152 while (my $a = shift @flat) {
153 push @decomp, [$a, shift @flat];
156 # for each component, return a list of the indices of the input intervals
157 # that cover the component.
158 my @coverage;
159 foreach my $i (0..$#decomp) {
160 foreach my $j (0..$#ints) {
161 unless (are_disjoint($decomp[$i], $ints[$j])) {
162 if (defined $coverage[$i]) {
163 push @{$coverage[$i]}, $j;
165 else {
166 $coverage[$i] = [$j];
171 return map { [$decomp[$_] => $coverage[$_]] } (0..$#decomp);
174 =head2 are_disjoint
176 Title : are_disjoint
177 Usage : are_disjoint( [$a0, $a1], [$b0, $b1] )
178 Function: Determine if two intervals are disjoint
179 Returns : True if the intervals are disjoint, false if they overlap
180 Args : array of two intervals
182 =cut
184 sub are_disjoint {
185 my ($int1, $int2) = @_;
186 return 1 if ( $$int1[1] < $$int2[0] ) || ( $$int2[1] < $$int1[0]);
187 return 0;
190 =head2 min_covering_interval
192 Title : min_covering_interval
193 Usage : $interval = min_covering_interval( [$a0,$a1],[$b0,$b1] )
194 Function: Determine the minimal covering interval for two intervals
195 Returns : an interval
196 Args : two intervals
198 =cut
200 sub min_covering_interval {
201 my ($int1, $int2) = @_;
202 my @a = sort {$a <=> $b} (@$int1, @$int2);
203 return [$a[0],$a[-1]];
206 =head2 get_intervals_from_hsps
208 Title : get_intervals_from_hsps
209 Usage : @intervals = get_intervals_from_hsps($type, @hsp_objects)
210 Function: Return array of intervals of the form [ $start, $end ],
211 from an array of hsp objects
212 Returns : an array of intervals
213 Args : scalar $type, array of HSPI objects; where $type is one of 'hit',
214 'subject', 'query'
216 =cut
218 sub get_intervals_from_hsps {
219 my $type = shift;
220 my @hsps;
221 if (ref($type) =~ /HSP/) {
222 push @hsps, $type;
223 $type = 'query';
225 elsif ( !grep /^$type$/,qw( hit subject query ) ) {
226 die "Unknown HSP type '$type'";
228 $type = 'hit' if $type eq 'subject';
229 push @hsps, @_;
230 my @ret;
231 foreach (@hsps) {
232 # push @ret, [ $_->start($type), $_->end($type)];
233 push @ret, [ $_->range($type) ];
235 return @ret;
238 # fast, clear, nasty, brutish and short.
239 # for _allowable_filters(), _set_mapping()
240 # covers BLAST, FAST families
241 # FASTA is ambiguous (nt or aa) based on alg name only
243 my $alg_lookup = {
244 'N' => { 'mapping' => [1,1],
245 'def_context' => ['p_','p_'],
246 'has_strand' => [1, 1],
247 'has_frame' => [0, 0]},
248 'P' => { 'mapping' => [1,1],
249 'def_context' => ['all','all'],
250 'has_strand' => [0, 0],
251 'has_frame' => [0, 0]},
252 'X' => { 'mapping' => [3, 1],
253 'def_context' => [undef,'all'],
254 'has_strand' => [1, 0],
255 'has_frame' => [1, 0]},
256 'Y' => { 'mapping' => [3, 1],
257 'def_context' => [undef,'all'],
258 'has_strand' => [1, 0],
259 'has_frame' => [1, 0]},
260 'TA' => { 'mapping' => [1, 3],
261 'def_context' => ['all',undef],
262 'has_strand' => [0, 1],
263 'has_frame' => [0, 1]},
264 'TN' => { 'mapping' => [1, 3],
265 'def_context' => ['p_',undef],
266 'has_strand' => [1,1],
267 'has_frame' => [0, 1]},
268 'TX' => { 'mapping' => [3, 3],
269 'def_context' => [undef,undef],
270 'has_strand' => [1, 1],
271 'has_frame' => [1, 1]},
272 'TY' => { 'mapping' => [3, 3],
273 'def_context' => [undef,undef],
274 'has_strand' => [1, 1],
275 'has_frame' => [1, 1]}
278 =head2 _allowable_filters
280 Title : _allowable_filters
281 Usage : _allowable_filters($Bio_Search_Hit_HitI, $type)
282 Function: Return the HSP filters (strand, frame) allowed,
283 based on the reported algorithm
284 Returns : String encoding allowable filters:
285 s = strand, f = frame
286 Empty string if no filters allowed
287 undef if algorithm unrecognized
288 Args : A Bio::Search::Hit::HitI object,
289 scalar $type, one of 'hit', 'subject', 'query';
290 default is 'query'
292 =cut
294 sub _allowable_filters {
295 my $hit = shift;
296 my $type = shift;
297 $type ||= 'q';
298 unless (grep /^$type$/, qw( h q s ) ) {
299 warn("Unknown type '$type'; returning ''");
300 return '';
302 $type = 'h' if $type eq 's';
303 my $alg = $hit->algorithm;
305 # pretreat (i.e., kludge it)
306 $alg =~ /^RPS/ and ($alg) = ($alg =~ /\(([^)]+)\)/);
308 for ($hit->algorithm) {
309 /MEGABLAST/i && do {
310 return qr/[s]/;
312 /(.?)BLAST(.?)/i && do {
313 return $$alg_lookup{$1.$2}{$type};
315 /(.?)FAST(.?)/ && do {
316 return $$alg_lookup{$1.$2}{$type};
318 do { # unrecognized
319 last;
322 return;
326 =head2 _set_attributes
328 Title : _set_attributes
329 Usage : $tiling->_set_attributes()
330 Function: Sets attributes for invocant
331 that depend on algorithm name
332 Returns : True on success
333 Args : none
334 Note : setting based on the configuration table
335 %alg_lookup
337 =cut
339 sub _set_attributes {
340 my $self = shift;
341 my $alg = $self->hit->algorithm;
343 # pretreat (i.e., kludge it)
344 $alg =~ /^RPS/ and ($alg) = ($alg =~ /\(([^)]+)\)/);
346 for ($alg) {
347 /MEGABLAST/i && do {
348 ($self->{_mapping_query},$self->{_mapping_hit}) = (1,1);
349 ($self->{_def_context_query},$self->{_def_context_hit}) =
350 ('p_','p_');
351 ($self->{_has_frame_query},$self->{_has_frame_hit}) =
352 (0, 0);
353 ($self->{_has_strand_query},$self->{_has_strand_hit}) =
354 (1, 1);
355 last;
357 /(.?)BLAST(.?)/i && do {
358 ($self->{_mapping_query},$self->{_mapping_hit}) =
359 @{$$alg_lookup{$1.$2}{mapping}};
360 ($self->{_def_context_query},$self->{_def_context_hit}) =
361 @{$$alg_lookup{$1.$2}{def_context}};
362 ($self->{_has_frame_query},$self->{_has_frame_hit}) =
363 @{$$alg_lookup{$1.$2}{has_frame}};
364 ($self->{_has_strand_query},$self->{_has_strand_hit}) =
365 @{$$alg_lookup{$1.$2}{has_strand}};
366 last;
368 /(.?)FAST(.?)/ && do {
369 ($self->{_mapping_query},$self->{_mapping_hit}) =
370 @{$$alg_lookup{$1.$2}{mapping}};
371 ($self->{_def_context_query},$self->{_def_context_hit}) =
372 @{$$alg_lookup{$1.$2}{def_context}};
373 ($self->{_has_frame_query},$self->{_has_frame_hit}) =
374 @{$$alg_lookup{$1.$2}{has_frame}};
375 ($self->{_has_strand_query},$self->{_has_strand_hit}) =
376 @{$$alg_lookup{$1.$2}{has_strand}};
377 last;
379 do { # unrecognized
380 $self->warn("Unrecognized algorithm '$alg'; defaults may not work");
381 ($self->{_mapping_query},$self->{_mapping_hit}) = (1,1);
382 ($self->{_def_context_query},$self->{_def_context_hit}) =
383 ('all','all');
384 ($self->{_has_frame_query},$self->{_has_frame_hit}) =
385 (0,0);
386 ($self->{_has_strand_query},$self->{_has_strand_hit}) =
387 (0,0);
388 return 0;
389 last;
392 return 1;
395 sub _mapping_coeff {
396 my $obj = shift;
397 my $type = shift;
398 my %type_i = ( 'query' => 0, 'hit' => 1 );
399 unless ( ref($obj) && $obj->can('algorithm') ) {
400 $obj->warn("Object type unrecognized");
401 return undef;
403 $type ||= 'query';
404 unless ( grep(/^$type$/, qw( query hit subject ) ) ) {
405 $obj->warn("Sequence type unrecognized");
406 return undef;
408 $type = 'hit' if $type eq 'subject';
409 my $alg = $obj->algorithm;
411 # pretreat (i.e., kludge it)
412 $alg =~ /^RPS/ and ($alg) = ($alg =~ /\(([^)]+)\)/);
414 for ($alg) {
415 /MEGABLAST/i && do {
416 return 1;
418 /(.?)BLAST(.?)/i && do {
419 return $$alg_lookup{$1.$2}{'mapping'}[$type_i{$type}];
421 /(.?)FAST(.?)/ && do {
422 return $$alg_lookup{$1.$2}{'mapping'}[$type_i{$type}];
424 do { # unrecognized
425 last;
428 return;
431 # a graphical depiction of a set of intervals
432 sub _ints_as_text {
433 my $ints = shift;
434 my @ints = @$ints;
435 my %pos;
436 for (@ints) {
437 $pos{$$_[0]}++;
438 $pos{$$_[1]}++;
441 my @pos = sort {$a<=>$b} keys %pos;
442 @pos = map {sprintf("%03d",$_)} @pos;
443 #header
444 my $max=0;
445 $max = (length > $max) ? length : $max for (@pos);
446 for my $j (0..$max-1) {
447 my $i = $max-1-$j;
448 my @line = map { substr($_, $j, 1) || '0' } @pos;
449 print join('', @line), "\n";
451 print '-' x @pos, "\n";
452 undef %pos;
453 @pos{map {sprintf("%d",$_)} @pos} = (0..@pos);
454 foreach (@ints) {
455 print ' ' x $pos{$$_[0]}, '[', ' ' x ($pos{$$_[1]}-$pos{$$_[0]}-1), ']', ' ' x (@pos-$pos{$$_[1]}), "\n";
461 =head2 containing_hsps()
463 Title : containing_hsps
464 Usage : @hsps = containing_hsps($interval, @hsps_to_search)
465 Function: Return a list of hsps whose coordinates completely contain the
466 given $interval
467 Returns : Array of HSP objects
468 Args : $interval : [$int1, $int2],
469 array of HSP objects
471 =cut
472 # could be more efficient if hsps are assumed ordered...
473 sub containing_hsps {
474 my $intvl = shift;
475 my @hsps = @_;
476 my @ret;
477 my ($beg, $end) = @$intvl;
478 foreach my $hsp (@hsps) {
479 my ($start, $stop) = ($hsp->start, $hsp->end);
480 push @ret, $hsp if ( $start <= $beg and $end <= $stop );
482 return @ret;
487 =head2 covering_groups()
489 Title : covering_groups
490 Usage :
491 Function: divide a list of **ordered,disjoint** intervals (as from a
492 coverage map) into a set of disjoint covering groups
493 Returns : array of arrayrefs, each arrayref a covering set of
494 intervals
495 Args : array of intervals
497 =cut
499 sub covering_groups {
500 my @intervals = @_;
501 return unless @intervals;
502 my (@groups, $grp);
503 push @{$groups[0]}, shift @intervals;
504 $grp = $groups[0];
505 for (my $intvl = shift @intervals; @intervals; $intvl = shift @intervals) {
506 if ( $intvl->[0] - $grp->[-1][1] == 1 ) { # intervals are direcly adjacent
507 push @$grp, $intvl;
509 else {
510 $grp = [$intvl];
511 push @groups, $grp;
514 return @groups;
518 # need our own subsequencer for hsps.
520 package Bio::Search::HSP::HSPI;
522 use strict;
523 use warnings;
525 =head2 matches_MT
527 Title : matches_MT
528 Usage : $hsp->matches($type, $action, $start, $end)
529 Purpose : Get the total number of identical or conserved matches
530 in the query or sbjct sequence for the given HSP. Optionally can
531 report data within a defined interval along the seq.
532 Returns : scalar int
533 Args :
534 Comments : Relies on seq_str('match') to get the string of alignment symbols
535 between the query and sbjct lines which are used for determining
536 the number of identical and conservative matches.
537 Note : Modeled on Bio::Search::HSP::HSPI::matches
539 =cut
541 sub matches_MT {
542 use integer;
543 my( $self, @args ) = @_;
544 my($type, $action, $beg, $end) = $self->_rearrange( [qw(TYPE ACTION START END)], @args);
545 my @actions = qw( identities conserved searchutils );
547 # prep $type
548 $self->throw("Type not specified") if !defined $type;
549 $self->throw("Type '$type' unrecognized") unless grep(/^$type$/,qw(query hit subject));
550 $type = 'hit' if $type eq 'subject';
552 # prep $action
553 $self->throw("Action not specified") if !defined $action;
554 $self->throw("Action '$action' unrecognized") unless grep(/^$action$/, @actions);
556 my ($len_id, $len_cons);
557 my $c = Bio::Search::Tiling::MapTileUtils::_mapping_coeff($self, $type);
558 if ((defined $beg && !defined $end) || (!defined $beg && defined $end)) {
559 $self->throw("Both start and end are required");
561 elsif ( (!defined($beg) && !defined($end)) || !$self->seq_str('match') ) {
562 ## Get data for the whole alignment.
563 # the reported values x mapping
564 $self->debug("Sequence data not present in report; returning data for entire HSP") unless $self->seq_str('match');
565 ($len_id, $len_cons) = map { $c*$_ } ($self->num_identical, $self->num_conserved);
566 for ($action) {
567 $_ eq 'identities' && do {
568 return $len_id;
570 $_ eq 'conserved' && do {
571 return $len_cons;
573 $_ eq 'searchutils' && do {
574 return ($len_id, $len_cons);
576 do {
577 $self->throw("What are YOU doing here?");
581 else {
582 ## Get the substring representing the desired sub-section of aln.
583 my($start,$stop) = $self->range($type);
584 if ( $beg < $start or $stop < $end ) {
585 $self->throw("Start/stop out of range [$start, $stop]");
588 # handle gaps
589 my $match_str = $self->seq_str('match');
590 if ($self->gaps) {
591 # strip the homology string of gap positions relative
592 # to the target type
593 $match_str = $self->seq_str('match');
594 my $tgt = $self->seq_str($type);
595 my $encode = $match_str ^ $tgt;
596 my $zap = '-'^' ';
597 $encode =~ s/$zap//g;
598 $tgt =~ s/-//g;
599 $match_str = $tgt ^ $encode;
600 # match string is now the correct length for substr'ing below,
601 # given that start and end are gapless coordinates in the
602 # blast report
605 my $seq = "";
606 $seq = substr( $match_str,
607 int( ($beg-$start)/Bio::Search::Tiling::MapTileUtils::_mapping_coeff($self, $type) ),
608 int( 1+($end-$beg)/Bio::Search::Tiling::MapTileUtils::_mapping_coeff($self, $type) )
611 if(!CORE::length $seq) {
612 $self->throw("Undefined sub-sequence ($beg,$end). Valid range = $start - $stop");
615 $seq =~ s/ //g; # remove space (no info).
616 $len_cons = (CORE::length $seq)*(Bio::Search::Tiling::MapTileUtils::_mapping_coeff($self,$type));
617 $seq =~ s/\+//g; # remove '+' characters (conservative substitutions)
618 $len_id = (CORE::length $seq)*(Bio::Search::Tiling::MapTileUtils::_mapping_coeff($self,$type));
619 for ($action) {
620 $_ eq 'identities' && do {
621 return $len_id;
623 $_ eq 'conserved' && do {
624 return $len_cons;
626 $_ eq 'searchutils' && do {
627 return ($len_id, $len_cons);
629 do {
630 $self->throw("What are YOU doing here?");
638 package Bio::LocatableSeq;
639 use strict;
640 use warnings;
642 # mixin the Bio::FeatureHolderI implementation of
643 # Bio::Seq -- for get_tiled_aln
645 =head2 get_SeqFeatures
647 Title : get_SeqFeatures
648 Usage :
649 Function: Get the feature objects held by this feature holder.
651 Features which are not top-level are subfeatures of one or
652 more of the returned feature objects, which means that you
653 must traverse the subfeature arrays of each top-level
654 feature object in order to traverse all features associated
655 with this sequence.
657 Top-level features can be obtained by tag, specified in
658 the argument.
660 Use get_all_SeqFeatures() if you want the feature tree
661 flattened into one single array.
663 Example :
664 Returns : an array of Bio::SeqFeatureI implementing objects
665 Args : [optional] scalar string (feature tag)
668 =cut
670 sub get_SeqFeatures{
671 my $self = shift;
672 my $tag = shift;
674 if( !defined $self->{'_as_feat'} ) {
675 $self->{'_as_feat'} = [];
677 if ($tag) {
678 return map { $_->primary_tag eq $tag ? $_ : () } @{$self->{'_as_feat'}};
680 else {
681 return @{$self->{'_as_feat'}};
685 =head2 feature_count
687 Title : feature_count
688 Usage : $seq->feature_count()
689 Function: Return the number of SeqFeatures attached to a sequence
690 Returns : integer representing the number of SeqFeatures
691 Args : None
694 =cut
696 sub feature_count {
697 my ($self) = @_;
698 if (defined($self->{'_as_feat'})) {
699 return ($#{$self->{'_as_feat'}} + 1);
700 } else {
701 return 0;
705 =head2 add_SeqFeature
707 Title : add_SeqFeature
708 Usage : $seq->add_SeqFeature($feat);
709 $seq->add_SeqFeature(@feat);
710 Function: Adds the given feature object (or each of an array of feature
711 objects to the feature array of this
712 sequence. The object passed is required to implement the
713 Bio::SeqFeatureI interface.
714 Returns : 1 on success
715 Args : A Bio::SeqFeatureI implementing object, or an array of such objects.
718 =cut
720 sub add_SeqFeature {
721 my ($self,@feat) = @_;
722 $self->{'_as_feat'} = [] unless $self->{'_as_feat'};
723 foreach my $feat ( @feat ) {
724 if( !$feat->isa("Bio::SeqFeatureI") ) {
725 $self->throw("$feat is not a SeqFeatureI and that's what we expect...");
727 $feat->attach_seq($self);
728 push(@{$self->{'_as_feat'}},$feat);
730 return 1;
733 =head2 remove_SeqFeatures
735 Title : remove_SeqFeatures
736 Usage : $seq->remove_SeqFeatures();
737 Function: Flushes all attached SeqFeatureI objects.
739 To remove individual feature objects, delete those from the returned
740 array and re-add the rest.
741 Example :
742 Returns : The array of Bio::SeqFeatureI objects removed from this seq.
743 Args : None
746 =cut
748 sub remove_SeqFeatures {
749 my $self = shift;
751 return () unless $self->{'_as_feat'};
752 my @feats = @{$self->{'_as_feat'}};
753 $self->{'_as_feat'} = [];
754 return @feats;