2 package Bio
::Search
::Tiling
::MapTileUtils
;
10 our @ISA = qw( Exporter );
11 our @EXPORT = qw( get_intervals_from_hsps
22 # assumed: intervals are [$a0, $a1], with $a0 <= $a1
25 Bio::Search::Tiling::MapTileUtils - utilities for manipulating closed intervals for an HSP tiling algorithm
37 An "interval" in this module is defined as an arrayref C<[$a0, $a1]>, where
38 C<$a0, $a1> are scalar numbers satisfying C<$a0 E<lt>= $a1>.
42 Mark A. Jensen - maj -at- fortinbras -dot- us
46 =head2 interval_tiling
48 Title : interval_tiling()
49 Usage : @tiling = interval_tiling( \@array_of_intervals )
50 Function: Find minimal set of intervals covering the input set
51 Returns : array of arrayrefs of the form
52 ( [$interval => [ @indices_of_collapsed_input_intervals ]], ...)
53 Args : arrayref of intervals
58 return unless $_[0]; # no input
59 my $n = scalar @
{$_[0]};
61 @input{(0..$n-1)} = @
{$_[0]};
62 my @active = (0..$n-1);
67 my $tgt = $input{my $tgt_i = shift @active};
68 push @tiled_ints, $tgt_i;
69 my $tgt_not_disjoint = 1;
70 while ($tgt_not_disjoint) {
71 $tgt_not_disjoint = 0;
72 while (my $try_i = shift @active) {
73 my $try = $input{$try_i};
74 if ( !are_disjoint
($tgt, $try) ) {
75 $tgt = min_covering_interval
($tgt,$try);
76 push @tiled_ints, $try_i;
77 $tgt_not_disjoint = 1;
83 if (!$tgt_not_disjoint) {
84 push @ret, [ $tgt => [@tiled_ints] ];
94 =head2 decompose_interval
96 Title : decompose_interval
97 Usage : @decomposition = decompose_interval( \@overlappers )
98 Function: Calculate the disjoint decomposition of a set of
99 overlapping intervals, each annotated with a list of
101 Returns : array of arrayrefs of the form
102 ( [[@interval] => [@indices_of_coverers]], ... )
103 Args : arrayref of intervals (arrayrefs like [$a0, $a1], with
104 Note : Each returned interval is associated with a list of indices of the
105 original intervals that cover that decomposition component
106 (scalar size of this list could be called the 'coverage coefficient')
107 Note : Coverage: each component of the decomp is completely contained
108 in the input intervals that overlap it, by construction.
109 Caveat : This routine expects the members of @overlappers to overlap,
110 but doesn't check this.
114 ### what if the input intervals don't overlap?? They MUST overlap; that's
115 ### what interval_tiling() is for.
117 sub decompose_interval
{
118 return unless $_[0]; # no input
121 ### this is ok, but need to handle the case where a lh and rh endpoint
125 # every lh endpoint generates (lh-1, lh)
126 # every rh endpoint generates (rh, rh+)
133 # sort, create singletons if nec.
135 @a = sort {$a<=>$b} keys %flat;
136 # throw out first and last (meeting a boundary condition)
138 # look for singletons
139 @flat = (shift @a, shift @a);
140 if ( $flat[1]-$flat[0] == 1 ) {
141 @flat = ($flat[0],$flat[0], $flat[1]);
143 while (my $a = shift @a) {
144 if ($a-$flat[-2]==2) {
145 push @flat, $flat[-1]; # create singleton interval
149 if ($flat[-1]-$flat[-2]==1 and @flat % 2) {
150 push @flat, $flat[-1];
152 # component intervals are consecutive pairs
154 while (my $a = shift @flat) {
155 push @decomp, [$a, shift @flat];
158 # for each component, return a list of the indices of the input intervals
159 # that cover the component.
161 foreach my $i (0..$#decomp) {
162 foreach my $j (0..$#ints) {
163 unless (are_disjoint
($decomp[$i], $ints[$j])) {
164 if (defined $coverage[$i]) {
165 push @
{$coverage[$i]}, $j;
168 $coverage[$i] = [$j];
173 return map { [$decomp[$_] => $coverage[$_]] } (0..$#decomp);
179 Usage : are_disjoint( [$a0, $a1], [$b0, $b1] )
180 Function: Determine if two intervals are disjoint
181 Returns : True if the intervals are disjoint, false if they overlap
182 Args : array of two intervals
187 my ($int1, $int2) = @_;
188 return 1 if ( $$int1[1] < $$int2[0] ) || ( $$int2[1] < $$int1[0]);
192 =head2 min_covering_interval
194 Title : min_covering_interval
195 Usage : $interval = min_covering_interval( [$a0,$a1],[$b0,$b1] )
196 Function: Determine the minimal covering interval for two intervals
197 Returns : an interval
202 sub min_covering_interval
{
203 my ($int1, $int2) = @_;
204 my @a = sort {$a <=> $b} (@
$int1, @
$int2);
205 return [$a[0],$a[-1]];
208 =head2 get_intervals_from_hsps
210 Title : get_intervals_from_hsps
211 Usage : @intervals = get_intervals_from_hsps($type, @hsp_objects)
212 Function: Return array of intervals of the form [ $start, $end ],
213 from an array of hsp objects
214 Returns : an array of intervals
215 Args : scalar $type, array of HSPI objects; where $type is one of 'hit',
220 sub get_intervals_from_hsps
{
223 if (ref($type) =~ /HSP/) {
227 elsif ( !grep /^$type$/,qw( hit subject query ) ) {
228 die "Unknown HSP type '$type'";
230 $type = 'hit' if $type eq 'subject';
234 # push @ret, [ $_->start($type), $_->end($type)];
235 push @ret, [ $_->range($type) ];
240 # fast, clear, nasty, brutish and short.
241 # for _allowable_filters(), _set_mapping()
242 # covers BLAST, FAST families
243 # FASTA is ambiguous (nt or aa) based on alg name only
246 'N' => { 'mapping' => [1,1],
247 'def_context' => ['p_','p_'],
248 'has_strand' => [1, 1],
249 'has_frame' => [0, 0]},
250 'P' => { 'mapping' => [1,1],
251 'def_context' => ['all','all'],
252 'has_strand' => [0, 0],
253 'has_frame' => [0, 0]},
254 'X' => { 'mapping' => [3, 1],
255 'def_context' => [undef,'all'],
256 'has_strand' => [1, 0],
257 'has_frame' => [1, 0]},
258 'Y' => { 'mapping' => [3, 1],
259 'def_context' => [undef,'all'],
260 'has_strand' => [1, 0],
261 'has_frame' => [1, 0]},
262 'TA' => { 'mapping' => [1, 3],
263 'def_context' => ['all',undef],
264 'has_strand' => [0, 1],
265 'has_frame' => [0, 1]},
266 'TN' => { 'mapping' => [1, 3],
267 'def_context' => ['p_',undef],
268 'has_strand' => [1,1],
269 'has_frame' => [0, 1]},
270 'TX' => { 'mapping' => [3, 3],
271 'def_context' => [undef,undef],
272 'has_strand' => [1, 1],
273 'has_frame' => [1, 1]},
274 'TY' => { 'mapping' => [3, 3],
275 'def_context' => [undef,undef],
276 'has_strand' => [1, 1],
277 'has_frame' => [1, 1]}
280 =head2 _allowable_filters
282 Title : _allowable_filters
283 Usage : _allowable_filters($Bio_Search_Hit_HitI, $type)
284 Function: Return the HSP filters (strand, frame) allowed,
285 based on the reported algorithm
286 Returns : String encoding allowable filters:
287 s = strand, f = frame
288 Empty string if no filters allowed
289 undef if algorithm unrecognized
290 Args : A Bio::Search::Hit::HitI object,
291 scalar $type, one of 'hit', 'subject', 'query';
296 sub _allowable_filters
{
300 unless (grep /^$type$/, qw( h q s ) ) {
301 warn("Unknown type '$type'; returning ''");
304 $type = 'h' if $type eq 's';
305 my $alg = $hit->algorithm;
307 # pretreat (i.e., kludge it)
308 $alg =~ /^RPS/ and ($alg) = ($alg =~ /\(([^)]+)\)/);
310 for ($hit->algorithm) {
314 /(.?)BLAST(.?)/i && do {
315 return $$alg_lookup{$1.$2}{$type};
317 /(.?)FAST(.?)/ && do {
318 return $$alg_lookup{$1.$2}{$type};
328 =head2 _set_attributes
330 Title : _set_attributes
331 Usage : $tiling->_set_attributes()
332 Function: Sets attributes for invocant
333 that depend on algorithm name
334 Returns : True on success
336 Note : setting based on the configuration table
341 sub _set_attributes
{
343 my $alg = $self->hit->algorithm;
345 # pretreat (i.e., kludge it)
346 $alg =~ /^RPS/ and ($alg) = ($alg =~ /\(([^)]+)\)/);
350 ($self->{_mapping_query
},$self->{_mapping_hit
}) = (1,1);
351 ($self->{_def_context_query
},$self->{_def_context_hit
}) =
353 ($self->{_has_frame_query
},$self->{_has_frame_hit
}) =
355 ($self->{_has_strand_query
},$self->{_has_strand_hit
}) =
359 /(.?)BLAST(.?)/i && do {
360 ($self->{_mapping_query
},$self->{_mapping_hit
}) =
361 @
{$$alg_lookup{$1.$2}{mapping
}};
362 ($self->{_def_context_query
},$self->{_def_context_hit
}) =
363 @
{$$alg_lookup{$1.$2}{def_context
}};
364 ($self->{_has_frame_query
},$self->{_has_frame_hit
}) =
365 @
{$$alg_lookup{$1.$2}{has_frame
}};
366 ($self->{_has_strand_query
},$self->{_has_strand_hit
}) =
367 @
{$$alg_lookup{$1.$2}{has_strand
}};
370 /(.?)FAST(.?)/ && do {
371 ($self->{_mapping_query
},$self->{_mapping_hit
}) =
372 @
{$$alg_lookup{$1.$2}{mapping
}};
373 ($self->{_def_context_query
},$self->{_def_context_hit
}) =
374 @
{$$alg_lookup{$1.$2}{def_context
}};
375 ($self->{_has_frame_query
},$self->{_has_frame_hit
}) =
376 @
{$$alg_lookup{$1.$2}{has_frame
}};
377 ($self->{_has_strand_query
},$self->{_has_strand_hit
}) =
378 @
{$$alg_lookup{$1.$2}{has_strand
}};
382 $self->warn("Unrecognized algorithm '$alg'; defaults may not work");
383 ($self->{_mapping_query
},$self->{_mapping_hit
}) = (1,1);
384 ($self->{_def_context_query
},$self->{_def_context_hit
}) =
386 ($self->{_has_frame_query
},$self->{_has_frame_hit
}) =
388 ($self->{_has_strand_query
},$self->{_has_strand_hit
}) =
400 my %type_i = ( 'query' => 0, 'hit' => 1 );
401 unless ( ref($obj) && $obj->can('algorithm') ) {
402 $obj->warn("Object type unrecognized");
406 unless ( grep(/^$type$/, qw( query hit subject ) ) ) {
407 $obj->warn("Sequence type unrecognized");
410 $type = 'hit' if $type eq 'subject';
411 my $alg = $obj->algorithm;
413 # pretreat (i.e., kludge it)
414 $alg =~ /^RPS/ and ($alg) = ($alg =~ /\(([^)]+)\)/);
420 /(.?)BLAST(.?)/i && do {
421 return $$alg_lookup{$1.$2}{'mapping'}[$type_i{$type}];
423 /(.?)FAST(.?)/ && do {
424 return $$alg_lookup{$1.$2}{'mapping'}[$type_i{$type}];
433 # a graphical depiction of a set of intervals
443 my @pos = sort {$a<=>$b} keys %pos;
444 @pos = map {sprintf("%03d",$_)} @pos;
447 $max = (length > $max) ?
length : $max for (@pos);
448 for my $j (0..$max-1) {
450 my @line = map { substr($_, $j, 1) || '0' } @pos;
451 print join('', @line), "\n";
453 print '-' x
@pos, "\n";
455 @pos{map {sprintf("%d",$_)} @pos} = (0..@pos);
457 print ' ' x
$pos{$$_[0]}, '[', ' ' x
($pos{$$_[1]}-$pos{$$_[0]}-1), ']', ' ' x
(@pos-$pos{$$_[1]}), "\n";
463 =head2 containing_hsps()
465 Title : containing_hsps
466 Usage : @hsps = containing_hsps($interval, @hsps_to_search)
467 Function: Return a list of hsps whose coordinates completely contain the
469 Returns : Array of HSP objects
470 Args : $interval : [$int1, $int2],
474 # could be more efficient if hsps are assumed ordered...
475 sub containing_hsps
{
479 my ($beg, $end) = @
$intvl;
480 foreach my $hsp (@hsps) {
481 my ($start, $stop) = ($hsp->start, $hsp->end);
482 push @ret, $hsp if ( $start <= $beg and $end <= $stop );
489 =head2 covering_groups()
491 Title : covering_groups
493 Function: divide a list of **ordered,disjoint** intervals (as from a
494 coverage map) into a set of disjoint covering groups
495 Returns : array of arrayrefs, each arrayref a covering set of
497 Args : array of intervals
501 sub covering_groups
{
503 return unless @intervals;
505 push @
{$groups[0]}, shift @intervals;
507 for (my $intvl = shift @intervals; @intervals; $intvl = shift @intervals) {
508 if ( $intvl->[0] - $grp->[-1][1] == 1 ) { # intervals are direcly adjacent
520 # need our own subsequencer for hsps.
522 package Bio
::Search
::HSP
::HSPI
;
532 Usage : $hsp->matches($type, $action, $start, $end)
533 Purpose : Get the total number of identical or conserved matches
534 in the query or sbjct sequence for the given HSP. Optionally can
535 report data within a defined interval along the seq.
538 Comments : Relies on seq_str('match') to get the string of alignment symbols
539 between the query and sbjct lines which are used for determining
540 the number of identical and conservative matches.
541 Note : Modeled on Bio::Search::HSP::HSPI::matches
547 my( $self, @args ) = @_;
548 my($type, $action, $beg, $end) = $self->_rearrange( [qw(TYPE ACTION START END)], @args);
549 my @actions = qw( identities conserved searchutils );
552 $self->throw("Type not specified") if !defined $type;
553 $self->throw("Type '$type' unrecognized") unless grep(/^$type$/,qw(query hit subject));
554 $type = 'hit' if $type eq 'subject';
557 $self->throw("Action not specified") if !defined $action;
558 $self->throw("Action '$action' unrecognized") unless grep(/^$action$/, @actions);
560 my ($len_id, $len_cons);
561 my $c = Bio
::Search
::Tiling
::MapTileUtils
::_mapping_coeff
($self, $type);
562 if ((defined $beg && !defined $end) || (!defined $beg && defined $end)) {
563 $self->throw("Both start and end are required");
565 elsif ( (!defined($beg) && !defined($end)) || !$self->seq_str('match') ) {
566 ## Get data for the whole alignment.
567 # the reported values x mapping
568 $self->debug("Sequence data not present in report; returning data for entire HSP") unless $self->seq_str('match');
569 ($len_id, $len_cons) = map { $c*$_ } ($self->num_identical, $self->num_conserved);
571 $_ eq 'identities' && do {
574 $_ eq 'conserved' && do {
577 $_ eq 'searchutils' && do {
578 return ($len_id, $len_cons);
581 $self->throw("What are YOU doing here?");
586 ## Get the substring representing the desired sub-section of aln.
587 my($start,$stop) = $self->range($type);
588 if ( $beg < $start or $stop < $end ) {
589 $self->throw("Start/stop out of range [$start, $stop]");
593 my $match_str = $self->seq_str('match');
595 # strip the homology string of gap positions relative
597 $match_str = $self->seq_str('match');
598 my $tgt = $self->seq_str($type);
599 my $encode = $match_str ^ $tgt;
601 $encode =~ s/$zap//g;
603 $match_str = $tgt ^ $encode;
604 # match string is now the correct length for substr'ing below,
605 # given that start and end are gapless coordinates in the
610 $seq = substr( $match_str,
611 int( ($beg-$start)/Bio
::Search
::Tiling
::MapTileUtils
::_mapping_coeff
($self, $type) ),
612 int( 1+($end-$beg)/Bio
::Search
::Tiling
::MapTileUtils
::_mapping_coeff
($self, $type) )
615 if(!CORE
::length $seq) {
616 $self->throw("Undefined sub-sequence ($beg,$end). Valid range = $start - $stop");
619 $seq =~ s/ //g; # remove space (no info).
620 $len_cons = (CORE
::length $seq)*(Bio
::Search
::Tiling
::MapTileUtils
::_mapping_coeff
($self,$type));
621 $seq =~ s/\+//g; # remove '+' characters (conservative substitutions)
622 $len_id = (CORE
::length $seq)*(Bio
::Search
::Tiling
::MapTileUtils
::_mapping_coeff
($self,$type));
624 $_ eq 'identities' && do {
627 $_ eq 'conserved' && do {
630 $_ eq 'searchutils' && do {
631 return ($len_id, $len_cons);
634 $self->throw("What are YOU doing here?");
642 package Bio
::LocatableSeq
;
648 # mixin the Bio::FeatureHolderI implementation of
649 # Bio::Seq -- for get_tiled_aln
651 =head2 get_SeqFeatures
653 Title : get_SeqFeatures
655 Function: Get the feature objects held by this feature holder.
657 Features which are not top-level are subfeatures of one or
658 more of the returned feature objects, which means that you
659 must traverse the subfeature arrays of each top-level
660 feature object in order to traverse all features associated
663 Top-level features can be obtained by tag, specified in
666 Use get_all_SeqFeatures() if you want the feature tree
667 flattened into one single array.
670 Returns : an array of Bio::SeqFeatureI implementing objects
671 Args : [optional] scalar string (feature tag)
680 if( !defined $self->{'_as_feat'} ) {
681 $self->{'_as_feat'} = [];
684 return map { $_->primary_tag eq $tag ?
$_ : () } @
{$self->{'_as_feat'}};
687 return @
{$self->{'_as_feat'}};
693 Title : feature_count
694 Usage : $seq->feature_count()
695 Function: Return the number of SeqFeatures attached to a sequence
696 Returns : integer representing the number of SeqFeatures
704 if (defined($self->{'_as_feat'})) {
705 return ($#{$self->{'_as_feat'}} + 1);
711 =head2 add_SeqFeature
713 Title : add_SeqFeature
714 Usage : $seq->add_SeqFeature($feat);
715 $seq->add_SeqFeature(@feat);
716 Function: Adds the given feature object (or each of an array of feature
717 objects to the feature array of this
718 sequence. The object passed is required to implement the
719 Bio::SeqFeatureI interface.
720 Returns : 1 on success
721 Args : A Bio::SeqFeatureI implementing object, or an array of such objects.
727 my ($self,@feat) = @_;
728 $self->{'_as_feat'} = [] unless $self->{'_as_feat'};
729 foreach my $feat ( @feat ) {
730 if( !$feat->isa("Bio::SeqFeatureI") ) {
731 $self->throw("$feat is not a SeqFeatureI and that's what we expect...");
733 $feat->attach_seq($self);
734 push(@
{$self->{'_as_feat'}},$feat);
739 =head2 remove_SeqFeatures
741 Title : remove_SeqFeatures
742 Usage : $seq->remove_SeqFeatures();
743 Function: Flushes all attached SeqFeatureI objects.
745 To remove individual feature objects, delete those from the returned
746 array and re-add the rest.
748 Returns : The array of Bio::SeqFeatureI objects removed from this seq.
754 sub remove_SeqFeatures
{
757 return () unless $self->{'_as_feat'};
758 my @feats = @
{$self->{'_as_feat'}};
759 $self->{'_as_feat'} = [];