2 package Bio
::Search
::Tiling
::MapTileUtils
;
8 our @ISA = qw( Exporter );
9 our @EXPORT = qw( get_intervals_from_hsps
20 # assumed: intervals are [$a0, $a1], with $a0 <= $a1
23 Bio::Search::Tiling::MapTileUtils - utilities for manipulating closed intervals for an HSP tiling algorithm
35 An "interval" in this module is defined as an arrayref C<[$a0, $a1]>, where
36 C<$a0, $a1> are scalar numbers satisfying C<$a0 E<lt>= $a1>.
40 Mark A. Jensen - maj -at- fortinbras -dot- us
44 =head2 interval_tiling
46 Title : interval_tiling()
47 Usage : @tiling = interval_tiling( \@array_of_intervals )
48 Function: Find minimal set of intervals covering the input set
49 Returns : array of arrayrefs of the form
50 ( [$interval => [ @indices_of_collapsed_input_intervals ]], ...)
51 Args : arrayref of intervals
56 return unless $_[0]; # no input
57 my $n = scalar @
{$_[0]};
59 @input{(0..$n-1)} = @
{$_[0]};
60 my @active = (0..$n-1);
65 my $tgt = $input{my $tgt_i = shift @active};
66 push @tiled_ints, $tgt_i;
67 my $tgt_not_disjoint = 1;
68 while ($tgt_not_disjoint) {
69 $tgt_not_disjoint = 0;
70 while (my $try_i = shift @active) {
71 my $try = $input{$try_i};
72 if ( !are_disjoint
($tgt, $try) ) {
73 $tgt = min_covering_interval
($tgt,$try);
74 push @tiled_ints, $try_i;
75 $tgt_not_disjoint = 1;
81 if (!$tgt_not_disjoint) {
82 push @ret, [ $tgt => [@tiled_ints] ];
92 =head2 decompose_interval
94 Title : decompose_interval
95 Usage : @decomposition = decompose_interval( \@overlappers )
96 Function: Calculate the disjoint decomposition of a set of
97 overlapping intervals, each annotated with a list of
99 Returns : array of arrayrefs of the form
100 ( [[@interval] => [@indices_of_coverers]], ... )
101 Args : arrayref of intervals (arrayrefs like [$a0, $a1], with
102 Note : Each returned interval is associated with a list of indices of the
103 original intervals that cover that decomposition component
104 (scalar size of this list could be called the 'coverage coefficient')
105 Note : Coverage: each component of the decomp is completely contained
106 in the input intervals that overlap it, by construction.
107 Caveat : This routine expects the members of @overlappers to overlap,
108 but doesn't check this.
112 ### what if the input intervals don't overlap?? They MUST overlap; that's
113 ### what interval_tiling() is for.
115 sub decompose_interval
{
116 return unless $_[0]; # no input
119 ### this is ok, but need to handle the case where a lh and rh endpoint
123 # every lh endpoint generates (lh-1, lh)
124 # every rh endpoint generates (rh, rh+)
131 # sort, create singletons if nec.
133 @a = sort {$a<=>$b} keys %flat;
134 # throw out first and last (meeting a boundary condition)
136 # look for singletons
137 @flat = (shift @a, shift @a);
138 if ( $flat[1]-$flat[0] == 1 ) {
139 @flat = ($flat[0],$flat[0], $flat[1]);
141 while (my $a = shift @a) {
142 if ($a-$flat[-2]==2) {
143 push @flat, $flat[-1]; # create singleton interval
147 if ($flat[-1]-$flat[-2]==1 and @flat % 2) {
148 push @flat, $flat[-1];
150 # component intervals are consecutive pairs
152 while (my $a = shift @flat) {
153 push @decomp, [$a, shift @flat];
156 # for each component, return a list of the indices of the input intervals
157 # that cover the component.
159 foreach my $i (0..$#decomp) {
160 foreach my $j (0..$#ints) {
161 unless (are_disjoint
($decomp[$i], $ints[$j])) {
162 if (defined $coverage[$i]) {
163 push @
{$coverage[$i]}, $j;
166 $coverage[$i] = [$j];
171 return map { [$decomp[$_] => $coverage[$_]] } (0..$#decomp);
177 Usage : are_disjoint( [$a0, $a1], [$b0, $b1] )
178 Function: Determine if two intervals are disjoint
179 Returns : True if the intervals are disjoint, false if they overlap
180 Args : array of two intervals
185 my ($int1, $int2) = @_;
186 return 1 if ( $$int1[1] < $$int2[0] ) || ( $$int2[1] < $$int1[0]);
190 =head2 min_covering_interval
192 Title : min_covering_interval
193 Usage : $interval = min_covering_interval( [$a0,$a1],[$b0,$b1] )
194 Function: Determine the minimal covering interval for two intervals
195 Returns : an interval
200 sub min_covering_interval
{
201 my ($int1, $int2) = @_;
202 my @a = sort {$a <=> $b} (@
$int1, @
$int2);
203 return [$a[0],$a[-1]];
206 =head2 get_intervals_from_hsps
208 Title : get_intervals_from_hsps
209 Usage : @intervals = get_intervals_from_hsps($type, @hsp_objects)
210 Function: Return array of intervals of the form [ $start, $end ],
211 from an array of hsp objects
212 Returns : an array of intervals
213 Args : scalar $type, array of HSPI objects; where $type is one of 'hit',
218 sub get_intervals_from_hsps
{
221 if (ref($type) =~ /HSP/) {
225 elsif ( !grep /^$type$/,qw( hit subject query ) ) {
226 die "Unknown HSP type '$type'";
228 $type = 'hit' if $type eq 'subject';
232 # push @ret, [ $_->start($type), $_->end($type)];
233 push @ret, [ $_->range($type) ];
238 # fast, clear, nasty, brutish and short.
239 # for _allowable_filters(), _set_mapping()
240 # covers BLAST, FAST families
241 # FASTA is ambiguous (nt or aa) based on alg name only
244 'N' => { 'mapping' => [1,1],
245 'def_context' => ['p_','p_'],
246 'has_strand' => [1, 1],
247 'has_frame' => [0, 0]},
248 'P' => { 'mapping' => [1,1],
249 'def_context' => ['all','all'],
250 'has_strand' => [0, 0],
251 'has_frame' => [0, 0]},
252 'X' => { 'mapping' => [3, 1],
253 'def_context' => [undef,'all'],
254 'has_strand' => [1, 0],
255 'has_frame' => [1, 0]},
256 'Y' => { 'mapping' => [3, 1],
257 'def_context' => [undef,'all'],
258 'has_strand' => [1, 0],
259 'has_frame' => [1, 0]},
260 'TA' => { 'mapping' => [1, 3],
261 'def_context' => ['all',undef],
262 'has_strand' => [0, 1],
263 'has_frame' => [0, 1]},
264 'TN' => { 'mapping' => [1, 3],
265 'def_context' => ['p_',undef],
266 'has_strand' => [1,1],
267 'has_frame' => [0, 1]},
268 'TX' => { 'mapping' => [3, 3],
269 'def_context' => [undef,undef],
270 'has_strand' => [1, 1],
271 'has_frame' => [1, 1]},
272 'TY' => { 'mapping' => [3, 3],
273 'def_context' => [undef,undef],
274 'has_strand' => [1, 1],
275 'has_frame' => [1, 1]}
278 =head2 _allowable_filters
280 Title : _allowable_filters
281 Usage : _allowable_filters($Bio_Search_Hit_HitI, $type)
282 Function: Return the HSP filters (strand, frame) allowed,
283 based on the reported algorithm
284 Returns : String encoding allowable filters:
285 s = strand, f = frame
286 Empty string if no filters allowed
287 undef if algorithm unrecognized
288 Args : A Bio::Search::Hit::HitI object,
289 scalar $type, one of 'hit', 'subject', 'query';
294 sub _allowable_filters
{
298 unless (grep /^$type$/, qw( h q s ) ) {
299 warn("Unknown type '$type'; returning ''");
302 $type = 'h' if $type eq 's';
303 my $alg = $hit->algorithm;
305 # pretreat (i.e., kludge it)
306 $alg =~ /^RPS/ and ($alg) = ($alg =~ /\(([^)]+)\)/);
308 for ($hit->algorithm) {
312 /(.?)BLAST(.?)/i && do {
313 return $$alg_lookup{$1.$2}{$type};
315 /(.?)FAST(.?)/ && do {
316 return $$alg_lookup{$1.$2}{$type};
326 =head2 _set_attributes
328 Title : _set_attributes
329 Usage : $tiling->_set_attributes()
330 Function: Sets attributes for invocant
331 that depend on algorithm name
332 Returns : True on success
334 Note : setting based on the configuration table
339 sub _set_attributes
{
341 my $alg = $self->hit->algorithm;
343 # pretreat (i.e., kludge it)
344 $alg =~ /^RPS/ and ($alg) = ($alg =~ /\(([^)]+)\)/);
348 ($self->{_mapping_query
},$self->{_mapping_hit
}) = (1,1);
349 ($self->{_def_context_query
},$self->{_def_context_hit
}) =
351 ($self->{_has_frame_query
},$self->{_has_frame_hit
}) =
353 ($self->{_has_strand_query
},$self->{_has_strand_hit
}) =
357 /(.?)BLAST(.?)/i && do {
358 ($self->{_mapping_query
},$self->{_mapping_hit
}) =
359 @
{$$alg_lookup{$1.$2}{mapping
}};
360 ($self->{_def_context_query
},$self->{_def_context_hit
}) =
361 @
{$$alg_lookup{$1.$2}{def_context
}};
362 ($self->{_has_frame_query
},$self->{_has_frame_hit
}) =
363 @
{$$alg_lookup{$1.$2}{has_frame
}};
364 ($self->{_has_strand_query
},$self->{_has_strand_hit
}) =
365 @
{$$alg_lookup{$1.$2}{has_strand
}};
368 /(.?)FAST(.?)/ && do {
369 ($self->{_mapping_query
},$self->{_mapping_hit
}) =
370 @
{$$alg_lookup{$1.$2}{mapping
}};
371 ($self->{_def_context_query
},$self->{_def_context_hit
}) =
372 @
{$$alg_lookup{$1.$2}{def_context
}};
373 ($self->{_has_frame_query
},$self->{_has_frame_hit
}) =
374 @
{$$alg_lookup{$1.$2}{has_frame
}};
375 ($self->{_has_strand_query
},$self->{_has_strand_hit
}) =
376 @
{$$alg_lookup{$1.$2}{has_strand
}};
380 $self->warn("Unrecognized algorithm '$alg'; defaults may not work");
381 ($self->{_mapping_query
},$self->{_mapping_hit
}) = (1,1);
382 ($self->{_def_context_query
},$self->{_def_context_hit
}) =
384 ($self->{_has_frame_query
},$self->{_has_frame_hit
}) =
386 ($self->{_has_strand_query
},$self->{_has_strand_hit
}) =
398 my %type_i = ( 'query' => 0, 'hit' => 1 );
399 unless ( ref($obj) && $obj->can('algorithm') ) {
400 $obj->warn("Object type unrecognized");
404 unless ( grep(/^$type$/, qw( query hit subject ) ) ) {
405 $obj->warn("Sequence type unrecognized");
408 $type = 'hit' if $type eq 'subject';
409 my $alg = $obj->algorithm;
411 # pretreat (i.e., kludge it)
412 $alg =~ /^RPS/ and ($alg) = ($alg =~ /\(([^)]+)\)/);
418 /(.?)BLAST(.?)/i && do {
419 return $$alg_lookup{$1.$2}{'mapping'}[$type_i{$type}];
421 /(.?)FAST(.?)/ && do {
422 return $$alg_lookup{$1.$2}{'mapping'}[$type_i{$type}];
431 # a graphical depiction of a set of intervals
441 my @pos = sort {$a<=>$b} keys %pos;
442 @pos = map {sprintf("%03d",$_)} @pos;
445 $max = (length > $max) ?
length : $max for (@pos);
446 for my $j (0..$max-1) {
448 my @line = map { substr($_, $j, 1) || '0' } @pos;
449 print join('', @line), "\n";
451 print '-' x
@pos, "\n";
453 @pos{map {sprintf("%d",$_)} @pos} = (0..@pos);
455 print ' ' x
$pos{$$_[0]}, '[', ' ' x
($pos{$$_[1]}-$pos{$$_[0]}-1), ']', ' ' x
(@pos-$pos{$$_[1]}), "\n";
461 =head2 containing_hsps()
463 Title : containing_hsps
464 Usage : @hsps = containing_hsps($interval, @hsps_to_search)
465 Function: Return a list of hsps whose coordinates completely contain the
467 Returns : Array of HSP objects
468 Args : $interval : [$int1, $int2],
472 # could be more efficient if hsps are assumed ordered...
473 sub containing_hsps
{
477 my ($beg, $end) = @
$intvl;
478 foreach my $hsp (@hsps) {
479 my ($start, $stop) = ($hsp->start, $hsp->end);
480 push @ret, $hsp if ( $start <= $beg and $end <= $stop );
487 =head2 covering_groups()
489 Title : covering_groups
491 Function: divide a list of **ordered,disjoint** intervals (as from a
492 coverage map) into a set of disjoint covering groups
493 Returns : array of arrayrefs, each arrayref a covering set of
495 Args : array of intervals
499 sub covering_groups
{
501 return unless @intervals;
503 push @
{$groups[0]}, shift @intervals;
505 for (my $intvl = shift @intervals; @intervals; $intvl = shift @intervals) {
506 if ( $intvl->[0] - $grp->[-1][1] == 1 ) { # intervals are direcly adjacent
518 # need our own subsequencer for hsps.
520 package Bio
::Search
::HSP
::HSPI
;
528 Usage : $hsp->matches($type, $action, $start, $end)
529 Purpose : Get the total number of identical or conserved matches
530 in the query or sbjct sequence for the given HSP. Optionally can
531 report data within a defined interval along the seq.
534 Comments : Relies on seq_str('match') to get the string of alignment symbols
535 between the query and sbjct lines which are used for determining
536 the number of identical and conservative matches.
537 Note : Modeled on Bio::Search::HSP::HSPI::matches
543 my( $self, @args ) = @_;
544 my($type, $action, $beg, $end) = $self->_rearrange( [qw(TYPE ACTION START END)], @args);
545 my @actions = qw( identities conserved searchutils );
548 $self->throw("Type not specified") if !defined $type;
549 $self->throw("Type '$type' unrecognized") unless grep(/^$type$/,qw(query hit subject));
550 $type = 'hit' if $type eq 'subject';
553 $self->throw("Action not specified") if !defined $action;
554 $self->throw("Action '$action' unrecognized") unless grep(/^$action$/, @actions);
556 my ($len_id, $len_cons);
557 my $c = Bio
::Search
::Tiling
::MapTileUtils
::_mapping_coeff
($self, $type);
558 if ((defined $beg && !defined $end) || (!defined $beg && defined $end)) {
559 $self->throw("Both start and end are required");
561 elsif ( (!defined($beg) && !defined($end)) || !$self->seq_str('match') ) {
562 ## Get data for the whole alignment.
563 # the reported values x mapping
564 $self->debug("Sequence data not present in report; returning data for entire HSP") unless $self->seq_str('match');
565 ($len_id, $len_cons) = map { $c*$_ } ($self->num_identical, $self->num_conserved);
567 $_ eq 'identities' && do {
570 $_ eq 'conserved' && do {
573 $_ eq 'searchutils' && do {
574 return ($len_id, $len_cons);
577 $self->throw("What are YOU doing here?");
582 ## Get the substring representing the desired sub-section of aln.
583 my($start,$stop) = $self->range($type);
584 if ( $beg < $start or $stop < $end ) {
585 $self->throw("Start/stop out of range [$start, $stop]");
589 my $match_str = $self->seq_str('match');
591 # strip the homology string of gap positions relative
593 $match_str = $self->seq_str('match');
594 my $tgt = $self->seq_str($type);
595 my $encode = $match_str ^ $tgt;
597 $encode =~ s/$zap//g;
599 $match_str = $tgt ^ $encode;
600 # match string is now the correct length for substr'ing below,
601 # given that start and end are gapless coordinates in the
606 $seq = substr( $match_str,
607 int( ($beg-$start)/Bio
::Search
::Tiling
::MapTileUtils
::_mapping_coeff
($self, $type) ),
608 int( 1+($end-$beg)/Bio
::Search
::Tiling
::MapTileUtils
::_mapping_coeff
($self, $type) )
611 if(!CORE
::length $seq) {
612 $self->throw("Undefined sub-sequence ($beg,$end). Valid range = $start - $stop");
615 $seq =~ s/ //g; # remove space (no info).
616 $len_cons = (CORE
::length $seq)*(Bio
::Search
::Tiling
::MapTileUtils
::_mapping_coeff
($self,$type));
617 $seq =~ s/\+//g; # remove '+' characters (conservative substitutions)
618 $len_id = (CORE
::length $seq)*(Bio
::Search
::Tiling
::MapTileUtils
::_mapping_coeff
($self,$type));
620 $_ eq 'identities' && do {
623 $_ eq 'conserved' && do {
626 $_ eq 'searchutils' && do {
627 return ($len_id, $len_cons);
630 $self->throw("What are YOU doing here?");
638 package Bio
::LocatableSeq
;
642 # mixin the Bio::FeatureHolderI implementation of
643 # Bio::Seq -- for get_tiled_aln
645 =head2 get_SeqFeatures
647 Title : get_SeqFeatures
649 Function: Get the feature objects held by this feature holder.
651 Features which are not top-level are subfeatures of one or
652 more of the returned feature objects, which means that you
653 must traverse the subfeature arrays of each top-level
654 feature object in order to traverse all features associated
657 Top-level features can be obtained by tag, specified in
660 Use get_all_SeqFeatures() if you want the feature tree
661 flattened into one single array.
664 Returns : an array of Bio::SeqFeatureI implementing objects
665 Args : [optional] scalar string (feature tag)
674 if( !defined $self->{'_as_feat'} ) {
675 $self->{'_as_feat'} = [];
678 return map { $_->primary_tag eq $tag ?
$_ : () } @
{$self->{'_as_feat'}};
681 return @
{$self->{'_as_feat'}};
687 Title : feature_count
688 Usage : $seq->feature_count()
689 Function: Return the number of SeqFeatures attached to a sequence
690 Returns : integer representing the number of SeqFeatures
698 if (defined($self->{'_as_feat'})) {
699 return ($#{$self->{'_as_feat'}} + 1);
705 =head2 add_SeqFeature
707 Title : add_SeqFeature
708 Usage : $seq->add_SeqFeature($feat);
709 $seq->add_SeqFeature(@feat);
710 Function: Adds the given feature object (or each of an array of feature
711 objects to the feature array of this
712 sequence. The object passed is required to implement the
713 Bio::SeqFeatureI interface.
714 Returns : 1 on success
715 Args : A Bio::SeqFeatureI implementing object, or an array of such objects.
721 my ($self,@feat) = @_;
722 $self->{'_as_feat'} = [] unless $self->{'_as_feat'};
723 foreach my $feat ( @feat ) {
724 if( !$feat->isa("Bio::SeqFeatureI") ) {
725 $self->throw("$feat is not a SeqFeatureI and that's what we expect...");
727 $feat->attach_seq($self);
728 push(@
{$self->{'_as_feat'}},$feat);
733 =head2 remove_SeqFeatures
735 Title : remove_SeqFeatures
736 Usage : $seq->remove_SeqFeatures();
737 Function: Flushes all attached SeqFeatureI objects.
739 To remove individual feature objects, delete those from the returned
740 array and re-add the rest.
742 Returns : The array of Bio::SeqFeatureI objects removed from this seq.
748 sub remove_SeqFeatures
{
751 return () unless $self->{'_as_feat'};
752 my @feats = @
{$self->{'_as_feat'}};
753 $self->{'_as_feat'} = [];