maint: remove Travis stuff which has been replaced with Github actions (#325)
[bioperl-live.git] / lib / Bio / Search / Tiling / MapTileUtils.pm
blobe22eaf176ff74e2b76853b05c8cd3597f218e511
1 #$Id$
2 package Bio::Search::Tiling::MapTileUtils;
5 use strict;
6 use warnings;
7 use Exporter;
9 BEGIN {
10 our @ISA = qw( Exporter );
11 our @EXPORT = qw( get_intervals_from_hsps
12 interval_tiling
13 decompose_interval
14 containing_hsps
15 covering_groups
16 _allowable_filters
17 _set_attributes
18 _mapping_coeff);
21 # tiling trials
22 # assumed: intervals are [$a0, $a1], with $a0 <= $a1
23 =head1 NAME
25 Bio::Search::Tiling::MapTileUtils - utilities for manipulating closed intervals for an HSP tiling algorithm
27 =head1 SYNOPSIS
29 Not used directly.
31 =head1 DESCRIPTION
33 Not used directly.
35 =head1 NOTE
37 An "interval" in this module is defined as an arrayref C<[$a0, $a1]>, where
38 C<$a0, $a1> are scalar numbers satisfying C<$a0 E<lt>= $a1>.
40 =head1 AUTHOR
42 Mark A. Jensen - maj -at- fortinbras -dot- us
44 =head1 APPENDIX
46 =head2 interval_tiling
48 Title : interval_tiling()
49 Usage : @tiling = interval_tiling( \@array_of_intervals )
50 Function: Find minimal set of intervals covering the input set
51 Returns : array of arrayrefs of the form
52 ( [$interval => [ @indices_of_collapsed_input_intervals ]], ...)
53 Args : arrayref of intervals
55 =cut
57 sub interval_tiling {
58 return unless $_[0]; # no input
59 my $n = scalar @{$_[0]};
60 my %input;
61 @input{(0..$n-1)} = @{$_[0]};
62 my @active = (0..$n-1);
63 my @hold;
64 my @tiled_ints;
65 my @ret;
66 while (@active) {
67 my $tgt = $input{my $tgt_i = shift @active};
68 push @tiled_ints, $tgt_i;
69 my $tgt_not_disjoint = 1;
70 while ($tgt_not_disjoint) {
71 $tgt_not_disjoint = 0;
72 while (my $try_i = shift @active) {
73 my $try = $input{$try_i};
74 if ( !are_disjoint($tgt, $try) ) {
75 $tgt = min_covering_interval($tgt,$try);
76 push @tiled_ints, $try_i;
77 $tgt_not_disjoint = 1;
79 else {
80 push @hold, $try_i;
83 if (!$tgt_not_disjoint) {
84 push @ret, [ $tgt => [@tiled_ints] ];
85 @tiled_ints = ();
87 @active = @hold;
88 @hold = ();
91 return @ret;
94 =head2 decompose_interval
96 Title : decompose_interval
97 Usage : @decomposition = decompose_interval( \@overlappers )
98 Function: Calculate the disjoint decomposition of a set of
99 overlapping intervals, each annotated with a list of
100 covering intervals
101 Returns : array of arrayrefs of the form
102 ( [[@interval] => [@indices_of_coverers]], ... )
103 Args : arrayref of intervals (arrayrefs like [$a0, $a1], with
104 Note : Each returned interval is associated with a list of indices of the
105 original intervals that cover that decomposition component
106 (scalar size of this list could be called the 'coverage coefficient')
107 Note : Coverage: each component of the decomp is completely contained
108 in the input intervals that overlap it, by construction.
109 Caveat : This routine expects the members of @overlappers to overlap,
110 but doesn't check this.
112 =cut
114 ### what if the input intervals don't overlap?? They MUST overlap; that's
115 ### what interval_tiling() is for.
117 sub decompose_interval {
118 return unless $_[0]; # no input
119 my @ints = @{$_[0]};
120 my (%flat,@flat);
121 ### this is ok, but need to handle the case where a lh and rh endpoint
122 ### coincide...
123 # decomposition --
124 # flatten:
125 # every lh endpoint generates (lh-1, lh)
126 # every rh endpoint generates (rh, rh+)
127 foreach (@ints) {
128 $flat{$$_[0]-1}++;
129 $flat{$$_[0]}++;
130 $flat{$$_[1]}++;
131 $flat{$$_[1]+1}++;
133 # sort, create singletons if nec.
134 my @a;
135 @a = sort {$a<=>$b} keys %flat;
136 # throw out first and last (meeting a boundary condition)
137 shift @a; pop @a;
138 # look for singletons
139 @flat = (shift @a, shift @a);
140 if ( $flat[1]-$flat[0] == 1 ) {
141 @flat = ($flat[0],$flat[0], $flat[1]);
143 while (my $a = shift @a) {
144 if ($a-$flat[-2]==2) {
145 push @flat, $flat[-1]; # create singleton interval
147 push @flat, $a;
149 if ($flat[-1]-$flat[-2]==1 and @flat % 2) {
150 push @flat, $flat[-1];
152 # component intervals are consecutive pairs
153 my @decomp;
154 while (my $a = shift @flat) {
155 push @decomp, [$a, shift @flat];
158 # for each component, return a list of the indices of the input intervals
159 # that cover the component.
160 my @coverage;
161 foreach my $i (0..$#decomp) {
162 foreach my $j (0..$#ints) {
163 unless (are_disjoint($decomp[$i], $ints[$j])) {
164 if (defined $coverage[$i]) {
165 push @{$coverage[$i]}, $j;
167 else {
168 $coverage[$i] = [$j];
173 return map { [$decomp[$_] => $coverage[$_]] } (0..$#decomp);
176 =head2 are_disjoint
178 Title : are_disjoint
179 Usage : are_disjoint( [$a0, $a1], [$b0, $b1] )
180 Function: Determine if two intervals are disjoint
181 Returns : True if the intervals are disjoint, false if they overlap
182 Args : array of two intervals
184 =cut
186 sub are_disjoint {
187 my ($int1, $int2) = @_;
188 return 1 if ( $$int1[1] < $$int2[0] ) || ( $$int2[1] < $$int1[0]);
189 return 0;
192 =head2 min_covering_interval
194 Title : min_covering_interval
195 Usage : $interval = min_covering_interval( [$a0,$a1],[$b0,$b1] )
196 Function: Determine the minimal covering interval for two intervals
197 Returns : an interval
198 Args : two intervals
200 =cut
202 sub min_covering_interval {
203 my ($int1, $int2) = @_;
204 my @a = sort {$a <=> $b} (@$int1, @$int2);
205 return [$a[0],$a[-1]];
208 =head2 get_intervals_from_hsps
210 Title : get_intervals_from_hsps
211 Usage : @intervals = get_intervals_from_hsps($type, @hsp_objects)
212 Function: Return array of intervals of the form [ $start, $end ],
213 from an array of hsp objects
214 Returns : an array of intervals
215 Args : scalar $type, array of HSPI objects; where $type is one of 'hit',
216 'subject', 'query'
218 =cut
220 sub get_intervals_from_hsps {
221 my $type = shift;
222 my @hsps;
223 if (ref($type) =~ /HSP/) {
224 push @hsps, $type;
225 $type = 'query';
227 elsif ( !grep /^$type$/,qw( hit subject query ) ) {
228 die "Unknown HSP type '$type'";
230 $type = 'hit' if $type eq 'subject';
231 push @hsps, @_;
232 my @ret;
233 foreach (@hsps) {
234 # push @ret, [ $_->start($type), $_->end($type)];
235 push @ret, [ $_->range($type) ];
237 return @ret;
240 # fast, clear, nasty, brutish and short.
241 # for _allowable_filters(), _set_mapping()
242 # covers BLAST, FAST families
243 # FASTA is ambiguous (nt or aa) based on alg name only
245 my $alg_lookup = {
246 'N' => { 'mapping' => [1,1],
247 'def_context' => ['p_','p_'],
248 'has_strand' => [1, 1],
249 'has_frame' => [0, 0]},
250 'P' => { 'mapping' => [1,1],
251 'def_context' => ['all','all'],
252 'has_strand' => [0, 0],
253 'has_frame' => [0, 0]},
254 'X' => { 'mapping' => [3, 1],
255 'def_context' => [undef,'all'],
256 'has_strand' => [1, 0],
257 'has_frame' => [1, 0]},
258 'Y' => { 'mapping' => [3, 1],
259 'def_context' => [undef,'all'],
260 'has_strand' => [1, 0],
261 'has_frame' => [1, 0]},
262 'TA' => { 'mapping' => [1, 3],
263 'def_context' => ['all',undef],
264 'has_strand' => [0, 1],
265 'has_frame' => [0, 1]},
266 'TN' => { 'mapping' => [1, 3],
267 'def_context' => ['p_',undef],
268 'has_strand' => [1,1],
269 'has_frame' => [0, 1]},
270 'TX' => { 'mapping' => [3, 3],
271 'def_context' => [undef,undef],
272 'has_strand' => [1, 1],
273 'has_frame' => [1, 1]},
274 'TY' => { 'mapping' => [3, 3],
275 'def_context' => [undef,undef],
276 'has_strand' => [1, 1],
277 'has_frame' => [1, 1]}
280 =head2 _allowable_filters
282 Title : _allowable_filters
283 Usage : _allowable_filters($Bio_Search_Hit_HitI, $type)
284 Function: Return the HSP filters (strand, frame) allowed,
285 based on the reported algorithm
286 Returns : String encoding allowable filters:
287 s = strand, f = frame
288 Empty string if no filters allowed
289 undef if algorithm unrecognized
290 Args : A Bio::Search::Hit::HitI object,
291 scalar $type, one of 'hit', 'subject', 'query';
292 default is 'query'
294 =cut
296 sub _allowable_filters {
297 my $hit = shift;
298 my $type = shift;
299 $type ||= 'q';
300 unless (grep /^$type$/, qw( h q s ) ) {
301 warn("Unknown type '$type'; returning ''");
302 return '';
304 $type = 'h' if $type eq 's';
305 my $alg = $hit->algorithm;
307 # pretreat (i.e., kludge it)
308 $alg =~ /^RPS/ and ($alg) = ($alg =~ /\(([^)]+)\)/);
310 for ($hit->algorithm) {
311 /MEGABLAST/i && do {
312 return qr/[s]/;
314 /(.?)BLAST(.?)/i && do {
315 return $$alg_lookup{$1.$2}{$type};
317 /(.?)FAST(.?)/ && do {
318 return $$alg_lookup{$1.$2}{$type};
320 do { # unrecognized
321 last;
324 return;
328 =head2 _set_attributes
330 Title : _set_attributes
331 Usage : $tiling->_set_attributes()
332 Function: Sets attributes for invocant
333 that depend on algorithm name
334 Returns : True on success
335 Args : none
336 Note : setting based on the configuration table
337 %alg_lookup
339 =cut
341 sub _set_attributes {
342 my $self = shift;
343 my $alg = $self->hit->algorithm;
345 # pretreat (i.e., kludge it)
346 $alg =~ /^RPS/ and ($alg) = ($alg =~ /\(([^)]+)\)/);
348 for ($alg) {
349 /MEGABLAST/i && do {
350 ($self->{_mapping_query},$self->{_mapping_hit}) = (1,1);
351 ($self->{_def_context_query},$self->{_def_context_hit}) =
352 ('p_','p_');
353 ($self->{_has_frame_query},$self->{_has_frame_hit}) =
354 (0, 0);
355 ($self->{_has_strand_query},$self->{_has_strand_hit}) =
356 (1, 1);
357 last;
359 /(.?)BLAST(.?)/i && do {
360 ($self->{_mapping_query},$self->{_mapping_hit}) =
361 @{$$alg_lookup{$1.$2}{mapping}};
362 ($self->{_def_context_query},$self->{_def_context_hit}) =
363 @{$$alg_lookup{$1.$2}{def_context}};
364 ($self->{_has_frame_query},$self->{_has_frame_hit}) =
365 @{$$alg_lookup{$1.$2}{has_frame}};
366 ($self->{_has_strand_query},$self->{_has_strand_hit}) =
367 @{$$alg_lookup{$1.$2}{has_strand}};
368 last;
370 /(.?)FAST(.?)/ && do {
371 ($self->{_mapping_query},$self->{_mapping_hit}) =
372 @{$$alg_lookup{$1.$2}{mapping}};
373 ($self->{_def_context_query},$self->{_def_context_hit}) =
374 @{$$alg_lookup{$1.$2}{def_context}};
375 ($self->{_has_frame_query},$self->{_has_frame_hit}) =
376 @{$$alg_lookup{$1.$2}{has_frame}};
377 ($self->{_has_strand_query},$self->{_has_strand_hit}) =
378 @{$$alg_lookup{$1.$2}{has_strand}};
379 last;
381 do { # unrecognized
382 $self->warn("Unrecognized algorithm '$alg'; defaults may not work");
383 ($self->{_mapping_query},$self->{_mapping_hit}) = (1,1);
384 ($self->{_def_context_query},$self->{_def_context_hit}) =
385 ('all','all');
386 ($self->{_has_frame_query},$self->{_has_frame_hit}) =
387 (0,0);
388 ($self->{_has_strand_query},$self->{_has_strand_hit}) =
389 (0,0);
390 return 0;
391 last;
394 return 1;
397 sub _mapping_coeff {
398 my $obj = shift;
399 my $type = shift;
400 my %type_i = ( 'query' => 0, 'hit' => 1 );
401 unless ( ref($obj) && $obj->can('algorithm') ) {
402 $obj->warn("Object type unrecognized");
403 return undef;
405 $type ||= 'query';
406 unless ( grep(/^$type$/, qw( query hit subject ) ) ) {
407 $obj->warn("Sequence type unrecognized");
408 return undef;
410 $type = 'hit' if $type eq 'subject';
411 my $alg = $obj->algorithm;
413 # pretreat (i.e., kludge it)
414 $alg =~ /^RPS/ and ($alg) = ($alg =~ /\(([^)]+)\)/);
416 for ($alg) {
417 /MEGABLAST/i && do {
418 return 1;
420 /(.?)BLAST(.?)/i && do {
421 return $$alg_lookup{$1.$2}{'mapping'}[$type_i{$type}];
423 /(.?)FAST(.?)/ && do {
424 return $$alg_lookup{$1.$2}{'mapping'}[$type_i{$type}];
426 do { # unrecognized
427 last;
430 return;
433 # a graphical depiction of a set of intervals
434 sub _ints_as_text {
435 my $ints = shift;
436 my @ints = @$ints;
437 my %pos;
438 for (@ints) {
439 $pos{$$_[0]}++;
440 $pos{$$_[1]}++;
443 my @pos = sort {$a<=>$b} keys %pos;
444 @pos = map {sprintf("%03d",$_)} @pos;
445 #header
446 my $max=0;
447 $max = (length > $max) ? length : $max for (@pos);
448 for my $j (0..$max-1) {
449 my $i = $max-1-$j;
450 my @line = map { substr($_, $j, 1) || '0' } @pos;
451 print join('', @line), "\n";
453 print '-' x @pos, "\n";
454 undef %pos;
455 @pos{map {sprintf("%d",$_)} @pos} = (0..@pos);
456 foreach (@ints) {
457 print ' ' x $pos{$$_[0]}, '[', ' ' x ($pos{$$_[1]}-$pos{$$_[0]}-1), ']', ' ' x (@pos-$pos{$$_[1]}), "\n";
463 =head2 containing_hsps()
465 Title : containing_hsps
466 Usage : @hsps = containing_hsps($interval, @hsps_to_search)
467 Function: Return a list of hsps whose coordinates completely contain the
468 given $interval
469 Returns : Array of HSP objects
470 Args : $interval : [$int1, $int2],
471 array of HSP objects
473 =cut
474 # could be more efficient if hsps are assumed ordered...
475 sub containing_hsps {
476 my $intvl = shift;
477 my @hsps = @_;
478 my @ret;
479 my ($beg, $end) = @$intvl;
480 foreach my $hsp (@hsps) {
481 my ($start, $stop) = ($hsp->start, $hsp->end);
482 push @ret, $hsp if ( $start <= $beg and $end <= $stop );
484 return @ret;
489 =head2 covering_groups()
491 Title : covering_groups
492 Usage :
493 Function: divide a list of **ordered,disjoint** intervals (as from a
494 coverage map) into a set of disjoint covering groups
495 Returns : array of arrayrefs, each arrayref a covering set of
496 intervals
497 Args : array of intervals
499 =cut
501 sub covering_groups {
502 my @intervals = @_;
503 return unless @intervals;
504 my (@groups, $grp);
505 push @{$groups[0]}, shift @intervals;
506 $grp = $groups[0];
507 for (my $intvl = shift @intervals; @intervals; $intvl = shift @intervals) {
508 if ( $intvl->[0] - $grp->[-1][1] == 1 ) { # intervals are direcly adjacent
509 push @$grp, $intvl;
511 else {
512 $grp = [$intvl];
513 push @groups, $grp;
516 return @groups;
520 # need our own subsequencer for hsps.
522 package Bio::Search::HSP::HSPI;
526 use strict;
527 use warnings;
529 =head2 matches_MT
531 Title : matches_MT
532 Usage : $hsp->matches($type, $action, $start, $end)
533 Purpose : Get the total number of identical or conserved matches
534 in the query or sbjct sequence for the given HSP. Optionally can
535 report data within a defined interval along the seq.
536 Returns : scalar int
537 Args :
538 Comments : Relies on seq_str('match') to get the string of alignment symbols
539 between the query and sbjct lines which are used for determining
540 the number of identical and conservative matches.
541 Note : Modeled on Bio::Search::HSP::HSPI::matches
543 =cut
545 sub matches_MT {
546 use integer;
547 my( $self, @args ) = @_;
548 my($type, $action, $beg, $end) = $self->_rearrange( [qw(TYPE ACTION START END)], @args);
549 my @actions = qw( identities conserved searchutils );
551 # prep $type
552 $self->throw("Type not specified") if !defined $type;
553 $self->throw("Type '$type' unrecognized") unless grep(/^$type$/,qw(query hit subject));
554 $type = 'hit' if $type eq 'subject';
556 # prep $action
557 $self->throw("Action not specified") if !defined $action;
558 $self->throw("Action '$action' unrecognized") unless grep(/^$action$/, @actions);
560 my ($len_id, $len_cons);
561 my $c = Bio::Search::Tiling::MapTileUtils::_mapping_coeff($self, $type);
562 if ((defined $beg && !defined $end) || (!defined $beg && defined $end)) {
563 $self->throw("Both start and end are required");
565 elsif ( (!defined($beg) && !defined($end)) || !$self->seq_str('match') ) {
566 ## Get data for the whole alignment.
567 # the reported values x mapping
568 $self->debug("Sequence data not present in report; returning data for entire HSP") unless $self->seq_str('match');
569 ($len_id, $len_cons) = map { $c*$_ } ($self->num_identical, $self->num_conserved);
570 for ($action) {
571 $_ eq 'identities' && do {
572 return $len_id;
574 $_ eq 'conserved' && do {
575 return $len_cons;
577 $_ eq 'searchutils' && do {
578 return ($len_id, $len_cons);
580 do {
581 $self->throw("What are YOU doing here?");
585 else {
586 ## Get the substring representing the desired sub-section of aln.
587 my($start,$stop) = $self->range($type);
588 if ( $beg < $start or $stop < $end ) {
589 $self->throw("Start/stop out of range [$start, $stop]");
592 # handle gaps
593 my $match_str = $self->seq_str('match');
594 if ($self->gaps) {
595 # strip the homology string of gap positions relative
596 # to the target type
597 $match_str = $self->seq_str('match');
598 my $tgt = $self->seq_str($type);
599 my $encode = $match_str ^ $tgt;
600 my $zap = '-'^' ';
601 $encode =~ s/$zap//g;
602 $tgt =~ s/-//g;
603 $match_str = $tgt ^ $encode;
604 # match string is now the correct length for substr'ing below,
605 # given that start and end are gapless coordinates in the
606 # blast report
609 my $seq = "";
610 $seq = substr( $match_str,
611 int( ($beg-$start)/Bio::Search::Tiling::MapTileUtils::_mapping_coeff($self, $type) ),
612 int( 1+($end-$beg)/Bio::Search::Tiling::MapTileUtils::_mapping_coeff($self, $type) )
615 if(!CORE::length $seq) {
616 $self->throw("Undefined sub-sequence ($beg,$end). Valid range = $start - $stop");
619 $seq =~ s/ //g; # remove space (no info).
620 $len_cons = (CORE::length $seq)*(Bio::Search::Tiling::MapTileUtils::_mapping_coeff($self,$type));
621 $seq =~ s/\+//g; # remove '+' characters (conservative substitutions)
622 $len_id = (CORE::length $seq)*(Bio::Search::Tiling::MapTileUtils::_mapping_coeff($self,$type));
623 for ($action) {
624 $_ eq 'identities' && do {
625 return $len_id;
627 $_ eq 'conserved' && do {
628 return $len_cons;
630 $_ eq 'searchutils' && do {
631 return ($len_id, $len_cons);
633 do {
634 $self->throw("What are YOU doing here?");
642 package Bio::LocatableSeq;
645 use strict;
646 use warnings;
648 # mixin the Bio::FeatureHolderI implementation of
649 # Bio::Seq -- for get_tiled_aln
651 =head2 get_SeqFeatures
653 Title : get_SeqFeatures
654 Usage :
655 Function: Get the feature objects held by this feature holder.
657 Features which are not top-level are subfeatures of one or
658 more of the returned feature objects, which means that you
659 must traverse the subfeature arrays of each top-level
660 feature object in order to traverse all features associated
661 with this sequence.
663 Top-level features can be obtained by tag, specified in
664 the argument.
666 Use get_all_SeqFeatures() if you want the feature tree
667 flattened into one single array.
669 Example :
670 Returns : an array of Bio::SeqFeatureI implementing objects
671 Args : [optional] scalar string (feature tag)
674 =cut
676 sub get_SeqFeatures{
677 my $self = shift;
678 my $tag = shift;
680 if( !defined $self->{'_as_feat'} ) {
681 $self->{'_as_feat'} = [];
683 if ($tag) {
684 return map { $_->primary_tag eq $tag ? $_ : () } @{$self->{'_as_feat'}};
686 else {
687 return @{$self->{'_as_feat'}};
691 =head2 feature_count
693 Title : feature_count
694 Usage : $seq->feature_count()
695 Function: Return the number of SeqFeatures attached to a sequence
696 Returns : integer representing the number of SeqFeatures
697 Args : None
700 =cut
702 sub feature_count {
703 my ($self) = @_;
704 if (defined($self->{'_as_feat'})) {
705 return ($#{$self->{'_as_feat'}} + 1);
706 } else {
707 return 0;
711 =head2 add_SeqFeature
713 Title : add_SeqFeature
714 Usage : $seq->add_SeqFeature($feat);
715 $seq->add_SeqFeature(@feat);
716 Function: Adds the given feature object (or each of an array of feature
717 objects to the feature array of this
718 sequence. The object passed is required to implement the
719 Bio::SeqFeatureI interface.
720 Returns : 1 on success
721 Args : A Bio::SeqFeatureI implementing object, or an array of such objects.
724 =cut
726 sub add_SeqFeature {
727 my ($self,@feat) = @_;
728 $self->{'_as_feat'} = [] unless $self->{'_as_feat'};
729 foreach my $feat ( @feat ) {
730 if( !$feat->isa("Bio::SeqFeatureI") ) {
731 $self->throw("$feat is not a SeqFeatureI and that's what we expect...");
733 $feat->attach_seq($self);
734 push(@{$self->{'_as_feat'}},$feat);
736 return 1;
739 =head2 remove_SeqFeatures
741 Title : remove_SeqFeatures
742 Usage : $seq->remove_SeqFeatures();
743 Function: Flushes all attached SeqFeatureI objects.
745 To remove individual feature objects, delete those from the returned
746 array and re-add the rest.
747 Example :
748 Returns : The array of Bio::SeqFeatureI objects removed from this seq.
749 Args : None
752 =cut
754 sub remove_SeqFeatures {
755 my $self = shift;
757 return () unless $self->{'_as_feat'};
758 my @feats = @{$self->{'_as_feat'}};
759 $self->{'_as_feat'} = [];
760 return @feats;