Bio::DB::TFBS namespace has been moved to its own distribution named after itself
[bioperl-live.git] / t / SearchIO / Tiling.t
blobc13572d5a2c956c98567f3821ce69acad8c50f7b
1 #-*-perl-*-
2 #$Id$
3 use strict;
4 use warnings;
5     use vars qw($EXHAUSTIVE $VERBOSE);
6 BEGIN {
7     use lib '.';
8     use lib '../..';
10     use Bio::Root::Test;
11     $EXHAUSTIVE = $ENV{BIOPERL_TILING_EXHAUSTIVE_TESTS};
12     $VERBOSE    = $ENV{BIOPERL_TILING_VERBOSE_TESTS};
13     test_begin(-tests => ($EXHAUSTIVE ? 6519 : 1141) );
16 use_ok('Bio::Search::Tiling::MapTiling');
17 use_ok('Bio::Search::Tiling::MapTileUtils');
18 use_ok('Bio::SearchIO');
19 use_ok('Bio::Search::Hit::BlastHit');
20 use_ok('File::Spec');
22 my ($blio, $result, $hit, $tiling, $hsp);
24 my @normal_formats = qw( blast  wublast
25                          blastn wublastn
26                          blastp wublastp
27                          multiblast 
28                          megablast
29                          rpsblast
30                          psiblast );
31 my @xltd_formats  = qw(  blastx wublastx
32                          tblastn wutblastn
33                          tblastx wutblastx );
34                          
35                  
36 # an exhaustive listing of search reports in 
37 # t/data 
38         
39 my %test_files = (
40     'blast' => [qw(
41                ecolitst.bls
42                frac_problems.blast
43                frac_problems2.blast
44                frac_problems3.blast
45                bl2seq.out
46                )],
47     'multiblast' => [qw(
48                multi_blast.bls
49                )],
50     'blastn' => [qw(
51                a_thaliana.blastn
52                bl2seq.blastn
53                new_blastn.txt
54                hsinsulin.blastcl3.blastn
55                )],
56     'wublastn' =>[qw(
57                brassica_ATH.WUBLASTN
58                echofilter.wublastn
59                )],
60     'blastp' => [qw(
61                blastp2215.blast
62                no_hsps.blastp
63                catalase-webblast.BLASTP
64                )],
65     'wublastp' => [qw(
66                dcr1_sp.WUBLASTP
67                ecolitst.wublastp
68                contig-by-hand.wublastp
69                ecolitst.noseqs.wublastp
70                )],
71     'blastx' => [qw(
72                bl2seq.blastx.out
73                )],
74     'wublastx' => [qw(
75                dnaEbsub_ecoli.wublastx
76                )],
77     'wublast' => [qw(
78                tricky.wublast
79                )],
80     'tblastn' => [qw(
81                tblastn.out
82                1ZZ19XR301R-Alignment.tblastn
83                )],
84     'wutblastn' => [qw(
85                dnaEbsub_ecoli.wutblastn
86                )],
87     'tblastx' => [qw(
88                bl2seq.tblastx.out
89                HUMBETGLOA.tblastx
90                )],
91     'wutblastx' => [qw(
92                dnaEbsub_ecoli.wutblastx
93                )],
94     'megablast' => [qw(
95                503384.MEGABLAST.2
96                )],
97     'rpsblast' => [qw(
98                ecoli_domains.rpsblast
99                )],
100     'psiblast' => [qw(
101                psiblastreport.out
102                )],
103     'bug2942'  => [qw(
104                bug2942.blastx
105                )]
106     );
108 # a subset of search reports for 
109 # run-o-the-mill regression tests
111 my %example_files = (
112     'blast' => [qw(
113                ecolitst.bls
114                )],
115     'blastn' => [qw(
116                a_thaliana.blastn
117                )],
118     'wublastn' =>[qw(
119                brassica_ATH.WUBLASTN
120                )],
121     'blastp' => [qw(
122                no_hsps.blastp
123                catalase-webblast.BLASTP
124                )],
125     'wublastp' => [qw(
126                dcr1_sp.WUBLASTP
127                )],
128     'blastx' => [qw(
129                bl2seq.blastx.out
130                )],
131     'wublastx' => [qw(
132                dnaEbsub_ecoli.wublastx
133                )],
134     'wublast' => [qw(
135                tricky.wublast
136                )],
137     'tblastn' => [qw(
138                tblastn.out
139                )],
140     'wutblastn' => [qw(
141                dnaEbsub_ecoli.wutblastn
142                )],
143     'tblastx' => [qw(
144                HUMBETGLOA.tblastx
145                )],
146     'wutblastx' => [qw(
147                dnaEbsub_ecoli.wutblastx
148                )],
149     'megablast' => [qw(
150                503384.MEGABLAST.2
151                )]
152     );
154 ok( $blio = new Bio::SearchIO( 
155         -file=>test_input_file('dcr1_sp.WUBLASTP'),
156         -format=>'blast'), 'parse data file');
158 $result = $blio->next_result;
159 while ( $_ = $result->next_hit ) {
160     last if $_->name =~ /ASPTN/;
162 ok($hit = $_, 'got test hit');
163 ok($tiling = Bio::Search::Tiling::MapTiling->new($hit), 'create tiling');
166 # TilingI compliance
168 isa_ok($tiling, 'Bio::Search::Tiling::TilingI');
169 foreach ( qw( next_tiling rewind_tilings identities conserved length ) ) {
170     ok( $tiling->$_, "implements '$_'" );
173 # regression test on original calculations
175 my @orig_id_results = ( 387,388,388,381,382,389 );
176 my @orig_cn_results = ( 622,619,628,608,611,613 );
177 my @id_results = (
178     $tiling->identities('query', 'exact'),
179     $tiling->identities('query', 'est'),
180     $tiling->identities('query', 'max'),
181     $tiling->identities('subject', 'exact'),
182     $tiling->identities('subject', 'est'),
183     $tiling->identities('subject', 'max')
184     );
185 my @cn_results = (
186     $tiling->conserved('query', 'exact'),
187     $tiling->conserved('query', 'est'),
188     $tiling->conserved('query', 'max'),
189     $tiling->conserved('subject', 'exact'),
190     $tiling->conserved('subject', 'est'),
191     $tiling->conserved('subject', 'max')
192     );
193 map { $_ = int($_) } @id_results, @cn_results;
195 is_deeply(\@id_results, \@orig_id_results, 'identities regression test');
196 is_deeply(\@cn_results, \@orig_cn_results, 'conserved regression test');
198 # tiling iterator regression tests
200 my ($qn, $sn)=(0,0);
201 while ($tiling->next_tiling('query')) {$qn++};
202 while ($tiling->next_tiling('subject')) {$sn++};
203 is ($qn, 8, 'tiling iterator regression test(1)');
204 is ($sn, 128, 'tiling iterator regression test(2)');
205 $tiling->rewind('subject');
206 while ($tiling->next_tiling('subject')) {$sn++};
207 is ($sn, 256, 'tiling iterator regression test(3, rewind)');
209 diag("Old blast.t tiling tests") if $VERBOSE;
211 ok($blio = Bio::SearchIO->new(
212     '-format' => 'blast',
213     '-file'   => test_input_file('ecolitst.wublastp')
214    ), "ecolitst.wublastp");
215 $result = $blio->next_result;
216 $result->next_hit;
217 $hit = $result->next_hit;
218 $tiling = Bio::Search::Tiling::MapTiling->new($hit);
219 # Test HSP contig data returned by SearchUtils::tile_hsps()
220 # Second hit has two hsps that overlap.
222 # compare with the contig made by hand for these two contigs
223 # in t/data/contig-by-hand.wublastp
224 # (in this made-up file, the hsps from ecolitst.wublastp
225 #  were aligned and contiged, and Length, Identities, Positives 
226 #  were counted, by a human (maj) )
227         
228 my $hand_hit = Bio::SearchIO->new(
229     -format=>'blast', 
230     -file=>test_input_file('contig-by-hand.wublastp')
231     )->next_result->next_hit;
232 my $hand_hsp = $hand_hit->next_hsp;
233 my @hand_qrng = $hand_hsp->range('query');
234 my @hand_srng = $hand_hsp->range('hit');
235 my @hand_matches = $hand_hit->matches;
237 is(($tiling->range('query'))[0], $hand_qrng[0]);
238 is(($tiling->range('query'))[1], $hand_qrng[1]);
239 is(sprintf("%d",$tiling->identities('query')), $hand_matches[0]);
240 is(sprintf("%d",$tiling->conserved('query')), $hand_matches[1]);
241 is(($tiling->range('hit'))[0], $hand_srng[0]);
242 is(($tiling->range('hit'))[1], $hand_srng[1]);
243 is(sprintf("%d",$tiling->identities('hit')), $hand_matches[0]);
244 is(sprintf("%d",$tiling->conserved('hit')), $hand_matches[1]);
246 ok( $blio = Bio::SearchIO->new(
247         '-format' => 'blast',
248         '-file'   => test_input_file('dnaEbsub_ecoli.wublastx')
249     ), "dnaEbsub_ecoli.wublastx");
251 $hit = $blio->next_result->next_hit;
252 $tiling = Bio::Search::Tiling::MapTiling->new($hit);
253 is(sprintf("%.3f",$tiling->frac_identical(-type=>'query',-denom=>'aligned',-context=>'p2')), '0.364');
254 is(sprintf("%.3f",$tiling->frac_identical(-type=>'hit',-denom=>'aligned',-context=>'all')), '0.366');
255 is(sprintf("%.3f",$tiling->frac_conserved(-type=>'query',-denom=>'aligned',-context=>'p2')), '0.537');
256 is(sprintf("%.3f",$tiling->frac_conserved(-type=>'hit',-denom=>'aligned',-context=>'all')), '0.540');
257 is(sprintf("%.2f",$tiling->frac_aligned_query(-context=>'p2')), '0.62');
258 is(sprintf("%.2f",$tiling->frac_aligned_hit(-context=>'all')), '0.71');
260 ok( $blio = Bio::SearchIO->new(
261         '-format' => 'blast',
262         '-file'   => test_input_file('tricky.wublast')
263     ), "tricky.wublast");
265 $hit = $blio->next_result->next_hit;
266 $tiling = Bio::Search::Tiling::MapTiling->new($hit);
267 cmp_ok sprintf("%.3f",$tiling->frac_identical(-denom => 'aligned')), '>', 0.2, 'tricky.wublast(1)';
268 cmp_ok sprintf("%.3f",$tiling->frac_conserved(-denom => 'aligned')), '<=', 1, 'tricky.wublast(2)';
269 is(sprintf("%.2f",$tiling->frac_aligned_query), '0.92', 'tricky.wublast(3)');
270 is(sprintf("%.2f",$tiling->frac_aligned_hit), '0.91','tricky.wublast(4)');
272 diag("New tiling tests") if $VERBOSE;
274 # select test file set based on the environment variable
275 # BIOPERL_TILING_EXHAUSTIVE_TESTS
277 my $files = ($EXHAUSTIVE ? \%test_files : \%example_files);
279 foreach my $alg (@normal_formats, @xltd_formats) {
280     diag("*******$alg files*******") if ($files->{$alg} && $VERBOSE);
281     foreach my $tf (@{$files->{$alg}}) {
282         ok( $blio = Bio::SearchIO->new( -format=>'blast', 
283                                         -file=>test_input_file($tf)
284             ), "$tf" );
285         $result = $blio->next_result;
286         my $hit_count = 0;
287         # compare the per-aligned-base identity avg over hsps
288         # with frac_identical (bzw, conserved)
289         
290       HIT:
291         while ( $hit = $result->next_hit ) {
292             ++$hit_count;
293             # quiet the "No HSPs" warning with -verbose => -1
294             ok( $tiling = Bio::Search::Tiling::MapTiling->new(-hit=>$hit,-verbose=>-1), "tile $tf hit $hit_count #hsps ".scalar $tiling->hsps );
295             my @hsps = $tiling->hsps;
296             
297             unless (@hsps) {
298                 diag( "--no hsps for $tf hit $hit_count") if $VERBOSE;
299                 next HIT;
300             }
301             my ($dpct, $est, $fast,$exact, $max);
302             my $tol = 0.10; # % difference accepted as approx. equal
303             
304             ## loop through contexts:
305             for my $type (qw( query hit )) {
306                 for my $context ($tiling->contexts($type)) {
307                     diag(" --- $type $context ---") if $VERBOSE;
308                     if (scalar($tiling->contexts($type, $context)) == 1) {
309                         # equality
310                         ($dpct, $est, $fast) = $tiling->cmp_frac($type,'identical','aligned', 'est', 'fast', $context);
311                         is( $est,$fast, substr($type,0,1)." id: est ($est) = fast ($fast)");
312                         ($dpct, $est, $fast) = $tiling->cmp_frac($type,'conserved','aligned', 'est', 'fast', $context);
313                         is( $est,$fast, substr($type,0,1)." cn: est ($est) = fast ($fast)");
314                     }
315                     else {
316                         # comparisons
317                         ($dpct, $est, $fast) = $tiling->cmp_frac($type,'identical','aligned', 'est', 'fast', $context);
318 #                       cmp_ok( $dpct, "<", $tol, 
319 #                               substr($type,0,1)." id: est ($est) ~ fast ($fast)");
320                         ($dpct, $exact, $max) = $tiling->cmp_frac($type,'identical','aligned', 'exact', 'max', $context);
321                         cmp_ok( abs($exact-$est)/$exact, "<" , $tol, 
322                                 substr($type,0,1)." id: exact ($exact) ~ est ($est)");
323                         cmp_ok( $exact, "<=" , $max, 
324                                 substr($type,0,1)." id: exact ($exact) <= max ($max)");
325                         
326                         ($dpct, $est, $fast) = $tiling->cmp_frac($type,'conserved','aligned', 'est', 'fast', $context);
327 #                       cmp_ok( $dpct, "<", $tol, 
328 #                               substr($type,0,1)." cn: est ($est) ~ fast ($fast)");
329                         ($dpct, $exact, $max) = $tiling->cmp_frac($type,'conserved','aligned', 'exact', 'max', $context);
330                         cmp_ok(  abs($exact-$est)/$exact, "<" , $tol, 
331                                  substr($type,0,1)." cn: exact ($exact) ~ est ($est)");
332                         cmp_ok( $exact, "<=" , $max, 
333                                 substr($type,0,1)." cn: exact ($exact) <= max ($max)");
334                     }
335                 }
336             }
337         }
338     }
341 # bug 2942
343 my %expected_ranges = ( 'm0' => [7, 11037], #query
344                         'm1' => [1770, 10865], #query 
345                         'm2' => [2462, 14599], #query
346                         'all' => [231, 3563] #subject
347     );
348 $blio = Bio::SearchIO->new( -file=>test_input_file( $test_files{'bug2942'}->[0] ),
349                             -format => 'blast' );
350 $hit = $blio->next_result->next_hit;
351 $tiling = Bio::Search::Tiling::MapTiling->new($hit);
352 for ( 'm0', 'm1', 'm2' ) {
353     is_deeply( [$tiling->range('query',$_)], $expected_ranges{$_}, "bug2942: query $_: range correct");
355 is_deeply( [$tiling->range('subject', 'all')], $expected_ranges{'all'}, "bug2942: subject all : range correct" );
357 # test get_tiled_alns
359 $blio = Bio::SearchIO->new( -file=>test_input_file( 'dcr1_sp.WUBLASTP' ) );
360 $result = $blio->next_result;
361 while ($hit = $result->next_hit) {
362     last if $hit->name =~ /ASPTN/;
365 $tiling = Bio::Search::Tiling::MapTiling->new($hit);
367 ok my @alns = $tiling->get_tiled_alns, "get_tiled_alns";
368 is scalar @alns, 6, "got all alns";
370 for my $aln ( @alns ) {
371     my (@aint, @qint, @sint);
372     my $qs = $aln->get_seq_by_id('query');
373     my $ss = $aln->get_seq_by_id('subject');
374     ok my @qfeats = $qs->get_SeqFeatures;
375     foreach (@qfeats) {
376         push @aint, [$_->start, $_->end];
377         push @qint, [($_->get_tag_values('query_start'))[0],
378                      ($_->get_tag_values('query_end'))[0] ];
379     }
380     is( eval(join('+', map {$$_[1]-$$_[0]+1} @aint)),
381         eval(join('+', map {$$_[1]-$$_[0]+1} @qint)), 
382         "aln and qfeat lengths correspond" );
383     is( $qs->length - $qs->num_gaps('-'), eval(join('+', map {$$_[1]-$$_[0]+1} @qint)), "q length correct");
384     ok my @hfeats = $ss->get_SeqFeatures;
385     @aint = ();
386     ok ( @qfeats == @hfeats, "features on q and s correspond");
387     foreach (@hfeats) {
388         push @aint, [$_->start, $_->end];
389         push @sint, [($_->get_tag_values('subject_start'))[0],
390                      ($_->get_tag_values('subject_end'))[0] ];
391     }
392     is( eval(join('+', map {$$_[1]-$$_[0]+1} @aint)),
393         eval(join('+', map {$$_[1]-$$_[0]+1} @sint)), 
394         "aln and hfeat lengths correspond" );
395     is( $ss->length - $ss->num_gaps('-'), eval(join('+', map {$$_[1]-$$_[0]+1} @sint)), "s length correct");
402 package Bio::Search::Tiling::MapTiling;
404 sub cmp_frac {
405     my ($tiling, $type, $method, $denom, @actions) = @_;
406     my ($a, $b);
407     my $context = ($actions[2] ? $actions[2] : 'all');
408     $a = $tiling->frac(-type=>$type, 
409                        -method=>$method, 
410                        -denom=>$denom,
411                        -action=>$actions[0],
412                        -context=>$context);
413     $b = $tiling->frac(-type=>$type, 
414                        -method=>$method, 
415                        -denom=>$denom,
416                        -action=>$actions[1],
417                        -context=>$context);
418     return ( abs($a-$b)/$a, f(5,$a), f(5,$b) );
421 sub f { my ($d,$val) = @_; sprintf("%.${d}f",$val) }       
423     
425