Merge pull request #42 from solgenomics/topic/duplicate_image_warning
[cxgn-corelibs.git] / t / CXGN / Cluster / cluster.t
blob2056902b655e8dddcc1f21ceb8755f49c5c7dde8
1 #!/usr/bin/perl
2 use strict;
3 use warnings;
5 use FindBin;
6 use File::Temp qw/tempfile/;
7 use Data::Dumper;
9 use Test::More;
11 use Bio::Index::Fasta;
13 use CXGN::Tools::List qw/str_in/;
14 use IPC::Cmd qw/can_run/;
16 BEGIN {
17 use_ok( 'CXGN::Cluster' )
18 or plan skip_all => 'could not include the module being tested';
20 unless( can_run('phrap.longreads') ){
21 plan skip_all => 'phrap.longreads not available';
25 plan tests => 19;
27 #now start testing with them
28 my $set = CXGN::Cluster::ClusterSet->new;
30 my @seqnames = sort qw/monkey bonobo homo orangutan chimpanzee gorilla marmoset lemur/;
32 $set->add_match(@seqnames[0,1]);
33 is_deeply( cs_contents($set),
34 [[ @seqnames[0,1] ]],
35 'one cluster with 2 members',
39 $set->add_match(@seqnames[0,2]);
40 is_deeply( cs_contents($set),
41 [[ @seqnames[0,1,2] ]],
42 'then three members',
45 $set->add_match(@seqnames[3,3]);
46 is_deeply( cs_contents($set),
47 [[ @seqnames[0,1,2] ],
48 [ $seqnames[3] ],
50 'self-match adds an unrelated cluster',
53 $set->add_match(@seqnames[0,0]);
54 is_deeply( cs_contents($set),
55 [[ @seqnames[0,1,2] ],
56 [ $seqnames[3] ],
58 'self-match of already present sequence does not change anything',
61 $set->add_match(@seqnames[1,1]);
62 is_deeply( cs_contents($set),
63 [[ @seqnames[0,1,2] ],
64 [ $seqnames[3] ],
66 'self-match of already present sequence does not change anything',
70 $set->add_match(@seqnames[3,7]);
71 $set->add_match(@seqnames[3,6]);
72 $set->add_match(@seqnames[3,5]);
73 is_deeply( cs_contents($set),
75 [ @seqnames[3,5,6,7] ],
76 [ @seqnames[0,1,2] ],
78 'and add more to the second cluster',
83 #find our file of test sequences
84 my $test_seqs_filename = File::Spec->catfile($FindBin::Bin,'data','test.seq');
85 die "can't open $test_seqs_filename" unless -r $test_seqs_filename;
87 #make a Bio::Index of them so we can retrieve individual ones
88 my $test_seqs_index = Bio::Index::Fasta->new( -filename => do { my (undef,$tf) = tempfile(UNLINK => 1); $tf },
89 -write_flag => 1
91 $test_seqs_index->make_index($test_seqs_filename);
93 #make a new set
94 $set = CXGN::Cluster::ClusterSet->new;
96 #now test the cluster calculation using a few things we know should cluster in the test set
97 my @known_clusters =
99 [sort qw/C04HBa0114G11.1 C04HBa0050I18.1 C04HBa0036C23.1 C04HBa0008H22.1/],
100 [sort qw/C04HBa0024G05.1 C04HBa0020F17.1 /],
101 [sort qw/C10HBa0020A12.1 C10HBa0248A13.3 /],
104 #make a cluster with just one member
105 $set->add_match($known_clusters[0][0],$known_clusters[0][0]);
106 my @c = $set->get_clusters;
107 is_deeply(cs_contents($set),
108 [[$known_clusters[0][0]]],
109 'single-member cluster loaded OK',
112 is_deeply( [ $c[0]->get_contig_coords($test_seqs_index) ],
113 [[['C04HBa0008H22.1',1,81572,1]]],
114 'singleton contig coordinates appear correctly',
117 #put in the first cluster
118 $set->add_match($known_clusters[0][0],$_) foreach @{$known_clusters[0]};;
119 is_deeply(cs_contents($set),
120 [$known_clusters[0]],
121 'first known cluster members loaded ok'
124 #now get its coordinates, it should come out as just one contig
125 @c = $set->get_clusters;
126 #print Dumper $c[0]->get_contig_coords($test_seqs_index);
127 is_deeply( [ $c[0]->get_contig_coords($test_seqs_index) ],
130 'C04HBa0114G11.1',
132 '28696',
136 'C04HBa0050I18.1',
137 26697,
138 '140337',
142 'C04HBa0036C23.1',
143 138338,
144 '252489',
148 'C04HBa0008H22.1',
149 250490,
150 '332061',
154 'first test contig assembled OK'
158 my $bs = [ $c[0]->get_consensus_base_segments($test_seqs_index, min_segment_size => 100 ) ];
159 is_deeply( $bs,
163 '1',
164 '26696',
165 'C04HBa0114G11.1',
167 26696,
171 '26697',
172 '138337',
173 'C04HBa0050I18.1',
175 111641,
179 '138338',
180 '250489',
181 'C04HBa0036C23.1',
183 112152,
187 '250490',
188 '332061',
189 'C04HBa0008H22.1',
191 81572,
196 'get_consensus_base_segments',
198 or diag Dumper $bs;
200 #put in the second cluster
201 $set->add_match($known_clusters[1][0],$_) foreach @{$known_clusters[1]};
202 is_deeply(cs_contents($set),
203 [@known_clusters[0,1]],
204 'second known cluster members loaded ok'
207 #assemble the second cluster
208 @c = $set->get_clusters;
209 is_deeply( [ $c[0]->get_contig_coords($test_seqs_index) ],
212 'C04HBa0024G05.1',
214 '85242',
218 'C04HBa0020F17.1',
219 83243,
220 '211703',
224 'second test contig assembled OK'
227 $bs = [ $c[0]->get_consensus_base_segments($test_seqs_index) ];
228 is_deeply( $bs,
232 '1',
233 '83242',
234 'C04HBa0024G05.1',
236 83242,
240 '83243',
241 '211703',
242 'C04HBa0020F17.1',
244 128461,
250 or diag Dumper $bs;
253 #make an artificial linkage between the first and second cluster, this
254 #should assemble into two real contigs
255 $set->add_match($known_clusters[0][0],$known_clusters[1][1]);
256 is_deeply(cs_contents($set),
257 [[sort @{$known_clusters[0]},@{$known_clusters[1]}]],
258 'artificial cluster linkage results in joining of clusters'
261 #now check that they assemble into two actual contigs
262 @c = $set->get_clusters;
263 #bprint Dumper $c[0]->get_contig_coords($test_seqs_index);
264 is_deeply( [ $c[0]->get_contig_coords($test_seqs_index) ],
268 'C04HBa0024G05.1',
270 '85242',
274 'C04HBa0020F17.1',
275 83243,
276 '211703',
282 'C04HBa0114G11.1',
284 '28696',
288 'C04HBa0050I18.1',
289 26697,
290 '140337',
294 'C04HBa0036C23.1',
295 138338,
296 '252489',
300 'C04HBa0008H22.1',
301 250490,
302 '332061',
307 'erroneous precluster assembles into two actual contigs'
311 #make a new set
312 $set = CXGN::Cluster::ClusterSet->new;
314 # now add fake matches between and among the known clusters to put
315 # them all in one precluster
316 $set->add_match( $known_clusters[0][0], $known_clusters[2][0] );
317 $set->add_match( $known_clusters[2][0], $_ ) for @{$known_clusters[2]};
319 @c = $set->get_clusters;
320 is( scalar(@c), 1, 'got one cluster' );
321 # and check the base segments that are made
322 $bs = [ $c[0]->get_consensus_base_segments($test_seqs_index, min_segment_size => 100 ) ];
323 is_deeply( $bs,
328 134537,
329 'C10HBa0020A12.1',
331 134537,
335 134538,
336 134541,
337 'C10HBa0248A13.3',
338 134538,
339 134541,
343 134542,
344 135038,
345 'C10HBa0020A12.1',
346 134542,
347 135038,
354 81572,
355 'C04HBa0008H22.1',
357 81572,
362 'segment merging worked'
364 or diag Dumper $bs;
366 #make a sorted list of sorted arrayrefs representing the clusters in the set
367 #clusters sorted in descending size order,
368 #members sorted in alphabetical order
369 sub cs_contents {
370 my $set = shift;
371 my @c = $set->get_clusters;
372 return [ sort {@$b <=> @$a} map [sort $_->get_members], @c ];