1 # -*-Perl-*- Test Harness script for Bioperl
10 -requires_modules => [qw(DB_File
14 use_ok('Bio::DB::Taxonomy');
15 use_ok('Bio::Tree::Tree');
18 my $temp_dir = test_output_dir();
20 # we're actually testing Bio::Taxon and Bio::DB::Taxonomy::* here, not
23 ok my $db_flatfile = Bio::DB::Taxonomy->new(
24 -source => 'flatfile',
25 -nodesfile => test_input_file('taxdump', 'nodes.dmp'),
26 -namesfile => test_input_file('taxdump', 'names.dmp'),
28 isa_ok $db_flatfile, 'Bio::DB::Taxonomy::flatfile';
29 isa_ok $db_flatfile, 'Bio::DB::Taxonomy';
31 # By not specifying a '-directory' argument, index files go to a temporary
32 # folder ($Bio::Root::IO::TEMPDIR, such as 'C:\Users\USER\AppData\Local\Temp'),
33 # and are implied to be temporary. So test the ability of flatfile->DESTROY to
34 # remove the temporary index files at object destruction (this also affects files
35 # in "test_output_dir()", since the folder is created inside the temporary folder)
36 no warnings qw(once); # silence 'Name "$Bio::Root::IO::TEMPDIR" used only once'
37 is $db_flatfile->{index_directory}, $Bio::Root::IO::TEMPDIR, 'removal of temporary index files: no -directory';
38 $db_flatfile->DESTROY;
39 ok not -e ($db_flatfile->{index_directory} . '/id2names');
40 ok not -e ($db_flatfile->{index_directory} . '/names2id');
41 ok not -e ($db_flatfile->{index_directory} . '/nodes');
42 ok not -e ($db_flatfile->{index_directory} . '/parents');
44 # Test removal of temporary index files from test_output_dir folder
45 # (since test_output_dir() =~ m/^$Bio::Root::IO::TEMPDIR/)
46 ok $db_flatfile = Bio::DB::Taxonomy->new(
47 -source => 'flatfile',
48 -directory => $temp_dir,
49 -nodesfile => test_input_file('taxdump', 'nodes.dmp'),
50 -namesfile => test_input_file('taxdump', 'names.dmp'),
53 is $db_flatfile->{index_directory}, $temp_dir, 'removal of temporary index files: test_output_dir()';
54 $db_flatfile->DESTROY;
55 ok not -e ($db_flatfile->{index_directory} . '/id2names');
56 ok not -e ($db_flatfile->{index_directory} . '/names2id');
57 ok not -e ($db_flatfile->{index_directory} . '/nodes');
58 ok not -e ($db_flatfile->{index_directory} . '/parents');
60 # Generate the object (and the files) again for the remaining tests
61 ok $db_flatfile = Bio::DB::Taxonomy->new(
62 -source => 'flatfile',
63 -directory => $temp_dir,
64 -nodesfile => test_input_file('taxdump', 'nodes.dmp'),
65 -namesfile => test_input_file('taxdump', 'names.dmp'),
73 is $db_flatfile->get_num_taxa, 189;
75 $id = $db_flatfile->get_taxonid('Homo sapiens');
79 # easy test on human, try out the main Taxon methods
80 ok $n = $db_flatfile->get_taxon(9606);
82 is $n->object_id, $n->id;
83 is $n->ncbi_taxid, $n->id;
84 is $n->parent_id, 9605;
85 is $n->rank, 'species';
87 is $n->node_name, 'Homo sapiens';
88 is $n->scientific_name, $n->node_name;
89 is ${$n->name('scientific')}[0], $n->node_name;
91 my %common_names = map { $_ => 1 } $n->common_names;
92 cmp_ok keys %common_names, '>=', 3, ref($db_flatfile).": common names";
93 ok exists $common_names{human};
94 ok exists $common_names{man};
96 is $n->division, 'Primates';
97 is $n->genetic_code, 1;
98 is $n->mitochondrial_genetic_code, 2;
100 # briefly test some Bio::Tree::NodeI methods
101 ok my $ancestor = $n->ancestor;
102 is $ancestor->scientific_name, 'Homo';
103 # unless set explicitly, Bio::Taxon doesn't return anything for
104 # each_Descendent; must ask the database directly
105 ok my @children = $ancestor->db_handle->each_Descendent($ancestor);
106 cmp_ok @children, '>', 0;
108 # do some trickier things...
109 ok my $n2 = $db_flatfile->get_Taxonomy_Node('89593');
110 is $n2->scientific_name, 'Craniata';
112 # briefly check we can use some Tree methods
113 my $tree = Bio::Tree::Tree->new();
114 is $tree->get_lca($n, $n2)->scientific_name, 'Craniata';
117 my @nodes = $tree->get_nodes;
118 is scalar(@nodes), 0;
120 @lineage_nodes = $tree->get_lineage_nodes($n->id); # read ID, only works if nodes have been added to tree
121 is scalar @lineage_nodes, 0;
122 @lineage_nodes = $tree->get_lineage_nodes($n); # node object always works
123 cmp_ok(scalar @lineage_nodes, '>', 20);
126 like($tree->get_lineage_string($n), qr/cellular organisms;Eukaryota/);
127 like($tree->get_lineage_string($n,'-'), qr/cellular organisms-Eukaryota/);
128 like($tree->get_lineage_string($n2), qr/cellular organisms;Eukaryota/);
130 # can we actually form a Tree and use other Tree methods?
131 ok $tree = Bio::Tree::Tree->new(-node => $n);
132 cmp_ok($tree->number_nodes, '>', 20);
133 cmp_ok(scalar($tree->get_nodes), '>', 20);
134 is $tree->find_node(-rank => 'genus')->scientific_name, 'Homo';
136 # check that getting the ancestor still works now we have explitly set the
137 # ancestor by making a Tree
138 is $n->ancestor->scientific_name, 'Homo';
140 ok $n = $db_flatfile->get_Taxonomy_Node('1760');
141 is $n->scientific_name, 'Actinobacteria';
143 my @ids = sort $db_flatfile->get_taxonids('Chloroflexi');
145 is_deeply \@ids, [200795, 32061];
147 $id = $db_flatfile->get_taxonids('Chloroflexi (class)');
150 @ids = $db_flatfile->get_taxonids('Rhodotorula');
151 cmp_ok @ids, '>=' , 1;
152 # note the locally cached flatfile is out-of-date, but technically
153 # correct for testing purposes
154 ok grep { $_ == 266791 } @ids;
155 ok grep { $_ == 5533 } @ids;
159 # Test the list database
161 ok my $db_list = Bio::DB::Taxonomy->new(-source => 'list');
162 isa_ok $db_list, 'Bio::DB::Taxonomy::list';
163 isa_ok $db_list, 'Bio::DB::Taxonomy';
165 my @ranks = qw(superkingdom class genus species);
166 my @h_lineage = ('Eukaryota', 'Mammalia', 'Homo', 'Homo sapiens');
167 ok $db_list = Bio::DB::Taxonomy->new(
169 -names => \@h_lineage,
172 is $db_list->get_num_taxa, 4;
175 ok @taxa = map {$db_list->get_taxon(-name=>$_)} @h_lineage;
176 is_deeply [map {ref($_)} @taxa], [('Bio::Taxon')x4];
177 is_deeply [map {$_->rank} @taxa], \@ranks, 'Ranks';
179 @h_lineage = ('Eukaryota', 'Mammalia', 'Homo', 'Homo erectus');
180 $db_list->add_lineage(-names => \@h_lineage, -ranks => \@ranks);
182 ok @taxa = map {$db_list->get_taxon(-name=>$_)} @h_lineage;
183 is_deeply [map {ref($_)} @taxa], [('Bio::Taxon')x4];
184 is_deeply [map {$_->rank} @taxa], \@ranks, 'Ranks';
187 ok my $tree = $db_list->get_tree('Homo sapiens', 'Homo erectus');
188 isa_ok $tree, 'Bio::Tree::TreeI';
189 is $tree->number_nodes, 5;
190 is $tree->total_branch_length, 4;
191 ok my $node1 = $tree->find_node( -scientific_name => 'Homo sapiens' );
192 ok my $node2 = $tree->find_node( -scientific_name => 'Homo erectus' );
193 is $tree->distance($node1, $node2), 2;
195 ok my $h_list = $db_list->get_taxon(-name => 'Homo sapiens');
196 ok my $h_flat = $db_flatfile->get_taxon(-name => 'Homo sapiens');
198 is $h_list->ancestor->scientific_name, 'Homo';
200 my @names = $h_list->common_names;
202 $h_list->common_names('woman');
203 @names = $h_list->common_names;
205 @names = $h_flat->common_names;
208 # you can switch to another database when you need more information, which also
209 # merges information in the node from the two different dbs
210 $h_list->db_handle($db_flatfile);
211 @names = $h_list->common_names;
214 # form a tree with the list lineage first, preventing a subsequent database
215 # change from giving us all those extra ranks
216 $h_list->db_handle($db_list);
217 my $ancestors_ancestor = $h_list->ancestor->ancestor;
218 is $ancestors_ancestor->scientific_name, 'Mammalia';
220 $tree = Bio::Tree::Tree->new(-node => $h_list);
221 $h_list->db_handle($db_flatfile);
222 $ancestors_ancestor = $h_list->ancestor->ancestor;
223 is $ancestors_ancestor->scientific_name, 'Mammalia';
225 # or we can get the flatfile database's idea of the ancestors by removing
226 # ourselves from the tree
227 is $h_flat->ancestor->ancestor->scientific_name, 'Homo/Pan/Gorilla group';
228 $h_list->ancestor(undef);
229 is $h_list->ancestor->ancestor->scientific_name, 'Homo/Pan/Gorilla group';
231 # get_lca should work on nodes from different databases
233 test_skip(-tests => 9,
234 -requires_modules => ['Bio::DB::Taxonomy::entrez'],
235 -requires_networking => 1);
236 my $db_entrez = Bio::DB::Taxonomy->new(-source => 'entrez');
238 # check that the result is the same as if we are retrieving from the same DB
240 $h_flat = $db_flatfile->get_taxon(-name => 'Homo');
241 my $h_flat2 = $db_flatfile->get_taxon(-name => 'Homo sapiens');
242 ok my $tree_functions = Bio::Tree::Tree->new();
243 is $tree_functions->get_lca($h_flat, $h_flat2)->scientific_name, 'Homo', 'get_lca() within flatfile db';
247 eval { $h_entrez = $db_entrez->get_taxon(-name => 'Homo sapiens');};
248 skip "Unable to connect to entrez database; no network or server busy?", 7 if $@;
250 eval { $h_entrez2 = $db_entrez->get_taxon(-name => 'Homo');};
251 skip "Unable to connect to entrez database; no network or server busy?", 7 if $@;
252 ok $tree_functions = Bio::Tree::Tree->new();
253 is $tree_functions->get_lca($h_entrez, $h_entrez2)->scientific_name, 'Homo', 'get_lca() within entrez db';
255 ok $tree_functions = Bio::Tree::Tree->new();
256 # mixing entrez and flatfile
258 local $TODO = 'Mixing databases for get_lca() not working, see bug #3416';
259 is $tree_functions->get_lca($h_flat, $h_entrez)->scientific_name, 'Homo', 'get_lca() mixing flatfile and remote db';
261 # even though the species taxa for Homo sapiens from list and flat databases
262 # have the same internal id, get_lca won't work because they have different
263 # roots and descendents
264 $h_list = $db_list->get_taxon(-name => 'Homo sapiens');
265 is $h_list->ancestor->internal_id, $h_flat->internal_id;
266 ok ! $tree_functions->get_lca($h_flat, $h_list);
268 # but we can form a tree with the flat node then remove all the ranks we're
269 # not interested in and try again
270 $tree = Bio::Tree::Tree->new(-node => $h_flat);
271 $tree->splice(-keep_rank => \@ranks);
272 is $tree->get_lca($h_flat, $h_list)->scientific_name, 'Homo';
275 # ideas from taxonomy2tree.PLS that let us make nice tree, using
276 # Bio::Tree::TreeFunctionsI methods; this is a weird and trivial example just
277 # because our test flatfile database only has the full lineage of one species
279 for my $name ('Human', 'Hominidae') {
280 my $ncbi_id = $db_flatfile->get_taxonid($name);
282 my $node = $db_flatfile->get_taxon(-taxonid => $ncbi_id);
285 ok $tree->merge_lineage($node);
288 ok $tree = Bio::Tree::Tree->new(-node => $node);
292 is $tree->get_nodes, 30;
293 $tree->contract_linear_paths;
294 my $ids = join(",", map { $_->id } $tree->get_nodes);
295 is $ids, '131567,9606';
297 # More thorough tests of merge_lineage
298 ok my $node = $db_list->get_taxon(-name => 'Eukaryota');
299 $tree = Bio::Tree::Tree->new(-node => $node);
300 ok $node = $db_list->get_taxon(-name => 'Homo erectus');
301 ok $tree->merge_lineage($node);
302 for my $name ('Eukaryota', 'Mammalia', 'Homo', 'Homo erectus') {
303 ok $node = $tree->find_node(-scientific_name => $name);
308 $db_list = Bio::DB::Taxonomy->new(-source => 'list',
310 (split(/,\s+/, "cellular organisms, Eukaryota, Fungi/Metazoa group,
311 Metazoa, Eumetazoa, Bilateria, Coelomata, Protostomia, Panarthropoda,
312 Arthropoda, Mandibulata, Pancrustacea, Hexapoda, Insecta, Dicondylia,
313 Pterygota, Neoptera, Endopterygota, Diptera, Nematocera, Culicimorpha,
314 Culicoidea, Culicidae, Anophelinae, Anopheles, Anopheles, Angusticorn,
315 Anopheles, maculipennis group, maculipennis species complex, Anopheles daciae"))]);
317 my @taxonids = $db_list->get_taxonids('Anopheles');
318 is @taxonids, 3, 'List context';
320 my $taxonid = $db_list->get_taxonids('Anopheles');
321 isa_ok \$taxonid, 'SCALAR', 'Scalar context';
322 ok exists { map({$_ => undef} @taxonids) }->{$taxonid};
324 # but we should still be able to merge in an incomplete lineage of a sister
325 # species and have the 'tree' remain consistent:
327 # missing 'no rank' Anopheles
328 $db_list->add_lineage(-names => [
329 (split(/,\s+/, "Anophelinae, Anopheles, Anopheles, Angusticorn,
330 maculipennis group, maculipennis species complex, Anopheles labranchiae"))]);
331 $node = $db_list->get_taxon(-name => 'Anopheles labranchiae');
332 is $node->ancestor->ancestor->ancestor->ancestor->ancestor->ancestor->ancestor->scientific_name, 'Anophelinae';
333 is $node->rank, undef;
335 # missing 'subgenus' Anopheles
336 $db_list->add_lineage(-names => [
337 (split(/,\s+/, "Anophelinae, Anopheles, Angusticorn, Anopheles,
338 maculipennis group, maculipennis species complex, Anopheles maculipennis"))]);
339 $node = $db_list->get_taxon(-name => 'Anopheles maculipennis');
340 is $node->ancestor->ancestor->ancestor->ancestor->ancestor->ancestor->ancestor->scientific_name, 'Anophelinae';
342 # missing 'no rank' Angusticorn
343 $db_list->add_lineage(-names => [
344 (split(/,\s+/, "Anophelinae, Anopheles, Anopheles, Anopheles,
345 maculipennis group, maculipennis species complex, Anopheles melanoon"))]);
346 $node = $db_list->get_taxon(-name => 'Anopheles melanoon');
347 is $node->ancestor->ancestor->ancestor->ancestor->scientific_name, 'Angusticorn';
349 @taxonids = $db_list->get_taxonids('Anopheles');
350 is scalar @taxonids, 3;
352 # bug: duplicate topmost taxa
353 $db_list = Bio::DB::Taxonomy->new( -source => 'list',
354 -names => ['Bacteria', 'Tenericutes'] );
355 $db_list->add_lineage( -names => ['Bacteria'] );
356 @taxonids = $db_list->get_taxonids('Bacteria');
357 is scalar @taxonids, 1;
359 # Disambiguate between taxa with same name using -names
360 ok $db_list = Bio::DB::Taxonomy->new( -source => 'list' ), 'DB with ambiguous names';
361 ok $db_list->add_lineage( -names => ['c__Gammaproteobacteria', 'o__Oceanospirillales', 'f__Alteromonadaceae', 'g__Spongiibacter'] );
362 ok $db_list->add_lineage( -names => ['c__Gammaproteobacteria', 'o__Alteromonadales' , 'f__Alteromonadaceae', 'g__Alteromonas' ] );
364 ok @taxonids = $db_list->get_taxonids('f__Alteromonadaceae');
365 is scalar @taxonids, 2; # multiple taxa would match using $db_list->get_taxon(-name => 'f__Alteromonadaceae')
367 ok $node = $db_list->get_taxon( -names => ['c__Gammaproteobacteria', 'o__Alteromonadales' , 'f__Alteromonadaceae'] );
368 is $node->ancestor->node_name, 'o__Alteromonadales';
369 my $iid = $node->internal_id;
371 ok $node = $db_list->get_taxon( -names => ['c__Gammaproteobacteria', 'o__Oceanospirillales', 'f__Alteromonadaceae'] );
372 is $node->ancestor->node_name, 'o__Oceanospirillales';
373 isnt $node->internal_id, $iid;
376 # More tests with ambiguous names, internal IDs and multiple databases
377 my ($node3, $node4, $db_list_2);
378 ok $db_list = Bio::DB::Taxonomy->new( -source => 'list' );
379 ok $db_list->add_lineage( -names => [ 'o__Enterobacteriales', 'g__Escherichia' ] );
380 ok $db_list->add_lineage( -names => [ 'o__Pseudomonadales' , 'g__Pseudomonas' ] );
381 ok $db_list->add_lineage( -names => [ 'o__Chroococcales' , 'g__Microcoleus' ] );
382 ok $node1 = $db_list->get_taxon( -names => [ 'k__Chroococcales', 'g__Microcoleus' ] );
384 ok $db_list_2 = Bio::DB::Taxonomy->new( -source => 'list' );
385 ok $db_list_2->add_lineage( -names => [ 'o__Chroococcales', 'g__Microcoleus' ] );
386 ok $node2 = $db_list_2->get_taxon( -names => [ 'o__Chroococcales', 'g__Microcoleus' ] );
388 is $node1->scientific_name, 'g__Microcoleus';
389 is $node2->scientific_name, 'g__Microcoleus'; # same taxon name
390 isnt $node1->id, $node2->id; # but different dbs and hence taxids
391 is $node1->internal_id, $node1->internal_id; # but same cross-database internal ID
393 ok $db_list->add_lineage( -names => [ 'o__Oscillatoriales' , 'g__Microcoleus' ] );
394 ok $db_list->add_lineage( -names => [ 'o__Acidobacteriales', 'g__Microcoleus' ] );
396 ok $node1 = $db_list->get_taxon( -names => [ 'o__Chroococcales', 'g__Microcoleus' ] );
397 ok $node2 = $db_list->get_taxon( -names => [ 'o__Oscillatoriales' , 'g__Microcoleus' ] );
398 ok $node3 = $db_list->get_taxon( -names => [ 'o__Acidobacteriales' , 'g__Microcoleus' ] );
399 my @nodes = ($node1, $node2, $node3);
401 is map({$_->id => undef} @nodes), 6; # 3 distinct taxids
402 is map({$_->internal_id => undef} @nodes), 6; # 3 distinct iids
404 ok $db_list->add_lineage( -names => [ 'o__Chroococcales' , 'g__Microcoleus' ] );
405 ok $node2 = $db_list->get_taxon( -names => [ 'o__Chroococcales', 'g__Microcoleus' ] );
406 is $node2->scientific_name, $node1->scientific_name;
407 is $node2->id, $node1->id;
408 is $node2->internal_id, $node1->internal_id;