1 # -*-Perl-*- Test Harness script for Bioperl
11 -requires_modules => [qw(DB_File
16 use_ok('Bio::DB::Taxonomy');
17 use_ok('Bio::Tree::Tree');
# Scratch directory for this test run; passed further down as the flatfile
# database's -directory.
20 my $temp_dir = test_output_dir();
# Construct one database per backend ('entrez', then 'flatfile') and check
# that each object is both its backend class and a Bio::DB::Taxonomy.
22 # we're actually testing Bio::Taxon and Bio::DB::Taxonomy::* here, not
25 ok my $db_entrez = Bio::DB::Taxonomy->new(-source => 'entrez');
26 isa_ok $db_entrez, 'Bio::DB::Taxonomy::entrez';
27 isa_ok $db_entrez, 'Bio::DB::Taxonomy';
# The flatfile backend is built from the 'taxdump' nodes.dmp and names.dmp
# test input files; no -directory argument is given for this first instance.
29 ok my $db_flatfile = Bio::DB::Taxonomy->new(
30 -source => 'flatfile',
31 -nodesfile => test_input_file('taxdump', 'nodes.dmp'),
32 -namesfile => test_input_file('taxdump', 'names.dmp'),
34 isa_ok $db_flatfile, 'Bio::DB::Taxonomy::flatfile';
35 isa_ok $db_flatfile, 'Bio::DB::Taxonomy';
37 # By not specifying a '-directory' argument, index files go to a temporary
38 # folder ($Bio::Root::IO::TEMPDIR, such as 'C:\Users\USER\AppData\Local\Temp'),
39 # and are implied to be temporary. So test the ability of flatfile->DESTROY to
40 # remove the temporary index files at object destruction (this also affects files
41 # in "test_output_dir()", since the folder is created inside the temporary folder)
42 no warnings qw(once); # silence 'Name "$Bio::Root::IO::TEMPDIR" used only once'
# The object's index_directory field must equal the global temporary folder.
43 is $db_flatfile->{index_directory}, $Bio::Root::IO::TEMPDIR, 'removal of temporary index files: no -directory';
# DESTROY is invoked explicitly (rather than by dropping the reference) so
# that index_directory can still be read from the object afterwards.
44 $db_flatfile->DESTROY;
# None of the four index files (id2names, names2id, nodes, parents) may
# remain in the index directory.
45 ok not -e ($db_flatfile->{index_directory} . '/id2names');
46 ok not -e ($db_flatfile->{index_directory} . '/names2id');
47 ok not -e ($db_flatfile->{index_directory} . '/nodes');
48 ok not -e ($db_flatfile->{index_directory} . '/parents');
50 # Test removal of temporary index files from test_output_dir folder
51 # (since test_output_dir() =~ m/^$Bio::Root::IO::TEMPDIR/)
# Same inputs as before, but this time the index location is given
# explicitly via -directory => $temp_dir.
52 ok $db_flatfile = Bio::DB::Taxonomy->new(
53 -source => 'flatfile',
54 -directory => $temp_dir,
55 -nodesfile => test_input_file('taxdump', 'nodes.dmp'),
56 -namesfile => test_input_file('taxdump', 'names.dmp'),
# The object must report $temp_dir as its index directory.
59 is $db_flatfile->{index_directory}, $temp_dir, 'removal of temporary index files: test_output_dir()';
# Explicit destructor call, then verify that the four index files are gone.
60 $db_flatfile->DESTROY;
61 ok not -e ($db_flatfile->{index_directory} . '/id2names');
62 ok not -e ($db_flatfile->{index_directory} . '/names2id');
63 ok not -e ($db_flatfile->{index_directory} . '/nodes');
64 ok not -e ($db_flatfile->{index_directory} . '/parents');
66 # Generate the object (and the files) again for the remaining tests
# Needed because the explicit DESTROY calls above are expected to have
# removed the index files; $db_flatfile is reused by the tests that follow.
67 ok $db_flatfile = Bio::DB::Taxonomy->new(
68 -source => 'flatfile',
69 -directory => $temp_dir,
70 -nodesfile => test_input_file('taxdump', 'nodes.dmp'),
71 -namesfile => test_input_file('taxdump', 'names.dmp'),
# Run the same battery of checks against both backends, entrez first.
76 for my $db ($db_entrez, $db_flatfile) {
# Only the entrez pass is guarded by the networking requirement
# (presumably test_skip skips the 46 tests when networking is disabled).
78 test_skip(-tests => 46, -requires_networking => 1) if $db eq $db_entrez;
# Taxon count: a lower bound for the live entrez database, an exact value
# (189) for the local flatfile data.
81 if ($db eq $db_entrez) {
82 cmp_ok $db->get_num_taxa, '>', 880_000; # 886,907 as of 08-May-2012
84 is $db->get_num_taxa, 189;
# Probe lookup: if it dies, the next 38 tests are skipped with the
# message below instead of failing one by one.
87 eval { $id = $db->get_taxonid('Homo sapiens');};
88 skip "Unable to connect to entrez database; no network or server busy?", 38 if $@;
92 # easy test on human, try out the main Taxon methods
93 ok $n = $db->get_taxon(9606);
# object_id and ncbi_taxid must both agree with id.
95 is $n->object_id, $n->id;
96 is $n->ncbi_taxid, $n->id;
97 is $n->parent_id, 9605;
98 is $n->rank, 'species';
# node_name, scientific_name and the first entry of name('scientific')
# must all give the same string.
100 is $n->node_name, 'Homo sapiens';
101 is $n->scientific_name, $n->node_name;
102 is ${$n->name('scientific')}[0], $n->node_name;
# Turn the common names into a lookup hash: at least three are expected,
# and 'human' and 'man' must be among them.
104 my %common_names = map { $_ => 1 } $n->common_names;
105 cmp_ok keys %common_names, '>=', 3, ref($db).": common names";
106 ok exists $common_names{human};
107 ok exists $common_names{man};
109 is $n->division, 'Primates';
110 is $n->genetic_code, 1;
111 is $n->mitochondrial_genetic_code, 2;
112 # these are entrez-only, data not available in dmp files
113 if ($db eq $db_entrez) {
114 ok defined $n->pub_date;
115 ok defined $n->create_date;
116 ok defined $n->update_date;
119 # briefly test some Bio::Tree::NodeI methods
# The ancestor of the Homo sapiens taxon must be 'Homo'.
120 ok my $ancestor = $n->ancestor;
121 is $ancestor->scientific_name, 'Homo';
122 # unless set explicitly, Bio::Taxon doesn't return anything for
123 # each_Descendent; must ask the database directly
124 ok my @children = $ancestor->db_handle->each_Descendent($ancestor);
125 cmp_ok @children, '>', 0;
# Pause between groups of entrez requests (presumably to avoid overloading
# the remote server -- confirm).
127 sleep(3) if $db eq $db_entrez;
129 # do some trickier things...
# Taxon 89593 ('Craniata') must also be the LCA of it and Homo sapiens.
130 ok my $n2 = $db->get_Taxonomy_Node('89593');
131 is $n2->scientific_name, 'Craniata';
133 # briefly check we can use some Tree methods
# An empty Tree object is used purely as a provider of tree functions.
134 my $tree = Bio::Tree::Tree->new();
135 is $tree->get_lca($n, $n2)->scientific_name, 'Craniata';
# The tree itself must still report no nodes after get_lca.
138 my @nodes = $tree->get_nodes;
139 is scalar(@nodes), 0;
141 @lineage_nodes = $tree->get_lineage_nodes($n->id); # read ID, only works if nodes have been added to tree
142 is scalar @lineage_nodes, 0;
143 @lineage_nodes = $tree->get_lineage_nodes($n); # node object always works
144 cmp_ok(scalar @lineage_nodes, '>', 20);
# Lineage strings: default separator ';' and a custom separator '-'.
147 like($tree->get_lineage_string($n), qr/cellular organisms;Eukaryota/);
148 like($tree->get_lineage_string($n,'-'), qr/cellular organisms-Eukaryota/);
149 like($tree->get_lineage_string($n2), qr/cellular organisms;Eukaryota/);
151 # can we actually form a Tree and use other Tree methods?
152 ok $tree = Bio::Tree::Tree->new(-node => $n);
153 cmp_ok($tree->number_nodes, '>', 20);
154 cmp_ok(scalar($tree->get_nodes), '>', 20);
155 is $tree->find_node(-rank => 'genus')->scientific_name, 'Homo';
157 # check that getting the ancestor still works now we have explicitly set the
158 # ancestor by making a Tree
159 is $n->ancestor->scientific_name, 'Homo';
161 sleep(3) if $db eq $db_entrez;
# Fetch a taxon by id through the get_Taxonomy_Node method.
163 ok $n = $db->get_Taxonomy_Node('1760');
164 is $n->scientific_name, 'Actinobacteria';
166 sleep(3) if $db eq $db_entrez;
168 # entrez isn't as good at searching as flatfile, so we have to special-case
# sort without a comparator orders the ids as strings, which is why 200795
# comes before 32061 in the expected list.
169 my @ids = sort $db->get_taxonids('Chloroflexi');
171 is_deeply \@ids, [200795, 32061];
# Scalar-context lookup with a qualified name: entrez is expected to answer
# 'No hit', the flatfile database 32061.
173 $id = $db->get_taxonids('Chloroflexi (class)');
174 $db eq $db_entrez ? is($id, 'No hit') : is($id, 32061);
# A genus name must yield at least one id; which ids are expected depends
# on the backend.
176 @ids = $db->get_taxonids('Rhodotorula');
177 cmp_ok @ids, '>=' , 1;
178 if ($db eq $db_entrez) {
179 diag(join(",", @ids));
180 # From NCBI: Taxid 592558 was merged into taxid 5533 on June 16, 2017
181 is( (grep { $_ == 592558 } @ids), 0, 'Value no longer found');
182 ok grep { $_ == 5533 } @ids;
184 # note the locally cached flatfile is out-of-date, but technically
185 # correct for testing purposes
186 ok grep { $_ == 266791 } @ids;
187 ok grep { $_ == 5533 } @ids;
193 # Test the list database
# An empty list database can be created and has the expected classes.
195 ok my $db_list = Bio::DB::Taxonomy->new(-source => 'list');
196 isa_ok $db_list, 'Bio::DB::Taxonomy::list';
197 isa_ok $db_list, 'Bio::DB::Taxonomy';
# Parallel arrays: @ranks[i] is the rank expected for @h_lineage[i].
199 my @ranks = qw(superkingdom class genus species);
200 my @h_lineage = ('Eukaryota', 'Mammalia', 'Homo', 'Homo sapiens');
# Recreate the database seeded with the four-name human lineage.
201 ok $db_list = Bio::DB::Taxonomy->new(
203 -names => \@h_lineage,
206 is $db_list->get_num_taxa, 4;
# Every name in the lineage must resolve to a Bio::Taxon carrying the
# matching rank.
209 ok @taxa = map {$db_list->get_taxon(-name=>$_)} @h_lineage;
210 is_deeply [map {ref($_)} @taxa], [('Bio::Taxon')x4];
211 is_deeply [map {$_->rank} @taxa], \@ranks, 'Ranks';
# Add a second lineage that differs only in its last name (the species)
# and repeat the same checks on it.
213 @h_lineage = ('Eukaryota', 'Mammalia', 'Homo', 'Homo erectus');
214 $db_list->add_lineage(-names => \@h_lineage, -ranks => \@ranks);
216 ok @taxa = map {$db_list->get_taxon(-name=>$_)} @h_lineage;
217 is_deeply [map {ref($_)} @taxa], [('Bio::Taxon')x4];
218 is_deeply [map {$_->rank} @taxa], \@ranks, 'Ranks';
# Build a tree spanning the two species held in the list database:
# 5 nodes, total branch length 4, and a distance of 2 between the species.
221 ok my $tree = $db_list->get_tree('Homo sapiens', 'Homo erectus');
222 isa_ok $tree, 'Bio::Tree::TreeI';
223 is $tree->number_nodes, 5;
224 is $tree->total_branch_length, 4;
225 ok my $node1 = $tree->find_node( -scientific_name => 'Homo sapiens' );
226 ok my $node2 = $tree->find_node( -scientific_name => 'Homo erectus' );
227 is $tree->distance($node1, $node2), 2;
# Fetch 'Homo sapiens' from both the list and the flatfile database so the
# two views of the same species can be compared.
229 ok my $h_list = $db_list->get_taxon(-name => 'Homo sapiens');
230 ok my $h_flat = $db_flatfile->get_taxon(-name => 'Homo sapiens');
232 is $h_list->ancestor->scientific_name, 'Homo';
# Read common names from the list taxon, add 'woman', read them again,
# then read the flatfile taxon's common names.
234 my @names = $h_list->common_names;
236 $h_list->common_names('woman');
237 @names = $h_list->common_names;
239 @names = $h_flat->common_names;
242 # you can switch to another database when you need more information, which also
243 # merges information in the node from the two different dbs
244 $h_list->db_handle($db_flatfile);
245 @names = $h_list->common_names;
248 # form a tree with the list lineage first, preventing a subsequent database
249 # change from giving us all those extra ranks
250 $h_list->db_handle($db_list);
251 my $ancestors_ancestor = $h_list->ancestor->ancestor;
252 is $ancestors_ancestor->scientific_name, 'Mammalia';
# With the tree formed, the grandparent must remain 'Mammalia' even after
# the handle is switched to the flatfile database.
254 $tree = Bio::Tree::Tree->new(-node => $h_list);
255 $h_list->db_handle($db_flatfile);
256 $ancestors_ancestor = $h_list->ancestor->ancestor;
257 is $ancestors_ancestor->scientific_name, 'Mammalia';
259 # or we can get the flatfile database's idea of the ancestors by removing
260 # ourselves from the tree
261 is $h_flat->ancestor->ancestor->scientific_name, 'Homo/Pan/Gorilla group';
262 $h_list->ancestor(undef);
263 is $h_list->ancestor->ancestor->scientific_name, 'Homo/Pan/Gorilla group';
265 # get_lca should work on nodes from different databases
267 test_skip(-tests => 9, -requires_networking => 1);
269 # check that the result is the same as if we are retrieving from the same DB
# Baseline 1: both taxa ('Homo' and 'Homo sapiens') from the flatfile db.
271 $h_flat = $db_flatfile->get_taxon(-name => 'Homo');
272 my $h_flat2 = $db_flatfile->get_taxon(-name => 'Homo sapiens');
273 ok my $tree_functions = Bio::Tree::Tree->new();
274 is $tree_functions->get_lca($h_flat, $h_flat2)->scientific_name, 'Homo', 'get_lca() within flatfile db';
# Baseline 2: both taxa from entrez; either fetch dying skips the
# remaining 7 tests.
278 eval { $h_entrez = $db_entrez->get_taxon(-name => 'Homo sapiens');};
279 skip "Unable to connect to entrez database; no network or server busy?", 7 if $@;
281 eval { $h_entrez2 = $db_entrez->get_taxon(-name => 'Homo');};
282 skip "Unable to connect to entrez database; no network or server busy?", 7 if $@;
283 ok $tree_functions = Bio::Tree::Tree->new();
284 is $tree_functions->get_lca($h_entrez, $h_entrez2)->scientific_name, 'Homo', 'get_lca() within entrez db';
286 ok $tree_functions = Bio::Tree::Tree->new();
287 # mixing entrez and flatfile
# Marked TODO: this assertion is a known failure and is reported as such.
289 local $TODO = 'Mixing databases for get_lca() not working, see bug #3416';
290 is $tree_functions->get_lca($h_flat, $h_entrez)->scientific_name, 'Homo', 'get_lca() mixing flatfile and remote db';
292 # even though the species taxa for Homo sapiens from list and flat databases
293 # have the same internal id, get_lca won't work because they have different
294 # roots and descendents
# Note that $h_flat holds the 'Homo' taxon at this point, so it is compared
# with the ancestor of the list database's 'Homo sapiens'.
295 $h_list = $db_list->get_taxon(-name => 'Homo sapiens');
296 is $h_list->ancestor->internal_id, $h_flat->internal_id;
297 ok ! $tree_functions->get_lca($h_flat, $h_list);
299 # but we can form a tree with the flat node then remove all the ranks we're
300 # not interested in and try again
301 $tree = Bio::Tree::Tree->new(-node => $h_flat);
302 $tree->splice(-keep_rank => \@ranks);
303 is $tree->get_lca($h_flat, $h_list)->scientific_name, 'Homo';
306 # ideas from taxonomy2tree.PLS that let us make nice tree, using
307 # Bio::Tree::TreeFunctionsI methods; this is a weird and trivial example just
308 # because our test flatfile database only has the full lineage of one species
# For each name, resolve it to an id, fetch the taxon from the flatfile
# database, and either merge its lineage into $tree or create $tree from it.
310 for my $name ('Human', 'Hominidae') {
311 my $ncbi_id = $db_flatfile->get_taxonid($name);
313 my $node = $db_flatfile->get_taxon(-taxonid => $ncbi_id);
316 ok $tree->merge_lineage($node);
319 ok $tree = Bio::Tree::Tree->new(-node => $node);
# The resulting tree has 30 nodes; after contracting linear paths only the
# nodes with ids 131567 and 9606 remain.
323 is $tree->get_nodes, 30;
324 $tree->contract_linear_paths;
325 my $ids = join(",", map { $_->id } $tree->get_nodes);
326 is $ids, '131567,9606';
328 # More thorough tests of merge_lineage
# Start a tree at 'Eukaryota' from the list database, merge in the lineage
# of 'Homo erectus', then check each of the four names can be found.
329 ok my $node = $db_list->get_taxon(-name => 'Eukaryota');
330 $tree = Bio::Tree::Tree->new(-node => $node);
331 ok $node = $db_list->get_taxon(-name => 'Homo erectus');
332 ok $tree->merge_lineage($node);
333 for my $name ('Eukaryota', 'Mammalia', 'Homo', 'Homo erectus') {
334 ok $node = $tree->find_node(-scientific_name => $name);
337 # we can recursively fetch all descendents of a taxon
339 test_skip(-tests => 1, -requires_networking => 1);
# Connectivity probe: if this fetch dies, the single test below is skipped.
340 eval {$db_entrez->get_taxon(10090);};
341 skip "Unable to connect to entrez database; no network or server busy?", 1 if $@;
# Taxon 314146 must have at least 17 descendents in total.
343 my $lca = $db_entrez->get_taxon(314146);
344 my @descs = $db_entrez->get_all_Descendents($lca);
345 cmp_ok @descs, '>=', 17;
# Build a list database from one long lineage in which the bare name
# 'Anopheles' occurs three times; the lineage string is split on a comma
# followed by whitespace.
349 $db_list = Bio::DB::Taxonomy->new(-source => 'list',
351 (split(/,\s+/, "cellular organisms, Eukaryota, Fungi/Metazoa group,
352 Metazoa, Eumetazoa, Bilateria, Coelomata, Protostomia, Panarthropoda,
353 Arthropoda, Mandibulata, Pancrustacea, Hexapoda, Insecta, Dicondylia,
354 Pterygota, Neoptera, Endopterygota, Diptera, Nematocera, Culicimorpha,
355 Culicoidea, Culicidae, Anophelinae, Anopheles, Anopheles, Angusticorn,
356 Anopheles, maculipennis group, maculipennis species complex, Anopheles daciae"))]);
# In list context all three 'Anopheles' ids are returned.
358 my @taxonids = $db_list->get_taxonids('Anopheles');
359 is @taxonids, 3, 'List context';
# In scalar context a single plain scalar is returned, and it must be one
# of the ids seen in list context.
361 my $taxonid = $db_list->get_taxonids('Anopheles');
362 isa_ok \$taxonid, 'SCALAR', 'Scalar context';
363 ok exists { map({$_ => undef} @taxonids) }->{$taxonid};
365 # but we should still be able to merge in an incomplete lineage of a sister
366 # species and have the 'tree' remain consistent:
368 # missing 'no rank' Anopheles
369 $db_list->add_lineage(-names => [
370 (split(/,\s+/, "Anophelinae, Anopheles, Anopheles, Angusticorn,
371 maculipennis group, maculipennis species complex, Anopheles labranchiae"))]);
# Seven ancestor steps up from the new species must reach 'Anophelinae';
# no ranks were supplied, so rank is undef.
372 $node = $db_list->get_taxon(-name => 'Anopheles labranchiae');
373 is $node->ancestor->ancestor->ancestor->ancestor->ancestor->ancestor->ancestor->scientific_name, 'Anophelinae';
374 is $node->rank, undef;
376 # missing 'subgenus' Anopheles
377 $db_list->add_lineage(-names => [
378 (split(/,\s+/, "Anophelinae, Anopheles, Angusticorn, Anopheles,
379 maculipennis group, maculipennis species complex, Anopheles maculipennis"))]);
# Again seven ancestor steps up must reach 'Anophelinae'.
380 $node = $db_list->get_taxon(-name => 'Anopheles maculipennis');
381 is $node->ancestor->ancestor->ancestor->ancestor->ancestor->ancestor->ancestor->scientific_name, 'Anophelinae';
383 # missing 'no rank' Angusticorn
384 $db_list->add_lineage(-names => [
385 (split(/,\s+/, "Anophelinae, Anopheles, Anopheles, Anopheles,
386 maculipennis group, maculipennis species complex, Anopheles melanoon"))]);
# Four ancestor steps up must reach 'Angusticorn' even though this lineage
# did not name it.
387 $node = $db_list->get_taxon(-name => 'Anopheles melanoon');
388 is $node->ancestor->ancestor->ancestor->ancestor->scientific_name, 'Angusticorn';
# Merging the sister lineages must not have created extra 'Anopheles' taxa.
390 @taxonids = $db_list->get_taxonids('Anopheles');
391 is scalar @taxonids, 3;
393 # bug: duplicate topmost taxa
# Regression check: adding a lineage consisting only of the existing root
# name 'Bacteria' must leave exactly one taxon with that name.
394 $db_list = Bio::DB::Taxonomy->new( -source => 'list',
395 -names => ['Bacteria', 'Tenericutes'] );
396 $db_list->add_lineage( -names => ['Bacteria'] );
397 @taxonids = $db_list->get_taxonids('Bacteria');
398 is scalar @taxonids, 1;
400 # Disambiguate between taxa with same name using -names
# Two lineages share the family name 'f__Alteromonadaceae' but place it
# under different orders.
401 ok $db_list = Bio::DB::Taxonomy->new( -source => 'list' ), 'DB with ambiguous names';
402 ok $db_list->add_lineage( -names => ['c__Gammaproteobacteria', 'o__Oceanospirillales', 'f__Alteromonadaceae', 'g__Spongiibacter'] );
403 ok $db_list->add_lineage( -names => ['c__Gammaproteobacteria', 'o__Alteromonadales' , 'f__Alteromonadaceae', 'g__Alteromonas' ] );
405 ok @taxonids = $db_list->get_taxonids('f__Alteromonadaceae');
406 is scalar @taxonids, 2; # multiple taxa would match using $db_list->get_taxon(-name => 'f__Alteromonadaceae')
# Passing the lineage via -names selects the family under the given order;
# remember its internal id for the comparison below.
408 ok $node = $db_list->get_taxon( -names => ['c__Gammaproteobacteria', 'o__Alteromonadales' , 'f__Alteromonadaceae'] );
409 is $node->ancestor->node_name, 'o__Alteromonadales';
410 my $iid = $node->internal_id;
# The same family name under the other order must be a different taxon,
# i.e. have a different internal id.
412 ok $node = $db_list->get_taxon( -names => ['c__Gammaproteobacteria', 'o__Oceanospirillales', 'f__Alteromonadaceae'] );
413 is $node->ancestor->node_name, 'o__Oceanospirillales';
414 isnt $node->internal_id, $iid;
417 # More tests with ambiguous names, internal IDs and multiple databases
# Two independent list databases each receive a lineage ending in
# 'g__Microcoleus'. The taxon fetched from each must have the same
# scientific name and internal ID, but a different database-specific id.
418 my ($node3, $node4, $db_list_2);
419 ok $db_list = Bio::DB::Taxonomy->new( -source => 'list' );
420 ok $db_list->add_lineage( -names => [ 'o__Enterobacteriales', 'g__Escherichia' ] );
421 ok $db_list->add_lineage( -names => [ 'o__Pseudomonadales' , 'g__Pseudomonas' ] );
422 ok $db_list->add_lineage( -names => [ 'o__Chroococcales' , 'g__Microcoleus' ] );
# NOTE(review): this lookup names 'k__Chroococcales' although the lineage
# added above starts with 'o__Chroococcales' -- confirm this is intentional.
423 ok $node1 = $db_list->get_taxon( -names => [ 'k__Chroococcales', 'g__Microcoleus' ] );
425 ok $db_list_2 = Bio::DB::Taxonomy->new( -source => 'list' );
426 ok $db_list_2->add_lineage( -names => [ 'o__Chroococcales', 'g__Microcoleus' ] );
427 ok $node2 = $db_list_2->get_taxon( -names => [ 'o__Chroococcales', 'g__Microcoleus' ] );
429 is $node1->scientific_name, 'g__Microcoleus';
430 is $node2->scientific_name, 'g__Microcoleus'; # same taxon name
431 isnt $node1->id, $node2->id; # but different dbs and hence taxids
# Compare the internal IDs of the two nodes with each other; comparing
# $node1 with itself could never fail and so verified nothing.
432 is $node1->internal_id, $node2->internal_id; # but same cross-database internal ID
# Add two more lineages ending in the same genus name, so the database now
# holds three different taxa all called 'g__Microcoleus'.
434 ok $db_list->add_lineage( -names => [ 'o__Oscillatoriales' , 'g__Microcoleus' ] );
435 ok $db_list->add_lineage( -names => [ 'o__Acidobacteriales', 'g__Microcoleus' ] );
# Fetch each of the three by its full lineage.
437 ok $node1 = $db_list->get_taxon( -names => [ 'o__Chroococcales', 'g__Microcoleus' ] );
438 ok $node2 = $db_list->get_taxon( -names => [ 'o__Oscillatoriales' , 'g__Microcoleus' ] );
439 ok $node3 = $db_list->get_taxon( -names => [ 'o__Acidobacteriales' , 'g__Microcoleus' ] );
440 my @nodes = ($node1, $node2, $node3);
# Count unique values by loading them as hash keys. Counting the raw
# key/value list that map produces always gives 6 for three nodes, whether
# or not any ids collide, so it cannot detect duplicates.
442 is scalar(keys %{ +{ map { ($_->id => undef) } @nodes } }), 3; # 3 distinct taxids
443 is scalar(keys %{ +{ map { ($_->internal_id => undef) } @nodes } }), 3; # 3 distinct iids
# Adding a lineage that is already present must not create a new taxon:
# a fresh lookup must match $node1 in name, id and internal id.
445 ok $db_list->add_lineage( -names => [ 'o__Chroococcales' , 'g__Microcoleus' ] );
446 ok $node2 = $db_list->get_taxon( -names => [ 'o__Chroococcales', 'g__Microcoleus' ] );
447 is $node2->scientific_name, $node1->scientific_name;
448 is $node2->id, $node1->id;
449 is $node2->internal_id, $node1->internal_id;
# Network-dependent checks against a fresh entrez handle: each of the four
# names is passed to test_taxid together with the database.
453 test_skip(-tests => 12, -requires_networking => 1);
455 my $db=Bio::DB::Taxonomy->new(-source=>"entrez");
457 my @taxa = qw(viruses Deltavirus unclassified plasmid);
459 for my $taxon (@taxa) {
460 test_taxid($db, $taxon);
# Helper body (presumably that of test_taxid, which is called with the same
# two arguments): checks one name against one database.
#   $db   - taxonomy database handle to query
#   $taxa - name to look up
# Three results are reported: ids were returned for the name, fetching a
# taxon for one of those ids does not die, and the fetched taxon's
# scientific name contains the requested name (case-insensitively).
464 my ($db, $taxa) = @_;
465 my @taxonids = $db->get_taxonids($taxa);
466 cmp_ok(scalar(@taxonids), '>', 0, "Got IDs returned for $taxa:".join(',', @taxonids));
# The last id returned is the one used for the fetch.
468 lives_ok { $taxon = $db->get_taxon(-taxonid => pop @taxonids) } "IDs generates a Bio::Taxonomy::Node";
469 if (defined $taxon) {
# \Q...\E makes the name match literally, so a name containing regex
# metacharacters cannot change or break the pattern.
470 like( $taxon->scientific_name, qr/\Q$taxa\E/i, "Name returned matches $taxa");
# Otherwise record an explicit failure so the test count stays the same.
472 ok(0, "No taxon object returned for $taxa");
# Network-dependent checks of entrez name lookups. The table records, for
# each query string, the expected result and the result that was observed;
# the last two rows are the cases where the two differ.
479 test_skip( -tests => 6, -requires_networking => 1 );
481 my $db = Bio::DB::Taxonomy->new( -source => "entrez" );
483 # String | What I expect | What I get
484 # ---------------------- | ------------- | ----------
485 # 'Lissotriton vulgaris' | 8324 | 8324
486 # 'Chlorella vulgaris' | 3077 | 3077
487 # 'Phygadeuon solidus' | 1763951 | 1763951
488 # 'Ovatus' | 666060 | 666060
489 # 'Phygadeuon ovatus' | "No hit" | 666060
490 # 'Trimorus ovatus' | "No hit" | 666060
# Each query is checked against the "What I expect" column, using the first
# element of the returned list.
493 @ids = $db->get_taxonids('Lissotriton vulgaris');
494 is $ids[0], 8324, 'Correct: Lissotriton vulgaris';
495 @ids = $db->get_taxonids('Chlorella vulgaris');
496 is $ids[0], 3077, 'Correct: Chlorella vulgaris';
497 @ids = $db->get_taxonids('Phygadeuon solidus');
498 is $ids[0], 1763951, 'Correct: Phygadeuon solidus';
499 @ids = $db->get_taxonids('Ovatus');
500 is $ids[0], 666060, 'Correct: Ovatus';
# Binomials whose genus/species combination does not exist must report
# 'No hit' rather than the id of the genus-level 'Ovatus' entry.
501 @ids = $db->get_taxonids('Phygadeuon ovatus');
502 is $ids[0], 'No hit', 'Correct: No hit';
503 @ids = $db->get_taxonids('Trimorus ovatus');
504 is $ids[0], 'No hit', 'Correct: No hit';