Bio::DB::CUTG: move into its own distribution
[bioperl-live.git] / bin / bp_taxonomy2tree
blobafc07cf4b3d6a4b6676a5af74c0b1d1e4b677c82
1 #!/usr/bin/perl
3 =head1 NAME
5 bp_taxonomy2tree - Building a taxonomic tree based on the full lineages of a set of species names
7 =head1 DESCRIPTION
9 This script looks up the provided species names in the NCBI Taxonomy database,
10 retrieves their full lineage and puts them in a Newick taxonomic tree displayed
11 on screen.
13 bp_taxonomy2tree.pl -s Orangutan -s Gorilla -s Chimpanzee -s Human
14 bp_taxonomy2tree.pl -s Orangutan -s Gorilla -s Chimpanzee -s "Homo Sapiens"
16 You can also provide -d to specify the directory to store index files in, -o to
17 specify the location of your NCBI nodes file, and -a for the NCBI names file.
18 Or the option -e to use the web-based Entrez taxonomy database if you do not
19 have the NCBI flatfiles installed.
21 This script requires that the bioperl-run package also be installed.
23 Providing the nodes.dmp and names.dmp files from the NCBI Taxonomy
24 dump (see Bio::DB::Taxonomy::flatfile for more info) is only necessary
25 the first time the script is run. This will create the local indexes and may
26 take quite a long time. However once created, these indexes will
27 allow fast access for species to taxon id OR taxon id to species name
28 lookups.
30 =head1 AUTHOR - Gabriel Valiente, reimplemented by Sendu Bala
32 Email valiente@lsi.upc.edu
33 Email bix@sendu.me.uk
35 =cut
37 use strict;
38 use warnings;
39 use Bio::DB::Taxonomy;
40 use Bio::TreeIO;
41 use Bio::Tree::Compatible;
42 use Getopt::Long;
# Main script: resolve each species name given with -s to an NCBI Taxonomy
# node, merge the lineages of all resolved nodes into one tree, and write that
# tree in Newick format.

my @species;
my $index_dir  = "./db/";
my $nodesfile  = "nodes.dmp";
my $namesfile  = "names.dmp";
my $use_entrez = 0;

# the input to the script is an array of species names
GetOptions(
    's|species=s'   => \@species,
    'd|dir:s'       => \$index_dir,
    'o|nodesfile:s' => \$nodesfile,
    'a|namesfile:s' => \$namesfile,
    'e|entrez'      => \$use_entrez,
    'h|help'        => sub { system('perldoc', $0); exit },
) or die "Invalid command-line options; run '$0 --help' for usage\n";

# Without at least one species there is no tree to build; fail early with a
# clear message instead of calling a method on an undefined tree later on.
die "No species names given; supply at least one with -s\n"
    unless @species;

my $db = Bio::DB::Taxonomy->new(
    -source    => $use_entrez ? 'entrez' : 'flatfile',
    -directory => $index_dir,
    -nodesfile => $nodesfile,
    -namesfile => $namesfile,
);

# the full lineages of the species are merged into a single tree
my $tree;
for my $name (@species) {
    my $node = $db->get_taxon(-name => $name);

    if (!$node) {
        warn "no NCBI Taxonomy node for species ",$name,"\n";
        next;
    }

    if ($tree) {
        $tree->merge_lineage($node);
    }
    else {
        # The first resolved node seeds the tree; later ones are merged in.
        $tree = Bio::Tree::Tree->new(-node => $node);
    }
}

# $tree is still undefined if none of the names could be resolved.
die "None of the given species names could be found in the taxonomy database\n"
    unless $tree;

# simple paths are contracted by removing degree one nodes
$tree->contract_linear_paths;

# convert tree ids to their names for nice output with TreeIO
foreach my $node ($tree->get_nodes) {
    $node->id($node->node_name);
}

# the tree is output in Newick format
my $output = Bio::TreeIO->new(-format => 'newick');
$output->write_tree($tree);
$output->close;