bin/bp_taxonomy2tree

   1 #!/usr/bin/perl
   2
   3 =head1 NAME
   4
   5 bp_taxonomy2tree - Building a taxonomic tree based on the full lineages of a set of species names
   6
   7 =head1 DESCRIPTION
   8
   9 This script looks up the provided species names in the NCBI Taxonomy database,
  10 retrieves their full lineage and puts them in a Newick taxonomic tree displayed
  11 on screen.
  12
  13   bp_taxonomy2tree.pl -s Orangutan -s Gorilla -s Chimpanzee -s Human
  14   bp_taxonomy2tree.pl -s Orangutan -s Gorilla -s Chimpanzee -s "Homo Sapiens"
  15
  16 You can also provide -d to specify the directory to store index files in, -o to
  17 specify the location of your NCBI nodes file, and -a for the NCBI names file.
  18 Or the option -e to use the web-based Entrez taxonomy database if you do not
  19 have the NCBI flatfiles installed.
  20
  21 This script requires that the bioperl-run package also be installed.
  22
  23 Providing the nodes.dmp and names.dmp files from the NCBI Taxonomy
  24 dump (see Bio::DB::Taxonomy::flatfile for more info) is only necessary
  25 the first time the script is run.  This will create the local indexes and may
  26 take quite a long time.  However once created, these indexes will
  27 allow fast access for species to taxon id OR taxon id to species name
  28 lookups.
  29
  30 =head1 AUTHOR - Gabriel Valiente, reimplemented by Sendu Bala
  31
  32 Email valiente@lsi.upc.edu
  33 Email bix@sendu.me.uk
  34
  35 =cut
  36
  37 use strict;
  38 use warnings;
  39 use Bio::DB::Taxonomy;
  40 use Bio::TreeIO;
  41 use Bio::Tree::Compatible;
  42 use Getopt::Long;
  43
  44 my @species;
  45 my $index_dir = "./db/";
  46 my $nodesfile = "nodes.dmp";
  47 my $namesfile = "names.dmp";
  48 my $use_entrez = 0;
  49
  50 # the input to the script is an array of species names
  51 GetOptions( 's|species=s'   => \@species,
  52             'd|dir:s'       => \$index_dir,
  53             'o|nodesfile:s' => \$nodesfile,
  54             'a|namesfile:s' => \$namesfile,
  55             'e|entrez'      => \$use_entrez,
  56             'h|help'        => sub { system('perldoc', $0); exit }, );
  57
  58 my $db = Bio::DB::Taxonomy->new( -source    => $use_entrez ? 'entrez' : 'flatfile',
  59                                  -directory => $index_dir,
  60                                  -nodesfile => $nodesfile,
  61                                  -namesfile => $namesfile );
  62
  63 # the full lineages of the species are merged into a single tree
  64 my $tree;
  65 for my $name (@species) {
  66   my $node = $db->get_taxon(-name => $name);
  67   if ($node) {
  68     if ($tree) {
  69       $tree->merge_lineage($node);
  70     }
  71     else {
  72       $tree = Bio::Tree::Tree->new(-node => $node);
  73     }
  74   }
  75   else {
  76     warn "no NCBI Taxonomy node for species ",$name,"\n";
  77   }
  78 }
  79
  80 # simple paths are contracted by removing degree one nodes
  81 $tree->contract_linear_paths;
  82
  83 # convert tree ids to their names for nice output with TreeIO
  84 foreach my $node ($tree->get_nodes) {
  85   $node->id($node->node_name);
  86 }
  87
  88 # the tree is output in Newick format
  89 my $output = Bio::TreeIO->new(-format => 'newick');
  90 $output->write_tree($tree);
  91 $output->close;
  92
  93 1;