Bio/DB/Taxonomy/flatfile.pm

   1 #
   2 # BioPerl module for Bio::DB::Taxonomy::flatfile
   3 #
   4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   5 #
   6 # Cared for by Jason Stajich <jason-at-bioperl-dot-org>
   7 #
   8 # Copyright Jason Stajich
   9 #
  10 # You may distribute this module under the same terms as perl itself
  11
  12 # POD documentation - main docs before the code
  13
  14 =head1 NAME
  15
  16 Bio::DB::Taxonomy::flatfile - Use the NCBI taxonomy from local indexed flat files
  17
  18 =head1 SYNOPSIS
  19
  20   use Bio::DB::Taxonomy;
  21
  22   my $db = Bio::DB::Taxonomy->new(-source    => 'flatfile' ,
  23                                   -nodesfile => 'nodes.dmp',
  24                                   -namesfile => 'names.dmp');
  25
  26 =head1 DESCRIPTION
  27
  28 This is an implementation of Bio::DB::Taxonomy which stores and accesses the
  29 NCBI taxonomy using flat files stored locally on disk and indexed using the
  30 DB_File module RECNO data structure for fast retrieval.
  31
  32 The required database files, nodes.dmp and names.dmp can be obtained from
  33 ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz
  34
  35 =head1 FEEDBACK
  36
  37 =head2 Mailing Lists
  38
  39 User feedback is an integral part of the evolution of this and other
  40 Bioperl modules. Send your comments and suggestions preferably to
  41 the Bioperl mailing list.  Your participation is much appreciated.
  42
  43   bioperl-l@bioperl.org                  - General discussion
  44   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  45
  46 =head2 Support
  47
  48 Please direct usage questions or support issues to the mailing list:
  49
  50 I<bioperl-l@bioperl.org>
  51
  52 rather than to the module maintainer directly. Many experienced and
  53 reponsive experts will be able look at the problem and quickly
  54 address it. Please include a thorough description of the problem
  55 with code and data examples if at all possible.
  56
  57 =head2 Reporting Bugs
  58
  59 Report bugs to the Bioperl bug tracking system to help us keep track
  60 of the bugs and their resolution. Bug reports can be submitted via
  61 the web:
  62
  63   https://github.com/bioperl/bioperl-live/issues
  64
  65 =head1 AUTHOR - Jason Stajich
  66
  67 Email jason-at-bioperl-dot-org
  68
  69 =head1 CONTRIBUTORS
  70
  71 Sendu Bala: bix@sendu.me.uk
  72
  73 =head1 APPENDIX
  74
  75 The rest of the documentation details each of the object methods.
  76 Internal methods are usually preceded with a _
  77
  78 =cut
  79
  80 # Let the code begin...
  81
  82 package Bio::DB::Taxonomy::flatfile;
  83
  84 use vars qw($DEFAULT_INDEX_DIR $DEFAULT_NODE_INDEX $DEFAULT_NAME2ID_INDEX
  85             $DEFAULT_ID2NAME_INDEX $DEFAULT_PARENT_INDEX @DIVISIONS);
  86
  87 use strict;
  88 use DB_File;
  89 use Bio::Taxon;
  90 use File::Spec::Functions;
  91
  92 use constant SEPARATOR => ':';
  93
  94 $DEFAULT_INDEX_DIR     = $Bio::Root::IO::TEMPDIR; # /tmp
  95 $DEFAULT_NODE_INDEX    = 'nodes';
  96 $DEFAULT_NAME2ID_INDEX = 'names2id';
  97 $DEFAULT_ID2NAME_INDEX = 'id2names';
  98 $DEFAULT_PARENT_INDEX  = 'parents';
  99
 100 $DB_BTREE->{'flags'} = R_DUP; # allow duplicate values in DB_File BTREEs
 101
 102 # 8192 bytes; this seems to work to keep OS X from complaining
 103 $DB_HASH->{'bsize'} = 0x2000;
 104
 105 @DIVISIONS =   ([qw(BCT Bacteria)],
 106                 [qw(INV Invertebrates)],
 107                 [qw(MAM Mammals)],
 108                 [qw(PHG Phages)],
 109                 [qw(PLN Plants)], # (and fungi)
 110                 [qw(PRI Primates)],
 111                 [qw(ROD Rodents)],
 112                 [qw(SYN Synthetic)],
 113                 [qw(UNA Unassigned)],
 114                 [qw(VRL Viruses)],
 115                 [qw(VRT Vertebrates)],
 116                 [qw(ENV 'Environmental samples')]);
 117
 118 use base qw(Bio::DB::Taxonomy);
 119
 120 =head2 new
 121
 122  Title   : new
 123  Usage   : my $obj = Bio::DB::Taxonomy::flatfile->new();
 124  Function: Builds a new Bio::DB::Taxonomy::flatfile object
 125  Returns : an instance of Bio::DB::Taxonomy::flatfile
 126  Args    : -directory => name of directory where index files should be created
 127            -nodesfile => name of file containing nodes (nodes.dmp from NCBI)
 128            -namesfile => name of the file containing names(names.dmp from NCBI)
 129            -force     => 1 to replace current indexes even if they exist
 130
 131 =cut
 132
 133 sub new {
 134   my($class, @args) = @_;
 135
 136   my $self = $class->SUPER::new(@args);
 137   my ($dir,$nodesfile,$namesfile,$force) =
 138       $self->_rearrange([qw(DIRECTORY NODESFILE NAMESFILE FORCE)], @args);
 139
 140   $self->index_directory($dir || $DEFAULT_INDEX_DIR);
 141   if ( $nodesfile ) {
 142           $self->_build_index($nodesfile,$namesfile,$force);
 143   }
 144
 145   $self->_db_connect;
 146   return $self;
 147 }
 148
 149
 150 =head2 Bio::DB::Taxonomy interface implementation
 151
 152 =head2 get_num_taxa
 153
 154  Title   : get_num_taxa
 155  Usage   : my $num = $db->get_num_taxa();
 156  Function: Get the number of taxa stored in the database.
 157  Returns : A number
 158  Args    : None
 159
 160 =cut
 161
 162 sub get_num_taxa {
 163     my ($self) = @_;
 164     if (not exists $self->{_num_taxa}) {
 165         my $num = 0;
 166         while ( my ($parent, undef) = each %{$self->{_parent2children}} ) {
 167            $num++;
 168         }
 169         $self->{_num_taxa} = $num;
 170     }
 171     return $self->{_num_taxa};
 172 }
 173
 174
 175 =head2 get_taxon
 176
 177  Title   : get_taxon
 178  Usage   : my $taxon = $db->get_taxon(-taxonid => $taxonid)
 179  Function: Get a Bio::Taxon object from the database.
 180  Returns : Bio::Taxon object
 181  Args    : just a single value which is the database id, OR named args:
 182            -taxonid => taxonomy id (to query by taxonid)
 183             OR
 184            -name    => string (to query by a taxonomy name: common name,
 185                                scientific name, etc)
 186
 187 =cut
 188
 189 sub get_taxon {
 190     my ($self) = shift;
 191     my ($taxonid, $name);
 192
 193     if (@_ > 1) {
 194         ($taxonid, $name) = $self->_rearrange([qw(TAXONID NAME)],@_);
 195         if ($name) {
 196             ($taxonid, my @others) = $self->get_taxonids($name);
 197             $self->warn("There were multiple ids ($taxonid @others) matching '$name', using '$taxonid'") if @others > 0;
 198         }
 199     }
 200     else {
 201         $taxonid = shift;
 202     }
 203
 204     return unless $taxonid;
 205
 206     $taxonid =~ /^\d+$/ || return;
 207     my $node = $self->{'_nodes'}->[$taxonid] || return;
 208     length($node) || return;
 209     my ($taxid, undef, $rank, $code, $divid, $gen_code, $mito) = split(SEPARATOR,$node);
 210     last unless defined $taxid;
 211     my ($taxon_names) = $self->{'_id2name'}->[$taxid];
 212     my ($sci_name, @common_names) = split(SEPARATOR, $taxon_names);
 213
 214     my $taxon = Bio::Taxon->new(
 215                         -name         => $sci_name,
 216                         -common_names => [@common_names],
 217                         -ncbi_taxid   => $taxid, # since this is a real ncbi taxid, explicitly set it as one
 218                         -rank         => $rank,
 219                         -division     => $DIVISIONS[$divid]->[1],
 220                         -genetic_code => $gen_code,
 221                         -mito_genetic_code => $mito );
 222     # we can't use -dbh or the db_handle() method ourselves or we'll go
 223     # infinite on the merge attempt
 224     $taxon->{'db_handle'} = $self;
 225
 226     $self->_handle_internal_id($taxon);
 227
 228     return $taxon;
 229 }
 230
 231 *get_Taxonomy_Node = \&get_taxon;
 232
 233
 234 =head2 get_taxonids
 235
 236  Title   : get_taxonids
 237  Usage   : my @taxonids = $db->get_taxonids('Homo sapiens');
 238  Function: Searches for a taxonid (typically ncbi_taxon_id) based on a query
 239            string. Note that multiple taxonids can match to the same supplied
 240            name.
 241  Returns : array of integer ids in list context, one of these in scalar context
 242  Args    : string representing taxon's name
 243
 244 =cut
 245
 246 sub get_taxonids {
 247     my ($self, $query) = @_;
 248     my $ids = $self->{'_name2id'}->{lc($query)};
 249     unless ($ids) {
 250         if ($query =~ /_/) {
 251             # try again converting underscores to spaces
 252             $query =~ s/_/ /g;
 253             $ids = $self->{'_name2id'}->{lc($query)};
 254         }
 255         $ids || return;
 256     }
 257     my @ids = split(SEPARATOR, $ids);
 258     return wantarray() ? @ids : shift @ids;
 259 }
 260
 261 *get_taxonid = \&get_taxonids;
 262
 263
 264 =head2 get_Children_Taxids
 265
 266  Title   : get_Children_Taxids
 267  Usage   : my @childrenids = $db->get_Children_Taxids
 268  Function: Get the ids of the children of a node in the taxonomy
 269  Returns : Array of Ids
 270  Args    : Bio::Taxon or a taxon_id
 271  Status  : deprecated (use each_Descendent())
 272
 273 =cut
 274
 275 sub get_Children_Taxids {
 276    my ($self, $node) = @_;
 277    $self->warn("get_Children_Taxids is deprecated, use each_Descendent instead");
 278    my $id;
 279    if( ref($node) ) {
 280        if( $node->can('object_id') ) {
 281            $id = $node->object_id;
 282        } elsif( $node->can('ncbi_taxid') ) {
 283            $id = $node->ncbi_taxid;
 284        } else {
 285            $self->warn("Don't know how to extract a taxon id from the object of type ".ref($node)."\n");
 286            return;
 287        }
 288    } else { $id = $node }
 289    my @vals = $self->{'_parentbtree'}->get_dup($id);
 290    return @vals;
 291 }
 292
 293
 294 =head2 ancestor
 295
 296  Title   : ancestor
 297  Usage   : my $ancestor_taxon = $db->ancestor($taxon)
 298  Function: Retrieve the full ancestor taxon of a supplied Taxon from the
 299            database.
 300  Returns : Bio::Taxon
 301  Args    : Bio::Taxon (that was retrieved from this database)
 302
 303 =cut
 304
 305 sub ancestor {
 306     my ($self, $taxon) = @_;
 307     $self->throw("Must supply a Bio::Taxon") unless ref($taxon) && $taxon->isa('Bio::Taxon');
 308     $self->throw("The supplied Taxon must belong to this database") unless $taxon->db_handle && $taxon->db_handle eq $self;
 309     my $id = $taxon->id || $self->throw("The supplied Taxon is missing its id!");
 310
 311     my $node = $self->{'_nodes'}->[$id];
 312     if (length($node)) {
 313         my (undef, $parent_id) = split(SEPARATOR,$node);
 314         $parent_id || return;
 315         $parent_id eq $id && return; # one of the roots
 316         return $self->get_taxon($parent_id);
 317     }
 318     return;
 319 }
 320
 321
 322 =head2 each_Descendent
 323
 324  Title   : each_Descendent
 325  Usage   : my @taxa = $db->each_Descendent($taxon);
 326  Function: Get all the descendents of the supplied Taxon (but not their
 327            descendents, ie. not a recursive fetchall).
 328  Returns : Array of Bio::Taxon objects
 329  Args    : Bio::Taxon (that was retrieved from this database)
 330
 331 =cut
 332
 333 sub each_Descendent {
 334     my ($self, $taxon) = @_;
 335     $self->throw("Must supply a Bio::Taxon") unless ref($taxon) && $taxon->isa('Bio::Taxon');
 336     $self->throw("The supplied Taxon must belong to this database") unless $taxon->db_handle && $taxon->db_handle eq $self;
 337     my $id = $taxon->id || $self->throw("The supplied Taxon is missing its id!");
 338
 339     my @desc_ids = $self->{'_parentbtree'}->get_dup($id);
 340     my @descs;
 341     foreach my $desc_id (@desc_ids) {
 342         push(@descs, $self->get_taxon($desc_id) || next);
 343     }
 344     return @descs;
 345 }
 346
 347
 348 =head2 Helper methods
 349
 350 =cut
 351
 352 # internal method which does the indexing
 353 sub _build_index {
 354     my ($self, $nodesfile, $namesfile, $force) = @_;
 355
 356     my $dir = $self->index_directory;
 357     my $nodeindex         = catfile($dir, $DEFAULT_NODE_INDEX);
 358     my $name2idindex      = catfile($dir, $DEFAULT_NAME2ID_INDEX);
 359     my $id2nameindex      = catfile($dir, $DEFAULT_ID2NAME_INDEX);
 360     my $parent2childindex = catfile($dir, $DEFAULT_PARENT_INDEX);
 361     $self->{'_nodes'}           = [];
 362     $self->{'_id2name'}         = [];
 363     $self->{'_name2id'}         = {};
 364     $self->{'_parent2children'} = {};
 365
 366     if (! -e $nodeindex || $force) {
 367         my (%parent2children,@nodes);
 368         open my $NODES, '<', $nodesfile
 369             or $self->throw("Could not read node file '$nodesfile': $!");
 370
 371         unlink $nodeindex;
 372         unlink $parent2childindex;
 373         my $nh = tie ( @nodes, 'DB_File', $nodeindex, O_RDWR|O_CREAT, 0644, $DB_RECNO) ||
 374             $self->throw("Cannot open file '$nodeindex': $!");
 375         my $btree = tie( %parent2children, 'DB_File', $parent2childindex, O_RDWR|O_CREAT, 0644, $DB_BTREE) ||
 376             $self->throw("Cannot tie to file '$parent2childindex': $!");
 377
 378         while (<$NODES>) {
 379             next if /^$/;
 380             chomp;
 381             my ($taxid,$parent,$rank,$code,$divid,undef,$gen_code,undef,$mito) = split(/\t\|\t/,$_);
 382             # don't include the fake root node 'root' with id 1; we essentially have multiple roots here
 383             next if $taxid == 1;
 384             if ($parent == 1) {
 385                 $parent = $taxid;
 386             }
 387
 388             # keep this stringified
 389             $nodes[$taxid] = join(SEPARATOR, ($taxid,$parent,$rank,$code,$divid,$gen_code,$mito));
 390             $btree->put($parent,$taxid);
 391         }
 392         close $NODES;
 393
 394         $nh = $btree = undef;
 395         untie @nodes ;
 396         untie %parent2children;
 397     }
 398
 399     if ((! -e $name2idindex || -z $name2idindex) || (! -e $id2nameindex || -z $id2nameindex) || $force) {
 400         open my $NAMES, '<', $namesfile
 401             or $self->throw("Could not read names file '$namesfile': $!");
 402
 403         unlink $name2idindex;
 404         unlink $id2nameindex;
 405         my (@id2name,%name2id);
 406         my $idh = tie (@id2name, 'DB_File', $id2nameindex, O_RDWR|O_CREAT, 0644, $DB_RECNO) ||
 407             $self->throw("Cannot tie to file '$id2nameindex': $!");
 408         my $nameh = tie ( %name2id, 'DB_File', $name2idindex, O_RDWR|O_CREAT, 0644, $DB_HASH) ||
 409             $self->throw("Cannot tie to file '$name2idindex': $!");
 410
 411         while (<$NAMES>) {
 412             next if /^$/;
 413             chomp;
 414             my ($taxid, $name, $unique_name, $class) = split(/\t\|\t/,$_);
 415             # don't include the fake root node 'root' or 'all' with id 1
 416             next if $taxid == 1;
 417
 418             $class =~ s/\s+\|\s*$//;
 419             my $lc_name = lc($name);
 420             my $orig_name = $name;
 421
 422             # unique names aren't always in the correct column, sometimes they
 423             # are uniqued by adding bracketed rank names to the normal name;
 424             # store the uniqued version then fix the name for normal use
 425             if ($lc_name =~ /\(class\)$/) { # it seems that only rank of class is ever used in this situation
 426                 $name2id{$lc_name} = $taxid;
 427                 $name =~ s/\s+\(class\)$//;
 428                 $lc_name = lc($name);
 429             }
 430
 431             # handle normal names which aren't necessarily unique
 432             my $taxids = $name2id{$lc_name} || '';
 433             my %taxids = map { $_ => 1 } split(SEPARATOR, $taxids);
 434             unless (exists $taxids{$taxid}) {
 435                 $taxids{$taxid} = 1;
 436                 $name2id{$lc_name} = join(SEPARATOR, keys %taxids);
 437             }
 438
 439             # store unique names in name2id
 440             if ($unique_name) {
 441                 $name2id{lc($unique_name)} = $taxid;
 442             }
 443
 444             # store all names in id2name array
 445             my $names = $id2name[$taxid] || '';
 446             my @names = split(SEPARATOR, $names);
 447             if ($class && $class eq 'scientific name') {
 448                 # the scientific name should be the first name stored
 449                 unshift(@names, $name);
 450                 push(@names, $orig_name) if ($orig_name ne $name);
 451                 push(@names, $unique_name) if $unique_name;
 452             }
 453             else {
 454                 # all other ('common' in this simplification) names get added after
 455                 push(@names, $name);
 456                 push(@names, $orig_name) if ($orig_name ne $name);
 457                 push(@names, $unique_name) if $unique_name;
 458             }
 459             $id2name[$taxid] = join(SEPARATOR, @names);
 460         }
 461         close $NAMES;
 462
 463         $idh = $nameh = undef;
 464         untie( %name2id);
 465         untie( @id2name);
 466     }
 467 }
 468
 469
 470 # connect the internal db handle
 471 sub _db_connect {
 472     my $self = shift;
 473     return if $self->{'_initialized'};
 474
 475     my $dir = $self->index_directory;
 476     my $nodeindex         = catfile($dir, $DEFAULT_NODE_INDEX);
 477     my $name2idindex      = catfile($dir, $DEFAULT_NAME2ID_INDEX);
 478     my $id2nameindex      = catfile($dir, $DEFAULT_ID2NAME_INDEX);
 479     my $parent2childindex = catfile($dir, $DEFAULT_PARENT_INDEX);
 480     $self->{'_nodes'}           = [];
 481     $self->{'_id2name'}         = [];
 482     $self->{'_name2id'}         = {};
 483     $self->{'_parent2children'} = {};
 484
 485     if( ! -e $nodeindex ||
 486         ! -e $name2idindex ||
 487         ! -e $id2nameindex ) {
 488         $self->warn("Index files have not been created");
 489         return 0;
 490     }
 491     tie ( @{$self->{'_nodes'}}, 'DB_File', $nodeindex, O_RDONLY,undef, $DB_RECNO)
 492         || $self->throw("$! $nodeindex");
 493     tie (@{$self->{'_id2name'}}, 'DB_File', $id2nameindex,O_RDONLY, undef,
 494         $DB_RECNO) || $self->throw("$! $id2nameindex");
 495
 496     tie ( %{$self->{'_name2id'}}, 'DB_File', $name2idindex, O_RDONLY,undef,
 497         $DB_HASH) || $self->throw("$! $name2idindex");
 498     $self->{'_parentbtree'} = tie( %{$self->{'_parent2children'}},
 499                                    'DB_File', $parent2childindex,
 500                                    O_RDONLY, 0644, $DB_BTREE);
 501
 502     $self->{'_initialized'} = 1;
 503 }
 504
 505
 506 =head2 index_directory
 507
 508  Title   : index_directory
 509  Function : Get/set the location that index files are stored. (this module
 510            will index the supplied database)
 511  Usage   : $obj->index_directory($newval)
 512  Returns : value of index_directory (a scalar)
 513  Args    : on set, new value (a scalar or undef, optional)
 514
 515 =cut
 516
 517
 518 sub index_directory {
 519     my $self = shift;
 520     return $self->{'index_directory'} = shift if @_;
 521     return $self->{'index_directory'};
 522 }
 523
 524
 525 sub DESTROY {
 526     my $self = shift;
 527     # Destroy all filehandle references
 528     # to be able to remove temporary files
 529     undef $self->{_id2name};
 530     undef $self->{_name2id};
 531     undef $self->{_nodes};
 532     undef $self->{_parent2children};
 533     undef $self->{_parentbtree};
 534
 535     # Treat index files as temporary and delete them now if
 536     # 'index_directory' match $DEFAULT_INDEX_DIR (which means
 537     # that no "-directory" was specified or is an explicit
 538     # temporary file)
 539     my $default_temp = quotemeta $DEFAULT_INDEX_DIR;
 540     if ($self->{index_directory} =~ m/^$default_temp/) {
 541         unlink catfile($self->{index_directory},'id2names');
 542         unlink catfile($self->{index_directory},'names2id');
 543         unlink catfile($self->{index_directory},'nodes');
 544         unlink catfile($self->{index_directory},'parents');
 545     }
 546 }
 547
 548 1;