Bio::DB::Universal: move into its own distribution
[bioperl-live.git] / Bio / DB / Flat.pm
blobf285227ff1c1e998b29c293dd130c99eb0b941f1
3 # BioPerl module for Bio::DB::Flat
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Lincoln Stein <lstein@cshl.org>
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 =head1 NAME
15 Bio::DB::Flat - Interface for indexed flat files
17 =head1 SYNOPSIS
19 $db = Bio::DB::Flat->new(-directory => '/usr/share/embl',
20 -dbname => 'mydb',
21 -format => 'embl',
22 -index => 'bdb',
23 -write_flag => 1);
24 $db->build_index('/usr/share/embl/primate.embl',
25 '/usr/share/embl/protists.embl');
26 $seq = $db->get_Seq_by_id('HSFOS');
27 @sequences = $db->get_Seq_by_acc('DIV' => 'primate');
28 $raw = $db->fetch_raw('HSFOS');
30 =head1 DESCRIPTION
32 This object provides the basic mechanism to associate positions in
33 files with primary and secondary name spaces. Unlike
34 Bio::Index::Abstract (see L<Bio::Index::Abstract>), this is specialized
35 to work with the "flat index" and BerkeleyDB indexed flat file formats
36 worked out at the 2002 BioHackathon.
38 This object is a general front end to the underlying databases.
40 =head1 FEEDBACK
42 =head2 Mailing Lists
44 User feedback is an integral part of the evolution of this and other
45 Bioperl modules. Send your comments and suggestions preferably to one
46 of the Bioperl mailing lists. Your participation is much appreciated.
48 bioperl-l@bioperl.org - General discussion
49 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
51 =head2 Support
53 Please direct usage questions or support issues to the mailing list:
55 I<bioperl-l@bioperl.org>
57 rather than to the module maintainer directly. Many experienced and
58 reponsive experts will be able look at the problem and quickly
59 address it. Please include a thorough description of the problem
60 with code and data examples if at all possible.
62 =head2 Reporting Bugs
64 Report bugs to the Bioperl bug tracking system to help us keep track
65 the bugs and their resolution. Bug reports can be submitted via the
66 web:
68 https://github.com/bioperl/bioperl-live/issues
70 =head1 AUTHOR - Lincoln Stein
72 Email - lstein@cshl.org
74 =head1 APPENDIX
76 The rest of the documentation details each of the object methods. Internal
77 methods are usually preceded with an "_" (underscore).
79 =cut
82 # Let the code begin...
83 package Bio::DB::Flat;
85 use File::Spec;
87 use base qw(Bio::Root::Root Bio::DB::RandomAccessI);
89 use constant CONFIG_FILE_NAME => 'config.dat';
91 =head2 new
93 Title : new
94 Usage : my $db = Bio::DB::Flat->new(
95 -directory => $root_directory,
96 -dbname => 'mydb',
97 -write_flag => 1,
98 -index => 'bdb',
99 -verbose => 0,
100 -out => 'outputfile',
101 -format => 'genbank');
102 Function: create a new Bio::DB::Flat object
103 Returns : new Bio::DB::Flat object
104 Args : -directory Root directory containing "config.dat"
105 -write_flag If true, allows creation/updating.
106 -verbose Verbose messages
107 -out File to write to when write_seq invoked
108 -index 'bdb' or 'binarysearch'
109 Status : Public
111 The required -directory argument indicates where the flat file indexes
112 will be stored. The build_index() and write_seq() methods will
113 automatically create subdirectories of this root directory. Each
114 subdirectory will contain a human-readable configuration file named
115 "config.dat" that specifies where the individual indexes are stored.
117 The required -dbname argument gives a name to the database index. The
118 index files will actually be stored in a like-named subdirectory
119 underneath the root directory.
121 The -write_flag enables writing new entries into the database as well
122 as the creation of the indexes. By default the indexes will be opened
123 read only.
125 -index is one of "bdb" or "binarysearch" and indicates the type of
126 index to generate. "bdb" corresponds to Berkeley DB. You *must* be
127 using BerkeleyDB version 2 or higher, and have the Perl BerkeleyDB
128 extension installed (DB_File will *not* work). "binarysearch"
129 corresponds to the OBDA "flat" indexed file.
131 The -out argument specifies the output file for writing objects created
132 with write_seq().
134 The -format argument specifies the format of the input file or files. If
135 the file suffix is one that Bioperl can already associate with a format
136 then this is optional.
138 =cut
140 sub new {
141 my $class = shift;
142 $class = ref($class) if ref($class);
143 my $self = $class->SUPER::new(@_);
145 # first we initialize ourselves
146 my ($flat_directory,$dbname,$format) =
147 $self->_rearrange([qw(DIRECTORY DBNAME FORMAT)],@_);
149 defined $flat_directory
150 or $self->throw('Please supply a -directory argument');
151 defined $dbname
152 or $self->throw('Please supply a -dbname argument');
154 # set values from configuration file
155 $self->directory($flat_directory);
156 $self->dbname($dbname);
158 $self->throw("Base directory $flat_directory doesn't exist")
159 unless -e $flat_directory;
160 $self->throw("$flat_directory isn't a directory")
161 unless -d _;
162 my $dbpath = File::Spec->catfile($flat_directory,$dbname);
163 unless (-d $dbpath) {
164 $self->debug("creating db directory $dbpath\n");
165 mkdir $dbpath,0777 or $self->throw("Can't create $dbpath: $!");
167 $self->_read_config();
169 # but override with initialization values
170 $self->_initialize(@_);
172 $self->throw('you must specify an indexing scheme')
173 unless $self->indexing_scheme;
175 # now we figure out what subclass to instantiate
176 my $index_type = $self->indexing_scheme eq 'BerkeleyDB/1' ? 'BDB'
177 :$self->indexing_scheme eq 'flat/1' ? 'Binary'
178 :$self->throw("unknown indexing scheme: " .
179 $self->indexing_scheme);
180 $format = $self->file_format;
182 # because Michele and Lincoln did it differently
183 # Michele's way is via a standalone concrete class
184 if ($index_type eq 'Binary') {
185 my $child_class = 'Bio::DB::Flat::BinarySearch';
186 eval "use $child_class";
187 $self->throw($@) if $@;
188 push @_, ('-format', $format);
189 return $child_class->new(@_);
192 # Lincoln uses Bio::SeqIO style delegation.
193 my $child_class= "Bio\:\:DB\:\:Flat\:\:$index_type\:\:\L$format";
194 eval "use $child_class";
195 $self->throw($@) if $@;
197 # rebless & reinitialize with the new class
198 # (this prevents subclasses from forgetting to call our own initialization)
199 bless $self,$child_class;
200 $self->_initialize(@_);
201 $self->_set_namespaces(@_);
203 $self;
206 sub _initialize {
207 my $self = shift;
209 my ($flat_write_flag,$dbname,$flat_indexing,$flat_verbose,$flat_outfile,$flat_format)
210 = $self->_rearrange([qw(WRITE_FLAG DBNAME INDEX VERBOSE OUT FORMAT)],@_);
212 $self->write_flag($flat_write_flag) if defined $flat_write_flag;
214 if (defined $flat_indexing) {
215 # very permissive
216 $flat_indexing = 'BerkeleyDB/1' if $flat_indexing =~ /bdb/;
217 $flat_indexing = 'flat/1' if $flat_indexing =~ /^(flat|binary)/;
218 $self->indexing_scheme($flat_indexing);
221 $self->verbose($flat_verbose) if defined $flat_verbose;
222 $self->dbname($dbname) if defined $dbname;
223 $self->out_file($flat_outfile) if defined $flat_outfile;
224 $self->file_format($flat_format) if defined $flat_format;
227 sub _set_namespaces {
228 my $self = shift;
230 $self->primary_namespace($self->default_primary_namespace)
231 unless defined $self->{flat_primary_namespace};
233 $self->secondary_namespaces($self->default_secondary_namespaces)
234 unless defined $self->{flat_secondary_namespaces};
236 $self->file_format($self->default_file_format)
237 unless defined $self->{flat_format};
240 =head2 new_from_registry
242 Title : new_from_registry
243 Usage : $db = Bio::DB::Flat->new_from_registry(%config)
244 Function: creates a new Bio::DB::Flat object in a Bio::DB::Registry-
245 compatible fashion
246 Returns : new Bio::DB::Flat
247 Args : provided by the registry, see below
248 Status : Public
250 The following registry-configuration tags are recognized:
252 location Root of the indexed flat file; corresponds to the new() method's
253 -directory argument.
255 =cut
257 sub new_from_registry {
258 my ($self,%config) = @_;
259 my $location = $config{'location'} or
260 $self->throw('location tag must be specified.');
261 my $dbname = $config{'dbname'} or
262 $self->throw('dbname tag must be specified.');
264 my $db = $self->new(-directory => $location,
265 -dbname => $dbname,
267 $db;
270 # accessors
271 sub directory {
272 my $self = shift;
273 my $d = $self->{flat_directory};
274 $self->{flat_directory} = shift if @_;
277 sub write_flag {
278 my $self = shift;
279 my $d = $self->{flat_write_flag};
280 $self->{flat_write_flag} = shift if @_;
283 sub verbose {
284 my $self = shift;
285 my $d = $self->{flat_verbose};
286 $self->{flat_verbose} = shift if @_;
289 sub out_file {
290 my $self = shift;
291 my $d = $self->{flat_outfile};
292 $self->{flat_outfile} = shift if @_;
295 sub dbname {
296 my $self = shift;
297 my $d = $self->{flat_dbname};
298 $self->{flat_dbname} = shift if @_;
301 sub primary_namespace {
302 my $self = shift;
303 my $d = $self->{flat_primary_namespace};
304 $self->{flat_primary_namespace} = shift if @_;
308 # get/set secondary namespace(s)
309 # pass an array ref.
310 # get an array ref in scalar context, list in list context.
311 sub secondary_namespaces {
312 my $self = shift;
313 my $d = $self->{flat_secondary_namespaces};
314 $self->{flat_secondary_namespaces} = (ref($_[0]) eq 'ARRAY' ? shift : [@_]) if @_;
315 return unless $d;
316 $d = [$d] if $d && ref($d) ne 'ARRAY'; # just paranoia
317 return wantarray ? @$d : $d;
320 # return the file format
321 sub file_format {
322 my $self = shift;
323 my $d = $self->{flat_format};
324 $self->{flat_format} = shift if @_;
328 # return the alphabet
329 sub alphabet {
330 my $self = shift;
331 my $d = $self->{flat_alphabet};
332 $self->{flat_alphabet} = shift if @_;
336 sub parse_one_record {
337 my $self = shift;
338 my $fh = shift;
339 my $parser =
340 $self->{cached_parsers}{fileno($fh)}
341 ||= Bio::SeqIO->new(-fh=>$fh,-format=>$self->default_file_format);
342 my $seq = $parser->next_seq or return;
343 $self->{flat_alphabet} ||= $seq->alphabet;
344 my $ids = $self->seq_to_ids($seq);
345 return $ids;
349 # return the indexing scheme
350 sub indexing_scheme {
351 my $self = shift;
352 my $d = $self->{flat_indexing};
353 $self->{flat_indexing} = shift if @_;
357 sub add_flat_file {
358 my $self = shift;
359 my ($file_path,$file_length,$nf) = @_;
361 # check that file_path is absolute
362 unless (File::Spec->file_name_is_absolute($file_path)) {
363 $file_path = File::Spec->rel2abs($file_path);
366 -r $file_path or $self->throw("flat file $file_path cannot be read: $!");
368 my $current_size = -s _;
369 if (defined $file_length) {
370 $current_size == $file_length
371 or $self->throw("flat file $file_path has changed size. Was $file_length bytes; now $current_size");
372 } else {
373 $file_length = $current_size;
376 unless (defined $nf) {
377 $self->{flat_file_index} = 0 unless exists $self->{flat_file_index};
378 $nf = $self->{flat_file_index}++;
380 $self->{flat_flat_file_path}{$nf} = $file_path;
381 $self->{flat_flat_file_no}{$file_path} = $nf;
382 $nf;
385 sub write_config {
386 my $self = shift;
387 $self->write_flag or $self->throw("cannot write configuration file because write_flag is not set");
388 my $path = $self->_config_path;
390 open my $F, '>', $path or $self->throw("Could not write file '$path': $!");
392 my $index_type = $self->indexing_scheme;
393 print $F "index\t$index_type\n";
395 my $format = $self->file_format;
396 my $alphabet = $self->alphabet;
397 my $alpha = $alphabet ? "/$alphabet" : '';
398 print $F "format\tURN:LSID:open-bio.org:${format}${alpha}\n";
400 my @filenos = $self->_filenos or $self->throw("cannot write config file because no flat files defined");
401 for my $nf (@filenos) {
402 my $path = $self->{flat_flat_file_path}{$nf};
403 my $size = -s $path;
404 print $F join("\t","fileid_$nf",$path,$size),"\n";
407 # write primary namespace
408 my $primary_ns = $self->primary_namespace
409 or $self->throw('cannot write config file because no primary namespace defined');
411 print $F join("\t",'primary_namespace',$primary_ns),"\n";
413 # write secondary namespaces
414 my @secondary = $self->secondary_namespaces;
415 print $F join("\t",'secondary_namespaces',@secondary),"\n";
417 close $F or $self->throw("close error on $path: $!");
420 sub files {
421 my $self = shift;
422 return unless $self->{flat_flat_file_no};
423 return keys %{$self->{flat_flat_file_no}};
426 sub write_seq {
427 my $self = shift;
428 my $seq = shift;
430 $self->write_flag or $self->throw("cannot write sequences because write_flag is not set");
432 my $file = $self->out_file or $self->throw('no outfile defined; use the -out argument to new()');
433 my $seqio = $self->{flat_cached_parsers}{$file}
434 ||= Bio::SeqIO->new(-Format => $self->file_format,
435 -file => ">$file")
436 or $self->throw("couldn't create Bio::SeqIO object");
438 my $fh = $seqio->_fh or $self->throw("couldn't get filehandle from Bio::SeqIO object");
439 my $offset = tell($fh);
440 $seqio->write_seq($seq);
441 my $length = tell($fh)-$offset;
442 my $ids = $self->seq_to_ids($seq);
443 $self->_store_index($ids,$file,$offset,$length);
445 $self->{flat_outfile_dirty}++;
448 sub close {
449 my $self = shift;
450 return unless $self->{flat_outfile_dirty};
451 $self->write_config;
452 delete $self->{flat_outfile_dirty};
453 delete $self->{flat_cached_parsers}{$self->out_file};
457 sub _filenos {
458 my $self = shift;
459 return unless $self->{flat_flat_file_path};
460 return keys %{$self->{flat_flat_file_path}};
463 # read the configuration file
464 sub _read_config {
465 my $self = shift;
466 my $path = $self->_config_path;
467 return unless -e $path;
469 open my $F, '<', $path or $self->throw("Could not read file '$path': $!");
470 my %config;
471 while (<$F>) {
472 chomp;
473 my ($tag,@values) = split "\t";
474 $config{$tag} = \@values;
476 CORE::close $F or $self->throw("close error on $path: $!");
478 $config{index}[0] =~ m~(flat/1|BerkeleyDB/1)~
479 or $self->throw("invalid configuration file $path: no index line");
481 $self->indexing_scheme($1);
483 if ($config{format}) {
484 # handle LSID format
485 if ($config{format}[0] =~ /^URN:LSID:open-bio\.org:(\w+)(?:\/(\w+))/) {
486 $self->file_format($1);
487 $self->alphabet($2);
488 } else { # compatibility with older versions
489 $self->file_format($config{format}[0]);
493 # set up primary namespace
494 my $primary_namespace = $config{primary_namespace}[0]
495 or $self->throw("invalid configuration file $path: no primary namespace defined");
496 $self->primary_namespace($primary_namespace);
498 # set up secondary namespaces (may be empty)
499 $self->secondary_namespaces($config{secondary_namespaces});
501 # get file paths and their normalization information
502 my @normalized_files = grep {$_ ne ''} map {/^fileid_(\S+)/ && $1} keys %config;
503 for my $nf (@normalized_files) {
504 my ($file_path,$file_length) = @{$config{"fileid_${nf}"}};
505 $self->add_flat_file($file_path,$file_length,$nf);
511 sub _config_path {
512 my $self = shift;
513 $self->_catfile($self->_config_name);
516 sub _catfile {
517 my $self = shift;
518 my $component = shift;
519 File::Spec->catfile($self->directory,$self->dbname,$component);
522 sub _config_name { CONFIG_FILE_NAME }
524 sub _path2fileno {
525 my $self = shift;
526 my $path = shift;
527 return $self->add_flat_file($path)
528 unless exists $self->{flat_flat_file_no}{$path};
529 $self->{flat_flat_file_no}{$path};
532 sub _fileno2path {
533 my $self = shift;
534 my $fileno = shift;
535 $self->{flat_flat_file_path}{$fileno};
538 sub _files {
539 my $self = shift;
540 my $paths = $self->{flat_flat_file_no};
541 return keys %$paths;
544 =head2 fetch
546 Title : fetch
547 Usage : $index->fetch( $id )
548 Function: Returns a Bio::Seq object from the index
549 Example : $seq = $index->fetch( 'dJ67B12' )
550 Returns : Bio::Seq object
551 Args : ID
553 Deprecated. Use get_Seq_by_id instead.
555 =cut
557 sub fetch { shift->get_Seq_by_id(@_) }
560 =head2 To Be Implemented in Subclasses
562 The following methods MUST be implemented by subclasses.
564 =cut
566 # create real live Bio::Seq object
567 sub get_Seq_by_id {
568 my $self = shift;
569 my $id = shift;
570 $self->throw_not_implemented;
574 # fetch array of Bio::Seq objects
575 sub get_Seq_by_acc {
576 my $self = shift;
577 return $self->get_Seq_by_id(shift) if @_ == 1;
578 my ($ns,$key) = @_;
580 $self->throw_not_implemented;
583 sub fetch_raw {
584 my ($self,$id,$namespace) = @_;
585 $self->throw_not_implemented;
588 sub default_file_format {
589 my $self = shift;
590 $self->throw_not_implemented;
593 sub _store_index {
594 my $self = shift;
595 my ($ids,$file,$offset,$length) = @_;
596 $self->throw_not_implemented;
599 =head2 May Be Overridden in Subclasses
601 The following methods MAY be overridden by subclasses.
603 =cut
605 sub default_primary_namespace {
606 return "ACC";
609 sub default_secondary_namespaces {
610 return;
613 sub seq_to_ids {
614 my $self = shift;
615 my $seq = shift;
616 my %ids;
617 $ids{$self->primary_namespace} = $seq->accession_number;
618 \%ids;
621 sub DESTROY {
622 my $self = shift;
623 $self->close;