3 # BioPerl module for Bio::DB::Flat
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Lincoln Stein <lstein@cshl.org>
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
15 Bio::DB::Flat - Interface for indexed flat files
19 $db = Bio::DB::Flat->new(-directory => '/usr/share/embl',
24 $db->build_index('/usr/share/embl/primate.embl',
25 '/usr/share/embl/protists.embl');
26 $seq = $db->get_Seq_by_id('HSFOS');
27 @sequences = $db->get_Seq_by_acc('DIV' => 'primate');
28 $raw = $db->fetch_raw('HSFOS');
32 This object provides the basic mechanism to associate positions in
33 files with primary and secondary name spaces. Unlike
34 Bio::Index::Abstract (see L<Bio::Index::Abstract>), this is specialized
35 to work with the "flat index" and BerkeleyDB indexed flat file formats
36 worked out at the 2002 BioHackathon.
38 This object is a general front end to the underlying databases.
44 User feedback is an integral part of the evolution of this and other
45 Bioperl modules. Send your comments and suggestions preferably to one
46 of the Bioperl mailing lists. Your participation is much appreciated.
48 bioperl-l@bioperl.org - General discussion
49 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
53 Please direct usage questions or support issues to the mailing list:
55 I<bioperl-l@bioperl.org>
57 rather than to the module maintainer directly. Many experienced and
58 reponsive experts will be able look at the problem and quickly
59 address it. Please include a thorough description of the problem
60 with code and data examples if at all possible.
64 Report bugs to the Bioperl bug tracking system to help us keep track
65 the bugs and their resolution. Bug reports can be submitted via the
68 https://github.com/bioperl/bioperl-live/issues
70 =head1 AUTHOR - Lincoln Stein
72 Email - lstein@cshl.org
76 The rest of the documentation details each of the object methods. Internal
77 methods are usually preceded with an "_" (underscore).
82 # Let the code begin...
83 package Bio
::DB
::Flat
;
87 use base
qw(Bio::Root::Root Bio::DB::RandomAccessI);
89 use constant CONFIG_FILE_NAME
=> 'config.dat';
94 Usage : my $db = Bio::DB::Flat->new(
95 -directory => $root_directory,
100 -out => 'outputfile',
101 -format => 'genbank');
102 Function: create a new Bio::DB::Flat object
103 Returns : new Bio::DB::Flat object
104 Args : -directory Root directory containing "config.dat"
105 -write_flag If true, allows creation/updating.
106 -verbose Verbose messages
107 -out File to write to when write_seq invoked
108 -index 'bdb' or 'binarysearch'
111 The required -directory argument indicates where the flat file indexes
112 will be stored. The build_index() and write_seq() methods will
113 automatically create subdirectories of this root directory. Each
114 subdirectory will contain a human-readable configuration file named
115 "config.dat" that specifies where the individual indexes are stored.
117 The required -dbname argument gives a name to the database index. The
118 index files will actually be stored in a like-named subdirectory
119 underneath the root directory.
121 The -write_flag enables writing new entries into the database as well
122 as the creation of the indexes. By default the indexes will be opened
125 -index is one of "bdb" or "binarysearch" and indicates the type of
126 index to generate. "bdb" corresponds to Berkeley DB. You *must* be
127 using BerkeleyDB version 2 or higher, and have the Perl BerkeleyDB
128 extension installed (DB_File will *not* work). "binarysearch"
129 corresponds to the OBDA "flat" indexed file.
131 The -out argument specifies the output file for writing objects created
134 The -format argument specifies the format of the input file or files. If
135 the file suffix is one that Bioperl can already associate with a format
136 then this is optional.
142 $class = ref($class) if ref($class);
143 my $self = $class->SUPER::new
(@_);
145 # first we initialize ourselves
146 my ($flat_directory,$dbname,$format) =
147 $self->_rearrange([qw(DIRECTORY DBNAME FORMAT)],@_);
149 defined $flat_directory
150 or $self->throw('Please supply a -directory argument');
152 or $self->throw('Please supply a -dbname argument');
154 # set values from configuration file
155 $self->directory($flat_directory);
156 $self->dbname($dbname);
158 $self->throw("Base directory $flat_directory doesn't exist")
159 unless -e
$flat_directory;
160 $self->throw("$flat_directory isn't a directory")
162 my $dbpath = File
::Spec
->catfile($flat_directory,$dbname);
163 unless (-d
$dbpath) {
164 $self->debug("creating db directory $dbpath\n");
165 mkdir $dbpath,0777 or $self->throw("Can't create $dbpath: $!");
167 $self->_read_config();
169 # but override with initialization values
170 $self->_initialize(@_);
172 $self->throw('you must specify an indexing scheme')
173 unless $self->indexing_scheme;
175 # now we figure out what subclass to instantiate
176 my $index_type = $self->indexing_scheme eq 'BerkeleyDB/1' ?
'BDB'
177 :$self->indexing_scheme eq 'flat/1' ?
'Binary'
178 :$self->throw("unknown indexing scheme: " .
179 $self->indexing_scheme);
180 $format = $self->file_format;
182 # because Michele and Lincoln did it differently
183 # Michele's way is via a standalone concrete class
184 if ($index_type eq 'Binary') {
185 my $child_class = 'Bio::DB::Flat::BinarySearch';
186 eval "use $child_class";
187 $self->throw($@
) if $@
;
188 push @_, ('-format', $format);
189 return $child_class->new(@_);
192 # Lincoln uses Bio::SeqIO style delegation.
193 my $child_class= "Bio\:\:DB\:\:Flat\:\:$index_type\:\:\L$format";
194 eval "use $child_class";
195 $self->throw($@
) if $@
;
197 # rebless & reinitialize with the new class
198 # (this prevents subclasses from forgetting to call our own initialization)
199 bless $self,$child_class;
200 $self->_initialize(@_);
201 $self->_set_namespaces(@_);
209 my ($flat_write_flag,$dbname,$flat_indexing,$flat_verbose,$flat_outfile,$flat_format)
210 = $self->_rearrange([qw(WRITE_FLAG DBNAME INDEX VERBOSE OUT FORMAT)],@_);
212 $self->write_flag($flat_write_flag) if defined $flat_write_flag;
214 if (defined $flat_indexing) {
216 $flat_indexing = 'BerkeleyDB/1' if $flat_indexing =~ /bdb
/;
217 $flat_indexing = 'flat/1' if $flat_indexing =~ /^(flat
|binary
)/;
218 $self->indexing_scheme($flat_indexing);
221 $self->verbose($flat_verbose) if defined $flat_verbose;
222 $self->dbname($dbname) if defined $dbname;
223 $self->out_file($flat_outfile) if defined $flat_outfile;
224 $self->file_format($flat_format) if defined $flat_format;
227 sub _set_namespaces
{
230 $self->primary_namespace($self->default_primary_namespace)
231 unless defined $self->{flat_primary_namespace
};
233 $self->secondary_namespaces($self->default_secondary_namespaces)
234 unless defined $self->{flat_secondary_namespaces
};
236 $self->file_format($self->default_file_format)
237 unless defined $self->{flat_format
};
240 =head2 new_from_registry
242 Title : new_from_registry
243 Usage : $db = Bio::DB::Flat->new_from_registry(%config)
244 Function: creates a new Bio::DB::Flat object in a Bio::DB::Registry-
246 Returns : new Bio::DB::Flat
247 Args : provided by the registry, see below
250 The following registry-configuration tags are recognized:
252 location Root of the indexed flat file; corresponds to the new() method's
257 sub new_from_registry
{
258 my ($self,%config) = @_;
259 my $location = $config{'location'} or
260 $self->throw('location tag must be specified.');
261 my $dbname = $config{'dbname'} or
262 $self->throw('dbname tag must be specified.');
264 my $db = $self->new(-directory
=> $location,
273 my $d = $self->{flat_directory
};
274 $self->{flat_directory
} = shift if @_;
279 my $d = $self->{flat_write_flag
};
280 $self->{flat_write_flag
} = shift if @_;
285 my $d = $self->{flat_verbose
};
286 $self->{flat_verbose
} = shift if @_;
291 my $d = $self->{flat_outfile
};
292 $self->{flat_outfile
} = shift if @_;
297 my $d = $self->{flat_dbname
};
298 $self->{flat_dbname
} = shift if @_;
301 sub primary_namespace
{
303 my $d = $self->{flat_primary_namespace
};
304 $self->{flat_primary_namespace
} = shift if @_;
308 # get/set secondary namespace(s)
310 # get an array ref in scalar context, list in list context.
311 sub secondary_namespaces
{
313 my $d = $self->{flat_secondary_namespaces
};
314 $self->{flat_secondary_namespaces
} = (ref($_[0]) eq 'ARRAY' ?
shift : [@_]) if @_;
316 $d = [$d] if $d && ref($d) ne 'ARRAY'; # just paranoia
317 return wantarray ? @
$d : $d;
320 # return the file format
323 my $d = $self->{flat_format
};
324 $self->{flat_format
} = shift if @_;
328 # return the alphabet
331 my $d = $self->{flat_alphabet
};
332 $self->{flat_alphabet
} = shift if @_;
336 sub parse_one_record
{
340 $self->{cached_parsers
}{fileno($fh)}
341 ||= Bio
::SeqIO
->new(-fh
=>$fh,-format
=>$self->default_file_format);
342 my $seq = $parser->next_seq or return;
343 $self->{flat_alphabet
} ||= $seq->alphabet;
344 my $ids = $self->seq_to_ids($seq);
349 # return the indexing scheme
350 sub indexing_scheme
{
352 my $d = $self->{flat_indexing
};
353 $self->{flat_indexing
} = shift if @_;
359 my ($file_path,$file_length,$nf) = @_;
361 # check that file_path is absolute
362 unless (File
::Spec
->file_name_is_absolute($file_path)) {
363 $file_path = File
::Spec
->rel2abs($file_path);
366 -r
$file_path or $self->throw("flat file $file_path cannot be read: $!");
368 my $current_size = -s _
;
369 if (defined $file_length) {
370 $current_size == $file_length
371 or $self->throw("flat file $file_path has changed size. Was $file_length bytes; now $current_size");
373 $file_length = $current_size;
376 unless (defined $nf) {
377 $self->{flat_file_index
} = 0 unless exists $self->{flat_file_index
};
378 $nf = $self->{flat_file_index
}++;
380 $self->{flat_flat_file_path
}{$nf} = $file_path;
381 $self->{flat_flat_file_no
}{$file_path} = $nf;
387 $self->write_flag or $self->throw("cannot write configuration file because write_flag is not set");
388 my $path = $self->_config_path;
390 open my $F, '>', $path or $self->throw("Could not write file '$path': $!");
392 my $index_type = $self->indexing_scheme;
393 print $F "index\t$index_type\n";
395 my $format = $self->file_format;
396 my $alphabet = $self->alphabet;
397 my $alpha = $alphabet ?
"/$alphabet" : '';
398 print $F "format\tURN:LSID:open-bio.org:${format}${alpha}\n";
400 my @filenos = $self->_filenos or $self->throw("cannot write config file because no flat files defined");
401 for my $nf (@filenos) {
402 my $path = $self->{flat_flat_file_path
}{$nf};
404 print $F join("\t","fileid_$nf",$path,$size),"\n";
407 # write primary namespace
408 my $primary_ns = $self->primary_namespace
409 or $self->throw('cannot write config file because no primary namespace defined');
411 print $F join("\t",'primary_namespace',$primary_ns),"\n";
413 # write secondary namespaces
414 my @secondary = $self->secondary_namespaces;
415 print $F join("\t",'secondary_namespaces',@secondary),"\n";
417 close $F or $self->throw("close error on $path: $!");
422 return unless $self->{flat_flat_file_no
};
423 return keys %{$self->{flat_flat_file_no
}};
430 $self->write_flag or $self->throw("cannot write sequences because write_flag is not set");
432 my $file = $self->out_file or $self->throw('no outfile defined; use the -out argument to new()');
433 my $seqio = $self->{flat_cached_parsers
}{$file}
434 ||= Bio
::SeqIO
->new(-Format
=> $self->file_format,
436 or $self->throw("couldn't create Bio::SeqIO object");
438 my $fh = $seqio->_fh or $self->throw("couldn't get filehandle from Bio::SeqIO object");
439 my $offset = tell($fh);
440 $seqio->write_seq($seq);
441 my $length = tell($fh)-$offset;
442 my $ids = $self->seq_to_ids($seq);
443 $self->_store_index($ids,$file,$offset,$length);
445 $self->{flat_outfile_dirty
}++;
450 return unless $self->{flat_outfile_dirty
};
452 delete $self->{flat_outfile_dirty
};
453 delete $self->{flat_cached_parsers
}{$self->out_file};
459 return unless $self->{flat_flat_file_path
};
460 return keys %{$self->{flat_flat_file_path
}};
463 # read the configuration file
466 my $path = $self->_config_path;
467 return unless -e
$path;
469 open my $F, '<', $path or $self->throw("Could not read file '$path': $!");
473 my ($tag,@values) = split "\t";
474 $config{$tag} = \
@values;
476 CORE
::close $F or $self->throw("close error on $path: $!");
478 $config{index}[0] =~ m
~(flat
/1|BerkeleyDB/1)~
479 or $self->throw("invalid configuration file $path: no index line");
481 $self->indexing_scheme($1);
483 if ($config{format
}) {
485 if ($config{format
}[0] =~ /^URN:LSID:open-bio\.org:(\w+)(?:\/(\w
+))/) {
486 $self->file_format($1);
488 } else { # compatibility with older versions
489 $self->file_format($config{format
}[0]);
493 # set up primary namespace
494 my $primary_namespace = $config{primary_namespace
}[0]
495 or $self->throw("invalid configuration file $path: no primary namespace defined");
496 $self->primary_namespace($primary_namespace);
498 # set up secondary namespaces (may be empty)
499 $self->secondary_namespaces($config{secondary_namespaces
});
501 # get file paths and their normalization information
502 my @normalized_files = grep {$_ ne ''} map {/^fileid_(\S+)/ && $1} keys %config;
503 for my $nf (@normalized_files) {
504 my ($file_path,$file_length) = @
{$config{"fileid_${nf}"}};
505 $self->add_flat_file($file_path,$file_length,$nf);
513 $self->_catfile($self->_config_name);
518 my $component = shift;
519 File
::Spec
->catfile($self->directory,$self->dbname,$component);
522 sub _config_name
{ CONFIG_FILE_NAME
}
527 return $self->add_flat_file($path)
528 unless exists $self->{flat_flat_file_no
}{$path};
529 $self->{flat_flat_file_no
}{$path};
535 $self->{flat_flat_file_path
}{$fileno};
540 my $paths = $self->{flat_flat_file_no
};
547 Usage : $index->fetch( $id )
548 Function: Returns a Bio::Seq object from the index
549 Example : $seq = $index->fetch( 'dJ67B12' )
550 Returns : Bio::Seq object
553 Deprecated. Use get_Seq_by_id instead.
557 sub fetch
{ shift->get_Seq_by_id(@_) }
560 =head2 To Be Implemented in Subclasses
562 The following methods MUST be implemented by subclasses.
566 # create real live Bio::Seq object
570 $self->throw_not_implemented;
574 # fetch array of Bio::Seq objects
577 return $self->get_Seq_by_id(shift) if @_ == 1;
580 $self->throw_not_implemented;
584 my ($self,$id,$namespace) = @_;
585 $self->throw_not_implemented;
588 sub default_file_format
{
590 $self->throw_not_implemented;
595 my ($ids,$file,$offset,$length) = @_;
596 $self->throw_not_implemented;
599 =head2 May Be Overridden in Subclasses
601 The following methods MAY be overridden by subclasses.
605 sub default_primary_namespace
{
609 sub default_secondary_namespaces
{
617 $ids{$self->primary_namespace} = $seq->accession_number;