2 # BioPerl module for Bio::DB::Flat::BinarySearch
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Michele Clamp <michele@sanger.ac.uk>>
8 # You may distribute this module under the same terms as perl itself
10 # POD documentation - main docs before the code
14 Bio::DB::Flat::BinarySearch - BinarySearch search indexing system for sequence files
18 TODO: SYNOPSIS NEEDED!
22 This module can be used both to index sequence files and also to
23 retrieve sequences from existing sequence files.
25 This object allows indexing of sequence files both by a primary key
26 (say accession) and multiple secondary keys (say ids). This is
27 different from the Bio::Index::Abstract (see L<Bio::Index::Abstract>)
28 which uses DBM files as storage. This module uses a binary search to
29 retrieve sequences which is more efficient for large datasets.
33 my $sequencefile; # Some fasta sequence file
35 Patterns have to be entered to define where the keys are to be indexed
36 and also where the start of each record. E.g. for fasta
38 my $start_pattern = '^>';
39 my $primary_pattern = '^>(\S+)';
41 So the start of a record is a line starting with a E<gt> and the
42 primary key is all characters up to the first space after the E<gt>
44 A string also has to be entered to defined what the primary key
45 (primary_namespace) is called.
47 The index can now be created using
49 my $index = Bio::DB::Flat::BinarySearch->new(
50 -directory => "/home/max/",
52 -start_pattern => $start_pattern,
53 -primary_pattern => $primary_pattern,
54 -primary_namespace => "ID",
57 my @files = ("file1","file2","file3");
59 $index->build_index(@files);
61 The index is now ready to use. For large sequence files the perl way
62 of indexing takes a *long* time and a *huge* amount of memory. For
63 indexing things like dbEST I recommend using the DB_File indexer, BDB.
65 The formats currently supported by this module are fasta, Swissprot,
68 =head2 Creating indices with secondary keys
70 Sometimes just indexing files with one id per entry is not enough. For
71 instance you may want to retrieve sequences from swissprot using
72 their accessions as well as their ids.
74 To be able to do this when creating your index you need to pass in
75 a hash of secondary_patterns which have their namespaces as the keys
78 e.g. For Indexing something like
80 ID 1433_CAEEL STANDARD; PRT; 248 AA.
82 DT 01-NOV-1995 (Rel. 32, Created)
83 DT 01-NOV-1995 (Rel. 32, Last sequence update)
84 DT 15-DEC-1998 (Rel. 37, Last annotation update)
85 DE 14-3-3-LIKE PROTEIN 1.
87 OS Caenorhabditis elegans.
88 OC Eukaryota; Metazoa; Nematoda; Chromadorea; Rhabditida; Rhabditoidea;
89 OC Rhabditidae; Peloderinae; Caenorhabditis.
93 where we want to index the accession (P41932) as the primary key and the
94 id (1433_CAEEL) as the secondary id. The index is created as follows
96 my %secondary_patterns;
98 my $start_pattern = '^ID (\S+)';
99 my $primary_pattern = '^AC (\S+)\;';
101 $secondary_patterns{"ID"} = '^ID (\S+)';
103 my $index = Bio::DB::Flat::BinarySearch->new(
104 -directory => $index_directory,
108 -start_pattern => $start_pattern,
109 -primary_pattern => $primary_pattern,
110 -primary_namespace => 'AC',
111 -secondary_patterns => \%secondary_patterns);
113 $index->build_index($seqfile);
115 Of course having secondary indices makes indexing slower and use more
120 To fetch sequences using an existing index first of all create your sequence
123 my $index = Bio::DB::Flat::BinarySearch->new(
124 -directory => $index_directory);
126 Now you can happily fetch sequences either by the primary key or
127 by the secondary keys.
129 my $entry = $index->get_entry_by_id('HBA_HUMAN');
131 This returns just a string containing the whole entry. This is
132 useful is you just want to print the sequence to screen or write it to a file.
134 Other ways of getting sequences are
136 my $fh = $index->get_stream_by_id('HBA_HUMAN');
138 This can then be passed to a seqio object for output or converting
141 my $seq = Bio::SeqIO->new(-fh => $fh,
144 The last way is to retrieve a sequence directly. This is the
145 slowest way of extracting as the sequence objects need to be made.
147 my $seq = $index->get_Seq_by_id('HBA_HUMAN');
149 To access the secondary indices the secondary namespace needs to be known
151 $index->secondary_namespaces("ID");
153 Then the following call can be used
155 my $seq = $index->get_Seq_by_secondary('ID','1433_CAEEL');
157 These calls are not yet implemented
159 my $fh = $index->get_stream_by_secondary('ID','1433_CAEEL');
160 my $entry = $index->get_entry_by_secondary('ID','1433_CAEEL');
166 User feedback is an integral part of the evolution of this and other
167 Bioperl modules. Send your comments and suggestions preferably to one
168 of the Bioperl mailing lists. Your participation is much appreciated.
170 bioperl-l@bioperl.org - General discussion
171 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
175 Please direct usage questions or support issues to the mailing list:
177 I<bioperl-l@bioperl.org>
179 rather than to the module maintainer directly. Many experienced and
180 reponsive experts will be able look at the problem and quickly
181 address it. Please include a thorough description of the problem
182 with code and data examples if at all possible.
184 =head2 Reporting Bugs
186 Report bugs to the Bioperl bug tracking system to help us keep track
187 the bugs and their resolution. Bug reports can be submitted via the
190 https://github.com/bioperl/bioperl-live/issues
192 =head1 AUTHOR - Michele Clamp
194 Email - michele@sanger.ac.uk
198 Jason Stajich, jason@bioperl.org
202 The rest of the documentation details each of the object methods. Internal
203 methods are usually preceded with an "_" (underscore).
207 package Bio
::DB
::Flat
::BinarySearch
;
211 use Fcntl
qw(SEEK_END SEEK_CUR);
213 # rather than using tell which might be buffered
214 sub systell
{ sysseek( $_[0], 0, SEEK_CUR
) }
215 sub syseof
{ sysseek( $_[0], 0, SEEK_END
) }
218 use Bio
::Root
::RootI
;
222 use base
qw(Bio::DB::RandomAccessI);
224 use constant CONFIG_FILE_NAME
=> 'config.dat';
225 use constant HEADER_SIZE
=> 4;
226 use constant DEFAULT_FORMAT
=> 'fasta';
228 my @formats = [ 'FASTA', 'SWISSPROT', 'EMBL' ];
234 my $index = Bio::DB::Flat::BinarySearch->new(
235 -directory => '/Users/michele/indices/dbest',
241 my %secondary_patterns = {"ACC" => "^>\\S+ +(\\S+)"}
242 my $index = Bio::DB::Flat::BinarySearch->new(
243 -directory => '/Users/michele/indices',
245 -primary_pattern => "^>(\\S+)",
246 -secondary_patterns => \%secondary_patterns,
247 -primary_namespace => "ID");
249 my @files = ('file1','file2','file3');
251 $index->build_index(@files);
254 Function: create a new Bio::DB::Flat::BinarySearch object
255 Returns : new Bio::DB::Flat::BinarySearch
256 Args : -directory Root directory for index files
257 -dbname Name of subdirectory containing indices
259 -write_flag Allow building index
260 -primary_pattern Regexp defining the primary id
261 -secondary_patterns A hash ref containing the secondary
262 patterns with the namespaces as keys
263 -primary_namespace A string defining what the primary key
271 my ( $class, @args ) = @_;
273 my $self = $class->SUPER::new
(@args);
277 my ( $index_dir, $dbname, $format, $write_flag, $primary_pattern,
278 $primary_namespace, $start_pattern, $secondary_patterns )
293 $self->index_directory($index_dir);
294 $self->dbname($dbname);
296 if ( $self->index_directory && $self->read_config_file ) {
298 my $fh = $self->primary_index_filehandle;
299 my $record_width = $self->read_header($fh);
300 $self->record_size($record_width);
302 $format ||= DEFAULT_FORMAT
;
303 $self->format($format);
304 $self->write_flag($write_flag);
306 if ( $self->write_flag && !$primary_namespace ) {
308 $primary_namespace, $primary_pattern,
309 $start_pattern, $secondary_patterns
310 ) = $self->_guess_patterns( $self->format );
313 $self->primary_pattern($primary_pattern);
314 $self->primary_namespace($primary_namespace);
315 $self->start_pattern($start_pattern);
316 $self->secondary_patterns($secondary_patterns);
321 sub new_from_registry
{
322 my ( $self, %config ) = @_;
324 my $dbname = $config{'dbname'};
325 my $location = $config{'location'};
327 my $index = Bio
::DB
::Flat
::BinarySearch
->new(
329 -index_dir
=> $location,
335 Title : get_Seq_by_id
336 Usage : $obj->get_Seq_by_id($newval)
339 Returns : value of get_Seq_by_id
340 Args : newvalue (optional)
345 my ( $self, $id ) = @_;
347 # too many uninit variables...
350 my ( $fh, $length ) = $self->get_stream_by_id($id);
352 unless ( defined( $self->format ) ) {
353 $self->throw("Can't create sequence - format is not defined");
358 unless ( defined( $self->{_seqio
} ) ) {
360 $self->{_seqio
} = Bio
::SeqIO
->new(
362 -format
=> $self->format
366 $self->{_seqio
}->_fh($fh);
369 return $self->{_seqio
}->next_seq;
372 =head2 get_entry_by_id
374 Title : get_entry_by_id
375 Usage : $obj->get_entry_by_id($newval)
376 Function: Get a Bio::SeqI object for a unique ID
383 sub get_entry_by_id
{
384 my ( $self, $id ) = @_;
386 my ( $fh, $length ) = $self->get_stream_by_id($id);
390 sysread( $fh, $entry, $length );
395 =head2 get_stream_by_id
397 Title : get_stream_by_id
398 Usage : $obj->get_stream_by_id($id)
399 Function: Gets a Sequence stream for an id
400 Returns : Bio::SeqIO stream
401 Args : Id to lookup by
406 sub get_stream_by_id
{
407 my ( $self, $id ) = @_;
409 unless ( $self->record_size ) {
410 if ( $self->index_directory && $self->read_config_file ) {
412 my $fh = $self->primary_index_filehandle;
413 my $record_width = $self->read_header($fh);
414 $self->record_size($record_width);
417 my $indexfh = $self->primary_index_filehandle;
420 my $filesize = systell
($indexfh);
422 $self->throw("file was not parsed properly, record size is empty")
423 unless $self->record_size;
425 my $end = ( $filesize - $self->{'_start_pos'} ) / $self->record_size;
426 my ( $newid, $rest, $fhpos ) =
427 $self->find_entry( $indexfh, 0, $end, $id, $self->record_size );
429 my ( $fileid, $pos, $length ) = split( /\t/, $rest );
431 #print STDERR "BinarySearch Found id entry $newid $fileid $pos $length:$rest\n";
437 my $file = $self->{_file
}{$fileid};
439 open my $IN, '<', $file or $self->throw("Could not read file '$file': $!");
443 sysseek( $IN, $pos, 0 );
445 return ( $IN, $length );
448 =head2 get_Seq_by_acc
450 Title : get_Seq_by_acc
451 Usage : $obj->get_Seq_by_acc($acc)
452 Function: Gets a Bio::SeqI object by accession number
453 Returns : Bio::SeqI object
454 Args : string representing accession number
460 my ( $self, $acc ) = @_;
462 # too many uninit variables...
465 if ( $self->primary_namespace eq "ACC" ) {
466 return $self->get_Seq_by_id($acc);
469 return $self->get_Seq_by_secondary( "ACC", $acc );
473 =head2 get_Seq_by_version
475 Title : get_Seq_by_version
476 Usage : $obj->get_Seq_by_version($version)
477 Function: Gets a Bio::SeqI object by accession.version number
478 Returns : Bio::SeqI object
479 Args : string representing accession.version number
484 sub get_Seq_by_version
{
485 my ( $self, $acc ) = @_;
487 # too many uninit variables...
490 if ( $self->primary_namespace eq "VERSION" ) {
491 return $self->get_Seq_by_id($acc);
494 return $self->get_Seq_by_secondary( "VERSION", $acc );
498 =head2 get_Seq_by_secondary
500 Title : get_Seq_by_secondary
501 Usage : $obj->get_Seq_by_secondary($namespace,$acc)
502 Function: Gets a Bio::SeqI object looking up secondary accessions
503 Returns : Bio::SeqI object
504 Args : namespace name to check secondary namespace and an id
509 sub get_Seq_by_secondary
{
510 my ( $self, $name, $id ) = @_;
512 my @names = $self->secondary_namespaces;
515 foreach my $tmpname (@names) {
516 if ( $name eq $tmpname ) {
522 $self->throw("Secondary index for $name doesn't exist\n");
525 my $fh = $self->open_secondary_index($name);
529 my $filesize = systell
($fh);
531 my $recsize = $self->{'_secondary_record_size'}{$name};
533 # print "Name " . $recsize . "\n";
535 my $end = ( $filesize - $self->{'_start_pos'} ) / $recsize;
537 # print "End $end $filesize\n";
538 my ( $newid, $primary_id, $pos ) =
539 $self->find_entry( $fh, 0, $end, $id, $recsize );
541 sysseek( $fh, $pos, 0 );
543 # print "Found new id $newid $primary_id\n";
544 # We now need to shuffle up the index file to find the top secondary entry
548 while ( $record =~ /^$newid/ && $pos >= 0 ) {
550 $record = $self->read_record( $fh, $pos, $recsize );
551 $pos = $pos - $recsize;
553 # print "Up record = $record:$newid\n";
558 # print "Top position is $pos\n";
560 # Now we have to shuffle back down again to read all the secondary entries
562 my $current_id = $newid;
565 $primary_id{$primary_id} = 1;
567 while ( $current_id eq $newid ) {
568 $record = $self->read_record( $fh, $pos, $recsize );
570 # print "Record is :$record:\n";
571 my ( $secid, $primary_id ) = split( /\t/, $record, 2 );
572 $current_id = $secid;
574 if ( $current_id eq $newid ) {
575 $primary_id =~ s/ //g;
577 # print "Primary $primary_id\n";
578 $primary_id{$primary_id} = 1;
580 $pos = $pos + $recsize;
582 # print "Down record = $record\n";
586 if ( !defined($newid) ) {
592 foreach my $id ( keys %primary_id ) {
593 push @entry, $self->get_Seq_by_id($id);
595 return wantarray ?
@entry : $entry[0];
602 Usage : $obj->read_header($fhl)
603 Function: Reads the header from the db file
604 Returns : width of a record
611 my ( $self, $fh ) = @_;
615 sysread( $fh, $record_width, HEADER_SIZE
);
617 $self->{'_start_pos'} = HEADER_SIZE
;
618 $record_width =~ s/ //g;
619 $record_width = $record_width * 1;
621 return $record_width;
627 Usage : $obj->read_record($fh,$pos,$len)
628 Function: Reads a record from a filehandle
630 Args : filehandle, offset, and length
636 my ( $self, $fh, $pos, $len ) = @_;
638 sysseek( $fh, $pos, 0 );
642 sysread( $fh, $record, $len );
648 =head2 get_all_primary_ids
650 Title : get_all_primary_ids
651 Usage : @ids = $seqdb->get_all_primary_ids()
652 Function: gives an array of all the primary_ids of the
653 sequence objects in the database.
654 Returns : an array of strings
659 sub get_all_primary_ids
{
662 my $fh = $self->primary_index_filehandle;
664 my $filesize = systell
($fh);
665 my $recsize = $self->record_size;
669 for ( my $pos = $self->{'_start_pos'} ; $pos < $end ; $pos += $recsize ) {
670 my $record = $self->read_record( $fh, $pos, $recsize );
671 my ($entryid) = split( /\t/, $record );
680 Usage : $obj->find_entry($fh,$start,$end,$id,$recsize)
681 Function: Extract an entry based on the start,end,id and record size
683 Args : filehandle, start, end, id, recordsize
689 my ( $self, $fh, $start, $end, $id, $recsize ) = @_;
691 my $mid = int( ( $end + 1 + $start ) / 2 );
692 my $pos = ( $mid - 1 ) * $recsize + $self->{'_start_pos'};
694 my ($record) = $self->read_record( $fh, $pos, $recsize );
695 my ( $entryid, $rest ) = split( /\t/, $record, 2 );
698 # print "Mid $recsize $mid $pos:$entryid:$rest:$record\n";
699 # print "Entry :$id:$entryid:$rest\n";
701 my ( $first, $second ) =
702 $id le $entryid ?
( $id, $entryid ) : ( $entryid, $id );
704 if ( $id eq $entryid ) {
706 return ( $id, $rest, $pos - $recsize );
709 elsif ( $first eq $id ) {
711 if ( $end - $start <= 1 ) {
716 # print "Moving up $entryid $id\n";
717 $self->find_entry( $fh, $start, $end, $id, $recsize );
720 elsif ( $second eq $id ) {
722 # print "Moving down $entryid $id\n";
723 if ( $end - $start <= 1 ) {
729 $self->find_entry( $fh, $start, $end, $id, $recsize );
737 Usage : $obj->build_index(@files)
738 Function: Build the index based on a set of files
739 Returns : count of the number of entries
740 Args : List of filenames
746 my ( $self, @files ) = @_;
748 or $self->throw('Cannot build index unless -write_flag is true');
750 my $rootdir = $self->index_directory;
752 if ( !defined($rootdir) ) {
753 $self->throw("No index directory set - can't build indices");
756 if ( !-d
$rootdir ) {
758 "Index directory [$rootdir] is not a directory. Cant' build indices"
762 my $dbpath = File
::Spec
->catfile( $rootdir, $self->dbname );
764 warn "Creating directory $dbpath\n";
765 mkdir $dbpath, 0777 or $self->throw("Couldn't create $dbpath: $!");
769 $self->throw("Must enter an array of filenames to index");
772 foreach my $file (@files) {
773 $file = File
::Spec
->rel2abs($file)
774 unless File
::Spec
->file_name_is_absolute($file);
775 unless ( -e
$file ) {
776 $self->throw("Can't index file [$file] as it doesn't exist");
780 if ( my $filehash = $self->{_dbfile
} ) {
781 push @files, keys %$filehash;
785 @files = grep { !$seen{$_}++ } @files;
788 $self->make_config_file( \
@files );
790 foreach my $file (@files) {
791 $entries += $self->_index_file($file);
794 # update alphabet if necessary
795 $self->make_config_file( \
@files );
797 # And finally write out the indices
798 $self->write_primary_index;
799 $self->write_secondary_indices;
807 Usage : $obj->_index_file($newval)
810 Returns : value of _index_file
811 Args : newvalue (optional)
816 my ( $self, $file ) = @_;
817 my $v = $self->verbose;
818 open my $FILE, '<', $file or $self->throw("Could not read file '$file': $!");
821 my $fileid = $self->get_fileid_by_filename($file);
826 my $primary = $self->primary_pattern;
827 my $start_pattern = $self->start_pattern;
831 my $new_primary_entry;
839 my @secondary_names = $self->secondary_namespaces;
843 # In Windows, text files have '\r\n' as line separator, but when reading in
844 # text mode Perl will only show the '\n'. This means that for a line "ABC\r\n",
845 # "length $_" will report 4 although the line is 5 bytes in length.
846 # We assume that all lines have the same line separator and only read current line.
847 my $init_pos = tell($fh);
848 my $curr_line = <$fh>;
849 my $pos_diff = tell($fh) - $init_pos;
850 my $correction = $pos_diff - length $curr_line;
851 seek $fh, $init_pos, 0; # Rewind position to proceed to read the file
855 $self->{alphabet
} ||= $self->guess_alphabet($_);
856 if ( $_ =~ /$start_pattern/ ) {
858 $id = $new_primary_entry;
859 $self->{alphabet
} ||= $self->guess_alphabet($_);
861 my $tmplen = ( tell $fh ) - length($_) - $correction;
863 $length = $tmplen - $pos;
865 unless ( defined($id) ) {
866 $self->throw("No id defined for sequence");
868 unless ( defined($fileid) ) {
869 $self->throw("No fileid defined for file $file");
871 unless ( defined($pos) ) {
872 $self->throw( "No position defined for " . $id . "\n" );
874 unless ( defined($length) ) {
875 $self->throw( "No length defined for " . $id . "\n" );
877 $self->_add_id_position( $id, $pos, $fileid, $length,
882 if ( $count > 0 && $count % 1000 == 0 ) {
883 $self->debug("Indexed $count ids\n") if $v > 0;
893 if ( $_ =~ /$primary/ ) {
894 $new_primary_entry = $1;
897 my $secondary_patterns = $self->secondary_patterns;
899 foreach my $sec (@secondary_names) {
900 my $pattern = $secondary_patterns->{$sec};
902 if ( $_ =~ /$pattern/ ) {
903 $secondary_id{$sec} = $1;
909 # Remember to add in the last one
911 $id = $new_primary_entry;
913 # my $tmplen = (tell $fh) - length($last_one);
914 my $tmplen = ( tell $fh );
916 $length = $tmplen - $pos;
918 if ( !defined($id) ) {
919 $self->throw("No id defined for sequence");
921 if ( !defined($fileid) ) {
922 $self->throw("No fileid defined for file $file");
924 if ( !defined($pos) ) {
925 $self->throw( "No position defined for " . $id . "\n" );
927 if ( !defined($length) ) {
928 $self->throw( "No length defined for " . $id . "\n" );
931 $self->_add_id_position( $id, $pos, $fileid, $length, \
%secondary_id );
938 =head2 write_primary_index
940 Title : write_primary_index
941 Usage : $obj->write_primary_index($newval)
944 Returns : value of write_primary_index
945 Args : newvalue (optional)
950 sub write_primary_index
{
953 my @ids = keys %{ $self->{_id
} };
955 @ids = sort { $a cmp $b } @ids;
957 open my $INDEX, '>', $self->primary_index_file
959 "Could not write primary index file '" . $self->primary_index_file . "': $!" );
962 $self->{_maxidlength
} +
963 $self->{_maxfileidlength
} +
964 $self->{_maxposlength
} +
965 $self->{_maxlengthlength
} + 3;
967 print $INDEX sprintf( "%04d", $recordlength );
969 foreach my $id (@ids) {
971 if ( !defined( $self->{_id
}{$id}{_fileid
} ) ) {
972 $self->throw("No fileid for $id\n");
974 if ( !defined( $self->{_id
}{$id}{_pos
} ) ) {
975 $self->throw("No position for $id\n");
977 if ( !defined( $self->{_id
}{$id}{_length
} ) ) {
978 $self->throw("No length for $id");
983 . $self->{_id
}{$id}{_fileid
} . "\t"
984 . $self->{_id
}{$id}{_pos
} . "\t"
985 . $self->{_id
}{$id}{_length
};
987 print $INDEX sprintf( "%-${recordlength}s", $record );
992 =head2 write_secondary_indices
994 Title : write_secondary_indices
995 Usage : $obj->write_secondary_indices($newval)
998 Returns : value of write_secondary_indices
999 Args : newvalue (optional)
1004 sub write_secondary_indices
{
1007 # These are the different
1008 my @names = keys( %{ $self->{_secondary_id
} } );
1010 foreach my $name (@names) {
1012 my @seconds = keys %{ $self->{_secondary_id
}{$name} };
1014 # First we need to loop over to get the longest record.
1017 foreach my $second (@seconds) {
1018 my $tmplen = length($second) + 1;
1019 my @prims = keys %{ $self->{_secondary_id
}{$name}{$second} };
1021 foreach my $prim (@prims) {
1022 my $recordlen = $tmplen + length($prim);
1024 if ( $recordlen > $length ) {
1025 $length = $recordlen;
1030 # Now we can print the index
1032 my $fh = $self->new_secondary_filehandle($name);
1034 print $fh sprintf( "%04d", $length );
1035 @seconds = sort @seconds;
1037 foreach my $second (@seconds) {
1039 my @prims = keys %{ $self->{_secondary_id
}{$name}{$second} };
1042 foreach my $prim (@prims) {
1043 my $record = $tmp . "\t" . $prim;
1044 if ( length($record) > $length ) {
1046 "Something has gone horribly wrong - length of record is more than we thought [$length]\n"
1050 print $fh sprintf( "%-${length}s", $record );
1059 =head2 new_secondary_filehandle
1061 Title : new_secondary_filehandle
1062 Usage : $obj->new_secondary_filehandle($newval)
1065 Returns : value of new_secondary_filehandle
1066 Args : newvalue (optional)
1071 sub new_secondary_filehandle
{
1072 my ( $self, $name ) = @_;
1074 my $indexdir = $self->_config_path;
1076 my $secindex = File
::Spec
->catfile( $indexdir, "id_$name.index" );
1078 open my $fh, '>', $secindex or $self->throw("Could not write file '$secindex': $!");
1082 =head2 open_secondary_index
1084 Title : open_secondary_index
1085 Usage : $obj->open_secondary_index($newval)
1088 Returns : value of open_secondary_index
1089 Args : newvalue (optional)
1094 sub open_secondary_index
{
1095 my ( $self, $name ) = @_;
1097 if ( !defined( $self->{_secondary_filehandle
}{$name} ) ) {
1099 my $indexdir = $self->_config_path;
1100 my $secindex = $indexdir . "/id_$name.index";
1102 if ( !-e
$secindex ) {
1103 $self->throw("Index is not present for namespace [$name]\n");
1106 open my $newfh, '<', $secindex or $self->throw("Could not read file '$secindex': $!");
1107 my $reclen = $self->read_header($newfh);
1109 $self->{_secondary_filehandle
}{$name} = $newfh;
1110 $self->{_secondary_record_size
}{$name} = $reclen;
1113 return $self->{_secondary_filehandle
}{$name};
1117 =head2 _add_id_position
1119 Title : _add_id_position
1120 Usage : $obj->_add_id_position($newval)
1123 Returns : value of _add_id_position
1124 Args : newvalue (optional)
1129 sub _add_id_position
{
1130 my ( $self, $id, $pos, $fileid, $length, $secondary_id ) = @_;
1132 if ( !defined($id) ) {
1133 $self->throw("No id defined. Can't add id position");
1135 if ( !defined($pos) ) {
1136 $self->throw("No position defined. Can't add id position");
1138 if ( !defined($fileid) ) {
1139 $self->throw("No fileid defined. Can't add id position");
1141 if ( !defined($length) || $length <= 0 ) {
1143 "No length defined or <= 0 [$length]. Can't add id position");
1146 $self->{_id
}{$id}{_pos
} = $pos;
1147 $self->{_id
}{$id}{_length
} = $length;
1148 $self->{_id
}{$id}{_fileid
} = $fileid;
1150 # Now the secondary ids
1152 foreach my $sec ( keys(%$secondary_id) ) {
1153 my $value = $secondary_id->{$sec};
1154 $self->{_secondary_id
}{$sec}{$value}{$id} = 1;
1157 $self->{_maxidlength
} = length($id)
1158 if !exists $self->{_maxidlength
}
1159 or length($id) >= $self->{_maxidlength
};
1161 $self->{_maxfileidlength
} = length($fileid)
1162 if !exists $self->{_maxfileidlength
}
1163 or length($fileid) >= $self->{_maxfileidlength
};
1165 $self->{_maxposlength
} = length($pos)
1166 if !exists $self->{_maxposlength
}
1167 or length($pos) >= $self->{_maxposlength
};
1169 $self->{_maxlengthlength
} = length($length)
1170 if !exists $self->{_maxlengthlength
}
1171 or length($length) >= $self->{_maxlengthlength
};
1174 =head2 make_config_file
1176 Title : make_config_file
1177 Usage : $obj->make_config_file($newval)
1180 Returns : value of make_config_file
1181 Args : newvalue (optional)
1185 sub make_config_file
{
1186 my ( $self, $files ) = @_;
1188 my @files = @
$files;
1190 my $configfile = $self->_config_file;
1192 open my $CON, '>', $configfile
1193 or $self->throw("Could not write config file '$configfile': $!");
1195 # First line must be the type of index - in this case flat
1196 print $CON "index\tflat/1\n";
1202 foreach my $file (@files) {
1204 my $size = -s
$file;
1206 print $CON "fileid_$count\t$file\t$size\n";
1208 $self->{_file
}{$count} = $file;
1209 $self->{_dbfile
}{$file} = $count;
1210 $self->{_size
}{$count} = $size;
1214 # Now the namespaces
1216 print $CON "primary_namespace\t" . $self->primary_namespace . "\n";
1218 # Needs fixing for the secondary stuff
1220 my $second_patterns = $self->secondary_patterns;
1222 my @second = keys %$second_patterns;
1225 print $CON "secondary_namespaces";
1227 foreach my $second (@second) {
1228 print $CON "\t$second";
1233 # Now the config format
1235 unless ( defined( $self->format ) ) {
1237 "Format does not exist in module - can't write config file");
1240 my $format = $self->format;
1241 my $alphabet = $self->alphabet;
1242 my $alpha = $alphabet ?
"/$alphabet" : '';
1243 print $CON "format\t" . "$format\n";
1248 =head2 read_config_file
1250 Title : read_config_file
1251 Usage : $obj->read_config_file($newval)
1254 Returns : value of read_config_file
1255 Args : newvalue (optional)
1259 sub read_config_file
{
1261 my $configfile = $self->_config_file;
1262 return unless -e
$configfile;
1264 open my $CON, '<', $configfile
1265 or $self->throw("Could not read config file '$configfile': $!");
1267 # First line must be type
1272 # This is hard coded as we only index flatfiles here
1273 if ( $line =~ m{index\tflat/(\d+)} ) {
1278 "First line not compatible with flat file index. Should be something like\n\nindex\tflat/1"
1282 $self->index_type("flat");
1283 $self->index_version($version);
1288 # Look for fileid lines
1289 if ( $_ =~ /^fileid_(\d+)\t(.+)\t(\d+)/ ) {
1294 if ( !-e
$filename ) {
1295 $self->throw("File [$filename] does not exist!");
1297 if ( -s
$filename != $filesize ) {
1299 "Flatfile size for $filename differs from what the index thinks it is. Real size ["
1301 . "] Index thinks it is ["
1306 $self->{_file
}{$fileid} = $filename;
1307 $self->{_dbfile
}{$filename} = $fileid;
1308 $self->{_size
}{$fileid} = $filesize;
1311 # Look for namespace lines
1312 if (/(.*)_namespaces?\t(.+)/) {
1313 if ( $1 eq "primary" ) {
1314 $self->primary_namespace($2);
1316 elsif ( $1 eq "secondary" ) {
1317 $self->secondary_namespaces( split "\t", $2 );
1320 $self->throw("Unknown namespace name in config file [$1");
1324 # Look for format lines
1325 if ( $_ =~ /format\t(\S+)/ ) {
1327 # Check the format here?
1330 # handle LSID format
1331 if ( $format =~ /^URN:LSID:open-bio\.org:(\w+)(?:\/(\w
+))?
/ ) {
1333 $self->alphabet($2);
1335 else { # compatibility with older versions
1343 # Now check we have all that we need
1345 my @fileid_keys = keys( %{ $self->{_file
} } );
1347 if ( !(@fileid_keys) ) {
1349 "No flatfile fileid files in config - check the index has been made correctly"
1353 if ( !defined( $self->primary_namespace ) ) {
1354 $self->throw("No primary namespace exists");
1357 if ( !-e
$self->primary_index_file ) {
1358 $self->throw( "Primary index file ["
1359 . $self->primary_index_file
1360 . "] doesn't exist" );
1366 =head2 get_fileid_by_filename
1368 Title : get_fileid_by_filename
1369 Usage : $obj->get_fileid_by_filename($newval)
1372 Returns : value of get_fileid_by_filename
1373 Args : newvalue (optional)
1377 sub get_fileid_by_filename
{
1378 my ( $self, $file ) = @_;
1380 if ( !defined( $self->{_dbfile
} ) ) {
1382 "No file to fileid mapping present. Has the fileid file been read?"
1386 return $self->{_dbfile
}{$file};
1389 =head2 get_filehandle_by_fileid
1391 Title : get_filehandle_by_fileid
1392 Usage : $obj->get_filehandle_by_fileid($newval)
1395 Returns : value of get_filehandle_by_fileid
1396 Args : newvalue (optional)
1400 sub get_filehandle_by_fileid
{
1401 my ( $self, $fileid ) = @_;
1403 if ( !defined( $self->{_file
}{$fileid} ) ) {
1404 $self->throw("ERROR: undefined fileid in index [$fileid]");
1407 open my $fh, '<', $self->{_file
}{$fileid} or $self->throw("Could not read file '$self->{_file}{$fileid}': $!");
1411 =head2 primary_index_file
1413 Title : primary_index_file
1414 Usage : $obj->primary_index_file($newval)
1417 Returns : value of primary_index_file
1418 Args : newvalue (optional)
1423 sub primary_index_file
{
1426 return File
::Spec
->catfile( $self->_config_path,
1427 "key_" . $self->primary_namespace . ".key" );
1430 =head2 primary_index_filehandle
1432 Title : primary_index_filehandle
1433 Usage : $obj->primary_index_filehandle($newval)
1436 Returns : value of primary_index_filehandle
1437 Args : newvalue (optional)
1442 sub primary_index_filehandle
{
1445 unless ( defined( $self->{'_primary_index_handle'} ) ) {
1446 my $primary_file = $self->primary_index_file;
1447 open $self->{'_primary_index_handle'}, '<', $primary_file
1448 or $self->throw("Could not read file '$primary_file': $!\n");
1450 return $self->{'_primary_index_handle'};
1456 Usage : $obj->format($newval)
1459 Returns : value of format
1460 Args : newvalue (optional)
1466 my ( $obj, $value ) = @_;
1467 if ( defined $value ) {
1468 $obj->{'format'} = $value;
1470 return $obj->{'format'};
1475 my ( $obj, $value ) = @_;
1476 if ( defined $value ) {
1477 $obj->{alphabet
} = $value;
1479 return $obj->{alphabet
};
1485 Usage : $obj->write_flag($newval)
1488 Returns : value of write_flag
1489 Args : newvalue (optional)
1495 my ( $obj, $value ) = @_;
1496 if ( defined $value ) {
1497 $obj->{'write_flag'} = $value;
1499 return $obj->{'write_flag'};
1506 Usage : $obj->dbname($newval)
1507 Function: get/set database name
1509 Returns : value of dbname
1510 Args : newvalue (optional)
1516 my $d = $self->{flat_dbname
};
1517 $self->{flat_dbname
} = shift if @_;
1521 =head2 index_directory
1523 Title : index_directory
1524 Usage : $obj->index_directory($newval)
1527 Returns : value of index_directory
1528 Args : newvalue (optional)
1533 sub index_directory
{
1534 my ( $self, $arg ) = @_;
1536 if ( defined($arg) ) {
1537 if ( $arg !~ m{/$} ) {
1540 $self->{_index_directory
} = $arg;
1542 return $self->{_index_directory
};
1548 my $root = $self->index_directory;
1549 my $dbname = $self->dbname;
1550 File
::Spec
->catfile( $root, $dbname );
1555 my $path = $self->_config_path;
1556 File
::Spec
->catfile( $path, CONFIG_FILE_NAME
);
1562 Usage : $obj->record_size($newval)
1565 Returns : value of record_size
1566 Args : newvalue (optional)
1573 $self->{_record_size
} = shift if @_;
1574 return $self->{_record_size
};
1577 =head2 primary_namespace
1579 Title : primary_namespace
1580 Usage : $obj->primary_namespace($newval)
1583 Returns : value of primary_namespace
1584 Args : newvalue (optional)
1588 sub primary_namespace
{
1590 $self->{_primary_namespace
} = shift if @_;
1591 return $self->{_primary_namespace
};
1597 Usage : $obj->index_type($newval)
1600 Returns : value of index_type
1601 Args : newvalue (optional)
1608 $self->{_index_type
} = shift if @_;
1609 return $self->{_index_type
};
1612 =head2 index_version
1614 Title : index_version
1615 Usage : $obj->index_version($newval)
1618 Returns : value of index_version
1619 Args : newvalue (optional)
1626 $self->{_index_version
} = shift if @_;
1627 return $self->{_index_version
};
1630 =head2 primary_pattern
1632 Title : primary_pattern
1633 Usage : $obj->primary_pattern($newval)
1636 Returns : value of primary_pattern
1637 Args : newvalue (optional)
1642 sub primary_pattern
{
1644 $obj->{'primary_pattern'} = shift if @_;
1645 return $obj->{'primary_pattern'};
1648 =head2 start_pattern
1650 Title : start_pattern
1651 Usage : $obj->start_pattern($newval)
1654 Returns : value of start_pattern
1655 Args : newvalue (optional)
1662 $obj->{'start_pattern'} = shift if @_;
1663 return $obj->{'start_pattern'};
1666 =head2 secondary_patterns
1668 Title : secondary_patterns
1669 Usage : $obj->secondary_patterns($newval)
1672 Returns : value of secondary_patterns
1673 Args : newvalue (optional)
1678 sub secondary_patterns
{
1679 my ( $obj, $value ) = @_;
1680 if ( defined $value ) {
1681 $obj->{'secondary_patterns'} = $value;
1683 my @names = keys %$value;
1685 foreach my $name (@names) {
1686 $obj->secondary_namespaces($name);
1689 return $obj->{'secondary_patterns'};
1693 =head2 secondary_namespaces
1695 Title : secondary_namespaces
1696 Usage : $obj->secondary_namespaces($newval)
1699 Returns : value of secondary_namespaces
1700 Args : newvalue (optional)
1705 sub secondary_namespaces
{
1706 my ( $obj, @values ) = @_;
1709 push( @
{ $obj->{'secondary_namespaces'} }, @values );
1711 return @
{ $obj->{'secondary_namespaces'} || [] };
1714 ## These are indexing routines to index commonly used format - fasta
1715 ## swissprot and embl
1717 sub new_SWISSPROT_index
{
1718 my ( $self, $index_dir, @files ) = @_;
1720 my %secondary_patterns;
1722 my $start_pattern = "^ID (\\S+)";
1723 my $primary_pattern = "^AC (\\S+)\\;";
1725 $secondary_patterns{"ID"} = $start_pattern;
1727 my $index = Bio
::DB
::Flat
::BinarySearch
->new(
1728 -index_dir
=> $index_dir,
1729 -format
=> 'swissprot',
1730 -primary_pattern
=> $primary_pattern,
1731 -primary_namespace
=> "ACC",
1732 -start_pattern
=> $start_pattern,
1733 -secondary_patterns
=> \
%secondary_patterns
1736 $index->build_index(@files);
1739 sub new_EMBL_index
{
1740 my ( $self, $index_dir, @files ) = @_;
1742 my %secondary_patterns;
1744 my $start_pattern = "^ID (\\S+)";
1745 my $primary_pattern = "^AC (\\S+)\\;";
1746 my $primary_namespace = "ACC";
1748 $secondary_patterns{"ID"} = $start_pattern;
1750 my $index = Bio
::DB
::Flat
::BinarySearch
->new(
1751 -index_dir
=> $index_dir,
1753 -primary_pattern
=> $primary_pattern,
1754 -primary_namespace
=> "ACC",
1755 -start_pattern
=> $start_pattern,
1756 -secondary_patterns
=> \
%secondary_patterns
1759 $index->build_index(@files);
1764 sub new_FASTA_index
{
1765 my ( $self, $index_dir, @files ) = @_;
1767 my %secondary_patterns;
1769 my $start_pattern = "^>";
1770 my $primary_pattern = "^>(\\S+)";
1771 my $primary_namespace = "ACC";
1773 $secondary_patterns{"ID"} = "^>\\S+ +(\\S+)";
1775 my $index = Bio
::DB
::Flat
::BinarySearch
->new(
1776 -index_dir
=> $index_dir,
1778 -primary_pattern
=> $primary_pattern,
1779 -primary_namespace
=> "ACC",
1780 -start_pattern
=> $start_pattern,
1781 -secondary_patterns
=> \
%secondary_patterns
1784 $index->build_index(@files);
1789 # EVERYTHING THAT FOLLOWS THIS
1790 # is an awful hack - in reality Michele's code needs to be rewritten
1791 # to use Bio::SeqIO, but I have too little time to do this -- LS
1792 sub guess_alphabet
{
1796 my $format = $self->format;
1797 return 'protein' if $format eq 'swissprot';
1799 if ( $format eq 'genbank' ) {
1800 return unless $line =~ /^LOCUS/;
1801 return 'dna' if $line =~ /\s+\d+\s+bp/i;
1805 if ( $format eq 'embl' ) {
1806 return unless $line =~ /^ID/;
1807 return 'dna' if $line =~ / DNA;/i;
1808 return 'rna' if $line =~ / RNA;/i;
1815 # return (namespace,primary_pattern,start_pattern,secondary_pattern)
1816 sub _guess_patterns
{
1819 if ( $format =~ /swiss(prot)?/i ) {
1820 return ( 'ID', "^ID (\\S+)", "^ID (\\S+)",
1821 { ACC
=> "^AC (\\S+);" } );
1824 if ($format =~ /embl/i) {
1829 ACC
=> q
/^AC (\S+);/,
1830 VERSION
=> q
/^SV\s+(\S+)/
1834 if ( $format =~ /genbank/i ) {
1840 ACC
=> q
/^ACCESSION\s+(\S+)/,
1841 VERSION
=> q
/^VERSION\s+(\S+)/
1846 if ( $format =~ /fasta/i ) {
1847 return ( 'ACC', '^>(\S+)', '^>(\S+)', );
1850 $self->throw("I can't handle format $format");