2 # BioPerl module for Bio::Index::AbstractSeq
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Ewan Birney <birney@ebi.ac.uk>
8 # Copyright Ewan Birney
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::Index::AbstractSeq - base class for AbstractSeq
20 # Make a new sequence file indexing package
22 package MyShinyNewIndexer;
24 use base qw(Bio::Index::AbstractSeq);
26 # Now provide the necessary methods...
30 Provides a common base class for multiple sequence files built using
31 the Bio::Index::Abstract system, and provides a Bio::DB::SeqI
38 User feedback is an integral part of the evolution of this
39 and other Bioperl modules. Send your comments and suggestions
40 preferably to one of the Bioperl mailing lists.
41 Your participation is much appreciated.
43 bioperl-l@bioperl.org - General discussion
44 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
48 Please direct usage questions or support issues to the mailing list:
50 I<bioperl-l@bioperl.org>
52 rather than to the module maintainer directly. Many experienced and
53 responsive experts will be able to look at the problem and quickly
54 address it. Please include a thorough description of the problem
55 with code and data examples if at all possible.
59 Report bugs to the Bioperl bug tracking system to help us keep track
60 of the bugs and their resolution. Bug reports can be submitted via the
63 https://github.com/bioperl/bioperl-live/issues
65 =head1 AUTHOR - Ewan Birney
67 Email birney@ebi.ac.uk
71 The rest of the documentation details each of the object methods.
72 Internal methods are usually preceded with a _
76 L<Bio::Index::Abstract>, which provides dbm indexing for flat files of
77 any type, containing sequence or not. L<Bio::Index::AbstractSeq> inherits
78 from L<Bio::Index::Abstract>
82 # Let's begin the code ...
84 package Bio
::Index
::AbstractSeq
;
88 use Bio
::SeqIO
::MultiFile
;
90 use base
qw(Bio::Index::Abstract Bio::DB::SeqI);
# new( @args )
#
# Constructor. Hands @args to the parent class constructor unchanged and
# then attaches an empty cache of Bio::SeqIO objects to the new object.
#
# Args    : passed straight through to SUPER::new
# Returns : the newly built index object
sub new {
    my ($class, @args) = @_;
    my $self = $class->SUPER::new(@args);

    # Indexed by file number; entries are created on demand by
    # _get_SeqIO_object().
    $self->{'_seqio_cache'} = [];

    return $self;
}
103 Usage : $self->_file_format
104 Function: Derived classes should override this
105 method (it throws an exception here)
106 to give the file format of the files used
# _file_format()
#
# Placeholder that derived classes are expected to override so that it
# gives the file format of the indexed files. This base implementation
# only reports, via throw(), which class forgot to do so.
#
# Args    : none used
# Throws  : always, naming the concrete class of the invocant
sub _file_format {
    my ($self, @args) = @_;

    my $pkg = ref($self);
    $self->throw("Class '$pkg' must provide a file format method correctly");
}
123 Usage : $index->fetch( $id )
124 Function: Returns a Bio::Seq object from the index
125 Example : $seq = $index->fetch( 'dJ67B12' )
126 Returns : Bio::Seq object
# fetch( $id )
#
# Looks $id up in the index and, when a record exists, reads the sequence
# that starts at the recorded byte offset of the recorded file.
#
# Args    : $id - key to look up in the index, e.g. 'dJ67B12'
# Returns : whatever next_seq() yields for that record (a Bio::Seq object
#           according to the POD), or undef when $id is not in the index
sub fetch {
    my ( $self, $id ) = @_;
    my $db = $self->db();
    my $seq;

    if ( my $rec = $db->{$id} ) {
        my ( $file, $begin ) = $self->unpack_record($rec);

        # Get the (possibly cached) SeqIO object for that file
        my $seqio = $self->_get_SeqIO_object($file);
        my $fh    = $seqio->_fh();

        # Move to the start of the record before asking for a sequence
        # $begin-- if( $^O =~ /mswin/i); # workaround for Win DB_File bug
        seek( $fh, $begin, 0 );

        $seq = $seqio->next_seq();
    }

    # We essentially assume that the primary_id for the database is the
    # display_id: a primary_id made up of non-digits only is overwritten
    # with the display_id.
    if (   ref($seq)
        && $seq->isa('Bio::PrimarySeqI')
        && $seq->primary_id =~ /^\D+$/ )
    {
        $seq->primary_id( $seq->display_id() );
    }

    return $seq;
}
159 =head2 _get_SeqIO_object
161 Title : _get_SeqIO_object
162 Usage : $index->_get_SeqIO_object( $file )
163 Function: Returns a Bio::SeqIO object for the file
164 Example : $seq = $index->_get_SeqIO_object( 0 )
165 Returns : Bio::SeqIO object
166 Args : File number (an integer)
# _get_SeqIO_object( $i )
#
# Returns the Bio::SeqIO object for file number $i, building it on first
# use and keeping it in $self->{'_seqio_cache'} for later calls.
#
# Args    : $i - file number (an integer)
# Returns : the cached Bio::SeqIO object for that file
sub _get_SeqIO_object {
    my ( $self, $i ) = @_;

    my $cache = $self->{'_seqio_cache'};

    unless ( $cache->[$i] ) {
        my $fh = $self->_file_handle($i);

        # Make a new SeqIO object that reads from the index's own handle,
        # in the format reported by the concrete subclass.
        $cache->[$i] = Bio::SeqIO->new(
            -Format => $self->_file_format,
            -fh     => $fh,
        );
    }

    return $cache->[$i];
}
185 Title : get_Seq_by_id
186 Usage : $seq = $db->get_Seq_by_id()
187 Function: retrieves a sequence object, identically to
188 ->fetch, but here behaving as a Bio::DB::SeqI
189 Returns : new Bio::Seq object
190 Args : string represents the id
# get_Seq_by_id( $id )
#
# Retrieves a sequence object exactly as fetch() does; provided so the
# index can be used through the Bio::DB::SeqI-style method name.
#
# Args    : $id - string representing the id
# Returns : whatever fetch($id) returns
sub get_Seq_by_id {
    my ( $self, $id ) = @_;

    return $self->fetch($id);
}
201 =head2 get_Seq_by_acc
203 Title : get_Seq_by_acc
204 Usage : $seq = $db->get_Seq_by_acc()
205 Function: retrieves a sequence object, identically to
206 ->fetch, but here behaving as a Bio::DB::SeqI
207 Returns : new Bio::Seq object
208 Args : string represents the accession number
# get_Seq_by_acc( $id )
#
# Retrieves a sequence object exactly as fetch() does; the accession
# number is used as the index key.
#
# Args    : $id - string representing the accession number
# Returns : whatever fetch($id) returns
sub get_Seq_by_acc {
    my ( $self, $id ) = @_;

    return $self->fetch($id);
}
219 =head2 get_PrimarySeq_stream
221 Title : get_PrimarySeq_stream
222 Usage : $stream = get_PrimarySeq_stream
223 Function: Makes a Bio::DB::SeqStreamI compliant object
224 which provides a single method, next_primary_seq
225 Returns : Bio::DB::SeqStreamI
# get_PrimarySeq_stream()
#
# Builds one Bio::SeqIO::MultiFile stream over every file recorded in the
# index, using the format reported by _file_format().
#
# Args    : none
# Returns : the Bio::SeqIO::MultiFile object
sub get_PrimarySeq_stream {
    my $self = shift;

    my $num = $self->_file_count() || 0;

    # Each file is described by a "__FILE_<n>" record in the index; the
    # first field of the unpacked record is the file name, the second
    # (the stored size) is not needed here.
    my @file = map {
        my ($file) = $self->unpack_record( $self->db->{"__FILE_$_"} );
        $file;
    } 0 .. $num - 1;

    my $out = Bio::SeqIO::MultiFile->new(
        '-format' => $self->_file_format,
        -files    => \@file,
    );

    return $out;
}
245 =head2 get_all_primary_ids
247 Title : get_all_primary_ids
248 Usage : @ids = $seqdb->get_all_primary_ids()
249 Function: gives an array of all the primary_ids of the
250 sequence objects in the database. These
251 may be ids (display style) or accession numbers
252 or something else completely different - they
253 *are not* meaningful outside of this database
256 Returns : an array of strings
# get_all_primary_ids()
#
# Returns one id for every distinct sequence record in the index.
#
# Args    : none used
# Returns : a list of id strings, in no particular order
sub get_all_primary_ids {
    my ( $self, @args ) = @_;
    my $db = $self->db;

    # The problem here is that we have indexed things both on accession
    # number and on name, so several keys can point at the same record.
    #
    # We could take two options: loop over the database, returning only
    # one copy of each id that points to the same byte position, or rely
    # on the semantics of accession numbers.
    #
    # Someone is going to index a database with no accession numbers, so
    # we have to uniquify the index by file and byte position.
    my %bytepos;

    # each() walks the hash one pair at a time instead of building the
    # whole key list first.
    while ( my ( $id, $rec ) = each %$db ) {

        # Keys starting with "__" (such as the "__FILE_<n>" records) hold
        # index bookkeeping, not sequences.
        next if $id =~ /^__/;

        my ( $file, $begin ) = $self->unpack_record($rec);

        # When several ids share a position, whichever is seen last wins.
        $bytepos{"$file:$begin"} = $id;
    }

    return values %bytepos;
}
292 =head2 get_Seq_by_primary_id
294 Title : get_Seq_by_primary_id
295 Usage : $seq = $db->get_Seq_by_primary_id($primary_id_string);
296 Function: Gets a Bio::Seq object by the primary id. The primary
297 id in these cases has to come from $db->get_all_primary_ids.
298 There is no other way to get (or guess) the primary_ids
301 The other possibility is to get Bio::PrimarySeqI objects
302 via the get_PrimarySeq_stream and the primary_id field
303 on these objects are specified as the ids to use here.
304 Returns : A Bio::Seq object
305 Args : primary id (as a string)
306 Throws : "acc does not exist" exception
# get_Seq_by_primary_id( $id )
#
# Gets a sequence object by primary id. The lookup is the same one
# fetch() performs, so the id has to be a key of this index, such as a
# value returned by get_all_primary_ids().
#
# Args    : $id - primary id (as a string)
# Returns : whatever fetch($id) returns
sub get_Seq_by_primary_id {
    my ( $self, $id ) = @_;

    return $self->fetch($id);
}