2 # BioPerl module for Bio::DB::Fasta
4 # You may distribute this module under the same terms as perl itself
10 Bio::DB::Fasta - Fast indexed access to fasta files
16 # Create database from a directory of Fasta files
17 my $db = Bio::DB::Fasta->new('/path/to/fasta/files/');
18 my @ids = $db->get_all_primary_ids;
21 my $seqstr = $db->seq('CHROMOSOME_I', 4_000_000 => 4_100_000);
22 my $revseq = $db->seq('CHROMOSOME_I', 4_100_000 => 4_000_000);
23 my $length = $db->length('CHROMOSOME_I');
24 my $header = $db->header('CHROMOSOME_I');
25 my $alphabet = $db->alphabet('CHROMOSOME_I');
27 # Access to sequence objects. See Bio::PrimarySeqI.
28 my $seq = $db->get_Seq_by_id('CHROMOSOME_I');
29 my $seqstr = $seq->seq;
30 my $subseq = $seq->subseq(4_000_000 => 4_100_000);
31 my $trunc = $seq->trunc(4_000_000 => 4_100_000);
32 my $length = $seq->length;
34 # Loop through sequence objects
35 my $stream = $db->get_PrimarySeq_stream;
36 while (my $seq = $stream->next_seq) {
37 # Bio::PrimarySeqI stuff
41 my $fh = Bio::DB::Fasta->newFh('/path/to/fasta/files/');
42 while (my $seq = <$fh>) {
43 # Bio::PrimarySeqI stuff
47 tie %sequences,'Bio::DB::Fasta','/path/to/fasta/files/';
48 print $sequences{'CHROMOSOME_I:1,20000'};
52 Bio::DB::Fasta provides indexed access to a single Fasta file, several files,
53 or a directory of files. It provides persistent random access to each sequence
54 entry (either as a Bio::PrimarySeqI-compliant object or a string), and to
55 subsequences within each entry, allowing you to retrieve portions of very large
56 sequences without bringing the entire sequence into memory. Bio::DB::Fasta is
57 based on Bio::DB::IndexedBase. See this module's documentation for details.
59 The Fasta files may contain any combination of nucleotide and protein sequences;
60 during indexing the module guesses the molecular type. Entries may have any line
61 length up to 65,536 characters, and different line lengths are allowed in the
62 same file. However, within a sequence entry, all lines must be the same length
63 except for the last. An error will be thrown if this is not the case.
65 The module uses /^E<gt>(\S+)/ to extract the primary ID of each sequence
66 from the Fasta header. See -makeid in Bio::DB::IndexedBase to pass a callback
67 routine to reversibly modify this primary ID, e.g. if you wish to extract a
68 specific portion of the gi|gb|abc|xyz GenBank IDs.
70 =head1 DATABASE CREATION AND INDEXING
72 The object-oriented constructor is new(), the filehandle constructor is newFh()
73 and the tied hash constructor is tie(). They all allow one to index a single Fasta
74 file, several files, or a directory of files. See Bio::DB::IndexedBase.
78 L<Bio::DB::IndexedBase>
86 Lincoln Stein E<lt>lstein@cshl.orgE<gt>.
88 Copyright (c) 2001 Cold Spring Harbor Laboratory.
90 This library is free software; you can redistribute it and/or modify
91 it under the same terms as Perl itself. See DISCLAIMER.txt for
92 disclaimers of warranty.
96 The rest of the documentation details each of the object
97 methods. Internal methods are usually preceded with a _
99 For BioPerl-style access, the following methods are provided:
103 Title : get_Seq_by_id, get_Seq_by_acc, get_Seq_by_primary_id
104 Usage : my $seq = $db->get_Seq_by_id($id);
105 Function: Given an ID, fetch the corresponding sequence from the database.
106 Returns : A Bio::PrimarySeq::Fasta object (Bio::PrimarySeqI compliant)
107 Note that to save resource, Bio::PrimarySeq::Fasta sequence objects
108 only load the sequence string into memory when requested using seq().
109 See L<Bio::PrimarySeqI> for methods provided by the sequence objects
110 returned from get_Seq_by_id() and get_PrimarySeq_stream().
113 =head2 get_PrimarySeq_stream
115 Title : get_PrimarySeq_stream
116 Usage : my $stream = $db->get_PrimarySeq_stream();
117 Function: Get a stream of Bio::PrimarySeq::Fasta objects. The stream supports a
118 single method, next_seq(). Each call to next_seq() returns a new
119 Bio::PrimarySeq::Fasta sequence object, until no more sequences remain.
120 Returns : A Bio::DB::Indexed::Stream object
125 For simple access, the following methods are provided:
130 package Bio
::DB
::Fasta
;
135 use Bio
::PrimarySeqI
;
137 use base
qw(Bio::DB::IndexedBase);
139 our $obj_class = 'Bio::PrimarySeq::Fasta';
140 our $file_glob = '*.{fa,FA,fasta,FASTA,fast,FAST,dna,DNA,fna,FNA,faa,FAA,fsa,FSA}';
146 Usage : my $db = Bio::DB::Fasta->new( $path, %options);
147 Function: Initialize a new database object. When indexing a directory, files
148 ending in .fa,fasta,fast,dna,fna,faa,fsa are indexed by default.
149 Returns : A new Bio::DB::Fasta object.
150 Args : A single file, or path to dir, or arrayref of files
151 Optional arguments: see Bio::DB::IndexedBase
156 sub _calculate_offsets
{
157 # Bio::DB::IndexedBase calls this to calculate offsets
158 my ($self, $fileno, $file, $offsets) = @_;
160 my $fh = IO
::File
->new($file) or $self->throw( "Could not open $file: $!");
162 warn "Indexing $file\n" if $self->{debug
};
163 my ($offset, @ids, $linelen, $alphabet, $headerlen, $count, $seq_lines,
164 $last_line, %offsets);
165 my ($l3_len, $l2_len, $l_len, $blank_lines) = (0, 0, 0, 0);
167 my $termination_length = $self->{termination_length
};
168 while (my $line = <$fh>) {
169 # Account for crlf-terminated Windows files
170 if (index($line, '>') == 0) {
171 if ($line =~ /^>(\S+)/) {
172 print STDERR
"Indexed $count sequences...\n"
173 if $self->{debug
} && (++$count%1000) == 0;
175 # please, do not enforce arbitrary line length requirements.
176 # It's good practice but not enforced.
178 #$self->_check_linelength($linelen);
181 my $strlen = $pos - $offset - length($line);
182 $strlen -= $termination_length * $seq_lines;
183 my $ppos = &{$self->{packmeth
}}($offset, $strlen, $strlen,
184 $linelen, $headerlen, $alphabet, $fileno);
185 $alphabet = Bio
::DB
::IndexedBase
::NA
;
187 $offsets->{$id} = $ppos;
190 @ids = $self->_makeid($line);
191 ($offset, $headerlen, $linelen, $seq_lines) = ($pos, length $line, 0, 0);
192 ($l3_len, $l2_len, $l_len, $blank_lines) = (0, 0, 0, 0);
194 # Catch bad header lines, bug 3172
195 $self->throw("FASTA header doesn't match '>(\\S+)': $line");
197 } elsif ($line !~ /\S/) {
202 # Need to check every line :(
205 $l_len = length $line;
206 if (Bio
::DB
::IndexedBase
::DIE_ON_MISSMATCHED_LINES
) {
207 if ( ($l3_len > 0) && ($l2_len > 0) && ($l3_len != $l2_len) ) {
208 my $fap = substr($line, 0, 20)."..";
209 $self->throw("Each line of the fasta entry must be the same ".
210 "length except the last. Line above #$. '$fap' is $l2_len".
211 " != $l3_len chars.");
214 # Blank lines not allowed in entry
215 $self->throw("Blank lines can only precede header lines, ".
216 "found preceding line #$.");
219 $linelen ||= length $line;
220 $alphabet ||= $self->_guess_alphabet($line);
227 $self->_check_linelength($linelen);
230 my $strlen = $pos - $offset;
231 if ($linelen == 0) { # yet another pesky empty chr_random.fa file
234 if ($last_line !~ /\s$/) {
237 $strlen -= $termination_length * $seq_lines;
239 my $ppos = &{$self->{packmeth
}}($offset, $strlen, $strlen, $linelen,
240 $headerlen, $alphabet, $fileno);
242 $offsets->{$id} = $ppos;
251 Title : seq, sequence, subseq
252 Usage : # Entire sequence string
253 my $seqstr = $db->seq($id);
255 my $subseqstr = $db->seq($id, $start, $stop, $strand);
257 my $subseqstr = $db->seq($compound_id);
258 Function: Get a subseq of a sequence from the database. For your convenience,
259 the sequence to extract can be specified with any of the following
261 $db->seq("$id:$start,$stop")
262 $db->seq("$id:$start..$stop")
263 $db->seq("$id:$start-$stop")
264 $db->seq("$id:$start,$stop/$strand")
265 $db->seq("$id:$start..$stop/$strand")
266 $db->seq("$id:$start-$stop/$strand")
267 $db->seq("$id/$strand")
268 In the case of DNA or RNA sequence, if $stop is less than $start,
269 then the reverse complement of the sequence is returned. Avoid using
270 it if possible since this goes against Bio::Seq conventions.
272 Args : ID of sequence to retrieve
274 Compound ID of subsequence to fetch
276 ID, optional start (defaults to 1), optional end (defaults to length
277 of sequence) and optional strand (defaults to 1).
282 my ($self, $id, $start, $stop, $strand) = @_;
283 $self->throw('Need to provide a sequence ID') if not defined $id;
284 ($id, $start, $stop, $strand) = $self->_parse_compound_id($id, $start, $stop, $strand);
288 my $fh = $self->_fh($id) or return;
289 my $filestart = $self->_calc_offset($id, $start);
290 my $filestop = $self->_calc_offset($id, $stop );
292 seek($fh, $filestart,0);
293 read($fh, $data, $filestop-$filestart+1);
295 $data = Bio
::DB
::IndexedBase
::_strip_crnl
($data);
298 # Reverse-complement the sequence
299 $data = Bio
::PrimarySeqI
::_revcom_from_string
($self, $data, $self->alphabet($id));
304 *seq
= *sequence
= \
&subseq
;
310 Usage : my $length = $qualdb->length($id);
311 Function: Get the number of residues in the indicated sequence.
318 Usage : my $header = $db->header($id);
319 Function: Get the header line (ID and description fields) of the specified
322 Args : ID of sequence
327 my ($self, $id) = @_;
328 $self->throw('Need to provide a sequence ID') if not defined $id;
329 my ($offset, $headerlen) = (&{$self->{unpackmeth
}}($self->{offsets
}{$id}))[0,4];
330 $offset -= $headerlen;
332 my $fh = $self->_fh($id) or return;
333 seek($fh, $offset, 0);
334 read($fh, $data, $headerlen);
335 # On Windows chomp remove '\n' but leaves '\r'
336 # when reading '\r\n' in binary mode
337 $data = Bio
::DB
::IndexedBase
::_strip_crnl
($data);
338 substr($data, 0, 1) = '';
346 Usage : my $alphabet = $db->alphabet($id);
347 Function: Get the molecular type of the indicated sequence: dna, rna or protein
349 Args : ID of sequence
354 #-------------------------------------------------------------
355 # Bio::PrimarySeqI compatibility
357 package Bio
::PrimarySeq
::Fasta
;
358 use overload
'""' => 'display_id';
360 use base
qw(Bio::Root::Root Bio::PrimarySeqI);
363 my ($class, @args) = @_;
364 my $self = $class->SUPER::new
(@args);
365 my ($db, $id, $start, $stop) = $self->_rearrange(
366 [qw(DATABASE ID START STOP)],
370 $self->{stop
} = $stop || $db->length($id);
371 $self->{start
} = $start || ($self->{stop
} > 0 ?
1 : 0); # handle 0-length seqs
376 return shift->seq(@_);
381 return $self->{db
}->seq($self->{id
}, $self->{start
}, $self->{stop
});
386 return $self->trunc(@_)->seq();
390 # Override Bio::PrimarySeqI trunc() method. This way, we create an object
391 # that does not store the sequence in memory.
392 my ($self, $start, $stop) = @_;
393 $self->throw("Stop cannot be smaller than start") if $stop < $start;
394 if ($self->{start
} <= $self->{stop
}) {
395 $start = $self->{start
}+$start-1;
396 $stop = $self->{start
}+$stop-1;
398 $start = $self->{start
}-($start-1);
399 $stop = $self->{start
}-($stop-1);
401 return $self->new( $self->{db
}, $self->{id
}, $start, $stop );
406 return $self->{is_circular
};
414 sub accession_number
{
420 # Following Bio::PrimarySeqI, since this sequence has no accession number,
421 # its primary_id should be a stringified memory location.
423 return overload
::StrVal
($self);
432 return $self->{db
}->alphabet($self->{id
});
436 # Override Bio::PrimarySeqI revcom() with optimized method.
438 return $self->new(@
{$self}{'db', 'id', 'stop', 'start'});
442 # Get length from sequence location, not the sequence string (too expensive)
444 return $self->{start
} < $self->{stop
} ?
445 $self->{stop
} - $self->{start
} + 1 :
446 $self->{start
} - $self->{stop
} + 1 ;
451 my $header = $self->{'db'}->header($self->{id
});
452 # Remove the ID from the header
453 return (split(/\s+/, $header, 2))[1];
455 *desc
= \
&description
;