2 # BioPerl module for Bio::DB::Qual
4 # You may distribute this module under the same terms as perl itself
9 Bio::DB::Qual - Fast indexed access to quality files
15 # create database from directory of qual files
16 my $db = Bio::DB::Qual->new('/path/to/qual/files/');
17 my @ids = $db->get_all_primary_ids;
20 my @qualarr = @{$db->qual('CHROMOSOME_I',4_000_000 => 4_100_000)};
21 my @revqual = @{$db->qual('CHROMOSOME_I',4_100_000 => 4_000_000)};
22 my $length = $db->length('CHROMOSOME_I');
23 my $header = $db->header('CHROMOSOME_I');
25 # Access to sequence objects. See Bio::PrimarySeqI.
26 my $obj = $db->get_Qual_by_id('CHROMOSOME_I');
27 my @qual = @{$obj->qual};
28 my @subqual = @{$obj->subqual(4_000_000 => 4_100_000)};
29 my $length = $obj->length;
31 # Loop through sequence objects
32 my $stream = $db->get_PrimarySeq_stream;
33 while (my $qual = $stream->next_seq) {
34 # Bio::Seq::PrimaryQual operations
38 my $fh = Bio::DB::Qual->newFh('/path/to/qual/files/');
39 while (my $qual = <$fh>) {
40 # Bio::Seq::PrimaryQual operations
44 tie %qualities,'Bio::DB::Qual','/path/to/qual/files/';
45 print $qualities{'CHROMOSOME_I:1,20000'};
49 Bio::DB::Qual provides indexed access to a single Fasta file, several files,
50 or a directory of files. It provides random access to each quality score entry
51 without having to read the file from the beginning. Access to subqualities
52 (portions of a quality score) is provided, although contrary to Bio::DB::Fasta,
53 the full quality score has to be brought in memory. Bio::DB::Qual is based on
54 Bio::DB::IndexedBase. See this module's documentation for details.
56 The qual files should contain decimal quality scores. Entries may have any line
57 length up to 65,536 characters, and different line lengths are allowed in the
58 same file. However, within a quality score entry, all lines must be the same
59 length except for the last. An error will be thrown if this is not the case.
61 The module uses /^E<gt>(\S+)/ to extract the primary ID of each quality score
62 from the qual header. See -makeid in Bio::DB::IndexedBase to pass a callback
63 routine to reversibly modify this primary ID, e.g. if you wish to extract a
64 specific portion of the gi|gb|abc|xyz GenBank IDs.
66 =head1 DATABASE CREATION AND INDEXING
68 The object-oriented constructor is new(), the filehandle constructor is newFh()
69 and the tied hash constructor is tie(). They all allow one to index a single Fasta
70 file, several files, or a directory of files. See Bio::DB::IndexedBase.
74 L<Bio::DB::IndexedBase>
78 L<Bio::Seq::PrimaryQual>
82 When a quality score is deleted from one of the qual files, this deletion is not
83 detected by the module and removed from the index. As a result, a "ghost" entry
84 will remain in the index and will return garbage results if accessed. Currently,
85 the only way to accommodate deletions is to rebuild the entire index, either by
86 deleting it manually, or by passing -reindex=E<gt>1 to new() when
87 initializing the module.
89 All quality score lines for a given quality score must have the same length
90 except for the last (not sure why there is this limitation). This is not
91 problematic for sequences but could be annoying for quality scores. A workaround
92 is to make sure that your quality scores fit on no more than 2 lines. Another
93 solution could be to padd them with blank spaces so that each line has the same
94 number of characters (maybe this padding should be implemented in
99 Florent E Angly E<lt>florent . angly @ gmail-dot-comE<gt>.
101 Module largely based on and adapted from Bio::DB::Fasta by Lincoln Stein.
103 Copyright (c) 2007 Florent E Angly.
105 This library is free software; you can redistribute it and/or modify
106 it under the same terms as Perl itself.
110 The rest of the documentation details each of the object
111 methods. Internal methods are usually preceded with a _
113 For BioPerl-style access, the following methods are provided:
117 Title : get_Seq_by_id, get_Seq_by_acc, get_Seq_by_version, get_Seq_by_primary_id,
118 get_Qual_by_id, get_qual_by_acc, get_qual_by_version, get_qual_by_primary_id,
119 Usage : my $seq = $db->get_Seq_by_id($id);
120 Function: Given an ID, fetch the corresponding sequence from the database.
121 Returns : A Bio::PrimarySeq::Fasta object (Bio::PrimarySeqI compliant)
122 Note that to save resource, Bio::PrimarySeq::Fasta sequence objects
123 only load the sequence string into memory when requested using seq().
124 See L<Bio::PrimarySeqI> for methods provided by the sequence objects
125 returned from get_Seq_by_id() and get_PrimarySeq_stream().
128 =head2 get_PrimarySeq_stream
130 Title : get_Seq_stream, get_PrimarySeq_stream
131 Usage : my $stream = $db->get_Seq_stream();
132 Function: Get a stream of Bio::PrimarySeq::Fasta objects. The stream supports a
133 single method, next_seq(). Each call to next_seq() returns a new
134 Bio::PrimarySeq::Fasta sequence object, until no more sequences remain.
135 Returns : A Bio::DB::Indexed::Stream object
140 For simple access, the following methods are provided:
145 package Bio
::DB
::Qual
;
151 use base
qw(Bio::DB::IndexedBase);
153 our $obj_class = 'Bio::Seq::PrimaryQual::Qual';
154 our $file_glob = '*.{qual,QUAL,qa,QA}';
160 Usage : my $db = Bio::DB::Qual->new( $path, %options);
161 Function: Initialize a new database object. When indexing a directory, files
162 ending in .qual,qa are indexed by default.
163 Returns : A new Bio::DB::Qual object
164 Args : A single file, or path to dir, or arrayref of files
165 Optional arguments: see Bio::DB::IndexedBase
170 sub _calculate_offsets
{
171 # Bio::DB::IndexedBase calls this to calculate offsets
172 my ($self, $fileno, $file, $offsets) = @_;
174 my $fh = IO
::File
->new($file) or $self->throw("Could not open $file: $!");
176 warn "Indexing $file\n" if $self->{debug
};
177 my ($offset, @ids, $linelen, $headerlen, $count, $qual_lines, $last_line,
179 my ($l3_len, $l2_len, $l_len, $blank_lines) = (0, 0, 0, 0);
181 my $termination_length = $self->{termination_length
};
182 while (my $line = <$fh>) {
183 # Account for crlf-terminated Windows files
184 if (index($line, '>') == 0) {
185 if ($line =~ /^>(\S+)/) {
186 print STDERR
"Indexed $count quality scores...\n"
187 if $self->{debug
} && (++$count%1000) == 0;
188 $self->_check_linelength($linelen);
191 my $strlen = $pos - $offset - length($line);
192 $strlen -= $termination_length * $qual_lines;
193 my $ppos = &{$self->{packmeth
}}($offset, $strlen, $numres,
194 $linelen, $headerlen, Bio
::DB
::IndexedBase
::NA
, $fileno);
196 $offsets->{$id} = $ppos;
200 @ids = $self->_makeid($line);
201 ($offset, $headerlen, $linelen, $qual_lines) = ($pos, length $line, 0, 0);
202 ($l3_len, $l2_len, $l_len, $blank_lines) = (0, 0, 0, 0);
204 # Catch bad header lines, bug 3172
205 $self->throw("FASTA header doesn't match '>(\\S+)': $line");
207 } elsif ($line !~ /\S/) {
212 # Need to check every line :(
215 $l_len = length $line;
216 if (Bio
::DB
::IndexedBase
::DIE_ON_MISSMATCHED_LINES
) {
217 if ( ($l3_len > 0) && ($l2_len > 0) && ($l3_len != $l2_len) ) {
218 my $fap = substr($line, 0, 20)."..";
219 $self->throw("Each line of the qual entry must be the same ".
220 "length except the last. Line above #$. '$fap' is $l2_len".
221 " != $l3_len chars.");
224 # Blank lines not allowed in entry
225 $self->throw("Blank lines can only precede header lines, ".
226 "found preceding line #$.");
229 $linelen ||= length $line;
231 $numres += scalar(split /\s+/, $line);
237 $self->_check_linelength($linelen);
240 my $strlen = $pos - $offset;
244 if ($last_line !~ /\s$/) {
247 $strlen -= $termination_length * $qual_lines;
249 my $ppos = &{$self->{packmeth
}}($offset, $strlen, $numres, $linelen,
250 $headerlen, Bio
::DB
::IndexedBase
::NA
, $fileno);
252 $offsets->{$id} = $ppos;
260 # for backward compatibility
261 sub get_PrimaryQual_stream
{
263 return $self->get_PrimarySeq_stream;
267 # for backward compatibility
269 my ($self, $id) = @_;
270 return $self->get_Seq_by_id($id);
273 *get_qual_by_version
= *get_qual_by_primary_id
= *get_qual_by_acc
= \
&get_Qual_by_id
;
278 Title : qual, quality, subqual
279 Usage : # All quality scores
280 my @qualarr = @{$qualdb->subqual($id)};
281 # Subset of the quality scores
282 my @subqualarr = @{$qualdb->subqual($id, $start, $stop, $strand)};
284 my @subqualarr = @{$qualdb->subqual($compound_id)};
285 Function: Get a subqual of an entry in the database. For your convenience,
286 the sequence to extract can be specified with any of the following
288 $db->qual("$id:$start,$stop")
289 $db->qual("$id:$start..$stop")
290 $db->qual("$id:$start-$stop")
291 $db->qual("$id:$start,$stop/$strand")
292 $db->qual("$id:$start..$stop/$strand")
293 $db->qual("$id:$start-$stop/$strand")
294 $db->qual("$id/$strand")
295 If $stop is less than $start, then the reverse complement of the
296 sequence is returned. Avoid using it if possible since this goes
297 against Bio::Seq conventions.
298 Returns : Reference to an array of quality scores
299 Args : Compound ID of entry to retrieve
301 ID, optional start (defaults to 1), optional end (defaults to the
302 number of quality scores for this sequence), and strand (defaults to
308 my ($self, $id, $start, $stop, $strand) = @_;
310 # Quality values in a quality score can have 1 or 2 digits and are separated
311 # by one (or several?) spaces. Thus contrary to Bio::DB::Fasta, here there
312 # is no easy way match the position of a quality value to its position in
313 # the quality string.
314 # As a consequence, if a subqual of the quality is requested, we still need
315 # to grab the full quality string first - performance penalty for big
317 # I think there is no way around starting at the begining of the quality
318 # score but maybe there is a resource-efficient way of starting at the
319 # begining of the quality score and stopping when the the position of the
320 # last quality value requested is reached??
322 $self->throw('Need to provide a sequence ID') if not defined $id;
323 ($id, $start, $stop, $strand) = $self->_parse_compound_id($id, $start, $stop, $strand);
325 # Position in quality string
326 my $string_start = 1;
327 my $string_stop = $self->strlen($id);
329 # Fetch full quality string
330 my $fh = $self->_fh($id) or return;
331 my $filestart = $self->_calc_offset($id, $string_start);
332 my $filestop = $self->_calc_offset($id, $string_stop );
333 seek($fh, $filestart,0);
335 read($fh, $data, $filestop-$filestart+1);
337 # Process quality score
338 Bio
::DB
::IndexedBase
::_strip_crnl
($data);
340 $subqual = 1 if ( $start || $stop );
342 if ( $subqual || ($strand == -1) ) {
343 @data = split / /, $data, $stop+1;
344 my $length = scalar(@data);
345 $start = 1 if $start < 1;
346 $stop = $length if $stop > $length;
347 pop @data if ($stop != $length);
348 splice @data, 0, $start-1;
349 @data = reverse(@data) if $strand == -1;
350 $data = join ' ', @data;
352 @data = split / /, $data;
358 *qual
= *quality
= \
&subqual
;
364 Usage : my $header = $db->header($id);
365 Function: Get the header line (ID and description fields) of the specified entry.
372 my ($self, $id) = @_;
373 $self->throw('Need to provide a sequence ID') if not defined $id;
374 my ($offset, $headerlen) = (&{$self->{unpackmeth
}}($self->{offsets
}{$id}))[0,4];
375 $offset -= $headerlen;
377 my $fh = $self->_fh($id) or return;
378 seek($fh, $offset, 0);
379 read($fh, $data, $headerlen);
380 # On Windows chomp remove '\n' but leaves '\r'
381 # when reading '\r\n' in binary mode,
382 # _strip_crnl removes both
383 $data = Bio
::DB
::IndexedBase
::_strip_crnl
($data);
384 substr($data, 0, 1) = '';
389 #-------------------------------------------------------------
390 # Tied hash overrides
394 return shift->subqual(@_);
398 #-------------------------------------------------------------
399 # Bio::Seq::PrimaryQual compatibility
401 # Usage is the same as in Bio::Seq::PrimaryQual
403 package Bio
::Seq
::PrimaryQual
::Qual
;
404 use overload
'""' => 'display_id';
406 use base
qw(Bio::Root::Root Bio::Seq::PrimaryQual);
409 my ($class, @args) = @_;
410 my $self = $class->SUPER::new
(@args);
411 my ($db, $id, $start, $stop) = $self->_rearrange(
412 [qw(DATABASE ID START STOP)],
416 $self->{stop
} = $stop || $db->length($id);
417 $self->{start
} = $start || ($self->{stop
} > 0 ?
1 : 0); # handle 0-length seqs
424 my $qual = $self->{db
}->qual($self->{id
}, $self->{start
}, $self->{stop
});
430 my ($self, $start, $stop) = @_;
431 return $self->trunc($start, $stop)->qual;
436 # Override Bio::Seq::QualI trunc() method. This way, we create an object
437 # that does not store the quality array in memory.
438 my ($self, $start, $stop) = @_;
440 "$stop is smaller than $stop. If you want to truncate and reverse ".
441 "complement, you must call trunc followed by revcom."
443 if ($self->{start
} <= $self->{stop
}) {
444 $start = $self->{start
}+$start-1;
445 $stop = $self->{start
}+$stop-1;
447 $start = $self->{start
}-($start-1);
448 $stop = $self->{start
}-($stop-1);
450 my $obj = $self->new( -database
=> $self->{db
},
467 return overload
::StrVal
($self);
471 # Override Bio::QualI revcom() with optimized method.
473 return $self->new(@
{$self}{'db', 'id', 'stop', 'start'});
477 # Get length from quality location, not the quality array (too expensive)
479 return $self->{start
} < $self->{stop
} ?
480 $self->{stop
} - $self->{start
} + 1 :
481 $self->{start
} - $self->{stop
} + 1 ;
487 my $header = $self->{'db'}->header($self->{id
});
488 # remove the id from the header
489 $header = (split(/\s+/, $header, 2))[2];
492 *desc
= \
&description
;