Bio::DB::Universal: move into its own distribution
[bioperl-live.git] / Bio / DB / Qual.pm
blob729d7bc6818bb150a7e1d8304bf7d9b9ea429f70
2 # BioPerl module for Bio::DB::Qual
4 # You may distribute this module under the same terms as perl itself
7 =head1 NAME
9 Bio::DB::Qual - Fast indexed access to quality files
11 =head1 SYNOPSIS
13 use Bio::DB::Qual;
15 # create database from directory of qual files
16 my $db = Bio::DB::Qual->new('/path/to/qual/files/');
17 my @ids = $db->get_all_primary_ids;
19 # Simple access
20 my @qualarr = @{$db->qual('CHROMOSOME_I',4_000_000 => 4_100_000)};
21 my @revqual = @{$db->qual('CHROMOSOME_I',4_100_000 => 4_000_000)};
22 my $length = $db->length('CHROMOSOME_I');
23 my $header = $db->header('CHROMOSOME_I');
25 # Access to sequence objects. See Bio::PrimarySeqI.
26 my $obj = $db->get_Qual_by_id('CHROMOSOME_I');
27 my @qual = @{$obj->qual};
28 my @subqual = @{$obj->subqual(4_000_000 => 4_100_000)};
29 my $length = $obj->length;
31 # Loop through sequence objects
32 my $stream = $db->get_PrimarySeq_stream;
33 while (my $qual = $stream->next_seq) {
34 # Bio::Seq::PrimaryQual operations
37 # Filehandle access
38 my $fh = Bio::DB::Qual->newFh('/path/to/qual/files/');
39 while (my $qual = <$fh>) {
40 # Bio::Seq::PrimaryQual operations
43 # Tied hash access
44 tie %qualities,'Bio::DB::Qual','/path/to/qual/files/';
45 print $qualities{'CHROMOSOME_I:1,20000'};
47 =head1 DESCRIPTION
49 Bio::DB::Qual provides indexed access to a single Fasta file, several files,
50 or a directory of files. It provides random access to each quality score entry
51 without having to read the file from the beginning. Access to subqualities
52 (portions of a quality score) is provided, although contrary to Bio::DB::Fasta,
53 the full quality score has to be brought in memory. Bio::DB::Qual is based on
54 Bio::DB::IndexedBase. See this module's documentation for details.
56 The qual files should contain decimal quality scores. Entries may have any line
57 length up to 65,536 characters, and different line lengths are allowed in the
58 same file. However, within a quality score entry, all lines must be the same
59 length except for the last. An error will be thrown if this is not the case.
61 The module uses /^E<gt>(\S+)/ to extract the primary ID of each quality score
62 from the qual header. See -makeid in Bio::DB::IndexedBase to pass a callback
63 routine to reversibly modify this primary ID, e.g. if you wish to extract a
64 specific portion of the gi|gb|abc|xyz GenBank IDs.
66 =head1 DATABASE CREATION AND INDEXING
68 The object-oriented constructor is new(), the filehandle constructor is newFh()
69 and the tied hash constructor is tie(). They all allow one to index a single Fasta
70 file, several files, or a directory of files. See Bio::DB::IndexedBase.
72 =head1 SEE ALSO
74 L<Bio::DB::IndexedBase>
76 L<Bio::DB::Fasta>
78 L<Bio::Seq::PrimaryQual>
80 =head1 LIMITATIONS
82 When a quality score is deleted from one of the qual files, this deletion is not
83 detected by the module and removed from the index. As a result, a "ghost" entry
84 will remain in the index and will return garbage results if accessed. Currently,
85 the only way to accommodate deletions is to rebuild the entire index, either by
86 deleting it manually, or by passing -reindex=E<gt>1 to new() when
87 initializing the module.
89 All quality score lines for a given quality score must have the same length
90 except for the last (not sure why there is this limitation). This is not
91 problematic for sequences but could be annoying for quality scores. A workaround
92 is to make sure that your quality scores fit on no more than 2 lines. Another
93 solution could be to padd them with blank spaces so that each line has the same
94 number of characters (maybe this padding should be implemented in
95 Bio::SeqIO::qual?).
97 =head1 AUTHOR
99 Florent E Angly E<lt>florent . angly @ gmail-dot-comE<gt>.
101 Module largely based on and adapted from Bio::DB::Fasta by Lincoln Stein.
103 Copyright (c) 2007 Florent E Angly.
105 This library is free software; you can redistribute it and/or modify
106 it under the same terms as Perl itself.
108 =head1 APPENDIX
110 The rest of the documentation details each of the object
111 methods. Internal methods are usually preceded with a _
113 For BioPerl-style access, the following methods are provided:
115 =head2 get_Seq_by_id
117 Title : get_Seq_by_id, get_Seq_by_acc, get_Seq_by_version, get_Seq_by_primary_id,
118 get_Qual_by_id, get_qual_by_acc, get_qual_by_version, get_qual_by_primary_id,
119 Usage : my $seq = $db->get_Seq_by_id($id);
120 Function: Given an ID, fetch the corresponding sequence from the database.
121 Returns : A Bio::PrimarySeq::Fasta object (Bio::PrimarySeqI compliant)
122 Note that to save resource, Bio::PrimarySeq::Fasta sequence objects
123 only load the sequence string into memory when requested using seq().
124 See L<Bio::PrimarySeqI> for methods provided by the sequence objects
125 returned from get_Seq_by_id() and get_PrimarySeq_stream().
126 Args : ID
128 =head2 get_PrimarySeq_stream
130 Title : get_Seq_stream, get_PrimarySeq_stream
131 Usage : my $stream = $db->get_Seq_stream();
132 Function: Get a stream of Bio::PrimarySeq::Fasta objects. The stream supports a
133 single method, next_seq(). Each call to next_seq() returns a new
134 Bio::PrimarySeq::Fasta sequence object, until no more sequences remain.
135 Returns : A Bio::DB::Indexed::Stream object
136 Args : None
138 =head1
140 For simple access, the following methods are provided:
142 =cut
145 package Bio::DB::Qual;
147 use strict;
148 use IO::File;
149 use File::Spec;
151 use base qw(Bio::DB::IndexedBase);
153 our $obj_class = 'Bio::Seq::PrimaryQual::Qual';
154 our $file_glob = '*.{qual,QUAL,qa,QA}';
157 =head2 new
159 Title : new
160 Usage : my $db = Bio::DB::Qual->new( $path, %options);
161 Function: Initialize a new database object. When indexing a directory, files
162 ending in .qual,qa are indexed by default.
163 Returns : A new Bio::DB::Qual object
164 Args : A single file, or path to dir, or arrayref of files
165 Optional arguments: see Bio::DB::IndexedBase
167 =cut
170 sub _calculate_offsets {
171 # Bio::DB::IndexedBase calls this to calculate offsets
172 my ($self, $fileno, $file, $offsets) = @_;
174 my $fh = IO::File->new($file) or $self->throw("Could not open $file: $!");
175 binmode $fh;
176 warn "Indexing $file\n" if $self->{debug};
177 my ($offset, @ids, $linelen, $headerlen, $count, $qual_lines, $last_line,
178 $numres, %offsets);
179 my ($l3_len, $l2_len, $l_len, $blank_lines) = (0, 0, 0, 0);
181 my $termination_length = $self->{termination_length};
182 while (my $line = <$fh>) {
183 # Account for crlf-terminated Windows files
184 if (index($line, '>') == 0) {
185 if ($line =~ /^>(\S+)/) {
186 print STDERR "Indexed $count quality scores...\n"
187 if $self->{debug} && (++$count%1000) == 0;
188 $self->_check_linelength($linelen);
189 my $pos = tell($fh);
190 if (@ids) {
191 my $strlen = $pos - $offset - length($line);
192 $strlen -= $termination_length * $qual_lines;
193 my $ppos = &{$self->{packmeth}}($offset, $strlen, $numres,
194 $linelen, $headerlen, Bio::DB::IndexedBase::NA, $fileno);
195 for my $id (@ids) {
196 $offsets->{$id} = $ppos;
198 $numres = 0;
200 @ids = $self->_makeid($line);
201 ($offset, $headerlen, $linelen, $qual_lines) = ($pos, length $line, 0, 0);
202 ($l3_len, $l2_len, $l_len, $blank_lines) = (0, 0, 0, 0);
203 } else {
204 # Catch bad header lines, bug 3172
205 $self->throw("FASTA header doesn't match '>(\\S+)': $line");
207 } elsif ($line !~ /\S/) {
208 # Skip blank line
209 $blank_lines++;
210 next;
211 } else {
212 # Need to check every line :(
213 $l3_len = $l2_len;
214 $l2_len = $l_len;
215 $l_len = length $line;
216 if (Bio::DB::IndexedBase::DIE_ON_MISSMATCHED_LINES) {
217 if ( ($l3_len > 0) && ($l2_len > 0) && ($l3_len != $l2_len) ) {
218 my $fap = substr($line, 0, 20)."..";
219 $self->throw("Each line of the qual entry must be the same ".
220 "length except the last. Line above #$. '$fap' is $l2_len".
221 " != $l3_len chars.");
223 if ($blank_lines) {
224 # Blank lines not allowed in entry
225 $self->throw("Blank lines can only precede header lines, ".
226 "found preceding line #$.");
229 $linelen ||= length $line;
230 $qual_lines++;
231 $numres += scalar(split /\s+/, $line);
233 $last_line = $line;
236 # Process last entry
237 $self->_check_linelength($linelen);
238 my $pos = tell($fh);
239 if (@ids) {
240 my $strlen = $pos - $offset;
241 if ($linelen == 0) {
242 $strlen = 0;
243 } else {
244 if ($last_line !~ /\s$/) {
245 $qual_lines--;
247 $strlen -= $termination_length * $qual_lines;
249 my $ppos = &{$self->{packmeth}}($offset, $strlen, $numres, $linelen,
250 $headerlen, Bio::DB::IndexedBase::NA, $fileno);
251 for my $id (@ids) {
252 $offsets->{$id} = $ppos;
256 return \%offsets;
260 # for backward compatibility
261 sub get_PrimaryQual_stream {
262 my $self = shift;
263 return $self->get_PrimarySeq_stream;
267 # for backward compatibility
268 sub get_Qual_by_id {
269 my ($self, $id) = @_;
270 return $self->get_Seq_by_id($id);
273 *get_qual_by_version = *get_qual_by_primary_id = *get_qual_by_acc = \&get_Qual_by_id;
276 =head2 qual
278 Title : qual, quality, subqual
279 Usage : # All quality scores
280 my @qualarr = @{$qualdb->subqual($id)};
281 # Subset of the quality scores
282 my @subqualarr = @{$qualdb->subqual($id, $start, $stop, $strand)};
283 # or...
284 my @subqualarr = @{$qualdb->subqual($compound_id)};
285 Function: Get a subqual of an entry in the database. For your convenience,
286 the sequence to extract can be specified with any of the following
287 compound IDs:
288 $db->qual("$id:$start,$stop")
289 $db->qual("$id:$start..$stop")
290 $db->qual("$id:$start-$stop")
291 $db->qual("$id:$start,$stop/$strand")
292 $db->qual("$id:$start..$stop/$strand")
293 $db->qual("$id:$start-$stop/$strand")
294 $db->qual("$id/$strand")
295 If $stop is less than $start, then the reverse complement of the
296 sequence is returned. Avoid using it if possible since this goes
297 against Bio::Seq conventions.
298 Returns : Reference to an array of quality scores
299 Args : Compound ID of entry to retrieve
301 ID, optional start (defaults to 1), optional end (defaults to the
302 number of quality scores for this sequence), and strand (defaults to
305 =cut
307 sub subqual {
308 my ($self, $id, $start, $stop, $strand) = @_;
310 # Quality values in a quality score can have 1 or 2 digits and are separated
311 # by one (or several?) spaces. Thus contrary to Bio::DB::Fasta, here there
312 # is no easy way match the position of a quality value to its position in
313 # the quality string.
314 # As a consequence, if a subqual of the quality is requested, we still need
315 # to grab the full quality string first - performance penalty for big
316 # quality scores :(
317 # I think there is no way around starting at the begining of the quality
318 # score but maybe there is a resource-efficient way of starting at the
319 # begining of the quality score and stopping when the the position of the
320 # last quality value requested is reached??
322 $self->throw('Need to provide a sequence ID') if not defined $id;
323 ($id, $start, $stop, $strand) = $self->_parse_compound_id($id, $start, $stop, $strand);
325 # Position in quality string
326 my $string_start = 1;
327 my $string_stop = $self->strlen($id);
329 # Fetch full quality string
330 my $fh = $self->_fh($id) or return;
331 my $filestart = $self->_calc_offset($id, $string_start);
332 my $filestop = $self->_calc_offset($id, $string_stop );
333 seek($fh, $filestart,0);
334 my $data;
335 read($fh, $data, $filestop-$filestart+1);
337 # Process quality score
338 Bio::DB::IndexedBase::_strip_crnl($data);
339 my $subqual = 0;
340 $subqual = 1 if ( $start || $stop );
341 my @data;
342 if ( $subqual || ($strand == -1) ) {
343 @data = split / /, $data, $stop+1;
344 my $length = scalar(@data);
345 $start = 1 if $start < 1;
346 $stop = $length if $stop > $length;
347 pop @data if ($stop != $length);
348 splice @data, 0, $start-1;
349 @data = reverse(@data) if $strand == -1;
350 $data = join ' ', @data;
351 } else {
352 @data = split / /, $data;
355 return \@data;
358 *qual = *quality = \&subqual;
361 =head2 header
363 Title : header
364 Usage : my $header = $db->header($id);
365 Function: Get the header line (ID and description fields) of the specified entry.
366 Returns : String
367 Args : ID of entry
369 =cut
371 sub header {
372 my ($self, $id) = @_;
373 $self->throw('Need to provide a sequence ID') if not defined $id;
374 my ($offset, $headerlen) = (&{$self->{unpackmeth}}($self->{offsets}{$id}))[0,4];
375 $offset -= $headerlen;
376 my $data;
377 my $fh = $self->_fh($id) or return;
378 seek($fh, $offset, 0);
379 read($fh, $data, $headerlen);
380 # On Windows chomp remove '\n' but leaves '\r'
381 # when reading '\r\n' in binary mode,
382 # _strip_crnl removes both
383 $data = Bio::DB::IndexedBase::_strip_crnl($data);
384 substr($data, 0, 1) = '';
385 return $data;
389 #-------------------------------------------------------------
390 # Tied hash overrides
393 sub FETCH {
394 return shift->subqual(@_);
398 #-------------------------------------------------------------
399 # Bio::Seq::PrimaryQual compatibility
401 # Usage is the same as in Bio::Seq::PrimaryQual
403 package Bio::Seq::PrimaryQual::Qual;
404 use overload '""' => 'display_id';
406 use base qw(Bio::Root::Root Bio::Seq::PrimaryQual);
408 sub new {
409 my ($class, @args) = @_;
410 my $self = $class->SUPER::new(@args);
411 my ($db, $id, $start, $stop) = $self->_rearrange(
412 [qw(DATABASE ID START STOP)],
413 @args);
414 $self->{db} = $db;
415 $self->{id} = $id;
416 $self->{stop} = $stop || $db->length($id);
417 $self->{start} = $start || ($self->{stop} > 0 ? 1 : 0); # handle 0-length seqs
418 return $self;
422 sub qual {
423 my $self = shift;
424 my $qual = $self->{db}->qual($self->{id}, $self->{start}, $self->{stop});
425 return $qual;
429 sub subqual {
430 my ($self, $start, $stop) = @_;
431 return $self->trunc($start, $stop)->qual;
435 sub trunc {
436 # Override Bio::Seq::QualI trunc() method. This way, we create an object
437 # that does not store the quality array in memory.
438 my ($self, $start, $stop) = @_;
439 $self->throw(
440 "$stop is smaller than $stop. If you want to truncate and reverse ".
441 "complement, you must call trunc followed by revcom."
442 ) if $start > $stop;
443 if ($self->{start} <= $self->{stop}) {
444 $start = $self->{start}+$start-1;
445 $stop = $self->{start}+$stop-1;
446 } else {
447 $start = $self->{start}-($start-1);
448 $stop = $self->{start}-($stop-1);
450 my $obj = $self->new( -database => $self->{db},
451 -id => $self->{id},
452 -start => $start,
453 -stop => $stop
455 return $obj;
459 sub display_id {
460 my $self = shift;
461 return $self->{id};
465 sub primary_id {
466 my $self = shift;
467 return overload::StrVal($self);
470 sub revcom {
471 # Override Bio::QualI revcom() with optimized method.
472 my $self = shift;
473 return $self->new(@{$self}{'db', 'id', 'stop', 'start'});
476 sub length {
477 # Get length from quality location, not the quality array (too expensive)
478 my $self = shift;
479 return $self->{start} < $self->{stop} ?
480 $self->{stop} - $self->{start} + 1 :
481 $self->{start} - $self->{stop} + 1 ;
485 sub description {
486 my $self = shift;
487 my $header = $self->{'db'}->header($self->{id});
488 # remove the id from the header
489 $header = (split(/\s+/, $header, 2))[2];
490 return $header;
492 *desc = \&description;