lib/Bio/DB/Fasta.pm

   1 #
   2 # BioPerl module for Bio::DB::Fasta
   3 #
   4 # You may distribute this module under the same terms as perl itself
   5 #
   6
   7
   8 =head1 NAME
   9
  10 Bio::DB::Fasta - Fast indexed access to fasta files
  11
  12 =head1 SYNOPSIS
  13
  14   use Bio::DB::Fasta;
  15
  16   # Create database from a directory of Fasta files
  17   my $db       = Bio::DB::Fasta->new('/path/to/fasta/files/');
  18   my @ids      = $db->get_all_primary_ids;
  19
  20   # Simple access
  21   my $seqstr   = $db->seq('CHROMOSOME_I', 4_000_000 => 4_100_000);
  22   my $revseq   = $db->seq('CHROMOSOME_I', 4_100_000 => 4_000_000);
  23   my $length   = $db->length('CHROMOSOME_I');
  24   my $header   = $db->header('CHROMOSOME_I');
  25   my $alphabet = $db->alphabet('CHROMOSOME_I');
  26
  27   # Access to sequence objects. See Bio::PrimarySeqI.
  28   my $seq     = $db->get_Seq_by_id('CHROMOSOME_I');
  29   my $seqstr  = $seq->seq;
  30   my $subseq  = $seq->subseq(4_000_000 => 4_100_000);
  31   my $trunc   = $seq->trunc(4_000_000 => 4_100_000);
  32   my $length  = $seq->length;
  33
  34   # Loop through sequence objects
  35   my $stream  = $db->get_PrimarySeq_stream;
  36   while (my $seq = $stream->next_seq) {
  37     # Bio::PrimarySeqI stuff
  38   }
  39
  40   # Filehandle access
  41   my $fh = Bio::DB::Fasta->newFh('/path/to/fasta/files/');
  42   while (my $seq = <$fh>) {
  43     # Bio::PrimarySeqI stuff
  44   }
  45
  46   # Tied hash access
  47   tie %sequences,'Bio::DB::Fasta','/path/to/fasta/files/';
  48   print $sequences{'CHROMOSOME_I:1,20000'};
  49
  50 =head1 DESCRIPTION
  51
  52 Bio::DB::Fasta provides indexed access to a single Fasta file, several files,
  53 or a directory of files. It provides persistent random access to each sequence
  54 entry (either as a Bio::PrimarySeqI-compliant object or a string), and to
  55 subsequences within each entry, allowing you to retrieve portions of very large
  56 sequences without bringing the entire sequence into memory. Bio::DB::Fasta is
  57 based on Bio::DB::IndexedBase. See this module's documentation for details.
  58
  59 The Fasta files may contain any combination of nucleotide and protein sequences;
  60 during indexing the module guesses the molecular type. Entries may have any line
  61 length up to 65,536 characters, and different line lengths are allowed in the
  62 same file.  However, within a sequence entry, all lines must be the same length
  63 except for the last. An error will be thrown if this is not the case.
  64
  65 The module uses /^E<gt>(\S+)/ to extract the primary ID of each sequence
  66 from the Fasta header. See -makeid in Bio::DB::IndexedBase to pass a callback
  67 routine to reversibly modify this primary ID, e.g. if you wish to extract a
  68 specific portion of the gi|gb|abc|xyz GenBank IDs.
  69
  70 =head1 DATABASE CREATION AND INDEXING
  71
  72 The object-oriented constructor is new(), the filehandle constructor is newFh()
  73 and the tied hash constructor is tie(). They all allow one to index a single Fasta
  74 file, several files, or a directory of files. See Bio::DB::IndexedBase.
  75
  76 =head1 SEE ALSO
  77
  78 L<Bio::DB::IndexedBase>
  79
  80 L<Bio::DB::Qual>
  81
  82 L<Bio::PrimarySeqI>
  83
  84 =head1 AUTHOR
  85
  86 Lincoln Stein E<lt>lstein@cshl.orgE<gt>.
  87
  88 Copyright (c) 2001 Cold Spring Harbor Laboratory.
  89
  90 This library is free software; you can redistribute it and/or modify
  91 it under the same terms as Perl itself.  See DISCLAIMER.txt for
  92 disclaimers of warranty.
  93
  94 =head1 APPENDIX
  95
  96 The rest of the documentation details each of the object
  97 methods. Internal methods are usually preceded with a _
  98
  99 For BioPerl-style access, the following methods are provided:
 100
 101 =head2 get_Seq_by_id
 102
 103  Title   : get_Seq_by_id, get_Seq_by_acc, get_Seq_by_primary_id
 104  Usage   : my $seq = $db->get_Seq_by_id($id);
 105  Function: Given an ID, fetch the corresponding sequence from the database.
 106  Returns : A Bio::PrimarySeq::Fasta object (Bio::PrimarySeqI compliant)
 107            Note that to save resource, Bio::PrimarySeq::Fasta sequence objects
 108            only load the sequence string into memory when requested using seq().
 109            See L<Bio::PrimarySeqI> for methods provided by the sequence objects
 110            returned from get_Seq_by_id() and get_PrimarySeq_stream().
 111  Args    : ID
 112
 113 =head2 get_PrimarySeq_stream
 114
 115  Title   : get_PrimarySeq_stream
 116  Usage   : my $stream = $db->get_PrimarySeq_stream();
 117  Function: Get a stream of Bio::PrimarySeq::Fasta objects. The stream supports a
 118            single method, next_seq(). Each call to next_seq() returns a new
 119            Bio::PrimarySeq::Fasta sequence object, until no more sequences remain.
 120  Returns : A Bio::DB::Indexed::Stream object
 121  Args    : None
 122
 123 =head1
 124
 125 For simple access, the following methods are provided:
 126
 127 =cut
 128
 129
 130 package Bio::DB::Fasta;
 131
 132 use strict;
 133 use IO::File;
 134 use File::Spec;
 135 use Bio::PrimarySeqI;
 136
 137 use base qw(Bio::DB::IndexedBase);
 138
 139 our $obj_class = 'Bio::PrimarySeq::Fasta';
 140 our $file_glob = '*.{fa,FA,fasta,FASTA,fast,FAST,dna,DNA,fna,FNA,faa,FAA,fsa,FSA}';
 141
 142
 143 =head2 new
 144
 145  Title   : new
 146  Usage   : my $db = Bio::DB::Fasta->new( $path, %options);
 147  Function: Initialize a new database object. When indexing a directory, files
 148            ending in .fa,fasta,fast,dna,fna,faa,fsa are indexed by default.
 149  Returns : A new Bio::DB::Fasta object.
 150  Args    : A single file, or path to dir, or arrayref of files
 151            Optional arguments: see Bio::DB::IndexedBase
 152
 153 =cut
 154
 155
 156 sub _calculate_offsets {
 157     # Bio::DB::IndexedBase calls this to calculate offsets
 158     my ($self, $fileno, $file, $offsets) = @_;
 159
 160     my $fh = IO::File->new($file) or $self->throw( "Could not open $file: $!");
 161     binmode $fh;
 162     warn "Indexing $file\n" if $self->{debug};
 163     my ($offset, @ids, $linelen, $alphabet, $headerlen, $count, $seq_lines,
 164         $last_line, %offsets);
 165     my ($l3_len, $l2_len, $l_len, $blank_lines) = (0, 0, 0, 0);
 166
 167     my $termination_length = $self->{termination_length};
 168     while (my $line = <$fh>) {
 169         # Account for crlf-terminated Windows files
 170         if (index($line, '>') == 0) {
 171             if ($line =~ /^>(\S+)/) {
 172                 print STDERR "Indexed $count sequences...\n"
 173                     if $self->{debug} && (++$count%1000) == 0;
 174
 175                 # please, do not enforce arbitrary line length requirements.
 176                 # It's good practice but not enforced.
 177
 178                 #$self->_check_linelength($linelen);
 179                 my $pos = tell($fh);
 180                 if (@ids) {
 181                     my $strlen  = $pos - $offset - length($line);
 182                     $strlen    -= $termination_length * $seq_lines;
 183                     my $ppos = &{$self->{packmeth}}($offset, $strlen, $strlen,
 184                         $linelen, $headerlen, $alphabet, $fileno);
 185                     $alphabet = Bio::DB::IndexedBase::NA;
 186                     for my $id (@ids) {
 187                         $offsets->{$id} = $ppos;
 188                     }
 189                 }
 190                 @ids = $self->_makeid($line);
 191                 ($offset, $headerlen, $linelen, $seq_lines) = ($pos, length $line, 0, 0);
 192                 ($l3_len, $l2_len, $l_len, $blank_lines) = (0, 0, 0, 0);
 193             } else {
 194                 # Catch bad header lines, bug 3172
 195                 $self->throw("FASTA header doesn't match '>(\\S+)': $line");
 196             }
 197         } elsif ($line !~ /\S/) {
 198             # Skip blank line
 199             $blank_lines++;
 200             next;
 201         } else {
 202             # Need to check every line :(
 203             $l3_len = $l2_len;
 204             $l2_len = $l_len;
 205             $l_len  = length $line;
 206             if (Bio::DB::IndexedBase::DIE_ON_MISSMATCHED_LINES) {
 207                 if ( ($l3_len > 0) && ($l2_len > 0) && ($l3_len != $l2_len) ) {
 208                     my $fap = substr($line, 0, 20)."..";
 209                     $self->throw("Each line of the fasta entry must be the same ".
 210                         "length except the last. Line above #$. '$fap' is $l2_len".
 211                         " != $l3_len chars.");
 212                 }
 213                 if ($blank_lines) {
 214                     # Blank lines not allowed in entry
 215                     $self->throw("Blank lines can only precede header lines, ".
 216                         "found preceding line #$.");
 217                 }
 218             }
 219             $linelen  ||= length $line;
 220             $alphabet ||= $self->_guess_alphabet($line);
 221             $seq_lines++;
 222         }
 223         $last_line = $line;
 224     }
 225
 226     # Process last entry
 227     $self->_check_linelength($linelen);
 228     my $pos = tell $fh;
 229     if (@ids) {
 230         my $strlen   = $pos - $offset;
 231         if ($linelen == 0) { # yet another pesky empty chr_random.fa file
 232             $strlen = 0;
 233         } else {
 234             if ($last_line !~ /\s$/) {
 235                 $seq_lines--;
 236             }
 237             $strlen -= $termination_length * $seq_lines;
 238         }
 239         my $ppos = &{$self->{packmeth}}($offset, $strlen, $strlen, $linelen,
 240             $headerlen, $alphabet, $fileno);
 241         for my $id (@ids) {
 242             $offsets->{$id} = $ppos;
 243         }
 244     }
 245
 246     return \%offsets;
 247 }
 248
 249 =head2 seq
 250
 251  Title   : seq, sequence, subseq
 252  Usage   : # Entire sequence string
 253            my $seqstr    = $db->seq($id);
 254            # Subsequence
 255            my $subseqstr = $db->seq($id, $start, $stop, $strand);
 256            # or...
 257            my $subseqstr = $db->seq($compound_id);
 258  Function: Get a subseq of a sequence from the database. For your convenience,
 259            the sequence to extract can be specified with any of the following
 260            compound IDs:
 261               $db->seq("$id:$start,$stop")
 262               $db->seq("$id:$start..$stop")
 263               $db->seq("$id:$start-$stop")
 264               $db->seq("$id:$start,$stop/$strand")
 265               $db->seq("$id:$start..$stop/$strand")
 266               $db->seq("$id:$start-$stop/$strand")
 267               $db->seq("$id/$strand")
 268            In the case of DNA or RNA sequence, if $stop is less than $start,
 269            then the reverse complement of the sequence is returned. Avoid using
 270            it if possible since this goes against Bio::Seq conventions.
 271  Returns : A string
 272  Args    : ID of sequence to retrieve
 273              or
 274            Compound ID of subsequence to fetch
 275              or
 276            ID, optional start (defaults to 1), optional end (defaults to length
 277            of sequence) and optional strand (defaults to 1).
 278
 279 =cut
 280
 281 sub subseq {
 282     my ($self, $id, $start, $stop, $strand) = @_;
 283     $self->throw('Need to provide a sequence ID') if not defined $id;
 284     ($id, $start, $stop, $strand) = $self->_parse_compound_id($id, $start, $stop, $strand);
 285
 286     my $data;
 287
 288     my $fh = $self->_fh($id) or return;
 289     my $filestart = $self->_calc_offset($id, $start);
 290     my $filestop  = $self->_calc_offset($id, $stop );
 291
 292     seek($fh, $filestart,0);
 293     read($fh, $data, $filestop-$filestart+1);
 294
 295     $data = Bio::DB::IndexedBase::_strip_crnl($data);
 296
 297     if ($strand == -1) {
 298         # Reverse-complement the sequence
 299         $data = Bio::PrimarySeqI::_revcom_from_string($self, $data, $self->alphabet($id));
 300     }
 301     return $data;
 302 }
 303
 304 *seq = *sequence = \&subseq;
 305
 306
 307 =head2 length
 308
 309  Title   : length
 310  Usage   : my $length = $qualdb->length($id);
 311  Function: Get the number of residues in the indicated sequence.
 312  Returns : Number
 313  Args    : ID of entry
 314
 315 =head2 header
 316
 317  Title   : header
 318  Usage   : my $header = $db->header($id);
 319  Function: Get the header line (ID and description fields) of the specified
 320            sequence.
 321  Returns : String
 322  Args    : ID of sequence
 323
 324 =cut
 325
 326 sub header {
 327     my ($self, $id) = @_;
 328     $self->throw('Need to provide a sequence ID') if not defined $id;
 329     my ($offset, $headerlen) = (&{$self->{unpackmeth}}($self->{offsets}{$id}))[0,4];
 330     $offset -= $headerlen;
 331     my $data;
 332     my $fh = $self->_fh($id) or return;
 333     seek($fh, $offset, 0);
 334     read($fh, $data, $headerlen);
 335     # On Windows chomp remove '\n' but leaves '\r'
 336     # when reading '\r\n' in binary mode
 337     $data = Bio::DB::IndexedBase::_strip_crnl($data);
 338     substr($data, 0, 1) = '';
 339     return $data;
 340 }
 341
 342
 343 =head2 alphabet
 344
 345  Title   : alphabet
 346  Usage   : my $alphabet = $db->alphabet($id);
 347  Function: Get the molecular type of the indicated sequence: dna, rna or protein
 348  Returns : String
 349  Args    : ID of sequence
 350
 351 =cut
 352
 353
 354 #-------------------------------------------------------------
 355 # Bio::PrimarySeqI compatibility
 356 #
 357 package Bio::PrimarySeq::Fasta;
 358 use overload '""' => 'display_id';
 359
 360 use base qw(Bio::Root::Root Bio::PrimarySeqI);
 361
 362 sub new {
 363     my ($class, @args) = @_;
 364     my $self = $class->SUPER::new(@args);
 365     my ($db, $id, $start, $stop) = $self->_rearrange(
 366                                  [qw(DATABASE ID START STOP)],
 367                                  @args);
 368     $self->{db}    = $db;
 369     $self->{id}    = $id;
 370     $self->{stop}  = $stop  || $db->length($id);
 371     $self->{start} = $start || ($self->{stop} > 0 ? 1 : 0); # handle 0-length seqs
 372     return $self;
 373 }
 374
 375 sub fetch_sequence {
 376     return shift->seq(@_);
 377 }
 378
 379 sub seq {
 380     my $self = shift;
 381     return $self->{db}->seq($self->{id}, $self->{start}, $self->{stop});
 382 }
 383
 384 sub subseq {
 385     my $self = shift;
 386     return $self->trunc(@_)->seq();
 387 }
 388
 389 sub trunc {
 390     # Override Bio::PrimarySeqI trunc() method. This way, we create an object
 391     # that does not store the sequence in memory.
 392     my ($self, $start, $stop) = @_;
 393     $self->throw("Stop cannot be smaller than start") if $stop < $start;
 394     if ($self->{start} <= $self->{stop}) {
 395         $start = $self->{start}+$start-1;
 396         $stop  = $self->{start}+$stop-1;
 397     } else {
 398         $start = $self->{start}-($start-1);
 399         $stop  = $self->{start}-($stop-1);
 400     }
 401     return $self->new( $self->{db}, $self->{id}, $start, $stop );
 402 }
 403
 404 sub is_circular {
 405     my $self = shift;
 406     return $self->{is_circular};
 407 }
 408
 409 sub display_id {
 410     my $self = shift;
 411     return $self->{id};
 412 }
 413
 414 sub accession_number {
 415     my $self = shift;
 416     return 'unknown';
 417 }
 418
 419 sub primary_id {
 420     # Following Bio::PrimarySeqI, since this sequence has no accession number,
 421     # its primary_id should be a stringified memory location.
 422     my $self = shift;
 423     return overload::StrVal($self);
 424 }
 425
 426 sub can_call_new {
 427     return 0;
 428 }
 429
 430 sub alphabet {
 431     my $self = shift;
 432     return $self->{db}->alphabet($self->{id});
 433 }
 434
 435 sub revcom {
 436     # Override Bio::PrimarySeqI revcom() with optimized method.
 437     my $self = shift;
 438     return $self->new(@{$self}{'db', 'id', 'stop', 'start'});
 439 }
 440
 441 sub length {
 442     # Get length from sequence location, not the sequence string (too expensive)
 443     my $self = shift;
 444     return $self->{start} < $self->{stop}   ?
 445         $self->{stop}  - $self->{start} + 1 :
 446         $self->{start} - $self->{stop}  + 1 ;
 447 }
 448
 449 sub description  {
 450     my $self = shift;
 451     my $header = $self->{'db'}->header($self->{id});
 452     # Remove the ID from the header
 453     return (split(/\s+/, $header, 2))[1];
 454 }
 455 *desc = \&description;
 456
 457 1;