lib/Bio/SeqIO/gbdriver.pm

   1 #
   2 # BioPerl module for Bio::SeqIO::gbdriver
   3 #
   4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   5 #
   6 # Cared for by Bioperl project bioperl-l(at)bioperl.org
   7 #
   8 # Copyright Chris Fields and contributors see AUTHORS section
   9 #
  10 # You may distribute this module under the same terms as perl itself
  11
  12 # POD documentation - main docs before the code
  13
  14 =head1 NAME
  15
  16 Bio::SeqIO::gbdriver - GenBank handler-based push parser
  17
  18 =head1 SYNOPSIS
  19
  20   #It is probably best not to use this object directly, but
  21   #rather go through the SeqIO handler:
  22
  23   $stream = Bio::SeqIO->new(-file => $filename,
  24                             -format => 'gbdriver');
  25
  26   while ( my $seq = $stream->next_seq() ) {
  27       # do something with $seq
  28   }
  29
  30 =head1 DESCRIPTION
  31
  32 This object can transform Bio::Seq objects to and from GenBank flat file
  33 databases. The key difference between this parser and the tried-and-true
  34 Bio::SeqIO::genbank parser is this version separates the parsing and data
  35 manipulation into a 'driver' method (next_seq) and separate object handlers
  36 which deal with the data passed to it.
  37
  38 =head2 The Driver
  39
  40 The main purpose of the driver routine, in this case next_seq(), is to carve out
  41 the data into meaningful chunks which are passed along to relevant handlers (see
  42 below).
  43
  44 Each chunk of data has a NAME tag attached to it, similar to that for XML
  45 parsing. This designates the type of data passed (annotation type or seqfeature)
  46 and the handler to be called for processing the data.
  47
  48 For GenBank annotations, the data is divided up and passed along to handlers
  49 according to whether the data is tagged with a field name (i.e. LOCUS) and
  50 whether the field name represents 'primary' annotation (in this case, is present
  51 at the beginning of the line, such as REFERENCE). If the field is primary, it is
  52 assigned to the NAME tag. Field names which aren't primary (have at least 2
  53 spaces before the name, like ORGANISM) are appended to the preceding primary
  54 field name as additional tags.
  55
  56 For feature table data each new feature name signals the beginning of a new
  57 chunk of data. 'FEATURES' is attached to NAME, the feature key ('CDS', 'gene',
  58 etc) is attached as the FEATURE_KEY, and the location is assigned to its own tag
  59 name (LOCATION). Feature qualifiers are added as additional keys, with multiple
  60 keys included in an array.
  61
  62 Once a particular event occurs (new primary tag, sequence, end of record), the
  63 data is passed along to be processed by a handler or (if no handler is defined)
  64 tossed away.
  65
  66 Internally, the hash ref for a representative annotation (here a REFERENCE)
  67 looks like this:
  68
  69   $VAR1 = {
  70             'JOURNAL' => 'Unpublished (2003)',
  71             'TITLE' => 'The DNA sequence of Homo sapiens',
  72             'NAME' => 'REFERENCE',
  73             'REFERENCE' => '1  (bases 1 to 10001)',
  74             'AUTHORS' => 'International Human Genome Sequencing Consortium.'
  75           };
  76
  77 and a SeqFeature as this:
  78
  79   $VAR1 = {
  80             'db_xref' => [
  81                            'GeneID:127086',
  82                            'InterimID:127086'
  83                          ],
  84             'LOCATION' => 'complement(3024..6641)',
  85             'NAME' => 'FEATURES',
  86             'FEATURE_KEY' => 'gene',
  87             'gene' => 'LOC127086',
  88             'note' => 'Derived by automated computational analysis using
  89                        gene prediction method: GNOMON.'
  90           };
  91
  92 Note that any driver implementation would suffice as long as it fulfilled the
  93 requirements above.
  94
  95 =head1 FEEDBACK
  96
  97 =head2 Mailing Lists
  98
  99 User feedback is an integral part of the evolution of this and other
 100 Bioperl modules. Send your comments and suggestions preferably to one
 101 of the Bioperl mailing lists.  Your participation is much appreciated.
 102
 103   bioperl-l@bioperl.org                  - General discussion
 104   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
 105
 106 =head2 Support
 107
 108 Please direct usage questions or support issues to the mailing list:
 109
 110 I<bioperl-l@bioperl.org>
 111
 112 rather than to the module maintainer directly. Many experienced and
 113 responsive experts will be able to look at the problem and quickly
 114 address it. Please include a thorough description of the problem
 115 with code and data examples if at all possible.
 116
 117 =head2 Reporting Bugs
 118
 119 Report bugs to the Bioperl bug tracking system to help us keep track
 120 of the bugs and their resolution. Bug reports can be submitted via the web:
 121
 122   https://github.com/bioperl/bioperl-live/issues
 123
 124 =head1 AUTHOR - Bioperl Project
 125
 126 bioperl-l at bioperl.org
 127
 128 Original author Elia Stupka, elia -at- tigem.it
 129
 130 =head1 CONTRIBUTORS
 131
 132 Ewan Birney birney at ebi.ac.uk
 133 Jason Stajich jason at bioperl.org
 134 Chris Mungall cjm at fruitfly.bdgp.berkeley.edu
 135 Lincoln Stein lstein at cshl.org
 136 Heikki Lehvaslaiho, heikki at ebi.ac.uk
 137 Hilmar Lapp, hlapp at gmx.net
 138 Donald G. Jackson, donald.jackson at bms.com
 139 James Wasmuth, james.wasmuth at ed.ac.uk
 140 Brian Osborne, bosborne at alum.mit.edu
 141
 142 =head1 APPENDIX
 143
 144 The rest of the documentation details each of the object
 145 methods. Internal methods are usually preceded with a _
 146
 147 =cut
 148
 149 # POD for the public methods is interleaved with the code below
 150
 151 # Let the code begin...
 152
 153 package Bio::SeqIO::gbdriver;
 154
 155 use strict;
 156 use warnings;
 157 use Data::Dumper;
 158 use Bio::SeqIO::Handler::GenericRichSeqHandler;
 159 use Bio::Seq::SeqFactory;
 160
 161 use base qw(Bio::SeqIO);
 162
 163 # map all annotation keys to consistent INSDC-based tags for all handlers
 164
 165 my %FTQUAL_NO_QUOTE = map {$_ => 1} qw(
 166     anticodon           citation
 167     codon               codon_start
 168     compare             cons_splice
 169     direction           estimated_length
 170     evidence            label
 171     mod_base            number
 172     rpt_type            rpt_unit
 173     rpt_unit_range      tag_peptide
 174     transl_except       transl_table
 175     usedin
 176     );
 177
 178
 179 # 1) change this to indicate what should be secondary, not primary, which allows
 180 # unknown or new stuff to be passed to handler automatically; current behavior
 181 # appends unknowns to previous data, which isn't good since it's subtly passing
 182 # by important data
 183 # 2) add mapping details about how to separate data using specific delimiters
 184
 185
 186 # Features are the only ones postprocessed for now
 187 # Uncomment relevant code in next_seq and add keys as needed...
 188 my %POSTPROCESS_DATA = map {$_ => 1} qw (FEATURES);
 189
 190 sub _initialize {
 191     my($self,@args) = @_;
 192
 193     $self->SUPER::_initialize(@args);
 194     my $handler = $self->_rearrange([qw(HANDLER)],@args);
 195     # hash for functions for decoding keys.
 196     $handler ? $self->seqhandler($handler) :
 197     $self->seqhandler(Bio::SeqIO::Handler::GenericRichSeqHandler->new(
 198                     -format => 'genbank',
 199                     -verbose => $self->verbose,
 200                     -builder => $self->sequence_builder
 201                     ));
 202     if( ! defined $self->sequence_factory ) {
 203         $self->sequence_factory(Bio::Seq::SeqFactory->new
 204                 (-verbose => $self->verbose(),
 205                  -type => 'Bio::Seq::RichSeq'));
 206     }
 207 }
 208
 209 =head2 next_seq
 210
 211  Title   : next_seq
 212  Usage   : $seq = $stream->next_seq()
 213  Function: returns the next sequence in the stream
 214  Returns : Bio::Seq object
 215  Args    :
 216
 217 =cut
 218
 219 # at this point there is minimal sequence validation,
 220 # but the parser seems to hold up nicely so far...
 221
 222 sub next_seq {
 223     my $self = shift;
 224     local($/) = "\n";
 225     my ($ann, $data, $annkey);
 226     my $endrec = my $seenfeat = 0;
 227     my $seqdata;
 228     my $seenlocus;
 229     my $hobj = $self->seqhandler;
 230     my $handlers = $self->seqhandler->handler_methods;
 231     #$self->debug(Dumper($handlers));
 232     PARSER:
 233     while (defined(my $line = $self->_readline)) {
 234         next if $line =~ m{^\s*$};
 235
 236         # have to catch this at the top of the loop, then exit SEQ loop on //
 237         # The reason? The regex match for ann/feat keys also matches some lines
 238         # in the sequence; no easy way around it since some feature keys may
 239         # start with a number as well
 240         if ($ann && $ann eq 'ORIGIN') {
 241             SEQ:
 242             while (defined($line)) {
 243                 last SEQ if index($line,'//') == 0;
 244                 $seqdata->{DATA} .= uc $line;
 245                 $line = $self->_readline;
 246             }
 247             $seqdata->{DATA} =~ tr{0-9 \n}{}d;
 248         }
 249         $endrec = 1 if (index($line,'//')==0);
 250
 251         if ($line =~ m{^(\s{0,5})(\w+)\s+(.*)$}ox || $endrec) {
 252             ($ann, $data) = ($2, $3);
 253             unless ($seenlocus) {
 254                 $self->throw("No LOCUS found.  Not GenBank in my book!")
 255                     if ($ann ne 'LOCUS');
 256                 $seenlocus = 1;
 257             }
 258             # use the spacer to determine the annotation type
 259             my $len = length($1 || '');
 260
 261             $annkey  = ($len == 0 || $len > 4)   ? 'DATA'  : $ann;
 262
 263             # Push off the previously cached data to the handler
 264             # whenever a new primary annotation or seqfeature is found
 265             # Note use of $endrec for catching end of record
 266             if (($annkey eq 'DATA') && $seqdata) {
 267                 chomp $seqdata->{DATA};
 268                 # postprocessing for some data
 269                 if ($seqdata->{NAME} eq 'FEATURES') {
 270                     $self->_process_features($seqdata)
 271                 }
 272
 273                 # using handlers directly, slightly faster
 274                 #my $method = (exists $handlers->{ $seqdata->{NAME} }) ?
 275                 #        ($handlers->{$seqdata->{NAME}}) :
 276                 #    (exists $handlers->{'_DEFAULT_'}) ?
 277                 #        ($handlers->{'_DEFAULT_'}) :
 278                 #    undef;
 279                 #($method) ? ($hobj->$method($seqdata) ) :
 280                 #        $self->debug("No handler defined for ",$seqdata->{NAME},"\n");
 281
 282                 # using handler methods in the Handler object, more centralized
 283                 #$self->debug(Dumper($seqdata));
 284                 $hobj->data_handler($seqdata);
 285
 286                 # bail here on //
 287                 last PARSER if $endrec;
 288                 # reset for next round
 289                 $seqdata = undef;
 290             }
 291
 292             $seqdata->{NAME} =  ($len == 0) ? $ann :   # primary ann
 293                                 ($len > 4 ) ? 'FEATURES': # sf feature key
 294                                 $seqdata->{NAME};      # all rest are sec. ann
 295             if ($seqdata->{NAME} eq 'FEATURES') {
 296                 $seqdata->{FEATURE_KEY} = $ann;
 297             }
 298             # throw back to top if seq is found to avoid regex
 299             next PARSER if $ann eq 'ORIGIN';
 300
 301         } else {
 302             ($data = $line) =~ s{^\s+}{};
 303             chomp $data;
 304         }
 305         my $delim = ($seqdata && $seqdata->{NAME} eq 'FEATURES') ? "\n" : ' ';
 306         $seqdata->{$annkey} .= ($seqdata->{$annkey}) ? $delim.$data : $data;
 307     }
 308     return $hobj->build_sequence;
 309 }
 310
 311 sub next_chunk {
 312     my $self = shift;
 313     local($/) = "\n";
 314     my ($ann, $data, $annkey);
 315     my $endrec = my $seenfeat = 0;
 316     my $seqdata;
 317     my $seenlocus;
 318     my $hobj = $self->seqhandler;
 319     PARSER:
 320     while (defined(my $line = $self->_readline)) {
 321         next if $line =~ m{^\s*$};
 322         # have to catch this at the top of the loop, then exit SEQ loop on //
 323         # The reason? The regex match for ann/feat keys also matches some lines
 324         # in the sequence; no easy way around it since some feature keys may
 325         # start with a number as well
 326         if ($ann && $ann eq 'ORIGIN') {
 327             SEQ:
 328             while (defined($line)) {
 329                 last SEQ if index($line,'//') == 0;
 330                 $seqdata->{DATA} .= uc $line;
 331                 $line = $self->_readline;
 332             }
 333             $seqdata->{DATA} =~ tr{0-9 \n}{}d;
 334         }
 335         $endrec = 1 if (index($line,'//')==0);
 336
 337         if ($line =~ m{^(\s{0,5})(\w+)\s+(.*)$}ox || $endrec) {
 338             ($ann, $data) = ($2, $3);
 339             unless ($seenlocus) {
 340                 $self->throw("No LOCUS found.  Not GenBank in my book!")
 341                     if ($ann ne 'LOCUS');
 342                 $seenlocus = 1;
 343             }
 344             # use the spacer to determine the annotation type
 345             my $len = length($1 || '');
 346
 347             $annkey  = ($len == 0 || $len > 4)   ? 'DATA'  : $ann;
 348
 349             # Push off the previously cached data to the handler
 350             # whenever a new primary annotation or seqfeature is found
 351             # Note use of $endrec for catching end of record
 352             if (($annkey eq 'DATA') && $seqdata) {
 353                 chomp $seqdata->{DATA};
 354                 # postprocessing for some data
 355                 if ($seqdata->{NAME} eq 'FEATURES') {
 356                     $self->_process_features($seqdata)
 357                 }
 358                 # using handler methods in the Handler object, more centralized
 359                 $hobj->data_handler($seqdata);
 360                 # bail here on //
 361                 last PARSER if $endrec;
 362                 # reset for next round
 363                 $seqdata = undef;
 364             }
 365
 366             $seqdata->{NAME} =  ($len == 0) ? $ann :   # primary ann
 367                                 ($len > 4 ) ? 'FEATURES': # sf feature key
 368                                 $seqdata->{NAME};      # all rest are sec. ann
 369             if ($seqdata->{NAME} eq 'FEATURES') {
 370                 $seqdata->{FEATURE_KEY} = $ann;
 371             }
 372             # throw back to top if seq is found to avoid regex
 373             next PARSER if $ann eq 'ORIGIN';
 374         } else {
 375             ($data = $line) =~ s{^\s+}{};
 376             chomp $data;
 377         }
 378         my $delim = ($seqdata && $seqdata->{NAME} eq 'FEATURES') ? "\n" : ' ';
 379         $seqdata->{$annkey} .= ($seqdata->{$annkey}) ? $delim.$data : $data;
 380     }
 381 }
 382
 383 =head2 write_seq
 384
 385  Title   : write_seq
 386  Usage   : $stream->write_seq($seq)
 387  Function: not implemented for this driver; always calls throw()
 388  Returns : nothing useful (use Bio::SeqIO::genbank for output)
 389  Args    : array of 1 to n Bio::SeqI objects
 390
 391 =cut
 392
 393 sub write_seq {
 394     shift->throw("Use Bio::SeqIO::genbank for output");
 395     # maybe make a Writer class as well????
 396 }
 397
 398 =head2 seqhandler
 399
 400  Title   : seqhandler
 401  Usage   : $stream->seqhandler($handler)
 402  Function: Get/Set the Bio::HandlerBaseI object
 403  Returns : Bio::HandlerBaseI
 404  Args    : Bio::HandlerBaseI
 405
 406 =cut
 407
 408 sub seqhandler {
 409     my ($self, $handler) = @_;
 410     if ($handler) {
 411         $self->throw("Not a Bio::HandlerBaseI") unless
 412         ref($handler) && $handler->isa("Bio::HandlerBaseI");
 413         $self->{'_seqhandler'} = $handler;
 414     }
 415     return $self->{'_seqhandler'};
 416 }
 417
 418 #=head2 _process_features
 419 #
 420 # Title   : _process_features
 421 # Usage   : $self->_process_features($seqdata)
 422 # Function: Process feature data chunk into usable bits
 423 # Returns :
 424 # Args    : data chunk
 425 #
 426 #=cut
 427
 428 sub _process_features {
 429     my ($self, $seqdata) = @_;
 430     my @ftlines = split m{\n}, $seqdata->{DATA};
 431     delete $seqdata->{DATA};
 432     # don't deal with balancing quotes for now; just get rid of them...
 433     # Should we worry about checking whether these are balanced
 434     # for round-tripping tests?
 435     map { s{"}{}g } @ftlines;
 436     # all sfs start with the location...
 437     my $qual = 'LOCATION';
 438     my $ct = 0;
 439     for my $qualdata (@ftlines) {
 440         if ($qualdata =~ m{^/([^=]+)=?(.+)?}) {
 441             ($qual, $qualdata) = ($1, $2);
 442             $qualdata ||= ''; # for those qualifiers that have no data, like 'pseudo'
 443             $ct = (exists $seqdata->{$qual}) ?
 444                   ((ref($seqdata->{$qual}))  ? scalar(@{ $seqdata->{$qual} }) : 1)
 445                   : 0 ;
 446         }
 447         my $delim = ($qual eq 'translation' || exists $FTQUAL_NO_QUOTE{$qual}) ?
 448             '' : ' ';
 449         # if more than one, turn into an array ref and append
 450         if ($ct == 0) {
 451             (exists $seqdata->{$qual}) ? ($seqdata->{$qual}.= $delim.$qualdata || '') :
 452                                          ($seqdata->{$qual} .= $qualdata || '');
 453         } else {
 454             if (!ref($seqdata->{$qual})) {
 455                 $seqdata->{$qual} = [$seqdata->{$qual}];
 456             }
 457             (exists $seqdata->{$qual}->[$ct]) ? (($seqdata->{$qual}->[$ct]) .= $delim.$qualdata) :
 458                                              (($seqdata->{$qual}->[$ct]) .= $qualdata);
 459         }
 460     }
 461 }
 462
 463 1;
 464
 465 __END__
 466