lib/Bio/SeqIO/embldriver.pm

   1 #
   2 # BioPerl module for Bio::SeqIO::embldriver
   3 #
   4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   5 #
   6 # Cared for by Ewan Birney <birney@ebi.ac.uk>
   7 #
   8 # Copyright Ewan Birney
   9 #
  10 # You may distribute this module under the same terms as perl itself
  11
  12 # POD documentation - main docs before the code
  13
  14 =head1 NAME
  15
  16 Bio::SeqIO::embldriver - EMBL sequence input/output stream
  17
  18 =head1 SYNOPSIS
  19
  20 It is probably best not to use this object directly, but
  21 rather go through the SeqIO handler system. Go:
  22
  23     $stream = Bio::SeqIO->new(-file => $filename, -format => 'embldriver');
  24
  25     while ( (my $seq = $stream->next_seq()) ) {
  26         # do something with $seq
  27     }
  28
  29 =head1 DESCRIPTION
  30
  31 This object can transform Bio::Seq objects to and from EMBL flat
  32 file databases.
  33
  34 There is a lot of flexibility here about how to dump things which
  35 should be documented more fully.
  36
  37 There should be a common object that this and Genbank share (probably
  38 with Swissprot). Too much of the magic is identical.
  39
  40 =head2 Optional functions
  41
  42 =over 3
  43
  44 =item _show_dna()
  45
  46 (output only) shows the dna or not
  47
  48 =item _post_sort()
  49
  50 (output only) provides a sorting func which is applied to the FTHelpers
  51 before printing
  52
  53 =item _id_generation_func()
  54
  55 This is function which is called as
  56
  57    print "ID   ", $func($annseq), "\n";
  58
  59 To generate the ID line. If it is not there, it generates a sensible ID
  60 line using a number of tools.
  61
  62 If you want to output annotations in EMBL format they need to be
  63 stored in a Bio::Annotation::Collection object which is accessible
  64 through the Bio::SeqI interface method L<annotation()|annotation>.
  65
  66 The following are the names of the keys which are polled from a
  67 L<Bio::Annotation::Collection> object.
  68
  69  reference  - Should contain Bio::Annotation::Reference objects
  70  comment    - Should contain Bio::Annotation::Comment objects
  71  dblink     - Should contain Bio::Annotation::DBLink objects
  72
  73 =back
  74
  75 =head1 FEEDBACK
  76
  77 =head2 Mailing Lists
  78
  79 User feedback is an integral part of the evolution of this and other
  80 Bioperl modules. Send your comments and suggestions preferably to one
  81 of the Bioperl mailing lists.  Your participation is much appreciated.
  82
  83   bioperl-l@bioperl.org                  - General discussion
  84   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  85
  86 =head2 Support
  87
  88 Please direct usage questions or support issues to the mailing list:
  89
  90 I<bioperl-l@bioperl.org>
  91
  92 rather than to the module maintainer directly. Many experienced and
  93 reponsive experts will be able look at the problem and quickly
  94 address it. Please include a thorough description of the problem
  95 with code and data examples if at all possible.
  96
  97 =head2 Reporting Bugs
  98
  99 Report bugs to the Bioperl bug tracking system to help us keep track
 100 the bugs and their resolution. Bug reports can be submitted via
 101 the web:
 102
 103   https://github.com/bioperl/bioperl-live/issues
 104
 105 =head1 AUTHOR - Ewan Birney
 106
 107 Email birney@ebi.ac.uk
 108
 109 =head1 APPENDIX
 110
 111 The rest of the documentation details each of the object
 112 methods. Internal methods are usually preceded with a _
 113
 114 =cut
 115
 116 # Let the code begin...
 117
 118 package Bio::SeqIO::embldriver;
 119
 120 use vars qw(%FTQUAL_NO_QUOTE);
 121 use strict;
 122 use Bio::SeqIO::Handler::GenericRichSeqHandler;
 123 use Data::Dumper;
 124
 125 use base qw(Bio::SeqIO);
 126
 127 my %FTQUAL_NO_QUOTE = map {$_ => 1} qw(
 128     anticodon           citation
 129     codon               codon_start
 130     compare             cons_splice
 131     direction           estimated_length
 132     evidence            label
 133     mod_base            number
 134     rpt_type            rpt_unit
 135     rpt_unit_range      tag_peptide
 136     transl_except       transl_table
 137     usedin
 138     LOCATION
 139     );
 140
 141 my %DATA_KEY = (
 142     ID      => 'ID',
 143     AC      => 'ACCESSION',
 144     DT      => 'DATE',
 145     DE      => 'DESCRIPTION',
 146     KW      => 'KEYWORDS',
 147     OS      => 'SOURCE',
 148     OC      => 'CLASSIFICATION',
 149     OG      => 'ORGANELLE',
 150     RN      => 'REFERENCE',
 151     RA      => 'AUTHORS',
 152     RC      => 'COMMENT',
 153     RG      => 'CONSRTM',
 154     RP      => 'POSITION',
 155     RX      => 'CROSSREF',
 156     RT      => 'TITLE',
 157     RL      => 'LOCATION',
 158     XX      => 'SPACER',
 159     FH      => 'FEATHEADER',
 160     FT      => 'FEATURES',
 161     AH      => 'TPA_HEADER',  # Third party annotation
 162     AS      => 'TPA_DATA',  # Third party annotation
 163     DR      => 'DBLINK',
 164     CC      => 'COMMENT',
 165     CO      => 'CO',
 166     CON     => 'CON',
 167     WGS     => 'WGS',
 168     ANN     => 'ANN',
 169     TPA     => 'TPA',
 170     SQ      => 'SEQUENCE',
 171     );
 172
 173 my %SEC = (
 174     OC      => 'CLASSIFICATION',
 175     OH      => 'HOST', # not currently handled, bundled with organism data for now
 176     OG      => 'ORGANELLE',
 177     OX      => 'CROSSREF',
 178     RA      => 'AUTHORS',
 179     RC      => 'COMMENT',
 180     RG      => 'CONSRTM',
 181     RP      => 'POSITION',
 182     RX      => 'CROSSREF',
 183     RT      => 'TITLE',
 184     RL      => 'JOURNAL',
 185     AS      => 'ASSEMBLYINFO',  # Third party annotation
 186     );
 187
 188 my %DELIM = (
 189     #CC      => "\n",
 190     #DR      => "\n",
 191     #DT      => "\n",
 192             );
 193
 194 # signals to process what's in the hash prior to next round
 195 # these should be changed to map secondary data
 196 my %PRIMARY = map {$_ => 1} qw(ID AC DT DE SV KW OS RN AH DR FH CC SQ FT WGS CON ANN TPA //);
 197
 198 sub _initialize {
 199     my($self,@args) = @_;
 200
 201     $self->SUPER::_initialize(@args);
 202     my $handler = $self->_rearrange([qw(HANDLER)],@args);
 203     # hash for functions for decoding keys.
 204     $handler ? $self->seqhandler($handler) :
 205     $self->seqhandler(Bio::SeqIO::Handler::GenericRichSeqHandler->new(
 206                     -format => 'embl',
 207                     -verbose => $self->verbose,
 208                     -builder => $self->sequence_builder
 209                     ));
 210     #
 211     if( ! defined $self->sequence_factory ) {
 212         $self->sequence_factory(Bio::Seq::SeqFactory->new
 213                 (-verbose => $self->verbose(),
 214                  -type => 'Bio::Seq::RichSeq'));
 215     }
 216 }
 217
 218 =head2 next_seq
 219
 220  Title   : next_seq
 221  Usage   : $seq = $stream->next_seq()
 222  Function: returns the next sequence in the stream
 223  Returns : Bio::Seq object
 224  Args    :
 225
 226 =cut
 227
 228 sub next_seq {
 229     my $self = shift;
 230     my $hobj = $self->seqhandler;
 231     local($/) = "\n";
 232     my ($featkey, $qual, $annkey, $delim, $seqdata);
 233     my $lastann = '';
 234     my $ct = 0;
 235     PARSER:
 236     while(defined(my $line = $self->_readline)) {
 237         next PARSER if $line =~ m{^\s*$};
 238         chomp $line;
 239         my ($ann,$data) = split m{\s{2,3}}, $line , 2;
 240         next PARSER if ($ann eq 'XX' || $ann eq 'FH');
 241         if ($ann) {
 242             $data ||='';
 243             if ($ann eq 'FT') {
 244                 # seqfeatures
 245                 if ($data =~ m{^(\S+)\s+([^\n]+)}) {
 246                     $hobj->data_handler($seqdata) if $seqdata;
 247                     $seqdata = ();
 248                     ($seqdata->{FEATURE_KEY}, $data) = ($1, $2);
 249                     $seqdata->{NAME} = $ann;
 250                     $qual = 'LOCATION';
 251                 } elsif ($data =~ m{^\s+/([^=]+)=?(.+)?}) {
 252                     ($qual, $data) = ($1, $2 ||'');
 253                     $ct = (exists $seqdata->{$qual}) ?
 254                         ((ref($seqdata->{$qual}))  ? scalar(@{ $seqdata->{$qual} }) : 1)
 255                         : 0 ;
 256                 }
 257                 $data =~ s{^\s+}{};
 258                 $data =~ tr{"}{}d; # we don't care about quotes yet...
 259                 my $delim = ($FTQUAL_NO_QUOTE{$qual}) ? '' : ' ';
 260                 if ($ct == 0) {
 261                     $seqdata->{$qual} .= ($seqdata->{$qual}) ?
 262                         $delim.$data :
 263                         $data;
 264                 } else {
 265                     if (!ref($seqdata->{$qual})) {
 266                         $seqdata->{$qual} = [$seqdata->{$qual}];
 267                     }
 268                     (exists $seqdata->{$qual}->[$ct]) ?
 269                         (($seqdata->{$qual}->[$ct]) .= $delim.$data) :
 270                         (($seqdata->{$qual}->[$ct]) .= $data);
 271                 }
 272             } else {
 273                 # simple annotations
 274                 $data =~ s{;$}{};
 275                 last PARSER if $ann eq '//';
 276                 if ($ann ne $lastann) {
 277                     if (!$SEC{$ann} && $seqdata) {
 278                         $hobj->data_handler($seqdata);
 279                         # can't use undef here; it can lead to subtle mem leaks
 280                         $seqdata = ();
 281                     }
 282                     $annkey = (!$SEC{$ann})    ? 'DATA'     : # primary data
 283                               $SEC{$ann};
 284                     $seqdata->{'NAME'} = $ann if !$SEC{$ann};
 285                 }
 286
 287                 # toss the data for SQ lines; this needs to be done after the
 288                 # call to the data handler
 289
 290                 next PARSER if $ann eq 'SQ';
 291                 my $delim = $DELIM{$ann} || ' ';
 292                 $seqdata->{$annkey} .= ($seqdata->{$annkey}) ?
 293                     $delim.$data : $data;
 294                 $lastann = $ann;
 295             }
 296         } else {
 297             # this should only be sequence (fingers crossed!)
 298             SEQUENCE:
 299             while (defined ($line = $self->_readline)) {
 300                 if (index($line, '//') == 0) {
 301                     $data =~ tr{0-9 \n}{}d;
 302                     $seqdata->{DATA} = $data;
 303                     #$self->debug(Dumper($seqdata));
 304                     $hobj->data_handler($seqdata);
 305                     $seqdata = ();
 306                     last PARSER;
 307                 } else {
 308                     $data .= $line;
 309                     $line = undef;
 310                 }
 311             }
 312         }
 313     }
 314     $hobj->data_handler($seqdata) if $seqdata;
 315     $seqdata = ();
 316     return $hobj->build_sequence;
 317 }
 318
 319 sub next_chunk {
 320     my $self = shift;
 321     my $ct = 0;
 322     PARSER:
 323     while(defined(my $line = $self->_readline)) {
 324         next if $line =~ m{^\s*$};
 325         chomp $line;
 326         my ($ann,$data) = split m{\s{2,3}}, $line , 2;
 327         $data ||= '';
 328         $self->debug("Ann: [$ann]\n\tData: [$data]\n");
 329         last PARSER if $ann =~ m{//};
 330     }
 331 }
 332
 333 =head2 write_seq
 334
 335  Title   : write_seq
 336  Usage   : $stream->write_seq($seq)
 337  Function: writes the $seq object (must be seq) to the stream
 338  Returns : 1 for success and 0 for error
 339  Args    : array of 1 to n Bio::SeqI objects
 340
 341 =cut
 342
 343 sub write_seq {
 344     shift->throw("Use Bio::SeqIO::embl for output");
 345     # maybe make a Writer class as well????
 346 }
 347
 348 =head2 seqhandler
 349
 350  Title   : seqhandler
 351  Usage   : $stream->seqhandler($handler)
 352  Function: Get/Set the Bio::Seq::HandlerBaseI object
 353  Returns : Bio::Seq::HandlerBaseI
 354  Args    : Bio::Seq::HandlerBaseI
 355
 356 =cut
 357
 358 sub seqhandler {
 359     my ($self, $handler) = @_;
 360     if ($handler) {
 361         $self->throw("Not a Bio::HandlerBaseI") unless
 362         ref($handler) && $handler->isa("Bio::HandlerBaseI");
 363         $self->{'_seqhandler'} = $handler;
 364     }
 365     return $self->{'_seqhandler'};
 366 }
 367
 368 1;
 369
 370 __END__
 371