Bio/SeqIO/Handler/GenericRichSeqHandler.pm

   1 #
   2 # BioPerl module for Bio::SeqIO::Handler::GenericRichSeqHandler
   3 #
   4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   5 #
   6 # Cared for by Chris Fields
   7 #
   8 # Copyright Chris Fields
   9 #
  10 # You may distribute this module under the same terms as perl itself
  11
  12 # POD documentation - main docs before the code
  13
  14 =head1 NAME
  15
  16 Bio::SeqIO::Handler::GenericRichSeqHandler - Bio::HandlerI-based
  17 data handler for GenBank/EMBL/UniProt (and other) sequence data
  18
  19 =head1 SYNOPSIS
  20
  21   # MyHandler is a GenericRichSeqHandler object.
  22   # inside a parser (driver) constructor....
  23
  24   $self->seq_handler($handler || MyHandler->new(-format => 'genbank'));
  25
  26   # in next_seq() in driver...
  27
  28   $hobj = $self->seqhandler();
  29
  30   # roll data up into hashref chunks, pass off into Handler for processing...
  31
  32   $hobj->data_handler($data);
  33
  34   # or retrieve Handler methods and pass data directly to Handler methods...
  35
  36   my $hmeth = $hobj->handler_methods;
  37
  38   if ($hmeth->{ $data->{NAME} }) {
  39       my $mth = $hmeth->{ $data->{NAME} };
  40       $hobj->$mth($data);
  41   }
  42
  43 =head1 DESCRIPTION
  44
  45 This is an experimental implementation of a sequence-based HandlerBaseI parser
  46 and may change over time. It is possible (nay, likely) that the way handler
  47 methods are set up will change over development to allow more flexibility.
  48 Release pumpkins, please do not add this to a release until the API has settled.
  49 It is also likely that write_seq() will not work properly for some data.
  50
  51 Standard Developer caveats:
  52
  53 Do not use for production purposes.
  54 Not responsible for destroying (your data|computer|world).
  55 Do not stare directly at GenericRichSeqHandler.
  56 If GenericRichSeqHandler glows, back slowly away and call for help.
  57
  58 Consider yourself warned!
  59
  60 This class acts as a demonstration on how to handle similar data chunks derived
  61 from Bio::SeqIO::gbdriver, Bio::SeqIO::embldriver, and Bio::SeqIO::swissdriver
  62 using similar (or the same) handler methods.
  63
  64 The modules currently pass all previous tests in t/genbank.t, t/embl.t, and
  65 t/swiss.t yet all use the same handler methods (the collected tests for handlers
  66 can be found in t/Handler.t). Some tweaking of the methods themselves is
  67 probably in order over the long run to ensure that data is consistently handled
  68 for each parser.  Round-trip tests are probably in order here...
  69
  70 Though a Bio::Seq::SeqBuilder is employed for building sequence objects no
  71 bypassing of data based on builder slots has been implemented (yet); this is
  72 planned in the near future.
  73
  74 As a reminder: this is the current Annotation data chunk (via Data::Dumper):
  75
  76   $VAR1 = {
  77             'NAME' => 'REFERENCE',
  78             'DATA' => '1  (bases 1 to 10001)'
  79             'AUTHORS' => 'International Human Genome Sequencing Consortium.'
  80             'TITLE' => 'The DNA sequence of Homo sapiens'
  81             'JOURNAL' => 'Unpublished (2003)'
  82           };
  83   ...
  84
  85 This is the current SeqFeature data chunk (again via Data::Dumper):
  86
  87   $VAR1 = {
  88             'mol_type' => 'genomic DNA',
  89             'LOCATION' => '<1..>10001',
  90             'NAME' => 'FEATURES',
  91             'FEATURE_KEY' => 'source',
  92             'note' => 'Accession AL451081 sequenced by The Sanger Centre',
  93             'db_xref' => 'taxon:9606',
  94             'clone' => 'RP11-302I18',
  95             'organism' => 'Homo sapiens'
  96           };
  97
  98 =head1 FEEDBACK
  99
 100 =head2 Mailing Lists
 101
 102 User feedback is an integral part of the evolution of this and other
 103 Bioperl modules. Send your comments and suggestions preferably to one
 104 of the Bioperl mailing lists.  Your participation is much appreciated.
 105
 106   bioperl-l@bioperl.org                  - General discussion
 107   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
 108
 109 =head2 Support
 110
 111 Please direct usage questions or support issues to the mailing list:
 112
 113 I<bioperl-l@bioperl.org>
 114
 115 rather than to the module maintainer directly. Many experienced and
 116 reponsive experts will be able look at the problem and quickly
 117 address it. Please include a thorough description of the problem
 118 with code and data examples if at all possible.
 119
 120 =head2 Reporting Bugs
 121
 122 Report bugs to the Bioperl bug tracking system to help us keep track
 123 the bugs and their resolution.  Bug reports can be submitted via the
 124 web:
 125
 126   https://github.com/bioperl/bioperl-live/issues
 127
 128 =head1 AUTHOR - Chris Fields
 129
 130 Email cjfields at bioperl dot org
 131
 132 =head1 APPENDIX
 133
 134 The rest of the documentation details each of the object methods. Internal
 135 methods are usually preceded with a _
 136
 137 =cut
 138
 139 # Let the code begin...
 140
 141 package Bio::SeqIO::Handler::GenericRichSeqHandler;
 142 use strict;
 143 use warnings;
 144
 145 use Bio::SeqIO::FTHelper;
 146 use Bio::Annotation::Collection;
 147 use Bio::Annotation::DBLink;
 148 use Bio::Annotation::Comment;
 149 use Bio::Annotation::Reference;
 150 use Bio::Annotation::Collection;
 151 use Bio::Annotation::SimpleValue;
 152 use Bio::Annotation::TagTree;
 153 use Bio::SeqFeature::Generic;
 154 use Bio::Species;
 155 use Bio::Taxon;
 156 use Bio::DB::Taxonomy;
 157 use Bio::Factory::FTLocationFactory;
 158 use Data::Dumper;
 159
 160 use base qw(Bio::Root::Root Bio::HandlerBaseI);
 161
 162 my %HANDLERS = (
 163     'genbank'   => {
 164         'LOCUS'         => \&_genbank_locus,
 165         'DEFINITION'    => \&_generic_description,
 166         'ACCESSION'     => \&_generic_accession,
 167         'VERSION'       => \&_generic_version,
 168         'KEYWORDS'      => \&_generic_keywords,
 169         'DBSOURCE'      => \&_genbank_dbsource,
 170         'DBLINK'        => \&_genbank_dbsource,
 171         'SOURCE'        => \&_generic_species,
 172         'REFERENCE'     => \&_generic_reference,
 173         'COMMENT'       => \&_generic_comment,
 174         'FEATURES'      => \&_generic_seqfeatures,
 175         'BASE'          => \&noop,    # this is generated from scratch
 176         'ORIGIN'        => \&_generic_seq,
 177         # handles anything else (WGS, WGS_SCAFLD, CONTIG, PROJECT)
 178         '_DEFAULT_'     => \&_generic_simplevalue,
 179         },
 180     'embl'      => {
 181         'ID'            => \&_embl_id,
 182         'DT'            => \&_embl_date,
 183         'DR'            => \&_generic_dbsource,
 184         'SV'            => \&_generic_version,
 185         'RN'            => \&_generic_reference,
 186         'KW'            => \&_generic_keywords,
 187         'DE'            => \&_generic_description,
 188         'AC'            => \&_generic_accession,
 189         #'AH'            => \&noop, # TPA data not dealt with yet...
 190         #'AS'            => \&noop,
 191         'SQ'            => \&_generic_seq,
 192         'OS'            => \&_generic_species,
 193         'CC'            => \&_generic_comment,
 194         'FT'            => \&_generic_seqfeatures,
 195         # handles anything else (WGS, TPA, ANN...)
 196         '_DEFAULT_'     => \&_generic_simplevalue,
 197         },
 198     'swiss'     => {
 199         'ID'            => \&_swiss_id,
 200         'DT'            => \&_swiss_date,
 201         'GN'            => \&_swiss_genename,
 202         'DR'            => \&_generic_dbsource,
 203         'RN'            => \&_generic_reference,
 204         'KW'            => \&_generic_keywords,
 205         'DE'            => \&_generic_description,
 206         'AC'            => \&_generic_accession,
 207         'SQ'            => \&_generic_seq,
 208         'OS'            => \&_generic_species,
 209         'CC'            => \&_generic_comment,
 210         'FT'            => \&_generic_seqfeatures,
 211         # handles anything else, though I don't know what...
 212         '_DEFAULT_'     => \&_generic_simplevalue,
 213         },
 214     );
 215
 216 # can we do this generically?  Seems like a lot of trouble...
 217 my %DBSOURCE = map {$_ => 1} qw(
 218     EchoBASE     IntAct    SWISS-2DPAGE    ECO2DBASE    ECOGENE    TIGRFAMs
 219     TIGR    GO    InterPro    Pfam    PROSITE    SGD    GermOnline
 220     HSSP    PhosSite    Ensembl    RGD    AGD    ArrayExpress    KEGG
 221     H-InvDB    HGNC    LinkHub    PANTHER    PRINTS    SMART    SMR
 222     MGI    MIM    RZPD-ProtExp    ProDom    MEROPS    TRANSFAC    Reactome
 223     UniGene    GlycoSuiteDB    PIRSF    HSC-2DPAGE    PHCI-2DPAGE
 224     PMMA-2DPAGE    Siena-2DPAGE    Rat-heart-2DPAGE    Aarhus/Ghent-2DPAGE
 225     Biocyc    MetaCyc    Biocyc:Metacyc    GenomeReviews    FlyBase
 226     TMHOBP    COMPLUYEAST-2DPAGE    OGP    DictyBase    HAMAP
 227     PhotoList    Gramene    WormBase    WormPep    Genew    ZFIN
 228     PeroxiBase    MaizeDB    TAIR    DrugBank    REBASE    HPA
 229     swissprot    GenBank    GenPept    REFSEQ    embl    PDB    UniProtKB);
 230
 231 my %NOPROCESS = map {$_ => 1} qw(DBSOURCE ORGANISM FEATURES);
 232
 233 our %VALID_ALPHABET = (
 234     'bp' => 'dna',
 235     'aa' => 'protein',
 236     'rc' => '' # rc = release candidate; file has no sequences
 237 );
 238
 239 =head2 new
 240
 241  Title   :  new
 242  Usage   :
 243  Function:
 244  Returns :
 245  Args    :  -format    Sequence format to be mapped for handler methods
 246             -builder   Bio::Seq::SeqBuilder object (normally defined in
 247                        SequenceStreamI object implementation constructor)
 248  Throws  :  On undefined '-format' sequence format parameter
 249  Note    :  Still under heavy development
 250
 251 =cut
 252
 253 sub new {
 254     my ($class, @args) = @_;
 255     my $self = $class->SUPER::new(@args);
 256     $self = {@args};
 257     bless $self,$class;
 258     my ($format, $builder) = $self->_rearrange([qw(FORMAT BUILDER)], @args);
 259     $self->throw("Must define sequence record format") if !$format;
 260     $self->format($format);
 261     $self->handler_methods();
 262     $builder  &&  $self->seqbuilder($builder);
 263     $self->location_factory();
 264     return $self;
 265 }
 266
 267 =head1 L<Bio::HandlerBaseI> implementing methods
 268
 269 =head2 handler_methods
 270
 271  Title   :  handler_methods
 272  Usage   :  $handler->handler_methods('GenBank')
 273             %handlers = $handler->handler_methods();
 274  Function:  Retrieve the handler methods used for the current format() in
 275             the handler.  This assumes the handler methods are already
 276             described in the HandlerI-implementing class.
 277  Returns :  a hash reference with the data type handled and the code ref
 278             associated with it.
 279  Args    :  [optional] String representing the sequence format.  If set here
 280             this will also set sequence_format()
 281  Throws  :  On unimplemented sequence format in %HANDLERS
 282
 283 =cut
 284
 285 sub handler_methods {
 286     my $self = shift;
 287     if (!($self->{'handlers'})) {
 288         $self->throw("No handlers defined for seqformat ",$self->format)
 289             unless exists $HANDLERS{$self->format};
 290         $self->{'handlers'} = $HANDLERS{$self->format};
 291     }
 292     return ($self->{'handlers'});
 293 }
 294
 295 =head2 data_handler
 296
 297  Title   :  data_handler
 298  Usage   :  $handler->data_handler($data)
 299  Function:  Centralized method which accepts all data chunks, then distributes
 300             to the appropriate methods for processing based on the chunk name
 301             from within the HandlerBaseI object.
 302
 303             One can also use
 304  Returns :  None
 305  Args    :  an hash ref containing a data chunk.
 306
 307 =cut
 308
 309 sub data_handler {
 310     my ($self, $data) = @_;
 311     my $nm = $data->{NAME} || $self->throw("No name tag defined!");
 312
 313     # this should handle data on the fly w/o caching; any caching should be
 314     # done in the driver!
 315     my $method = (exists $self->{'handlers'}->{$nm}) ? ($self->{'handlers'}->{$nm}) :
 316                 (exists $self->{'handlers'}->{'_DEFAULT_'}) ? ($self->{'handlers'}->{'_DEFAULT_'}) :
 317                 undef;
 318     if (!$method) {
 319         $self->debug("No handler defined for $nm\n");
 320         return;
 321     };
 322     $self->$method($data);
 323 }
 324
 325 =head2 reset_parameters
 326
 327  Title   :  reset_parameters
 328  Usage   :  $handler->reset_parameters()
 329  Function:  Resets the internal cache of data (normally object parameters for
 330             a builder or factory)
 331  Returns :  None
 332  Args    :  None
 333
 334 =cut
 335
 336 sub reset_parameters {
 337     my $self = shift;
 338     $self->{'_params'} = undef;
 339 }
 340
 341 =head2 format
 342
 343  Title   :  format
 344  Usage   :  $handler->format('GenBank')
 345  Function:  Get/Set the format for the report/record being parsed. This can be
 346             used to set handlers in classes which are capable of processing
 347             similar data chunks from multiple driver modules.
 348  Returns :  String with the sequence format
 349  Args    :  [optional] String with the sequence format
 350  Note    :  The format may be used to set the handlers (as in the
 351             current GenericRichSeqHandler implementation)
 352
 353 =cut
 354
 355 sub format {
 356     my $self = shift;
 357     return $self->{'_seqformat'} = lc shift if @_;
 358     return $self->{'_seqformat'};
 359 }
 360
 361 =head2 get_params
 362
 363  Title   :  get_params
 364  Usage   :  $handler->get_params('-species')
 365  Function:  Convenience method used to retrieve the specified
 366             parameters from the internal parameter cache
 367  Returns :  Hash ref containing parameters requested and data as
 368             key-value pairs.  Note that some parameter values may be
 369             objects, arrays, etc.
 370  Args    :  List (array) representing the parameters requested
 371
 372 =cut
 373
 374 sub get_params {
 375     my ($self, @ids) = @_;
 376     my %data;
 377     for my $id (@ids) {
 378         if (!index($id, '-')==0) {
 379             $id = '-'.$id ;
 380         }
 381         $data{$id} = $self->{'_params'}->{$id} if (exists $self->{'_params'}->{$id});
 382     }
 383     return \%data;
 384 }
 385
 386 =head2 set_params
 387
 388  Title   :  set_params
 389  Usage   :  $handler->set_param({'-species')
 390  Function:  Convenience method used to set specific parameters
 391  Returns :  None
 392  Args    :  Hash ref containing the data to be passed as key-value pairs
 393
 394 =cut
 395
 396 sub set_params {
 397     shift->throw('Not implemented yet!');
 398 }
 399
 400 =head1 Methods unique to this implementation
 401
 402 =head2 seqbuilder
 403
 404  Title   :  seqbuilder
 405  Usage   :
 406  Function:
 407  Returns :
 408  Args    :
 409  Throws  :
 410  Note    :
 411
 412 =cut
 413
 414 sub seqbuilder {
 415     my $self = shift;
 416     return $self->{'_seqbuilder'} = shift if @_;
 417     return $self->{'_seqbuilder'};
 418 }
 419
 420 =head2 build_sequence
 421
 422  Title   :  build_sequence
 423  Usage   :
 424  Function:
 425  Returns :
 426  Args    :
 427  Throws  :
 428  Note    :
 429
 430 =cut
 431
 432 sub build_sequence {
 433     my $self = shift;
 434     my $builder = $self->seqbuilder();
 435     my $seq;
 436     if (defined($self->{'_params'})) {
 437         $builder->add_slot_value(%{ $self->{'_params'} });
 438         $seq = $builder->make_object();
 439         $self->reset_parameters;
 440     }
 441     return $seq if $seq;
 442     return 0;
 443 }
 444
 445 =head2 location_factory
 446
 447  Title   :  location_factory
 448  Usage   :
 449  Function:
 450  Returns :
 451  Args    :
 452  Throws  :
 453  Note    :
 454
 455 =cut
 456
 457 sub location_factory {
 458     my ($self, $factory) = @_;
 459     if ($factory) {
 460         $self->throw("Must have a Bio::Factory::LocationFactoryI when ".
 461                      "explicitly setting factory()") unless
 462              (ref($factory) && $factory->isa('Bio::Factory::LocationFactoryI'));
 463         $self->{'_locfactory'} = $factory;
 464     } elsif (!defined($self->{'_locfactory'})) {
 465         $self->{'_locfactory'} = Bio::Factory::FTLocationFactory->new()
 466     }
 467     return $self->{'_locfactory'};
 468 }
 469
 470 =head2 annotation_collection
 471
 472  Title   :  annotation_collection
 473  Usage   :
 474  Function:
 475  Returns :
 476  Args    :
 477  Throws  :
 478  Note    :
 479
 480 =cut
 481
 482 sub annotation_collection {
 483     my ($self, $coll) = @_;
 484     if ($coll) {
 485         $self->throw("Must have Bio::AnnotationCollectionI ".
 486                      "when explicitly setting collection()")
 487             unless (ref($coll) && $coll->isa('Bio::AnnotationCollectionI'));
 488         $self->{'_params'}->{'-annotation'} = $coll;
 489     } elsif (!exists($self->{'_params'}->{'-annotation'})) {
 490         $self->{'_params'}->{'-annotation'} = Bio::Annotation::Collection->new()
 491     }
 492     return $self->{'_params'}->{'-annotation'};
 493 }
 494
 495 ####################### SEQUENCE HANDLERS #######################
 496
 497 # any sequence data
 498 sub _generic_seq {
 499     my ($self, $data) = @_;
 500     $self->{'_params'}->{'-seq'} = $data->{DATA};
 501 }
 502
 503 ####################### RAW DATA HANDLERS #######################
 504
 505 # GenBank LOCUS line
 506 sub _genbank_locus {
 507     my ($self, $data) = @_;
 508     my (@tokens) = split m{\s+}, $data->{DATA};
 509     my $display_id = shift @tokens;
 510     $self->{'_params'}->{'-display_id'} = $display_id;
 511     my $seqlength = shift @tokens;
 512     if (exists $VALID_ALPHABET{$seqlength}) {
 513         # moved one token too far.  No locus name?
 514         $self->warn("Bad LOCUS name?  Changing [".$self->{'_params'}->{'-display_id'}.
 515                     "] to 'unknown' and length to ".$self->{'_params'}->{'-display_id'});
 516         $self->{'_params'}->{'-length'} = $self->{'_params'}->{'-display_id'};
 517         $self->{'_params'}->{'-display_id'} = 'unknown';
 518         # add token back...
 519         unshift @tokens, $seqlength;
 520     } else {
 521         $self->{'_params'}->{'-length'} = $seqlength;
 522     }
 523     my $alphabet = lc(shift @tokens);
 524     $self->{'_params'}->{'-alphabet'} =
 525         (exists $VALID_ALPHABET{$alphabet}) ? $VALID_ALPHABET{$alphabet} :
 526                            $self->warn("Unknown alphabet: $alphabet");
 527     if (($self->{'_params'}->{'-alphabet'} eq 'dna') || (@tokens > 2)) {
 528             $self->{'_params'}->{'-molecule'} = shift(@tokens);
 529             my $circ = shift(@tokens);
 530             if ($circ eq 'circular') {
 531                 $self->{'_params'}->{'-is_circular'} = 1;
 532                 $self->{'_params'}->{'-division'} = shift(@tokens);
 533             } else {
 534                                 # 'linear' or 'circular' may actually be omitted altogether
 535                 $self->{'_params'}->{'-division'} =
 536                     (CORE::length($circ) == 3 ) ? $circ : shift(@tokens);
 537             }
 538         } else {
 539             $self->{'_params'}->{'-molecule'} = 'PRT' if($self->{'_params'}->{'-alphabet'} eq 'aa');
 540             $self->{'_params'}->{'-division'} = shift(@tokens);
 541         }
 542     my $date = join(' ', @tokens);
 543     # maybe use Date::Time for dates?
 544     if($date && $date =~ s{\s*((\d{1,2})-(\w{3})-(\d{2,4})).*}{$1}) {
 545
 546             if( length($date) < 11 ) {
 547             # improperly formatted date
 548             # But we'll be nice and fix it for them
 549             my ($d,$m,$y) = ($2,$3,$4);
 550             if( length($d) == 1 ) {
 551                 $d = "0$d";
 552             }
 553             # guess the century here
 554             if( length($y) == 2 ) {
 555                 if( $y > 60 ) { # arbitrarily guess that '60' means 1960
 556                 $y = "19$y";
 557                 } else {
 558                 $y = "20$y";
 559                 }
 560                 $self->warn("Date was malformed, guessing the century for $date to be $y\n");
 561             }
 562             $self->{'_params'}->{'-dates'} = [join('-',$d,$m,$y)];
 563             } else {
 564             $self->{'_params'}->{'-dates'} = [$date];
 565             }
 566         }
 567 }
 568
 569 # EMBL ID line
 570 sub _embl_id {
 571     my ($self, $data) = @_;
 572     my $alphabet;
 573     my ($name, $sv, $topology, $mol, $div);
 574     my $line = $data->{DATA};
 575     #$self->debug("$line\n");
 576     my ($idtype) = $line =~ tr/;/;/;
 577     if ( $idtype == 6) {   # New style headers contain exactly six semicolons.
 578         # New style header (EMBL Release >= 87, after June 2006)
 579         my $topology;
 580         my $sv;
 581
 582         # ID   DQ299383; SV 1; linear; mRNA; STD; MAM; 431 BP.
 583                 # This regexp comes from the new2old.pl conversion script, from EBI
 584         if ($line =~ m/^(\w+);\s+SV (\d+); (\w+); ([^;]+); (\w{3}); (\w{3}); (\d+) \w{2}\./) {
 585             ($name, $sv, $topology, $mol, $div) = ($1, $2, $3, $4, $6);
 586         } else {
 587             $self->throw("Unrecognized EMBL ID line:[$line]");
 588         }
 589         if (defined($sv)) {
 590             $self->{'_params'}->{'-seq_version'} = $sv;
 591             $self->{'_params'}->{'-version'} = $sv;
 592         }
 593
 594         if ($topology eq "circular") {
 595             $self->{'_params'}->{'-is_circular'} = 1;
 596         }
 597
 598         if (defined $mol ) {
 599             if ($mol =~ /DNA/) {
 600                 $alphabet='dna';
 601             }
 602             elsif ($mol =~ /RNA/) {
 603                 $alphabet='rna';
 604             }
 605             elsif ($mol =~ /AA/) {
 606                 $alphabet='protein';
 607             }
 608         }
 609     } elsif ($idtype) { # has internal ';'
 610         # Old style header (EMBL Release < 87, before June 2006)
 611         if ($line =~ m{^(\S+)[^;]*;\s+(\S+)[^;]*;\s+(\S+)[^;]*;}) {
 612             ($name, $mol, $div) = ($1, $2, $3);
 613             #$self->debug("[$name][$mol][$div]");
 614         }
 615
 616         if($mol) {
 617             if ( $mol =~ m{circular} ) {
 618                 $self->{'_params'}->{'-is_circular'} = 1;
 619                 $mol =~  s{circular }{};
 620             }
 621             if (defined $mol ) {
 622                         if ($mol =~ /DNA/) {
 623                             $alphabet='dna';
 624                         }
 625                         elsif ($mol =~ /RNA/) {
 626                             $alphabet='rna';
 627                         }
 628                         elsif ($mol =~ /AA/) {
 629                             $alphabet='protein';
 630                         }
 631             }
 632         }
 633     } else {
 634         $name = $data->{DATA};
 635     }
 636     unless( defined $name && length($name) ) {
 637         $name = "unknown_id";
 638     }
 639     $self->{'_params'}->{'-display_id'} = $name;
 640     $self->{'_params'}->{'-alphabet'} = $alphabet;
 641     $self->{'_params'}->{'-division'} = $div if $div;
 642     $self->{'_params'}->{'-molecule'} = $mol if $mol;
 643 }
 644
 645 # UniProt/SwissProt ID line
 646 sub _swiss_id {
 647     my ($self, $data) = @_;
 648     my ($name, $seq_div);
 649     if($data->{DATA} =~ m{^
 650                 (\S+)           \s+     #  $1  entryname
 651                 ([^\s;]+);      \s+     #  $2  DataClass
 652                 (?:PRT;)?       \s+     #  Molecule Type (optional)
 653                 [0-9]+[ ]AA     \.      #  Sequencelength (capture?)
 654                 $
 655                 }ox ) {
 656         ($name, $seq_div) = ($1, $2);
 657         $self->{'_params'}->{'-namespace'} =
 658           ($seq_div eq 'Reviewed'   || $seq_div eq 'STANDARD')     ? 'Swiss-Prot' :
 659           ($seq_div eq 'Unreviewed' || $seq_div eq 'PRELIMINARY')  ? 'TrEMBL'     :
 660            $seq_div;
 661         # we shouldn't be setting the division, but for now...
 662         my ($junk, $division) = split q(_), $name;
 663         $self->{'_params'}->{'-division'} = $division;
 664         $self->{'_params'}->{'-alphabet'} = 'protein';
 665         # this is important to have the id for display in e.g. FTHelper, otherwise
 666         # you won't know which entry caused an error
 667         $self->{'_params'}->{'-display_id'} = $name;
 668     } else {
 669         $self->throw("Unrecognized UniProt/SwissProt ID line:[".$data->{DATA}."]");
 670     }
 671 }
 672
 673 # UniProt/SwissProt GN line
 674 sub _swiss_genename {
 675     my ($self, $data) = @_;
 676     #$self->debug(Dumper($data));
 677     my $genename = $data->{DATA};
 678     my $gn;
 679     if ($genename) {
 680         my @stags;
 681         if ($genename =~ /\w=\w/) {
 682             # new format (e.g., Name=RCHY1; Synonyms=ZNF363, CHIMP)
 683             for my $n (split(m{\s+and\s+},$genename)) {
 684                 my @genenames;
 685                 for my $section (split(m{\s*;\s*},$n)) {
 686                     my ($tag, $rest) = split("=",$section);
 687                     $rest ||= '';
 688                     for my $val (split(m{\s*,\s*},$rest)) {
 689                         push @genenames, [$tag => $val];
 690                     }
 691                 }
 692                 push @stags, ['gene_name' => \@genenames];
 693             }
 694         } else {
 695             # old format
 696             for my $section (split(/ AND /, $genename)) {
 697                 my @genenames;
 698                 $section =~ s/[\(\)\.]//g;
 699                 my @names = split(m{\s+OR\s+}, $section);
 700                 push @genenames, ['Name' => shift @names];
 701                 push @genenames, map {['Synonyms' => $_]} @names;
 702                 push @stags, ['gene_name' => \@genenames]
 703             }
 704         } #use Data::Dumper; print Dumper $gn, $genename;# exit;
 705         my $gn = Bio::Annotation::TagTree->new(-tagname => 'gene_name',
 706                                                -value => ['gene_names' => \@stags]);
 707         $self->annotation_collection->add_Annotation('gene_name', $gn);
 708     }
 709 }
 710
 711 # GenBank VERSION line
 712 # old EMBL SV line (now obsolete)
 713 # UniProt/SwissProt?
 714 sub _generic_version {
 715     my ($self, $data) = @_;
 716     my ($acc,$gi) = split(' ',$data->{DATA});
 717     if($acc =~ m{^\w+\.(\d+)}xmso) {
 718         $self->{'_params'}->{'-version'} = $1;
 719         $self->{'_params'}->{'-seq_version'} = $1;
 720     }
 721     if($gi && (index($gi,"GI:") == 0)) {
 722         $self->{'_params'}->{'-primary_id'} = substr($gi,3);
 723     }
 724 }
 725
 726 # EMBL DT lines
 727 sub _embl_date {
 728     my ($self, $data) = @_;
 729     while ($data->{DATA} =~ m{(\S+)\s\((.*?)\)}g) {
 730         my ($date, $version) = ($1, $2);
 731         $date =~ tr{,}{}d; # remove comma if new version
 732         if ($version =~ m{\(Rel\.\s(\d+),\sCreated\)}xmso ) {
 733             my $release = Bio::Annotation::SimpleValue->new(
 734                             -tagname    => 'creation_release',
 735                             -value      => $1
 736                             );
 737             $self->annotation_collection->add_Annotation($release);
 738         } elsif ($version =~ m{\(Rel\.\s(\d+),\sLast\supdated,\sVersion\s(\d+)\)}xmso ) {
 739             my $release = Bio::Annotation::SimpleValue->new(
 740                             -tagname    => 'update_release',
 741                             -value      => $1
 742                             );
 743             $self->annotation_collection->add_Annotation($release);
 744             my $update = Bio::Annotation::SimpleValue->new(
 745                            -tagname    => 'update_version',
 746                            -value      => $2
 747                            );
 748             $self->annotation_collection->add_Annotation($update);
 749         }
 750         push @{ $self->{'_params'}->{'-dates'} }, $date;
 751     }
 752 }
 753
 754 # UniProt/SwissProt DT lines
 755 sub _swiss_date {
 756     my ($self, $data) = @_;
 757     # swissprot
 758     my @dls = split m{\n}, $data->{DATA};
 759     for my $dl (@dls) {
 760         my ($date, $version) = split(' ', $dl, 2);
 761         $date =~ tr{,}{}d; # remove comma if new version
 762         if ($version =~ m{\(Rel\. (\d+), Last sequence update\)} || # old
 763             $version =~ m{sequence version (\d+)\.}) { #new
 764         my $update = Bio::Annotation::SimpleValue->new(
 765                                     -tagname    => 'seq_update',
 766                                     -value      => $1
 767                                     );
 768         $self->annotation_collection->add_Annotation($update);
 769         } elsif ($version =~ m{\(Rel\. (\d+), Last annotation update\)} || #old
 770                  $version =~ m{entry version (\d+)\.}) { #new
 771             $self->{'_params'}->{'-version'} = $1;
 772             $self->{'_params'}->{'-seq_version'} = $1;
 773         }
 774         push @{ $self->{'_params'}->{'-dates'} }, $date;
 775     }
 776 }
 777
 778 # GenBank KEYWORDS line
 779 # EMBL KW line
 780 # UniProt/SwissProt KW line
 781 sub _generic_keywords {
 782     my ($self, $data) = @_;
 783     $data->{DATA} =~ s{\.$}{};
 784     my @kw = split m{\s*\;\s*}xo ,$data->{DATA};
 785     $self->{'_params'}->{'-keywords'} = \@kw;
 786 }
 787
 788 # GenBank DEFINITION line
 789 # EMBL DE line
 790 # UniProt/SwissProt DE line
 791 sub _generic_description {
 792     my ($self, $data) = @_;
 793     $self->{'_params'}->{'-desc'} = $data->{DATA};
 794 }
 795
 796 # GenBank ACCESSION line
 797 # EMBL AC line
 798 # UniProt/SwissProt AC line
 799 sub _generic_accession {
 800     my ($self, $data) = @_;
 801     my @accs = split m{[\s;]+}, $data->{DATA};
 802     $self->{'_params'}->{'-accession_number'} = shift @accs;
 803     $self->{'_params'}->{'-secondary_accessions'} = \@accs if @accs;
 804 }
 805
 806 ####################### SPECIES HANDLERS #######################
 807
 808 # uses Bio::Species
 809 # GenBank SOURCE, ORGANISM lines
 810 # EMBL O* lines
 811 # UniProt/SwissProt O* lines
 812 sub _generic_species {
 813     my ($self, $data) = @_;
 814
 815     my $seqformat = $self->format;
 816     # if data is coming in from GenBank parser...
 817     if ($seqformat eq 'genbank' &&
 818         $data->{ORGANISM} =~ m{(.+?)\s(\S+;[^\n\.]+)}ox) {
 819         ($data->{ORGANISM}, $data->{CLASSIFICATION}) = ($1, $2);
 820     }
 821
 822     # SwissProt stuff...
 823     # hybrid names in swissprot files are no longer valid per intergration into
 824     # UniProt. Files containing these have been split into separate entries, so
 825     # it is probably a good idea to update if one has these lingering around...
 826
 827     my $taxid;
 828     if ($seqformat eq 'swiss') {
 829         if ($data->{DATA} =~ m{^([^,]+)}ox) {
 830             $data->{DATA} = $1;
 831         }
 832         if ($data->{CROSSREF} && $data->{CROSSREF} =~ m{NCBI_TaxID=(\d+)}) {
 833             $taxid = $1;
 834         }
 835     }
 836
 837     my ($sl, $class, $sci_name) = ($data->{DATA},
 838                                    $data->{CLASSIFICATION},
 839                                    $data->{ORGANISM} || '');
 840     my ($organelle,$abbr_name, $common);
 841     my @class = reverse split m{\s*;\s*}, $class;
 842     # have to treat swiss different from everything else...
 843     if ($sl =~ m{^(mitochondrion|chloroplast|plastid)?   # GenBank format
 844                 \s*(.*?)
 845                 \s*(?: \( (.*?) \) )?\.?$
 846          }xmso ){
 847         ($organelle, $abbr_name, $common) = ($1, $2, $3); # optional
 848     } else {
 849         $abbr_name = $sl;       # nothing caught; this is a backup!
 850     }
 851     # there is no 'abbreviated name' for EMBL
 852     $sci_name = $abbr_name if $seqformat ne 'genbank';
 853     $organelle ||= '';
 854     $common ||= '';
 855     $sci_name || return;
 856     unshift @class, $sci_name;
 857     # no genus/species parsing here; moving to Bio::Taxon-based taxonomy
 858     my $make = Bio::Species->new();
 859     $make->scientific_name($sci_name);
 860     $make->classification(@class) if @class > 0;
 861     $common    && $make->common_name( $common );
 862     $abbr_name && $make->name('abbreviated', $abbr_name);
 863     $organelle && $make->organelle($organelle);
 864     $taxid     && $make->ncbi_taxid($taxid);
 865     $self->{'_params'}->{'-species'} = $make;
 866 }
 867
 868 ####################### ANNOTATION HANDLERS #######################
 869
 870 # GenBank DBSOURCE line
 871 sub _genbank_dbsource {
 872     my ($self, $data) = @_;
 873     my $dbsource = $data->{DATA};
 874     my $annotation = $self->annotation_collection;
 875     # deal with swissprot dbsources
 876     # we could possibly parcel these out to subhandlers...
 877     if( $dbsource =~ s/(UniProt(?:KB)|swissprot):\s+locus\s+(\S+)\,.+\n// ) {
 878         $annotation->add_Annotation
 879             ('dblink',
 880              Bio::Annotation::DBLink->new
 881              (-primary_id => $2,
 882               -database => $1,
 883               -tagname => 'dblink'));
 884         if( $dbsource =~ s/\s*created:\s+([^\.]+)\.\n// ) {
 885             $annotation->add_Annotation
 886             ('swissprot_dates',
 887              Bio::Annotation::SimpleValue->new
 888              (-tagname => 'date_created',
 889               -value => $1));
 890         }
 891         while( $dbsource =~ s/\s*(sequence|annotation)\s+updated:\s+([^\.]+)\.\n//g ) {
 892             $annotation->add_Annotation
 893             ('swissprot_dates',
 894              Bio::Annotation::SimpleValue->new
 895              (-tagname => 'date_updated',
 896               -value => $1));
 897         }
 898         $dbsource =~ s/\n/ /g;
 899         if( $dbsource =~ s/\s*xrefs:\s+((?:\S+,\s+)+\S+)\s+xrefs/xrefs/ ) {
 900             # will use $i to determine even or odd
 901             # for swissprot the accessions are paired
 902             my $i = 0;
 903             for my $dbsrc ( split(/,\s+/,$1) ) {
 904                 if( $dbsrc =~ /(\S+)\.(\d+)/ || $dbsrc =~ /(\S+)/ ) {
 905                     my ($id,$version) = ($1,$2);
 906                     $version ='' unless defined $version;
 907                     my $db;
 908                     if( $id =~ /^\d\S{3}/) {
 909                         $db = 'PDB';
 910                     } else {
 911                         $db = ($i++ % 2 ) ? 'GenPept' : 'GenBank';
 912                     }
 913                     $annotation->add_Annotation
 914                     ('dblink',
 915                      Bio::Annotation::DBLink->new
 916                      (-primary_id => $id,
 917                       -version => $version,
 918                       -database => $db,
 919                       -tagname => 'dblink'));
 920                 }
 921             }
 922         } elsif( $dbsource =~ s/\s*xrefs:\s+(.+)\s+xrefs/xrefs/i ) {
 923             # download screwed up and ncbi didn't put acc in for gi numbers
 924             my $i = 0;
 925             for my $id ( split(/\,\s+/,$1) ) {
 926             my ($acc,$db);
 927             if( $id =~ /gi:\s+(\d+)/ ) {
 928                 $acc= $1;
 929                 $db = ($i++ % 2 ) ? 'GenPept' : 'GenBank';
 930             } elsif( $id =~ /pdb\s+accession\s+(\S+)/ ) {
 931                 $acc= $1;
 932                 $db = 'PDB';
 933             } else {
 934                 $acc= $id;
 935                 $db = '';
 936             }
 937             $annotation->add_Annotation
 938                 ('dblink',
 939                  Bio::Annotation::DBLink->new
 940                  (-primary_id => $acc,
 941                   -database => $db,
 942                   -tagname => 'dblink'));
 943             }
 944         } else {
 945             $self->warn("Cannot match $dbsource\n");
 946         }
 947         if( $dbsource =~ s/xrefs\s+\(non\-sequence\s+databases\):\s+
 948             ((?:\S+,\s+)+\S+)//x ) {
 949             for my $id ( split(/\,\s+/,$1) ) {
 950                 my $db;
 951                 # this is because GenBank dropped the spaces!!!
 952                 # I'm sure we're not going to get this right
 953                 ##if( $id =~ s/^://i ) {
 954                 ##    $db = $1;
 955                 ##}
 956                 $db = substr($id,0,index($id,':'));
 957                 if (! exists $DBSOURCE{ $db }) {
 958                     $db = '';   # do we want 'GenBank' here?
 959                 }
 960                 $id = substr($id,index($id,':')+1);
 961                 $annotation->add_Annotation
 962                     ('dblink',Bio::Annotation::DBLink->new
 963                      (-primary_id => $id,
 964                       -database => $db,
 965                       -tagname => 'dblink'));
 966             }
 967         }
 968     } else {
 969         if( $dbsource =~ /^(\S*?):?\s*accession\s+(\S+)\.(\d+)/ ) {
 970             my ($db,$id,$version) = ($1,$2,$3);
 971             $annotation->add_Annotation
 972             ('dblink',
 973              Bio::Annotation::DBLink->new
 974              (-primary_id => $id,
 975               -version => $version,
 976               -database => $db || 'GenBank',
 977               -tagname => 'dblink'));
 978         } elsif ( $dbsource =~ /(\S+)([\.:])(\d+)/ ) {
 979             my ($id, $db, $version);
 980             if ($2 eq ':') {
 981                 ($db, $id) = ($1, $3);
 982             } else {
 983                 ($db, $id, $version) = ('GenBank', $1, $3);
 984             }
 985             $annotation->add_Annotation('dblink',
 986                 Bio::Annotation::DBLink->new(
 987                     -primary_id => $id,
 988                     -version => $version,
 989                     -database => $db,
 990                     -tagname => 'dblink')
 991                 );
 992         } else {
 993             $self->warn("Unrecognized DBSOURCE data: $dbsource\n");
 994         }
 995     }
 996 }
 997
 998 # EMBL DR lines
 999 # UniProt/SwissProt DR lines
1000 sub _generic_dbsource {
1001     my ($self, $data) = @_;
1002     #$self->debug(Dumper($data));
1003     while ($data->{DATA} =~ m{([^\n]+)}og) {
1004         my $dblink = $1;
1005         $dblink =~ s{\.$}{};
1006         my $link;
1007         my @linkdata = split '; ',$dblink;
1008         if ( $dblink =~ m{([^\s;]+);\s*([^\s;]+);?\s*([^\s;]+)?}) {
1009         #if ( $dblink =~ m{([^\s;]+);\s*([^\s;]+);?\s*([^\s;]+)?}) {
1010             my ($databse, $prim_id, $sec_id) = ($1,$2,$3);
1011             $link = Bio::Annotation::DBLink->new(-database    => $databse,
1012                                 -primary_id  => $prim_id,
1013                                 -optional_id => $sec_id);
1014         } else {
1015             $self->warn("No match for $dblink");
1016         }
1017         $self->annotation_collection->add_Annotation('dblink', $link);
1018     }
1019 }
1020
1021
1022 # GenBank REFERENCE and related lines
1023 # EMBL R* lines
1024 # UniProt/SwissProt R* lines
1025 sub _generic_reference {
1026     my ($self, $data) = @_;
1027     my $seqformat = $self->format;
1028     my ($start, $end);
1029     # get these in EMBL/Swiss
1030     if ($data->{CROSSREF}) {
1031         while ($data->{CROSSREF} =~ m{(pubmed|doi|medline)(?:=|;\s+)(\S+)}oig) {
1032             my ($db, $ref) = (uc $1, $2);
1033             $ref =~ s{[;.]+$}{};
1034             $data->{$db} = $ref;
1035         }
1036     }
1037     # run some cleanup for swissprot
1038     if ($seqformat eq 'swiss') {
1039         for my $val (values %{ $data }) {
1040             $val =~ s{;$}{};
1041             $val =~ s{(\w-)\s}{$1};
1042         }
1043     }
1044     if ( $data->{POSITION} ) {
1045         if ($seqformat eq 'embl') {
1046             ($start, $end) = split '-', $data->{POSITION},2;
1047         } elsif ($data->{POSITION} =~ m{.+? OF (\d+)-(\d+).*}) { #swiss
1048             ($start, $end) = ($1, $2);
1049         }
1050     }
1051     if ($data->{DATA} =~ m{^\d+\s+\([a-z]+\s+(\d+)\s+to\s+(\d+)\)}xmso) {
1052         ($start, $end) = ($1, $2);
1053     }
1054     my $ref = Bio::Annotation::Reference->new(
1055                     -comment    => $data->{REMARK},
1056                     -location   => $data->{JOURNAL},
1057                     -pubmed     => $data->{PUBMED},
1058                     -consortium => $data->{CONSRTM},
1059                     -title      => $data->{TITLE},
1060                     -authors    => $data->{AUTHORS},
1061                     -medline    => $data->{MEDLINE},
1062                     -doi        => $data->{DOI},
1063                     -rp         => $data->{POSITION}, # JIC...
1064                     -start      => $start,
1065                     -end        => $end,
1066                     );
1067     if ($data->{DATA} =~  m{^\d+\s+\((.*)\)}xmso) {
1068               $ref->gb_reference($1);
1069         }
1070     $self->annotation_collection->add_Annotation('reference', $ref);
1071 }
1072
1073 # GenBank COMMENT lines
1074 # EMBL CC lines
1075 # UniProt/SwissProt CC lines
1076 sub _generic_comment {
1077     my ($self, $data) = @_;
1078     $self->annotation_collection->add_Annotation('comment',
1079         Bio::Annotation::Comment->new( -text => $data->{DATA} ));
1080 }
1081
1082 ####################### SEQFEATURE HANDLER #######################
1083
1084 # GenBank Feature Table
1085 sub _generic_seqfeatures {
1086     my ($self, $data) = @_;
1087     return if $data->{FEATURE_KEY} eq 'FEATURES';
1088     my $primary_tag = $data->{FEATURE_KEY};
1089
1090     # grab the NCBI taxon ID from the source SF
1091     if ($primary_tag eq 'source' && exists $data->{'db_xref'}) {
1092         if ( $self->{'_params'}->{'-species'} &&
1093             $data->{'db_xref'} =~ m{taxon:(\d+)}xmso ) {
1094             $self->{'_params'}->{'-species'}->ncbi_taxid($1);
1095         }
1096     }
1097     my $source = $self->format;
1098
1099     my $seqid = ${ $self->get_params('accession_number')  }{'accession_number'};
1100
1101     my $loc;
1102     eval {
1103         $loc = $self->{'_locfactory'}->from_string($data->{'LOCATION'});
1104     };
1105     if(! $loc) {
1106         $self->warn("exception while parsing location line [" .
1107                     $data->{'LOCATION'} .
1108                     "] in reading $source, ignoring feature " .
1109                     $data->{'primary_tag'}.
1110                     " (seqid=" . $seqid . "): " . $@);
1111         return;
1112     }
1113     if($seqid && (! $loc->is_remote())) {
1114         $loc->seq_id($seqid); # propagates if it is a split location
1115     }
1116     my $sf = Bio::SeqFeature::Generic->direct_new();
1117     $sf->location($loc);
1118     $sf->primary_tag($primary_tag);
1119     $sf->seq_id($seqid);
1120     $sf->source_tag($source);
1121     delete $data->{'FEATURE_KEY'};
1122     delete $data->{'LOCATION'};
1123     delete $data->{'NAME'};
1124     delete $data->{'DATA'};
1125     $sf->set_attributes(-tag => $data);
1126     push @{ $self->{'_params'}->{'-features'} }, $sf;
1127 }
1128
1129 ####################### ODDS AND ENDS #######################
1130
1131 # Those things that don't fit anywhere else.  If a specific name
1132 # maps to the below table, that class and method are used, otherwise
1133 # it goes into a SimpleValue (I think this is a good argument for why
1134 # we need a generic mechanism for storing annotation)
1135
1136 sub _generic_simplevalue {
1137     my ($self, $data) = @_;
1138     $self->annotation_collection->add_Annotation(
1139         Bio::Annotation::SimpleValue->new(-tagname => lc($data->{NAME}),
1140        -value => $data->{DATA})
1141         );
1142 }
1143
1144 sub noop {}
1145
1146 sub _debug {
1147     my ($self, $data) = @_;
1148     $self->debug(Dumper($data));
1149 }
1150
1151
1152 1;