2 # BioPerl module for Bio::SeqIO::Handler::GenericRichSeqHandler
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Chris Fields
8 # Copyright Chris Fields
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::SeqIO::Handler::GenericRichSeqHandler - Bio::HandlerI-based
17 data handler for GenBank/EMBL/UniProt (and other) sequence data
21 # MyHandler is a GenericRichSeqHandler object.
22 # inside a parser (driver) constructor....
24 $self->seq_handler($handler || MyHandler->new(-format => 'genbank'));
26 # in next_seq() in driver...
28 $hobj = $self->seqhandler();
30 # roll data up into hashref chunks, pass off into Handler for processing...
32 $hobj->data_handler($data);
34 # or retrieve Handler methods and pass data directly to Handler methods...
36 my $hmeth = $hobj->handler_methods;
38 if ($hmeth->{ $data->{NAME} }) {
39 my $mth = $hmeth->{ $data->{NAME} };
45 This is an experimental implementation of a sequence-based HandlerBaseI parser
46 and may change over time. It is possible (nay, likely) that the way handler
47 methods are set up will change over development to allow more flexibility.
48 Release pumpkins, please do not add this to a release until the API has settled.
49 It is also likely that write_seq() will not work properly for some data.
51 Standard Developer caveats:
53 Do not use for production purposes.
54 Not responsible for destroying (your data|computer|world).
55 Do not stare directly at GenericRichSeqHandler.
56 If GenericRichSeqHandler glows, back slowly away and call for help.
58 Consider yourself warned!
60 This class acts as a demonstration on how to handle similar data chunks derived
61 from Bio::SeqIO::gbdriver, Bio::SeqIO::embldriver, and Bio::SeqIO::swissdriver
62 using similar (or the same) handler methods.
64 The modules currently pass all previous tests in t/genbank.t, t/embl.t, and
65 t/swiss.t yet all use the same handler methods (the collected tests for handlers
66 can be found in t/Handler.t). Some tweaking of the methods themselves is
67 probably in order over the long run to ensure that data is consistently handled
68 for each parser. Round-trip tests are probably in order here...
70 Though a Bio::Seq::SeqBuilder is employed for building sequence objects no
71 bypassing of data based on builder slots has been implemented (yet); this is
72 planned in the near future.
74 As a reminder: this is the current Annotation data chunk (via Data::Dumper):
77 'NAME' => 'REFERENCE',
78 'DATA' => '1 (bases 1 to 10001)'
79 'AUTHORS' => 'International Human Genome Sequencing Consortium.'
80 'TITLE' => 'The DNA sequence of Homo sapiens'
81 'JOURNAL' => 'Unpublished (2003)'
85 This is the current SeqFeature data chunk (again via Data::Dumper):
88 'mol_type' => 'genomic DNA',
89 'LOCATION' => '<1..>10001',
91 'FEATURE_KEY' => 'source',
92 'note' => 'Accession AL451081 sequenced by The Sanger Centre',
93 'db_xref' => 'taxon:9606',
94 'clone' => 'RP11-302I18',
95 'organism' => 'Homo sapiens'
102 User feedback is an integral part of the evolution of this and other
103 Bioperl modules. Send your comments and suggestions preferably to one
104 of the Bioperl mailing lists. Your participation is much appreciated.
106 bioperl-l@bioperl.org - General discussion
107 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
111 Please direct usage questions or support issues to the mailing list:
113 I<bioperl-l@bioperl.org>
115 rather than to the module maintainer directly. Many experienced and
116 reponsive experts will be able look at the problem and quickly
117 address it. Please include a thorough description of the problem
118 with code and data examples if at all possible.
120 =head2 Reporting Bugs
122 Report bugs to the Bioperl bug tracking system to help us keep track
123 the bugs and their resolution. Bug reports can be submitted via the
126 https://github.com/bioperl/bioperl-live/issues
128 =head1 AUTHOR - Chris Fields
130 Email cjfields at bioperl dot org
134 The rest of the documentation details each of the object methods. Internal
135 methods are usually preceded with a _
139 # Let the code begin...
141 package Bio
::SeqIO
::Handler
::GenericRichSeqHandler
;
145 use Bio
::SeqIO
::FTHelper
;
146 use Bio
::Annotation
::Collection
;
147 use Bio
::Annotation
::DBLink
;
148 use Bio
::Annotation
::Comment
;
149 use Bio
::Annotation
::Reference
;
150 use Bio
::Annotation
::Collection
;
151 use Bio
::Annotation
::SimpleValue
;
152 use Bio
::Annotation
::TagTree
;
153 use Bio
::SeqFeature
::Generic
;
156 use Bio
::DB
::Taxonomy
;
157 use Bio
::Factory
::FTLocationFactory
;
160 use base
qw(Bio::Root::Root Bio::HandlerBaseI);
164 'LOCUS' => \
&_genbank_locus
,
165 'DEFINITION' => \
&_generic_description
,
166 'ACCESSION' => \
&_generic_accession
,
167 'VERSION' => \
&_generic_version
,
168 'KEYWORDS' => \
&_generic_keywords
,
169 'DBSOURCE' => \
&_genbank_dbsource
,
170 'DBLINK' => \
&_genbank_dbsource
,
171 'SOURCE' => \
&_generic_species
,
172 'REFERENCE' => \
&_generic_reference
,
173 'COMMENT' => \
&_generic_comment
,
174 'FEATURES' => \
&_generic_seqfeatures
,
175 'BASE' => \
&noop
, # this is generated from scratch
176 'ORIGIN' => \
&_generic_seq
,
177 # handles anything else (WGS, WGS_SCAFLD, CONTIG, PROJECT)
178 '_DEFAULT_' => \
&_generic_simplevalue
,
182 'DT' => \
&_embl_date
,
183 'DR' => \
&_generic_dbsource
,
184 'SV' => \
&_generic_version
,
185 'RN' => \
&_generic_reference
,
186 'KW' => \
&_generic_keywords
,
187 'DE' => \
&_generic_description
,
188 'AC' => \
&_generic_accession
,
189 #'AH' => \&noop, # TPA data not dealt with yet...
191 'SQ' => \
&_generic_seq
,
192 'OS' => \
&_generic_species
,
193 'CC' => \
&_generic_comment
,
194 'FT' => \
&_generic_seqfeatures
,
195 # handles anything else (WGS, TPA, ANN...)
196 '_DEFAULT_' => \
&_generic_simplevalue
,
200 'DT' => \
&_swiss_date
,
201 'GN' => \
&_swiss_genename
,
202 'DR' => \
&_generic_dbsource
,
203 'RN' => \
&_generic_reference
,
204 'KW' => \
&_generic_keywords
,
205 'DE' => \
&_generic_description
,
206 'AC' => \
&_generic_accession
,
207 'SQ' => \
&_generic_seq
,
208 'OS' => \
&_generic_species
,
209 'CC' => \
&_generic_comment
,
210 'FT' => \
&_generic_seqfeatures
,
211 # handles anything else, though I don't know what...
212 '_DEFAULT_' => \
&_generic_simplevalue
,
216 # can we do this generically? Seems like a lot of trouble...
217 my %DBSOURCE = map {$_ => 1} qw(
218 EchoBASE IntAct SWISS-2DPAGE ECO2DBASE ECOGENE TIGRFAMs
219 TIGR GO InterPro Pfam PROSITE SGD GermOnline
220 HSSP PhosSite Ensembl RGD AGD ArrayExpress KEGG
221 H-InvDB HGNC LinkHub PANTHER PRINTS SMART SMR
222 MGI MIM RZPD-ProtExp ProDom MEROPS TRANSFAC Reactome
223 UniGene GlycoSuiteDB PIRSF HSC-2DPAGE PHCI-2DPAGE
224 PMMA-2DPAGE Siena-2DPAGE Rat-heart-2DPAGE Aarhus/Ghent-2DPAGE
225 Biocyc MetaCyc Biocyc:Metacyc GenomeReviews FlyBase
226 TMHOBP COMPLUYEAST-2DPAGE OGP DictyBase HAMAP
227 PhotoList Gramene WormBase WormPep Genew ZFIN
228 PeroxiBase MaizeDB TAIR DrugBank REBASE HPA
229 swissprot GenBank GenPept REFSEQ embl PDB UniProtKB);
231 my %NOPROCESS = map {$_ => 1} qw(DBSOURCE ORGANISM FEATURES);
233 our %VALID_ALPHABET = (
236 'rc' => '' # rc = release candidate; file has no sequences
245 Args : -format Sequence format to be mapped for handler methods
246 -builder Bio::Seq::SeqBuilder object (normally defined in
247 SequenceStreamI object implementation constructor)
248 Throws : On undefined '-format' sequence format parameter
249 Note : Still under heavy development
254 my ($class, @args) = @_;
255 my $self = $class->SUPER::new
(@args);
258 my ($format, $builder) = $self->_rearrange([qw(FORMAT BUILDER)], @args);
259 $self->throw("Must define sequence record format") if !$format;
260 $self->format($format);
261 $self->handler_methods();
262 $builder && $self->seqbuilder($builder);
263 $self->location_factory();
267 =head1 L<Bio::HandlerBaseI> implementing methods
269 =head2 handler_methods
271 Title : handler_methods
272 Usage : $handler->handler_methods('GenBank')
273 %handlers = $handler->handler_methods();
274 Function: Retrieve the handler methods used for the current format() in
275 the handler. This assumes the handler methods are already
276 described in the HandlerI-implementing class.
277 Returns : a hash reference with the data type handled and the code ref
279 Args : [optional] String representing the sequence format. If set here
280 this will also set sequence_format()
281 Throws : On unimplemented sequence format in %HANDLERS
285 sub handler_methods
{
287 if (!($self->{'handlers'})) {
288 $self->throw("No handlers defined for seqformat ",$self->format)
289 unless exists $HANDLERS{$self->format};
290 $self->{'handlers'} = $HANDLERS{$self->format};
292 return ($self->{'handlers'});
298 Usage : $handler->data_handler($data)
299 Function: Centralized method which accepts all data chunks, then distributes
300 to the appropriate methods for processing based on the chunk name
301 from within the HandlerBaseI object.
305 Args : an hash ref containing a data chunk.
310 my ($self, $data) = @_;
311 my $nm = $data->{NAME
} || $self->throw("No name tag defined!");
313 # this should handle data on the fly w/o caching; any caching should be
314 # done in the driver!
315 my $method = (exists $self->{'handlers'}->{$nm}) ?
($self->{'handlers'}->{$nm}) :
316 (exists $self->{'handlers'}->{'_DEFAULT_'}) ?
($self->{'handlers'}->{'_DEFAULT_'}) :
319 $self->debug("No handler defined for $nm\n");
322 $self->$method($data);
325 =head2 reset_parameters
327 Title : reset_parameters
328 Usage : $handler->reset_parameters()
329 Function: Resets the internal cache of data (normally object parameters for
330 a builder or factory)
336 sub reset_parameters
{
338 $self->{'_params'} = undef;
344 Usage : $handler->format('GenBank')
345 Function: Get/Set the format for the report/record being parsed. This can be
346 used to set handlers in classes which are capable of processing
347 similar data chunks from multiple driver modules.
348 Returns : String with the sequence format
349 Args : [optional] String with the sequence format
350 Note : The format may be used to set the handlers (as in the
351 current GenericRichSeqHandler implementation)
357 return $self->{'_seqformat'} = lc shift if @_;
358 return $self->{'_seqformat'};
364 Usage : $handler->get_params('-species')
365 Function: Convenience method used to retrieve the specified
366 parameters from the internal parameter cache
367 Returns : Hash ref containing parameters requested and data as
368 key-value pairs. Note that some parameter values may be
369 objects, arrays, etc.
370 Args : List (array) representing the parameters requested
375 my ($self, @ids) = @_;
378 if (!index($id, '-')==0) {
381 $data{$id} = $self->{'_params'}->{$id} if (exists $self->{'_params'}->{$id});
389 Usage : $handler->set_param({'-species')
390 Function: Convenience method used to set specific parameters
392 Args : Hash ref containing the data to be passed as key-value pairs
397 shift->throw('Not implemented yet!');
400 =head1 Methods unique to this implementation
416 return $self->{'_seqbuilder'} = shift if @_;
417 return $self->{'_seqbuilder'};
420 =head2 build_sequence
422 Title : build_sequence
434 my $builder = $self->seqbuilder();
436 if (defined($self->{'_params'})) {
437 $builder->add_slot_value(%{ $self->{'_params'} });
438 $seq = $builder->make_object();
439 $self->reset_parameters;
445 =head2 location_factory
447 Title : location_factory
457 sub location_factory
{
458 my ($self, $factory) = @_;
460 $self->throw("Must have a Bio::Factory::LocationFactoryI when ".
461 "explicitly setting factory()") unless
462 (ref($factory) && $factory->isa('Bio::Factory::LocationFactoryI'));
463 $self->{'_locfactory'} = $factory;
464 } elsif (!defined($self->{'_locfactory'})) {
465 $self->{'_locfactory'} = Bio
::Factory
::FTLocationFactory
->new()
467 return $self->{'_locfactory'};
470 =head2 annotation_collection
472 Title : annotation_collection
482 sub annotation_collection
{
483 my ($self, $coll) = @_;
485 $self->throw("Must have Bio::AnnotationCollectionI ".
486 "when explicitly setting collection()")
487 unless (ref($coll) && $coll->isa('Bio::AnnotationCollectionI'));
488 $self->{'_params'}->{'-annotation'} = $coll;
489 } elsif (!exists($self->{'_params'}->{'-annotation'})) {
490 $self->{'_params'}->{'-annotation'} = Bio
::Annotation
::Collection
->new()
492 return $self->{'_params'}->{'-annotation'};
495 ####################### SEQUENCE HANDLERS #######################
499 my ($self, $data) = @_;
500 $self->{'_params'}->{'-seq'} = $data->{DATA
};
503 ####################### RAW DATA HANDLERS #######################
507 my ($self, $data) = @_;
508 my (@tokens) = split m{\s+}, $data->{DATA
};
509 my $display_id = shift @tokens;
510 $self->{'_params'}->{'-display_id'} = $display_id;
511 my $seqlength = shift @tokens;
512 if (exists $VALID_ALPHABET{$seqlength}) {
513 # moved one token too far. No locus name?
514 $self->warn("Bad LOCUS name? Changing [".$self->{'_params'}->{'-display_id'}.
515 "] to 'unknown' and length to ".$self->{'_params'}->{'-display_id'});
516 $self->{'_params'}->{'-length'} = $self->{'_params'}->{'-display_id'};
517 $self->{'_params'}->{'-display_id'} = 'unknown';
519 unshift @tokens, $seqlength;
521 $self->{'_params'}->{'-length'} = $seqlength;
523 my $alphabet = lc(shift @tokens);
524 $self->{'_params'}->{'-alphabet'} =
525 (exists $VALID_ALPHABET{$alphabet}) ?
$VALID_ALPHABET{$alphabet} :
526 $self->warn("Unknown alphabet: $alphabet");
527 if (($self->{'_params'}->{'-alphabet'} eq 'dna') || (@tokens > 2)) {
528 $self->{'_params'}->{'-molecule'} = shift(@tokens);
529 my $circ = shift(@tokens);
530 if ($circ eq 'circular') {
531 $self->{'_params'}->{'-is_circular'} = 1;
532 $self->{'_params'}->{'-division'} = shift(@tokens);
534 # 'linear' or 'circular' may actually be omitted altogether
535 $self->{'_params'}->{'-division'} =
536 (CORE
::length($circ) == 3 ) ?
$circ : shift(@tokens);
539 $self->{'_params'}->{'-molecule'} = 'PRT' if($self->{'_params'}->{'-alphabet'} eq 'aa');
540 $self->{'_params'}->{'-division'} = shift(@tokens);
542 my $date = join(' ', @tokens);
543 # maybe use Date::Time for dates?
544 if($date && $date =~ s{\s*((\d{1,2})-(\w{3})-(\d{2,4})).*}{$1}) {
546 if( length($date) < 11 ) {
547 # improperly formatted date
548 # But we'll be nice and fix it for them
549 my ($d,$m,$y) = ($2,$3,$4);
550 if( length($d) == 1 ) {
553 # guess the century here
554 if( length($y) == 2 ) {
555 if( $y > 60 ) { # arbitrarily guess that '60' means 1960
560 $self->warn("Date was malformed, guessing the century for $date to be $y\n");
562 $self->{'_params'}->{'-dates'} = [join('-',$d,$m,$y)];
564 $self->{'_params'}->{'-dates'} = [$date];
571 my ($self, $data) = @_;
573 my ($name, $sv, $topology, $mol, $div);
574 my $line = $data->{DATA
};
575 #$self->debug("$line\n");
576 my ($idtype) = $line =~ tr/;/;/;
577 if ( $idtype == 6) { # New style headers contain exactly six semicolons.
578 # New style header (EMBL Release >= 87, after June 2006)
582 # ID DQ299383; SV 1; linear; mRNA; STD; MAM; 431 BP.
583 # This regexp comes from the new2old.pl conversion script, from EBI
584 if ($line =~ m/^(\w+);\s+SV (\d+); (\w+); ([^;]+); (\w{3}); (\w{3}); (\d+) \w{2}\./) {
585 ($name, $sv, $topology, $mol, $div) = ($1, $2, $3, $4, $6);
587 $self->throw("Unrecognized EMBL ID line:[$line]");
590 $self->{'_params'}->{'-seq_version'} = $sv;
591 $self->{'_params'}->{'-version'} = $sv;
594 if ($topology eq "circular") {
595 $self->{'_params'}->{'-is_circular'} = 1;
602 elsif ($mol =~ /RNA/) {
605 elsif ($mol =~ /AA/) {
609 } elsif ($idtype) { # has internal ';'
610 # Old style header (EMBL Release < 87, before June 2006)
611 if ($line =~ m{^(\S+)[^;]*;\s+(\S+)[^;]*;\s+(\S+)[^;]*;}) {
612 ($name, $mol, $div) = ($1, $2, $3);
613 #$self->debug("[$name][$mol][$div]");
617 if ( $mol =~ m{circular} ) {
618 $self->{'_params'}->{'-is_circular'} = 1;
619 $mol =~ s{circular }{};
625 elsif ($mol =~ /RNA/) {
628 elsif ($mol =~ /AA/) {
634 $name = $data->{DATA
};
636 unless( defined $name && length($name) ) {
637 $name = "unknown_id";
639 $self->{'_params'}->{'-display_id'} = $name;
640 $self->{'_params'}->{'-alphabet'} = $alphabet;
641 $self->{'_params'}->{'-division'} = $div if $div;
642 $self->{'_params'}->{'-molecule'} = $mol if $mol;
645 # UniProt/SwissProt ID line
647 my ($self, $data) = @_;
648 my ($name, $seq_div);
649 if($data->{DATA
} =~ m
{^
650 (\S
+) \s
+ # $1 entryname
651 ([^\s
;]+); \s
+ # $2 DataClass
652 (?
:PRT
;)? \s
+ # Molecule Type (optional)
653 [0-9]+[ ]AA \
. # Sequencelength (capture?)
656 ($name, $seq_div) = ($1, $2);
657 $self->{'_params'}->{'-namespace'} =
658 ($seq_div eq 'Reviewed' || $seq_div eq 'STANDARD') ?
'Swiss-Prot' :
659 ($seq_div eq 'Unreviewed' || $seq_div eq 'PRELIMINARY') ?
'TrEMBL' :
661 # we shouldn't be setting the division, but for now...
662 my ($junk, $division) = split q
(_
), $name;
663 $self->{'_params'}->{'-division'} = $division;
664 $self->{'_params'}->{'-alphabet'} = 'protein';
665 # this is important to have the id for display in e.g. FTHelper, otherwise
666 # you won't know which entry caused an error
667 $self->{'_params'}->{'-display_id'} = $name;
669 $self->throw("Unrecognized UniProt/SwissProt ID line:[".$data->{DATA
}."]");
673 # UniProt/SwissProt GN line
674 sub _swiss_genename
{
675 my ($self, $data) = @_;
676 #$self->debug(Dumper($data));
677 my $genename = $data->{DATA
};
681 if ($genename =~ /\w=\w/) {
682 # new format (e.g., Name=RCHY1; Synonyms=ZNF363, CHIMP)
683 for my $n (split(m{\s+and\s+},$genename)) {
685 for my $section (split(m{\s*;\s*},$n)) {
686 my ($tag, $rest) = split("=",$section);
688 for my $val (split(m{\s*,\s*},$rest)) {
689 push @genenames, [$tag => $val];
692 push @stags, ['gene_name' => \
@genenames];
696 for my $section (split(/ AND /, $genename)) {
698 $section =~ s/[\(\)\.]//g;
699 my @names = split(m{\s+OR\s+}, $section);
700 push @genenames, ['Name' => shift @names];
701 push @genenames, map {['Synonyms' => $_]} @names;
702 push @stags, ['gene_name' => \
@genenames]
704 } #use Data::Dumper; print Dumper $gn, $genename;# exit;
705 my $gn = Bio
::Annotation
::TagTree
->new(-tagname
=> 'gene_name',
706 -value
=> ['gene_names' => \
@stags]);
707 $self->annotation_collection->add_Annotation('gene_name', $gn);
711 # GenBank VERSION line
712 # old EMBL SV line (now obsolete)
714 sub _generic_version
{
715 my ($self, $data) = @_;
716 my ($acc,$gi) = split(' ',$data->{DATA
});
717 if($acc =~ m{^\w+\.(\d+)}xmso) {
718 $self->{'_params'}->{'-version'} = $1;
719 $self->{'_params'}->{'-seq_version'} = $1;
721 if($gi && (index($gi,"GI:") == 0)) {
722 $self->{'_params'}->{'-primary_id'} = substr($gi,3);
728 my ($self, $data) = @_;
729 while ($data->{DATA
} =~ m{(\S+)\s\((.*?)\)}g) {
730 my ($date, $version) = ($1, $2);
731 $date =~ tr{,}{}d; # remove comma if new version
732 if ($version =~ m{\(Rel\.\s(\d+),\sCreated\)}xmso ) {
733 my $release = Bio
::Annotation
::SimpleValue
->new(
734 -tagname
=> 'creation_release',
737 $self->annotation_collection->add_Annotation($release);
738 } elsif ($version =~ m{\(Rel\.\s(\d+),\sLast\supdated,\sVersion\s(\d+)\)}xmso ) {
739 my $release = Bio
::Annotation
::SimpleValue
->new(
740 -tagname
=> 'update_release',
743 $self->annotation_collection->add_Annotation($release);
744 my $update = Bio
::Annotation
::SimpleValue
->new(
745 -tagname
=> 'update_version',
748 $self->annotation_collection->add_Annotation($update);
750 push @
{ $self->{'_params'}->{'-dates'} }, $date;
754 # UniProt/SwissProt DT lines
756 my ($self, $data) = @_;
758 my @dls = split m{\n}, $data->{DATA
};
760 my ($date, $version) = split(' ', $dl, 2);
761 $date =~ tr{,}{}d; # remove comma if new version
762 if ($version =~ m{\(Rel\. (\d+), Last sequence update\)} || # old
763 $version =~ m{sequence version (\d+)\.}) { #new
764 my $update = Bio
::Annotation
::SimpleValue
->new(
765 -tagname
=> 'seq_update',
768 $self->annotation_collection->add_Annotation($update);
769 } elsif ($version =~ m{\(Rel\. (\d+), Last annotation update\)} || #old
770 $version =~ m{entry version (\d+)\.}) { #new
771 $self->{'_params'}->{'-version'} = $1;
772 $self->{'_params'}->{'-seq_version'} = $1;
774 push @
{ $self->{'_params'}->{'-dates'} }, $date;
778 # GenBank KEYWORDS line
780 # UniProt/SwissProt KW line
781 sub _generic_keywords
{
782 my ($self, $data) = @_;
783 $data->{DATA
} =~ s{\.$}{};
784 my @kw = split m{\s*\;\s*}xo ,$data->{DATA
};
785 $self->{'_params'}->{'-keywords'} = \
@kw;
788 # GenBank DEFINITION line
790 # UniProt/SwissProt DE line
791 sub _generic_description
{
792 my ($self, $data) = @_;
793 $self->{'_params'}->{'-desc'} = $data->{DATA
};
796 # GenBank ACCESSION line
798 # UniProt/SwissProt AC line
799 sub _generic_accession
{
800 my ($self, $data) = @_;
801 my @accs = split m{[\s;]+}, $data->{DATA
};
802 $self->{'_params'}->{'-accession_number'} = shift @accs;
803 $self->{'_params'}->{'-secondary_accessions'} = \
@accs if @accs;
806 ####################### SPECIES HANDLERS #######################
809 # GenBank SOURCE, ORGANISM lines
811 # UniProt/SwissProt O* lines
812 sub _generic_species
{
813 my ($self, $data) = @_;
815 my $seqformat = $self->format;
816 # if data is coming in from GenBank parser...
817 if ($seqformat eq 'genbank' &&
818 $data->{ORGANISM
} =~ m{(.+?)\s(\S+;[^\n\.]+)}ox) {
819 ($data->{ORGANISM
}, $data->{CLASSIFICATION
}) = ($1, $2);
823 # hybrid names in swissprot files are no longer valid per intergration into
824 # UniProt. Files containing these have been split into separate entries, so
825 # it is probably a good idea to update if one has these lingering around...
828 if ($seqformat eq 'swiss') {
829 if ($data->{DATA
} =~ m{^([^,]+)}ox) {
832 if ($data->{CROSSREF
} && $data->{CROSSREF
} =~ m{NCBI_TaxID=(\d+)}) {
837 my ($sl, $class, $sci_name) = ($data->{DATA
},
838 $data->{CLASSIFICATION
},
839 $data->{ORGANISM
} || '');
840 my ($organelle,$abbr_name, $common);
841 my @class = reverse split m{\s*;\s*}, $class;
842 # have to treat swiss different from everything else...
843 if ($sl =~ m
{^(mitochondrion
|chloroplast
|plastid
)?
# GenBank format
845 \s
*(?
: \
( (.*?
) \
) )?\
.?
$
847 ($organelle, $abbr_name, $common) = ($1, $2, $3); # optional
849 $abbr_name = $sl; # nothing caught; this is a backup!
851 # there is no 'abbreviated name' for EMBL
852 $sci_name = $abbr_name if $seqformat ne 'genbank';
856 unshift @class, $sci_name;
857 # no genus/species parsing here; moving to Bio::Taxon-based taxonomy
858 my $make = Bio
::Species
->new();
859 $make->scientific_name($sci_name);
860 $make->classification(@class) if @class > 0;
861 $common && $make->common_name( $common );
862 $abbr_name && $make->name('abbreviated', $abbr_name);
863 $organelle && $make->organelle($organelle);
864 $taxid && $make->ncbi_taxid($taxid);
865 $self->{'_params'}->{'-species'} = $make;
868 ####################### ANNOTATION HANDLERS #######################
870 # GenBank DBSOURCE line
871 sub _genbank_dbsource
{
872 my ($self, $data) = @_;
873 my $dbsource = $data->{DATA
};
874 my $annotation = $self->annotation_collection;
875 # deal with swissprot dbsources
876 # we could possibly parcel these out to subhandlers...
877 if( $dbsource =~ s/(UniProt(?:KB)|swissprot):\s+locus\s+(\S+)\,.+\n// ) {
878 $annotation->add_Annotation
880 Bio
::Annotation
::DBLink
->new
883 -tagname
=> 'dblink'));
884 if( $dbsource =~ s/\s*created:\s+([^\.]+)\.\n// ) {
885 $annotation->add_Annotation
887 Bio
::Annotation
::SimpleValue
->new
888 (-tagname
=> 'date_created',
891 while( $dbsource =~ s/\s*(sequence|annotation)\s+updated:\s+([^\.]+)\.\n//g ) {
892 $annotation->add_Annotation
894 Bio
::Annotation
::SimpleValue
->new
895 (-tagname
=> 'date_updated',
898 $dbsource =~ s/\n/ /g;
899 if( $dbsource =~ s/\s*xrefs:\s+((?:\S+,\s+)+\S+)\s+xrefs/xrefs/ ) {
900 # will use $i to determine even or odd
901 # for swissprot the accessions are paired
903 for my $dbsrc ( split(/,\s+/,$1) ) {
904 if( $dbsrc =~ /(\S+)\.(\d+)/ || $dbsrc =~ /(\S+)/ ) {
905 my ($id,$version) = ($1,$2);
906 $version ='' unless defined $version;
908 if( $id =~ /^\d\S{3}/) {
911 $db = ($i++ % 2 ) ?
'GenPept' : 'GenBank';
913 $annotation->add_Annotation
915 Bio
::Annotation
::DBLink
->new
917 -version
=> $version,
919 -tagname
=> 'dblink'));
922 } elsif( $dbsource =~ s/\s*xrefs:\s+(.+)\s+xrefs/xrefs/i ) {
923 # download screwed up and ncbi didn't put acc in for gi numbers
925 for my $id ( split(/\,\s+/,$1) ) {
927 if( $id =~ /gi:\s+(\d+)/ ) {
929 $db = ($i++ % 2 ) ?
'GenPept' : 'GenBank';
930 } elsif( $id =~ /pdb\s+accession\s+(\S+)/ ) {
937 $annotation->add_Annotation
939 Bio
::Annotation
::DBLink
->new
940 (-primary_id
=> $acc,
942 -tagname
=> 'dblink'));
945 $self->warn("Cannot match $dbsource\n");
947 if( $dbsource =~ s
/xrefs\s
+\
(non\
-sequence\s
+databases\
):\s
+
948 ((?
:\S
+,\s
+)+\S
+)//x ) {
949 for my $id ( split(/\,\s+/,$1) ) {
951 # this is because GenBank dropped the spaces!!!
952 # I'm sure we're not going to get this right
953 ##if( $id =~ s/^://i ) {
956 $db = substr($id,0,index($id,':'));
957 if (! exists $DBSOURCE{ $db }) {
958 $db = ''; # do we want 'GenBank' here?
960 $id = substr($id,index($id,':')+1);
961 $annotation->add_Annotation
962 ('dblink',Bio
::Annotation
::DBLink
->new
965 -tagname
=> 'dblink'));
969 if( $dbsource =~ /^(\S*?):?\s*accession\s+(\S+)\.(\d+)/ ) {
970 my ($db,$id,$version) = ($1,$2,$3);
971 $annotation->add_Annotation
973 Bio
::Annotation
::DBLink
->new
975 -version
=> $version,
976 -database
=> $db || 'GenBank',
977 -tagname
=> 'dblink'));
978 } elsif ( $dbsource =~ /(\S+)([\.:])(\d+)/ ) {
979 my ($id, $db, $version);
981 ($db, $id) = ($1, $3);
983 ($db, $id, $version) = ('GenBank', $1, $3);
985 $annotation->add_Annotation('dblink',
986 Bio
::Annotation
::DBLink
->new(
988 -version
=> $version,
990 -tagname
=> 'dblink')
993 $self->warn("Unrecognized DBSOURCE data: $dbsource\n");
999 # UniProt/SwissProt DR lines
1000 sub _generic_dbsource
{
1001 my ($self, $data) = @_;
1002 #$self->debug(Dumper($data));
1003 while ($data->{DATA
} =~ m{([^\n]+)}og) {
1005 $dblink =~ s{\.$}{};
1007 my @linkdata = split '; ',$dblink;
1008 if ( $dblink =~ m{([^\s;]+);\s*([^\s;]+);?\s*([^\s;]+)?}) {
1009 #if ( $dblink =~ m{([^\s;]+);\s*([^\s;]+);?\s*([^\s;]+)?}) {
1010 my ($databse, $prim_id, $sec_id) = ($1,$2,$3);
1011 $link = Bio
::Annotation
::DBLink
->new(-database
=> $databse,
1012 -primary_id
=> $prim_id,
1013 -optional_id
=> $sec_id);
1015 $self->warn("No match for $dblink");
1017 $self->annotation_collection->add_Annotation('dblink', $link);
1022 # GenBank REFERENCE and related lines
1024 # UniProt/SwissProt R* lines
1025 sub _generic_reference
{
1026 my ($self, $data) = @_;
1027 my $seqformat = $self->format;
1029 # get these in EMBL/Swiss
1030 if ($data->{CROSSREF
}) {
1031 while ($data->{CROSSREF
} =~ m{(pubmed|doi|medline)(?:=|;\s+)(\S+)}oig) {
1032 my ($db, $ref) = (uc $1, $2);
1033 $ref =~ s{[;.]+$}{};
1034 $data->{$db} = $ref;
1037 # run some cleanup for swissprot
1038 if ($seqformat eq 'swiss') {
1039 for my $val (values %{ $data }) {
1041 $val =~ s{(\w-)\s}{$1};
1044 if ( $data->{POSITION
} ) {
1045 if ($seqformat eq 'embl') {
1046 ($start, $end) = split '-', $data->{POSITION
},2;
1047 } elsif ($data->{POSITION
} =~ m{.+? OF (\d+)-(\d+).*}) { #swiss
1048 ($start, $end) = ($1, $2);
1051 if ($data->{DATA
} =~ m{^\d+\s+\([a-z]+\s+(\d+)\s+to\s+(\d+)\)}xmso) {
1052 ($start, $end) = ($1, $2);
1054 my $ref = Bio
::Annotation
::Reference
->new(
1055 -comment
=> $data->{REMARK
},
1056 -location
=> $data->{JOURNAL
},
1057 -pubmed
=> $data->{PUBMED
},
1058 -consortium
=> $data->{CONSRTM
},
1059 -title
=> $data->{TITLE
},
1060 -authors
=> $data->{AUTHORS
},
1061 -medline
=> $data->{MEDLINE
},
1062 -doi
=> $data->{DOI
},
1063 -rp
=> $data->{POSITION
}, # JIC...
1067 if ($data->{DATA
} =~ m{^\d+\s+\((.*)\)}xmso) {
1068 $ref->gb_reference($1);
1070 $self->annotation_collection->add_Annotation('reference', $ref);
1073 # GenBank COMMENT lines
1075 # UniProt/SwissProt CC lines
1076 sub _generic_comment
{
1077 my ($self, $data) = @_;
1078 $self->annotation_collection->add_Annotation('comment',
1079 Bio
::Annotation
::Comment
->new( -text
=> $data->{DATA
} ));
1082 ####################### SEQFEATURE HANDLER #######################
1084 # GenBank Feature Table
1085 sub _generic_seqfeatures
{
1086 my ($self, $data) = @_;
1087 return if $data->{FEATURE_KEY
} eq 'FEATURES';
1088 my $primary_tag = $data->{FEATURE_KEY
};
1090 # grab the NCBI taxon ID from the source SF
1091 if ($primary_tag eq 'source' && exists $data->{'db_xref'}) {
1092 if ( $self->{'_params'}->{'-species'} &&
1093 $data->{'db_xref'} =~ m{taxon:(\d+)}xmso ) {
1094 $self->{'_params'}->{'-species'}->ncbi_taxid($1);
1097 my $source = $self->format;
1099 my $seqid = ${ $self->get_params('accession_number') }{'accession_number'};
1103 $loc = $self->{'_locfactory'}->from_string($data->{'LOCATION'});
1106 $self->warn("exception while parsing location line [" .
1107 $data->{'LOCATION'} .
1108 "] in reading $source, ignoring feature " .
1109 $data->{'primary_tag'}.
1110 " (seqid=" . $seqid . "): " . $@
);
1113 if($seqid && (! $loc->is_remote())) {
1114 $loc->seq_id($seqid); # propagates if it is a split location
1116 my $sf = Bio
::SeqFeature
::Generic
->direct_new();
1117 $sf->location($loc);
1118 $sf->primary_tag($primary_tag);
1119 $sf->seq_id($seqid);
1120 $sf->source_tag($source);
1121 delete $data->{'FEATURE_KEY'};
1122 delete $data->{'LOCATION'};
1123 delete $data->{'NAME'};
1124 delete $data->{'DATA'};
1125 $sf->set_attributes(-tag
=> $data);
1126 push @
{ $self->{'_params'}->{'-features'} }, $sf;
1129 ####################### ODDS AND ENDS #######################
1131 # Those things that don't fit anywhere else. If a specific name
1132 # maps to the below table, that class and method are used, otherwise
1133 # it goes into a SimpleValue (I think this is a good argument for why
1134 # we need a generic mechanism for storing annotation)
1136 sub _generic_simplevalue
{
1137 my ($self, $data) = @_;
1138 $self->annotation_collection->add_Annotation(
1139 Bio
::Annotation
::SimpleValue
->new(-tagname
=> lc($data->{NAME
}),
1140 -value
=> $data->{DATA
})
1147 my ($self, $data) = @_;
1148 $self->debug(Dumper
($data));