Bio::DB::TFBS namespace has been moved to its own distribution named after itself
[bioperl-live.git] / Bio / OntologyIO / Handlers / InterProHandler.pm
blob53eee32b2c6f5f7587cbfafe91be10c761137134
2 # BioPerl module for InterProHandler
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Peter Dimitrov <dimitrov@gnf.org>
8 # Copyright Peter Dimitrov
9 # (c) Peter Dimitrov, dimitrov@gnf.org, 2003.
10 # (c) GNF, Genomics Institute of the Novartis Research Foundation, 2003.
12 # You may distribute this module under the same terms as perl itself.
13 # Refer to the Perl Artistic License (see the license accompanying this
14 # software package, or see http://www.perl.com/language/misc/Artistic.html)
15 # for the terms under which you may use, modify, and redistribute this module.
17 # THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
18 # WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
19 # MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
21 # POD documentation - main docs before the code
23 =head1 NAME
25 Bio::OntologyIO::Handlers::InterProHandler - XML handler class for InterProParser
27 =head1 SYNOPSIS
29 # do not use directly - used and instantiated by InterProParser
31 =head1 DESCRIPTION
33 Handles xml events generated by InterProParser when parsing InterPro
34 XML files.
36 =head1 FEEDBACK
38 =head2 Mailing Lists
40 User feedback is an integral part of the evolution of this and other
41 Bioperl modules. Send your comments and suggestions preferably to
42 the Bioperl mailing list. Your participation is much appreciated.
44 bioperl-l@bioperl.org - General discussion
45 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
47 =head2 Support
49 Please direct usage questions or support issues to the mailing list:
51 I<bioperl-l@bioperl.org>
53 rather than to the module maintainer directly. Many experienced and
54 reponsive experts will be able look at the problem and quickly
55 address it. Please include a thorough description of the problem
56 with code and data examples if at all possible.
58 =head2 Reporting Bugs
60 Report bugs to the Bioperl bug tracking system to help us keep track
61 of the bugs and their resolution. Bug reports can be submitted via the
62 web:
64 https://github.com/bioperl/bioperl-live/issues
66 =head1 AUTHOR - Peter Dimitrov
68 Email dimitrov@gnf.org
70 =head1 CONTRIBUTORS
72 Juguang Xiao, juguang@tll.org.sg
74 =head1 APPENDIX
76 The rest of the documentation details each of the object methods.
77 Internal methods are usually preceded with a _
79 =cut
81 # Let the code begin...
83 package Bio::OntologyIO::Handlers::InterProHandler;
84 use strict;
85 use Carp;
86 use Bio::Ontology::Ontology;
87 use Bio::Ontology::RelationshipType;
88 use Bio::Ontology::SimpleOntologyEngine;
89 use Bio::Annotation::Reference;
90 use Data::Dumper;
92 use base qw(Bio::Root::Root);
94 my ( $record_count, $processed_count, $is_a_rel, $contains_rel, $found_in_rel );
96 =head2 new
98 Title : new
99 Usage : $h = Bio::OntologyIO::Handlers::InterProHandler->new;
100 Function: Initializes global variables
101 Example :
102 Returns : an InterProHandler object
103 Args :
106 =cut
108 sub new {
109 my ( $class, @args ) = @_;
110 my $self = $class->SUPER::new(@args);
112 my ( $eng, $ont, $name, $fact ) = $self->_rearrange(
113 [qw[
114 ENGINE
115 ONTOLOGY
116 ONTOLOGY_NAME
117 TERM_FACTORY
119 @args
122 if ( defined($ont) ) {
123 $self->ontology($ont);
124 } else {
125 $name = "InterPro" unless $name;
126 $self->ontology( Bio::Ontology::Ontology->new( -name => $name ) );
128 $self->ontology_engine($eng) if $eng;
130 $self->term_factory($fact) if $fact;
132 $is_a_rel = Bio::Ontology::RelationshipType->get_instance("IS_A");
133 $contains_rel = Bio::Ontology::RelationshipType->get_instance("CONTAINS");
134 $found_in_rel = Bio::Ontology::RelationshipType->get_instance("FOUND_IN");
135 $is_a_rel->ontology( $self->ontology() );
136 $contains_rel->ontology( $self->ontology() );
137 $found_in_rel->ontology( $self->ontology() );
138 $self->_cite_skip(0);
139 $self->secondary_accessions_map( {} );
141 return $self;
144 =head2 ontology_engine
146 Title : ontology_engine
147 Usage : $obj->ontology_engine($newval)
148 Function: Get/set ontology engine. Can be initialized only once.
149 Example :
150 Returns : value of ontology_engine (a scalar)
151 Args : new value (a scalar, optional)
154 =cut
156 sub ontology_engine {
157 my ( $self, $value ) = @_;
159 if ( defined $value ) {
160 if ( defined $self->{'ontology_engine'} ) {
161 $self->throw("ontology_engine already defined");
162 } else {
163 $self->throw(
164 ref($value) . " does not implement " . "Bio::Ontology::OntologyEngineI. Bummer." )
165 unless $value->isa("Bio::Ontology::OntologyEngineI");
166 $self->{'ontology_engine'} = $value;
168 # don't forget to set this as the engine of the ontology, otherwise
169 # those two might not point to the same object
170 my $ont = $self->ontology();
171 if ( $ont && $ont->can("engine") && ( !$ont->engine() ) ) {
172 $ont->engine($value);
175 $self->debug(
176 ref($self)
177 . "::ontology_engine: registering ontology engine ("
178 . ref($value) . "):\n"
179 . $value->to_string
180 . "\n" );
184 return $self->{'ontology_engine'};
187 =head2 ontology
189 Title : ontology
190 Usage :
191 Function: Get the ontology to add the InterPro terms to.
193 The value is determined automatically once ontology_engine
194 has been set and if it hasn't been set before.
196 Example :
197 Returns : A L<Bio::Ontology::OntologyI> implementing object.
198 Args : On set, a L<Bio::Ontology::OntologyI> implementing object.
200 =cut
202 sub ontology {
203 my ( $self, $ont ) = @_;
205 if ( defined($ont) ) {
206 $self->throw( ref($ont) . " does not implement Bio::Ontology::OntologyI" . ". Bummer." )
207 unless $ont->isa("Bio::Ontology::OntologyI");
208 $self->{'_ontology'} = $ont;
210 return $self->{'_ontology'};
213 =head2 term_factory
215 Title : term_factory
216 Usage : $obj->term_factory($newval)
217 Function: Get/set the ontology term object factory
218 Example :
219 Returns : value of term_factory (a Bio::Factory::ObjectFactory instance)
220 Args : on set, new value (a Bio::Factory::ObjectFactory instance
221 or undef, optional)
224 =cut
226 sub term_factory {
227 my $self = shift;
229 return $self->{'term_factory'} = shift if @_;
230 return $self->{'term_factory'};
233 =head2 _cite_skip
235 Title : _cite_skip
236 Usage : $obj->_cite_skip($newval)
237 Function:
238 Example :
239 Returns : value of _cite_skip (a scalar)
240 Args : new value (a scalar, optional)
243 =cut
245 sub _cite_skip {
246 my ( $self, $value ) = @_;
248 if ( defined $value ) {
249 $self->{'_cite_skip'} = $value;
252 return $self->{'_cite_skip'};
255 =head2 _hash
257 Title : _hash
258 Usage : $obj->_hash($newval)
259 Function:
260 Example :
261 Returns : value of _hash (a scalar)
262 Args : new value (a scalar, optional)
265 =cut
267 sub _hash {
268 my ( $self, $value ) = @_;
270 if ( defined $value ) {
271 $self->{'_hash'} = $value;
274 return $self->{'_hash'};
277 =head2 _stack
279 Title : _stack
280 Usage : $obj->_stack($newval)
281 Function:
282 Example :
283 Returns : value of _stack (a scalar)
284 Args : new value (a scalar, optional)
287 =cut
289 sub _stack {
290 my ( $self, $value ) = @_;
292 if ( defined $value ) {
293 $self->{'_stack'} = $value;
295 return $self->{'_stack'};
298 =head2 _top
300 Title : _top
301 Usage :
302 Function:
303 Example :
304 Returns :
305 Args :
308 =cut
310 sub _top {
311 my ( $self, $_stack ) = @_;
312 my @stack = @{$_stack};
314 return ( @stack >= 1 ) ? $stack[ @stack - 1 ] : undef;
317 =head2 _term
319 Title : _term
320 Usage : $obj->_term($newval)
321 Function: Get/set method for the term currently processed.
322 Example :
323 Returns : value of term (a scalar)
324 Args : new value (a scalar, optional)
327 =cut
329 sub _term {
330 my ( $self, $value ) = @_;
332 if ( defined $value ) {
333 $self->{'_term'} = $value;
336 return $self->{'_term'};
339 =head2 _clear_term
341 Title : _clear_term
342 Usage :
343 Function: Removes the current term from the handler
344 Example :
345 Returns :
346 Args :
349 =cut
351 sub _clear_term {
352 my ($self) = @_;
354 delete $self->{'_term'};
357 =head2 _names
359 Title : _names
360 Usage : $obj->_names($newval)
361 Function:
362 Example :
363 Returns : value of _names (a scalar)
364 Args : new value (a scalar, optional)
367 =cut
369 sub _names {
370 my ( $self, $value ) = @_;
372 if ( defined $value ) {
373 $self->{'_names'} = $value;
376 return $self->{'_names'};
379 =head2 _create_relationship
381 Title : _create_relationship
382 Usage :
383 Function: Helper function. Adds relationships to one of the relationship stores.
384 Example :
385 Returns :
386 Args :
388 =cut
392 my %relationship_cache;
394 sub _clear_cache { %relationship_cache = () }
396 sub _create_relationship {
397 my ( $self, $ref_id, $rel_type_term ) = @_;
399 my $ont = $self->ontology();
400 my $fact = $self->term_factory();
401 my $term_temp = ( $ont->engine->get_term_by_identifier($ref_id) )[0];
403 if ( !defined $term_temp ) {
404 $term_temp =
405 $ont->engine->add_term(
406 $fact->create_object( -InterPro_id => $ref_id, -name => $ref_id, -ontology => $ont ) );
407 $ont->engine->mark_uninstantiated($term_temp);
409 my $marshalled = join(':', (sort $self->_term->identifier, $ref_id));
411 # check cache to see if the two have been seen before, using marshalled IDs
412 if ($relationship_cache{$marshalled}++) {
413 # TODO: should check that the relationship type for these terms is the
414 # inverse of the stored relationship type
415 return;
418 my $rel_type_name = $self->_top( $self->_names );
420 my $rel = Bio::Ontology::Relationship->new( -predicate_term => $rel_type_term );
422 if ( $rel_type_name eq 'parent_list' || $rel_type_name eq 'found_in' ) {
423 $rel->object_term($term_temp);
424 $rel->subject_term( $self->_term );
425 } else {
426 $rel->object_term( $self->_term );
427 $rel->subject_term($term_temp);
429 $rel->ontology($ont);
430 $ont->add_relationship($rel);
435 =head2 start_element
437 Title : start_element
438 Usage :
439 Function: This is a method that is derived from XML::SAX::Base and
440 has to be overridden for processing start of xml element
441 events. Used internally only.
443 Example :
444 Returns :
445 Args :
448 =cut
450 sub start_element {
451 my ( $self, $element ) = @_;
452 my $ont = $self->ontology();
453 my $fact = $self->term_factory();
455 if ( $element->{Name} eq 'interprodb' ) {
456 $ont->add_term(
457 $fact->create_object(
458 -identifier => "Active_site",
459 -name => "Active Site"
462 $ont->add_term(
463 $fact->create_object(
464 -identifier => "Conserved_site",
465 -name => "Conserved Site"
468 $ont->add_term(
469 $fact->create_object(
470 -identifier => "Binding_site",
471 -name => "Binding Site"
474 $ont->add_term(
475 $fact->create_object(
476 -identifier => "Family",
477 -name => "Family"
480 $ont->add_term(
481 $fact->create_object(
482 -identifier => "Domain",
483 -name => "Domain"
486 $ont->add_term(
487 $fact->create_object(
488 -identifier => "Repeat",
489 -name => "Repeat"
492 $ont->add_term(
493 $fact->create_object(
494 -identifier => "PTM",
495 -name => "post-translational modification"
498 $ont->add_term(
499 $fact->create_object(
500 -identifier => "Region",
501 -name => "Region"
504 } elsif ( $element->{Name} eq 'interpro' ) {
505 my %record_args = %{ $element->{Attributes} };
506 my $id = $record_args{"id"};
508 # this sets the current term
509 my $term = ( $ont->engine->get_term_by_identifier($id) )[0] ||
510 $fact->create_object( -InterPro_id => $id, -name => $id );
511 $self->_term($term);
513 $term->ontology($ont);
514 $term->short_name( $record_args{"short_name"} );
515 $term->protein_count( $record_args{"protein_count"} );
516 $self->_increment_record_count();
517 $self->_stack( [ { interpro => undef } ] );
518 $self->_names( ["interpro"] );
520 ## Adding a relationship between the newly created InterPro term
521 ## and the term describing its type
523 my $rel = Bio::Ontology::Relationship->new( -predicate_term => $is_a_rel );
524 my ($object_term) = $ont->find_terms( -identifier => $record_args{"type"} )
525 or $self->throw(
526 "when processing interpro ID '$id', no term found for interpro type '$record_args{type}'"
528 $rel->object_term($object_term);
529 $rel->subject_term( $self->_term );
530 $rel->ontology($ont);
531 $ont->add_relationship($rel);
532 $ont->add_term($term);
533 } elsif ( defined $self->_stack ) {
534 my %hash = ();
536 if ( keys %{ $element->{Attributes} } > 0 ) {
537 foreach my $key ( keys %{ $element->{Attributes} } ) {
538 $hash{$key} = $element->{Attributes}->{$key};
541 push @{ $self->_stack }, \%hash;
542 if ( $element->{Name} eq 'rel_ref' ) {
543 my $ref_id = $element->{Attributes}->{"ipr_ref"};
544 my $parent = $self->_top( $self->_names );
546 if ( $parent eq 'parent_list' || $parent eq 'child_list' ) {
547 $self->_create_relationship( $ref_id, $is_a_rel );
549 if ( $parent eq 'contains' ) {
550 $self->_create_relationship( $ref_id, $contains_rel );
552 if ( $parent eq 'found_in' ) {
553 $self->_create_relationship( $ref_id, $found_in_rel );
555 } elsif ( $element->{Name} eq 'abstract' ) {
556 $self->_cite_skip(1);
558 push @{ $self->_names }, $element->{Name};
563 =head2 _char_storage
565 Title : _char_storage
566 Usage : $obj->_char_storage($newval)
567 Function:
568 Example :
569 Returns : value of _char_storage (a scalar)
570 Args : new value (a scalar, optional)
573 =cut
575 sub _char_storage {
576 my ( $self, $value ) = @_;
578 if ( defined $value ) {
579 $self->{'_char_storage'} = $value;
582 return $self->{'_char_storage'};
585 =head2 characters
587 Title : characters
588 Usage :
589 Function: This is a method that is derived from XML::SAX::Base and has to be overridden for processing xml characters events. Used internally only.
590 Example :
591 Returns :
592 Args :
595 =cut
597 sub characters {
598 my ( $self, $characters ) = @_;
599 my $text = $characters->{Data};
601 chomp $text;
602 $text =~ s/^(\s+)//;
603 $self->{_char_storage} .= $text;
607 =head2 end_element
609 Title : end_element
610 Usage :
611 Function: This is a method that is derived from XML::SAX::Base and has to be overridden for processing end of xml element events. Used internally only.
612 Example :
613 Returns :
614 Args :
617 =cut
619 sub end_element {
620 my ( $self, $element ) = @_;
622 if ( $element->{Name} eq 'interprodb' ) {
623 $self->debug(
624 "Interpro DB Parser Finished: $record_count read, $processed_count processed\n");
625 $self->_clear_cache();
626 } elsif ( $element->{Name} eq 'interpro' ) {
627 $self->_clear_term;
628 $self->_increment_processed_count();
629 } elsif ( $element->{Name} ne 'cite' ) {
630 $self->{_char_storage} =~ s/<\/?p>//g;
631 if ( ( defined $self->_stack ) ) {
632 my $current_hash = pop @{ $self->_stack };
633 my $parent_hash = $self->_top( $self->_stack );
634 my $current_hash_key = pop @{ $self->_names };
636 if ( keys %{$current_hash} > 0 && $self->_char_storage ne "" ) {
637 $current_hash->{comment} = $self->_char_storage;
638 push @{ $parent_hash->{$current_hash_key} }, $current_hash;
639 } elsif ( $self->_char_storage ne "" ) {
640 push @{ $parent_hash->{$current_hash_key} },
641 { 'accumulated_text_12345' => $self->_char_storage };
642 } elsif ( keys %{$current_hash} > 0 ) {
643 push @{ $parent_hash->{$current_hash_key} }, $current_hash;
645 if ( $element->{Name} eq 'pub_list' ) {
646 my @refs = ();
648 foreach my $pub_record ( @{ $current_hash->{publication} } ) {
649 my $ref = Bio::Annotation::Reference->new;
650 my $loc = $pub_record->{location}->[0];
651 # TODO: Getting unset stuff here; should this be an error?
652 $ref->location(
653 sprintf("%s, %s-%s, %s, %s",
654 $pub_record->{journal}->[0]->{accumulated_text_12345} || '',
655 $loc->{firstpage} || '',
656 $loc->{lastpage} || '',
657 $loc->{volume} || '',
658 $pub_record->{year}->[0]->{accumulated_text_12345} || '')
660 $ref->title( $pub_record->{title}->[0]->{accumulated_text_12345} );
661 my $ttt = $pub_record->{author_list}->[0];
663 $ref->authors( $ttt->{accumulated_text_12345} );
664 $ref->medline( scalar( $ttt->{dbkey} ) )
665 if exists( $ttt->{db} ) && $ttt->{db} eq "MEDLINE";
666 push @refs, $ref;
668 $self->_term->add_reference(@refs);
669 } elsif ( $element->{Name} eq 'name' ) {
670 $self->_term->name( $self->_char_storage );
671 } elsif ( $element->{Name} eq 'abstract' ) {
672 $self->_term->definition( $self->_char_storage );
673 $self->_cite_skip(0);
674 } elsif ( $element->{Name} eq 'member_list' ) {
675 my @refs = ();
677 foreach my $db_xref ( @{ $current_hash->{db_xref} } ) {
678 push @refs,
679 Bio::Annotation::DBLink->new(
680 -database => $db_xref->{db},
681 -primary_id => $db_xref->{dbkey}
684 $self->_term->add_dbxref(-dbxrefs => \@refs,
685 -context => 'member_list');
686 } elsif ( $element->{Name} eq 'sec_list' ) {
687 my @refs = ();
689 foreach my $sec_ac ( @{ $current_hash->{sec_ac} } ) {
690 push @refs, $sec_ac->{sec_ac};
692 $self->_term->add_secondary_id(@refs);
693 $self->secondary_accessions_map->{ $self->_term->identifier } = \@refs;
694 } elsif ( $element->{Name} eq 'example_list' ) {
695 my @refs = ();
697 foreach my $example ( @{ $current_hash->{examples} } ) {
698 push @refs,
699 Bio::Annotation::DBLink->new(
700 -database => $example->{db_xref}->[0]->{db},
701 -primary_id => $example->{db_xref}->[0]->{dbkey},
702 -comment => $example->{comment}
705 $self->_term->add_dbxref(-dbxrefs => \@refs,
706 -context => 'example_list');
707 } elsif ( $element->{Name} eq 'external_doc_list' ) {
708 my @refs = ();
710 foreach my $db_xref ( @{ $current_hash->{db_xref} } ) {
711 push @refs,
712 Bio::Annotation::DBLink->new(
713 -database => $db_xref->{db},
714 -primary_id => $db_xref->{dbkey}
717 $self->_term->add_dbxref(-dbxrefs => \@refs,
718 -context => 'external_doc_list');
719 } elsif ( $element->{Name} eq 'class_list' ) {
720 my @refs = ();
722 foreach my $classification ( @{ $current_hash->{classification} } ) {
723 push @refs,
724 Bio::Annotation::DBLink->new(
725 -database => $classification->{class_type},
726 -primary_id => $classification->{id}
729 $self->_term->add_dbxref(-dbxrefs => \@refs,
730 -context => 'class_list');
731 } elsif ( $element->{Name} eq 'deleted_entries' ) {
732 my @refs = ();
734 foreach my $del_ref ( @{ $current_hash->{del_ref} } ) {
735 my $term =
736 ( $self->ontology_engine->get_term_by_identifier( $del_ref->{id} ) )[0];
738 $term->is_obsolete(1) if defined $term;
742 $self->_char_storage('') if !$self->_cite_skip;
746 =head2 secondary_accessions_map
748 Title : secondary_accessions_map
749 Usage : $obj->secondary_accessions_map($newval)
750 Function:
751 Example : $map = $interpro_handler->secondary_accessions_map();
752 Returns : Reference to a hash that maps InterPro identifier to an
753 array reference of secondary accessions following the InterPro
754 xml schema.
755 Args : Empty hash reference
758 =cut
760 sub secondary_accessions_map {
761 my ( $self, $value ) = @_;
763 if ( defined $value ) {
764 $self->{'secondary_accessions_map'} = $value;
767 return $self->{'secondary_accessions_map'};
770 =head2 _increment_record_count
772 Title : _increment_record_count
773 Usage :
774 Function:
775 Example :
776 Returns :
777 Args :
780 =cut
782 sub _increment_record_count {
783 $record_count++;
786 =head2 _increment_processed_count
788 Title : _increment_processed_count
789 Usage :
790 Function:
791 Example :
792 Returns :
793 Args :
796 =cut
798 sub _increment_processed_count {
799 my $self = shift;
800 $processed_count++;
801 $self->debug("$processed_count\n") if $processed_count % 100 == 0;