2 # BioPerl module for InterPro_BioSQL_Handler
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Juguang Xiao, juguang@tll.org.sg
8 # Copyright Juguang Xiao
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::OntologyIO::Handlers::InterPro_BioSQL_Handler - parse an InterPro XML file and persist the resulting terms to a Biosql database
20 # see load_interpro.pl in bioperl-db/scripts/biosql/
24 This module is for parsing an InterPro XML file and persisting the
25 resulting terms to a Biosql database as soon as the term is complete
26 as signaled by the appropriate xml tag. This parser takes advantage of
27 SAX, a stream-based XML parser technology, to keep the used memory as
28 small as possible. The alternative parser for InterPro, module
29 InterProHandler, builds up the entire ontology in memory, which given
30 the size of the latest InterPro releases requires a huge amount of
33 This module takes the following non-standard arguments upon
36 -db the adaptor factory as returned by a call to
38 -version the InterPro version (not available as property!)
39 -term_factory the object factory to use for creating terms
41 Note that there are two alternatives for how to persist the terms and
42 relationships to the database. The default is using the adaptor
43 factory passed as -db or set as a property to create persistent
44 objects and store them in the database. The alternative is to specify
45 a term persistence and a relationship persistence handler; if one or
46 both have been set, the respective handler will be called with each
47 term and relationship that is to be stored. See properties
48 persist_term_handler and persist_relationship_handler.
52 Juguang Xiao, juguang@tll.org.sg
56 Hilmar Lapp, hlapp at gmx.net
60 The rest of the documentation details each of the object methods.
61 Internal methods are usually preceded with a _
65 package Bio
::OntologyIO
::Handlers
::InterPro_BioSQL_Handler
;
67 use Bio
::Ontology
::Ontology
;
68 use Bio
::Ontology
::Term
;
69 use Bio
::Ontology
::TermFactory
;
70 use Bio
::Ontology
::RelationshipType
;
71 use Bio
::Ontology
::Relationship
;
72 use Bio
::Annotation
::DBLink
;
73 use Bio
::Annotation
::Reference
;
75 use base
qw(Bio::OntologyIO::Handlers::BaseSAXHandler);
82 $self->SUPER::_initialize
(@args);
83 my ($db, $version, $fact) = $self->_rearrange(
84 [qw(DB VERSION TERM_FACTORY)], @args);
85 $self->db($db) if $db; # this is now a property and may be set later
87 $fact = Bio
::Ontology
::TermFactory
->new(-type
=>"Bio::Ontology::Term");
89 $self->term_factory($fact);
90 my $ontology = Bio
::Ontology
::Ontology
->new(-name
=> 'InterPro');
91 if (defined($version)) {
92 $version = "InterPro version $version";
93 $ontology->definition($version);
95 $self->_ontology($ontology);
96 $is_a_rel = Bio
::Ontology
::RelationshipType
->get_instance('IS_A');
97 $is_a_rel->ontology($ontology);
103 Usage : $obj->term_factory($newval)
104 Function: Get/set the ontology term factory to use.
106 As a user of this module it is not necessary to call this
107 method as there will be a default. In order to change the
108 default, the easiest way is to instantiate
109 L<Bio::Ontology::TermFactory> with the proper -type
110 argument. Most if not all parsers will actually use this
111 very implementation, so even easier than the aforementioned
112 way is to simply call
113 $ontio->term_factory->type("Bio::Ontology::MyTerm").
116 Returns : value of term_factory (a Bio::Factory::ObjectFactoryI object)
117 Args : on set, new value (a Bio::Factory::ObjectFactoryI object, optional)
# Combined getter/setter: when an argument is present it is stored under the
# 'term_factory' key and returned; otherwise the stored value is returned.
125 return $self->{'term_factory'} = shift if @_;
126 return $self->{'term_factory'};
132 Usage : $obj->db($newval)
133 Function: Sets or retrieves the database adaptor factory.
135 The adaptor factory is a Bio::DB::DBAdaptorI compliant
136 object and will be used to obtain the persistence adaptors
137 necessary to serialize terms and relationships to the
140 Usually, you will obtain such an object from a call to
141 Bio::DB::BioDB. You *must* set this property before
144 Note that this property is immutable once set, except that
145 you may set it to undef. Therefore, be careful not to set
146 to undef before setting the desired real value.
149 Returns : value of db (a Bio::DB::DBAdaptorI compliant object)
150 Args : on set, new value (a Bio::DB::DBAdaptorI compliant object
# Guard against replacing an already stored adaptor factory with a different
# one. A false $db (e.g. undef) never triggers the exception. The `!=`
# compares the two references numerically (assuming no operator overloading
# on the adaptor class -- TODO confirm).
160 if ($db && exists($self->{_db
}) && ($self->{_db
} != $db)) {
161 $self->throw('db may not be modified once set');
168 =head2 persist_term_handler
170 Title : persist_term_handler
171 Usage : $obj->persist_term_handler($handler,@args)
172 Function: Sets or retrieves the persistence handler for terms along
173 with the constant set of arguments to be passed to the
176 If set, the first argument will be treated as a closure and
177 be called for each term to persist to the database. The
178 term will be passed as a named parameter (-term), followed
179 by the other arguments passed to this setter. Note that
180 this allows one to pass an arbitrary configuration to the
183 If not set, terms will be persisted along with their
184 relationships using the respective persistence adaptor
185 returned by the adaptor factory (see property db).
188 Returns : an array reference with the values passed on set, or an empty
190 Args : On set, an array of values. The first value is the handler
191 as a closure; all other values will be passed to the handler
192 as constant argument.
# Get/set the persistence handler for terms.
#
# Args    : on set, the handler (a code reference) followed by any constant
#           arguments that are to accompany every call of the handler.
# Returns : on set, the array reference that was just stored; otherwise the
#           stored array reference, or a reference to an empty array when
#           nothing has been set.
sub persist_term_handler {
    # Take the invocant off @_ first so that @_ holds only the values to
    # store; without this the object itself would end up in the list.
    my $self = shift;

    return $self->{'persist_term_handler'} = [@_] if @_;
    return $self->{'persist_term_handler'} || [];
}
204 =head2 persist_relationship_handler
206 Title : persist_relationship_handler
207 Usage : $obj->persist_relationship_handler($handler,@args)
208 Function: Sets or retrieves the persistence handler for relationships
209 along with the constant set of arguments to be passed to
212 If set, the first argument will be treated as a closure and
213 be called for each relationship to persist to the database. The
214 relationship will be passed as a named parameter (-rel), followed
215 by the other arguments passed to this setter. Note that
216 this allows one to pass an arbitrary configuration to the
219 If not set, relationships will be persisted using the
220 respective persistence adaptor
221 returned by the adaptor factory (see property db).
224 Returns : an array reference with the values passed on set, or an empty
226 Args : On set, an array of values. The first value is the handler
227 as a closure; all other values will be passed to the handler
228 as constant argument.
# Get/set the persistence handler for relationships.
#
# Args    : on set, the handler (a code reference) followed by any constant
#           arguments that are to accompany every call of the handler.
# Returns : on set, the array reference that was just stored; otherwise the
#           stored array reference, or a reference to an empty array when
#           nothing has been set.
sub persist_relationship_handler {
    # Take the invocant off @_ first so that @_ holds only the values to
    # store; without this the object itself would end up in the list.
    my $self = shift;

    return $self->{'persist_relationship_handler'} = [@_] if @_;
    return $self->{'persist_relationship_handler'} || [];
}
242 Title : _persist_term
244 Function: Persists a term to the database, using either a previously
245 set persistence handler, or the adaptor factory directly.
248 Args : the ontology term to persist
# Body fragment of the term persistence routine (sub header not in this
# excerpt). The first element of the registered handler list is the handler
# itself; the remaining elements are its constant arguments.
257 my ($handler,@args) = @
{$self->persist_term_handler};
# Handler path: the term is handed over as the named parameter -term,
# followed by the constant arguments.
259 &$handler('-term' => $term, @args);
261 # no handler; we'll do this ourselves straight and simple
262 my $db = $self->db();
263 my $pterm = $db->create_persistent($term);
# A failure is reported through warn() together with the error text in $@.
# NOTE(review): the statements that actually store $pterm are not visible
# in this excerpt.
270 $self->warn("failed to store term '".$term->name."': ".$@
);
275 =head2 _persist_relationship
277 Title : _persist_relationship
279 Function: Persists a relationship to the database, using either a
280 previously set persistence handler, or the adaptor factory
285 Args : the term relationship to persist
# Persists one relationship, either through the registered relationship
# handler or through the adaptor factory returned by db().
# NOTE(review): the lines that unpack $self/$rel and that perform the actual
# store are not visible in this excerpt.
290 sub _persist_relationship
{
# First element of the registered list is the handler; the rest are its
# constant arguments.
294 my ($handler,@args) = @
{$self->persist_relationship_handler};
# Handler path: the relationship is handed over as the named parameter -rel.
296 &$handler('-rel' => $rel, @args);
298 # no handler; we'll do this ourselves straight and simple
299 my $db = $self->db();
300 my $prel = $db->create_persistent($rel);
# A failure is reported through warn() with the subject and object term names
# and the error text in $@.
# NOTE(review): the message opens a quote before the object term name but
# never closes it (": " follows the name directly).
307 $self->warn("failed to store relationship of subject '"
308 .$rel->subject_term->name."' to object '"
309 .$rel->object_term->name.": ".$@
);
314 =head2 _persist_ontology
316 Title : _persist_ontology
318 Function: Persists the ontology itself to the database, by either
319 inserting or updating it.
321 Note that this will only create or update the ontology as
322 an entity, not any of its terms, relationships, or
326 Returns : the ontology as a persistent object with primary key
327 Args : the ontology to persist as a Bio::Ontology::OntologyI
# Stores the ontology entity itself and returns a persistent object for it.
# An existing database record is looked up first so that its primary key can
# be reused. Throws when the store fails or returns a false value.
# NOTE(review): the lines that unpack $self/$ont and that declare $ontfact
# and $result are not visible in this excerpt.
333 sub _persist_ontology
{
336 my $db = $self->db();
338 # do a lookup first; chances are we have this already in the database
339 my $adp = $db->get_object_adaptor($ont);
340 # to avoid clobbering this ontology's properties with possibly older ones
341 # from the database we'll need an object factory
# NOTE(review): Bio::Factory::ObjectFactory is not among the use statements
# visible in this file; presumably it is loaded indirectly -- confirm.
343 Bio
::Factory
::ObjectFactory
->new(-type
=>"Bio::Ontology::Ontology");
# The lookup builds its result through $ontfact, so the object passed in is
# not the one populated from the database.
345 my $found = $adp->find_by_unique_key($ont, '-obj_factory' => $ontfact);
346 # make a persistent object of the ontology
347 $ont = $db->create_persistent($ont);
348 # transfer primary key if found in the lookup
349 $ont->primary_key($found->primary_key) if $found;
353 $result = $ont->store();
# Both an exception text in $@ and a false store() result count as failure.
355 if ($@
|| !$result) {
357 $self->throw("failed to update ontology '"
358 .$ont->name."' in database".($@ ?
": $@" : ""));
361 # done - we don't commit here
# Prefer the object returned by store() when it is a reference; otherwise
# fall back to the persistent ontology object itself.
362 return ref($result) ?
$result : $ont;
# Fragment of the document-start routine (sub header not in this excerpt).
# Creates the terms that stand for the InterPro entry types (identifiers
# prefixed with "IPR:"), then persists each one and adds it to the ontology.
# NOTE(review): the declaration of @iprtypes and several -name/-ontology
# arguments are not visible in this excerpt.
367 my $ont = $self->_ontology;
369 $self->create_term(-identifier
=>'IPR:Family',
372 $self->create_term(-identifier
=>'IPR:Domain',
375 $self->create_term(-identifier
=>'IPR:Repeat',
378 $self->create_term(-identifier
=>'IPR:PTM',
379 -name
=>'post-translational modification',
381 $self->create_term(-identifier
=>'IPR:Active_site',
382 -name
=>'Active_site',
384 $self->create_term(-identifier
=>'IPR:Binding_site',
385 -name
=>'Binding_site',
# Each type term is persisted before it is registered with the ontology.
388 foreach my $iprtype (@iprtypes) {
389 $self->_persist_term($iprtype);
390 $ont->add_term($iprtype);
# Fragment of the element-start callback (sub header not in this excerpt).
# The first argument is a hash reference holding the element name under
# 'Name' and its attributes under 'Attributes'; the code dispatches on the
# element name.
396 my $tag=$_[0]->{Name
};
397 my %args=%{$_[0]->{Attributes
}};
398 my $ont = $self->_ontology;
# <interpro>: start a new term plus the relationship that ties it to its
# entry type. NOTE(review): the assignments of $id and $object_term are not
# fully visible in this excerpt.
400 if($tag eq 'interpro'){
402 my $term = $self->create_term(-identifier
=>$id);
403 $term->ontology($ont);
404 $term->add_synonym($args{short_name
});
405 #$term->definition();
# The type term is looked up by "IPR:" followed by the 'type' attribute.
408 ($ont->engine->get_term_by_identifier("IPR:".$args{type
}));
410 my $rel = Bio
::Ontology
::Relationship
->new(
411 -subject_term
=> $term,
412 -predicate_term
=> $is_a_rel,
413 -object_term
=> $object_term,
# Keep the relationship so later callbacks can reach the term through it.
416 $self->_relationship($rel);
# <example>: stash an empty DBLink; it is filled in by a nested <db_xref>.
417 }elsif($tag eq 'example'){
418 my $example = Bio
::Annotation
::DBLink
->new;
419 $self->_current_hash->{example
} = $example;
# <db_xref>: what happens depends on the enclosing element (the top tag).
420 }elsif($tag eq 'db_xref'){
421 my $top = $self->_top_tag;
422 if($top eq 'example'){
423 my $example = $self->_current_hash->{example
};
424 $example->database($args{db
});
425 $example->primary_id($args{dbkey
});
426 #print "EXAmPLE:\t", $example->database, '|', $example->primary_id, "\n";
427 }elsif($top eq 'child'){
429 }elsif($top eq 'member_list'){
430 my $dblink=Bio
::Annotation
::DBLink
->new(
431 -dbname
=> $args{id
},
432 -primary_id
=> $args{dbkey
},
433 -comment
=> $args{name
}
435 }elsif($top eq 'external_doc_list'){
# Inside <publication>: remember the id under 'medline' or 'pubmed'
# according to the 'db' attribute; any other database produces a warning.
437 }elsif($top eq 'publication'){
438 if($args{db
} eq 'MEDLINE'){
439 $self->_current_hash->{medline
} =$args{dbkey
};
440 } elsif($args{db
} eq 'PUBMED'){
441 $self->_current_hash->{pubmed
} =$args{dbkey
};
443 $self->warn("'".$args{dbkey
}."' is not a MEDLINE publication, "
444 ."don't know how to handle");
446 }elsif($top eq 'structure_db_links'){
448 }elsif($top eq 'abstract'){
452 # $self->warn("unrecognized element '$top' in element '$tag', ignoring");
# <publication>: stash a fresh Reference object in the current hash.
454 }elsif($tag eq 'publication'){
455 my $publication = Bio
::Annotation
::Reference
->new();
456 $self->_current_hash->{publication
} = $publication;
457 }elsif($tag eq 'author_list'){
459 }elsif($tag eq 'journal'){
461 }elsif($tag eq 'location'){
463 }elsif($tag eq 'year'){
# <dbinfo> directly inside <release>: append one descriptive line (database
# name, version, entry count, file date) to the ontology definition.
465 } elsif (($tag eq 'dbinfo') && ($self->_top_tag eq 'release')) {
466 my $entrydate = $args{file_date
} || '';
# Strip a " hh:mm:ss" time-of-day portion so that only the date remains.
467 $entrydate =~ s/ \d{2}:\d{2}:\d{2}//;
468 my $def = $ont->definition() || '';
469 $def .= "\n" if length($def) > 0;
470 $def .= $args{dbname
}." version ".$args{version
}.", "
471 .$args{entry_count
}." entries, ".$entrydate;
472 $ont->definition($def);
475 # $self->warn("unrecognized element '$tag', ignoring");
# Bookkeeping for every element: count the visit and push the tag.
478 $self->_visited_count_inc($tag);
479 $self->_push_tag($tag);
# Fragment of the element-end callback (sub header not in this excerpt).
# Reads the element name from the argument's 'Name' entry, fetches the
# character data collected under that name and dispatches on the name.
485 my $tag=shift->{Name
};
486 my $chars_in=$self->_chars_hash->{$tag};
487 if($tag eq 'interpro'){
488 my $rel = $self->_relationship;
489 # store subject term first in order to give the handler a chance to
490 # apply whatever custom behaviour
491 # (note that the object term is the InterPro type and has been stored
492 # at the start of the whole document)
493 $self->_persist_term($rel->subject_term);
494 # then store the relationship to the InterPro type
495 $self->_persist_relationship($rel);
# </name>: the collected text becomes the name of the current term; the
# buffer is emptied afterwards.
496 }elsif($tag eq 'name'){
497 my $rel = $self->_relationship;
498 $rel->subject_term->name($self->_chars_hash->{name
});
499 $self->_chars_hash->{name
}='';
# </abstract>: the collected text becomes the term definition, with every
# newline replaced by a space.
500 }elsif($tag eq 'abstract'){
501 my $rel = $self->_relationship;
502 my $abstract = $self->_chars_hash->{abstract
};
503 $abstract =~ s/\n/ /g;
504 $rel->subject_term->definition($abstract);
505 $self->_chars_hash->{abstract
} = '';
# </example>: the collected text, trimmed of leading and trailing
# whitespace, becomes the DBLink comment; the link is then added to the term.
506 }elsif($tag eq 'example'){
507 my $example = $self->_current_hash->{example
};
508 my $comment = $self->_chars_hash->{example
};
509 $comment =~ s/^(\s+)//; $comment =~ s/(\s+)$//;
510 $example->comment($comment);
511 $self->_relationship->subject_term->add_dbxref(-dbxrefs
=> [$example]);
512 $self->_chars_hash->{example
}='';
# </publication>: finish the reference and add it to the current term.
513 }elsif($tag eq 'publication'){
514 my $publication = $self->_create_publication;
515 $self->_relationship->subject_term->add_reference($publication);
516 }elsif($tag eq 'author_list'){
517 $self->_current_hash->{author
} =$chars_in;
518 }elsif($tag eq 'title'){
519 $self->_current_hash->{title
}=$chars_in;
# </release>: persist the ontology and keep the object that comes back.
520 } elsif ($tag eq 'release') {
521 my $ont = $self->_persist_ontology($self->_ontology);
522 $self->_ontology($ont) if $ont;
# Counterpart of the visit-count increment done when the element started.
525 $self->_visited_count_dec($tag);
# Fragment of the character-data callback (sub header not in this excerpt).
# The text found under the argument's 'Data' entry is appended to the buffer
# kept for the element that is currently on top of the tag stack.
531 my $text = shift->{Data
};
533 my $top_tag =$self->_top_tag;
534 $self->_chars_hash->{$top_tag} .= $text;
536 # $self->_chars_hash->{abstract} .= $text if $self->_visited_count('abstract');
# Forwards all remaining arguments to create_object() on the invocant's term
# factory and returns whatever that call returns.
540 return shift->term_factory->create_object(@_);
# Combined getter/setter for the value kept under the '_ontology' key: an
# argument, when present, is stored and returned; otherwise the stored value
# is returned.
545 return $self->{_ontology
}=shift if @_;
546 return $self->{_ontology
};
# Combined getter/setter for the value kept under the '_relationship' key:
# an argument, when present, is stored first; the stored value is returned
# in either case.
551 $self->{_relationship
}=shift if @_;
552 return $self->{_relationship
};
# Completes the reference object that was stashed under 'publication' in the
# current hash, using the bibliographic details collected there, and then
# resets those entries to undef so the next publication starts clean.
#
# Args    : none
# Returns : the completed reference object (the caller adds it to the
#           current term)
sub _create_publication {
    my $self = shift;

    # Look the hash up once; every value below is read from it and every
    # reset at the end is written to it.
    my $current = $self->_current_hash;

    my $publ          = $current->{publication};
    my $journal       = $current->{journal}       || '<no journal>';
    my $year          = $current->{year}          || '<no year>';
    my $page_location = $current->{page_location} || '<no pages>';
    # NOTE(review): 'volumn' is the key as spelled in this file; it has to
    # match whatever code fills it in, which is not visible here.
    my $volumn        = $current->{volumn}        || '<no volume>';
    my $pubmed        = $current->{pubmed};
    # Without a MEDLINE id, the PubMed id doubles as the MEDLINE id.
    my $medline       = $current->{medline} || $pubmed;

    $publ->authors($current->{author});
    $publ->location("$journal, $year, V $volumn, $page_location");
    $publ->title($current->{title});
    $publ->medline($medline);
    # Record the PubMed id separately only when it differs from the id
    # stored as MEDLINE. The ids are identifiers, so they are compared as
    # strings; a numeric comparison treats any two non-numeric ids as equal.
    if ($pubmed && ($pubmed ne $medline)) {
        $publ->pubmed($pubmed);
    }

    # Clear the above in current hash
    $current->{$_} = undef
        for qw(publication author journal year page_location
               volumn title medline pubmed);

    return $publ;
}