Bio::DB::TFBS namespace has been moved to its own distribution named after itself
[bioperl-live.git] / Bio / OntologyIO / Handlers / InterPro_BioSQL_Handler.pm
blob8908ba8787b757b2eb30d6d89333243f1852e6aa
2 # BioPerl module for InterPro_BioSQL_Handler
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Juguang Xiao, juguang@tll.org.sg
8 # Copyright Juguang Xiao
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::OntologyIO::Handlers::InterPro_BioSQL_Handler - parse an InterPro XML file and persist the resulting terms to a Biosql database
18 =head1 SYNOPSIS
20 # see load_interpro.pl in bioperl-db/scripts/biosql/
22 =head1 DESCRIPTION
24 This module is for parsing an InterPro XML file and persisting the
25 resulting terms to a Biosql database as soon as the term is complete
26 as signaled by the appropriate xml tag. This parser takes advantage of
27 SAX, a stream-based XML parser technology, to keep the used memory as
28 small as possible. The alternative parser for InterPro, module
29 InterProHandler, builds up the entire ontology in memory, which given
30 the size of the latest InterPro releases requires a huge amount of
31 memory.
33 This module takes the following non-standard arguments upon
34 instantiation.
36 -db the adaptor factory as returned by a call to
37 Bio::DB::BioDB->new()
38 -version the InterPro version (not available as property!)
39 -term_factory the object factory to use for creating terms
41 Note that there are two alternatives for how to persist the terms and
42 relationships to the database. The default is using the adaptor
43 factory passed as -db or set as a property to create persistent
44 objects and store them in the database. The alternative is to specify
45 a term persistence and a relationship persistence handler; if one or
46 both have been set, the respective handler will be called with each
47 term and relationship that is to be stored. See properties
48 persist_term_handler and persist_relationship_handler.
50 =head1 AUTHOR
52 Juguang Xiao, juguang@tll.org.sg
54 =head1 Contributors
56 Hilmar Lapp, hlapp at gmx.net
58 =head1 APPENDIX
60 The rest of the documentation details each of the object methods.
61 Internal methods are usually preceded with a _
63 =cut
65 package Bio::OntologyIO::Handlers::InterPro_BioSQL_Handler;
66 use strict;
67 use Bio::Ontology::Ontology;
68 use Bio::Ontology::Term;
69 use Bio::Ontology::TermFactory;
70 use Bio::Ontology::RelationshipType;
71 use Bio::Ontology::Relationship;
72 use Bio::Annotation::DBLink;
73 use Bio::Annotation::Reference;
75 use base qw(Bio::OntologyIO::Handlers::BaseSAXHandler);
# File-scoped IS_A relationship type. It is (re)assigned by _initialize and
# read by start_element when an InterPro entry is linked to its type term.
my $is_a_rel;
# File-scoped counter, initialised to zero; not referenced by the code
# visible in this part of the file.
my $count = 0;

# _initialize
#
# Instance set-up hook. Delegates to the parent class first, then
# processes the handler-specific named arguments:
#   -db            adaptor factory; stored through db() only when true
#   -version       InterPro version; when defined, it is recorded as the
#                  definition of the ontology created here
#   -term_factory  factory used to create terms; a
#                  Bio::Ontology::TermFactory producing Bio::Ontology::Term
#                  objects is created when none is given
# It also creates the 'InterPro' ontology object and binds the shared
# IS_A relationship type to it.
sub _initialize {
    my $self   = shift;
    my @params = @_;

    $self->SUPER::_initialize(@params);

    my ($adaptor_factory, $release, $factory) =
        $self->_rearrange([qw(DB VERSION TERM_FACTORY)], @params);

    # db is a property that may also be set later, so only a true value is
    # stored at construction time
    $self->db($adaptor_factory) if $adaptor_factory;

    $factory = Bio::Ontology::TermFactory->new(-type=>"Bio::Ontology::Term")
        unless $factory;
    $self->term_factory($factory);

    my $interpro = Bio::Ontology::Ontology->new(-name => 'InterPro');
    $interpro->definition("InterPro version $release") if defined($release);
    $self->_ontology($interpro);

    $is_a_rel = Bio::Ontology::RelationshipType->get_instance('IS_A');
    $is_a_rel->ontology($interpro);
}
100 =head2 term_factory
102 Title : term_factory
103 Usage : $obj->term_factory($newval)
104 Function: Get/set the ontology term factory to use.
106 As a user of this module it is not necessary to call this
107 method as there will be a default. In order to change the
108 default, the easiest way is to instantiate
109 L<Bio::Ontology::TermFactory> with the proper -type
110 argument. Most if not all parsers will actually use this
111 very implementation, so even easier than the aforementioned
112 way is to simply call
113 $ontio->term_factory->type("Bio::Ontology::MyTerm").
115 Example :
116 Returns : value of term_factory (a Bio::Factory::ObjectFactoryI object)
117 Args : on set, new value (a Bio::Factory::ObjectFactoryI object, optional)
120 =cut
# term_factory
#
# Combined getter/setter for the object factory used to create terms.
# When called with an argument (even undef) that argument is stored under
# the 'term_factory' key; the stored value is returned in either case.
sub term_factory {
    my ($self, @value) = @_;

    $self->{'term_factory'} = $value[0] if @value;
    return $self->{'term_factory'};
}
129 =head2 db
131 Title : db
132 Usage : $obj->db($newval)
133 Function: Sets or retrieves the database adaptor factory.
135 The adaptor factory is a Bio::DB::DBAdaptorI compliant
136 object and will be used to obtain the persistence adaptors
137 necessary to serialize terms and relationships to the
138 database.
140 Usually, you will obtain such an object from a call to
141 Bio::DB::BioDB. You *must* set this property before
142 starting the parse.
144 Note that this property is immutable once set, except that
145 you may set it to undef. Therefore, be careful not to set
146 to undef before setting the desired real value.
148 Example :
149 Returns : value of db (a Bio::DB::DBAdaptorI compliant object)
150 Args : on set, new value (a Bio::DB::DBAdaptorI compliant object
151 or undef, optional)
154 =cut
# db
#
# Combined getter/setter for the database adaptor factory, kept under the
# '_db' key. On set, a true new value is rejected through throw() when the
# '_db' key already exists and its current value is numerically different
# from the new one (for references this compares addresses). A false new
# value, such as undef, is always stored. Returns the stored value.
sub db {
    my ($self, @value) = @_;
    return $self->{_db} unless @value;

    my $new_db = $value[0];
    my $is_locked =
        $new_db && exists($self->{_db}) && ($self->{_db} != $new_db);
    $self->throw('db may not be modified once set') if $is_locked;

    $self->{_db} = $new_db;
    return $self->{_db};
}
168 =head2 persist_term_handler
170 Title : persist_term_handler
171 Usage : $obj->persist_term_handler($handler,@args)
172 Function: Sets or retrieves the persistence handler for terms along
173 with the constant set of arguments to be passed to the
174 handler.
176 If set, the first argument will be treated as a closure and
177 be called for each term to persist to the database. The
178 term will be passed as a named parameter (-term), followed
179 by the other arguments passed to this setter. Note that
180 this allows one to pass an arbitrary configuration to the
181 handler.
183 If not set, terms will be persisted along with their
184 relationships using the respective persistence adaptor
185 returned by the adaptor factory (see property db).
187 Example :
188 Returns : an array reference with the values passed on set, or a
189 reference to an empty array if never set
190 Args : On set, an array of values. The first value is the handler
191 as a closure; all other values will be passed to the handler
192 as constant argument.
195 =cut
# persist_term_handler
#
# Combined getter/setter for the term persistence handler specification.
# On set, all arguments are copied into a fresh array that is stored under
# the 'persist_term_handler' key and returned as a reference. On get, the
# stored reference is returned, or a reference to a new empty array when
# nothing true is stored.
sub persist_term_handler {
    my ($self, @spec) = @_;

    if (@spec) {
        $self->{'persist_term_handler'} = \@spec;
        return $self->{'persist_term_handler'};
    }
    return $self->{'persist_term_handler'} || [];
}
204 =head2 persist_relationship_handler
206 Title : persist_relationship_handler
207 Usage : $obj->persist_relationship_handler($handler,@args)
208 Function: Sets or retrieves the persistence handler for relationships
209 along with the constant set of arguments to be passed to
210 the handler.
212 If set, the first argument will be treated as a closure and
213 be called for each relationship to persist to the database. The
214 relationship will be passed as a named parameter (-rel), followed
215 by the other arguments passed to this setter. Note that
216 this allows one to pass an arbitrary configuration to the
217 handler.
219 If not set, relationships will be persisted directly
220 using the respective persistence adaptor
221 returned by the adaptor factory (see property db).
223 Example :
224 Returns : an array reference with the values passed on set, or a
225 reference to an empty array if never set
226 Args : On set, an array of values. The first value is the handler
227 as a closure; all other values will be passed to the handler
228 as constant argument.
231 =cut
# persist_relationship_handler
#
# Combined getter/setter for the relationship persistence handler
# specification. With arguments, a new array holding copies of them is
# stored under the 'persist_relationship_handler' key and its reference is
# returned. Without arguments, the stored reference is returned, falling
# back to a reference to a new empty array when nothing true is stored.
sub persist_relationship_handler {
    my ($self, @spec) = @_;

    return $self->{'persist_relationship_handler'} || []
        unless @spec;

    $self->{'persist_relationship_handler'} = [@spec];
    return $self->{'persist_relationship_handler'};
}
240 =head2 _persist_term
242 Title : _persist_term
243 Usage :
244 Function: Persists a term to the database, using either a previously
245 set persistence handler, or the adaptor factory directly.
246 Example :
247 Returns :
248 Args : the ontology term to persist
251 =cut
# _persist_term
#
# Persists one ontology term.
#
# Args    : the term to persist
# Returns : nothing meaningful; callers in this module ignore the value
#
# If a term persistence handler has been set (see persist_term_handler),
# it is called with '-term' => $term followed by the constant arguments
# stored with it. Otherwise the term is wrapped by the adaptor factory's
# create_persistent(), then created and committed. A failure there is
# rolled back and reported through warn() instead of being propagated.
sub _persist_term {
    my ($self, $term) = @_;

    my ($handler, @args) = @{ $self->persist_term_handler };
    if ($handler) {
        $handler->('-term' => $term, @args);
    } else {
        # no handler; we'll do this ourselves straight and simple
        my $db    = $self->db();
        my $pterm = $db->create_persistent($term);
        eval {
            $pterm->create();
            $pterm->commit();
        };
        # Copy $@ before calling rollback(): any eval executed inside the
        # rollback would reset $@ and the reason for the failure would be
        # missing from the warning.
        if (my $error = $@) {
            $pterm->rollback();
            $self->warn("failed to store term '" . $term->name . "': " . $error);
        }
    }
}
275 =head2 _persist_relationship
277 Title : _persist_relationship
278 Usage :
279 Function: Persists a relationship to the database, using either a
280 previously set persistence handler, or the adaptor factory
281 directly.
283 Example :
284 Returns :
285 Args : the term relationship to persist
288 =cut
# _persist_relationship
#
# Persists one term relationship.
#
# Args    : the relationship to persist
# Returns : nothing meaningful; callers in this module ignore the value
#
# If a relationship persistence handler has been set (see
# persist_relationship_handler), it is called with '-rel' => $rel followed
# by the constant arguments stored with it. Otherwise the relationship is
# wrapped by the adaptor factory's create_persistent(), then created and
# committed. A failure there is rolled back and reported through warn()
# instead of being propagated.
sub _persist_relationship {
    my ($self, $rel) = @_;

    my ($handler, @args) = @{ $self->persist_relationship_handler };
    if ($handler) {
        $handler->('-rel' => $rel, @args);
    } else {
        # no handler; we'll do this ourselves straight and simple
        my $db   = $self->db();
        my $prel = $db->create_persistent($rel);
        eval {
            $prel->create();
            $prel->commit();
        };
        # Copy $@ before calling rollback(): any eval executed inside the
        # rollback would reset $@ and the reason for the failure would be
        # missing from the warning.
        if (my $error = $@) {
            $prel->rollback();
            # the object name is now closed with a quote, matching the
            # quoting of the subject name
            $self->warn("failed to store relationship of subject '"
                        . $rel->subject_term->name . "' to object '"
                        . $rel->object_term->name . "': " . $error);
        }
    }
}
314 =head2 _persist_ontology
316 Title : _persist_ontology
317 Usage :
318 Function: Persists the ontology itself to the database, by either
319 inserting or updating it.
321 Note that this will only create or update the ontology as
322 an entity, not any of its terms, relationships, or
323 relationship types.
325 Example :
326 Returns : the ontology as a persistent object with primary key
327 Args : the ontology to persist as a Bio::Ontology::OntologyI
328 compliant object
331 =cut
# _persist_ontology
#
# Inserts or updates the ontology entity itself (not its terms or
# relationships) through the adaptor factory returned by db().
#
# Args    : the ontology object to persist
# Returns : the return value of store() when that is a reference,
#           otherwise the persistent ontology object
# Throws  : through throw(), after rolling back on the adaptor, when
#           store() dies or returns a false value
#
# No commit is issued here.
sub _persist_ontology {
    my ($self, $ont) = @_;
    my $db = $self->db();

    # do a lookup first; chances are we have this already in the database
    my $adp = $db->get_object_adaptor($ont);
    # to avoid clobbering this ontology's properties with possibly older ones
    # from the database we'll need an object factory
    my $ontfact =
        Bio::Factory::ObjectFactory->new(-type=>"Bio::Ontology::Ontology");
    # do the lookup:
    my $found = $adp->find_by_unique_key($ont, '-obj_factory' => $ontfact);
    # make a persistent object of the ontology
    $ont = $db->create_persistent($ont);
    # transfer primary key if found in the lookup
    $ont->primary_key($found->primary_key) if $found;

    # insert or update
    my $result;
    eval {
        $result = $ont->store();
    };
    # Copy $@ right away: the rollback below may run an eval of its own,
    # which would reset $@ and drop the cause from the thrown message.
    my $error = $@;
    if ($error || !$result) {
        $adp->rollback();
        $self->throw("failed to update ontology '"
                     . $ont->name . "' in database"
                     . ($error ? ": $error" : ""));
    }

    # done - we don't commit here
    return ref($result) ? $result : $ont;
}
# start_document
#
# SAX callback run at the start of the document. Creates the six InterPro
# entry type terms (Family, Domain, Repeat, PTM, Active_site,
# Binding_site) in the handler's ontology. All six are created first;
# each is then persisted through _persist_term and added to the ontology,
# in the order listed.
sub start_document {
    my $self = shift;
    my $ont  = $self->_ontology;

    # [ identifier, name ] for every InterPro entry type
    my @type_specs = (
        [ 'IPR:Family',       'Family' ],
        [ 'IPR:Domain',       'Domain' ],
        [ 'IPR:Repeat',       'Repeat' ],
        [ 'IPR:PTM',          'post-translational modification' ],
        [ 'IPR:Active_site',  'Active_site' ],
        [ 'IPR:Binding_site', 'Binding_site' ],
    );

    my @iprtypes;
    for my $spec (@type_specs) {
        my ($identifier, $name) = @$spec;
        push @iprtypes, $self->create_term(-identifier => $identifier,
                                           -name       => $name,
                                           -ontology   => $ont);
    }

    for my $type_term (@iprtypes) {
        $self->_persist_term($type_term);
        $ont->add_term($type_term);
    }
}
# start_element
#
# SAX callback run for every opening tag. The argument is a hash reference
# whose 'Name' and 'Attributes' entries are read here.
#
#   interpro    - creates the term for the entry plus its IS_A relationship
#                 to the type term looked up as "IPR:<type>", and stores the
#                 relationship through _relationship
#   example     - starts a new DBLink under the 'example' key of
#                 _current_hash
#   db_xref     - handled according to the enclosing tag (_top_tag)
#   publication - starts a new Reference under the 'publication' key of
#                 _current_hash
#   dbinfo      - when directly inside 'release', appends a summary line to
#                 the ontology definition
#
# All other tags are ignored here. In every case the tag is then counted
# through _visited_count_inc and pushed through _push_tag.
sub start_element {
    my $self    = shift;
    my $element = $_[0];
    my $tag     = $element->{Name};
    my %attr    = %{ $element->{Attributes} };
    my $ont     = $self->_ontology;

    if ($tag eq 'interpro') {
        my $entry = $self->create_term(-identifier => $attr{id});
        $entry->ontology($ont);
        $entry->add_synonym($attr{short_name});

        # the type terms are those created in start_document
        my ($type_term) =
            ($ont->engine->get_term_by_identifier("IPR:" . $attr{type}));

        my $is_a = Bio::Ontology::Relationship->new(
            -subject_term   => $entry,
            -predicate_term => $is_a_rel,
            -object_term    => $type_term,
            -ontology       => $ont
        );
        $self->_relationship($is_a);
    } elsif ($tag eq 'example') {
        my $link = Bio::Annotation::DBLink->new;
        $self->_current_hash->{example} = $link;
    } elsif ($tag eq 'db_xref') {
        my $parent = $self->_top_tag;
        if ($parent eq 'example') {
            my $link = $self->_current_hash->{example};
            $link->database($attr{db});
            $link->primary_id($attr{dbkey});
        } elsif ($parent eq 'member_list') {
            # the link is constructed but not attached to anything
            my $member_link = Bio::Annotation::DBLink->new(
                -dbname     => $attr{id},
                -primary_id => $attr{dbkey},
                -comment    => $attr{name}
            );
        } elsif ($parent eq 'publication') {
            my $source = $attr{db};
            if ($source eq 'MEDLINE') {
                $self->_current_hash->{medline} = $attr{dbkey};
            } elsif ($source eq 'PUBMED') {
                $self->_current_hash->{pubmed} = $attr{dbkey};
            } else {
                $self->warn("'".$attr{dbkey}."' is not a MEDLINE publication, "
                            ."don't know how to handle");
            }
        }
        # db_xref inside child, external_doc_list, structure_db_links,
        # abstract or any other element is ignored
    } elsif ($tag eq 'publication') {
        my $reference = Bio::Annotation::Reference->new();
        $self->_current_hash->{publication} = $reference;
    } elsif (($tag eq 'dbinfo') && ($self->_top_tag eq 'release')) {
        # drop a time of day from the file date, keeping the date part
        my $stamp = $attr{file_date} || '';
        $stamp =~ s/ \d{2}:\d{2}:\d{2}//;

        my $summary = $attr{dbname}." version ".$attr{version}.", "
            .$attr{entry_count}." entries, ".$stamp;

        # one line per dbinfo element, appended to what is already there
        my $definition = $ont->definition() || '';
        $definition .= "\n" if length($definition) > 0;
        $ont->definition($definition . $summary);
    }
    # author_list, journal, location, year and unrecognized elements need
    # no action when they are opened

    $self->_visited_count_inc($tag);
    $self->_push_tag($tag);
}
# end_element
#
# SAX callback run for every closing tag. The argument is a hash reference
# whose 'Name' entry is read here. Text collected by characters() for the
# tag is taken from _chars_hash.
#
#   interpro    - persists the entry's term, then its relationship
#   name        - sets the name of the current entry's term
#   abstract    - sets the term definition, with newlines turned to spaces
#   example     - completes the example DBLink and adds it to the term
#   publication - builds the reference and adds it to the term
#   author_list - remembers the authors for the publication being built
#   title       - remembers the title for the publication being built
#   release     - persists the ontology itself
#
# In every case the tag is then popped through _pop_tag and uncounted
# through _visited_count_dec.
#
# characters() appends to the per-tag text buffer, so every buffer consumed
# here is cleared afterwards. Without that, the authors and title of one
# publication would be prefixed with the text of all earlier ones.
sub end_element {
    my $self     = shift;
    my $tag      = shift->{Name};
    my $chars_in = $self->_chars_hash->{$tag};

    if ($tag eq 'interpro') {
        my $rel = $self->_relationship;
        # store subject term first in order to give the handler a chance to
        # apply whatever custom behaviour
        # (note that the object term is the InterPro type and has been stored
        # at the start of the whole document)
        $self->_persist_term($rel->subject_term);
        # then store the relationship to the InterPro type
        $self->_persist_relationship($rel);
    } elsif ($tag eq 'name') {
        my $rel = $self->_relationship;
        $rel->subject_term->name($self->_chars_hash->{name});
        $self->_chars_hash->{name} = '';
    } elsif ($tag eq 'abstract') {
        my $rel = $self->_relationship;
        my $abstract = $self->_chars_hash->{abstract};
        $abstract =~ s/\n/ /g;
        $rel->subject_term->definition($abstract);
        $self->_chars_hash->{abstract} = '';
    } elsif ($tag eq 'example') {
        my $example = $self->_current_hash->{example};
        my $comment = $self->_chars_hash->{example};
        $comment =~ s/^(\s+)//;
        $comment =~ s/(\s+)$//;
        $example->comment($comment);
        $self->_relationship->subject_term->add_dbxref(-dbxrefs => [$example]);
        $self->_chars_hash->{example} = '';
    } elsif ($tag eq 'publication') {
        # NOTE(review): nothing visible in this handler stores the journal,
        # year, page_location or volumn entries that _create_publication
        # reads from _current_hash - confirm whether that is intended.
        my $publication = $self->_create_publication;
        $self->_relationship->subject_term->add_reference($publication);
    } elsif ($tag eq 'author_list') {
        $self->_current_hash->{author} = $chars_in;
        # start the next publication's author list from scratch
        $self->_chars_hash->{author_list} = '';
    } elsif ($tag eq 'title') {
        $self->_current_hash->{title} = $chars_in;
        # start the next publication's title from scratch
        $self->_chars_hash->{title} = '';
    } elsif ($tag eq 'release') {
        my $ont = $self->_persist_ontology($self->_ontology);
        $self->_ontology($ont) if $ont;
    }

    $self->_pop_tag;
    $self->_visited_count_dec($tag);
}
# characters
#
# SAX callback run for character data. The text found under the 'Data'
# key of the argument is appended to the buffer that _chars_hash keeps for
# the tag currently returned by _top_tag.
sub characters {
    my $self  = shift;
    my $chunk = shift->{Data};

    my $open_tag   = $self->_top_tag;
    my $buffer_for = $self->_chars_hash;
    $buffer_for->{$open_tag} .= $chunk;
}
# create_term
#
# Creates a term by passing all arguments unchanged to create_object() on
# the factory returned by term_factory, and returns what that call returns.
sub create_term {
    my ($self, @term_args) = @_;

    my $factory = $self->term_factory;
    return $factory->create_object(@term_args);
}
# _ontology
#
# Combined getter/setter for the ontology object kept under the
# '_ontology' key. With an argument (even undef) the argument is stored
# and returned; without one the stored value is returned.
sub _ontology {
    my ($self, @value) = @_;

    if (@value) {
        $self->{_ontology} = $value[0];
    }
    return $self->{_ontology};
}
# _relationship
#
# Combined getter/setter for the relationship object kept under the
# '_relationship' key. A supplied argument (even undef) replaces the
# stored value; the stored value is returned in either case.
sub _relationship {
    my ($self, @value) = @_;

    return $self->{_relationship} unless @value;

    $self->{_relationship} = $value[0];
    return $self->{_relationship};
}
554 sub _create_publication {
555 my $self=shift;
556 my $publ = $self->_current_hash->{publication};
557 my $journal = $self->_current_hash->{journal} || '<no journal>';
558 my $year = $self->_current_hash->{year} || '<no year>';
559 my $page_location = $self->_current_hash->{page_location} || '<no pages>';
560 my $volumn = $self->_current_hash->{volumn} || '<no volume>';
561 my $medline =
562 $self->_current_hash->{medline} || $self->_current_hash->{pubmed};
564 $publ->authors($self->_current_hash->{author});
565 $publ->location("$journal, $year, V $volumn, $page_location");
566 $publ->title($self->_current_hash->{title});
567 $publ->medline($medline);
568 if ($self->_current_hash->{pubmed}
569 && ($self->_current_hash->{pubmed} != $medline)) {
570 $publ->pubmed($self->_current_hash->{pubmed});
573 # Clear the above in current hash
574 $self->_current_hash->{publication} = undef;
575 $self->_current_hash->{author} = undef;
576 $self->_current_hash->{journal} = undef;
577 $self->_current_hash->{year} = undef;
578 $self->_current_hash->{page_location}=undef;
579 $self->_current_hash->{volumn} = undef;
580 $self->_current_hash->{title} = undef;
581 $self->_current_hash->{medline} = undef;
582 $self->_current_hash->{pubmed} = undef;