Bio::DB::Universal: move into its own distribution
[bioperl-live.git] / Bio / OntologyIO / obo.pm
blob6f4428555a15fbe3c7982152ab150ac86654e241
2 # BioPerl module for Bio::OntologyIO::obo
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Sohel Merchant, s-merchant at northwestern.edu
8 # Copyright Sohel Merchant
10 # You may distribute this module under the same terms as perl itself
13 =head1 NAME
15 Bio::OntologyIO::obo - parser for OBO flat-file format
17 =head1 SYNOPSIS
19 use Bio::OntologyIO;
21 # do not use directly -- use via Bio::OntologyIO
22 my $parser = Bio::OntologyIO->new
23 ( -format => "obo",
24 -file => "gene_ontology.obo");
26 while(my $ont = $parser->next_ontology()) {
27 print "read ontology ",$ont->name()," with ",
28 scalar($ont->get_root_terms), " root terms, and ",
29 scalar($ont->get_all_terms), " total terms, and ",
30 scalar($ont->get_leaf_terms), " leaf terms\n";
33 =head1 DESCRIPTION
35 Parser for OBO flat-file format. 'obo' example:
37 format-version: 1.2
38 ontology: so/dev/externalDerived
39 property_value: owl:versionInfo "$Revision: 80 $" xsd:string
40 default-namespace: SO
42 [Term]
43 id: SO_0000343
44 name: match
45 def: "A region of sequence, aligned to another sequence." []
47 [Term]
48 id: SO_0000039
49 name: match_part
50 def: "A part of a match." []
51 is_a: SO_0000343
53 Specification: L<http://www.geneontology.org/GO.format.obo-1_2.shtml>.
55 =head1 FEEDBACK
57 =head2 Mailing Lists
59 User feedback is an integral part of the evolution of this and other
60 Bioperl modules. Send your comments and suggestions preferably to the
61 Bioperl mailing lists Your participation is much appreciated.
63 bioperl-l@bioperl.org - General discussion
64 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
66 =head2 Support
68 Please direct usage questions or support issues to the mailing list:
70 I<bioperl-l@bioperl.org>
72 rather than to the module maintainer directly. Many experienced and
73 reponsive experts will be able look at the problem and quickly
74 address it. Please include a thorough description of the problem
75 with code and data examples if at all possible.
77 =head2 Reporting Bugs
79 Report bugs to the Bioperl bug tracking system to help us keep track
80 the bugs and their resolution. Bug reports can be submitted via the
81 web:
83 https://github.com/bioperl/bioperl-live/issues
85 =head1 AUTHOR
87 Sohel Merchant
89 Email: s-merchant@northwestern.edu
91 Address:
93 Northwestern University
94 Center for Genetic Medicine (CGM), dictyBase
95 Suite 1206,
96 676 St. Clair st
97 Chicago IL 60611
99 =head2 CONTRIBUTOR
101 Hilmar Lapp, hlapp at gmx.net
102 Chris Mungall, cjm at fruitfly.org
103 Brian Osborne, briano@bioteam.net
105 =head1 APPENDIX
107 The rest of the documentation details each of the object
108 methods. Internal methods are usually preceded with a _
110 =cut
112 package Bio::OntologyIO::obo;
114 use strict;
116 use Bio::Root::IO;
117 use Bio::Ontology::OBOEngine;
118 use Bio::Ontology::Ontology;
119 use Bio::Ontology::OntologyStore;
120 use Bio::Ontology::TermFactory;
121 use Bio::Annotation::Collection;
122 use Text::Balanced qw(extract_quotelike extract_bracketed);
124 use constant TRUE => 1;
125 use constant FALSE => 0;
127 use base qw(Bio::OntologyIO);
129 =head2 new
131 Title : new
132 Usage : $parser = Bio::OntologyIO->new(
133 -format => "obo",
134 -file => "gene_ontology.obo");
135 Function: Creates a new dagflat parser.
136 Returns : A new dagflat parser object, implementing Bio::OntologyIO.
137 Args : -file => a single ontology flat file holding the
138 terms, descriptions and relationships
139 -ontology_name => the name of the ontology; if not specified the
140 parser will assign the name of the ontology as the
141 default-namespace header value from the OBO file.
142 -engine => the Bio::Ontology::OntologyEngineI object
143 to be reused (will be created otherwise); note
144 that every Bio::Ontology::OntologyI will
145 qualify as well since that one inherits from the
146 former.
148 See L<Bio::OntologyIO>.
150 =cut
152 # Let OntologyIO::new() do the instantiation, and override
153 # _initialize for all initialization work
154 sub _initialize {
155 my ( $self, %arg ) = @_;
157 my ( $file, $name, $eng ) = $self->_rearrange(
158 [ qw( FILE ONTOLOGY_NAME ENGINE) ], %arg
161 $self->SUPER::_initialize(%arg);
162 delete $self->{'_ontologies'};
164 # Ontology engine (and possibly name if it's an OntologyI)
165 $eng = Bio::Ontology::OBOEngine->new() unless $eng;
166 if ( $eng->isa("Bio::Ontology::OntologyI") ) {
167 $self->ontology_name( $eng->name() );
168 $eng = $eng->engine() if $eng->can('engine');
170 $self->_ont_engine($eng);
172 $self->ontology_name($name) if $name;
175 =head2 ontology_name
177 Title : ontology_name
178 Usage : $obj->ontology_name($newval)
179 Function: Get/set the name of the ontology parsed by this module.
180 Example :
181 Returns : value of ontology_name (a scalar)
182 Args : on set, new value (a scalar or undef, optional)
184 =cut
186 sub ontology_name {
187 my $self = shift;
189 return $self->{'ontology_name'} = shift if @_;
190 return $self->{'ontology_name'};
193 =head2 parse
195 Title : parse()
196 Usage : $parser->parse();
197 Function: Parses the files set with "new" or with methods
198 defs_file and _flat_files.
200 Normally you should not need to call this method as it will
201 be called automatically upon the first call to
202 next_ontology().
204 Returns : Bio::Ontology::OntologyEngineI
205 Args :
207 =cut
209 sub parse {
210 my $self = shift;
212 # Setup the default term factory if not done by anyone yet
213 $self->term_factory(Bio::Ontology::TermFactory->new( -type => "Bio::Ontology::OBOterm" ) )
214 unless $self->term_factory();
216 # Parse the file header
217 my $annotations_collection = $self->_header();
219 # Create the default ontology object itself
220 my $ont = Bio::Ontology::Ontology->new(
221 -name => $self->ontology_name(),
222 -engine => $self->_ont_engine()
225 # Assign the file headers
226 $ont->annotation($annotations_collection);
228 # Set up the ontology of the relationship types
229 for (
230 $self->_part_of_relationship(),
231 $self->_is_a_relationship(),
232 $self->_related_to_relationship(),
233 $self->_regulates_relationship(),
234 $self->_positively_regulates_relationship(),
235 $self->_negatively_regulates_relationship(),
238 $_->ontology($ont);
241 $self->_add_ontology($ont);
243 # Adding new terms
244 while ( my $term = $self->_next_term() ) {
246 # Check if the terms has a valid ID and NAME otherwise ignore the term
247 if ( !$term->identifier() || !$term->name() ) {
248 $self->throw( "OBO File Format Error on line "
249 . $self->{'_current_line_no'}
250 . "\nThe term does not have a id/name tag. This term will be ignored."
252 next;
255 my $new_ontology_flag = 1;
256 my $ontologies_array_ref = $self->{'_ontologies'};
258 for my $ontology ( @$ontologies_array_ref ) {
259 my ($oname, $t_ns) = ( $ontology->name, $term->namespace );
260 next unless ( defined($oname) && defined($t_ns) );
261 if ( $oname eq $t_ns ) {
262 # No need to create new ontology
263 $new_ontology_flag = 0;
264 $ont = $ontology;
268 if ( $new_ontology_flag && $term->namespace ) {
269 my $new_ont = Bio::Ontology::Ontology->new(
270 -name => $term->namespace,
271 -engine => $self->_ont_engine
273 $new_ont->annotation($annotations_collection);
274 $self->_add_ontology($new_ont);
275 $ont = $new_ont;
278 $self->_add_term( $term, $ont );
280 # Adding the IS_A relationship
281 for my $parent_term ( @{$self->{'_isa_parents'}} ) {
282 # Check if parent exists, if not then add the term to the graph.
283 if ( ! $self->_has_term($parent_term) ) {
284 $self->_add_term( $parent_term, $ont ); # !
287 $self->_add_relationship( $parent_term, $term,
288 $self->_is_a_relationship(), $ont );
291 # Adding the other relationships like part_of, related_to, develops_from
292 for my $relationship ( keys %{$self->{'_relationships'}} ) {
293 my $reltype;
294 # Check if relationship exists, if not add it
295 if ( $self->_ont_engine->get_relationship_type($relationship) ) {
296 $reltype = $self->_ont_engine->get_relationship_type($relationship);
298 else {
299 $self->_ont_engine->add_relationship_type( $relationship, $ont );
300 $reltype = $self->_ont_engine->get_relationship_type($relationship);
303 # Check if the id already exists in the graph
304 for my $id ( @{$self->{'_relationships'}->{$relationship}} ) {
305 my $parent_term = $self->_create_term_object();
306 $parent_term->identifier($id);
307 $parent_term->ontology($ont);
309 if ( ! $self->_has_term($parent_term) ) {
310 $self->_add_term( $parent_term, $ont );
313 $self->_add_relationship( $parent_term, $term, $reltype, $ont );
318 return $self->_ont_engine();
321 =head2 next_ontology
323 Title : next_ontology
324 Usage :
325 Function: Get the next available ontology from the parser. This is the
326 method prescribed by Bio::OntologyIO.
327 Example :
328 Returns : An object implementing Bio::Ontology::OntologyI, and nothing if
329 there is no more ontology in the input.
330 Args :
332 =cut
334 sub next_ontology {
335 my $self = shift;
337 # Parse if not done already
338 $self->parse() unless exists( $self->{'_ontologies'} );
340 # Return next available ontology
341 if ( exists( $self->{'_ontologies'} ) ) {
342 my $ont = shift( @{ $self->{'_ontologies'} } );
343 if ($ont) {
344 my $store = Bio::Ontology::OntologyStore->new();
345 $store->register_ontology($ont);
347 return $ont;
350 return;
353 =head2 close
355 Title : close
356 Usage :
357 Function: Closes this ontology stream and associated file handles.
359 Clients should call this method especially when they write
360 ontologies.
362 We need to override this here in order to close the file
363 handle for the term definitions file.
365 Example :
366 Returns : none
367 Args : none
369 =cut
371 sub close {
372 my $self = shift;
374 # first call the inherited implementation
375 $self->SUPER::close();
378 # INTERNAL METHODS
380 sub _add_ontology {
381 my $self = shift;
382 $self->{'_ontologies'} = [] unless exists( $self->{'_ontologies'} );
383 for my $ont (@_) {
384 $self->throw( ref($ont) . " does not implement Bio::Ontology::OntologyI" )
385 unless ref($ont) && $ont->isa("Bio::Ontology::OntologyI");
386 # The ontology name may have been auto-discovered while parsing
387 # the file
388 $ont->name( $self->ontology_name ) unless $ont->name();
389 push( @{ $self->{'_ontologies'} }, $ont );
393 # This simply delegates. See Ontology::OBOEngine::add_term.
394 sub _add_term {
395 my ( $self, $term, $ont ) = @_;
396 $term->ontology($ont) if $ont && ( !$term->ontology );
397 $self->_ont_engine()->add_term($term);
400 # This simply delegates. See OBOEngine
401 sub _part_of_relationship {
402 my $self = shift;
404 return $self->_ont_engine()->part_of_relationship(@_);
407 # This simply delegates. See OBOEngine
408 sub _is_a_relationship {
409 my $self = shift;
411 return $self->_ont_engine()->is_a_relationship(@_);
414 # This simply delegates. See OBOEngine
415 sub _related_to_relationship {
416 my $self = shift;
418 return $self->_ont_engine()->related_to_relationship(@_);
421 # This simply delegates. See OBOEngine
422 sub _regulates_relationship {
423 my $self = shift;
425 return $self->_ont_engine()->regulates_relationship(@_);
428 # This simply delegates. See OBOEngine
429 sub _positively_regulates_relationship {
430 my $self = shift;
432 return $self->_ont_engine()->positively_regulates_relationship(@_);
435 # This simply delegates. See OBOEngine
436 sub _negatively_regulates_relationship {
437 my $self = shift;
439 return $self->_ont_engine()->negatively_regulates_relationship(@_);
442 # This simply delegates. See OBOEngine
443 sub _add_relationship {
444 my ( $self, $parent, $child, $type, $ont ) = @_;
445 # Note the triple terminology (subject,predicate,object) corresponds to
446 # (child,type,parent)
447 $self->_ont_engine()->add_relationship( $child, $type, $parent, $ont );
450 # This simply delegates. See OBOEngine
451 sub _has_term {
452 my $self = shift;
454 return $self->_ont_engine()->has_term(@_);
457 # Holds the OBO engine to be parsed into
458 sub _ont_engine {
459 my ( $self, $value ) = @_;
461 if ( defined $value ) {
462 $self->{"_ont_engine"} = $value;
465 $self->{"_ont_engine"};
468 # Removes the escape characters from the file
469 sub _filter_line {
470 my ( $self, $line ) = @_;
472 chomp($line);
473 $line =~ tr [\200-\377] [\000-\177];
474 # see 'man perlop', section on tr/
475 # weird ascii characters should be excluded
476 $line =~ tr/\0-\10//d; # remove weird characters; ascii 0-8
477 # preserve \11 (9 - tab) and \12 (10-linefeed)
478 $line =~ tr/\13\14//d; # remove weird characters; 11,12
479 # preserve \15 (13 - carriage return)
480 $line =~ tr/\16-\37//d; # remove 14-31 (all rest before space)
481 $line =~ tr/\177//d; # remove DEL character
483 $line =~ s/^\!.*//;
484 $line =~ s/[^\\]\!.*//;
485 $line =~ s/[^\\]\#.*//;
486 $line =~ s/^\s+//;
487 $line =~ s/\s+$//;
489 return $line;
492 # Parses the header
493 sub _header {
494 my $self = shift;
495 my $annotation_collection = Bio::Annotation::Collection->new();
496 my ( $tag, $value );
497 my $line_counter = 0;
498 $self->{'_current_line_no'} = 0;
499 my $format_version_header_flag = 0;
500 my $default_namespace_header_flag = 0;
502 while ( my $line = $self->_readline() ) {
503 ++$line_counter;
504 my $line = $self->_filter_line($line);
506 if ( !$line ) {
507 if ( !$format_version_header_flag ) {
508 $self->throw("Format Error - Cannot find tag format-version." .
509 "This is required in header" );
512 $self->{'_current_line_no'} = $line_counter;
513 return $annotation_collection;
516 # Check if there is a header
517 if ( $line =~ /\[\w*\]/ ) {
518 $self->throw("Format Error - Cannot find tag format-version." .
519 "This is required in header." );
522 # If the line is not null, check it contains at least one colon
523 $self->_check_colon( $line, $line_counter );
525 # These are the allowed headers. Any other headers will be ignored
526 if ( $line =~ /^(\[|format-version:
527 |data-version:
528 |typeref:
529 |version:
530 |date:
531 |saved-by:
532 |auto-generated-by:
533 |default-namespace:
534 |remark:
535 |subsetdef:
536 |import:
537 |synonymtypedef:
538 |idspace:
539 |default-relationship-id-prefix:
540 |id-mapping:
544 if ( $line =~ /^([\w\-]+)\:\s*(.*)/ ) {
545 ( $tag, $value ) = ( $1, $2 );
548 if ( $tag =~ /format-version/) {
549 $format_version_header_flag = 1;
550 }elsif( $tag =~ /default-namespace/ ) {
551 $default_namespace_header_flag = 1;
554 my $header = Bio::Annotation::SimpleValue->new( -value => $value );
555 $annotation_collection->add_Annotation( $tag, $header );
557 # Assign the Ontology name as the value of the default-namespace header
558 if ( $tag =~ /default-namespace/i ) {
559 $self->ontology_name($value);
565 # Parses each stanza of the file
566 sub _next_term {
567 my $self = shift;
568 my $term;
569 my $skip_stanza_flag = 1;
570 my $line_counter = $self->{'_current_line_no'};
572 while ( my $line = $self->_readline() ) {
573 ++$line_counter;
574 my $line = $self->_filter_line($line);
576 if ( !$line && $term ) {
577 $self->{'_current_line_no'} = $line_counter;
578 return $term;
581 if ( ( $line =~ /^\[(\w+)\]\s*(.*)/ ) ) { # New stanza
582 if ( uc($1) eq "TERM" ) {
583 $term = $self->_create_term_object;
584 $skip_stanza_flag = 0;
586 # Reset the relationships after each stanza
587 $self->{'_relationships'} = {};
588 $self->{'_isa_parents'} = undef;
590 elsif ( uc($1) eq "TYPEDEF" ) {
591 $skip_stanza_flag = 1;
592 # Check if this typedef is already defined by the relationship
594 else {
595 $skip_stanza_flag = 1;
596 $self->warn("OBO File Format Warning on line $line_counter $line\n"
597 . "Unrecognized stanza type found. Skipping this stanza." );
599 next;
602 # If the line is not null, check it contains at least one colon
603 $self->_check_colon( $line, $line_counter );
605 # If there is any tag value other than the list below move to the next tag
606 next if (( $line !~ /^(\[|id:
607 |is_anonymous:
608 |name:
609 |namespace:
610 |alt_id:
611 |def:
612 |comment:
613 |subset:
614 |synonym:
615 |xref:
616 |is_a:
617 |intersection_of:
618 |union_of:
619 |disjoint_from:
620 |relationship:
621 |is_obsolete:
622 |replaced_by:
623 |consider:
624 |created_by:
625 |creation_date:
627 ) || $skip_stanza_flag );
629 # Tag/value pair
630 if ( $line =~ /^([\w\-]+)\:\s*(.*)/ ) {
631 my ( $tag, $val ) = ( $1, $2 );
633 # If no value for the tag throw a warning
634 if ( !$val ) {
635 $self->warn("OBO File Format Warning on line $line_counter $line\n" .
636 "Tag has no value."
640 my $qh;
641 ( $val, $qh ) = $self->_extract_quals($val);
642 my $val2 = $val;
643 $val2 =~ s/\\,/,/g;
644 $tag = uc($tag);
645 if ( $tag eq "ID" ) {
646 $term->identifier($val);
647 if ( $self->_has_term($term) ) {
648 $term = $self->_ont_engine()->get_terms($val);
651 elsif ( $tag eq "NAME" ) {
652 $term->name($val);
654 elsif ( $tag eq "XREF_ANALOG" ) {
655 if ( !$term->has_dbxref($val) ) {
656 $term->add_dbxref(-dbxrefs => $self->_to_annotation( [$val] ) );
659 elsif ( $tag eq "XREF_UNKNOWN" ) {
660 $term->add_dbxref(-dbxrefs => $self->_to_annotation( [$val] ) );
662 elsif ( $tag eq "NAMESPACE" ) {
663 $term->namespace($val);
665 elsif ( $tag eq "DEF" ) {
666 my ( $defstr, $parts ) = $self->_extract_qstr($val);
667 $term->definition($defstr);
668 my $ann = $self->_to_annotation($parts);
669 $term->add_dbxref( -dbxrefs => $ann );
671 elsif ( $tag eq "SYNONYM" ) {
672 $term->add_synonym($val);
674 elsif ( $tag eq "ALT_ID" ) {
675 $term->add_secondary_id($val);
677 elsif ( $tag =~ /XREF/i ) {
678 $term->add_secondary_id($val);
680 elsif ( $tag eq "IS_OBSOLETE" ) {
681 if ( $val eq 'true' ) {
682 $val = 1;
684 elsif ( $val eq 'false' ) {
685 $val = 0;
687 $term->is_obsolete($val);
689 elsif ( $tag eq "COMMENT" ) {
690 $term->comment($val);
692 elsif ( $tag eq "RELATIONSHIP" ) {
693 $self->_handle_relationship_tag($val);
695 elsif ( $tag eq "IS_A" ) {
696 $val =~ s/ //g;
697 my $parent_term = $self->_create_term_object();
698 $parent_term->identifier($val);
699 push @{ $self->{'_isa_parents'} }, $parent_term;
704 $term;
708 # Creates a Bio::Ontology::OBOterm object
709 sub _create_term_object {
710 my ($self) = @_;
711 my $term = $self->term_factory->create_object();
712 $term;
715 sub _extract_quals {
716 my ( $self, $str ) = @_;
718 my %q = ();
719 if ( $str =~ /(.*)\s+(\{.*\})\s*$/ ) {
720 my $return_str = $1;
721 my $extr = $2;
722 if ($extr) {
723 my @qparts = $self->_split_on_comma($extr);
724 foreach (@qparts) {
725 if (/(\w+)=\"(.*)\"/) {
726 $q{$1} = $2;
728 elsif (/(\w+)=\'(.*)\'/) {
729 $q{$1} = $2;
731 else {
732 warn("$_ in $str");
736 return ( $return_str, \%q );
738 else {
739 return ( $str, {} );
743 sub _extract_qstr {
744 my ( $self, $str ) = @_;
746 my ( $extr, $rem, $prefix ) = extract_quotelike($str);
747 my $txt = $extr;
748 $txt =~ s/^\"//;
749 $txt =~ s/\"$//;
750 if ($prefix) {
751 warn("illegal prefix: $prefix in: $str");
754 my @extra = ();
756 # e.g. synonym: "foo" EXACT [...]
757 if ( $rem =~ /(\w+)\s+(\[.*)/ ) {
758 $rem = $2;
759 push( @extra, split( ' ', $1 ) );
762 my @parts = ();
763 while ( ( $extr, $rem, $prefix ) = extract_bracketed( $rem, '[]' ) ) {
764 last unless $extr;
765 $extr =~ s/^\[//;
766 $extr =~ s/\]$//;
767 push( @parts, $extr ) if $extr;
769 @parts =
770 map { $self->_split_on_comma($_) } @parts;
772 $txt =~ s/\\//g;
774 ( $txt, \@parts, \@extra );
777 sub _split_on_comma {
778 my ( $self, $str ) = @_;
779 my @parts = ();
780 while ( $str =~ /(.*[^\\],\s*)(.*)/ ) {
781 $str = $1;
782 my $part = $2;
783 unshift( @parts, $part );
784 $str =~ s/,\s*$//;
786 unshift( @parts, $str );
788 return map { s/\\//g; $_ } @parts;
791 # This method checks for an existing colon in a line
792 sub _check_colon {
793 my ( $self, $line, $line_no ) = @_;
794 if ( $line && !( $line =~ /:/ ) ) {
795 $self->throw("OBO File Format Error on line $line_no $line\n" .
796 "Cannot find key-terminating colon"
801 # This method handles relationship tags
802 sub _handle_relationship_tag {
803 my ( $self, $val ) = @_;
804 my @parts = split( / /, $val );
805 my $relationship = uc($parts[0]);
806 my $id = $parts[1] =~ /\^(w+)\s+\!/ ? $1 : $parts[1];
807 push @{$self->{_relationships}->{$relationship}}, $id;
810 # Convert simple strings to Bio::Annotation::DBLinks
811 sub _to_annotation {
812 my ($self , $links) = @_;
813 return unless $links;
814 my @dbxrefs;
815 for my $string (@{$links}) {
816 my ($db, $id) = split(':',$string,2);
817 push @dbxrefs, Bio::Annotation::DBLink->new(-database => $db, -primary_id => $id);
820 \@dbxrefs;