2 # BioPerl module for Bio::OntologyIO::obo
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Sohel Merchant, s-merchant at northwestern.edu
8 # Copyright Sohel Merchant
10 # You may distribute this module under the same terms as perl itself
15 Bio::OntologyIO::obo - parser for OBO flat-file format
21 # do not use directly -- use via Bio::OntologyIO
22 my $parser = Bio::OntologyIO->new
24 -file => "gene_ontology.obo");
26 while(my $ont = $parser->next_ontology()) {
27 print "read ontology ",$ont->name()," with ",
28 scalar($ont->get_root_terms), " root terms, and ",
29 scalar($ont->get_all_terms), " total terms, and ",
30 scalar($ont->get_leaf_terms), " leaf terms\n";
35 Parser for OBO flat-file format. 'obo' example:
38 ontology: so/dev/externalDerived
39 property_value: owl:versionInfo "$Revision: 80 $" xsd:string
45 def: "A region of sequence, aligned to another sequence." []
50 def: "A part of a match." []
53 Specification: L<http://www.geneontology.org/GO.format.obo-1_2.shtml>.
59 User feedback is an integral part of the evolution of this and other
60 Bioperl modules. Send your comments and suggestions preferably to the
61 Bioperl mailing lists Your participation is much appreciated.
63 bioperl-l@bioperl.org - General discussion
64 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
68 Please direct usage questions or support issues to the mailing list:
70 I<bioperl-l@bioperl.org>
72 rather than to the module maintainer directly. Many experienced and
73 reponsive experts will be able look at the problem and quickly
74 address it. Please include a thorough description of the problem
75 with code and data examples if at all possible.
79 Report bugs to the Bioperl bug tracking system to help us keep track
80 the bugs and their resolution. Bug reports can be submitted via the
83 https://github.com/bioperl/bioperl-live/issues
89 Email: s-merchant@northwestern.edu
93 Northwestern University
94 Center for Genetic Medicine (CGM), dictyBase
101 Hilmar Lapp, hlapp at gmx.net
102 Chris Mungall, cjm at fruitfly.org
103 Brian Osborne, briano@bioteam.net
107 The rest of the documentation details each of the object
108 methods. Internal methods are usually preceded with a _
112 package Bio
::OntologyIO
::obo
;
117 use Bio
::Ontology
::OBOEngine
;
118 use Bio
::Ontology
::Ontology
;
119 use Bio
::Ontology
::OntologyStore
;
120 use Bio
::Ontology
::TermFactory
;
121 use Bio
::Annotation
::Collection
;
122 use Text
::Balanced
qw(extract_quotelike extract_bracketed);
124 use constant TRUE
=> 1;
125 use constant FALSE
=> 0;
127 use base
qw(Bio::OntologyIO);
132 Usage : $parser = Bio::OntologyIO->new(
134 -file => "gene_ontology.obo");
135 Function: Creates a new dagflat parser.
136 Returns : A new dagflat parser object, implementing Bio::OntologyIO.
137 Args : -file => a single ontology flat file holding the
138 terms, descriptions and relationships
139 -ontology_name => the name of the ontology; if not specified the
140 parser will assign the name of the ontology as the
141 default-namespace header value from the OBO file.
142 -engine => the Bio::Ontology::OntologyEngineI object
143 to be reused (will be created otherwise); note
144 that every Bio::Ontology::OntologyI will
145 qualify as well since that one inherits from the
148 See L<Bio::OntologyIO>.
152 # Let OntologyIO::new() do the instantiation, and override
153 # _initialize for all initialization work
155 my ( $self, %arg ) = @_;
157 my ( $file, $name, $eng ) = $self->_rearrange(
158 [ qw( FILE ONTOLOGY_NAME ENGINE) ], %arg
161 $self->SUPER::_initialize
(%arg);
162 delete $self->{'_ontologies'};
164 # Ontology engine (and possibly name if it's an OntologyI)
165 $eng = Bio
::Ontology
::OBOEngine
->new() unless $eng;
166 if ( $eng->isa("Bio::Ontology::OntologyI") ) {
167 $self->ontology_name( $eng->name() );
168 $eng = $eng->engine() if $eng->can('engine');
170 $self->_ont_engine($eng);
172 $self->ontology_name($name) if $name;
177 Title : ontology_name
178 Usage : $obj->ontology_name($newval)
179 Function: Get/set the name of the ontology parsed by this module.
181 Returns : value of ontology_name (a scalar)
182 Args : on set, new value (a scalar or undef, optional)
189 return $self->{'ontology_name'} = shift if @_;
190 return $self->{'ontology_name'};
196 Usage : $parser->parse();
197 Function: Parses the files set with "new" or with methods
198 defs_file and _flat_files.
200 Normally you should not need to call this method as it will
201 be called automatically upon the first call to
204 Returns : Bio::Ontology::OntologyEngineI
212 # Setup the default term factory if not done by anyone yet
213 $self->term_factory(Bio
::Ontology
::TermFactory
->new( -type
=> "Bio::Ontology::OBOterm" ) )
214 unless $self->term_factory();
216 # Parse the file header
217 my $annotations_collection = $self->_header();
219 # Create the default ontology object itself
220 my $ont = Bio
::Ontology
::Ontology
->new(
221 -name
=> $self->ontology_name(),
222 -engine
=> $self->_ont_engine()
225 # Assign the file headers
226 $ont->annotation($annotations_collection);
228 # Set up the ontology of the relationship types
230 $self->_part_of_relationship(),
231 $self->_is_a_relationship(),
232 $self->_related_to_relationship(),
233 $self->_regulates_relationship(),
234 $self->_positively_regulates_relationship(),
235 $self->_negatively_regulates_relationship(),
241 $self->_add_ontology($ont);
244 while ( my $term = $self->_next_term() ) {
246 # Check if the terms has a valid ID and NAME otherwise ignore the term
247 if ( !$term->identifier() || !$term->name() ) {
248 $self->throw( "OBO File Format Error on line "
249 . $self->{'_current_line_no'}
250 . "\nThe term does not have a id/name tag. This term will be ignored."
255 my $new_ontology_flag = 1;
256 my $ontologies_array_ref = $self->{'_ontologies'};
258 for my $ontology ( @
$ontologies_array_ref ) {
259 my ($oname, $t_ns) = ( $ontology->name, $term->namespace );
260 next unless ( defined($oname) && defined($t_ns) );
261 if ( $oname eq $t_ns ) {
262 # No need to create new ontology
263 $new_ontology_flag = 0;
268 if ( $new_ontology_flag && $term->namespace ) {
269 my $new_ont = Bio
::Ontology
::Ontology
->new(
270 -name
=> $term->namespace,
271 -engine
=> $self->_ont_engine
273 $new_ont->annotation($annotations_collection);
274 $self->_add_ontology($new_ont);
278 $self->_add_term( $term, $ont );
280 # Adding the IS_A relationship
281 for my $parent_term ( @
{$self->{'_isa_parents'}} ) {
282 # Check if parent exists, if not then add the term to the graph.
283 if ( ! $self->_has_term($parent_term) ) {
284 $self->_add_term( $parent_term, $ont ); # !
287 $self->_add_relationship( $parent_term, $term,
288 $self->_is_a_relationship(), $ont );
291 # Adding the other relationships like part_of, related_to, develops_from
292 for my $relationship ( keys %{$self->{'_relationships'}} ) {
294 # Check if relationship exists, if not add it
295 if ( $self->_ont_engine->get_relationship_type($relationship) ) {
296 $reltype = $self->_ont_engine->get_relationship_type($relationship);
299 $self->_ont_engine->add_relationship_type( $relationship, $ont );
300 $reltype = $self->_ont_engine->get_relationship_type($relationship);
303 # Check if the id already exists in the graph
304 for my $id ( @
{$self->{'_relationships'}->{$relationship}} ) {
305 my $parent_term = $self->_create_term_object();
306 $parent_term->identifier($id);
307 $parent_term->ontology($ont);
309 if ( ! $self->_has_term($parent_term) ) {
310 $self->_add_term( $parent_term, $ont );
313 $self->_add_relationship( $parent_term, $term, $reltype, $ont );
318 return $self->_ont_engine();
323 Title : next_ontology
325 Function: Get the next available ontology from the parser. This is the
326 method prescribed by Bio::OntologyIO.
328 Returns : An object implementing Bio::Ontology::OntologyI, and nothing if
329 there is no more ontology in the input.
337 # Parse if not done already
338 $self->parse() unless exists( $self->{'_ontologies'} );
340 # Return next available ontology
341 if ( exists( $self->{'_ontologies'} ) ) {
342 my $ont = shift( @
{ $self->{'_ontologies'} } );
344 my $store = Bio
::Ontology
::OntologyStore
->new();
345 $store->register_ontology($ont);
357 Function: Closes this ontology stream and associated file handles.
359 Clients should call this method especially when they write
362 We need to override this here in order to close the file
363 handle for the term definitions file.
374 # first call the inherited implementation
375 $self->SUPER::close();
382 $self->{'_ontologies'} = [] unless exists( $self->{'_ontologies'} );
384 $self->throw( ref($ont) . " does not implement Bio::Ontology::OntologyI" )
385 unless ref($ont) && $ont->isa("Bio::Ontology::OntologyI");
386 # The ontology name may have been auto-discovered while parsing
388 $ont->name( $self->ontology_name ) unless $ont->name();
389 push( @
{ $self->{'_ontologies'} }, $ont );
393 # This simply delegates. See Ontology::OBOEngine::add_term.
395 my ( $self, $term, $ont ) = @_;
396 $term->ontology($ont) if $ont && ( !$term->ontology );
397 $self->_ont_engine()->add_term($term);
400 # This simply delegates. See OBOEngine
401 sub _part_of_relationship
{
404 return $self->_ont_engine()->part_of_relationship(@_);
407 # This simply delegates. See OBOEngine
408 sub _is_a_relationship
{
411 return $self->_ont_engine()->is_a_relationship(@_);
414 # This simply delegates. See OBOEngine
415 sub _related_to_relationship
{
418 return $self->_ont_engine()->related_to_relationship(@_);
421 # This simply delegates. See OBOEngine
422 sub _regulates_relationship
{
425 return $self->_ont_engine()->regulates_relationship(@_);
428 # This simply delegates. See OBOEngine
429 sub _positively_regulates_relationship
{
432 return $self->_ont_engine()->positively_regulates_relationship(@_);
435 # This simply delegates. See OBOEngine
436 sub _negatively_regulates_relationship
{
439 return $self->_ont_engine()->negatively_regulates_relationship(@_);
442 # This simply delegates. See OBOEngine
443 sub _add_relationship
{
444 my ( $self, $parent, $child, $type, $ont ) = @_;
445 # Note the triple terminology (subject,predicate,object) corresponds to
446 # (child,type,parent)
447 $self->_ont_engine()->add_relationship( $child, $type, $parent, $ont );
450 # This simply delegates. See OBOEngine
454 return $self->_ont_engine()->has_term(@_);
457 # Holds the OBO engine to be parsed into
459 my ( $self, $value ) = @_;
461 if ( defined $value ) {
462 $self->{"_ont_engine"} = $value;
465 $self->{"_ont_engine"};
468 # Removes the escape characters from the file
470 my ( $self, $line ) = @_;
473 $line =~ tr
[\200-\377] [\000-\177];
474 # see 'man perlop', section on tr/
475 # weird ascii characters should be excluded
476 $line =~ tr/\0-\10//d; # remove weird characters; ascii 0-8
477 # preserve \11 (9 - tab) and \12 (10-linefeed)
478 $line =~ tr/\13\14//d; # remove weird characters; 11,12
479 # preserve \15 (13 - carriage return)
480 $line =~ tr/\16-\37//d; # remove 14-31 (all rest before space)
481 $line =~ tr/\177//d; # remove DEL character
484 $line =~ s/[^\\]\!.*//;
485 $line =~ s/[^\\]\#.*//;
495 my $annotation_collection = Bio
::Annotation
::Collection
->new();
497 my $line_counter = 0;
498 $self->{'_current_line_no'} = 0;
499 my $format_version_header_flag = 0;
500 my $default_namespace_header_flag = 0;
502 while ( my $line = $self->_readline() ) {
504 my $line = $self->_filter_line($line);
507 if ( !$format_version_header_flag ) {
508 $self->throw("Format Error - Cannot find tag format-version." .
509 "This is required in header" );
512 $self->{'_current_line_no'} = $line_counter;
513 return $annotation_collection;
516 # Check if there is a header
517 if ( $line =~ /\[\w*\]/ ) {
518 $self->throw("Format Error - Cannot find tag format-version." .
519 "This is required in header." );
522 # If the line is not null, check it contains at least one colon
523 $self->_check_colon( $line, $line_counter );
525 # These are the allowed headers. Any other headers will be ignored
526 if ( $line =~ /^(\
[|format
-version
:
539 |default-relationship
-id
-prefix
:
544 if ( $line =~ /^([\w\-]+)\:\s*(.*)/ ) {
545 ( $tag, $value ) = ( $1, $2 );
548 if ( $tag =~ /format-version/) {
549 $format_version_header_flag = 1;
550 }elsif( $tag =~ /default-namespace/ ) {
551 $default_namespace_header_flag = 1;
554 my $header = Bio
::Annotation
::SimpleValue
->new( -value
=> $value );
555 $annotation_collection->add_Annotation( $tag, $header );
557 # Assign the Ontology name as the value of the default-namespace header
558 if ( $tag =~ /default-namespace/i ) {
559 $self->ontology_name($value);
565 # Parses each stanza of the file
569 my $skip_stanza_flag = 1;
570 my $line_counter = $self->{'_current_line_no'};
572 while ( my $line = $self->_readline() ) {
574 my $line = $self->_filter_line($line);
576 if ( !$line && $term ) {
577 $self->{'_current_line_no'} = $line_counter;
581 if ( ( $line =~ /^\[(\w+)\]\s*(.*)/ ) ) { # New stanza
582 if ( uc($1) eq "TERM" ) {
583 $term = $self->_create_term_object;
584 $skip_stanza_flag = 0;
586 # Reset the relationships after each stanza
587 $self->{'_relationships'} = {};
588 $self->{'_isa_parents'} = undef;
590 elsif ( uc($1) eq "TYPEDEF" ) {
591 $skip_stanza_flag = 1;
592 # Check if this typedef is already defined by the relationship
595 $skip_stanza_flag = 1;
596 $self->warn("OBO File Format Warning on line $line_counter $line\n"
597 . "Unrecognized stanza type found. Skipping this stanza." );
602 # If the line is not null, check it contains at least one colon
603 $self->_check_colon( $line, $line_counter );
605 # If there is any tag value other than the list below move to the next tag
606 next if (( $line !~ /^(\
[|id
:
627 ) || $skip_stanza_flag );
630 if ( $line =~ /^([\w\-]+)\:\s*(.*)/ ) {
631 my ( $tag, $val ) = ( $1, $2 );
633 # If no value for the tag throw a warning
635 $self->warn("OBO File Format Warning on line $line_counter $line\n" .
641 ( $val, $qh ) = $self->_extract_quals($val);
645 if ( $tag eq "ID" ) {
646 $term->identifier($val);
647 if ( $self->_has_term($term) ) {
648 $term = $self->_ont_engine()->get_terms($val);
651 elsif ( $tag eq "NAME" ) {
654 elsif ( $tag eq "XREF_ANALOG" ) {
655 if ( !$term->has_dbxref($val) ) {
656 $term->add_dbxref(-dbxrefs
=> $self->_to_annotation( [$val] ) );
659 elsif ( $tag eq "XREF_UNKNOWN" ) {
660 $term->add_dbxref(-dbxrefs
=> $self->_to_annotation( [$val] ) );
662 elsif ( $tag eq "NAMESPACE" ) {
663 $term->namespace($val);
665 elsif ( $tag eq "DEF" ) {
666 my ( $defstr, $parts ) = $self->_extract_qstr($val);
667 $term->definition($defstr);
668 my $ann = $self->_to_annotation($parts);
669 $term->add_dbxref( -dbxrefs
=> $ann );
671 elsif ( $tag eq "SYNONYM" ) {
672 $term->add_synonym($val);
674 elsif ( $tag eq "ALT_ID" ) {
675 $term->add_secondary_id($val);
677 elsif ( $tag =~ /XREF/i ) {
678 $term->add_secondary_id($val);
680 elsif ( $tag eq "IS_OBSOLETE" ) {
681 if ( $val eq 'true' ) {
684 elsif ( $val eq 'false' ) {
687 $term->is_obsolete($val);
689 elsif ( $tag eq "COMMENT" ) {
690 $term->comment($val);
692 elsif ( $tag eq "RELATIONSHIP" ) {
693 $self->_handle_relationship_tag($val);
695 elsif ( $tag eq "IS_A" ) {
697 my $parent_term = $self->_create_term_object();
698 $parent_term->identifier($val);
699 push @
{ $self->{'_isa_parents'} }, $parent_term;
708 # Creates a Bio::Ontology::OBOterm object
709 sub _create_term_object
{
711 my $term = $self->term_factory->create_object();
716 my ( $self, $str ) = @_;
719 if ( $str =~ /(.*)\s+(\{.*\})\s*$/ ) {
723 my @qparts = $self->_split_on_comma($extr);
725 if (/(\w+)=\"(.*)\"/) {
728 elsif (/(\w+)=\'(.*)\'/) {
736 return ( $return_str, \
%q );
744 my ( $self, $str ) = @_;
746 my ( $extr, $rem, $prefix ) = extract_quotelike
($str);
751 warn("illegal prefix: $prefix in: $str");
756 # e.g. synonym: "foo" EXACT [...]
757 if ( $rem =~ /(\w+)\s+(\[.*)/ ) {
759 push( @extra, split( ' ', $1 ) );
763 while ( ( $extr, $rem, $prefix ) = extract_bracketed
( $rem, '[]' ) ) {
767 push( @parts, $extr ) if $extr;
770 map { $self->_split_on_comma($_) } @parts;
774 ( $txt, \
@parts, \
@extra );
777 sub _split_on_comma
{
778 my ( $self, $str ) = @_;
780 while ( $str =~ /(.*[^\\],\s*)(.*)/ ) {
783 unshift( @parts, $part );
786 unshift( @parts, $str );
788 return map { s/\\//g; $_ } @parts;
791 # This method checks for an existing colon in a line
793 my ( $self, $line, $line_no ) = @_;
794 if ( $line && !( $line =~ /:/ ) ) {
795 $self->throw("OBO File Format Error on line $line_no $line\n" .
796 "Cannot find key-terminating colon"
801 # This method handles relationship tags
802 sub _handle_relationship_tag
{
803 my ( $self, $val ) = @_;
804 my @parts = split( / /, $val );
805 my $relationship = uc($parts[0]);
806 my $id = $parts[1] =~ /\^(w+)\s+\!/ ?
$1 : $parts[1];
807 push @
{$self->{_relationships
}->{$relationship}}, $id;
810 # Convert simple strings to Bio::Annotation::DBLinks
812 my ($self , $links) = @_;
813 return unless $links;
815 for my $string (@
{$links}) {
816 my ($db, $id) = split(':',$string,2);
817 push @dbxrefs, Bio
::Annotation
::DBLink
->new(-database
=> $db, -primary_id
=> $id);