2 # BioPerl module for Bio::OntologyIO::dagflat
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Hilmar Lapp, hlapp at gmx.net
8 # (c) Christian M. Zmasek, czmasek-at-burnham.org, 2002.
9 # (c) Hilmar Lapp, hlapp at gmx.net, 2003.
10 # (c) GNF, Genomics Institute of the Novartis Research Foundation, 2002.
12 # You may distribute this module under the same terms as perl itself.
13 # Refer to the Perl Artistic License (see the license accompanying this
14 # software package, or see http://www.perl.com/language/misc/Artistic.html)
15 # for the terms under which you may use, modify, and redistribute this module.
17 # THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
18 # WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
19 # MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
21 # You may distribute this module under the same terms as perl itself
23 # POD documentation - main docs before the code
27 Bio::OntologyIO::dagflat - a base class parser for GO flat-file type formats
33 # do not use directly -- use via Bio::OntologyIO
34 # e.g., the GO parser is a simple extension of this class
35 my $parser = Bio::OntologyIO->new
37 -defs_file => "/home/czmasek/GO/GO.defs",
38 -files => ["/home/czmasek/GO/component.ontology",
39 "/home/czmasek/GO/function.ontology",
40 "/home/czmasek/GO/process.ontology"] );
42 my $go_ontology = $parser->next_ontology();
44 my $IS_A = Bio::Ontology::RelationshipType->get_instance( "IS_A" );
45 my $PART_OF = Bio::Ontology::RelationshipType->get_instance( "PART_OF" );
46 my $RELATED_TO = Bio::Ontology::RelationshipType->get_instance( "RELATED_TO" );
50 Needs Graph.pm from CPAN.
56 User feedback is an integral part of the evolution of this and other
57 Bioperl modules. Send your comments and suggestions preferably to the
58 Bioperl mailing lists Your participation is much appreciated.
60 bioperl-l@bioperl.org - General discussion
61 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
65 Please direct usage questions or support issues to the mailing list:
67 I<bioperl-l@bioperl.org>
69 rather than to the module maintainer directly. Many experienced and
70 reponsive experts will be able look at the problem and quickly
71 address it. Please include a thorough description of the problem
72 with code and data examples if at all possible.
76 Report bugs to the Bioperl bug tracking system to help us keep track
77 the bugs and their resolution. Bug reports can be submitted via the
80 https://github.com/bioperl/bioperl-live/issues
86 Email: czmasek-at-burnham.org or cmzmasek@yahoo.com
88 WWW: http://monochrome-effect.net/
92 Genomics Institute of the Novartis Research Foundation
93 10675 John Jay Hopkins Drive
98 Hilmar Lapp, hlapp at gmx.net
102 The rest of the documentation details each of the object
103 methods. Internal methods are usually preceded with a _
108 # Let the code begin...
111 package Bio
::OntologyIO
::dagflat
;
116 use Bio
::Ontology
::OBOEngine
;
117 use Bio
::Ontology
::Ontology
;
118 use Bio
::Ontology
::OntologyStore
;
119 use Bio
::Ontology
::TermFactory
;
120 use Bio
::Annotation
::DBLink
;
122 use constant TRUE
=> 1;
123 use constant FALSE
=> 0;
126 use base
qw(Bio::OntologyIO);
132 Usage : $parser = Bio::OntologyIO->new(
134 -defs_file => "/path/to/GO.defs",
135 -files => ["/path/to/component.ontology",
136 "/path/to/function.ontology",
137 "/path/to/process.ontology"] );
138 Function: Creates a new dagflat parser.
139 Returns : A new dagflat parser object, implementing Bio::OntologyIO.
140 Args : -defs_file => the name of the file holding the term
142 -files => a single ontology flat file holding the
143 term relationships, or an array ref holding
144 the file names (for GO, there will usually be
145 3 files: component.ontology, function.ontology,
147 -file => if there is only a single flat file, it may
148 also be specified via the -file parameter
149 -ontology_name => the name of the ontology; if not specified the
150 parser will auto-discover it by using the term
151 that starts with a $, and converting underscores
153 -engine => the Bio::Ontology::OntologyEngineI object
154 to be reused (will be created otherwise); note
155 that every Bio::Ontology::OntologyI will
156 qualify as well since that one inherits from the
159 See L<Bio::OntologyIO>.
163 # in reality, we let OntologyIO::new do the instantiation, and override
164 # _initialize for all initialization work
166 my ($self, %arg) = @_;
168 my ( $defs_file_name,$files,$defs_url,$url,$name,$eng ) =
169 $self->_rearrange([qw( DEFS_FILE
178 delete($arg{-url
}); #b/c GO has 3 files...
180 $self->SUPER::_initialize
( %arg );
182 $self->_done( FALSE
);
183 $self->_not_first_record( FALSE
);
185 delete $self->{'_ontologies'};
187 # ontology engine (and possibly name if it's an OntologyI)
188 $eng = Bio
::Ontology
::OBOEngine
->new() unless $eng;
189 if($eng->isa("Bio::Ontology::OntologyI")) {
190 $self->ontology_name($eng->name());
191 $eng = $eng->engine() if $eng->can('engine');
193 $self->_ont_engine($eng);
195 # flat files to parse
196 if(defined($defs_file_name) && defined($defs_url)){
197 $self->throw('cannot provide both -defs_file and -defs_url');
199 defined($defs_file_name) && $self->defs_file( $defs_file_name );
200 defined($defs_url) && $self->defs_url( $defs_url );
203 if(defined($files) && defined($url)){
204 } elsif(defined($files)){
205 $self->{_flat_files
} = $files ?
ref($files) ?
$files : [$files] : [];
206 } elsif(defined($url)){
210 # ontology name (overrides implicit one through OntologyI engine)
211 $self->ontology_name($name) if $name;
217 Title : ontology_name
218 Usage : $obj->ontology_name($newval)
219 Function: Get/set the name of the ontology parsed by this module.
221 Returns : value of ontology_name (a scalar)
222 Args : on set, new value (a scalar or undef, optional)
230 return $self->{'ontology_name'} = shift if @_;
231 return $self->{'ontology_name'};
238 Usage : $parser->parse();
239 Function: Parses the files set with "new" or with methods
240 defs_file and _flat_files.
242 Normally you should not need to call this method as it will
243 be called automatically upon the first call to
246 Returns : [Bio::Ontology::OntologyEngineI]
255 # setup the default term factory if not done by anyone yet
256 $self->term_factory(Bio
::Ontology
::TermFactory
->new(
257 -type
=> "Bio::Ontology::Term"))
258 unless $self->term_factory();
260 # create the ontology object itself
261 my $ont = Bio
::Ontology
::Ontology
->new(-name
=> $self->ontology_name(),
262 -engine
=> $self->_ont_engine());
265 while( my $term = $self->_next_term() ) {
266 $self->_add_term( $term, $ont );
269 # set up the ontology of the relationship types
270 foreach ($self->_part_of_relationship(), $self->_is_a_relationship(), $self->_related_to_relationship()) {
274 # pre-seed the IO system with the first flat file if -file wasn't provided
277 if(ref($self->url) eq 'ARRAY'){
279 foreach my $url (@
{ $self->url }){
282 #warn scalar($ont->get_all_terms());
283 $self->_initialize_io(-url
=> $url);
284 $self->_parse_flat_file($ont);
288 $self->_initialize_io(-url
=> $self->url);
290 } elsif($self->_flat_files){
291 $self->_initialize_io(-file
=> shift(@
{$self->_flat_files()}));
296 $self->_parse_flat_file($ont);
297 # advance to next flat file if more are available
298 if(@
{$self->_flat_files()}) {
300 $self->_initialize_io(-file
=> shift(@
{$self->_flat_files()}));
302 last; # nothing else to parse so terminate the loop
305 $self->_add_ontology($ont);
307 # not needed anywhere, only because of backward compatibility
308 return $self->_ont_engine();
313 Title : next_ontology
315 Function: Get the next available ontology from the parser. This is the
316 method prescribed by Bio::OntologyIO.
318 Returns : An object implementing Bio::Ontology::OntologyI, and undef if
319 there is no more ontology in the input.
328 # parse if not done already
329 $self->parse() unless exists($self->{'_ontologies'});
330 # return next available ontology
331 if(exists($self->{'_ontologies'})){
332 my $ont = shift (@
{$self->{'_ontologies'}});
334 my $store = Bio
::Ontology
::OntologyStore
->new();
335 $store->register_ontology($ont);
345 Usage : $parser->defs_file( "GO.defs" );
346 Function: Set/get for the term definitions filename.
347 Returns : The term definitions file name [string].
348 Args : On set, the term definitions file name [string] (optional).
357 $self->{ "_defs_file_name" } = $f;
358 $self->_defs_io->close() if $self->_defs_io();
360 $self->_defs_io( Bio
::Root
::IO
->new( -input
=> $f ) );
363 return $self->{ "_defs_file_name" };
370 $self->{'_defs_url'} = $val;
372 $self->_defs_io->close() if $self->_defs_io();
373 $self->_defs_io( Bio
::Root
::IO
->new( -url
=> $val ) );
375 return $self->{'_defs_url'};
382 $self->{'_url'} = $val;
384 return $self->{'_url'};
391 Function: Closes this ontology stream and associated file handles.
393 Clients should call this method especially when they write
396 We need to override this here in order to close the file
397 handle for the term definitions file.
409 # first call the inherited implementation
410 $self->SUPER::close();
411 # then close the defs file io (if there is one)
412 $self->_defs_io->close() if $self->_defs_io();
418 Usage : $files_to_parse = $parser->_flat_files();
419 Function: Get the array of ontology flat files that need to be parsed.
421 Note that this array will decrease in elements over the
422 parsing process. Therefore, it\'s value outside of this
423 module will be limited. Also, be careful not to alter the
424 array unless you know what you are doing.
426 Returns : a reference to an array of zero or more strings
434 $self->{_flat_files
} = [] unless exists($self->{_flat_files
});
435 return $self->{_flat_files
};
445 Usage : $obj->_defs_io($newval)
446 Function: Get/set the Bio::Root::IO instance representing the
447 definition file, if provided (see defs_file()).
449 Returns : value of _defs_io (a Bio::Root::IO object)
450 Args : on set, new value (a Bio::Root::IO object or undef, optional)
457 return $self->{'_defs_io'} = shift if @_;
458 return $self->{'_defs_io'};
463 $self->{'_ontologies'} = [] unless exists($self->{'_ontologies'});
464 foreach my $ont (@_) {
465 $self->throw(ref($ont)." does not implement Bio::Ontology::OntologyI")
466 unless ref($ont) && $ont->isa("Bio::Ontology::OntologyI");
467 # the ontology name may have been auto-discovered while parsing
469 $ont->name($self->ontology_name) unless $ont->name();
470 push(@
{$self->{'_ontologies'}}, $ont);
474 # This simply delegates. See SimpleGOEngine.
476 my ( $self, $term, $ont ) = @_;
477 $term->ontology($ont) if $ont && (! $term->ontology);
478 $self->_ont_engine()->add_term( $term );
483 # This simply delegates. See SimpleGOEngine
484 sub _part_of_relationship
{
487 return $self->_ont_engine()->part_of_relationship(@_);
488 } # _part_of_relationship
492 # This simply delegates. See SimpleGOEngine
493 sub _is_a_relationship
{
496 return $self->_ont_engine()->is_a_relationship(@_);
497 } # _is_a_relationship
499 # This simply delegates. See SimpleGOEngine
500 sub _related_to_relationship
{
503 return $self->_ont_engine()->related_to_relationship(@_);
504 } # _is_a_relationship
508 # This simply delegates. See SimpleGOEngine
509 sub _add_relationship
{
510 my ( $self, $parent, $child, $type, $ont ) = @_;
512 # note the triple terminology (subject,predicate,object) corresponds to
513 # (child,type,parent)
514 $self->_ont_engine()->add_relationship( $child, $type, $parent, $ont );
517 } # _add_relationship
520 # This simply delegates. See SimpleGOEngine
524 return $self->_ont_engine()->has_term( @_ );
529 # This parses the relationships files
530 sub _parse_flat_file
{
535 my $prev_spaces = -1;
538 while ( my $line = $self->_readline() ) {
540 if ( $line =~ /^!/ ) {
544 # split into term specifications
545 my @termspecs = split(/ (?=[%<])/, $line);
546 # the first element is whitespace only
547 shift(@termspecs) if $termspecs[0] =~ /^\s*$/;
549 # parse out the focus term
550 my $current_term = $self->_get_first_termid( $termspecs[0] );
551 my @syns = $self->_get_synonyms( $termspecs[0] );
552 my @sec_go_ids = $self->_get_secondary_termids( $termspecs[0] );
553 my @cross = $self->_get_db_cross_refs( $termspecs[0] );
555 foreach my $cross_ref (@cross) {
556 $cross_ref eq $current_term && next;
557 push(@cross_refs, $cross_ref);
560 # parse out the parents of the focus term
562 my @isa_parents = ();
563 my @partof_parents = ();
564 foreach my $parent (@termspecs) {
565 if (index($parent, "%") == 0) {
566 push(@isa_parents, $self->_get_first_termid($parent));
567 } elsif (index($parent, "<") == 0) {
568 push(@partof_parents, $self->_get_first_termid($parent));
570 $self->warn("unhandled relationship type in '".$parent."'");
574 if ( ! $self->_has_term( $current_term ) ) {
575 my $term =$self->_create_ont_entry($self->_get_name($line,
578 $self->_add_term( $term, $ont );
581 my $current_term_object = $self->_ont_engine()->get_terms( $current_term );
582 my $anno = $self->_to_annotation(\
@cross_refs);
583 $current_term_object->add_dbxref(-dbxrefs
=> $anno);
584 $current_term_object->add_secondary_id( @sec_go_ids );
585 $current_term_object->add_synonym( @syns );
586 unless ( $line =~ /^\$/ ) {
587 $current_term_object->ontology( $ont );
589 foreach my $parent ( @isa_parents ) {
590 if ( ! $self->_has_term( $parent ) ) {
591 my $term = $self->_create_ont_entry($self->_get_name($line,
594 $self->_add_term( $term, $ont );
597 $self->_add_relationship( $parent,
599 $self->_is_a_relationship(),
603 foreach my $parent ( @partof_parents ) {
604 if ( ! $self->_has_term( $parent ) ) {
605 my $term = $self->_create_ont_entry($self->_get_name($line,
608 $self->_add_term( $term, $ont );
611 $self->_add_relationship( $parent,
613 $self->_part_of_relationship(),
617 my $current_spaces = $self->_count_spaces( $line );
619 if ( $current_spaces != $prev_spaces ) {
621 if ( $current_spaces == $prev_spaces + 1 ) {
622 push( @stack, $prev_term );
623 } elsif ( $current_spaces < $prev_spaces ) {
624 my $n = $prev_spaces - $current_spaces;
625 for ( my $i = 0; $i < $n; ++$i ) {
629 $self->throw( "format error (file ".$self->file.")" );
633 my $parent = $stack[ @stack - 1 ];
635 # add a relationship if the line isn\'t the one with the root term
636 # of the ontology (which is also the name of the ontology)
637 if ( index($line,'$') != 0 ) {
638 #adding @reltype@ syntax
639 if ( $line !~ /^\s*([<%~]|\@\w+?\@)/ ) {
640 $self->throw( "format error (file ".$self->file.") offending line:\n$line" );
643 my($relstring) = $line =~ /^\s*([<%~]|\@[^\@]+?\@)/;
647 if ($relstring eq '<') {
648 $reltype = $self->_part_of_relationship;
649 } elsif ($relstring eq '%') {
650 $reltype = $self->_is_a_relationship;
651 } elsif ($relstring eq '~') {
652 $reltype = $self->_related_to_relationship;
654 $relstring =~ s/\@//g;
655 if ($self->_ont_engine->get_relationship_type($relstring)) {
656 $reltype = $self->_ont_engine->get_relationship_type($relstring);
658 $self->_ont_engine->add_relationship_type($relstring, $ont);
659 $reltype = $self->_ont_engine->get_relationship_type($relstring);
663 #my $reltype = ($line =~ /^\s*</) ?
664 #$self->_part_of_relationship() :
665 #$self->_is_a_relationship();
666 $self->_add_relationship( $parent, $current_term, $reltype, $ont);
669 $prev_spaces = $current_spaces;
670 $prev_term = $current_term;
673 } # _parse_relationships_file
677 # Parses the 1st term id number out of line.
678 sub _get_first_termid
{
679 my ( $self, $line ) = @_;
680 if ( $line =~ /;\s*([A-Z_]{1,8}:\d{1,})/ ) {
681 # if ( $line =~ /;\s*(\w+:\w+)/ ) {
685 $self->throw( "format error: no term id in line \"$line\"" );
688 } # _get_first_termid
692 # Parses the name out of line.
694 my ( $self, $line, $termid ) = @_;
696 if ( $line =~ /([^;<%~]+);\s*$termid/ ) {
698 # remove trailing and leading whitespace
701 $name =~ s/\@.+?\@//;
702 # remove leading dollar character; also we default the name of the
703 # ontology to this name unless it is preset to something else
704 if(index($name,'$') == 0) {
705 $name = substr($name,1);
706 # replace underscores by spaces for setting the ontology name
707 $self->ontology_name(join(" ",split(/_/,$name)))
708 unless $self->ontology_name();
718 # Parses the synonyms out of line.
720 my ( $self, $line ) = @_;
724 while ( $line =~ /synonym\s*:\s*([^;<%~]+)/g ) {
728 push( @synonyms, $syn );
736 # Parses the db cross refs out of line.
737 sub _get_db_cross_refs
{
738 my ( $self, $line ) = @_;
742 while ( $line =~ /;([^;<%~:]+:[^;<%~:]+)/g ) {
744 if ( $ref =~ /synonym/ || $ref =~ /[A-Z]{1,8}:\d{3,}/ ) {
750 $ref = $self->unescape( $ref );
752 push( @refs, $ref ) if defined $ref;
759 # Parses the secondary go ids out of a line
760 sub _get_secondary_termids
{
761 my ( $self, $line ) = @_;
764 # while ( $line =~ /,\s*([A-Z]{1,8}:\d{3,})/g ) {
765 while ( $line =~ /,\s*(\w+:\w+)/g ) {
771 } # _get_secondary_termids
774 # Counts the spaces at the beginning of a line in the relationships files
776 my ( $self, $line ) = @_;
778 if ( $line =~ /^(\s+)/ ) {
787 # "next" method for parsing the defintions file
791 if ( ($self->_done() == TRUE
) || (! $self->_defs_io())) {
797 my $next_term = $self->_term();
803 while( $line = ( $self->_defs_io->_readline() ) ) {
805 || $line =~ /^\s*!/ ) {
808 elsif ( $line =~ /^\s*term:\s*(.+)/ ) {
810 last if $self->_not_first_record();
812 $self->_not_first_record( TRUE
);
814 elsif ( $line =~ /^\s*[a-z]{0,8}id:\s*(.+)/ ) {
817 elsif ( $line =~ /^\s*definition:\s*(.+)/ ) {
818 $def = $self->unescape($1);
819 $isobsolete = 1 if index($def,"OBSOLETE") == 0;
821 elsif ( $line =~ /^\s*definition_reference:\s*(.+)/ ) {
822 push( @def_refs, $self->unescape($1) );
824 elsif ( $line =~ /^\s*comment:\s*(.+)/ ) {
825 $comment = $self->unescape($1);
828 $self->_done( TRUE
) unless $line; # we'll come back until done
829 return $self->_create_ont_entry( $next_term, $termid, $def,
830 $comment, \
@def_refs, $isobsolete);
835 # Holds the GO engine to be parsed into
837 my ( $self, $value ) = @_;
839 if ( defined $value ) {
840 $self->{ "_ont_engine" } = $value;
843 return $self->{ "_ont_engine" };
847 # Used to create ontology terms.
848 # Arguments: name, id
849 sub _create_ont_entry
{
850 my ( $self, $name, $termid, $def, $cmt, $dbxrefs, $obsolete ) = @_;
852 if((!defined($obsolete)) && (index(lc($name),"obsolete") == 0)) {
855 my $anno = $self->_to_annotation($dbxrefs);
856 my $term = $self->term_factory->create_object(-name
=> $name,
857 -identifier
=> $termid,
861 -is_obsolete
=> $obsolete);
864 } # _create_ont_entry
868 # Holds whether first record or not
869 sub _not_first_record
{
870 my ( $self, $value ) = @_;
872 if ( defined $value ) {
873 $self->{ "_not_first_record" } = $value;
876 return $self->{ "_not_first_record" };
877 } # _not_first_record
881 # Holds whether done or not
883 my ( $self, $value ) = @_;
885 if ( defined $value ) {
886 $self->{ "_done" } = $value;
889 return $self->{ "_done" };
895 my ( $self, $value ) = @_;
897 if ( defined $value ) {
898 $self->{ "_term" } = $value;
901 return $self->{ "_term" };
904 # convert simple strings to Bio::Annotation::DBLinks
906 my ($self , $links) = @_;
907 return unless $links;
909 for my $string (@
{$links}) {
910 my ($db, $id) = split(':',$string);
911 push @dbxrefs, Bio
::Annotation
::DBLink
->new(-database
=> $db, -primary_id
=> $id);