Bio::DB::TFBS namespace has been moved to its own distribution named after itself
[bioperl-live.git] / Bio / OntologyIO / simplehierarchy.pm
blob587f6ef694f42054aa357fb496edc4a7444a6fda
2 # BioPerl module for Bio::OntologyIO::simplehierarchy
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Allen Day, allenday@ucla.edu
8 # (c) Allen Day, allenday@ucla.edu, 2003.
9 # (c) Department of Human Genetics, UCLA Medical School, 2003.
11 # You may distribute this module under the same terms as perl itself.
12 # Refer to the Perl Artistic License (see the license accompanying this
13 # software package, or see http://www.perl.com/language/misc/Artistic.html)
14 # for the terms under which you may use, modify, and redistribute this module.
16 # THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
17 # WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
18 # MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
20 # You may distribute this module under the same terms as perl itself
22 # POD documentation - main docs before the code
24 =head1 NAME
26 Bio::OntologyIO::simplehierarchy - a base class parser for simple hierarchy-by-indentation
27 type formats
29 =head1 SYNOPSIS
31 use Bio::OntologyIO;
33 # do not use directly -- use via Bio::OntologyIO
34 my $parser = Bio::OntologyIO->new
35 ( -format => "simplehierarchy",
36 -file => "pathology_terms.csv",
37 -indent_string => ",",
38 -ontology_name => "eVOC",
39 -term_factory => $fact,
42 my $ontology = $parser->next_ontology();
44 =head1 DESCRIPTION
46 Needs Graph.pm from CPAN. This class is nearly identical to
47 OntologyIO::dagflat, see L<Bio::OntologyIO::dagflat> for details.
49 =head1 FEEDBACK
51 =head2 Mailing Lists
53 User feedback is an integral part of the evolution of this and other
54 Bioperl modules. Send your comments and suggestions preferably to the
55 Bioperl mailing lists Your participation is much appreciated.
57 bioperl-l@bioperl.org - General discussion
58 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
60 =head2 Support
62 Please direct usage questions or support issues to the mailing list:
64 I<bioperl-l@bioperl.org>
66 rather than to the module maintainer directly. Many experienced and
67 reponsive experts will be able look at the problem and quickly
68 address it. Please include a thorough description of the problem
69 with code and data examples if at all possible.
71 =head2 Reporting Bugs
73 Report bugs to the Bioperl bug tracking system to help us keep track
74 the bugs and their resolution. Bug reports can be submitted via the
75 web:
77 https://github.com/bioperl/bioperl-live/issues
79 =head1 AUTHOR
81 Allen Day
83 Email: allenday@ucla.edu
85 =head2 CONTRIBUTOR
87 Christian Zmasek
89 =head1 APPENDIX
91 The rest of the documentation details each of the object
92 methods. Internal methods are usually preceded with a _
94 =cut
97 # Let the code begin...
100 package Bio::OntologyIO::simplehierarchy;
102 use strict;
104 use Data::Dumper;
105 use File::Basename;
106 use Bio::Root::IO;
107 use Bio::Ontology::OBOEngine;
108 use Bio::Ontology::Ontology;
109 use Bio::Ontology::TermFactory;
111 use constant TRUE => 1;
112 use constant FALSE => 0;
115 use base qw(Bio::OntologyIO);
118 =head2 new
120 Title : new
121 Usage : see SYNOPSIS
122 Function: Creates a new simplehierarchy parser.
123 Returns : A new simplehierarchy parser object, implementing Bio::OntologyIO.
124 Args : -files => a single ontology flat file holding the
125 term relationships, or an array ref holding
126 the file names
127 -file => if there is only a single flat file, it may
128 also be specified via the -file parameter
129 -ontology_name => the name of the ontology, defaults to
130 "Gene Ontology"
131 -file_is_root => Boolean indicating whether a virtual root
132 term is to be added, the name of which will
133 be derived from the file name. Default is false.
134 Enabling this allows one to parse multiple input
135 files into the same ontology and still have
136 separately rooted.
137 -engine => the L<Bio::Ontology::OntologyEngineI> object
138 to be reused (will be created otherwise); note
139 that every L<Bio::Ontology::OntologyI> will
140 qualify as well since that one inherits from the
141 former.
142 -indent_string => the string used to indent hierarchical
143 levels in the file.
145 For a file like this:
147 term0
148 subterm1A
149 subterm2A
150 subterm1B
151 subterm1C
153 indent_string would be " ". Defaults to
154 one space (" ").
155 -comment_char => Allows specification of a regular
156 expression string to indicate a comment line.
157 Currently defaults to "[\|\-]".
158 Note: this is not yet implemented.
160 See L<Bio::OntologyIO>.
162 =cut
164 # in reality, we let OntologyIO::new do the instantiation, and override
165 # _initialize for all initialization work
166 sub _initialize {
167 my ($self, @args) = @_;
168 $self->SUPER::_initialize( @args );
170 my ( $indent,$files,$fileisroot,$name,$eng ) =
171 $self->_rearrange([qw(INDENT_STRING
172 FILES
173 FILE_IS_ROOT
174 ONTOLOGY_NAME
175 ENGINE)
176 ], @args);
178 $self->_done( FALSE );
179 $self->_not_first_record( FALSE );
180 $self->_term( "" );
181 $self->file_is_root($fileisroot) if defined($fileisroot);
182 $indent = ' ' unless defined($indent); #reasonable default?
183 # the indentation string may have escaped chars
184 if (($indent =~ /\\/) && ($indent !~ /[\$\`]/)) {
185 $indent = "\$indent = \"$indent\"";
186 eval $indent;
188 $self->indent_string($indent);
189 delete $self->{'_ontologies'};
191 # ontology engine (and possibly name if it's an OntologyI)
192 $eng = Bio::Ontology::OBOEngine->new() unless $eng;
193 if($eng->isa("Bio::Ontology::OntologyI")) {
194 $self->ontology_name($eng->name());
195 $eng = $eng->engine() if $eng->can('engine');
197 $self->_ont_engine($eng);
199 # flat files to parse
200 $self->{_flat_files} = $files ? ref($files) ? $files : [$files] : [];
202 # ontology name (overrides implicit one through OntologyI engine)
203 $self->ontology_name($name) if $name;
205 } # _initialize
207 =head2 ontology_name
209 Title : ontology_name
210 Usage : $obj->ontology_name($newval)
211 Function: Get/set the name of the ontology parsed by this module.
212 Example :
213 Returns : value of ontology_name (a scalar)
214 Args : on set, new value (a scalar or undef, optional)
216 =cut
218 sub ontology_name{
219 my $self = shift;
221 return $self->{'ontology_name'} = shift if @_;
222 return $self->{'ontology_name'};
226 =head2 parse
228 Title : parse()
229 Usage : $parser->parse();
230 Function: Parses the files set with "new" or with methods
231 defs_file and _flat_files.
233 Normally you should not need to call this method as it will
234 be called automatically upon the first call to
235 next_ontology().
237 Returns : [Bio::Ontology::OntologyEngineI]
238 Args :
240 =cut
242 sub parse {
243 my $self = shift;
245 # setup the default term factory if not done by anyone yet
246 $self->term_factory(Bio::Ontology::TermFactory->new(
247 -type => "Bio::Ontology::Term"))
248 unless $self->term_factory();
250 # create the ontology object itself
251 my $ont = Bio::Ontology::Ontology->new(-name => $self->ontology_name(),
252 -engine => $self->_ont_engine());
254 # set up the ontology of the relationship types
255 foreach ($self->_part_of_relationship(),
256 $self->_is_a_relationship(),
257 $self->_related_to_relationship()) {
258 $_->ontology($ont);
261 # pre-seed the IO system with the first flat file if -file wasn't provided
262 if(! $self->_fh) {
263 $self->_initialize_io(-file => shift(@{$self->_flat_files()}));
266 while($self->_fh) {
267 $self->_parse_flat_file($ont);
268 # advance to next flat file if more are available
269 if(@{$self->_flat_files()}) {
270 $self->close();
271 # reset the virtual root so that the next one is generated from
272 # the next file
273 $self->_virtual_root(undef);
274 # now re-initialize the IO object
275 $self->_initialize_io(-file => shift(@{$self->_flat_files()}));
276 } else {
277 last; # nothing else to parse so terminate the loop
280 $self->_add_ontology($ont);
281 # not needed anywhere, only because of backward compatibility
282 return $self->_ont_engine();
283 } # parse
285 =head2 next_ontology
287 Title : next_ontology
288 Usage :
289 Function: Get the next available ontology from the parser. This is the
290 method prescribed by Bio::OntologyIO.
291 Example :
292 Returns : An object implementing Bio::Ontology::OntologyI, and undef if
293 there is no more ontology in the input.
294 Args :
296 =cut
298 sub next_ontology{
299 my $self = shift;
301 # parse if not done already
302 $self->parse() unless exists($self->{'_ontologies'});
303 # return next available ontology
304 return shift(@{$self->{'_ontologies'}}) if exists($self->{'_ontologies'});
305 return;
308 =head2 _flat_files
310 Title : _flat_files
311 Usage : $files_to_parse = $parser->_flat_files();
312 Function: Get the array of ontology flat files that need to be parsed.
314 Note that this array will decrease in elements over the
315 parsing process. Therefore, it\'s value outside of this
316 module will be limited. Also, be careful not to alter the
317 array unless you know what you are doing.
319 Returns : a reference to an array of zero or more strings
320 Args : none
322 =cut
324 sub _flat_files {
325 my $self = shift;
327 $self->{_flat_files} = [] unless exists($self->{_flat_files});
328 return $self->{_flat_files};
332 # INTERNAL METHODS
333 # ----------------
335 =head2 _defs_io
337 Title : _defs_io
338 Usage : $obj->_defs_io($newval)
339 Function: Get/set the Bio::Root::IO instance representing the
340 definition file, if provided (see defs_file()).
341 Example :
342 Returns : value of _defs_io (a Bio::Root::IO object)
343 Args : on set, new value (a Bio::Root::IO object or undef, optional)
345 =cut
347 sub _defs_io{
348 my $self = shift;
350 return $self->{'_defs_io'} = shift if @_;
351 return $self->{'_defs_io'};
354 sub _add_ontology {
355 my $self = shift;
356 $self->{'_ontologies'} = [] unless exists($self->{'_ontologies'});
357 foreach my $ont (@_) {
358 $self->throw(ref($ont)." does not implement Bio::Ontology::OntologyI")
359 unless ref($ont) && $ont->isa("Bio::Ontology::OntologyI");
360 push(@{$self->{'_ontologies'}}, $ont);
364 # This simply delegates. See SimpleGOEngine.
365 sub _add_term {
366 my ( $self, $term, $ont ) = @_;
368 $term->ontology($ont) if $ont && (! $term->ontology);
369 $self->_ont_engine()->add_term( $term );
372 } # _add_term
374 # This simply delegates. See SimpleGOEngine
375 sub _part_of_relationship {
376 my ( $self, $term ) = @_;
378 return $self->_ont_engine()->part_of_relationship();
380 } # _part_of_relationship
384 # This simply delegates. See SimpleGOEngine
385 sub _is_a_relationship {
386 my ( $self, $term ) = @_;
388 return $self->_ont_engine()->is_a_relationship();
390 } # _is_a_relationship
393 # This simply delegates. See SimpleGOEngine
394 sub _related_to_relationship {
395 my ( $self, $term ) = @_;
397 return $self->_ont_engine()->related_to_relationship();
399 } # _is_a_relationship
402 # This simply delegates. See SimpleGOEngine
403 sub _add_relationship {
404 my ( $self, $parent, $child, $type, $ont ) = @_;
406 # note the triple terminology (subject,predicate,object) corresponds to
407 # (child,type,parent)
408 $self->_ont_engine()->add_relationship( $child, $type, $parent, $ont );
411 } # _add_relationship
414 # This simply delegates. See SimpleGOEngine
415 sub _has_term {
416 my ( $self, $term ) = @_;
418 $term = $self->ontology_name() .'|'. $term
419 unless ref($term) || !$self->ontology_name();
420 return $self->_ont_engine()->has_term( $term );
422 } # _add_term
424 # This simply delegates after prefixing the namespace name if it is just a
425 # base identifier. See SimpleGOEngine
426 sub _get_terms{
427 my $self = shift;
428 my @args = ();
430 while(@_) {
431 unshift(@args, pop(@_)); # this actually does preserve the order
432 $args[0] = $self->ontology_name() .'|'. $args[0]
433 unless ref($args[0]) || !$self->ontology_name();
435 return $self->_ont_engine->get_terms(@args);
439 # This parses the relationships files
440 sub _parse_flat_file {
441 my $self = shift;
442 my $ont = shift;
444 my @stack = ();
445 my $prev_indent = -1;
446 my $parent = "";
447 my $prev_term = "";
449 my $indent_string = $self->indent_string;
452 while ( my $line = $self->_readline() ) {
453 if ( $line =~ /^[$indent_string]*[\|\-]/ ) { #this is not yet generalized
454 next;
457 my ($current_term) = $line =~ /^[$indent_string]*(.*)/;
458 my $current_indent = $self->_count_indents( $line );
459 chomp $current_term;
460 # remove extraneous delimiter characters at the end of the name if any
461 $current_term =~ s/[$indent_string]+$//;
462 # remove double quotes surrounding the entry, if any
463 $current_term =~ s/^\"(.*)\"$/$1/;
464 # also, the name might contain a synonym
465 my $syn = $current_term =~ s/\s+{([^}]+)}// ? $1 : undef;
467 if ( ! $self->_has_term( $current_term ) ) {
468 my $term = $self->_create_ont_entry($current_term);
469 # add synonym(s) if any
470 $term->add_synonym(split(/[;,]\s*/,$syn)) if $syn;
471 # add to the machine
472 $self->_add_term( $term, $ont );
474 #go on to the next term if a root node.
475 if($current_indent == 0) {
476 # add the virtual root as parent if there is one
477 if($self->_virtual_root()) {
478 $self->_add_relationship($self->_virtual_root(),
479 $term,
480 $self->_is_a_relationship(),
481 $ont);
483 $prev_indent = $current_indent;
484 $prev_term = $current_term;
485 push @stack, $current_term;
486 next;
490 # note: we are ensured to see the parent first in this type of file,
491 # so we never need to possibly insert the parent here
493 if ( $current_indent != $prev_indent ) {
494 if ( $current_indent == $prev_indent + 1 ) {
495 push( @stack, $prev_term );
496 } elsif ( $current_indent < $prev_indent ) {
497 my $n = $prev_indent - $current_indent;
498 for ( my $i = 0; $i < $n; ++$i ) {
499 pop( @stack );
501 } else {
502 $self->throw("format error: indentation level $current_indent "
503 ."is more than one higher than the previous "
504 ."level $prev_indent ('$current_term', "
505 ."file ".$self->file.")" );
509 $parent = $stack[-1];
511 if($parent ne $current_term) { #this prevents infinite recursion from a parent linking to itself
512 $self->_add_relationship($self->_get_terms($parent),
513 $self->_get_terms($current_term),
514 $self->_is_a_relationship(),
515 $ont);
518 $prev_indent = $current_indent;
519 $prev_term = $current_term;
521 return $ont;
522 } # _parse_relationships_file
526 # Parses the 1st term id number out of line.
527 sub _get_first_termid {
528 my ( $self, $line ) = @_;
530 if ( $line =~ /;\s*([A-Z]{1,8}:\d{7})/ ) {
531 return $1;
533 else {
534 $self->throw( "format error: no term id in line \"$line\"" );
537 } # _get_first_termid
539 # Counts the indents at the beginning of a line in the relationships files
540 sub _count_indents {
541 my ( $self, $line ) = @_;
543 my $indent = $self->indent_string;
545 if ( $line =~ /^($indent+)/ ) {
546 return (length($1)/length($indent));
548 else {
549 return 0;
551 } # _count_indents
554 # Holds the GO engine to be parsed into
555 sub _ont_engine {
556 my ( $self, $value ) = @_;
558 if ( defined $value ) {
559 $self->{ "_ont_engine" } = $value;
562 return $self->{ "_ont_engine" };
563 } # _ont_engine
566 # Used to create ontology terms.
567 # Arguments: name, id
568 sub _create_ont_entry {
569 my ( $self, $name, $termid ) = @_;
571 my $term = $self->term_factory->create_object(-name => $name,
572 -identifier => $termid);
574 return $term;
576 } # _create_ont_entry
579 # Holds whether first record or not
580 sub _not_first_record {
581 my ( $self, $value ) = @_;
583 if ( defined $value ) {
584 unless ( $value == FALSE || $value == TRUE ) {
585 $self->throw( "Argument to method \"_not_first_record\" must be either ".TRUE." or ".FALSE );
587 $self->{ "_not_first_record" } = $value;
590 return $self->{ "_not_first_record" };
591 } # _not_first_record
595 # Holds whether done or not
596 sub _done {
597 my ( $self, $value ) = @_;
599 if ( defined $value ) {
600 unless ( $value == FALSE || $value == TRUE ) {
601 $self->throw( "Found [$value] where [" . TRUE
602 ." or " . FALSE . "] expected" );
604 $self->{ "_done" } = $value;
607 return $self->{ "_done" };
608 } # _done
611 # Holds a term.
612 sub _term {
613 my ( $self, $value ) = @_;
615 if ( defined $value ) {
616 $self->{ "_term" } = $value;
619 return $self->{ "_term" };
620 } # _term
622 =head2 indent_string
624 Title : indent_string
625 Usage : $obj->indent_string($newval)
626 Function:
627 Example :
628 Returns : value of indent_string (a scalar)
629 Args : on set, new value (a scalar or undef, optional)
631 =cut
633 sub indent_string{
634 my $self = shift;
636 return $self->{'indent_string'} = shift if @_;
637 return $self->{'indent_string'};
640 =head2 file_is_root
642 Title : file_is_root
643 Usage : $obj->file_is_root($newval)
644 Function: Boolean indicating whether a virtual root term is to be
645 added, the name of which will be derived from the file
646 name.
648 Enabling this allows one to parse multiple input files into the
649 same ontology and still have separately rooted.
651 Example :
652 Returns : value of file_is_root (a scalar)
653 Args : on set, new value (a scalar or undef, optional)
655 =cut
657 sub file_is_root{
658 my $self = shift;
660 return $self->{'file_is_root'} = shift if @_;
661 return $self->{'file_is_root'};
664 =head2 _virtual_root
666 Title : _virtual_root
667 Usage : $obj->_virtual_root($newval)
668 Function:
669 Example :
670 Returns : value of _virtual_root (a scalar)
671 Args : on set, new value (a scalar or undef, optional)
673 =cut
675 sub _virtual_root{
676 my $self = shift;
678 return $self->{'_virtual_root'} = shift if @_;
680 # don't return anything if not in file_is_root mode, or if we don't
681 # have a file to derive the root node from
682 return unless $self->file_is_root() && $self->file();
684 # construct it if we haven't done this before
685 if(! $self->{'_virtual_root'}) {
686 my ($rt,undef,undef) = fileparse($self->file(), '\..*');
687 $rt =~ s/_/ /g;
688 $rt = $self->_create_ont_entry($rt);
689 $self->_add_term($rt, $self->ontology_name());
690 $self->{'_virtual_root'} = $rt;
693 return $self->{'_virtual_root'};