change rules for cluster accessible dirs.
[cxgn-corelibs.git] / lib / CXGN / Tools / InterProGFF3.pm
blob1e7f60b9733f005104e7d813aeb0e0cc4bae44d0
1 package CXGN::Tools::InterProGFF3;
2 use Moose;
3 use Moose::Util::TypeConstraints;
4 use Bio::OntologyIO::InterProParser;
5 use feature 'say';
6 use Data::Dumper;
7 use autodie;
8 use URI::Escape;
9 with 'MooseX::Runnable';
10 with 'MooseX::Getopt';
12 =head1 NAME
14 CXGN::Tools::InterProGFF3 - Convert InterPro XML to GFF3
16 =head1 SYNOPSIS
18 This tool converts InterPro XML to GFF3 so that InterPro domains
19 can be loaded as features into Chado.
21 =head1 DESCRIPTION
23 =head1 MAINTAINER
25 Jonathan "Duke" Leto <jonathan@leto.net>
27 =head1 AUTHOR
29 Jonathan "Duke" Leto <jonathan@leto.net>
31 =head1 COPYRIGHT & LICENSE
33 Copyright 2010 Boyce Thompson Institute for Plant Research
35 This program is free software; you can redistribute it and/or modify
36 it under the same terms as Perl itself.
38 =cut
40 has gff3_preamble => (
41 is => 'ro',
42 isa => 'Str',
43 default => "##gff-version 3
44 ##feature-ontology http://song.cvs.sourceforge.net/*checkout*/song/ontology/sofa.obo?revision=1.220\n",
47 has parent_list => (
48 is => 'rw',
49 isa => 'HashRef',
52 has filename => (
53 is => 'ro',
54 isa => 'Str',
57 has output => (
58 is => 'ro',
59 isa => 'Str',
62 has parser => (
63 is => 'rw',
64 isa => 'Bio::OntologyIO::InterProParser',
67 has ontology => (
68 is => 'rw',
71 has source => (
72 is => 'rw',
73 isa => 'Str',
74 default => 'InterPro Version X',
77 has term_type => (
78 is => 'ro',
79 isa => 'Str',
80 default => 'polypeptide_domain',
83 has gff3 => (
84 is => 'rw',
85 isa => 'Str',
86 default => '',
89 sub BUILDARGS {
90 my $class = shift;
91 my %args = @_;
92 return $class->SUPER::BUILDARGS( %args );
95 sub run {
96 my ($self,%args) = @_;
97 $self->parser( Bio::OntologyIO->new(
98 -format => 'interpro',
99 -file => $self->filename,
101 $self->ontology( $self->parser->next_ontology );
102 $self->gff3( $self->gff3_preamble );
103 $self->generate_parent_list;
104 $self->convert;
105 if ($self->output) {
106 open my $fh, '>', $self->output;
107 print $fh $self->gff3;
108 close $fh;
109 } else {
110 print $self->gff3;
112 #exit code
113 return 0;
116 sub generate_parent_list {
117 my ($self) = @_;
118 my $relations = $self->ontology->{engine}->{_inverted_relationship_store} ;
119 my $parent_list = {};
121 while ( my ($k,$v) = each %$relations ) {
122 $parent_list->{$k} = join(',',grep { $_ =~ m/^IPR/ && $v->{$_}->name eq 'IS_A' } keys %$v);
124 $self->parent_list( $parent_list );
127 sub convert {
128 my ($self) = @_;
129 my @domains = $self->get_domains;
130 for my $domain (@domains) {
132 my (@relations) = $self->ontology->get_relationships($domain);
134 # Find all IS_A relations of this domain, excluding itself
135 # This should include parent terms, but does not. See
136 # generate_parent_list for how parents are found
137 my @isa_relations = grep {
138 $_->predicate_term->name eq 'IS_A' &&
139 $_->object_term->identifier ne $domain->identifier
140 } @relations;
141 my $type = @isa_relations ? $isa_relations[0]->object_term->name : '';
143 $self->gff3( $self->gff3 . $self->make_gff3_line($domain, $type) );
147 sub make_gff3_line {
148 my ($self,$domain, $type) = @_;
149 my $fmt = "%s\t" x 8 . "%s\n";
150 return sprintf $fmt, $domain->identifier,
151 $self->source, $self->term_type,
152 0, 0, qw/. . ./, $self->make_attribute_string($domain, $type);
155 sub escape_gff {
156 my ($self, $data) = @_;
157 return uri_escape ($data, ';=%&,');
160 sub make_attribute_string {
161 my ($self,$domain, $type) = @_;
162 my $fmt = 'ID=%s;Name=%s;Alias=%s;Parent=%s;Note=%s;Dbxref=%s;interpro_type=%s;protein_count=%s';
163 no warnings 'uninitialized';
165 return sprintf $fmt, (
166 $domain->identifier, $self->escape_gff($domain->name),
167 $self->escape_gff($domain->short_name),
168 $self->parent_list()->{$domain->identifier},
169 $self->escape_gff( $domain->definition),
170 join(',', "INTERPRO:" . $domain->identifier, (map { $_->database . ':' . $_->primary_id } $domain->get_members)),
171 $type, $domain->protein_count);
174 sub get_domains {
175 my ($self) = @_;
176 return sort { $b <=> $a } grep { $_->identifier =~ m/^IPR/ } $self->ontology->get_all_terms;
179 __PACKAGE__->meta->make_immutable;
180 no Moose;