bin/concatenate_cvterms_into_multiterm_traits.pl

   1 #!/usr/bin/env perl
   2
   3 =head1
   4 concatenate_cvterms_into_multiterm_traits.pl
   5
   6 =head1 SYNOPSIS
   7
   8     this script is very specific to cassbase ontologies, but could possibly be generalized through more sophisticated recursion.
   9
  10     perl concatenate_cvterms_into_multiterm_traits.pl -H localhost -D fixture2 -l 'chebi_compounds|CHEBI:00000[OR]ec_terms|EC:0000000,cass_tissues|CASSTISS:0000000,cass time of day|CASSTIME:0000001,cass number of weeks|CASSTIME:0000005,cass_units|CASSUNIT:0000000' -d CASSFT -c cassava_trait
  11
  12 =head1 COMMAND-LINE OPTIONS
  13
  14  -H  host name
  15  -D  database name
  16  -l  comma separated list of parent cvterms. the first term can take [OR] separated parent cvterms. all children of the parent term (is_a relationship) will be concatenated together and saved.
  17  -d  the db name that the new cvterms will be stored under. Must be a new db name in this implementation.
  18  -c  the cv name that the new cvterms will be stored under.
  19
  20 =head2 DESCRIPTION
  21
  22
  23 =head2 AUTHOR
  24
  25 Nicolas Morales (nm529@cornell.edu)
  26
  27 April 2014
  28
  29 =head2
  30
  31 The CASS project requires traits to be composed of many separate terms. This script concatenates cvterms into multi-term traits. A list of parent cvterms is given using the -l parameter. The script then finds all child terms that are of 'is_a' relationship to the parent terms given. All the children terms are then linearly combined in the order that the parent terms are in. The cvterms are separated by || in the concatenated string.
  32
  33 The new concatenated strings are stored as cvterms, with cv = $opt_c and db = $opt_d
  34
  35
  36
  37 TODO: during pheno spreadsheet upload, make trait validation split the term on || and then validate the individual terms.
  38
  39 Example: specifying -l 'chebi_compounds|CHEBI:00000,cass_tissues|CASSTISS:0000000,cass time of day|CASSTIME:0000001' will create many new concatenated terms, one of which would be the concatenation of 'ADP|CHEBI:16761', 'cass leaf|CASSTISS:0000001', and 'cass end of night|CASSTIME:0000002' into 'ADP|CHEBI:16761||cass leaf|CASSTISS:0000001||cass end of night|CASSTIME:0000002'
  40
  41 =cut
  42
  43 use strict;
  44 use warnings;
  45
  46 use lib 'lib';
  47 use Getopt::Std;
  48 use Bio::Chado::Schema;
  49 use CXGN::DB::InsertDBH;
  50 use CXGN::DB::Connection;
  51 use Data::Dumper;
  52 use SGN::Model::Cvterm;
  53 use Try::Tiny;
  54
  # Command-line options. Getopt::Std stores each value in the matching
  # $opt_X package variable, so these must stay declared with 'our'.
  our ($opt_H, $opt_D, $opt_l, $opt_d, $opt_c);
  getopts('H:D:l:d:c:');

  # All five options are mandatory.
  unless ($opt_D && $opt_H && $opt_l && $opt_d && $opt_c) {
      die("Exiting: options missing\nRequires: -D -H -l -d -c");
  }

  # Database handle for the requested host/database, with AutoCommit and
  # RaiseError switched on.
  my %connect_args = (
      dbname => $opt_D,
      dbhost => $opt_H,
      dbargs => {
          AutoCommit => 1,
          RaiseError => 1,
      },
  );
  my $dbh = CXGN::DB::InsertDBH->new(\%connect_args);

  # The schema object reuses the handle opened above.
  my $schema = Bio::Chado::Schema->connect(sub { return $dbh->get_actual_dbh() });

  # The db row named by -d is created unconditionally (create, not
  # find_or_create); the cv row named by -c is reused when it already exists.
  my $db = $schema->resultset("General::Db")->create({ name => $opt_d });
  my $db_id = $db->db_id();
  my $cv = $schema->resultset("Cv::Cv")->find_or_create({ name => $opt_c });
  my $cv_id = $cv->cv_id();

  # Running counter from which the accession of each new dbxref is formatted.
  my $accession = 0;
  80
  # Split -l into its first entry, which may list alternative parents joined
  # by the literal text "[OR]", and the remaining single-parent entries.
  my @parent_trait_names = split /,/, $opt_l;
  my $first_element = shift @parent_trait_names;
  my @first_parent_names = split /\[OR\]/, $first_element;

  # Build and store one batch of concatenated terms per alternative first parent.
  foreach my $first_parent (@first_parent_names) {

      # One array ref of child term strings per position in the new term
      # name, in the same order as the parents were given in -l.
      my @children_array = ( get_children($schema, $first_parent) );

      foreach my $parent (@parent_trait_names) {
          my $children;
          if ($parent eq 'cass_tissues|CASSTISS:0000000') {
              # For this parent, the children of three fixed sub-nodes are
              # collected instead of the parent's own direct children.
              my @sub_nodes = ('cass leaf|CASSTISS:0000001', 'cass stem|CASSTISS:0000002', 'cass root|CASSTISS:0000003');
              $children = [ map { @{ get_children($schema, $_) } } @sub_nodes ];
          } else {
              $children = get_children($schema, $parent);
          }
          push @children_array, $children;
      }

      print Dumper \@children_array;

      # Combine one child from every list. This works for any number of
      # parent terms; the previous five hard-coded nested loops produced
      # nothing for fewer than five lists and ignored any list past the fifth.
      my $concatenated_terms = _combine_term_lists(\@children_array);

      # Expected number of combinations, then the number actually built.
      my $expected_total = 1;
      $expected_total *= scalar(@$_) foreach @children_array;
      print $expected_total."\n";
      print scalar(@$concatenated_terms)."\n";

      # Store the batch inside one transaction so that a failure part-way
      # through rolls the whole batch back instead of leaving partial rows
      # behind (the connection runs with AutoCommit on).
      my $count = 0;
      $schema->txn_do(sub {
          foreach my $term_name (@$concatenated_terms) {
              my $accession_string = sprintf("%07d", $accession);
              my $dbxref = $schema->resultset("General::Dbxref")->create({db_id=>$db_id, accession=>$accession_string});
              $schema->resultset("Cv::Cvterm")->create({cv_id=>$cv_id, name=>$term_name, dbxref_id=>$dbxref->dbxref_id()});
              $accession++;
              $count++;
          }
      });

      print STDERR "Added $count new terms.\n";
  }

  print STDERR "Complete.\n";


  # _combine_term_lists(\@lists)
  #
  # Takes an array ref of array refs of strings and returns an array ref
  # holding every combination that picks one string from each list, joined
  # with '||' in list order. The last list varies fastest, which is the order
  # nested loops over the lists would produce. An empty list anywhere yields
  # no combinations.
  sub _combine_term_lists {
      my ($lists) = @_;

      my ($first_list, @remaining_lists) = @$lists;
      my @combined = @{ $first_list || [] };

      foreach my $list (@remaining_lists) {
          my @extended;
          foreach my $prefix (@combined) {
              push @extended, map { $prefix.'||'.$_ } @$list;
          }
          @combined = @extended;
      }

      return \@combined;
  }
 150
 151
 # get_children($schema, $term)
 #
 # Prints $term, resolves it with
 # SGN::Model::Cvterm->get_cvterm_row_from_trait_name, and returns an array
 # ref with one string per cvterm that is the subject of an 'is_a'
 # relationship whose object is that term. Each string has the form
 # "name|DBNAME:accession".
 #
 # Dies with a message naming the term when either lookup returns nothing,
 # rather than failing on a method call against an undefined value.
 sub get_children {
     my ($schema, $term) = @_;
     print $term."\n";

     my $parent_row = SGN::Model::Cvterm->get_cvterm_row_from_trait_name($schema, $term)
         or die "Could not find a cvterm for parent term '$term'\n";
     my $parent_node_cvterm_id = $parent_row->cvterm_id();

     my $is_a_row = SGN::Model::Cvterm->get_cvterm_row($schema, 'is_a', 'relationship')
         or die "Could not find the 'is_a' cvterm in the 'relationship' cv\n";
     my $rel_cvterm_id = $is_a_row->cvterm_id();

     # Relationship rows of type is_a pointing at the parent, followed through
     # to the cvterm on the subject side.
     my $children_rs = $schema->resultset("Cv::CvtermRelationship")
         ->search({type_id => $rel_cvterm_id, object_id => $parent_node_cvterm_id})
         ->search_related('subject');

     my @children;
     while (my $child = $children_rs->next()) {
         my $dbxref_rs = $child->search_related('dbxref');
         # Named so it cannot be confused with the file-level $accession counter.
         my $child_accession = $dbxref_rs->first()->accession();
         my $db_name = $dbxref_rs->search_related('db')->first()->name();
         push @children, $child->name."|".$db_name.":".$child_accession;
     }
     return \@children;
 }