add cross autocomplete works for vector constructs
[sgn.git] / bin / concatenate_cvterms_into_multiterm_traits.pl
blobc80b0181f9d256e7a813b93765cfee32fba46c18
1 #!/usr/bin/env perl
3 =head1
4 concatenate_cvterms_into_multiterm_traits.pl
6 =head1 SYNOPSIS
8 this script is very specific to cassbase ontologies, but could possibly be generalized through more sophisticated recursion.
10 perl concatenate_cvterms_into_multiterm_traits.pl -H localhost -D fixture2 -l 'chebi_compounds|CHEBI:00000[OR]ec_terms|EC:0000000,cass_tissues|CASSTISS:0000000,cass time of day|CASSTIME:0000001,cass number of weeks|CASSTIME:0000005,cass_units|CASSUNIT:0000000' -d CASSFT -c cassava_trait
12 =head1 COMMAND-LINE OPTIONS
14 -H host name
15 -D database name
16 -l comma separated list of parent cvterms. the first term can take [OR] separated parent cvterms. all children of the parent term (is_a relationship) will be concatenated together and saved.
17 -d the db name that the new cvterms will be stored under. Must be a new db name in this implementation.
18 -c the cv name that the new cvterms will be stored under.
20 =head2 DESCRIPTION
23 =head2 AUTHOR
25 Nicolas Morales (nm529@cornell.edu)
27 April 2014
29 =head2
31 The CASS project requires traits to be composed of many separate terms. This script concatenates cvterms into multi-term traits. A list of parent cvterms is given using the -l parameter. The script then finds all child terms that are of 'is_a' relationship to the parent terms given. All the children terms are then linearly combined in the order that the parent terms are in. The cvterms are separated by || in the concatenated string.
33 The new concatenated strings are stored as cvterms, with cv = $opt_c and db = $opt_d
37 TODO: during phenotype spreadsheet upload, make trait validation split the term on || and then validate the individual terms.
39 Example: specifying -l 'chebi_compounds|CHEBI:00000,cass_tissues|CASSTISS:0000000,cass time of day|CASSTIME:0000001' will create many new concatenated terms, one of which would be the concatenation of 'ADP|CHEBI:16761', 'cass leaf|CASSTISS:0000001', and 'cass end of night|CASSTIME:0000002' into 'ADP|CHEBI:16761||cass leaf|CASSTISS:0000001||cass end of night|CASSTIME:0000002'
41 =cut
43 use strict;
44 use warnings;
46 use lib 'lib';
47 use Getopt::Std;
48 use Bio::Chado::Schema;
49 use CXGN::DB::InsertDBH;
50 use CXGN::DB::Connection;
51 use Data::Dumper;
52 use SGN::Model::Cvterm;
53 use Try::Tiny;
# Command-line options; Getopt::Std fills the $opt_* package variables.
our ($opt_H, $opt_D, $opt_l, $opt_d, $opt_c);
getopts('H:D:l:d:c:');

# All five options are mandatory: stop if any of them is missing or false.
if (grep { !$_ } ($opt_D, $opt_H, $opt_l, $opt_d, $opt_c)) {
    die("Exiting: options missing\nRequires: -D -H -l -d -c");
}

# Database handle for the requested host/database. Every statement is
# committed as it runs (AutoCommit) and DBI errors are raised as exceptions.
my %connection_args = (
    dbname => $opt_D,
    dbhost => $opt_H,
    dbargs => {
        AutoCommit => 1,
        RaiseError => 1,
    },
);
my $dbh = CXGN::DB::InsertDBH->new(\%connection_args);

# The Chado schema reuses the handle opened above instead of reconnecting.
my $schema = Bio::Chado::Schema->connect( sub { $dbh->get_actual_dbh() } );

# The db row is always created (-d must name a db that does not exist yet),
# while the cv row is reused when a cv with that name is already present.
my $db_rs = $schema->resultset("General::Db");
my $db    = $db_rs->create({ name => $opt_d });
my $db_id = $db->db_id();

my $cv_rs = $schema->resultset("Cv::Cv");
my $cv    = $cv_rs->find_or_create({ name => $opt_c });
my $cv_id = $cv->cv_id();
# Running accession number for the new dbxrefs. It is shared by all [OR]
# alternatives so every term stored under the new db gets its own accession.
my $accession = 0;

my @parent_trait_names = split /,/, $opt_l;

# The first entry of -l may hold several [OR]-separated parent terms. Each
# alternative is combined in turn with all of the remaining parent terms.
my $first_element = shift @parent_trait_names;
my @first_parent_names = split /\[OR\]/, $first_element;

foreach my $first_parent (@first_parent_names) {

    # One array ref of child term strings per parent, in -l order.
    my @children_array = ( get_children($schema, $first_parent) );

    foreach my $parent_name (@parent_trait_names) {
        my $children;
        if ($parent_name eq 'cass_tissues|CASSTISS:0000000') {
            # For the tissue ontology the children are collected from under
            # each of these three hard-coded sub nodes, not from the root.
            my @sub_nodes = ('cass leaf|CASSTISS:0000001', 'cass stem|CASSTISS:0000002', 'cass root|CASSTISS:0000003');
            my @tissue_children;
            foreach my $sub_node (@sub_nodes) {
                push @tissue_children, @{ get_children($schema, $sub_node) };
            }
            $children = \@tissue_children;
        }
        else {
            $children = get_children($schema, $parent_name);
        }
        push @children_array, $children;
    }

    print Dumper \@children_array;

    # Build every ordered combination, one term from each list, joined by
    # '||'. The lists are folded in one at a time, so any number of parent
    # terms is supported, and the ordering is the one nested loops would
    # give (first list varies slowest, last list fastest).
    my @concatenated_terms = @{ $children_array[0] };
    foreach my $list_index (1 .. $#children_array) {
        my $next_terms = $children_array[$list_index];
        my @extended_terms;
        foreach my $prefix (@concatenated_terms) {
            foreach my $term (@$next_terms) {
                push @extended_terms, $prefix.'||'.$term;
            }
        }
        @concatenated_terms = @extended_terms;
    }

    #print Dumper \@concatenated_terms;

    # Sanity check: the product of the list sizes should equal the number of
    # combinations printed on the following line.
    my $expected_count = 1;
    foreach my $children (@children_array) {
        $expected_count *= scalar(@$children);
    }
    print $expected_count."\n";
    print scalar(@concatenated_terms)."\n";

    # Store each combination as a cvterm in cv $opt_c, backed by a new dbxref
    # in db $opt_d whose accession is the zero-padded running counter.
    my $count = 0;
    foreach my $term_name (@concatenated_terms) {
        my $accession_string = sprintf("%07d", $accession);
        my $dbxref = $schema->resultset("General::Dbxref")->create({db_id=>$db_id, accession=>$accession_string});
        my $dbxref_id = $dbxref->dbxref_id();
        $schema->resultset("Cv::Cvterm")->create({cv_id=>$cv_id, name=>$term_name, dbxref_id=>$dbxref_id});
        $accession++;
        $count++;
    }

    print STDERR "Added $count new terms.\n";
}

print STDERR "Complete.\n";
# get_children($schema, $term)
#
# Collects every cvterm that is a direct 'is_a' child of the given parent.
#
# Arguments:
#   $schema - schema object used for all lookups
#   $term   - parent trait name, handed unchanged to
#             SGN::Model::Cvterm->get_cvterm_row_from_trait_name
#             (this script passes 'name|DB:accession' strings)
#
# Returns a reference to an array of 'name|DB:accession' strings, one per
# child term. The parent name is also echoed to STDOUT as progress output.
#
# Dies with a descriptive message when the parent term or the 'is_a'
# relationship term cannot be found.
sub get_children {
    my ($schema, $term) = @_;
    print $term."\n";

    # Check both lookups explicitly so a mistyped parent name is reported by
    # name instead of failing on a method call on an undefined value.
    my $parent_row = SGN::Model::Cvterm->get_cvterm_row_from_trait_name($schema, $term)
        or die "Parent cvterm '$term' was not found in the database\n";
    my $is_a_row = SGN::Model::Cvterm->get_cvterm_row($schema, 'is_a', 'relationship')
        or die "The 'is_a' relationship cvterm was not found in the database\n";

    # Relationship rows whose object is the parent; 'subject' is the child.
    my $children_rs = $schema->resultset("Cv::CvtermRelationship")
        ->search({type_id => $is_a_row->cvterm_id(), object_id => $parent_row->cvterm_id()})
        ->search_related('subject');

    my @children;
    while (my $child = $children_rs->next()) {
        my $dbxref_rs = $child->search_related('dbxref');
        my $accession = $dbxref_rs->first()->accession();
        my $db_name   = $dbxref_rs->search_related('db')->first()->name();
        push @children, $child->name."|".$db_name.":".$accession;
    }

    return \@children;
}