4 concatenate_cvterms_into_multi_term_traits.pl
8 This script is very specific to the cassbase ontologies, but could possibly be generalized through more sophisticated recursion.
10 perl concatenate_cvterms_into_multi_term_traits.pl -H localhost -D fixture2 -l 'chebi_compounds|CHEBI:00000[OR]ec_terms|EC:0000000,cass_tissues|CASSTISS:0000000,cass time of day|CASSTIME:0000001,cass number of weeks|CASSTIME:0000005,cass_units|CASSUNIT:0000000' -d CASSFT -c cassava_trait
12 =head1 COMMAND-LINE OPTIONS
16 -l comma-separated list of parent cvterms. Only the first entry may contain several parent cvterms separated by [OR]. All children of each parent term (is_a relationship) will be concatenated together and saved.
17 -d the db name that the new cvterms will be stored under. Must be a new db name in this implementation.
18 -c the cv name that the new cvterms will be stored under.
25 Nicolas Morales (nm529@cornell.edu)
31 The CASS project requires traits to be composed of many separate terms. This script concatenates cvterms into multi-term traits. A list of parent cvterms is given using the -l parameter. The script then finds all child terms that are of 'is_a' relationship to the parent terms given. All the children terms are then linearly combined in the order that the parent terms are in. The cvterms are separated by || in the concatenated string.
33 The new concatenated strings are stored as cvterms, with cv = $opt_c and db = $opt_d
37 TODO: during phenotype spreadsheet upload, make trait validation split the term on || and then validate the individual terms.
39 Example: specifying -l 'chebi_compounds|CHEBI:00000,cass_tissues|CASSTISS:0000000,cass time of day|CASSTIME:0000001' will create many new concatenated terms, one of which would be the concatenation of 'ADP|CHEBI:16761', 'cass leaf|CASSTISS:0000001', and 'cass end of night|CASSTIME:0000002' into 'ADP|CHEBI:16761||cass leaf|CASSTISS:0000001||cass end of night|CASSTIME:0000002'
48 use Bio
::Chado
::Schema
;
49 use CXGN
::DB
::InsertDBH
;
50 use CXGN
::DB
::Connection
;
52 use SGN
::Model
::Cvterm
;
# Command-line handling.
# Package globals, one per single-letter switch; presumably populated by
# Getopt::Std's getopts() (the import is not visible in this excerpt).
55 our ($opt_H, $opt_D, $opt_l, $opt_d, $opt_c);
# Every switch is followed by ':' in the spec, i.e. each one takes a value.
56 getopts
('H:D:l:d:c:');
# All five switches are mandatory. The check is a truth test, so an empty
# string or "0" passed as a value is treated the same as a missing switch.
60 if (!$opt_D || !$opt_H || !$opt_l || !$opt_d || !$opt_c ) {
61 die("Exiting: options missing\nRequires: -D -H -l -d -c");
# Database setup.
# Handle obtained from CXGN::DB::InsertDBH. The constructor call and most of
# its arguments are not visible in this excerpt; the visible part shows that
# AutoCommit is switched on in dbargs.
64 my $dbh = CXGN
::DB
::InsertDBH
68 dbargs
=> {AutoCommit
=> 1,
# Bio::Chado::Schema connection that reuses the handle opened above.
72 my $schema = Bio
::Chado
::Schema
->connect( sub { $dbh->get_actual_dbh() } );
# The db row named by -d is created unconditionally (create, not
# find_or_create), which matches the documented requirement that -d be a new
# db name. NOTE(review): behaviour when the name already exists depends on the
# schema's constraints - confirm.
74 my $db = $schema->resultset("General::Db")->create({name
=>$opt_d});
75 my $db_id = $db->db_id();
# The cv named by -c is looked up first and only created when absent.
76 my $cv = $schema->resultset("Cv::Cv")->find_or_create({name
=>$opt_c});
77 my $cv_id = $cv->cv_id();
# Collect one list of child terms per parent given in -l.
# -l is split on commas into the parent term names.
81 my @parent_trait_names = split /,/, $opt_l;
# The first entry is removed from the list; it alone may hold several parents
# joined by the literal text "[OR]".
83 my $first_element = splice @parent_trait_names, 0, 1;
84 my @first_parent_names = split /\[OR\]/, $first_element;
# NOTE(review): as visible here, every [OR] alternative pushes its own entry
# onto @children_array instead of being merged into a single list - confirm
# that this is intended, since the combination step works by list position.
85 foreach my $i (@first_parent_names) {
88 my $children = get_children
($schema, $i);
89 push (@children_array, $children);
# Remaining parents, in the order they were given.
91 foreach my $j (@parent_trait_names) {
# Special case for the tissue root term: children are gathered from three
# hard-coded sub-nodes (leaf, stem, root) and appended into one list.
93 if ($j eq 'cass_tissues|CASSTISS:0000000') {
94 my @sub_nodes = ('cass leaf|CASSTISS:0000001', 'cass stem|CASSTISS:0000002', 'cass root|CASSTISS:0000003');
95 foreach my $t (@sub_nodes) {
96 my $sub_children = get_children
($schema, $t);
97 push @
$children, @
$sub_children;
# Any other parent (presumably the else branch; the keyword is not visible in
# this excerpt): use the parent's own children.
100 $children = get_children
($schema, $j);
102 push (@children_array, $children);
# Debug output of the collected lists.
105 print Dumper \
@children_array;
# Build every multi-term trait name: the cartesian product of the child lists
# held in @children_array, taken in parent order and joined with '||'.
#
# This used to be exactly five hard-coded nested loops, so the script only
# worked when precisely five child lists were present: fewer lists produced no
# terms (and an undefined-array dereference in the count report), extra lists
# were silently ignored. The loop variables were also lexical $a/$b, which mask
# the special variables used by sort. The product below accepts any number of
# lists and yields the same terms in the same order for the five-list case
# (first list varies slowest, last list varies fastest).
my @concatenated_terms;
if (@children_array) {
    # Seed with the children of the first parent ...
    @concatenated_terms = @{ $children_array[0] || [] };

    # ... then extend every partial name with each child of the next parent.
    foreach my $group_index (1 .. $#children_array) {
        my $group = $children_array[$group_index] || [];
        my @extended;
        foreach my $prefix (@concatenated_terms) {
            foreach my $child (@$group) {
                push @extended, $prefix . '||' . $child;
            }
        }
        @concatenated_terms = @extended;
    }
}

#print Dumper \@concatenated_terms;

# Report the expected number of combinations (product of the list sizes)
# followed by the number actually generated; the two lines should agree.
{
    my $expected_term_count = @children_array ? 1 : 0;
    foreach my $group (@children_array) {
        $expected_term_count *= scalar @{ $group || [] };
    }
    print $expected_term_count."\n";
}
print scalar(@concatenated_terms)."\n";
# Store every concatenated string as a new cvterm.
# NOTE(review): $accession and $count are declared and updated on lines that
# are not visible in this excerpt - confirm their starting values and that
# both are incremented once per term.
137 foreach (@concatenated_terms) {
# Accession is the running number zero-padded to seven digits.
138 my $accession_string = sprintf("%07d",$accession);
# One dbxref per term, under the db created from -d.
139 my $dbxref = $schema->resultset("General::Dbxref")->create({db_id
=>$db_id, accession
=>$accession_string});
140 my $dbxref_id = $dbxref->dbxref_id();
# The cvterm itself: name is the concatenated string ($_), stored in the cv
# from -c and linked to the dbxref just created.
141 my $cvterm = $schema->resultset("Cv::Cvterm")->create({cv_id
=>$cv_id, name
=>$_, dbxref_id
=>$dbxref_id});
# Progress summary goes to STDERR.
146 print STDERR
"Added $count new terms.\n";
149 print STDERR
"Complete.\n";
# Body of the child-term lookup, invoked elsewhere in this file as
# get_children($schema, $name). The sub header, the declaration of @children
# and the return statement are not visible in this excerpt; callers
# dereference the result as an array reference.
# Resolve the parent trait name ($term) to its cvterm_id.
156 my $parent_node_cvterm_id = SGN
::Model
::Cvterm
->get_cvterm_row_from_trait_name($schema, $term)->cvterm_id();
# cvterm_id of the relationship type; presumably 'is_a' is the term name and
# 'relationship' the cv name - confirm against SGN::Model::Cvterm.
157 my $rel_cvterm_id = SGN
::Model
::Cvterm
->get_cvterm_row($schema, 'is_a', 'relationship')->cvterm_id();
# Subjects of all relationships of that type whose object is the parent term.
159 my $children_ref = $schema->resultset("Cv::CvtermRelationship")->search({type_id
=> $rel_cvterm_id, object_id
=> $parent_node_cvterm_id})->search_related('subject');
# Each child is rendered as "name|DB:accession".
# NOTE(review): first() is used on both related lookups, so only the first
# dbxref/db row is consulted, and both lookups run once per child.
161 while (my $child = $children_ref->next() ) {
162 my $dbxref_info = $child->search_related('dbxref');
163 my $accession = $dbxref_info->first()->accession();
164 my $db_info = $dbxref_info->search_related('db');
165 my $db_name = $db_info->first()->name();
166 push @children, $child->name."|".$db_name.":".$accession;