make test pass for multicat parsing with two xlsx files for testing.
[sgn.git] / bin / loading_scripts / load_stock_data.pl
blob3168f68f01d97c20d603be657b1be142697d2cc5
1 #!/usr/bin/perl
3 =head1
5 load_stock_data.pl - a script to load stock data
7 =head1 SYNOPSIS
9 load_stock_data.pl -H [dbhost] -D [dbname] [-t] [-s species name ] [-p stock population name]
11 =head1 COMMAND-LINE OPTIONS
13 -H host name
14 -D database name
15 -i infile
16 -u username for associating the new stocks
17 -s species name - must be in the database. Can also be read from the input file
18 -p population name - will create a new stock of type 'population' if doesn't exist.
19 -t Test run . Rolling back at the end.
21 =head1 DESCRIPTION
23 Updated script for loading and adding stock names and synonyms.
24 The owners of the stock accession are not stored in stockprop, but in phenome.stock_owner.
26 File format for infile (tab delimited):
28 accession genus species population_name synonyms
30 =head1 AUTHOR
32 Naama Menda (nm249@cornell.edu)
34 April 2013
36 =cut
39 use strict;
40 use Getopt::Std;
41 use CXGN::Tools::File::Spreadsheet;
43 use CXGN::Phenome::Schema;
44 use Bio::Chado::Schema;
45 use CXGN::DB::InsertDBH;
46 use Carp qw /croak/ ;
48 use CXGN::Chado::Dbxref;
49 use CXGN::Chado::Phenotype;
50 use CXGN::People::Person;
51 use Try::Tiny;
52 use SGN::Model::Cvterm;
53 use Getopt::Long;
55 my ( $dbhost, $dbname, $file, $population_name, $species, $username, $test );
56 GetOptions(
57 'i=s' => \$file,
58 'p=s' => \$population_name,
59 's=s' => \$species,
60 'u=s' => \$username,
61 't' => \$test,
62 'dbname|D=s' => \$dbname,
63 'dbhost|H=s' => \$dbhost,
67 my $dbh = CXGN::DB::InsertDBH->new( { dbhost=>$dbhost,
68 dbname=>$dbname,
69 dbargs => {AutoCommit => 1,
70 RaiseError => 1}
73 my $schema= Bio::Chado::Schema->connect( sub { $dbh->get_actual_dbh() } , { on_connect_do => ['SET search_path TO public;'] }
75 my $phenome_schema= CXGN::Phenome::Schema->connect( sub { $dbh->get_actual_dbh } , { on_connect_do => ['set search_path to public,phenome;'] } );
78 #getting the last database ids for resetting at the end in case of rolling back
79 my $last_stockprop_id= $schema->resultset('Stock::Stockprop')->get_column('stockprop_id')->max;
80 my $last_stock_id= $schema->resultset('Stock::Stock')->get_column('stock_id')->max;
81 my $last_stockrel_id= $schema->resultset('Stock::StockRelationship')->get_column('stock_relationship_id')->max;
82 my $last_cvterm_id= $schema->resultset('Cv::Cvterm')->get_column('cvterm_id')->max;
83 my $last_cv_id= $schema->resultset('Cv::Cv')->get_column('cv_id')->max;
84 my $last_db_id= $schema->resultset('General::Db')->get_column('db_id')->max;
85 my $last_dbxref_id= $schema->resultset('General::Dbxref')->get_column('dbxref_id')->max;
88 my %seq = (
89 'db_db_id_seq' => $last_db_id,
90 'dbxref_dbxref_id_seq' => $last_dbxref_id,
91 'cv_cv_id_seq' => $last_cv_id,
92 'cvterm_cvterm_id_seq' => $last_cvterm_id,
93 'stock_stock_id_seq' => $last_stock_id,
94 'stockprop_stockprop_id_seq' => $last_stockprop_id,
95 'stock_relationship_stock_relationship_id_seq' => $last_stockrel_id,
98 #new spreadsheet
99 my $spreadsheet=CXGN::Tools::File::Spreadsheet->new($file);
101 ##############
102 ##parse first the file with the clone names and synonyms. Load into stock, and stockprop
103 #############
104 # population for grouping the clones
107 my $sp_person_id = CXGN::People::Person->get_person_by_username($dbh, $username);
108 die "Need to have a user pre-loaded in the database! " if !$sp_person_id;
110 my $organism_id ;
112 if ($species) { ## can also read species name from the input file
113 my $organism = $schema->resultset("Organism::Organism")->find( {
114 species => $species } );
115 $organism_id = $organism->organism_id();
116 die "Species $species does not exist in the database! " if !$organism_id;
117 } #check this again if species name is provided in the file
119 my $stock_rs = $schema->resultset("Stock::Stock");
121 #the cvterm for the population
122 print "Finding/creating cvterm for population\n";
123 my $population_cvterm = SGN::Model::Cvterm->get_cvterm_row($schema, 'population', 'stock_type');
127 #the cvterm for the accession
128 my $accession_cvterm = SGN::Model::Cvterm->get_cvterm_row($schema, 'accession', 'stock_type');
130 #the cvterm for the relationship type
131 my $member_of = SGN::Model::Cvterm->get_cvterm_row($schema, 'member_of', 'stock_relationship');
133 ## For the stock module
134 ################################
136 print "parsing spreadsheet... \n";
137 my @rows = $spreadsheet->row_labels();
138 my @columns = $spreadsheet->column_labels();
140 my $syn_count;
141 #accession genus species population_name synonyms
142 my $coderef= sub {
143 foreach my $accession (@rows ) {
144 #remove spaces from accession name
145 $accession=~s/\s+//g;
147 my $species_name = $spreadsheet->value_at($accession, "species");
148 if (!$species) {
149 my $organism = $schema->resultset("Organism::Organism")->find( {
150 species => $species_name } );
151 $organism_id = $organism->organism_id();
152 die "Species $species_name does not exist in the database! " if !$organism_id;
155 my $population_name = $spreadsheet->value_at($accession, "population_name");
156 my $synonym_string = $spreadsheet->value_at($accession, "synonyms");
157 my @synonyms = split /\|/ , $synonym_string;
158 # see if a stock exists with any of the synonyms
159 #my @stocks = $stock_rs->search( {
160 # -or => [
161 # uniquename => $accession,
162 # uniquename => $iita_clone_name,
163 # uniquename => $syn1,
164 # uniquename => $syn2,
165 # uniquename => $syn3,
166 # uniquename => $syn4,
167 # uniquename => $syn5,
168 # uniquename => $icass,
169 # ], }, );
172 print "Creating a stock for population $population_name (cvterm = " . $population_cvterm->name . ")\n";
173 my $population = $stock_rs->find_or_create(
175 'me.name' => $population_name,
176 'me.uniquename' => $population_name,
177 'me.organism_id' => $organism_id,
178 type_id => $population_cvterm->cvterm_id,
180 { join => 'type' }
183 print "Find or create stock for accesssion $accession\n";
184 my $stock = $schema->resultset("Stock::Stock")->find_or_create(
185 { organism_id => $organism_id,
186 name => $accession,
187 uniquename => $accession,
188 type_id => $accession_cvterm->cvterm_id(),
190 my $stock_id = $stock->stock_id;
191 print "Adding owner $sp_person_id \n";
192 #add the owner for this stock
193 $phenome_schema->resultset("StockOwner")->find_or_create(
195 stock_id => $stock->stock_id,
196 sp_person_id => $sp_person_id,
198 #####################
200 #the stock belongs to the population:
201 #add new stock_relationship
202 print "Accession $accession is member_of population $population_name \n";
203 $population->find_or_create_related('stock_relationship_objects', {
204 type_id => $member_of->cvterm_id(),
205 subject_id => $stock->stock_id(),
206 } );
207 if ($synonym_string) {print "Adding synonyms #" . scalar(@synonyms) . "\n"; }
208 foreach my $syn (@synonyms) {
209 if ($syn && defined($syn) && ($syn ne $accession) ) {
210 my $existing_synonym = $stock->search_related(
211 'stockprops' , {
212 'me.value' => $syn,
213 'type.name' => { ilike => '%synonym%' }
215 { join => 'type' }
216 )->single;
217 if (!$existing_synonym) {
218 $syn_count++;
219 print STDOUT "Adding synonym: $syn \n" ;
220 #add the synonym as a stockprop
221 $stock->create_stockprops({ stock_synonym => $syn},
222 {autocreate => 0,
223 allow_duplicate_values=> 1,
229 #########
231 if ($test) {
232 die "TEST RUN! rolling back\n";
237 try {
238 $schema->txn_do($coderef);
239 if (!$test) { print "Transaction succeeded! Commiting stocks and their properties! \n\n"; }
240 } catch {
241 # Transaction failed
242 foreach my $value ( keys %seq ) {
243 my $maxval= $seq{$value} || 0;
244 if ($maxval) { $dbh->do("SELECT setval ('$value', $maxval, true)") ; }
245 else { $dbh->do("SELECT setval ('$value', 1, false)"); }
247 die "An error occured! Rolling back and reseting database sequences!" . $_ . "\n";