make test pass for multicat parsing with two xlsx files for testing.
[sgn.git] / bin / load_stock_banana.pl
blob296560c374c91a1df9576642fe022fb77b810fb4
1 =head1
3 load_stock_data.pl
5 =head1 SYNOPSIS
7 $load_stock_data.pl -H [dbhost] -D [dbname] [-t]
9 =head1 COMMAND-LINE OPTIONS
11 -H host name
12 -D database name
13 -i infile
14 -t Test run . Rolling back at the end.
17 =head2 DESCRIPTION
19 Updated script for loading and adding banana accession names and synonyms.
20 The owners of the stock accession are not stored in stockprop, but in phenome.stock_owner.
22 Naama Menda (nm249@cornell.edu)
23 Guillaume Bauchet (gjb99@cornell.edu)
25 July 2015
29 =cut
32 #!/usr/bin/perl
33 use strict;
34 use Getopt::Std;
35 use CXGN::Tools::File::Spreadsheet;
37 use CXGN::Phenome::Schema;
38 use CXGN::Trial;
39 use Bio::Chado::Schema;
40 use CXGN::DB::InsertDBH;
41 use Carp qw /croak/ ;
43 use CXGN::Chado::Dbxref;
44 use CXGN::Chado::Phenotype;
45 use CXGN::People::Person;
46 use Try::Tiny;
47 use SGN::Model::Cvterm;
49 our ($opt_H, $opt_D, $opt_i, $opt_t);
51 getopts('H:i:tD:');
53 my $dbhost = $opt_H;
54 my $dbname = $opt_D;
55 my $file = $opt_i;
57 my $dbh = CXGN::DB::InsertDBH->new( { dbhost=>$dbhost,
58 dbname=>$dbname,
59 dbargs => {AutoCommit => 0,
60 RaiseError => 1}
63 my $schema= Bio::Chado::Schema->connect( sub { $dbh->get_actual_dbh() } , { on_connect_do => ['SET search_path TO public;'] }
65 my $phenome_schema= CXGN::Phenome::Schema->connect( sub { $dbh->get_actual_dbh } , { on_connect_do => ['set search_path to public,phenome;'] } );
68 #getting the last database ids for resetting at the end in case of rolling back
69 # my $last_stockprop_id= $schema->resultset('Stock::Stockprop')->get_column('stockprop_id')->max;
70 # my $last_stock_id= $schema->resultset('Stock::Stock')->get_column('stock_id')->max;
71 # my $last_stockrel_id= $schema->resultset('Stock::StockRelationship')->get_column('stock_relationship_id')->max;
72 # my $last_cvterm_id= $schema->resultset('Cv::Cvterm')->get_column('cvterm_id')->max;
73 # my $last_cv_id= $schema->resultset('Cv::Cv')->get_column('cv_id')->max;
74 # my $last_db_id= $schema->resultset('General::Db')->get_column('db_id')->max;
75 # my $last_dbxref_id= $schema->resultset('General::Dbxref')->get_column('dbxref_id')->max;
76 # my $last_organism_id = $schema->resultset('Organism::Organism')->get_column('organism_id')->max;
78 # my %seq = (
79 # 'db_db_id_seq' => $last_db_id,
80 # 'dbxref_dbxref_id_seq' => $last_dbxref_id,
81 # 'cv_cv_id_seq' => $last_cv_id,
82 # 'cvterm_cvterm_id_seq' => $last_cvterm_id,
83 # 'stock_stock_id_seq' => $last_stock_id,
84 # 'stockprop_stockprop_id_seq' => $last_stockprop_id,
85 # 'stock_relationship_stock_relationship_id_seq' => $last_stockrel_id,
86 # 'organism_organism_id_seq' => $last_organism_id,
87 # );
89 #new spreadsheet, skip first column
90 my $spreadsheet=CXGN::Tools::File::Spreadsheet->new($file); # 0 or 1
92 ##############
93 ##parse first the file with the clone names and synonyms. Load is into stock, and stockprop
94 #############
95 # population for grouping the clones
97 my $population_name = 'Banana accessions';
99 my $species = 'Musa';
101 my $organism = $schema->resultset("Organism::Organism")->find_or_create( {
102 species => $species } );
103 my $organism_id = $organism->organism_id();
105 my $stock_rs = $schema->resultset("Stock::Stock");
109 #the cvterm for the accession
110 print "Finding/creating cvterm for 'stock type' \n";
111 my $accession_cvterm = SGN::Model::Cvterm->get_cvterm_row($schema, 'accession', 'stock_type');
113 #the cvterm for the relationship type
114 print "Finding/creating cvterm for stock relationship 'member_of' \n";
116 my $member_of = SGN::Model::Cvterm->get_cvterm_row($schema, 'member_of', 'stock_relationship');
118 #the cvterm for the population
119 print "Finding/creating cvterm for population\n";
120 my $population_cvterm = SGN::Model::Cvterm->get_cvterm_row($schema, 'population', 'stock_type');
122 print "Creating a stock for population $population_name (cvterm = " . $population_cvterm->name . ")\n";
124 my $population = $stock_rs->find_or_create(
126 'me.name' => $population_name,
127 'me.uniquename' => $population_name,
128 'me.organism_id' => $organism_id,
129 type_id => $population_cvterm->cvterm_id,
131 { join => 'type' }
134 # link to MGIS database
135 my $mg_dbid;
136 my $mg_row = $schema->resultset("General::Db")->find( { name=>'MGIS' });
137 if (!$mg_row) {
138 my $new_mg_row = $schema->resultset("General::Db")->create( {
139 name => "MGIS"
142 $mg_dbid = $new_mg_row->db_id();
144 else {
145 $mg_dbid = $mg_row->db_id();
151 ## For the stock module
152 ################################
154 print "parsing spreadsheet... \n";
155 my @rows = $spreadsheet->row_labels();
156 my @columns = $spreadsheet->column_labels();
158 ## LABEL GENOTYPE SYNONYM1 SYNONYM2 SYNONYM3 SYNONYM4 PLOIDY GERMPLASM_GROUP CATEGORY INSTITUTE LOCATION LOCATION_CODE CURATOR NAME PROJECT_DESCRIPTION PROJECT_TYPE YEAR
159 # LABEL NAME UNIQUENAME GENOTYPE SYNONYM1 SYNONYM2 PLOIDY GERMPLASM_GROUP CATEGORY INSTITUTE LOCATION LOCATION_CODE CURATOR PROJECT_NAME PROJECT_DESCRIPTION PROJECT_TYPE YEAR
160 # 1 Calcutta-4 Calcutta-4_ITC0249_2015_Collection_IITA_SEN Calcutta-4 ITC0249 2X AA Collection IITA Sendusu SEN mibatte 2015_Collection_IITA_SEN Collection, IITA-Sendusu, 2015 2015_banana_germplasm 2015
163 my ($new_count,$existing, $count, $syn_count, $merge);
164 my $coderef= sub {
165 foreach my $num (@rows ) {
166 my $accession = $spreadsheet->value_at($num,'NAME');
167 if (!$accession) { next; }
168 print "\nBanana accession name is '" . $accession . "'\n";
169 $count++;
170 my $organization = $spreadsheet->value_at($num, 'INSTITUTE');
171 my $location_code = $spreadsheet->value_at($num, 'LOCATION_CODE');
172 my $location = $spreadsheet->value_at($num, 'LOCATION');
173 my $ITC_name = $spreadsheet->value_at($num, "SYNONYM1");
174 #my $original_name = $spreadsheet->value_at($num, "GENOTYPE_NAME_ORIGINAL");
175 my $syn1 = $spreadsheet->value_at($num, "UNIQUENAME");
176 my $syn2 = $spreadsheet->value_at($num, "GENOTYPE");
177 my $syn3 = $spreadsheet->value_at($num, "SYNONYM1");
178 #my $syn4 = $spreadsheet->value_at($num, "SYNONYM4");
179 my $ploidy = $spreadsheet->value_at($num, "PLOIDY");
180 my $germplasm_group = $spreadsheet->value_at($num, "GERMPLASM_GROUP");
181 my $curator = $spreadsheet->value_at($num, "CURATOR");
182 my $name = $spreadsheet->value_at($num, "PROJECT_NAME");
183 my $category = $spreadsheet->value_at($num, "CATEGORY");
184 my $project_type = $spreadsheet->value_at($num, "PROJECT_TYPE");
185 my $year = $spreadsheet->value_at($num, "YEAR");
186 my $project_description = $spreadsheet->value_at($num, "PROJECT_DESCRIPTION");
187 #my $project_description = "$name $project_type ($year) $location";
189 print"project descr is $project_description /n";
191 # see if a stock exists with any of the synonyms
192 my @stocks = $stock_rs->search( {
193 -or => [
194 uniquename => $accession,
195 uniquename => $syn1,
196 uniquename => $syn2,
197 uniquename => $syn3,
198 #uniquename => $syn4,
199 ], }, );
201 my $existing_stock = $stock_rs->search( { uniquename => $accession } )->single;
202 foreach my $s(@stocks) {
203 print "Looking at accession $accession, Found stock '" . $s->uniquename . "(stock_id = " . $s->stock_id . ")'\n";
204 $existing++;
209 if (!@stocks) {
210 print "NEW stock: $accession\n";
211 $new_count++;
212 }elsif (!$existing_stock) {
214 my %stock_hash = map { $_->stock_id => $_ } @stocks;
215 my @keys = keys %stock_hash;
216 my @sorted = sort { $a <=> $b } @keys;
217 print "Existing stock_id is " . $sorted[0] . " name = " . ($stock_hash{$sorted[0]})->uniquename . "\n";
218 $existing_stock = $stock_hash{$sorted[0]};
219 $existing_stock->uniquename($accession);
220 $existing_stock->name($accession);
221 $existing_stock->update;
224 if (scalar(@stocks) >1) {
225 my @stock_names = map( $_->uniquename , @stocks );
226 my @stock_ids = map ($_->stock_id, @stocks);
227 print "MERGE: stocks " . join (", " , @stock_names) . "need to be merged\n";
228 $merge .= "$accession : merge stock_ids : " .join (", " , @stock_ids) . "( names: " . join (" | " , @stock_names) . ")\n";
230 my $stock = $existing_stock ? $existing_stock :
231 $schema->resultset("Stock::Stock")->find_or_create(
232 { organism_id => $organism_id,
233 name => $accession,
234 uniquename => $accession,
235 type_id => $accession_cvterm->cvterm_id(),
236 #description => '',
238 my $stock_id = $stock->stock_id;
239 print "Adding owner $curator\n";
241 my $curator_person_id = CXGN::People::Person->get_person_by_username($dbh, $curator); #add person id as an option.
242 if (!$curator_person_id) { die "Person $curator does not exist in the database! Please add this user before continuing \n";}
243 #add owner for this stock
244 $phenome_schema->resultset("StockOwner")->find_or_create(
246 stock_id => $stock->stock_id,
247 sp_person_id => $curator_person_id,
250 if ($ITC_name) {
251 my $row = $schema -> resultset("General::Dbxref")->find_or_create(
253 accession => $ITC_name, db_id=>$mg_dbid,
255 my $link_row = $schema -> resultset("Stock::StockDbxref")->find_or_create(
256 { stock_id => $stock_id,
257 dbxref_id => $row->dbxref_id(),
259 $link_row->update();
260 print "Creating a MGIS link for accession $accession (stock_id = $stock_id dbxref_id= " . $row->dbxref_id . ")\n";
262 # you store the location in nd_geolocation
263 # link it with each experiment
264 # the nd_experiment table has nd_geolocation_id column
266 # you store a project name for your group of phenotyping /genotyping experiments
267 # call it something like "2015 phenotyping trials in location ABC..."
268 # and link that project with the project_id for the breeding program using project_relationship
269 # if you don't have a project_id for the breeding program you create a new one in the project table
270 # and add to it a projectprop woth type_id of the "breeding_program" cvterm
271 # select * from stock join nd_experiment_stock ON nd_experiment_stock.nd_experiment_stock_id = stock.stock_id join nd_experiment on nd_experiment.nd_experiment_id = nd_experiment_stock.nd_experiment_id join nd_experiment_project on nd_experiment_project.nd_experiment_id = nd_experiment_stock.nd_experiment_id ;
272 my $banana_project = $schema->resultset("Project::Project")->find_or_create(
274 name => "2015_banana_germplasm_IITA_NARO",
275 #name => $name,
276 description => $project_type,
277 } ) ;
279 my $project_description = "$name $project_type ($year) $location";
281 #print"project descr is $project_description /n";
283 my $project = $schema->resultset("Project::Project")->find_or_create(
285 name => $name,
286 description => $project_description,
287 } ) ;
289 #associate the new trial with breeding program
290 my $trial = CXGN::Trial->new({ schema => $schema, trial_id => $project->project_id() });
291 $trial->set_breeding_program($banana_project->project_id);
293 print "banana id = " . $banana_project->project_id . " project_id = " . $project->project_id . "\n";
294 #store the geolocation data and props:
295 my $geo_description = $location;
298 #####################
299 if ($organization) { $stock->create_stockprops( { organization => $organization }, { autocreate => 1 } ); }
300 if ($location_code) { $stock->create_stockprops( { location_code => $location_code }, { autocreate => 1 } ) } ;
301 if ($ploidy) { $stock->create_stockprops( { ploidy_level => $ploidy }, { autocreate => 1 }) };
302 if ($germplasm_group) { $stock->create_stockprops( { genome_structure => $germplasm_group }, { autocreate=>1})};
304 #the stock belongs to the population:
305 #add new stock_relationship
307 $population->find_or_create_related('stock_relationship_objects', {
308 type_id => $member_of->cvterm_id(),
309 subject_id => $stock->stock_id(),
310 } );
311 print "Adding synonyms #\n";
312 my @synonyms = ($accession,$syn2, $syn3);
313 foreach my $syn (@synonyms) {
314 if ($syn && defined($syn) && ($syn ne $accession) ) {
315 my $existing_synonym = $stock->search_related(
316 'stockprops' , {
317 'me.value' => $syn,
318 'type.name' => { ilike => '%synonym' }
320 { join => 'type' }
321 )->single;
322 if (!$existing_synonym) {
323 $syn_count++;
324 print STDOUT "Adding synonym: $syn \n" ;
325 #add the synonym as a stockprop
326 $stock->create_stockprops({ stock_synonym => $syn},
327 {autocreate => 1,
328 #cv_name => 'local', #use default stock_property cv
329 allow_duplicate_values=> 1,
335 my @props = $stock->search_related('stockprops');
336 foreach my $p ( @props ) {
337 print "**the prop value for stock " . $stock->name() . " is " . $p->value() . "\n" if $p;
341 #########
349 print "TOTAL: \n $count rows \n $new_count new accessions \n $existing existing stocks \n $syn_count new synonyms \n MERGE :\n $merge\n";
350 #if ($opt_t) { die "test rolling back";}
354 try {
355 $schema->txn_do($coderef);
356 if (!$opt_t) {
357 print "Transaction succeeded! Commiting stocks and their properties! \n\n";
358 $schema->txn_commit();
360 else {
361 $schema->txn_rollback();
362 die "TEST RUN! rolling back\n";
364 } catch {
365 # Transaction failed
366 # foreach my $value ( keys %seq ) {
367 # my $maxval= $seq{$value} || 0;
368 # if ($maxval) { $dbh->do("SELECT setval ('$value', $maxval, true)") ; }
369 # else { $dbh->do("SELECT setval ('$value', 1, false)"); }
371 die "An error occured! Rolling back and reseting database sequences!" . $_ . "\n";