7 $load_stock_data.pl -H [dbhost] -D [dbname] [-t]
9 =head1 COMMAND-LINE OPTIONS
14 -t Test run . Rolling back at the end.
19 Updated script for loading and adding banana accession names and synonyms.
20 The owners of the stock accession are not stored in stockprop, but in phenome.stock_owner.
22 Naama Menda (nm249@cornell.edu)
23 Guillaume Bauchet (gjb99@cornell.edu)
35 use CXGN
::Tools
::File
::Spreadsheet
;
37 use CXGN
::Phenome
::Schema
;
38 use CXGN
::BreedersToolbox
::Projects
;
39 use Bio
::Chado
::Schema
;
40 use CXGN
::DB
::InsertDBH
;
43 use CXGN
::Chado
::Dbxref
;
44 use CXGN
::Chado
::Phenotype
;
45 use CXGN
::People
::Person
;
48 our ($opt_H, $opt_D, $opt_i, $opt_t);
56 my $dbh = CXGN
::DB
::InsertDBH
->new( { dbhost
=>$dbhost,
58 dbargs
=> {AutoCommit
=> 0,
62 my $schema= Bio
::Chado
::Schema
->connect( sub { $dbh->get_actual_dbh() } , { on_connect_do
=> ['SET search_path TO public;'] }
64 my $phenome_schema= CXGN
::Phenome
::Schema
->connect( sub { $dbh->get_actual_dbh } , { on_connect_do
=> ['set search_path to public,phenome;'] } );
67 #getting the last database ids for resetting at the end in case of rolling back
68 # my $last_stockprop_id= $schema->resultset('Stock::Stockprop')->get_column('stockprop_id')->max;
69 # my $last_stock_id= $schema->resultset('Stock::Stock')->get_column('stock_id')->max;
70 # my $last_stockrel_id= $schema->resultset('Stock::StockRelationship')->get_column('stock_relationship_id')->max;
71 # my $last_cvterm_id= $schema->resultset('Cv::Cvterm')->get_column('cvterm_id')->max;
72 # my $last_cv_id= $schema->resultset('Cv::Cv')->get_column('cv_id')->max;
73 # my $last_db_id= $schema->resultset('General::Db')->get_column('db_id')->max;
74 # my $last_dbxref_id= $schema->resultset('General::Dbxref')->get_column('dbxref_id')->max;
75 # my $last_organism_id = $schema->resultset('Organism::Organism')->get_column('organism_id')->max;
78 # 'db_db_id_seq' => $last_db_id,
79 # 'dbxref_dbxref_id_seq' => $last_dbxref_id,
80 # 'cv_cv_id_seq' => $last_cv_id,
81 # 'cvterm_cvterm_id_seq' => $last_cvterm_id,
82 # 'stock_stock_id_seq' => $last_stock_id,
83 # 'stockprop_stockprop_id_seq' => $last_stockprop_id,
84 # 'stock_relationship_stock_relationship_id_seq' => $last_stockrel_id,
85 # 'organism_organism_id_seq' => $last_organism_id,
88 #new spreadsheet, skip first column
89 my $spreadsheet=CXGN
::Tools
::File
::Spreadsheet
->new($file); # 0 or 1
92 ##parse first the file with the clone names and synonyms. Load is into stock, and stockprop
94 # population for grouping the clones
96 my $population_name = 'Banana accessions';
100 my $organism = $schema->resultset("Organism::Organism")->find_or_create( {
101 species
=> $species } );
102 my $organism_id = $organism->organism_id();
104 my $stock_rs = $schema->resultset("Stock::Stock");
108 #the cvterm for the accession
109 print "Finding/creating cvterm for 'stock type' \n";
110 my $accession_cvterm = $schema->resultset("Cv::Cvterm")->create_with(
111 { name
=> 'accession',
114 dbxref
=> 'accession',
117 #the cvterm for the relationship type
118 print "Finding/creating cvterm for stock relationship 'member_of' \n";
120 my $member_of = $schema->resultset("Cv::Cvterm")->create_with(
121 { name
=> 'member_of',
122 cv
=> 'stock relationship',
124 dbxref
=> 'member_of',
127 #the cvterm for the population
128 print "Finding/creating cvterm for population\n";
129 my $population_cvterm = $schema->resultset("Cv::Cvterm")->create_with(
130 { name
=> 'population',
133 dbxref
=> 'population',
136 print "Creating a stock for population $population_name (cvterm = " . $population_cvterm->name . ")\n";
138 my $population = $stock_rs->find_or_create(
140 'me.name' => $population_name,
141 'me.uniquename' => $population_name,
142 'me.organism_id' => $organism_id,
143 type_id
=> $population_cvterm->cvterm_id,
148 # link to MGIS database
150 my $mg_row = $schema->resultset("General::Db")->find( { name
=>'MGIS' });
152 my $new_mg_row = $schema->resultset("General::Db")->create( {
156 $mg_dbid = $new_mg_row->db_id();
159 $mg_dbid = $mg_row->db_id();
165 ## For the stock module
166 ################################
168 print "parsing spreadsheet... \n";
169 my @rows = $spreadsheet->row_labels();
170 my @columns = $spreadsheet->column_labels();
172 ## LABEL GENOTYPE SYNONYM1 SYNONYM2 SYNONYM3 SYNONYM4 PLOIDY GERMPLASM_GROUP CATEGORY INSTITUTE LOCATION LOCATION_CODE CURATOR NAME PROJECT_DESCRIPTION PROJECT_TYPE YEAR
173 # LABEL NAME UNIQUENAME GENOTYPE SYNONYM1 SYNONYM2 PLOIDY GERMPLASM_GROUP CATEGORY INSTITUTE LOCATION LOCATION_CODE CURATOR PROJECT_NAME PROJECT_DESCRIPTION PROJECT_TYPE YEAR
174 # 1 Calcutta-4 Calcutta-4_ITC0249_2015_Collection_IITA_SEN Calcutta-4 ITC0249 2X AA Collection IITA Sendusu SEN mibatte 2015_Collection_IITA_SEN Collection, IITA-Sendusu, 2015 2015_banana_germplasm 2015
177 my ($new_count,$existing, $count, $syn_count, $merge);
179 foreach my $num (@rows ) {
180 my $accession = $spreadsheet->value_at($num,'NAME');
181 if (!$accession) { next; }
182 print "\nBanana accession name is '" . $accession . "'\n";
184 my $organization = $spreadsheet->value_at($num, 'INSTITUTE');
185 my $location_code = $spreadsheet->value_at($num, 'LOCATION_CODE');
186 my $location = $spreadsheet->value_at($num, 'LOCATION');
187 my $ITC_name = $spreadsheet->value_at($num, "SYNONYM1");
188 #my $original_name = $spreadsheet->value_at($num, "GENOTYPE_NAME_ORIGINAL");
189 my $syn1 = $spreadsheet->value_at($num, "UNIQUENAME");
190 my $syn2 = $spreadsheet->value_at($num, "GENOTYPE");
191 my $syn3 = $spreadsheet->value_at($num, "SYNONYM1");
192 #my $syn4 = $spreadsheet->value_at($num, "SYNONYM4");
193 my $ploidy = $spreadsheet->value_at($num, "PLOIDY");
194 my $germplasm_group = $spreadsheet->value_at($num, "GERMPLASM_GROUP");
195 my $curator = $spreadsheet->value_at($num, "CURATOR");
196 my $name = $spreadsheet->value_at($num, "PROJECT_NAME");
197 my $category = $spreadsheet->value_at($num, "CATEGORY");
198 my $project_type = $spreadsheet->value_at($num, "PROJECT_TYPE");
199 my $year = $spreadsheet->value_at($num, "YEAR");
200 my $project_description = $spreadsheet->value_at($num, "PROJECT_DESCRIPTION");
201 #my $project_description = "$name $project_type ($year) $location";
203 print"project descr is $project_description /n";
205 # see if a stock exists with any of the synonyms
206 my @stocks = $stock_rs->search( {
208 uniquename
=> $accession,
212 #uniquename => $syn4,
215 my $existing_stock = $stock_rs->search( { uniquename
=> $accession } )->single;
216 foreach my $s(@stocks) {
217 print "Looking at accession $accession, Found stock '" . $s->uniquename . "(stock_id = " . $s->stock_id . ")'\n";
224 print "NEW stock: $accession\n";
226 }elsif (!$existing_stock) {
228 my %stock_hash = map { $_->stock_id => $_ } @stocks;
229 my @keys = keys %stock_hash;
230 my @sorted = sort { $a <=> $b } @keys;
231 print "Existing stock_id is " . $sorted[0] . " name = " . ($stock_hash{$sorted[0]})->uniquename . "\n";
232 $existing_stock = $stock_hash{$sorted[0]};
233 $existing_stock->uniquename($accession);
234 $existing_stock->name($accession);
235 $existing_stock->update;
238 if (scalar(@stocks) >1) {
239 my @stock_names = map( $_->uniquename , @stocks );
240 my @stock_ids = map ($_->stock_id, @stocks);
241 print "MERGE: stocks " . join (", " , @stock_names) . "need to be merged\n";
242 $merge .= "$accession : merge stock_ids : " .join (", " , @stock_ids) . "( names: " . join (" | " , @stock_names) . ")\n";
244 my $stock = $existing_stock ?
$existing_stock :
245 $schema->resultset("Stock::Stock")->find_or_create(
246 { organism_id
=> $organism_id,
248 uniquename
=> $accession,
249 type_id
=> $accession_cvterm->cvterm_id(),
252 my $stock_id = $stock->stock_id;
253 print "Adding owner $curator\n";
255 my $curator_person_id = CXGN
::People
::Person
->get_person_by_username($dbh, $curator); #add person id as an option.
256 if (!$curator_person_id) { die "Person $curator does not exist in the database! Please add this user before continuing \n";}
257 #add owner for this stock
258 $phenome_schema->resultset("StockOwner")->find_or_create(
260 stock_id
=> $stock->stock_id,
261 sp_person_id
=> $curator_person_id,
265 my $row = $schema -> resultset
("General::Dbxref")->find_or_create(
267 accession
=> $ITC_name, db_id
=>$mg_dbid,
269 my $link_row = $schema -> resultset
("Stock::StockDbxref")->find_or_create(
270 { stock_id
=> $stock_id,
271 dbxref_id
=> $row->dbxref_id(),
274 print "Creating a MGIS link for accession $accession (stock_id = $stock_id dbxref_id= " . $row->dbxref_id . ")\n";
276 # you store the location in nd_geolocation
277 # link it with each experiment
278 # the nd_experiment table has nd_geolocation_id column
280 # you store a project name for your group of phenotyping /genotyping experiments
281 # call it something like "2015 phenotyping trials in location ABC..."
282 # and link that project with the project_id for the breeding program using project_relationship
283 # if you don't have a project_id for the breeding program you create a new one in the project table
284 # and add to it a projectprop woth type_id of the "breeding_program" cvterm
285 # select * from stock join nd_experiment_stock ON nd_experiment_stock.nd_experiment_stock_id = stock.stock_id join nd_experiment on nd_experiment.nd_experiment_id = nd_experiment_stock.nd_experiment_id join nd_experiment_project on nd_experiment_project.nd_experiment_id = nd_experiment_stock.nd_experiment_id ;
286 my $banana_project = $schema->resultset("Project::Project")->find_or_create(
288 name
=> "2015_banana_germplasm_IITA_NARO",
290 description
=> $project_type,
293 my $project_description = "$name $project_type ($year) $location";
295 #print"project descr is $project_description /n";
297 my $project = $schema->resultset("Project::Project")->find_or_create(
300 description
=> $project_description,
303 #associate the new project with breeding program
304 my $cxgn_project = CXGN
::BreedersToolbox
::Projects
->new( { schema
=> $schema } ) ;
305 $cxgn_project->associate_breeding_program_with_trial( $banana_project->project_id, $project->project_id);
307 print "banana id = " . $banana_project->project_id . " project_id = " . $project->project_id . "\n";
308 #store the geolocation data and props:
309 my $geo_description = $location;
312 #####################
313 if ($organization) { $stock->create_stockprops( { organization
=> $organization }, { autocreate
=> 1 } ); }
314 if ($location_code) { $stock->create_stockprops( { location_code
=> $location_code }, { autocreate
=> 1 } ) } ;
315 if ($ploidy) { $stock->create_stockprops( { ploidy_level
=> $ploidy }, { autocreate
=> 1 }) };
316 if ($germplasm_group) { $stock->create_stockprops( { genome_structure
=> $germplasm_group }, { autocreate
=>1})};
318 #the stock belongs to the population:
319 #add new stock_relationship
321 $population->find_or_create_related('stock_relationship_objects', {
322 type_id
=> $member_of->cvterm_id(),
323 subject_id
=> $stock->stock_id(),
325 print "Adding synonyms #\n";
326 my @synonyms = ($accession,$syn2, $syn3);
327 foreach my $syn (@synonyms) {
328 if ($syn && defined($syn) && ($syn ne $accession) ) {
329 my $existing_synonym = $stock->search_related(
332 'type.name' => 'synonym'
336 if (!$existing_synonym) {
338 print STDOUT
"Adding synonym: $syn \n" ;
339 #add the synonym as a stockprop
340 $stock->create_stockprops({ synonym
=> $syn},
343 allow_duplicate_values
=> 1,
349 my @props = $stock->search_related('stockprops');
350 foreach my $p ( @props ) {
351 print "**the prop value for stock " . $stock->name() . " is " . $p->value() . "\n" if $p;
363 print "TOTAL: \n $count rows \n $new_count new accessions \n $existing existing stocks \n $syn_count new synonyms \n MERGE :\n $merge\n";
364 #if ($opt_t) { die "test rolling back";}
369 $schema->txn_do($coderef);
371 print "Transaction succeeded! Commiting stocks and their properties! \n\n";
372 $schema->txn_commit();
375 $schema->txn_rollback();
376 die "TEST RUN! rolling back\n";
380 # foreach my $value ( keys %seq ) {
381 # my $maxval= $seq{$value} || 0;
382 # if ($maxval) { $dbh->do("SELECT setval ('$value', $maxval, true)") ; }
383 # else { $dbh->do("SELECT setval ('$value', 1, false)"); }
385 die "An error occured! Rolling back and reseting database sequences!" . $_ . "\n";