make test pass for multicat parsing with two xlsx files for testing.
[sgn.git] / bin / upload_multiple_trial_design.pl
blob055169ce64e22d2fb318f57a2d021d8533228858
1 #!/usr/bin/perl
3 =head1
5 upload_multiple_trial_design.pl
7 =head1 SYNOPSIS
9 upload_multiple_trial_design.pl -H [dbhost] -D [dbname] -P [dbpass] -w [basepath] -U [dbuser] -b [breeding program name] -i infile -un [username] -e [email address] -eo [email_option_enabled] -r [temp_file_nd_experiment_id]
11 =head1 COMMAND-LINE OPTIONS
12 ARGUMENTS
13 -H host name (required) Ex: "breedbase_db"
14 -D database name (required) Ex: "breedbase"
15 -P database userpass (required) Ex: "postgres"
16 -w basepath (required) Ex: /home/production/cxgn/sgn
17 -i path to infile (required)
18 -U username (required) Ex: "postgres"
19 -b breeding program name (required) Ex: test
20 -t test run . Rolling back at the end
21 -e email address of the user
22 -l name of the user
23 -r temp_file_nd_experiment_id (required) Ex: /temp/delete_nd_experiment_ids.txt
24 if loading trial data from metadata file, phenotypes + layout from infile
26 =head2 DESCRIPTION
28 perl bin/upload_multiple_trial_design.pl -h breedbase_db -d breedbase -p postgres -w /home/cxgn/sgn/ -u postgres -i ~/Desktop/test_multi.xlsx -b test -n janedoe -e 'sk2783@cornell.edu' -l 'sri' -r /tmp/delete_nd_experiment_ids.txt
30 This script will parse and validate the input file. If there are any warnings or errors during validation it will send a error message to the provided email.
31 If there are no warnings(or errors) during validation it will then store the data.
32 The input file should be either .xlsx or .xls format.
34 CHECK cvterms for trial metadata!!
36 ################################################
37 Minimal metadata requirements are
38 trial_name
39 trial_description (can also be built from the trial name, type, year, location)
40 trial_type (read from an input file)
41 trial_location geo_description ( must be in the database - nd_geolocation.description - can be read from metadata file)
42 year (can be read from the metadata file )
43 design (defaults to 'RCBD' )
44 breeding_program (provide with option -b )
47 Other OPTIONAL trial metadata (projectprops)
49 project planting date
50 project fertilizer date
51 project harvest date
52 project sown plants
53 project harvested plants
55 =head1 AUTHOR
57 Srikanth (sk2783@cornell.edu)
59 =cut
61 use strict;
62 use Getopt::Long;
63 use File::Basename;
64 use CXGN::File::Parse;
65 use Bio::Chado::Schema;
66 use CXGN::DB::InsertDBH;
67 use Carp qw /croak/ ;
68 use Try::Tiny;
69 use DateTime;
70 use Pod::Usage;
71 use List::Util qw(max);
72 use List::Util qw(first);
73 use List::Util qw(uniq);
74 use CXGN::Metadata::Schema;
75 use CXGN::Phenome::Schema;
76 use CXGN::People::Person;
77 use Data::Dumper;
78 use CXGN::Phenotypes::StorePhenotypes;
79 use CXGN::Trial; # add project metadata
80 #use CXGN::BreedersToolbox::Projects; # associating a breeding program
81 use CXGN::Trial::TrialCreate;
82 use CXGN::Tools::Run;
83 use CXGN::Contact;
84 use CXGN::Trial::ParseUpload;
85 use CXGN::TrialStatus;
86 use CXGN::Calendar;
87 use CXGN::UploadFile;
88 use File::Path qw(make_path);
89 use File::Spec;
90 use JSON::MaybeXS qw(encode_json);
92 # sub print_json_response {
93 # my ($status, $message) = @_;
94 # my $response = {
95 # status => $status,
96 # message => $message,
97 # };
98 # print encode_json($response);
99 # exit;
102 my @errors;
103 my ( $help, $dbhost, $dbname, $basepath, $dbuser, $dbpass, $infile, $sites, $types, $username, $breeding_program_name, $email_address, $logged_in_name, $email_option_enabled, $temp_file_nd_experiment_id);
104 GetOptions(
105 'dbhost|H=s' => \$dbhost,
106 'dbname|D=s' => \$dbname,
107 'dbpass|P=s' => \$dbpass,
108 'basepath|w=s' => \$basepath,
109 'dbuser|U=s' => \$dbuser,
110 'i=s' => \$infile,
111 'b=s' => \$breeding_program_name,
112 'user|un=s' => \$username,
113 'help' => \$help,
114 'email|e=s' => \$email_address,
115 # 'logged_in_user|l=s' => \$logged_in_name,
116 'temp_file|r=s' => \$temp_file_nd_experiment_id,
117 'email_option_enabled|eo=s' => \$email_option_enabled,
120 #Ensure the parent directory exists before creating the temporary file
121 my $parent_dir = File::Spec->catdir($basepath, 'static', 'documents', 'tempfiles', 'delete_nd_experiment_ids');
122 unless (-d $parent_dir) {
123 # make_path($parent_dir) or die "Failed to create directory $parent_dir: $!";
124 make_path($parent_dir) or push @errors, "Failed to create directory $parent_dir: $!";
127 # Create the temporary file in the parent directory
128 my $temp_file_nd_experiment_id = File::Spec->catfile($parent_dir, 'fileXXXX');
130 pod2usage(1) if $help;
131 if (!$infile || !$breeding_program_name || !$username || !$dbname || !$dbhost ) {
132 pod2usage( { -msg => 'Error. Missing options!' , -verbose => 1, -exitval => 1 } ) ;
135 my $dbh;
137 if ($dbpass) {
138 print STDERR "Logging in with password\n";
139 $dbh = DBI->connect("dbi:Pg:database=$dbname;host=$dbhost",
140 $dbuser,
141 $dbpass,
142 {AutoCommit => 1,
143 RaiseError => 1});
144 } else {
145 $dbh = CXGN::DB::InsertDBH->new( {
146 dbhost =>$dbhost,
147 dbname =>$dbname,
148 dbargs => {AutoCommit => 1,
149 RaiseError => 1}
153 print STDERR "Database connection ok!\n";
155 my $schema= Bio::Chado::Schema->connect( sub { $dbh } , { on_connect_do => ['SET search_path TO public, sgn, metadata, phenome;'] } );
157 # ################
158 # getting the last database ids for resetting at the end in case of rolling back
159 # ###############
161 # my $last_nd_experiment_id = $schema->resultset('NaturalDiversity::NdExperiment')->get_column('nd_experiment_id')->max;
162 # my $last_cvterm_id = $schema->resultset('Cv::Cvterm')->get_column('cvterm_id')->max;
164 # my $last_nd_experiment_project_id = $schema->resultset('NaturalDiversity::NdExperimentProject')->get_column('nd_experiment_project_id')->max;
165 # my $last_nd_experiment_stock_id = $schema->resultset('NaturalDiversity::NdExperimentStock')->get_column('nd_experiment_stock_id')->max;
166 # my $last_nd_experiment_phenotype_id = $schema->resultset('NaturalDiversity::NdExperimentPhenotype')->get_column('nd_experiment_phenotype_id')->max;
167 # my $last_phenotype_id = $schema->resultset('Phenotype::Phenotype')->get_column('phenotype_id')->max;
168 # my $last_stock_id = $schema->resultset('Stock::Stock')->get_column('stock_id')->max;
169 # my $last_stock_relationship_id = $schema->resultset('Stock::StockRelationship')->get_column('stock_relationship_id')->max;
170 # my $last_project_id = $schema->resultset('Project::Project')->get_column('project_id')->max;
171 # my $last_nd_geolocation_id = $schema->resultset('NaturalDiversity::NdGeolocation')->get_column('nd_geolocation_id')->max;
172 # my $last_geoprop_id = $schema->resultset('NaturalDiversity::NdGeolocationprop')->get_column('nd_geolocationprop_id')->max;
173 # my $last_projectprop_id = $schema->resultset('Project::Projectprop')->get_column('projectprop_id')->max;
175 # my %seq = (
176 # 'nd_experiment_nd_experiment_id_seq' => $last_nd_experiment_id,
177 # 'cvterm_cvterm_id_seq' => $last_cvterm_id,
178 # 'nd_experiment_project_nd_experiment_project_id_seq' => $last_nd_experiment_project_id,
179 # 'nd_experiment_stock_nd_experiment_stock_id_seq' => $last_nd_experiment_stock_id,
180 # 'nd_experiment_phenotype_nd_experiment_phenotype_id_seq' => $last_nd_experiment_phenotype_id,
181 # 'phenotype_phenotype_id_seq' => $last_phenotype_id,
182 # 'stock_stock_id_seq' => $last_stock_id,
183 # 'stock_relationship_stock_relationship_id_seq' => $last_stock_relationship_id,
184 # 'project_project_id_seq' => $last_project_id,
185 # 'nd_geolocation_nd_geolocation_id_seq' => $last_nd_geolocation_id,
186 # 'nd_geolocationprop_nd_geolocationprop_id_seq' => $last_geoprop_id,
187 # 'projectprop_projectprop_id_seq' => $last_projectprop_id,
188 # );
191 # ##############
192 # Breeding program for associating the trial/s ##
193 # ##############
195 my $breeding_program = $schema->resultset("Project::Project")->find(
196 {'me.name' => $breeding_program_name, 'type.name' => 'breeding_program'},
197 { join => { projectprops => 'type' }}
200 if (!$breeding_program) {
201 push @errors, "Breeding program $breeding_program_name does not exist in the database. Check your input";
204 my $sp_person_id= CXGN::People::Person->get_person_by_username($dbh, $username);
205 if (!$sp_person_id) {
206 push @errors, "Need to have a user pre-loaded in the database!";
209 #Column headers for trial design/s
210 #plot_name accession_name plot_number block_number trial_name trial_description trial_location year trial_type is_a_control rep_number range_number row_number col_number entry_numbers
212 #parse file using the generic file parser
213 my $parser = CXGN::File::Parse->new(
214 file => $infile,
215 required_columns => ['trial_name', 'accession_name', 'plot_number', 'block_number', 'location', 'year'],
218 # warn "Starting to parse the file...\n";
219 my $parsed = $parser->parse();
220 # warn "Parsed data = " . Dumper($parsed);
221 if (scalar(@{$parsed->{errors}}) > 0) {
222 push @errors, "Error parsing file: " . join(',', @{$parsed->{errors}}) if scalar(@{$parsed->{errors}}) > 0;
225 if (exists $parsed->{warnings}) {
226 print "Warnings: " . join("\n", @{$parsed->{warnings}}) . "\n";
228 # #parse the file
229 # my $parsed = $parser->parse();
231 # if (scalar(@{$parsed->{errors}}) > 0) {
232 # die "Error parsing file: ".join(',', @{$parsed->{errors}});
235 my @traits;
236 my %multi_trial_data;
237 my %metadata_fields = map { $_ => 1 } qw(trial_name accession_name plot_number block_number location year design_type trial_description);
239 foreach my $row (@{$parsed->{data}}) {
240 my $trial_name = $row->{trial_name};
241 next unless $trial_name; # Skip rows with empty trial names
243 # Check if the location exists in the database
244 my $trial_location = $row->{location};
245 my $location_rs = $schema->resultset("NaturalDiversity::NdGeolocation")->search({
246 description => { ilike => '%' . $trial_location . '%' },
248 if (scalar($location_rs) == 0) {
249 push @errors, "ERROR: location must be pre-loaded in the database. Location name = " . $trial_location . "\n";
251 my $location_id = $location_rs->first->nd_geolocation_id;
252 ######################################################
254 # # Store all data for the current trial
255 $multi_trial_data{$trial_name} = {
256 trial_location => $row->{location},
257 trial_year => $row->{year},
258 design_type => $row->{design_type},
259 trial_description => $row->{description},
260 program => $breeding_program->name,
261 plot_name => $row->{plot_name},
262 accession_name => $row->{accession_name},
263 plots => [],
266 foreach my $col (@{$parsed->{columns}}) {
267 next if exists $metadata_fields{$col};
271 print STDERR "unique trial names:\n";
272 foreach my $name(keys %multi_trial_data) {
273 print"$name\n";
276 print STDERR "Reading phenotyping file:\n";
277 my %phen_params = map { if ($_ =~ m/^\w+\|(\w+:\d{7})$/ ) { $_ => $1 } } @traits;
278 delete $phen_params{''};
280 # my @traits = keys %phen_params;
281 print STDERR "Found traits: " . Dumper(\%phen_params) . "\n";
284 foreach my $trial_name (keys %multi_trial_data) {
285 $multi_trial_data{$trial_name}->{design} = $multi_trial_data{$trial_name};
289 my %trial_design_hash;
290 my %phen_data_by_trial;
292 foreach my $row (@{$parsed->{data}}) {
293 my $trial_name = $row->{trial_name};
294 next unless $trial_name;
296 my $plot_number = $row->{plot_number};
297 my $plot_name = $row->{plot_name};
298 $trial_design_hash{$trial_name}{$plot_number} = {
299 trial_name => $trial_name,
300 trial_type => $row->{trial_type},
301 planting_date => $row->{planting_date},
302 harvest_date => $row->{harvest_date},
303 entry_numbers => $row->{entry_numbers},
304 is_a_control => $row->{is_a_control},
305 rep_number => $row->{rep_number},
306 range_number => $row->{range_number},
307 row_number => $row->{row_number},
308 col_number => $row->{col_number},
309 seedlot_name => $row->{seedlot_name},
310 num_seed_per_plot => $row->{num_seed_per_plot},
311 weight_gram_seed_per_plot => $row->{weight_gram_seed_per_plot},
314 foreach my $trait_string (keys %phen_params) {
315 my $phen_value = $row->{$trait_string};
316 $phen_data_by_trial{$trial_name}{$plot_name}{$trait_string} = [$phen_value, DateTime->now->datetime];
320 print STDERR "multi trial hash:" . Dumper(\%multi_trial_data);
321 print STDERR "trial design " . Dumper(\%trial_design_hash);
322 print STDERR "Processed trials: " . scalar(keys %trial_design_hash) . "\n";
323 print STDERR "Phen data by trial: " . Dumper(\%phen_data_by_trial) . "\n";
325 #####create the design hash#####
326 print Dumper(keys %trial_design_hash);
327 foreach my $trial_name (keys %trial_design_hash) {
328 $multi_trial_data{$trial_name}->{design} = $trial_design_hash{$trial_name} ;
331 my $date = localtime();
332 my $parser;
333 my %parsed_data;
334 my $parse_errors;
335 my @errors;
336 my $parsed_data;
337 my $ignore_warnings;
338 my $time = DateTime->now();
339 my $timestamp = $time->ymd()."_".$time->hms();
341 my %phenotype_metadata = {
342 'archived_file' => $infile,
343 'archived_file_type' => 'spreadsheet phenotype file',
344 'operator' => $username,
345 'date' => $date,
348 #parse uploaded file with appropriate plugin
349 $parser = CXGN::Trial::ParseUpload->new(chado_schema => $schema, filename => $infile);
350 $parser->load_plugin('MultipleTrialDesignExcelFormat');
351 $parsed_data = $parser->parse();
354 # if (!$parsed_data) {
355 # my $return_error = '';
357 # if (!$parser->has_parse_errors() ){
358 # print STDERR "Parsing errors: " . join("\n", @{$parse_errors->{'error_messages'}}) . "\n";
359 # # push @errors, "could not get parsing errors\n";
360 # }else {
361 # $parse_errors = $parser->get_parse_errors();
362 # print STDERR "Unknown parsing error occurred.\n";
365 # # push @errors, $return_error;
368 if ($parser->has_parse_warnings()) {
369 unless ($ignore_warnings) {
370 my $warnings = $parser->get_parse_warnings();
371 # print "Warnings: " . join("\n", @{$warnings->{'warning_messages'}}) . "\n";
376 my %all_desings = %{$parsed_data};
377 my %save;
378 $save{'errors'} = [];
379 my $coderef= sub {
380 foreach my $trial_name (keys %multi_trial_data) {
381 my $trial_location = $multi_trial_data{$trial_name}->{trial_location};
382 my $trial_design_info = $all_desings{$trial_name};
384 my %trial_info_hash = (
385 chado_schema => $schema,
386 dbh => $dbh,
387 trial_year => $trial_design_info->{'year'},
388 trial_description => $trial_design_info->{'description'},
389 trial_location => $trial_design_info->{'location'},
390 trial_name => $trial_name,
391 design_type => $trial_design_info->{'design_type'},
392 design => $trial_design_info->{'design_details'},
393 program => $trial_design_info->{'breeding_program'},
394 operator => $username,
395 owner_id => $sp_person_id,
398 # my $entry_numbers = $trial_design->{'entry_numbers'};
400 if ($trial_design_info->{'entry_numbers'}){
401 $trial_info_hash{trial_type} = $trial_design_info->{'entry_numbers'};
403 if ($trial_design_info->{'trial_type'}){
404 $trial_info_hash{trial_type} = $trial_design_info->{'trial_type'};
406 if ($trial_design_info->{'plot_width'}){
407 $trial_info_hash{plot_width} = $trial_design_info->{'plot_width'};
409 if ($trial_design_info->{'plot_length'}){
410 $trial_info_hash{plot_length} = $trial_design_info->{'plot_length'};
412 if ($trial_design_info->{'field_size'}){
413 $trial_info_hash{field_size} = $trial_design_info->{'field_size'};
415 if ($trial_design_info->{'planting_date'}){
416 $trial_info_hash{planting_date} = $trial_design_info->{'planting_date'};
418 if ($trial_design_info->{'harvest_date'}){
419 $trial_info_hash{harvest_date} = $trial_design_info->{'harvest_date'};
421 my $trial_create = CXGN::Trial::TrialCreate->new(%trial_info_hash);
422 my $current_save = $trial_create->save_trial();
424 if ($current_save->{error}){
425 # $schema->txn_rollback();
426 push @{$save{'errors'}}, $current_save->{'error'};
427 } elsif ($current_save->{'trial_id'}) {
428 my $trial_id = $current_save->{'trial_id'};
429 my $timestamp = $time->ymd();
430 my $calendar_funcs = CXGN::Calendar->new({});
431 my $formatted_date = $calendar_funcs->check_value_format($timestamp);
432 my $upload_date = $calendar_funcs->display_start_date($formatted_date);
434 my %trial_activity;
435 $trial_activity{'Trial Uploaded'}{'user_id'} = $sp_person_id;
436 $trial_activity{'Trial Uploaded'}{'activity_date'} = $upload_date;
438 my $trial_activity_obj = CXGN::TrialStatus->new({ bcs_schema => $schema });
439 $trial_activity_obj->trial_activities(\%trial_activity);
440 $trial_activity_obj->parent_id($trial_id);
441 my $activity_prop_id = $trial_activity_obj->store();
444 # save entry numbers, if provided
445 my $entry_numbers;
446 if ($entry_numbers = $trial_design_info->{'entry_numbers'}) {
447 if (scalar(keys %$entry_numbers) > 0 && $current_save->{'trial_id'} ) {
448 my %entry_numbers_prop;
449 my @stock_names = keys %$entry_numbers;
451 # Convert stock names from parsed trial template to stock ids for data storage
452 my $stocks = $schema->resultset('Stock::Stock')->search({ uniquename=>{-in=>\@stock_names} });
453 while (my $s = $stocks->next()) {
454 $entry_numbers_prop{$s->stock_id} = $entry_numbers->{$s->uniquename};
457 # Lookup synonyms of accession names
458 my $synonym_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($schema, 'stock_synonym', 'stock_property')->cvterm_id();
459 my $acc_synonym_rs = $schema->resultset("Stock::Stock")->search({
460 'me.is_obsolete' => { '!=' => 't' },
461 'stockprops.value' => { -in => \@stock_names},
462 'stockprops.type_id' => $synonym_cvterm_id
463 },{join => 'stockprops', '+select'=>['stockprops.value'], '+as'=>['synonym']});
464 while (my $r=$acc_synonym_rs->next) {
465 if ( exists($entry_numbers->{$r->get_column('synonym')}) ) {
466 $entry_numbers_prop{$r->stock_id} = $entry_numbers->{$r->get_column('synonym')};
470 # store entry numbers
471 my $trial = CXGN::Trial->new({ bcs_schema => $schema, trial_id => $current_save->{'trial_id'} });
472 $trial->set_entry_numbers(\%entry_numbers_prop);
476 print STDERR "TrialCreate object created for trial: $trial_name\n";
478 my @plots = @{$multi_trial_data{$trial_name}->{plots} // []};
479 print STDERR "Trial Name = $trial_name\n";
481 my %parsed_data = $phen_data_by_trial{$trial_name};
482 if (scalar(@traits) > 0) {
483 foreach my $pname (keys %parsed_data) {
484 print STDERR "PLOT = $pname\n";
485 my %trait_string_hash = $parsed_data{$pname};
487 foreach my $trait_string (keys %trait_string_hash ) {
488 print STDERR "trait = $trait_string\n";
489 print STDERR "value = " . $trait_string_hash{$trait_string}[0] . "\n";
493 # # after storing the trial desgin store the phenotypes
494 my $metadata_schema = CXGN::Metadata::Schema->connect( sub { $dbh->get_actual_dbh() }, {on_connect_do => ['SET search_path TO metadata;'] } );
495 my $phenome_schema = CXGN::Phenome::Schema->connect( sub { $dbh->get_actual_dbh() } , {on_connect_do => ['SET search_path TO phenome, sgn, public;'] } );
496 my $store_phenotypes = CXGN::Phenotypes::StorePhenotypes->new(
497 basepath => $basepath,
498 dbhost => $dbhost,
499 dbname => $dbname,
500 dbuser => $dbuser,
501 dbpass => $dbpass,
502 temp_file_nd_experiment_id => $temp_file_nd_experiment_id,
503 bcs_schema => $schema,
504 metadata_schema => $metadata_schema,
505 phenome_schema => $phenome_schema,
506 user_id => $sp_person_id,
507 stock_list => \@plots,
508 trait_list => \@traits,
509 values_hash => \%parsed_data,
510 has_timestamps => 0,
511 overwrite_values => 0,
512 metadata_hash => \%phenotype_metadata,
515 #store the phenotypes
516 my ($verified_warning, $verified_error) = $store_phenotypes->verify();
517 # print STDERR "Verified phenotypes. warning = $verified_warning, error = $verified_error\n";
518 my $stored_phenotype_error = $store_phenotypes->store();
519 # print STDERR "Stored phenotypes Error:" . Dumper($stored_phenotype_error). "\n";
520 } else {
521 print STDERR "No traits defined for these $trial_name\n";
526 my ($email_subject, $email_body);
528 try {
529 $schema->txn_do($coderef);
531 if ($email_option_enabled == 1 && $email_address) {
532 print "Transaction succeeded! Committing project and its metadata \n\n";
534 $email_subject = "Multiple Trial Designs upload status";
535 $email_body = "Dear $username,\n\nCongratulations, all the multiple trial designs have been successfully uploaded into the database\n\nThank you\nHave a nice day\n\n";
537 CXGN::Contact::send_email($email_subject, $email_body, $email_address);
539 } catch {
540 # Transaction failed
541 my $error_message = "An error occurred! Rolling back! $_\n";
542 push @errors, $error_message;
543 # push @{$save{'errors'}}, $error_message;
544 # print STDERR $error_message;
546 if ($email_option_enabled == 1 && $email_address) {
547 $email_subject = 'Error in Trial Upload';
548 $email_body = "Dear $username,\n\n$error_message\n\nThank You\nHave a nice day\n";
550 # print STDERR $error_message;
552 CXGN::Contact::send_email($email_subject, $email_body, $email_address);
556 if (@errors) {
557 my $error_message = join("\n", @errors);
558 print STDERR $error_message;
559 exit(1);
562 print STDERR "All trials loaded successfully\n";
563 exit(0);