setting up fieldbook image zip
[sgn.git] / lib / CXGN / Phenotypes / StorePhenotypes.pm
blob6d78d142838442e98b7a6de1c7504dfc893df7b4
1 package CXGN::Phenotypes::StorePhenotypes;
3 =head1 NAME
5 CXGN::Phenotypes::StorePhenotypes - an object to handle storing phenotypes for SGN stocks
7 =head1 USAGE
9 my $store_phenotypes = CXGN::Phenotypes::StorePhenotypes->new();
10 $store_phenotypes->store($c, $size, \@plot_list, \@trait_list, \%plot_trait_value, \%phenotype_metadata, $data_level, $overwrite_values);
12 =head1 DESCRIPTION
15 =head1 AUTHORS
17 Jeremy D. Edwards (jde22@cornell.edu)
18 Naama Menda (nm249@cornell.edu)
19 Nicolas Morales (nm529@cornell.edu)
21 =cut
23 use strict;
24 use warnings;
25 use Moose;
26 use Try::Tiny;
27 use File::Basename qw | basename dirname|;
28 use Digest::MD5;
29 use CXGN::List::Validate;
30 use Data::Dumper;
31 use Scalar::Util qw(looks_like_number);
32 use Archive::Zip;
# verify
#
# Checks an upload before it is stored and collects HTML-formatted messages.
#
# Arguments (positional):
#   $c                        - context object providing dbic_schema() and dbc()
#   $plot_list_ref            - arrayref of plot/plant unique names
#   $trait_list_ref           - arrayref of trait names
#   $plot_trait_value_hashref - { plot_name => { trait_name => [ value, timestamp ] } }
#   $phenotype_metadata_ref   - hashref with 'archived_file', 'archived_file_type',
#                               'operator' and 'date'
#   $timestamp_included       - true when every value is expected to carry a timestamp
#   $image_zip                - optional path to a zip archive that must be readable
#
# Returns the list ($warning_message, $error_message); each is undef when
# nothing was reported.
sub verify {
    my $self = shift;
    my $c = shift;
    my $plot_list_ref = shift;
    my $trait_list_ref = shift;
    my $plot_trait_value_hashref = shift;
    my $phenotype_metadata_ref = shift;
    my $timestamp_included = shift;
    my $image_zip = shift;

    my $schema = $c->dbic_schema("Bio::Chado::Schema");
    my @plot_list = @{$plot_list_ref};
    my @trait_list = @{$trait_list_ref};
    my %phenotype_metadata = %{$phenotype_metadata_ref};
    my %plot_trait_value = %{$plot_trait_value_hashref};
    #print STDERR Dumper \%plot_trait_value;

    my $plot_validator = CXGN::List::Validate->new();
    my $trait_validator = CXGN::List::Validate->new();
    my @plots_missing = @{$plot_validator->validate($schema,'plots_or_plants',\@plot_list)->{'missing'}};
    my @traits_missing = @{$trait_validator->validate($schema,'traits',\@trait_list)->{'missing'}};
    my $phenotyping_experiment_cvterm = SGN::Model::Cvterm->get_cvterm_row($schema, 'phenotyping_experiment', 'experiment_type');
    my $error_message;
    my $warning_message;

    # Unknown plots or traits make every later check meaningless, so stop here.
    # Both lists are reported; previously the trait message replaced the plot
    # message when both were present.
    if (@plots_missing || @traits_missing) {
        print STDERR "Plots or traits not valid\n";
        my @problems;
        if (@plots_missing) {
            print STDERR "Invalid plots: ".join(", ", map { "'$_'" } @plots_missing)."\n";
            push @problems, "Invalid plots: <br/>".join(", <br/>", map { "'$_'" } @plots_missing);
        }
        if (@traits_missing) {
            print STDERR "Invalid traits: ".join(", ", map { "'$_'" } @traits_missing)."\n";
            push @problems, "Invalid traits: <br/>".join(", <br/>", map { "'$_'" } @traits_missing);
        }
        $error_message = join("<br/>", @problems);
        return ($warning_message, $error_message);
    }

    # Index the phenotypes already stored. The stock part of the key is the
    # text before the first comma of phenotype.uniquename ("Stock: <id>").
    my %check_unique_value_trait_stock;
    my %check_unique_trait_stock;
    my $sql = "SELECT value, cvalue_id, uniquename FROM phenotype WHERE value is not NULL; ";
    my $sth = $c->dbc->dbh->prepare($sql);
    $sth->execute();
    while (my ($db_value, $db_cvalue_id, $db_uniquename) = $sth->fetchrow_array) {
        my ($stock_string, $rest_of_name) = split( /,/, $db_uniquename);
        $check_unique_value_trait_stock{$db_value, $db_cvalue_id, $stock_string} = 1;
        $check_unique_trait_stock{$db_cvalue_id, $stock_string} = $db_value;
    }

    # cvterm_id => '/'-separated list of allowed values ('trait_categories' cvtermprop).
    my %check_trait_category;
    $sql = "SELECT b.value, c.cvterm_id from cvtermprop as b join cvterm as a on (b.type_id = a.cvterm_id) join cvterm as c on (b.cvterm_id=c.cvterm_id) where a.name = 'trait_categories';";
    $sth = $c->dbc->dbh->prepare($sql);
    $sth->execute();
    while (my ($category_value, $cvterm_id) = $sth->fetchrow_array) {
        $check_trait_category{$cvterm_id} = $category_value;
    }

    # cvterm_id => declared value format ('trait_format' cvtermprop), e.g. 'numeric'.
    my %check_trait_format;
    $sql = "SELECT b.value, c.cvterm_id from cvtermprop as b join cvterm as a on (b.type_id = a.cvterm_id) join cvterm as c on (b.cvterm_id=c.cvterm_id) where a.name = 'trait_format';";
    $sth = $c->dbc->dbh->prepare($sql);
    $sth->execute();
    while (my ($format_value, $cvterm_id) = $sth->fetchrow_array) {
        $check_trait_format{$cvterm_id} = $format_value;
    }

    if ($image_zip) {
        my $zipmod = Archive::Zip->new();
        # Fully qualified so the check does not depend on which symbols
        # Archive::Zip was asked to export.
        unless ( $zipmod->read( $image_zip ) == Archive::Zip::AZ_OK() ) {
            $error_message .= " Reading zipfile failed!";
            return ($warning_message, $error_message);
        }
    }

    #print STDERR Dumper \@trait_list;
    my %check_file_stock_trait_duplicates;

    foreach my $plot_name (@plot_list) {
        foreach my $trait_name (@trait_list) {
            my $value_array = $plot_trait_value{$plot_name}->{$trait_name};
            #print STDERR Dumper $value_array;
            my $trait_value = $value_array->[0];
            my $timestamp = $value_array->[1];

            # A value of 0 is real data (store() records it), so only undef
            # and the empty string count as "no value".
            my $has_value = ( defined($trait_value) && $trait_value ne '' ) ? 1 : 0;

            if ($has_value) {
                my $trait_cvterm_id;
                #For multiterm traits of the form trait1|CO:0000001||trait2|CO:00000002
                if ($trait_name =~ /\|\|/ ) {
                    $trait_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($schema, $trait_name, 'cassava_trait')->cvterm_id();
                } else {
                    $trait_cvterm_id = SGN::Model::Cvterm->get_cvterm_row_from_trait_name($schema, $trait_name)->cvterm_id();
                }
                my $stock_id = $schema->resultset('Stock::Stock')->find({'uniquename' => $plot_name})->stock_id();
                my $stock_key = "Stock: ".$stock_id;

                #check that trait value is valid for trait name
                if (exists($check_trait_format{$trait_cvterm_id})
                    && $check_trait_format{$trait_cvterm_id} eq 'numeric'
                    && !looks_like_number($trait_value)) {
                    $error_message .= "<small>This trait value should be numeric: <br/>Plot Name: ".$plot_name."<br/>Trait Name: ".$trait_name."<br/>Value: ".$trait_value."</small><hr>";
                }
                if (exists($check_trait_category{$trait_cvterm_id})) {
                    my %trait_categories_hash = map { $_ => 1 } split /\//, $check_trait_category{$trait_cvterm_id};
                    if (!exists($trait_categories_hash{$trait_value})) {
                        $error_message .= "<small>This trait value should be one of ".$check_trait_category{$trait_cvterm_id}.": <br/>Plot Name: ".$plot_name."<br/>Trait Name: ".$trait_name."<br/>Value: ".$trait_value."</small><hr>";
                    }
                }

                #check if the plot_name, trait_name combination already exists in database.
                if (exists($check_unique_value_trait_stock{$trait_value, $trait_cvterm_id, $stock_key})) {
                    $warning_message .= "<small>$plot_name already has the same value as in your file ($trait_value) stored for the trait $trait_name.</small><hr>";
                } elsif (exists($check_unique_trait_stock{$trait_cvterm_id, $stock_key})) {
                    my $stored_value = $check_unique_trait_stock{$trait_cvterm_id, $stock_key};
                    $warning_message .= "<small>$plot_name already has a different value ($stored_value) than in your file ($trait_value) stored in the database for the trait $trait_name.</small><hr>";
                }

                #check if the plot_name, trait_name combination already exists in same file.
                if (exists($check_file_stock_trait_duplicates{$trait_cvterm_id, $stock_id})) {
                    $warning_message .= "<small>$plot_name already has a value for the trait $trait_name in your file. Possible duplicate in your file?</small><hr>";
                }
                $check_file_stock_trait_duplicates{$trait_cvterm_id, $stock_id} = 1;
            }

            if ($timestamp_included) {
                if ($timestamp) {
                    # '!~' is required here: '!$timestamp =~ ...' negates the
                    # timestamp first and so never flags a malformed one.
                    if ($timestamp !~ m/(\d{4})-(\d{2})-(\d{2}) (\d{2}):(\d{2}):(\d{2})(\S)(\d{4})/) {
                        $error_message .= "<small>Bad timestamp for value for Plot Name: ".$plot_name."<br/>Trait Name: ".$trait_name."<br/>Should be YYYY-MM-DD HH:MM:SS-0000 or YYYY-MM-DD HH:MM:SS+0000</small><hr>";
                    }
                } elsif ($has_value) {
                    $error_message .= "<small>'Timestamps Included' is selected, but no timestamp for value for Plot Name: ".$plot_name."<br/>Trait Name: ".$trait_name."</small><hr>";
                }
            } elsif ($timestamp) {
                $error_message .= "<small>Timestamps found in file, but 'Timestamps Included' is not selected.</small><hr>";
            }
        }
    }

    ## Verify metadata
    # Each metadata failure replaces any messages collected above and returns
    # immediately.
    if ($phenotype_metadata{'archived_file'} && (!$phenotype_metadata{'archived_file_type'} || $phenotype_metadata{'archived_file_type'} eq "")) {
        $error_message = "No file type provided for archived file.";
        return ($warning_message, $error_message);
    }
    if (!$phenotype_metadata{'operator'} || $phenotype_metadata{'operator'} eq "") {
        $error_message = "No operaror provided in file upload metadata.";
        return ($warning_message, $error_message);
    }
    if (!$phenotype_metadata{'date'} || $phenotype_metadata{'date'} eq "") {
        $error_message = "No date provided in file upload metadata.";
        return ($warning_message, $error_message);
    }

    return ($warning_message, $error_message);
}
# store
#
# Stores phenotype values for the given plots/plants and traits inside a
# single database transaction, then records metadata about the archived
# upload file and links it to every experiment that was created.
#
# Arguments (positional):
#   $c                        - context object providing dbic_schema(), dbc() and user()
#   $size                     - number of plots * number of traits; values <= 100 use
#                               the per-plot lookup path, larger values the bulk path
#   $plot_list_ref            - arrayref of plot/plant unique names
#   $trait_list_ref           - arrayref of trait names. Given separately from the value
#                               hash because not every trait needs a value for every
#                               plot; a missing or empty value creates no record.
#   $plot_trait_value_hashref - { plot_name => { trait_name => [ value, timestamp ] } }
#   $phenotype_metadata       - hashref with 'archived_file', 'archived_file_type',
#                               'operator' and 'date'
#   $data_level               - accepted for interface compatibility; not used here
#   $overwrite_values         - when true, phenotypes already stored for the same
#                               stock and trait are deleted before the new value is
#                               written
#
# Returns undef on success, or the transaction error when storing failed.
# Dies if the archived file cannot be opened.
sub store {
    my $self = shift;
    my $c = shift;
    my $size = shift;
    my $plot_list_ref = shift;
    my $trait_list_ref = shift;
    my $plot_trait_value_hashref = shift;
    my $phenotype_metadata = shift;
    my $data_level = shift;
    my $overwrite_values = shift;

    my $error_message;
    my $transaction_error;
    my @plot_list = @{$plot_list_ref};
    my @trait_list = @{$trait_list_ref};
    my %plot_trait_value = %{$plot_trait_value_hashref};
    my $schema = $c->dbic_schema("Bio::Chado::Schema");
    my $metadata_schema = $c->dbic_schema("CXGN::Metadata::Schema");
    my $phenome_schema = $c->dbic_schema("CXGN::Phenome::Schema");
    my $user_id = $c->user()->get_object()->get_sp_person_id();
    if (!$user_id) { #For unit_test, SimulateC
        $user_id = $c->sp_person_id();
    }
    my $archived_file = $phenotype_metadata->{'archived_file'};
    my $archived_file_type = $phenotype_metadata->{'archived_file_type'};
    my $operator = $phenotype_metadata->{'operator'};
    my $upload_date = $phenotype_metadata->{'date'};

    my $phenotyping_experiment_cvterm = SGN::Model::Cvterm->get_cvterm_row($schema, 'phenotyping_experiment', 'experiment_type');
    my $plot_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($schema, 'plot', 'stock_type')->cvterm_id();
    my $plant_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($schema, 'plant', 'stock_type')->cvterm_id();

    ## Track experiments seen to allow for multiple trials and experiments to exist in an uploaded file.
    ## Used later to attach file metadata.
    my %experiment_ids;

    # When overwriting, index which (trait, stock) pairs already have a stored
    # value. The stock part of the key is the text before the first comma of
    # phenotype.uniquename ("Stock: <id>").
    my %check_unique_trait_stock;
    if ($overwrite_values) {
        my $sql = "SELECT cvalue_id, uniquename FROM phenotype WHERE value is not NULL; ";
        my $sth = $c->dbc->dbh->prepare($sql);
        $sth->execute();
        while (my ($db_cvalue_id, $db_uniquename) = $sth->fetchrow_array) {
            my ($stock_string, $rest_of_name) = split( /,/, $db_uniquename);
            $check_unique_trait_stock{$db_cvalue_id, $stock_string} = 1;
        }
    }

    # Trait rows are resolved once per trait name instead of once per plot.
    my %trait_cvterm_for;
    my $get_trait_cvterm = sub {
        my $trait_name = shift;
        if (!exists($trait_cvterm_for{$trait_name})) {
            #For multiterm traits of the form trait1|CO:0000001||trait2|CO:00000002
            if ($trait_name =~ /\|\|/ ) {
                $trait_cvterm_for{$trait_name} = SGN::Model::Cvterm->get_cvterm_row($schema, $trait_name, 'cassava_trait');
            } else {
                $trait_cvterm_for{$trait_name} = SGN::Model::Cvterm->get_cvterm_row_from_trait_name($schema, $trait_name);
            }
        }
        return $trait_cvterm_for{$trait_name};
    };

    # Stores every non-empty trait value of one plot. Shared by both
    # transaction bodies below, which differ only in how the stock, location
    # and project are resolved and in the timestamp used when none is given.
    my $store_plot_values = sub {
        my ($plot_name, $stock_id, $location_id, $project_id, $default_timestamp) = @_;

        foreach my $trait_name (@trait_list) {
            #print STDERR "trait: $trait_name\n";
            my $trait_cvterm = $get_trait_cvterm->($trait_name);

            my $value_array = $plot_trait_value{$plot_name}->{$trait_name};
            #print STDERR Dumper $value_array;
            my $trait_value = $value_array->[0];
            my $timestamp = $value_array->[1];
            if (!$timestamp) {
                $timestamp = $default_timestamp;
            }

            # undef and '' mean "nothing to store"; 0 is a real value.
            next unless ( defined($trait_value) && $trait_value ne '' );

            # Fail inside the transaction with a readable message rather than
            # with a database constraint error further down.
            die "No field layout information found for plot or plant '$plot_name'\n" unless defined($stock_id);

            my $trait_cvterm_id = $trait_cvterm->cvterm_id();
            my $stock_key = "Stock: ".$stock_id;

            #Remove previous phenotype values for a given stock and trait, if $overwrite values is checked
            if ($overwrite_values) {
                if (exists($check_unique_trait_stock{$trait_cvterm_id, $stock_key})) {
                    # Match this stock only: the name is either exactly
                    # "Stock: <id>" or continues with a comma. A bare
                    # "Stock: <id>%" prefix would also hit ids that merely
                    # start with the same digits (12 vs 120) and delete their
                    # phenotypes.
                    my $overwrite_phenotypes_rs = $schema->resultset("Phenotype::Phenotype")->search({
                        uniquename => [ { '=' => $stock_key }, { 'like' => $stock_key.',%' } ],
                        cvalue_id => $trait_cvterm_id,
                    });
                    while (my $previous_phenotype = $overwrite_phenotypes_rs->next()) {
                        #print STDERR "removing phenotype: ".$previous_phenotype->uniquename()."\n";
                        $previous_phenotype->delete();
                    }
                }
                $check_unique_trait_stock{$trait_cvterm_id, $stock_key} = 1;
            }

            my $plot_trait_uniquename = "Stock: " .
                $stock_id . ", trait: " .
                $trait_cvterm->name .
                " date: $timestamp" .
                " operator = $operator" ;
            my $phenotype = $trait_cvterm
                ->find_or_create_related("phenotype_cvalues", {
                    observable_id => $trait_cvterm->cvterm_id,
                    value => $trait_value ,
                    uniquename => $plot_trait_uniquename,
                });

            #print STDERR "\n[StorePhenotypes] Storing plot: $plot_name trait: $trait_name value: $trait_value:\n";

            # A new experiment is created for every stored value; reusing an
            # existing experiment with the same location, operator and date is
            # not implemented.
            my $experiment = $schema->resultset('NaturalDiversity::NdExperiment')
                ->create({nd_geolocation_id => $location_id, type_id => $phenotyping_experiment_cvterm->cvterm_id()});
            $experiment->create_nd_experimentprops({date => $upload_date},{autocreate => 1, cv_name => 'local'});
            $experiment->create_nd_experimentprops({operator => $operator}, {autocreate => 1 ,cv_name => 'local'});

            ## Link the experiment to the project
            $experiment->create_related('nd_experiment_projects', {project_id => $project_id});

            # Link the experiment to the stock
            $experiment->create_related('nd_experiment_stocks', {stock_id => $stock_id, type_id => $phenotyping_experiment_cvterm->cvterm_id});

            ## Link the phenotype to the experiment
            $experiment->create_related('nd_experiment_phenotypes', {phenotype_id => $phenotype->phenotype_id });
            #print STDERR "[StorePhenotypes] Linking phenotype: $plot_trait_uniquename to experiment " .$experiment->nd_experiment_id . "Time:".localtime()."\n";

            $experiment_ids{$experiment->nd_experiment_id()}=1;
        }
        return;
    };

    ## Use txn_do with the following coderef so that if any part fails, the entire transaction fails.

    #For storing files where num_plots * num_traits <= 100.
    # Resolves stock, location and project with individual queries per plot.
    my $coderef_small_file = sub {
        foreach my $plot_name (@plot_list) {
            #print STDERR "plot: $plot_name\n";
            my $stock = $schema->resultset("Stock::Stock")->find( { uniquename => $plot_name, 'me.type_id' => [$plot_cvterm_id, $plant_cvterm_id] } );
            my $stock_id = $stock->stock_id;

            my $field_layout_experiment = $stock
                ->search_related('nd_experiment_stocks')
                ->search_related('nd_experiment')
                ->find({'type.name' => 'field_layout' },
                       { join => 'type' });

            my $location_id = $field_layout_experiment->nd_geolocation_id;
            my $project = $field_layout_experiment->nd_experiment_projects->single ; #there should be one project linked with the field experiment
            my $project_id = $project->project_id;

            # NOTE(review): this path defaults a missing timestamp to
            # 'NA'.$upload_date while the large-file path uses plain 'NA'.
            # Kept as found; confirm which one is intended.
            $store_plot_values->($plot_name, $stock_id, $location_id, $project_id, 'NA'.$upload_date);
        }
    };

    #For storing files where num_plots * num_traits > 100.
    # Fetches stock, location and project for all field-layout plots/plants in
    # one query and looks each uploaded plot up in the result.
    my $coderef_large_file = sub {
        my %data;

        my $rs = $schema->resultset('Stock::Stock')->search(
            {'type.name' => 'field_layout', 'me.type_id' => [$plot_cvterm_id, $plant_cvterm_id] },
            {join=> {'nd_experiment_stocks' => {'nd_experiment' => ['type', 'nd_experiment_projects' ] } } ,
             '+select'=> ['me.stock_id', 'me.uniquename', 'nd_experiment.nd_geolocation_id', 'nd_experiment_projects.project_id'],
             '+as'=> ['stock_id', 'uniquename', 'nd_geolocation_id', 'project_id']
            }
        );

        while (my $s = $rs->next()) {
            $data{$s->get_column('uniquename')} = [$s->get_column('stock_id'), $s->get_column('nd_geolocation_id'), $s->get_column('project_id') ];
        }

        foreach my $plot_name (@plot_list) {
            # Plain lookup; a plot missing from %data yields undef ids, which
            # the shared helper reports when there is a value to store.
            my $plot_info = exists($data{$plot_name}) ? $data{$plot_name} : [];
            my ($stock_id, $location_id, $project_id) = @{$plot_info}[0 .. 2];

            $store_plot_values->($plot_name, $stock_id, $location_id, $project_id, 'NA');
        }
    };

    my $transaction_coderef = ($size <= 100) ? $coderef_small_file : $coderef_large_file;
    try {
        $schema->txn_do($transaction_coderef);
    } catch {
        $transaction_error = $_;
    };

    if ($transaction_error) {
        $error_message = $transaction_error;
        print STDERR "Transaction error storing phenotypes: $transaction_error\n";
        return $error_message;
    }

    if ($archived_file) {
        ## Insert metadata about the uploaded file only after a successful phenotype data transaction
        my $md5 = Digest::MD5->new();
        # 'none' marks an upload without a file on disk; the metadata rows are
        # still created, with the digest of empty input.
        if ($archived_file ne 'none') {
            open(my $F, "<", $archived_file) || die "Can't open file ".$archived_file;
            binmode $F;
            $md5->addfile($F);
            close($F);
        }

        my $md_row = $metadata_schema->resultset("MdMetadata")->create({create_person_id => $user_id,});
        $md_row->insert();
        my $file_row = $metadata_schema->resultset("MdFiles")
            ->create({
                basename => basename($archived_file),
                dirname => dirname($archived_file),
                filetype => $archived_file_type,
                md5checksum => $md5->hexdigest(),
                metadata_id => $md_row->metadata_id(),
            });
        $file_row->insert();

        foreach my $nd_experiment_id (keys %experiment_ids) {
            ## Link the file to the experiment
            my $experiment_files = $phenome_schema->resultset("NdExperimentMdFiles")
                ->create({
                    nd_experiment_id => $nd_experiment_id,
                    file_id => $file_row->file_id(),
                });
            $experiment_files->insert();
            #print STDERR "[StorePhenotypes] Linking file: $archived_file \n\t to experiment id " . $nd_experiment_id . "\n";
        }
    }

    return $error_message;
}