1 package CXGN
::Phenotypes
::StorePhenotypes
;
5 CXGN::Phenotypes::StorePhenotypes - an object to handle storing phenotypes for SGN stocks
9 my $store_phenotypes = CXGN::Phenotypes::StorePhenotypes->new(
11 metadata_schema=>$metadata_schema,
12 phenome_schema=>$phenome_schema,
16 values_hash=>$parsed_data,
17 has_timestamps=>$timestamp_included,
18 overwrite_values=>$overwrite,
19 metadata_hash=>$phenotype_metadata,
20 image_zipfile_path=>$image_zip
22 my ($verified_warning, $verified_error) = $store_phenotypes->verify();
23 my ($stored_phenotype_error, $stored_Phenotype_success) = $store_phenotypes->store();
30 Jeremy D. Edwards (jde22@cornell.edu)
31 Naama Menda (nm249@cornell.edu)
32 Nicolas Morales (nm529@cornell.edu)
40 use File
::Basename qw
| basename dirname
|;
42 use CXGN
::List
::Validate
;
44 use Scalar
::Util
qw(looks_like_number);
48 use CXGN
::List
::Transform
;
50 has
'bcs_schema' => ( isa
=> 'Bio::Chado::Schema',
55 has
'metadata_schema' => ( isa
=> 'CXGN::Metadata::Schema',
60 has
'phenome_schema' => ( isa
=> 'CXGN::Phenome::Schema',
65 has
'user_id' => (isa
=> "Int",
70 has
'stock_list' => (isa
=> "ArrayRef",
75 has
'stock_id_list' => (isa
=> "ArrayRef[Int]|Undef",
80 has
'trait_list' => (isa
=> "ArrayRef",
85 has
'values_hash' => (isa
=> "HashRef",
90 has
'has_timestamps' => (isa
=> "Bool",
95 has
'overwrite_values' => (isa
=> "Bool",
100 has
'metadata_hash' => (isa
=> "HashRef",
105 has
'image_zipfile_path' => (isa
=> "Str | Undef",
110 has
'trait_objs' => (isa
=> "HashRef",
114 has
'unique_value_trait_stock' => (isa
=> "HashRef",
118 has
'unique_trait_stock' => (isa
=> "HashRef",
122 #build is used for creating hash lookups in this case
123 sub create_hash_lookups
{
125 my $schema = $self->bcs_schema;
127 #Find trait cvterm objects and put them in a hash
129 my @trait_list = @
{$self->trait_list};
130 my @stock_list = @
{$self->stock_list};
133 my $t = CXGN
::List
::Transform
->new();
134 my $stock_id_list = $t->transform($schema, 'stocks_2_stock_ids', \
@stock_list);
135 $self->stock_id_list($stock_id_list->{'transform'});
137 foreach my $trait_name (@trait_list) {
138 #print STDERR "trait: $trait_name\n";
139 my $trait_cvterm = SGN
::Model
::Cvterm
->get_cvterm_row_from_trait_name($schema, $trait_name);
140 $trait_objs{$trait_name} = $trait_cvterm;
141 push @cvterm_ids, $trait_cvterm->cvterm_id();
143 $self->trait_objs(\
%trait_objs);
145 #for checking if values in the file are already stored in the database or in the same file
146 my %check_unique_trait_stock;
147 my %check_unique_value_trait_stock;
148 my $previous_phenotype_rs = $schema->resultset('Phenotype::Phenotype')->search({'me.cvalue_id'=>{-in=>\
@cvterm_ids}, 'stock.stock_id'=>{-in=>$self->stock_id_list}}, {'join'=>{'nd_experiment_phenotypes'=>{'nd_experiment'=>{'nd_experiment_stocks'=>'stock'}}}, 'select' => ['me.value', 'me.cvalue_id', 'stock.stock_id'], 'as' => ['value', 'cvterm_id', 'stock_id']});
149 while (my $previous_phenotype_cvterm = $previous_phenotype_rs->next() ) {
150 my $cvterm_id = $previous_phenotype_cvterm->get_column('cvterm_id');
151 my $stock_id = $previous_phenotype_cvterm->get_column('stock_id');
153 my $previous_value = $previous_phenotype_cvterm->get_column('value') || ' ';
154 $check_unique_trait_stock{$cvterm_id, $stock_id} = $previous_value;
155 $check_unique_value_trait_stock{$previous_value, $cvterm_id, $stock_id} = 1;
158 $self->unique_value_trait_stock(\
%check_unique_value_trait_stock);
159 $self->unique_trait_stock(\
%check_unique_trait_stock);
165 my @plot_list = @
{$self->stock_list};
166 my @trait_list = @
{$self->trait_list};
167 my %plot_trait_value = %{$self->values_hash};
168 my %phenotype_metadata = %{$self->metadata_hash};
169 my $timestamp_included = $self->has_timestamps;
170 my $archived_image_zipfile_with_path = $self->image_zipfile_path;
171 my $schema = $self->bcs_schema;
172 my $transaction_error;
173 #print STDERR Dumper \%plot_trait_value;
174 my $plot_validator = CXGN
::List
::Validate
->new();
175 my $trait_validator = CXGN
::List
::Validate
->new();
176 my @plots_missing = @
{$plot_validator->validate($schema,'plots_or_subplots_or_plants',\
@plot_list)->{'missing'}};
177 my @traits_missing = @
{$trait_validator->validate($schema,'traits',\
@trait_list)->{'missing'}};
178 @trait_list = @
{$self->trait_list};
182 if (scalar(@plots_missing) > 0 || scalar(@traits_missing) > 0) {
183 print STDERR
"Plots or traits not valid\n";
184 print STDERR
"Invalid plots: ".join(", ", map { "'$_'" } @plots_missing)."\n" if (@plots_missing);
185 print STDERR
"Invalid traits: ".join(", ", map { "'$_'" } @traits_missing)."\n" if (@traits_missing);
186 $error_message = "Invalid plots: <br/>".join(", <br/>", map { "'$_'" } @plots_missing) if (@plots_missing);
187 $error_message = "Invalid traits: <br/>".join(", <br/>", map { "'$_'" } @traits_missing) if (@traits_missing);
188 return ($warning_message, $error_message);
191 $self->create_hash_lookups();
192 my %trait_objs = %{$self->trait_objs};
193 my %check_unique_value_trait_stock = %{$self->unique_value_trait_stock};
194 my %check_unique_trait_stock = %{$self->unique_trait_stock};
196 my %check_trait_category;
197 my $sql = "SELECT b.value, c.cvterm_id from cvtermprop as b join cvterm as a on (b.type_id = a.cvterm_id) join cvterm as c on (b.cvterm_id=c.cvterm_id) where a.name = 'trait_categories';";
198 my $sth = $schema->storage->dbh->prepare($sql);
200 while (my ($category_value, $cvterm_id) = $sth->fetchrow_array) {
201 $check_trait_category{$cvterm_id} = $category_value;
204 my %check_trait_format;
205 $sql = "SELECT b.value, c.cvterm_id from cvtermprop as b join cvterm as a on (b.type_id = a.cvterm_id) join cvterm as c on (b.cvterm_id=c.cvterm_id) where a.name = 'trait_format';";
206 $sth = $schema->storage->dbh->prepare($sql);
208 while (my ($format_value, $cvterm_id) = $sth->fetchrow_array) {
209 $check_trait_format{$cvterm_id} = $format_value;
212 my %image_plot_full_names;
213 if ($archived_image_zipfile_with_path) {
215 my $archived_zip = CXGN
::ZipFile
->new(archived_zipfile_path
=>$archived_image_zipfile_with_path);
216 my @archived_zipfile_return = $archived_zip->file_names();
217 if (!@archived_zipfile_return){
218 $error_message = $error_message."<small>Image zipfile could not be read. Is it .zip format?</small><hr>";
220 my $file_names_stripped = $archived_zipfile_return[0];
221 my $file_names_full = $archived_zipfile_return[1];
222 foreach (@
$file_names_full) {
223 $image_plot_full_names{$_} = 1;
226 foreach (@plot_list) {
227 $plot_name_check{$_} = 1;
229 foreach my $img_name (@
$file_names_stripped) {
230 $img_name = substr($img_name, 0, -20);
231 if (!exists($plot_name_check{$img_name})) {
232 $error_message = $error_message."<small>Image ".$img_name." in images zip file does not reference a plot or plant_name!</small><hr>";
238 my %check_file_stock_trait_duplicates;
240 foreach my $plot_name (@plot_list) {
241 foreach my $trait_name (@trait_list) {
242 my $value_array = $plot_trait_value{$plot_name}->{$trait_name};
243 #print STDERR Dumper $value_array;
244 my $trait_value = $value_array->[0];
245 my $timestamp = $value_array->[1];
246 #print STDERR "$plot_name, $trait_name, $trait_value\n";
248 my $trait_cvterm = $trait_objs{$trait_name};
249 my $trait_cvterm_id = $trait_cvterm->cvterm_id();
250 my $stock_id = $schema->resultset('Stock::Stock')->find({'uniquename' => $plot_name})->stock_id();
252 if ($trait_value eq '.' || ($trait_value =~ m/[^a-zA-Z0-9.\-\/\_
]/ && $trait_value ne '.')){
253 $error_message = $error_message."<small>Trait values must be alphanumeric with no spaces: <br/>Plot Name: ".$plot_name."<br/>Trait Name: ".$trait_name."<br/>Value: ".$trait_value."</small><hr>";
256 #check that trait value is valid for trait name
257 if (exists($check_trait_format{$trait_cvterm_id})) {
258 if ($check_trait_format{$trait_cvterm_id} eq 'numeric') {
259 my $trait_format_checked = looks_like_number
($trait_value);
260 if (!$trait_format_checked) {
261 $error_message = $error_message."<small>This trait value should be numeric: <br/>Plot Name: ".$plot_name."<br/>Trait Name: ".$trait_name."<br/>Value: ".$trait_value."</small><hr>";
264 if ($check_trait_format{$trait_cvterm_id} eq 'image') {
265 $trait_value =~ s/^.*photos\///;
266 if (!exists($image_plot_full_names{$trait_value})) {
267 $error_message = $error_message."<small>For Plot Name: $plot_name there should be a corresponding image named in the zipfile called $trait_value. </small><hr>";
271 if (exists($check_trait_category{$trait_cvterm_id})) {
272 my @trait_categories = split /\//, $check_trait_category{$trait_cvterm_id};
273 my %trait_categories_hash = map { $_ => 1 } @trait_categories;
274 if (!exists($trait_categories_hash{$trait_value})) {
275 $error_message = $error_message."<small>This trait value should be one of ".$check_trait_category{$trait_cvterm_id}.": <br/>Plot Name: ".$plot_name."<br/>Trait Name: ".$trait_name."<br/>Value: ".$trait_value."</small><hr>";
279 #print STDERR "$trait_value, $trait_cvterm_id, $stock_id\n";
280 #check if the plot_name, trait_name combination already exists in database.
281 if (exists($check_unique_value_trait_stock{$trait_value, $trait_cvterm_id, $stock_id})) {
282 $warning_message = $warning_message."<small>$plot_name already has the same value as in your file ($trait_value) stored for the trait $trait_name.</small><hr>";
283 } elsif (exists($check_unique_trait_stock{$trait_cvterm_id, $stock_id})) {
284 $warning_message = $warning_message."<small>$plot_name already has a different value ($check_unique_trait_stock{$trait_cvterm_id, $stock_id}) than in your file ($trait_value) stored in the database for the trait $trait_name.</small><hr>";
287 #check if the plot_name, trait_name combination already exists in same file.
288 if (exists($check_file_stock_trait_duplicates{$trait_cvterm_id, $stock_id})) {
289 $warning_message = $warning_message."<small>$plot_name already has a value for the trait $trait_name in your file. Possible duplicate in your file?</small><hr>";
291 $check_file_stock_trait_duplicates{$trait_cvterm_id, $stock_id} = 1;
294 if ($timestamp_included) {
295 if ( (!$timestamp && !$trait_value) || ($timestamp && !$trait_value) || ($timestamp && $trait_value) ) {
297 if( !$timestamp =~ m/(\d{4})-(\d{2})-(\d{2}) (\d{2}):(\d{2}):(\d{2})(\S)(\d{4})/) {
298 $error_message = $error_message."<small>Bad timestamp for value for Plot Name: ".$plot_name."<br/>Trait Name: ".$trait_name."<br/>Should be YYYY-MM-DD HH:MM:SS-0000 or YYYY-MM-DD HH:MM:SS+0000</small><hr>";
302 $error_message = $error_message."<small>'Timestamps Included' is selected, but no timestamp for value for Plot Name: ".$plot_name."<br/>Trait Name: ".$trait_name."</small><hr>";
306 $error_message = $error_message."<small>Timestamps found in file, but 'Timestamps Included' is not selected.</small><hr>";
314 if ($phenotype_metadata{'archived_file'} && (!$phenotype_metadata{'archived_file_type'} || $phenotype_metadata{'archived_file_type'} eq "")) {
315 $error_message = "No file type provided for archived file.";
316 return ($warning_message, $error_message);
318 if (!$phenotype_metadata{'operator'} || $phenotype_metadata{'operator'} eq "") {
319 $error_message = "No operaror provided in file upload metadata.";
320 return ($warning_message, $error_message);
322 if (!$phenotype_metadata{'date'} || $phenotype_metadata{'date'} eq "") {
323 $error_message = "No date provided in file upload metadata.";
324 return ($warning_message, $error_message);
327 return ($warning_message, $error_message);
332 $self->create_hash_lookups();
333 my @plot_list = @
{$self->stock_list};
334 my @trait_list = @
{$self->trait_list};
335 my %trait_objs = %{$self->trait_objs};
336 my %plot_trait_value = %{$self->values_hash};
337 my %phenotype_metadata = %{$self->metadata_hash};
338 my $timestamp_included = $self->has_timestamps;
339 my $archived_image_zipfile_with_path = $self->image_zipfile_path;
340 my $phenotype_metadata = $self->metadata_hash;
341 my $schema = $self->bcs_schema;
342 my $metadata_schema = $self->metadata_schema;
343 my $phenome_schema = $self->phenome_schema;
344 my $overwrite_values = $self->overwrite_values;
346 my $transaction_error;
347 my $user_id = $self->user_id;
348 my $archived_file = $phenotype_metadata->{'archived_file'};
349 my $archived_file_type = $phenotype_metadata->{'archived_file_type'};
350 my $operator = $phenotype_metadata->{'operator'};
351 my $upload_date = $phenotype_metadata->{'date'};
354 my $phenotyping_experiment_cvterm_id = SGN
::Model
::Cvterm
->get_cvterm_row($schema, 'phenotyping_experiment', 'experiment_type')->cvterm_id();
355 my $plot_cvterm_id = SGN
::Model
::Cvterm
->get_cvterm_row($schema, 'plot', 'stock_type')->cvterm_id();
356 my $plant_cvterm_id = SGN
::Model
::Cvterm
->get_cvterm_row($schema, 'plant', 'stock_type')->cvterm_id();
357 my $subplot_cvterm_id = SGN
::Model
::Cvterm
->get_cvterm_row($schema, 'subplot', 'stock_type')->cvterm_id();
359 ## Track experiments seen to allow for multiple trials and experiments to exist in an uploaded file.
360 ## Used later to attach file metadata.
361 my %experiment_ids;##
364 my %check_unique_trait_stock = %{$self->unique_trait_stock};
368 $rs = $schema->resultset('Stock::Stock')->search(
369 {'type.name' => 'field_layout', 'me.type_id' => [$plot_cvterm_id, $plant_cvterm_id, $subplot_cvterm_id], 'me.stock_id' => {-in=>$self->stock_id_list } },
370 {join=> {'nd_experiment_stocks' => {'nd_experiment' => ['type', 'nd_experiment_projects' ] } } ,
371 '+select'=> ['me.stock_id', 'me.uniquename', 'nd_experiment.nd_geolocation_id', 'nd_experiment_projects.project_id'],
372 '+as'=> ['stock_id', 'uniquename', 'nd_geolocation_id', 'project_id']
375 while (my $s = $rs->next()) {
376 $data{$s->get_column('uniquename')} = [$s->get_column('stock_id'), $s->get_column('nd_geolocation_id'), $s->get_column('project_id') ];
379 ## Use txn_do with the following coderef so that if any part fails, the entire transaction fails.
381 my @overwritten_values;
383 foreach my $plot_name (@plot_list) {
385 my $stock_id = $data{$plot_name}[0];
386 my $location_id = $data{$plot_name}[1];
387 my $project_id = $data{$plot_name}[2];
389 foreach my $trait_name (@trait_list) {
391 #print STDERR "trait: $trait_name\n";
392 my $trait_cvterm = $trait_objs{$trait_name};
394 my $value_array = $plot_trait_value{$plot_name}->{$trait_name};
395 #print STDERR Dumper $value_array;
396 my $trait_value = $value_array->[0];
397 my $timestamp = $value_array->[1];
399 $timestamp = 'NA'.$upload_date;
401 my $treatments = $value_array->[2];
403 if (defined($trait_value) && length($trait_value)) {
405 #Remove previous phenotype values for a given stock and trait, if $overwrite values is checked
406 if ($overwrite_values) {
407 if (exists($check_unique_trait_stock{$trait_cvterm->cvterm_id(), $stock_id})) {
408 push @overwritten_values, $self->delete_previous_phenotypes($trait_cvterm->cvterm_id(), $stock_id);
410 $check_unique_trait_stock{$trait_cvterm->cvterm_id(), $stock_id} = 1;
413 my $plot_trait_uniquename = "Stock: " .
414 $stock_id . ", trait: " .
415 $trait_cvterm->name .
416 " date: $timestamp" .
417 " operator = $operator" ;
419 my $phenotype = $trait_cvterm
420 ->find_related("phenotype_cvalues", {
421 observable_id
=> $trait_cvterm->cvterm_id,
422 value
=> $trait_value ,
423 uniquename
=> $plot_trait_uniquename,
428 my $phenotype = $trait_cvterm
429 ->create_related("phenotype_cvalues", {
430 observable_id
=> $trait_cvterm->cvterm_id,
431 value
=> $trait_value ,
432 uniquename
=> $plot_trait_uniquename,
435 my $experiment = $schema->resultset('NaturalDiversity::NdExperiment')->create({
436 nd_geolocation_id
=> $location_id,
437 type_id
=> $phenotyping_experiment_cvterm_id
439 $experiment->create_nd_experimentprops({date
=> $upload_date},{autocreate
=> 1, cv_name
=> 'local'});
440 $experiment->create_nd_experimentprops({operator
=> $operator}, {autocreate
=> 1 ,cv_name
=> 'local'});
442 ## Link the experiment to the project
443 $experiment->create_related('nd_experiment_projects', {project_id
=> $project_id});
445 #Link the experiment to the treatments
446 foreach my $treatment (@
$treatments){
447 my $treatment_project_id = $schema->resultset('Project::Project')->find({name
=>$treatment})->project_id();
448 $experiment->create_related('nd_experiment_projects', {project_id
=> $treatment_project_id});
451 # Link the experiment to the stock
452 $experiment->create_related('nd_experiment_stocks', { stock_id
=> $stock_id, type_id
=> $phenotyping_experiment_cvterm_id });
454 ## Link the phenotype to the experiment
455 $experiment->create_related('nd_experiment_phenotypes', {phenotype_id
=> $phenotype->phenotype_id });
456 #print STDERR "[StorePhenotypes] Linking phenotype: $plot_trait_uniquename to experiment " .$experiment->nd_experiment_id . "Time:".localtime()."\n";
458 $experiment_ids{$experiment->nd_experiment_id()}=1;
465 $success_message = 'All values in your file are now saved in the database!';
466 #print STDERR Dumper \@overwritten_values;
467 my %files_with_overwritten_values = map {$_->[0] => 1} @overwritten_values;
468 my $obsoleted_files = $self->check_overwritten_files_status(keys %files_with_overwritten_values);
469 if (scalar (@
$obsoleted_files) > 0){
470 $success_message .= ' The following previously uploaded files are now obsolete because all values from them were overwritten by your upload: ';
471 foreach (@
$obsoleted_files){
472 $success_message .= " ".$_->[1];
478 $schema->txn_do($coderef);
480 $transaction_error = $_;
483 if ($transaction_error) {
484 $error_message = $transaction_error;
485 print STDERR
"Transaction error storing phenotypes: $transaction_error\n";
486 return ($error_message, $success_message);
489 if ($archived_file) {
490 $self->save_archived_file_metadata($archived_file, $archived_file_type, \
%experiment_ids);
493 return ($error_message, $success_message);
497 sub delete_previous_phenotypes
{
499 my $trait_cvterm_id = shift;
500 my $stock_id = shift;
503 DROP TABLE IF EXISTS temp_pheno_duplicate_deletion;
504 CREATE TEMP TABLE temp_pheno_duplicate_deletion AS
505 (SELECT phenotype_id, nd_experiment_id, file_id
507 JOIN nd_experiment_phenotype using(phenotype_id)
508 JOIN nd_experiment_stock using(nd_experiment_id)
509 LEFT JOIN phenome.nd_experiment_md_files using(nd_experiment_id)
510 JOIN stock using(stock_id)
511 WHERE stock.stock_id=?
512 AND phenotype.cvalue_id=?);
513 DELETE FROM phenotype WHERE phenotype_id IN (SELECT phenotype_id FROM temp_pheno_duplicate_deletion);
514 DELETE FROM phenome.nd_experiment_md_files WHERE nd_experiment_id IN (SELECT nd_experiment_id FROM temp_pheno_duplicate_deletion);
515 DELETE FROM nd_experiment WHERE nd_experiment_id IN (SELECT nd_experiment_id FROM temp_pheno_duplicate_deletion);
517 my $q2 = "SELECT phenotype_id, nd_experiment_id, file_id FROM temp_pheno_duplicate_deletion;";
519 my $h = $self->bcs_schema->storage->dbh()->prepare($q);
520 my $h2 = $self->bcs_schema->storage->dbh()->prepare($q2);
521 $h->execute($stock_id, $trait_cvterm_id);
524 my @deleted_phenotypes;
525 while (my ($phenotype_id, $nd_experiment_id, $file_id) = $h2->fetchrow_array()) {
526 push @deleted_phenotypes, [$file_id, $phenotype_id, $nd_experiment_id];
528 return @deleted_phenotypes;
531 sub check_overwritten_files_status
{
533 my @file_ids = shift;
534 #print STDERR Dumper \@file_ids;
536 my $q = "SELECT count(nd_experiment_md_files_id) FROM metadata.md_files JOIN phenome.nd_experiment_md_files using(file_id) WHERE file_id=?;";
537 my $q2 = "UPDATE metadata.md_metadata SET obsolete=1 where metadata_id IN (SELECT metadata_id FROM metadata.md_files where file_id=?);";
538 my $q3 = "SELECT basename FROM metadata.md_files where file_id=?;";
539 my $h = $self->bcs_schema->storage->dbh()->prepare($q);
540 my $h2 = $self->bcs_schema->storage->dbh()->prepare($q2);
541 my $h3 = $self->bcs_schema->storage->dbh()->prepare($q3);
546 my $count = $h->fetchrow;
547 print STDERR
"COUNT $count \n";
551 my $basename = $h3->fetchrow;
552 push @obsoleted_files, [$_, $basename];
553 print STDERR
"MADE file_id $_ OBSOLETE\n";
557 #print STDERR Dumper \@obsoleted_files;
558 return \
@obsoleted_files;
561 sub save_archived_file_metadata
{
563 my $archived_file = shift;
564 my $archived_file_type = shift;
565 my $experiment_ids = shift;
567 my $upload_file = CXGN
::UploadFile
->new();
568 my $md5 = $upload_file->get_md5($archived_file);
570 my $md_row = $self->metadata_schema->resultset("MdMetadata")->create({create_person_id
=> $self->user_id,});
572 my $file_row = $self->metadata_schema->resultset("MdFiles")
574 basename
=> basename
($archived_file),
575 dirname
=> dirname
($archived_file),
576 filetype
=> $archived_file_type,
577 md5checksum
=> $md5->hexdigest(),
578 metadata_id
=> $md_row->metadata_id(),
582 foreach my $nd_experiment_id (keys %$experiment_ids) {
583 ## Link the file to the experiment
584 my $experiment_files = $self->phenome_schema->resultset("NdExperimentMdFiles")
586 nd_experiment_id
=> $nd_experiment_id,
587 file_id
=> $file_row->file_id(),
589 $experiment_files->insert();
590 #print STDERR "[StorePhenotypes] Linking file: $archived_file \n\t to experiment id " . $nd_experiment_id . "\n";