Merge pull request #5205 from solgenomics/topic/generic_trial_upload
[sgn.git] / lib / CXGN / Phenotypes / PhenotypeMatrix.pm
blobe1e3abb909ed4eaa5a67431f25be1059ad2ccdba
1 package CXGN::Phenotypes::PhenotypeMatrix;
3 =head1 NAME
5 CXGN::Phenotypes::PhenotypeMatrix - builds the phenotype matrix (a header row followed by one row per observation unit). Uses CXGN::Phenotypes::SearchFactory to search either the native database tables or the materialized views.
7 =head1 USAGE
9 my $phenotypes_search = CXGN::Phenotypes::PhenotypeMatrix->new(
10 bcs_schema=>$schema,
11 search_type=>$search_type,
12 data_level=>$data_level,
13 trait_list=>$trait_list,
14 trial_list=>$trial_list,
15 program_list=>$self->program_list,
16 folder_list=>$self->folder_list,
17 year_list=>$year_list,
18 location_list=>$location_list,
19 accession_list=>$accession_list,
20 plot_list=>$plot_list,
21 plant_list=>$plant_list,
22 include_timestamp=>$include_timestamp,
23 include_pedigree_parents=>$include_pedigree_parents,
24 exclude_phenotype_outlier=>0,
25 dataset_exluded_outliers=>$dataset_exluded_outliers,
26 trait_contains=>$trait_contains,
27 phenotype_min_value=>$phenotype_min_value,
28 phenotype_max_value=>$phenotype_max_value,
29 start_date => $start_date,
30 end_date => $end_date,
31 include_dateless_items => $include_dateless_items,
32 limit=>$limit,
33 offset=>$offset
35 my @data = $phenotypes_search->get_phenotype_matrix();
37 =head1 DESCRIPTION
40 =head1 AUTHORS
43 =cut
45 use strict;
46 use warnings;
47 use Moose;
48 use Data::Dumper;
49 use SGN::Model::Cvterm;
50 use CXGN::Stock::StockLookup;
51 use CXGN::Phenotypes::SearchFactory;
52 use CXGN::BreedersToolbox::Projects;
# ---------------------------------------------------------------------------
# Constructor attributes
# ---------------------------------------------------------------------------

# Schema object; forwarded to the search backend and used for the treatment
# and pedigree lookups made while the matrix is assembled.
has 'bcs_schema' => (
    is       => 'rw',
    isa      => 'Bio::Chado::Schema',
    required => 1,
);

# Name of the search backend handed to CXGN::Phenotypes::SearchFactory.
# PREFERRED MaterializedViewTable (MaterializedViewTable or Native)
has 'search_type' => (
    is       => 'rw',
    isa      => 'Str',
    required => 1,
);

# (plot, plant, or all)
has 'data_level' => (
    is  => 'ro',
    isa => 'Str|Undef',
);

# Optional integer-id filters. All of them are forwarded unchanged to the
# search backend; dataset_exluded_outliers (sic) holds phenotype ids whose
# values are blanked in the MaterializedViewTable output.
has [qw(
    trial_list
    program_list
    folder_list
    trait_list
    accession_list
    plot_list
    plant_list
    subplot_list
    location_list
    year_list
    dataset_exluded_outliers
)] => (
    is  => 'rw',
    isa => 'ArrayRef[Int]|Undef',
);

# Output switches, all off by default.
has [qw(
    include_pedigree_parents
    include_timestamp
    include_phenotype_primary_key
)] => (
    is      => 'ro',
    isa     => 'Bool|Undef',
    default => 0,
);

# Forwarded to the search backend; off by default.
has 'exclude_phenotype_outlier' => (
    is      => 'ro',
    isa     => 'Bool',
    default => 0,
);

# Forwarded to the search backend.
has 'trait_contains' => (
    is  => 'rw',
    isa => 'ArrayRef[Str]|Undef',
);

# Value bounds, forwarded to the search backend.
has [qw(phenotype_min_value phenotype_max_value)] => (
    is  => 'rw',
    isa => 'Str|Undef',
);

# Date window, forwarded to the search backend.
has 'start_date' => (
    is      => 'rw',
    isa     => 'Str|Undef',
    default => sub { "1900-01-01" },
);

has 'end_date' => (
    is      => 'rw',
    isa     => 'Str|Undef',
    default => sub { "2100-12-31" },
);

# Forwarded to the search backend; on by default.
has 'include_dateless_items' => (
    is      => 'rw',
    isa     => 'Str|Undef',
    default => sub { 1 },
);

# Paging, forwarded to the search backend.
has [qw(limit offset)] => (
    is  => 'rw',
    isa => 'Int|Undef',
);

# get_phenotype_matrix
#
# Runs the configured phenotype search and flattens the result into a matrix.
#
# Returns a list of array refs. The first one is the header row, every
# following one describes a single observation unit. Columns are, in order:
# the fixed metadata columns, (MaterializedViewTable only) the seedlot columns
# and optional pedigree columns, one column per trait sorted by name
# (MaterializedViewTable only: each optionally followed by a
# "<trait>_phenotype_id" column), 'notes', and one column per treatment name.
#
# When include_timestamp is set, a trait value is rendered as
# "value,timestamp" whenever a timestamp (or, for MaterializedViewTable, a
# collect_date) is available.
#
# Only the MaterializedViewTable branch applies dataset_exluded_outliers,
# include_pedigree_parents and include_phenotype_primary_key; the other
# branch ignores them.
sub get_phenotype_matrix {
    my $self = shift;
    my $include_pedigree_parents      = $self->include_pedigree_parents();
    my $include_timestamp             = $self->include_timestamp;
    my $include_phenotype_primary_key = $self->include_phenotype_primary_key;

    print STDERR "GET PHENOMATRIX ".$self->search_type."\n";

    my $phenotypes_search = CXGN::Phenotypes::SearchFactory->instantiate(
        $self->search_type,
        {
            bcs_schema=>$self->bcs_schema,
            data_level=>$self->data_level,
            trait_list=>$self->trait_list,
            trial_list=>$self->trial_list,
            program_list=>$self->program_list,
            folder_list=>$self->folder_list,
            year_list=>$self->year_list,
            location_list=>$self->location_list,
            accession_list=>$self->accession_list,
            plot_list=>$self->plot_list,
            plant_list=>$self->plant_list,
            subplot_list=>$self->subplot_list,
            include_timestamp=>$include_timestamp,
            exclude_phenotype_outlier=>$self->exclude_phenotype_outlier,
            dataset_exluded_outliers=>$self->dataset_exluded_outliers,
            trait_contains=>$self->trait_contains,
            phenotype_min_value=>$self->phenotype_min_value,
            phenotype_max_value=>$self->phenotype_max_value,
            start_date => $self->start_date(),
            end_date => $self->end_date(),
            include_dateless_items => $self->include_dateless_items(),
            limit=>$self->limit,
            offset=>$self->offset
        }
    );

    my ($data, $unique_traits);
    my @info;
    my @metadata_headers = qw(
        studyYear programDbId programName programDescription
        studyDbId studyName studyDescription studyDesign
        plotWidth plotLength fieldSize
        fieldTrialIsPlannedToBeGenotyped fieldTrialIsPlannedToCross
        plantingDate harvestDate locationDbId locationName
        germplasmDbId germplasmName germplasmSynonyms
        observationLevel observationUnitDbId observationUnitName
        replicate blockNumber plotNumber rowNumber colNumber
        entryType plantNumber
    );

    if ($self->search_type eq 'MaterializedViewTable'){
        ($data, $unique_traits) = $phenotypes_search->search();
        print STDERR "No of lines retrieved: ".scalar(@$data)."\n";
        print STDERR "Construct Pheno Matrix Start:".localtime."\n";

        # ---- header row ----
        my @header = @metadata_headers;
        push @header, qw(
            plantedSeedlotStockDbId plantedSeedlotStockUniquename
            plantedSeedlotCurrentCount plantedSeedlotCurrentWeightGram
            plantedSeedlotBoxName plantedSeedlotTransactionCount
            plantedSeedlotTransactionWeight plantedSeedlotTransactionDescription
            availableGermplasmSeedlotUniquenames
        );

        if ($include_pedigree_parents){
            push @header, qw(
                germplasmPedigreeFemaleParentName germplasmPedigreeFemaleParentDbId
                germplasmPedigreeMaleParentName germplasmPedigreeMaleParentDbId
            );
        }

        my @sorted_traits = sort keys(%$unique_traits);
        foreach my $trait (@sorted_traits) {
            push @header, $trait;
            push @header, $trait.'_phenotype_id' if $include_phenotype_primary_key;
        }
        push @header, 'notes';

        # retrieve treatments and add treatment names to header
        my %seen_obsunits = map { $_->{observationunit_stock_id} => 1 } @$data;
        my ($treatment_names, $treatment_details) = $self->_get_treatment_columns(\%seen_obsunits);
        push @header, @$treatment_names;

        push @info, \@header;

        # Build the outlier lookup once instead of scanning the list for every
        # observation. Keys are numified so matching stays numeric, as before.
        my %is_excluded_outlier = map { ($_ + 0) => 1 } @{ $self->dataset_exluded_outliers || [] };

        # Parents are looked up once per germplasm, not once per observation unit.
        my %parents_of;

        # ---- one row per observation unit ----
        foreach my $obs_unit (@$data){
            my $entry_type = $obs_unit->{obsunit_is_a_control} ? 'check' : 'test';
            my $synonyms = $obs_unit->{germplasm_synonyms};
            my $synonym_string = $synonyms ? join ("," , @$synonyms) : '';

            my %seedlot_uniquenames;
            foreach my $seedlot (@{ $obs_unit->{available_germplasm_seedlots} || [] }) {
                $seedlot_uniquenames{$seedlot->{stock_uniquename}}++;
            }
            # Sorted so the output does not depend on hash ordering.
            my $available_germplasm_seedlots_uniquenames = join ' AND ', sort keys %seedlot_uniquenames;

            my $trial_name = _strip_trailing_whitespace($obs_unit->{trial_name});
            my $trial_desc = _strip_trailing_whitespace($obs_unit->{trial_description});

            my @line = (
                @{$obs_unit}{qw(
                    year breeding_program_id breeding_program_name
                    breeding_program_description trial_id
                )},
                $trial_name,
                $trial_desc,
                @{$obs_unit}{qw(
                    design plot_width plot_length field_size
                    field_trial_is_planned_to_be_genotyped
                    field_trial_is_planned_to_cross
                    planting_date harvest_date
                    trial_location_id trial_location_name
                    germplasm_stock_id germplasm_uniquename
                )},
                $synonym_string,
                @{$obs_unit}{qw(
                    observationunit_type_name observationunit_stock_id
                    observationunit_uniquename
                    obsunit_rep obsunit_block obsunit_plot_number
                    obsunit_row_number obsunit_col_number
                )},
                $entry_type,
                @{$obs_unit}{qw(
                    obsunit_plant_number
                    seedlot_stock_id seedlot_uniquename
                    seedlot_current_count seedlot_current_weight_gram
                    seedlot_box_name seedlot_transaction_amount
                    seedlot_transaction_weight_gram
                    seedlot_transaction_description
                )},
                $available_germplasm_seedlots_uniquenames,
            );

            if ($include_pedigree_parents) {
                my $germplasm_id = $obs_unit->{germplasm_stock_id};
                my $cache_key = defined $germplasm_id ? $germplasm_id : '';
                if (!exists $parents_of{$cache_key}) {
                    my $germplasm = CXGN::Stock->new({schema => $self->bcs_schema, stock_id=>$germplasm_id});
                    $parents_of{$cache_key} = $germplasm->get_parents();
                }
                my $parents = $parents_of{$cache_key};
                push @line, ($parents->{'mother'}, $parents->{'mother_id'}, $parents->{'father'}, $parents->{'father_id'});
            }

            # Collapse the observations to one value (and one phenotype id)
            # per trait; a later observation of the same trait overwrites an
            # earlier one.
            my %trait_observations;
            my %phenotype_ids;
            foreach my $observation (@{ $obs_unit->{observations} || [] }) {
                my $trait_name   = $observation->{trait_name};
                my $phenotype_id = $observation->{phenotype_id};

                # timestamp wins over collect_date; neither is used unless requested
                my $stamp = $include_timestamp
                    ? ($observation->{timestamp} || $observation->{collect_date})
                    : undef;

                $trait_observations{$trait_name} = $stamp
                    ? "$observation->{value},$stamp"
                    : $observation->{value};

                # dataset outliers become empty fields
                if (defined $phenotype_id && $is_excluded_outlier{$phenotype_id + 0}) {
                    $trait_observations{$trait_name} = ''; # empty field for outlier NA
                }

                $phenotype_ids{$trait_name} = $phenotype_id if $include_phenotype_primary_key;
            }

            foreach my $trait (@sorted_traits) {
                push @line, $trait_observations{$trait};
                push @line, $phenotype_ids{$trait} if $include_phenotype_primary_key;
            }
            push @line, $obs_unit->{notes};

            # add treatment values to each obsunit line
            my $unit_treatments = $treatment_details->{$obs_unit->{observationunit_stock_id}} || {};
            push @line, map { $unit_treatments->{$_} } @$treatment_names;

            push @info, \@line;
        }
    } else {
        $data = $phenotypes_search->search();
        #print STDERR "DOWNLOAD DATA =".Dumper($data)."\n";

        my %obsunit_data;
        my %traits;

        print STDERR "No of lines retrieved: ".scalar(@$data)."\n";
        print STDERR "Construct Pheno Matrix Start:".localtime."\n";
        my @unique_obsunit_list = ();
        my %seen_obsunits;

        # Each search row is one observation; fold rows into one record per
        # observation unit, remembering first-seen order of the units.
        foreach my $d (@$data) {
            my $cvterm = $d->{trait_name};
            next unless $cvterm;

            my $obsunit_id = $d->{obsunit_stock_id};
            if (!exists($seen_obsunits{$obsunit_id})) {
                push @unique_obsunit_list, $obsunit_id;
                $seen_obsunits{$obsunit_id} = 1;
            }

            my $timestamp_value = $d->{timestamp};
            my $value = $d->{phenotype_value};
            $obsunit_data{$obsunit_id}->{$cvterm} = ($include_timestamp && $timestamp_value)
                ? "$value,$timestamp_value"
                : $value;
            $obsunit_data{$obsunit_id}->{'notes'} = $d->{notes};

            my $synonyms = $d->{synonyms};
            my $synonym_string = $synonyms ? join ("," , @$synonyms) : '';
            my $entry_type = $d->{is_a_control} ? 'check' : 'test';

            my $trial_name = _strip_trailing_whitespace($d->{trial_name});
            my $trial_desc = _strip_trailing_whitespace($d->{trial_description});

            $obsunit_data{$obsunit_id}->{metadata} = [
                @{$d}{qw(
                    year breeding_program_id breeding_program_name
                    breeding_program_description trial_id
                )},
                $trial_name,
                $trial_desc,
                @{$d}{qw(
                    design plot_width plot_length field_size
                    field_trial_is_planned_to_be_genotyped
                    field_trial_is_planned_to_cross
                    planting_date harvest_date
                    location_id location_name
                    accession_stock_id accession_uniquename
                )},
                $synonym_string,
                @{$d}{qw(
                    obsunit_type_name obsunit_stock_id obsunit_uniquename
                    rep block plot_number row_number col_number
                )},
                $entry_type,
                $d->{plant_number},
            ];
            $traits{$cvterm}++;
        }

        # retrieve treatments
        my ($treatment_names, $treatment_details) = $self->_get_treatment_columns(\%seen_obsunits);

        # ---- header row ----
        my @sorted_traits = sort keys(%traits);
        push @info, [ @metadata_headers, @sorted_traits, 'notes', @$treatment_names ];

        # ---- one row per observation unit ----
        foreach my $p (@unique_obsunit_list) {
            my $unit = $obsunit_data{$p};
            my @line = @{$unit->{metadata}};

            push @line, map { $unit->{$_} } @sorted_traits;
            push @line, $unit->{'notes'};

            # add treatment values to each obsunit line
            my $unit_treatments = $treatment_details->{$p} || {};
            push @line, map { $unit_treatments->{$_} } @$treatment_names;

            push @info, \@line;
        }
    }

    #print STDERR Dumper \@info;
    print STDERR "Construct Pheno Matrix End:".localtime."\n";
    return @info;
}

# Private helper: fetch treatment information for the configured trials.
# Takes a hash ref whose keys are the observation unit ids present in the
# result set. Returns (\@treatment_names, \%treatment_details); both are
# empty when no trial_list was supplied.
sub _get_treatment_columns {
    my ($self, $seen_obsunits) = @_;

    my $project_object = CXGN::BreedersToolbox::Projects->new( { schema => $self->bcs_schema });
    my $treatment_info = {};
    if ($self->trial_list) {
        $treatment_info = $project_object->get_related_treatments($self->trial_list, $seen_obsunits);
    }
    $treatment_info ||= {};

    return (
        $treatment_info->{treatment_names}   || [],
        $treatment_info->{treatment_details} || {},
    );
}

# Private helper: return the argument with trailing whitespace removed.
# undef is passed through untouched (and without an "uninitialized" warning).
sub _strip_trailing_whitespace {
    my $text = shift;
    $text =~ s/\s+$// if defined $text;
    return $text;
}